Skip to content

Commit

Permalink
bounds command
Browse files Browse the repository at this point in the history
  • Loading branch information
Brian Ondov committed Apr 6, 2016
1 parent bacb5f6 commit e5a8e6a
Show file tree
Hide file tree
Showing 7 changed files with 125 additions and 9 deletions.
1 change: 1 addition & 0 deletions Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ endif

SOURCES=\
src/mash/Command.cpp \
src/mash/CommandBounds.cpp \
src/mash/CommandContain.cpp \
src/mash/CommandDistance.cpp \
src/mash/CommandFind.cpp \
Expand Down
2 changes: 1 addition & 1 deletion src/mash/Command.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ Command::Command()
addAvailableOption("reads", Option(Option::Boolean, "r", "Sketch", "Input is a read set. See Reads options below. Incompatible with -i.", ""));
addAvailableOption("memory", Option(Option::Size, "b", "Reads", "Memory bound for k-mer copy counting (raw bytes or with K/M/G/T). If specified, a Bloom filter will be used instead of exact counts, so some unique k-mers will pass erroneously, and copies cannot be counted beyond 2. Implies -r, -m 2."));
addAvailableOption("minCov", Option(Option::Integer, "m", "Reads", "Minimum copies of each k-mer required to pass noise filter for reads. Implies -r.", "2"));
addAvailableOption("targetCov", Option(Option::Integer, "c", "Reads", "Target coverage. Sketching will conclude if this coverage is reached before the end of the input file (estimated by average k-mer multiplicity). Implies -r.", "10"));
addAvailableOption("targetCov", Option(Option::Number, "c", "Reads", "Target coverage. Sketching will conclude if this coverage is reached before the end of the input file (estimated by average k-mer multiplicity). Implies -r.", "10"));
addAvailableOption("noncanonical", Option(Option::Boolean, "n", "Alphabet", "Preserve strand (by default, strand is ignored by using canonical DNA k-mers, which are alphabetical minima of forward-reverse pairs). Implied if an alphabet is specified with -a or -z.", ""));
addAvailableOption("protein", Option(Option::Boolean, "a", "Alphabet", "Use amino acid alphabet (A-Z, except BJOUXZ). Implies -n, -k 9.", ""));
addAvailableOption("alphabet", Option(Option::String, "z", "Alphabet", "Alphabet to base hashes on (case ignored). K-mers with other characters will be ignored. Implies -n.", ""));
Expand Down
95 changes: 95 additions & 0 deletions src/mash/CommandBounds.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
// Copyright © 2015, Battelle National Biodefense Institute (BNBI);
// all rights reserved. Authored by: Brian Ondov, Todd Treangen,
// Sergey Koren, and Adam Phillippy
//
// See the LICENSE.txt file included with this software for license information.

#include "CommandBounds.h"
#include <iostream>
#include <math.h>

#ifdef USE_BOOST
#include <boost/math/distributions/binomial.hpp>
using namespace::boost::math;
#else
#include <gsl/gsl_cdf.h>
#endif

using namespace::std;

CommandBounds::CommandBounds()
: Command()
{
name = "bounds";
summary = "Estimate error bounds.";
description = "Estimate error bounds for various sketch sizes and Mash distances based on a given k-mer size and probability.";
argumentString = "";

useOption("help");
addOption("kmer", Option(Option::Integer, "k", "", "K-mer size.", "21"));
addOption("prob", Option(Option::Number, "p", "", "Probability of each error bound being true.", "0.99", 0, 1));
}

int CommandBounds::run() const
{
if ( options.at("help").active )
{
print();
return 0;
}

const int sketchSizeCount = 9;
const double sketchSizes[] = {100, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000};

const int distCount = 8;
const double dists[] = {0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4};

int k = getOption("kmer").getArgumentAsNumber();
double q2 = (1.0 - getOption("prob").getArgumentAsNumber()) / 2.0;

cout << "\tMash distance" << endl;
cout << "Sketch";

for ( int i = 0; i < distCount; i++ )
{
cout << '\t' << dists[i];
}

cout << endl;

for ( int i = 0; i < sketchSizeCount; i++ )
{
int s = sketchSizes[i];
cout << s;

for ( int j = 0; j < distCount; j++ )
{
double m2j = 1.0 / (2.0 * exp(k * dists[j]) - 1.0);

int x = 0;

while ( x < s )
{
#ifdef USE_BOOST
double cdfx = cdf(binomial(s, m2j), x);
#else
double cdfx = gsl_cdf_binomial_P(x, m2j, s);
#endif
if ( cdfx > q2 )
{
break;
}

x++;
}

double je = double(x) / s;
double j2m = -1.0 / k * log(2.0 * je / (1.0 + je));
cout << '\t' << j2m - dists[j];
}

cout << endl;
}

return 0;
}
20 changes: 20 additions & 0 deletions src/mash/CommandBounds.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// Copyright © 2015, Battelle National Biodefense Institute (BNBI);
// all rights reserved. Authored by: Brian Ondov, Todd Treangen,
// Sergey Koren, and Adam Phillippy
//
// See the LICENSE.txt file included with this software for license information.

#ifndef INCLUDED_CommandBounds
#define INCLUDED_CommandBounds

#include "Command.h"

class CommandBounds : public Command
{
public:

CommandBounds();
int run() const; // override
};

#endif
12 changes: 5 additions & 7 deletions src/mash/Sketch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1150,6 +1150,8 @@ Sketch::SketchOutput * sketchFile(Sketch::SketchInput * input)
continue;
}

count++;

if ( parameters.windowed )
{
// TODO positionHashesByReference.resize(count + 1);
Expand Down Expand Up @@ -1211,20 +1213,16 @@ Sketch::SketchOutput * sketchFile(Sketch::SketchInput * input)
exit(1);
}

if ( parameters.concatenated )
if ( ! parameters.windowed )
{
if ( ! parameters.windowed )
{
setMinHashesForReference(reference, minHashHeap);
}

count++;
setMinHashesForReference(reference, minHashHeap);
}

if ( parameters.reads )
{
cerr << "Estimated genome size: " << minHashHeap.estimateSetSize() << endl;
cerr << "Estimated coverage: " << minHashHeap.estimateMultiplicity() << endl;
cerr << "Reads used: " << count << endl;
}

kseq_destroy(seq);
Expand Down
2 changes: 1 addition & 1 deletion src/mash/Sketch.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ class Sketch
bool reads;
uint64_t memoryBound;
uint32_t minCov;
uint64_t targetCov;
double targetCov;
};

struct PositionHash
Expand Down
2 changes: 2 additions & 0 deletions src/mash/mash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
//
// See the LICENSE.txt file included with this software for license information.

#include "CommandBounds.h"
#include "CommandList.h"
#include "CommandSketch.h"
#include "CommandFind.h"
Expand All @@ -26,6 +27,7 @@ int main(int argc, const char ** argv)
#endif
commandList.addCommand(new CommandInfo());
commandList.addCommand(new CommandPaste());
commandList.addCommand(new CommandBounds());

return commandList.run(argc, argv);
}

0 comments on commit e5a8e6a

Please sign in to comment.