Permalink
Browse files

Adds initialize parameter to cluster and cluster.split

Opticluster
  • Loading branch information...
1 parent 031e1b8 commit 30d65f3ed494138d74cad99e23781e9c93f283c4 @mothur-westcott mothur-westcott committed Aug 31, 2016
@@ -26,6 +26,7 @@ vector<string> ClusterCommand::setParameters(){
CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "","",false,false,true); parameters.push_back(pcutoff);
CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted-agc-dgc-opti", "average", "", "", "","",false,false,true); parameters.push_back(pmethod);
+ CommandParameter pinitialize("initialize", "Multiple", "oneotu-singleton", "singleton", "", "", "","",false,false,true); parameters.push_back(pinitialize);
CommandParameter pmetric("metric", "Multiple", "mcc-sens-spec-tptn-fpfn-tp-tn-fp-fn-f1score-accuracy-ppv-npv-fdr", "mcc", "", "", "","",false,false,true); parameters.push_back(pmetric);
CommandParameter pmetriccutoff("delta", "Number", "", "0.000", "", "", "","",false,false,true); parameters.push_back(pmetriccutoff);
CommandParameter piters("iters", "Number", "", "100", "", "", "","",false,false,true); parameters.push_back(piters);
@@ -51,14 +52,15 @@ vector<string> ClusterCommand::setParameters(){
string ClusterCommand::getHelpString(){
try {
string helpString = "";
- helpString += "The cluster command parameter options are phylip, column, name, count, method, cutoff, precision, sim, showabund and timing. Fasta or Phylip or column and name are required.\n";
+ helpString += "The cluster command parameter options are phylip, column, name, count, method, cutoff, precision, sim, showabund, timing, metric, iters, initialize. Fasta or Phylip or column and name are required.\n";
//helpString += "The adjust parameter is used to handle missing distances. If you set a cutoff, adjust=f by default. If not, adjust=t by default. Adjust=f, means ignore missing distances and adjust cutoff as needed with the average neighbor method. Adjust=t, will treat missing distances as 1.0. You can also set the value the missing distances should be set to, adjust=0.5 would give missing distances a value of 0.5.\n";
helpString += "The phylip and column parameter allow you to enter your distance file. \n";
helpString += "The fasta parameter allows you to enter your fasta file for use with the agc or dgc methods. \n";
helpString += "The name parameter allows you to enter your name file. \n";
helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format.\n";
helpString += "The iters parameter allow you to set the maxiters for the opticluster method. \n";
helpString += "The metric parameter allows to select the metric in the opticluster method. Options are Matthews correlation coefficient (mcc), sensitivity (sens), specificity (spec), true positives + true negatives (tptn), false positives + false negatives (fpfn), true positives (tp), true negative (tn), false positive (fp), false negative (fn), f1score (f1score), accuracy (accuracy), positive predictive value (ppv), negative predictive value (npv), false discovery rate (fdr). Default=mcc.\n";
+ helpString += "The initialize parameter allows to select the initial randomization for the opticluster method. Options are singleton, meaning each sequence is randomly assigned to its each OTU, or oneotu meaning all sequences are assigned to oneotu. Default=singleton.\n";
helpString += "The delta parameter allows to set the stable value for the metric in the opticluster method (delta=0.0000). \n";
helpString += "The method parameter allows you to enter your clustering mothod. Options are furthest, nearest, average, weighted, agc, dgc and opti. Default=average. The agc and dgc methods require a fasta file.";
helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
@@ -273,6 +275,11 @@ ClusterCommand::ClusterCommand(string option) {
if ((metric == "mcc") || (metric == "sens") || (metric == "spec") || (metric == "tptn") || (metric == "tp") || (metric == "tn") || (metric == "fp") || (metric == "fn") || (metric == "f1score") || (metric == "accuracy") || (metric == "ppv") || (metric == "npv") || (metric == "fdr") || (metric == "fpfn") ){ }
else { m->mothurOut("[ERROR]: Not a valid metric. Valid metrics are mcc, sens, spec, tp, tn, fp, fn, tptn, fpfn, f1score, accuracy, ppv, npv, fdr."); m->mothurOutEndLine(); abort = true; }
+
+ initialize = validParameter.validFile(parameters, "initialize", false); if (initialize == "not found") { initialize = "singleton"; }
+
+ if ((initialize == "singleton") || (initialize == "oneotu")){ }
+ else { m->mothurOut("[ERROR]: Not a valid initialization. Valid initializations are singleton and oneotu."); m->mothurOutEndLine(); abort = true; }
temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "100"; }
m->mothurConvert(temp, maxIters);
@@ -877,7 +884,7 @@ int ClusterCommand::runOptiCluster(){
double listVectorMetric = 0; //worst state
double delta = 1;
- cluster.initialize(listVectorMetric, true);
+ cluster.initialize(listVectorMetric, true, initialize);
m->mothurOut("\n\niter\tlabel\tcutoff\ttp\ttn\tfp\tfn\tsensitivity\tspecificity\tppv\tnpv\tfdr\taccuracy\tmcc\tf1score\n");
outStep << "iter\tlabel\tcutoff\ttp\ttn\tfp\tfn\tsensitivity\tspecificity\tppv\tnpv\tfdr\taccuracy\tmcc\tf1score\n";
@@ -58,7 +58,7 @@ class ClusterCommand : public Command {
bool abort, sim, cutOffSet;
- string method, fileroot, tag, outputDir, phylipfile, columnfile, namefile, format, distfile, countfile, fastafile, inputDir, vsearchLocation, metric;
+ string method, fileroot, tag, outputDir, phylipfile, columnfile, namefile, format, distfile, countfile, fastafile, inputDir, vsearchLocation, metric, initialize;
double cutoff, stableMetric;
float adjust;
string showabund, timing;
@@ -31,6 +31,7 @@ vector<string> ClusterSplitCommand::setParameters(){
CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "","",false,false,true); parameters.push_back(pcutoff);
CommandParameter pmetriccutoff("delta", "Number", "", "0.000", "", "", "","",false,false,true); parameters.push_back(pmetriccutoff);
CommandParameter piters("iters", "Number", "", "100", "", "", "","",false,false,true); parameters.push_back(piters);
+ CommandParameter pinitialize("initialize", "Multiple", "oneotu-singleton", "singleton", "", "", "","",false,false,true); parameters.push_back(pinitialize);
CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted-agc-dgc-opti", "average", "", "", "","",false,false,true); parameters.push_back(pmethod);
CommandParameter pmetric("metric", "Multiple", "mcc-sens-spec-tptn-fpfn-tp-tn-fp-fn-f1score-accuracy-ppv-npv-fdr", "mcc", "", "", "","",false,false,true); parameters.push_back(pmetric);
@@ -54,7 +55,7 @@ vector<string> ClusterSplitCommand::setParameters(){
string ClusterSplitCommand::getHelpString(){
try {
string helpString = "";
- helpString += "The cluster.split command parameter options are file, fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, large, cluster, iters, delta, dist, processors. Fasta or Phylip or column and name are required.\n";
+ helpString += "The cluster.split command parameter options are file, fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, large, cluster, iters, delta, initialize, dist, processors. Fasta or Phylip or column and name are required.\n";
helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n";
@@ -73,6 +74,7 @@ string ClusterSplitCommand::getHelpString(){
helpString += "The iters parameter allow you to set the maxiters for the opticluster method. \n";
helpString += "The metric parameter allows to select the metric in the opticluster method. Options are Matthews correlation coefficient (mcc), sensitivity (sens), specificity (spec), true positives + true negatives (tptn), false positives + false negatives (fpfn), true positives (tp), true negative (tn), false positive (fp), false negative (fn), f1score (f1score), accuracy (accuracy), positive predictive value (ppv), negative predictive value (npv), false discovery rate (fdr). Default=mcc.\n";
helpString += "The delta parameter allows to set the stable value for the metric in the opticluster method. Default=0.000\n";
+ helpString += "The initialize parameter allows to select the initial randomization for the opticluster method. Options are singleton, meaning each sequence is randomly assigned to its each OTU, or oneotu meaning all sequences are assigned to oneotu. Default=singleton.\n";
helpString += "The method parameter allows you to enter your clustering mothod. Options are furthest, nearest, average, weighted, agc, dgc and opti. Default=average. The agc and dgc methods require a fasta file.";
helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
@@ -373,6 +375,12 @@ ClusterSplitCommand::ClusterSplitCommand(string option) {
if ((metric == "mcc") || (metric == "sens") || (metric == "spec") || (metric == "tptn") || (metric == "tp") || (metric == "tn") || (metric == "fp") || (metric == "fn") || (metric == "f1score") || (metric == "accuracy") || (metric == "ppv") || (metric == "npv") || (metric == "fdr") || (metric == "fpfn") ){ }
else { m->mothurOut("[ERROR]: Not a valid metric. Valid metrics are mcc, sens, spec, tp, tn, fp, fn, tptn, fpfn, f1score, accuracy, ppv, npv, fdr."); m->mothurOutEndLine(); abort = true; }
+ initialize = validParameter.validFile(parameters, "initialize", false); if (initialize == "not found") { initialize = "singleton"; }
+
+ if ((initialize == "singleton") || (initialize == "oneotu")){ }
+ else { m->mothurOut("[ERROR]: Not a valid initialization. Valid initializations are singleton and oneotu."); m->mothurOutEndLine(); abort = true; }
+
+
method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "average"; }
if ((method == "furthest") || (method == "nearest") || (method == "average") || (method == "weighted") || (method == "agc") || (method == "dgc") || (method == "opti")) { }
@@ -1401,7 +1409,7 @@ string ClusterSplitCommand::runOptiCluster(string thisDistFile, string thisNamef
double listVectorMetric = 0; //worst state
double delta = 1;
- cluster.initialize(listVectorMetric, true);
+ cluster.initialize(listVectorMetric, true, initialize);
while ((delta > stableMetric) && (iters < maxIters)) {
@@ -50,7 +50,7 @@ class ClusterSplitCommand : public Command {
vector<int> processIDS; //processid
vector<string> outputNames;
- string file, method, fileroot, tag, outputDir, phylipfile, columnfile, namefile, countfile, distfile, format, showabund, timing, splitmethod, taxFile, fastafile, inputDir, vsearchLocation, metric;
+ string file, method, fileroot, tag, outputDir, phylipfile, columnfile, namefile, countfile, distfile, format, showabund, timing, splitmethod, taxFile, fastafile, inputDir, vsearchLocation, metric, initialize;
double cutoff, splitcutoff, stableMetric, numSingletons;
int precision, length, processors, taxLevelCutoff, maxIters;
bool print_start, abort, large, classic, runCluster, deleteFiles, isList, cutoffNotSet, makeDist;
View
@@ -10,7 +10,7 @@
/***********************************************************************/
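//build the initial OTU assignment: "singleton" puts each sequence in its own OTU, any other value puts all sequences in the first OTU; if randomize is true, the order stored in randomizeSeqs is shuffled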
-int OptiCluster::initialize(double& value, bool randomize) {
+int OptiCluster::initialize(double& value, bool randomize, string initialize) {
try {
numSeqs = matrix->getNumSeqs();
truePositives = 0;
@@ -25,27 +25,52 @@ int OptiCluster::initialize(double& value, bool randomize) {
seqBin[numSeqs] = -1;
insertLocation = numSeqs;
- for (int i = 0; i < numSeqs; i++) { bins[i].push_back(i); }
-
- //maps randomized sequences to bins
- for (int i = 0; i < numSeqs; i++) {
- seqBin[i] = bins[i][0];
- randomizeSeqs.push_back(i);
- }
-
- if (randomize) { random_shuffle(randomizeSeqs.begin(), randomizeSeqs.end()); }
-
- //for each sequence (singletons removed on read)
- for (map<int, int>::iterator it = seqBin.begin(); it != seqBin.end(); it++) {
- if (it->second == -1) { }
- else {
- long long numCloseSeqs = (matrix->getCloseSeqs(it->first)).size(); //does not include self
- falseNegatives += numCloseSeqs;
+ if (initialize == "singleton") {
+
+ //put everyone in own bin
+ for (int i = 0; i < numSeqs; i++) { bins[i].push_back(i); }
+
+ //maps randomized sequences to bins
+ for (int i = 0; i < numSeqs; i++) {
+ seqBin[i] = bins[i][0];
+ randomizeSeqs.push_back(i);
+ }
+
+ if (randomize) { random_shuffle(randomizeSeqs.begin(), randomizeSeqs.end()); }
+
+ //for each sequence (singletons removed on read)
+ for (map<int, int>::iterator it = seqBin.begin(); it != seqBin.end(); it++) {
+ if (it->second == -1) { }
+ else {
+ long long numCloseSeqs = (matrix->getCloseSeqs(it->first)).size(); //does not include self
+ falseNegatives += numCloseSeqs;
+ }
+ }
+ falseNegatives /= 2; //square matrix
+ trueNegatives = numSeqs * (numSeqs-1)/2 - (falsePositives + falseNegatives + truePositives); //since everyone is a singleton no one clusters together. True negative = num far apart
+ totalPairs = trueNegatives + truePositives + falseNegatives + falsePositives;
+ }else {
+
+ //put everyone in first bin
+ for (int i = 0; i < numSeqs; i++) {
+ bins[0].push_back(i);
+ seqBin[i] = 0;
+ randomizeSeqs.push_back(i);
+ }
+
+ if (randomize) { random_shuffle(randomizeSeqs.begin(), randomizeSeqs.end()); }
+
+ //for each sequence (singletons removed on read)
+ for (map<int, int>::iterator it = seqBin.begin(); it != seqBin.end(); it++) {
+ if (it->second == -1) { }
+ else {
+ long long numCloseSeqs = (matrix->getCloseSeqs(it->first)).size(); //does not include self
+ truePositives += numCloseSeqs;
+ }
}
+ truePositives /= 2; //square matrix
+ falsePositives = numSeqs * (numSeqs-1)/2 - (trueNegatives + falseNegatives + truePositives);
}
- falseNegatives /= 2; //square matrix
- trueNegatives = numSeqs * (numSeqs-1)/2 - (falsePositives + falseNegatives + truePositives); //since everyone is a singleton no one clusters together. True negative = num far apart
- totalPairs = trueNegatives + truePositives + falseNegatives + falsePositives;
value = 0;
if (metric == "mcc") { value = calcMCC(truePositives, trueNegatives, falsePositives, falseNegatives); }
View
@@ -29,7 +29,7 @@ class OptiCluster : public Cluster {
~OptiCluster() {}
bool updateDistance(PDistCell& colCell, PDistCell& rowCell) { return false; } //inheritance compliant
string getTag() { string tag = "opti_" + metric; return tag; }
- int initialize(double&, bool); //randomize and place in "best" OTUs
+ int initialize(double&, bool, string); //randomize and place in "best" OTUs
bool update(double&); //returns whether list changed and MCC
vector<double> getStats( long long&, long long&, long long&, long long&);
ListVector* getList();

0 comments on commit 30d65f3

Please sign in to comment.