Permalink
Fetching contributors…
Cannot retrieve contributors at this time
806 lines (669 sloc) 37.3 KB
/*
* clustercommand.cpp
* Dotur
*
* Created by Sarah Westcott on 1/2/09.
* Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
*
*/
#include "clustercommand.h"
#include "readphylip.h"
#include "readcolumn.h"
#include "readmatrix.hpp"
#include "clusterdoturcommand.h"
#include "sequence.hpp"
#include "vsearchfileparser.h"
#include "systemcommand.h"
//**********************************************************************************************************************
vector<string> ClusterCommand::setParameters(){
try {
CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","list",false,false,true); parameters.push_back(pphylip);
CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName","list",false,false,true); parameters.push_back(pfasta);
CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName","rabund-sabund",false,false,true); parameters.push_back(pname);
CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "","",false,false,true); parameters.push_back(pcount);
CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName","list",false,false,true); parameters.push_back(pcolumn);
CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "","",false,false,true); parameters.push_back(pcutoff);
CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted-agc-dgc", "average", "", "", "","",false,false,true); parameters.push_back(pmethod);
CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pshowabund);
CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptiming);
CommandParameter psim("sim", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(psim);
CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
CommandParameter pseed("seed", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pseed);
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
//CommandParameter padjust("adjust", "String", "", "F", "", "", "","",false,false); parameters.push_back(padjust);
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
vector<string> myArray;
for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
return myArray;
}
catch(exception& e) {
m->errorOut(e, "ClusterCommand", "setParameters");
exit(1);
}
}
//**********************************************************************************************************************
string ClusterCommand::getHelpString(){
try {
string helpString = "";
helpString += "The cluster command parameter options are phylip, column, name, count, method, cuttoff, hard, precision, sim, showabund and timing. Fasta or Phylip or column and name are required.\n";
//helpString += "The adjust parameter is used to handle missing distances. If you set a cutoff, adjust=f by default. If not, adjust=t by default. Adjust=f, means ignore missing distances and adjust cutoff as needed with the average neighbor method. Adjust=t, will treat missing distances as 1.0. You can also set the value the missing distances should be set to, adjust=0.5 would give missing distances a value of 0.5.\n";
helpString += "The phylip and column parameter allow you to enter your distance file. \n";
helpString += "The fasta parameter allows you to enter your fasta file for use with the agc or dgc methods. \n";
helpString += "The name parameter allows you to enter your name file. \n";
helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format.\n";
helpString += "The method parameter allows you to enter your clustering mothod. Options are furthest, nearest, average, weighted, agc and dgc. Default=average. The agc and dgc methods require a fasta file.";
helpString += "The cluster command should be in the following format: \n";
helpString += "cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n";
helpString += "The acceptable cluster methods are furthest, nearest, average and weighted. If no method is provided then average is assumed.\n";
return helpString;
}
catch(exception& e) {
m->errorOut(e, "ClusterCommand", "getHelpString");
exit(1);
}
}
//**********************************************************************************************************************
string ClusterCommand::getOutputPattern(string type) {
try {
string pattern = "";
if (type == "list") { pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; }
else if (type == "rabund") { pattern = "[filename],[clustertag],rabund"; }
else if (type == "sabund") { pattern = "[filename],[clustertag],sabund"; }
else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
return pattern;
}
catch(exception& e) {
m->errorOut(e, "ClusterCommand", "getOutputPattern");
exit(1);
}
}
//**********************************************************************************************************************
ClusterCommand::ClusterCommand(){
try {
abort = true; calledHelp = true;
setParameters();
vector<string> tempOutNames;
outputTypes["list"] = tempOutNames;
outputTypes["rabund"] = tempOutNames;
outputTypes["sabund"] = tempOutNames;
}
catch(exception& e) {
m->errorOut(e, "ClusterCommand", "ClusterCommand");
exit(1);
}
}
//**********************************************************************************************************************
//This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
ClusterCommand::ClusterCommand(string option) {
try{
abort = false; calledHelp = false;
//allow user to run help
if(option == "help") { help(); abort = true; calledHelp = true; }
else if(option == "citation") { citation(); abort = true; calledHelp = true;}
else {
vector<string> myArray = setParameters();
OptionParser parser(option);
map<string,string> parameters = parser.getParameters();
map<string,string>::iterator it;
ValidParameters validParameter;
//check to make sure all parameters are valid for command
for (it = parameters.begin(); it != parameters.end(); it++) {
if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
abort = true;
}
}
//initialize outputTypes
vector<string> tempOutNames;
outputTypes["list"] = tempOutNames;
outputTypes["rabund"] = tempOutNames;
outputTypes["sabund"] = tempOutNames;
//if the user changes the output directory command factory will send this info to us in the output parameter
outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
inputDir = validParameter.validFile(parameters, "inputdir", false);
if (inputDir == "not found"){ inputDir = ""; }
else {
string path;
it = parameters.find("phylip");
//user has given a template file
if(it != parameters.end()){
path = m->hasPath(it->second);
//if the user has not given a path then, add inputdir. else leave path alone.
if (path == "") { parameters["phylip"] = inputDir + it->second; }
}
it = parameters.find("column");
//user has given a template file
if(it != parameters.end()){
path = m->hasPath(it->second);
//if the user has not given a path then, add inputdir. else leave path alone.
if (path == "") { parameters["column"] = inputDir + it->second; }
}
it = parameters.find("name");
//user has given a template file
if(it != parameters.end()){
path = m->hasPath(it->second);
//if the user has not given a path then, add inputdir. else leave path alone.
if (path == "") { parameters["name"] = inputDir + it->second; }
}
it = parameters.find("count");
//user has given a template file
if(it != parameters.end()){
path = m->hasPath(it->second);
//if the user has not given a path then, add inputdir. else leave path alone.
if (path == "") { parameters["count"] = inputDir + it->second; }
}
it = parameters.find("fasta");
//user has given a template file
if(it != parameters.end()){
path = m->hasPath(it->second);
//if the user has not given a path then, add inputdir. else leave path alone.
if (path == "") { parameters["fasta"] = inputDir + it->second; }
}
}
//check for required parameters
phylipfile = validParameter.validFile(parameters, "phylip", true);
if (phylipfile == "not open") { phylipfile = ""; abort = true; }
else if (phylipfile == "not found") { phylipfile = ""; }
else { distfile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile); }
columnfile = validParameter.validFile(parameters, "column", true);
if (columnfile == "not open") { columnfile = ""; abort = true; }
else if (columnfile == "not found") { columnfile = ""; }
else { distfile = columnfile; format = "column"; m->setColumnFile(columnfile); }
fastafile = validParameter.validFile(parameters, "fasta", true);
if (fastafile == "not open") { abort = true; }
else if (fastafile == "not found") { fastafile = ""; }
else { distfile = fastafile; format = "fasta"; m->setFastaFile(fastafile); }
namefile = validParameter.validFile(parameters, "name", true);
if (namefile == "not open") { abort = true; }
else if (namefile == "not found") { namefile = ""; }
else { m->setNameFile(namefile); }
countfile = validParameter.validFile(parameters, "count", true);
if (countfile == "not open") { abort = true; countfile = ""; }
else if (countfile == "not found") { countfile = ""; }
else { m->setCountTableFile(countfile); }
if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) {
//is there are current file available for either of these?
//give priority to column, then phylip
columnfile = m->getColumnFile();
if (columnfile != "") { distfile = columnfile; format = "column"; m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
else {
phylipfile = m->getPhylipFile();
if (phylipfile != "") { distfile = phylipfile; format = "phylip"; m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
else {
fastafile = m->getFastaFile();
if (fastafile != "") { distfile = fastafile; format = "fasta"; m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
else {
m->mothurOut("No valid current files. You must provide a phylip, column or fasta file before you can use the cluster command."); m->mothurOutEndLine();
abort = true;
}
}
}
}
else if (((phylipfile != "") && (columnfile != "")) || ((phylipfile != "") && (fastafile != "")) || ((fastafile != "") && (columnfile != ""))) { m->mothurOut("When executing a cluster command you must enter ONLY ONE of the following: phylip, column or fasta."); m->mothurOutEndLine(); abort = true; }
if (columnfile != "") {
if ((namefile == "") && (countfile == "")){
namefile = m->getNameFile();
if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
else {
countfile = m->getCountTableFile();
if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
else {
m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine();
abort = true;
}
}
}
}
if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
//check for optional parameter and set defaults
// ...at some point should added some additional type checking...
//get user cutoff and precision or use defaults
string temp;
temp = validParameter.validFile(parameters, "precision", false);
if (temp == "not found") { temp = "100"; }
//saves precision legnth for formatting below
length = temp.length();
m->mothurConvert(temp, precision);
temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "T"; }
hard = m->isTrue(temp);
temp = validParameter.validFile(parameters, "sim", false); if (temp == "not found") { temp = "F"; }
sim = m->isTrue(temp);
//bool cutoffSet = false;
temp = validParameter.validFile(parameters, "cutoff", false);
if (temp == "not found") { temp = "10"; }
//else { cutoffSet = true; }
m->mothurConvert(temp, cutoff);
cutoff += (5 / (precision * 10.0));
//temp = validParameter.validFile(parameters, "adjust", false); if (temp == "not found") { temp = "F"; }
//if (m->isNumeric1(temp)) { m->mothurConvert(temp, adjust); }
//else if (m->isTrue(temp)) { adjust = 1.0; }
//else { adjust = -1.0; }
adjust=-1.0;
method = validParameter.validFile(parameters, "method", false);
if (method == "not found") { method = "average"; }
if ((method == "furthest") || (method == "nearest") || (method == "average") || (method == "weighted") || (method == "agc") || (method == "dgc")) { }
else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest, average, weighted, agc and dgc."); m->mothurOutEndLine(); abort = true; }
if ((method == "agc") || (method == "dgc")) {
if (fastafile == "") { m->mothurOut("[ERROR]: You must provide a fasta file when using the agc or dgc clustering methods, aborting\n."); abort = true;}
}
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#else
if ((method == "agc") || (method == "dgc")) { m->mothurOut("[ERROR]: The agc and dgc clustering methods are not available for Windows, aborting\n."); abort = true; }
#endif
showabund = validParameter.validFile(parameters, "showabund", false);
if (showabund == "not found") { showabund = "T"; }
timing = validParameter.validFile(parameters, "timing", false);
if (timing == "not found") { timing = "F"; }
}
}
catch(exception& e) {
m->errorOut(e, "ClusterCommand", "ClusterCommand");
exit(1);
}
}
//**********************************************************************************************************************
ClusterCommand::~ClusterCommand(){}
//**********************************************************************************************************************
int ClusterCommand::execute(){
try {
if (abort == true) { if (calledHelp) { return 0; } return 2; }
//phylip file given and cutoff not given - use cluster.classic because it uses less memory and is faster
if ((format == "phylip") && (cutoff > 10.0)) {
m->mothurOutEndLine(); m->mothurOut("You are using a phylip file and no cutoff. I will run cluster.classic to save memory and time."); m->mothurOutEndLine();
//run unique.seqs for deconvolute results
string inputString = "phylip=" + distfile;
if (namefile != "") { inputString += ", name=" + namefile; }
else if (countfile != "") { inputString += ", count=" + countfile; }
inputString += ", precision=" + toString(precision);
inputString += ", method=" + method;
if (hard) { inputString += ", hard=T"; }
else { inputString += ", hard=F"; }
if (sim) { inputString += ", sim=T"; }
else { inputString += ", sim=F"; }
m->mothurOutEndLine();
m->mothurOut("/------------------------------------------------------------/"); m->mothurOutEndLine();
m->mothurOut("Running command: cluster.classic(" + inputString + ")"); m->mothurOutEndLine();
Command* clusterClassicCommand = new ClusterDoturCommand(inputString);
clusterClassicCommand->execute();
delete clusterClassicCommand;
m->mothurOut("/------------------------------------------------------------/"); m->mothurOutEndLine();
return 0;
}
time_t estart = time(NULL);
if (format == "fasta") { runVsearchCluster(); }
else { runMothurCluster(); }
if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
//set list file as new current listfile
string current = "";
itTypes = outputTypes.find("list");
if (itTypes != outputTypes.end()) {
if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
}
//set rabund file as new current rabundfile
itTypes = outputTypes.find("rabund");
if (itTypes != outputTypes.end()) {
if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
}
//set sabund file as new current sabundfile
itTypes = outputTypes.find("sabund");
if (itTypes != outputTypes.end()) {
if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
}
m->mothurOutEndLine();
m->mothurOut("Output File Names: "); m->mothurOutEndLine();
for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
m->mothurOutEndLine();
return 0;
}
catch(exception& e) {
m->errorOut(e, "ClusterCommand", "execute");
exit(1);
}
}
//**********************************************************************************************************************
int ClusterCommand::runVsearchCluster(){
try {
//look for uchime exe
string path = m->argv;
string tempPath = path;
for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); }
path = path.substr(0, (tempPath.find_last_of('m')));
string vsearchCommand;
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
vsearchCommand = path + "vsearch"; // format the database, -o option gives us the ability
if (m->debug) {
m->mothurOut("[DEBUG]: vsearch location using \"which vsearch\" = ");
Command* newCommand = new SystemCommand("which vsearch"); m->mothurOutEndLine();
newCommand->execute();
delete newCommand;
m->mothurOut("[DEBUG]: Mothur's location using \"which mothur\" = ");
newCommand = new SystemCommand("which mothur"); m->mothurOutEndLine();
newCommand->execute();
delete newCommand;
}
#else
vsearchCommand = path + "vsearch.exe";
#endif
//test to make sure uchime exists
ifstream in;
vsearchCommand = m->getFullPathName(vsearchCommand);
int ableToOpen = m->openInputFile(vsearchCommand, in, "no error"); in.close();
if(ableToOpen == 1) {
m->mothurOut(vsearchCommand + " file does not exist. Checking path... \n");
//check to see if uchime is in the path??
string uLocation = m->findProgramPath("vsearch");
ifstream in2;
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
ableToOpen = m->openInputFile(uLocation, in2, "no error"); in2.close();
#else
ableToOpen = m->openInputFile((uLocation + ".exe"), in2, "no error"); in2.close();
#endif
if(ableToOpen == 1) { m->mothurOut("[ERROR]: " + uLocation + " file does not exist. mothur requires the vsearch executable."); m->mothurOutEndLine(); abort = true; }
else { m->mothurOut("Found vsearch in your path, using " + uLocation + "\n");vsearchLocation = uLocation; }
}else { vsearchLocation = vsearchCommand; }
vsearchLocation = m->getFullPathName(vsearchLocation);
string vsearchFastafile = ""; VsearchFileParser* vParse;
if ((namefile == "") && (countfile == "")) { vParse = new VsearchFileParser(fastafile); }
else if (namefile != "") { vParse = new VsearchFileParser(fastafile, namefile, "name"); }
else if (countfile != "") { vParse = new VsearchFileParser(fastafile, countfile, "count"); }
else { m->mothurOut("[ERROR]: Opps, should never get here. ClusterCommand::runVsearchCluster() \n"); m->control_pressed = true; }
if (m->control_pressed) { return 0; }
vsearchFastafile = vParse->getVsearchFile();
if (cutoff > 1.0) { m->mothurOut("You did not set a cutoff, using 0.03.\n"); cutoff = 0.03; }
//Run vsearch
string ucVsearchFile = m->getSimpleName(vsearchFastafile) + ".clustered.uc";
string logfile = m->getSimpleName(vsearchFastafile) + ".clustered.log";
vsearchDriver(vsearchFastafile, ucVsearchFile, logfile);
if (m->control_pressed) { m->mothurRemove(ucVsearchFile); m->mothurRemove(logfile); m->mothurRemove(vsearchFastafile); return 0; }
if (outputDir == "") { outputDir += m->hasPath(distfile); }
fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
tag = method;
map<string, string> variables;
variables["[filename]"] = fileroot;
variables["[clustertag]"] = tag;
string sabundFileName = getOutputFileName("sabund", variables);
string rabundFileName = getOutputFileName("rabund", variables);
if (countfile != "") { variables["[tag2]"] = "unique_list"; }
string listFileName = getOutputFileName("list", variables);
outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
if (countfile == "") {
outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
}
//Convert outputted *.uc file into a list file
vParse->createListFile(ucVsearchFile, listFileName, sabundFileName, rabundFileName, vParse->getNumBins(logfile), toString(1.0-cutoff)); delete vParse;
//remove temp files
m->mothurRemove(ucVsearchFile); m->mothurRemove(logfile); m->mothurRemove(vsearchFastafile);
if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
return 0;
}
catch(exception& e) {
m->errorOut(e, "ClusterCommand", "runVsearchCluster");
exit(1);
}
}
//**********************************************************************************************************************
int ClusterCommand::vsearchDriver(string inputFile, string ucClusteredFile, string logfile){
try {
//vsearch --maxaccepts 16 --usersort --id 0.97 --minseqlength 30 --wordlength 8 --uc $ROOT.clustered.uc --cluster_smallmem $ROOT.sorted.fna --maxrejects 64 --strand both --log $ROOT.clustered.log --sizeorder
//no sizeorder for dgc
ucClusteredFile = m->getFullPathName(ucClusteredFile);
inputFile = m->getFullPathName(inputFile);
logfile = m->getFullPathName(logfile);
//to allow for spaces in the path
ucClusteredFile = "\"" + ucClusteredFile + "\"";
inputFile = "\"" + inputFile + "\"";
logfile = "\"" + logfile + "\"";
vector<char*> cPara;
string vsearchCommand = vsearchLocation;
vsearchCommand = "\"" + vsearchCommand + "\" ";
vector<char*> vsearchParameters;
char* vsearchParameter = new char[vsearchCommand.length()+1]; vsearchParameter[0] = '\0'; strncat(vsearchParameter, vsearchCommand.c_str(), vsearchCommand.length());
vsearchParameters.push_back(vsearchParameter);
//--maxaccepts=16
char* maxaccepts = new char[16]; maxaccepts[0] = '\0'; strncat(maxaccepts, "--maxaccepts=16", 15);
vsearchParameters.push_back(maxaccepts);
//--usersort
char* usersort = new char[11]; usersort[0] = '\0'; strncat(usersort, "--usersort", 10);
vsearchParameters.push_back(usersort);
//--id=0.97
cutoff = abs(1.0 - cutoff); string cutoffString = toString(cutoff);
if (cutoffString.length() > 4) { cutoffString = cutoffString.substr(0, 4); }
else if (cutoffString.length() < 4) { for (int i = cutoffString.length(); i < 4; i++) { cutoffString += "0"; } }
cutoffString = "--id=" + cutoffString;
char* cutoffParameter = new char[cutoffString.length()+1]; cutoffParameter[0] = '\0'; strncat(cutoffParameter, cutoffString.c_str(), cutoffString.length());
vsearchParameters.push_back(cutoffParameter);
//--minseqlength=30
char* minseqlength = new char[18]; minseqlength[0] = '\0'; strncat(minseqlength, "--minseqlength=30", 17);
vsearchParameters.push_back(minseqlength);
//--wordlength=8
char* wordlength = new char[15]; wordlength[0] = '\0'; strncat(wordlength, "--wordlength=8", 14);
vsearchParameters.push_back(wordlength);
//--uc=$ROOT.clustered.uc
string tempIn = "--uc=" + ucClusteredFile;
char* uc = new char[tempIn.length()+1]; uc[0] = '\0'; strncat(uc, tempIn.c_str(), tempIn.length());
vsearchParameters.push_back(uc);
//--cluster_smallmem $ROOT.sorted.fna
string tempSorted = "--cluster_smallmem=" + inputFile;
char* cluster_smallmen = new char[tempSorted.length()+1]; cluster_smallmen[0] = '\0'; strncat(cluster_smallmen, tempSorted.c_str(), tempSorted.length());
vsearchParameters.push_back(cluster_smallmen);
//--maxrejects=64
char* maxrejects = new char[16]; maxrejects[0] = '\0'; strncat(maxrejects, "--maxrejects=64", 15);
vsearchParameters.push_back(maxrejects);
//--strand=both
char* strand = new char[14]; strand[0] = '\0'; strncat(strand, "--strand=both", 13);
vsearchParameters.push_back(strand);
//--log=$ROOT.clustered.log
string tempLog = "--log=" + logfile;
char* log = new char[tempLog.length()+1]; log[0] = '\0'; strncat(log, tempLog.c_str(), tempLog.length());
vsearchParameters.push_back(log);
if (method == "agc") {
//--sizeorder
char* sizeorder = new char[12]; sizeorder[0] = '\0'; strncat(sizeorder, "--sizeorder", 11);
vsearchParameters.push_back(sizeorder);
}
if (m->debug) { for(int i = 0; i < vsearchParameters.size(); i++) { cout << vsearchParameters[i]; } cout << endl; }
string commandString = "";
for (int i = 0; i < vsearchParameters.size(); i++) { commandString += toString(vsearchParameters[i]) + " "; }
//cout << "commandString = " << commandString << endl;
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#else
commandString = "\"" + commandString + "\"";
#endif
if (m->debug) { m->mothurOut("[DEBUG]: vsearch cluster command = " + commandString + ".\n"); }
system(commandString.c_str());
//free memory
for(int i = 0; i < vsearchParameters.size(); i++) { delete vsearchParameters[i]; }
//remove "" from filenames
ucClusteredFile = ucClusteredFile.substr(1, ucClusteredFile.length()-2);
inputFile = inputFile.substr(1, inputFile.length()-2);
logfile = logfile.substr(1, logfile.length()-2);
return 0;
}
catch(exception& e) {
m->errorOut(e, "ClusterCommand", "vsearchDriver");
exit(1);
}
}
//**********************************************************************************************************************
int ClusterCommand::runMothurCluster(){
try {
ReadMatrix* read;
if (format == "column") { read = new ReadColumnMatrix(columnfile, sim); } //sim indicates whether its a similarity matrix
else if (format == "phylip") { read = new ReadPhylipMatrix(phylipfile, sim); }
read->setCutoff(cutoff);
NameAssignment* nameMap = NULL;
CountTable* ct = NULL;
map<string, int> counts;
if(namefile != ""){
nameMap = new NameAssignment(namefile);
nameMap->readMap();
read->read(nameMap);
}else if (countfile != "") {
ct = new CountTable();
ct->readTable(countfile, false, false);
read->read(ct);
counts = ct->getNameMap();
}else { read->read(nameMap); }
list = read->getListVector();
matrix = read->getDMatrix();
if(countfile != "") {
rabund = new RAbundVector();
createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
delete ct;
}else { rabund = new RAbundVector(list->getRAbundVector()); }
delete read;
if (m->control_pressed) { //clean up
delete list; delete matrix; delete rabund; if(countfile == ""){rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); }
listFile.close(); m->mothurRemove((fileroot+ tag + ".list")); outputTypes.clear(); return 0;
}
//create cluster
if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method, adjust); }
else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method, adjust); }
else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method, adjust); }
else if(method == "weighted"){ cluster = new WeightedLinkage(rabund, list, matrix, cutoff, method, adjust); }
tag = cluster->getTag();
if (outputDir == "") { outputDir += m->hasPath(distfile); }
fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
map<string, string> variables;
variables["[filename]"] = fileroot;
variables["[clustertag]"] = tag;
string sabundFileName = getOutputFileName("sabund", variables);
string rabundFileName = getOutputFileName("rabund", variables);
if (countfile != "") { variables["[tag2]"] = "unique_list"; }
string listFileName = getOutputFileName("list", variables);
if (countfile == "") {
m->openOutputFile(sabundFileName, sabundFile);
m->openOutputFile(rabundFileName, rabundFile);
outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
}
m->openOutputFile(listFileName, listFile);
outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
list->printHeaders(listFile);
float previousDist = 0.00000;
float rndPreviousDist = 0.00000;
oldRAbund = *rabund;
oldList = *list;
print_start = true;
start = time(NULL);
loops = 0;
double saveCutoff = cutoff;
while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
if (m->control_pressed) { //clean up
delete list; delete matrix; delete rabund; delete cluster;
if(countfile == "") {rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); }
listFile.close(); m->mothurRemove((fileroot+ tag + ".list")); outputTypes.clear(); return 0;
}
if (print_start && m->isTrue(timing)) {
m->mothurOut("Clustering (" + tag + ") dist " + toString(matrix->getSmallDist()) + "/"
+ toString(m->roundDist(matrix->getSmallDist(), precision))
+ "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")");
cout.flush();
print_start = false;
}
loops++;
cluster->update(cutoff);
float dist = matrix->getSmallDist();
float rndDist;
if (hard) {
rndDist = m->ceilDist(dist, precision);
}else{
rndDist = m->roundDist(dist, precision);
}
if(previousDist <= 0.0000 && dist != previousDist){
printData("unique", counts);
}
else if(rndDist != rndPreviousDist){
printData(toString(rndPreviousDist, length-1), counts);
}
previousDist = dist;
rndPreviousDist = rndDist;
oldRAbund = *rabund;
oldList = *list;
}
if (print_start && m->isTrue(timing)) {
m->mothurOut("Clustering (" + tag + ") for distance " + toString(previousDist) + "/" + toString(rndPreviousDist)
+ "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")");
cout.flush();
print_start = false;
}
if(previousDist <= 0.0000){
printData("unique", counts);
}
else if(rndPreviousDist<cutoff){
printData(toString(rndPreviousDist, length-1), counts);
}
delete matrix;
delete list;
delete rabund;
delete cluster;
if (countfile == "") {
sabundFile.close();
rabundFile.close();
}
listFile.close();
if (saveCutoff != cutoff) {
if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
else { saveCutoff = m->roundDist(saveCutoff, precision); }
m->mothurOut("changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();
}
return 0;
}
catch(exception& e) {
m->errorOut(e, "ClusterCommand", "runMothurCluster");
exit(1);
}
}
//**********************************************************************************************************************
void ClusterCommand::printData(string label, map<string, int>& counts){
try {
if (m->isTrue(timing)) {
m->mothurOut("\tTime: " + toString(time(NULL) - start) + "\tsecs for " + toString(oldRAbund.getNumBins())
+ "\tclusters. Updates: " + toString(loops)); m->mothurOutEndLine();
}
print_start = true;
loops = 0;
start = time(NULL);
oldRAbund.setLabel(label);
if (countfile == "") {
oldRAbund.print(rabundFile);
oldRAbund.getSAbundVector().print(sabundFile);
}
if (m->isTrue(showabund)) {
oldRAbund.getSAbundVector().print(cout);
}
oldList.setLabel(label);
if(countfile != "") {
oldList.print(listFile, counts);
}else {
oldList.print(listFile);
}
}
catch(exception& e) {
m->errorOut(e, "ClusterCommand", "printData");
exit(1);
}
}
//**********************************************************************************************************************
int ClusterCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
try {
rabund->setLabel(list->getLabel());
for(int i = 0; i < list->getNumBins(); i++) {
if (m->control_pressed) { break; }
vector<string> binNames;
string bin = list->get(i);
m->splitAtComma(bin, binNames);
int total = 0;
for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]); }
rabund->push_back(total);
}
return 0;
}
catch(exception& e) {
m->errorOut(e, "ClusterCommand", "createRabund");
exit(1);
}
}
//**********************************************************************************************************************