Skip to content

Commit

Permalink
Adds removechimeras parameter to chimera.vsearch
Browse files Browse the repository at this point in the history
Default=t, meaning auto remove chimeras found in input files.

#795 #792
  • Loading branch information
mothur-westcott committed Oct 25, 2021
1 parent 91b2fff commit 30da436
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 3 deletions.
62 changes: 60 additions & 2 deletions source/commands/chimeravsearchcommand.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "sequence.hpp"
#include "systemcommand.h"
#include "degapseqscommand.h"
#include "removeseqscommand.h"

//**********************************************************************************************************************
vector<string> ChimeraVsearchCommand::setParameters(){
Expand All @@ -26,6 +27,7 @@ vector<string> ChimeraVsearchCommand::setParameters(){
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
CommandParameter pabskew("abskew", "Number", "", "1.9", "", "", "","",false,false); parameters.push_back(pabskew);
CommandParameter pchimealns("uchimealns", "Boolean", "", "F", "", "", "","alns",false,false); parameters.push_back(pchimealns);
CommandParameter premovechimeras("removechimeras", "Boolean", "", "t", "", "", "","alns",false,false); parameters.push_back(premovechimeras);
CommandParameter pminh("minh", "Number", "", "0.28", "", "", "","",false,false); parameters.push_back(pminh);
CommandParameter pmindiv("mindiv", "Number", "", "0.8", "", "", "","",false,false); parameters.push_back(pmindiv);
CommandParameter pxn("xn", "Number", "", "8.0", "", "", "","",false,false); parameters.push_back(pxn);
Expand All @@ -39,6 +41,7 @@ vector<string> ChimeraVsearchCommand::setParameters(){
vector<string> tempOutNames;
outputTypes["chimera"] = tempOutNames;
outputTypes["accnos"] = tempOutNames;
outputTypes["fasta"] = tempOutNames;
outputTypes["alns"] = tempOutNames;
outputTypes["count"] = tempOutNames;

Expand All @@ -57,7 +60,7 @@ string ChimeraVsearchCommand::getHelpString(){
string helpString = "";
helpString += "The chimera.vsearch command reads a fastafile and referencefile and outputs potentially chimeric sequences.\n";
helpString += "This command is a wrapper for vsearch https://github.com/torognes/vsearch.\n";
helpString += "The chimera.vsearch command parameters are fasta, name, count, reference, processors, dereplicate, abskew, uchimealns, minh, mindiv, xn, dn, mindiffs.\n";
helpString += "The chimera.vsearch command parameters are fasta, name, count, reference, processors, dereplicate, removechimeras, abskew, uchimealns, minh, mindiv, xn, dn, mindiffs.\n";
helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n";
helpString += "The name parameter allows you to provide a name file, if you are using template=self. \n";
helpString += "The count parameter allows you to provide a count file, if you are using template=self. When you use a count file with group info and dereplicate=T, mothur will create a *.pick.count_table file containing seqeunces after chimeras are removed. \n";
Expand All @@ -67,6 +70,7 @@ string ChimeraVsearchCommand::getHelpString(){
helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n";
helpString += "The abskew parameter can only be used with template=self. Minimum abundance skew. Default 1.9. Abundance skew is: min [ abund(parent1), abund(parent2) ] / abund(query).\n";
helpString += "The uchimealns parameter allows you to indicate you would like a file containing multiple alignments of query sequences to parents in human readable format. Alignments show columns with differences that support or contradict a chimeric model.\n";
helpString += "The removechimeras parameter allows you to indicate you would like to automatically remove the sequences that are flagged as chimeric. Default=t.\n";
helpString += "The minh parameter - mininum score to report chimera. Default 0.3. Values from 0.1 to 5 might be reasonable. Lower values increase sensitivity but may report more false positives. If you decrease xn you may need to increase minh, and vice versa.\n";
helpString += "The mindiv parameter - minimum divergence ratio, default 0.5. Div ratio is 100%% - %%identity between query sequence and the closest candidate for being a parent. If you don't care about very close chimeras, then you could increase mindiv to, say, 1.0 or 2.0, and also decrease minh, say to 0.1, to increase sensitivity. How well this works will depend on your data. Best is to tune parameters on a good benchmark.\n";
helpString += "The xn parameter - weight of a no vote. Default 8.0. Decreasing this weight to around 3 or 4 may give better performance on denoised data.\n";
Expand Down Expand Up @@ -113,8 +117,9 @@ string ChimeraVsearchCommand::getOutputPattern(string type) {

if (type == "chimera") { pattern = "[filename],[tag],vsearch.chimeras"; }
else if (type == "accnos") { pattern = "[filename],[tag],vsearch.accnos"; }
else if (type == "fasta") { pattern = "[filename],[tag],vsearch.fasta"; }
else if (type == "alns") { pattern = "[filename],[tag],vsearch.alns"; }
else if (type == "count") { pattern = "[filename],[tag],vsearch.pick.count_table-[filename],count_table"; }
else if (type == "count") { pattern = "[filename],[tag],vsearch.count_table-[filename],count_table"; }
else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->setControl_pressed(true); }

return pattern;
Expand Down Expand Up @@ -197,6 +202,9 @@ ChimeraVsearchCommand::ChimeraVsearchCommand(string option) : Command() {
temp = validParameter.valid(parameters, "chimealns"); if (temp == "not found") { temp = "f"; }
chimealns = util.isTrue(temp);

temp = validParameter.valid(parameters, "removechimeras"); if (temp == "not found") { temp = "t"; }
removeChimeras = util.isTrue(temp);

minh = validParameter.valid(parameters, "minh"); if (minh == "not found") { useMinH = false; minh = "0.28"; } else{ useMinH = true; }
mindiv = validParameter.valid(parameters, "mindiv"); if (mindiv == "not found") { useMindiv = false; mindiv = "0.8"; } else{ useMindiv = true; }
xn = validParameter.valid(parameters, "xn"); if (xn == "not found") { useXn = false; xn = "8.0"; } else{ useXn = true; }
Expand Down Expand Up @@ -626,6 +634,51 @@ int ChimeraVsearchCommand::execute(){
outputNames.push_back(accnosFileName); outputTypes["accnos"].push_back(accnosFileName);
if (chimealns) { outputNames.push_back(alnsFileName); outputTypes["alns"].push_back(alnsFileName); }

if (removeChimeras) {
if (!util.isBlank(accnosFileName)) {
m->mothurOut("\nRemoving chimeras from your input files:\n");

string inputString = "fasta=" + fastafile + ", accnos=" + accnosFileName;
if ((countfile != "") && (!dups)) { inputString += ", count=" + countfile; }

m->mothurOut("/******************************************/\n");
m->mothurOut("Running command: remove.seqs(" + inputString + ")\n");
current->setMothurCalling(true);

Command* removeCommand = new RemoveSeqsCommand(inputString);
removeCommand->execute();

map<string, vector<string> > filenames = removeCommand->getOutputFiles();

delete removeCommand;
current->setMothurCalling(false);

if (countfile != "") {
if (!dups) { //dereplicate=f, so remove sequences where any sample found the reads to be chimeric
map<string, string> variables;
variables["[filename]"] = outputdir + util.getRootName(util.getSimpleName(countfile));
variables["[tag]"] = "denovo";
if (templatefile != "self") { variables["[tag]"] = "ref"; }
string currentName = getOutputFileName("count", variables);

util.renameFile(filenames["count"][0], currentName);
util.mothurRemove(filenames["count"][0]);
outputNames.push_back(currentName); outputTypes["count"].push_back(currentName);
}//else, mothur created a modified count file removing chimeras by sample. No need to include count file on remove.seqs command. Deconvolute function created modified count table already
}

map<string, string> variables;
variables["[filename]"] = outputdir + util.getRootName(util.getSimpleName(fastafile));
variables["[tag]"] = "denovo";
if (templatefile != "self") { variables["[tag]"] = "ref"; }
string currentName = getOutputFileName("fasta", variables);

util.renameFile(filenames["fasta"][0], currentName);
util.mothurRemove(filenames["fasta"][0]);

outputNames.push_back(currentName); outputTypes["fasta"].push_back(currentName);
}else { m->mothurOut("\nNo chimeras found, skipping remove.seqs.\n"); }
}

//set accnos file as new current accnosfile
string currentName = "";
Expand All @@ -639,6 +692,11 @@ int ChimeraVsearchCommand::execute(){
if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setCountFile(currentName); }
}

itTypes = outputTypes.find("fasta");
if (itTypes != outputTypes.end()) {
if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setFastaFile(currentName); }
}

m->mothurOut("\nOutput File Names:\n");
for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
m->mothurOutEndLine();
Expand Down
2 changes: 1 addition & 1 deletion source/commands/chimeravsearchcommand.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ class ChimeraVsearchCommand : public Command {
void help() { m->mothurOut(getHelpString()); }

private:
bool abort, useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, ucl, useMindiffs, hasCount, dups;
bool abort, useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, ucl, useMindiffs, hasCount, dups, removeChimeras;
string fastafile, templatefile, countfile, abskew, minh, mindiv, xn, dn, mindiffs, vsearchLocation;
int processors;
vsearchVariables* vars;
Expand Down

0 comments on commit 30da436

Please sign in to comment.