Skip to content

Commit

Permalink
Adds parent taxon to unclassifieds in classify.seqs
Browse files Browse the repository at this point in the history
  • Loading branch information
mothur-westcott committed Mar 16, 2016
1 parent 22807b6 commit 5615dc0
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 77 deletions.
2 changes: 2 additions & 0 deletions source/classifier/bayesian.cpp
Expand Up @@ -71,6 +71,7 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) {
m->mothurOut("Reading template taxonomy... "); cout.flush();

phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName);
maxLevel = phyloTree->getMaxLevel();

m->mothurOut("DONE."); m->mothurOutEndLine();

Expand Down Expand Up @@ -185,6 +186,7 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) {
delete phyloTree;

phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName);
maxLevel = phyloTree->getMaxLevel();

//save probabilities
if (rdb->save) { rdb->wordGenusProb = wordGenusProb; rdb->WordPairDiffArr = WordPairDiffArr; }
Expand Down
9 changes: 6 additions & 3 deletions source/classifier/classify.cpp
Expand Up @@ -17,7 +17,8 @@

/**************************************************************************************************/
void Classify::generateDatabaseAndNames(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) {
try {
try {
maxLevel = 0;
ReferenceDB* rdb = ReferenceDB::getInstance();

if (tfile == "saved") { tfile = rdb->getSavedTaxonomy(); }
Expand Down Expand Up @@ -168,7 +169,7 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me
}
}
/**************************************************************************************************/
Classify::Classify() { m = MothurOut::getInstance(); database = NULL; phyloTree=NULL; flipped=false; }
Classify::Classify() { m = MothurOut::getInstance(); database = NULL; phyloTree=NULL; flipped=false; maxLevel = 0; }
/**************************************************************************************************/

int Classify::readTaxonomy(string file) {
Expand Down Expand Up @@ -199,9 +200,11 @@ int Classify::readTaxonomy(string file) {
//taxonomy = tempTaxonomy;

phyloTree->assignHeirarchyIDs(0);

maxLevel = phyloTree->getMaxLevel();

phyloTree->setUp(file);

m->mothurOut("DONE.");
m->mothurOutEndLine(); cout.flush();

Expand Down
3 changes: 2 additions & 1 deletion source/classifier/classify.h
Expand Up @@ -31,6 +31,7 @@ class Classify {
virtual bool getFlipped() { return flipped; }
virtual void generateDatabaseAndNames(string, string, string, int, float, float, float, float);
virtual void setDistName(string s) {} //for knn, so if distance method is selected with knn you can create the smallest distance file in the right place.
int getMaxLevel() { return maxLevel; }

protected:

Expand All @@ -42,7 +43,7 @@ class Classify {

string taxFile, templateFile, simpleTax;
vector<string> names;
int threadID, numLevels, numTaxa;
int threadID, numLevels, numTaxa, maxLevel;
bool flip, flipped, shortcuts;

int readTaxonomy(string);
Expand Down
17 changes: 12 additions & 5 deletions source/classifier/phylotree.cpp
Expand Up @@ -43,6 +43,8 @@ PhyloTree::PhyloTree(ifstream& in, string filename){

tree.resize(numNodes);

in >> maxLevel; m->gobble(in);

for (int i = 0; i < tree.size(); i++) {
in >> tree[i].name >> tree[i].level >> tree[i].parent; m->gobble(in);
}
Expand All @@ -59,9 +61,7 @@ PhyloTree::PhyloTree(ifstream& in, string filename){
uniqueTaxonomies.insert(gnode);
totals.push_back(gsize);
}

in.close();

}
catch(exception& e) {
m->errorOut(e, "PhyloTree", "PhyloTree");
Expand Down Expand Up @@ -94,7 +94,7 @@ PhyloTree::PhyloTree(string tfile){
string unknownTax = "unknown;";
//added last taxon until you get desired level
for (int i = 1; i < maxLevel; i++) {
unknownTax += "unclassfied;";
unknownTax += "unknown_unclassfied;";
}

addSeqToTree("unknown", unknownTax);
Expand Down Expand Up @@ -226,8 +226,12 @@ int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){
}

if (seqTaxonomy == "") { uniqueTaxonomies.insert(currentNode); }

}


//save maxLevel for binning the unclassified seqs
if (level > maxLevel) { maxLevel = level; }

return 0;
}
catch(exception& e) {
Expand Down Expand Up @@ -284,7 +288,7 @@ void PhyloTree::assignHeirarchyIDs(int index){
try {
map<string,int>::iterator it;
int counter = 1;
for(it=tree[index].children.begin();it!=tree[index].children.end();it++){

if (m->debug) { m->mothurOut(toString(index) +'\t' + tree[it->second].name +'\n'); }
Expand Down Expand Up @@ -477,6 +481,9 @@ void PhyloTree::printTreeNodes(string treefilename) {

//print treenodes
outTree << tree.size() << endl;

outTree << maxLevel << endl;

for (int i = 0; i < tree.size(); i++) {
outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl;
}
Expand Down
125 changes: 57 additions & 68 deletions source/commands/classifyseqscommand.cpp
Expand Up @@ -717,92 +717,78 @@ int ClassifySeqsCommand::execute(){
m->mothurOut(" Done."); m->mothurOutEndLine();
}

//output taxonomy with the unclassified bins added
ifstream inTax;
m->openInputFile(newTaxonomyFile, inTax);

ofstream outTax;
string unclass = newTaxonomyFile + ".unclass.temp";
m->openOutputFile(unclass, outTax);

//get maxLevel from phylotree so you know how many 'unclassified's to add
int maxLevel = classify->getMaxLevel();

//read taxfile - this reading and rewriting is done to preserve the confidence scores.
string name, taxon;
string group = "";
GroupMap* groupMap = NULL;
CountTable* ct = NULL;
PhyloSummary* taxaSum;

if (hasCount) {
ct = new CountTable();
ct->readTable(countfileNames[s], true, false);
taxaSum = new PhyloSummary(taxonomyFileName, ct, relabund, printlevel);
taxaSum->summarize(tempTaxonomyFile);
taxaSum = new PhyloSummary(ct, relabund, printlevel);
}else {
if (groupfile != "") { group = groupfileNames[s]; groupMap = new GroupMap(group); groupMap->readMap(); }
taxaSum = new PhyloSummary(groupMap, relabund, printlevel);
}

while (!inTax.eof()) {
if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete taxaSum; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; }

taxaSum = new PhyloSummary(taxonomyFileName, groupMap, relabund, printlevel);
inTax >> name >> taxon; m->gobble(inTax);

if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete taxaSum; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; }
string newTax = addUnclassifieds(taxon, maxLevel);

if (namefile == "") { taxaSum->summarize(tempTaxonomyFile); }
else {
ifstream in;
m->openInputFile(tempTaxonomyFile, in);

//read in users taxonomy file and add sequences to tree
string name, taxon;
outTax << name << '\t' << newTax << endl;

if (namefile != "") {
itNames = nameMap.find(name);

while(!in.eof()){
if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete taxaSum; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; }

in >> name >> taxon; m->gobble(in);

itNames = nameMap.find(name);

if (itNames == nameMap.end()) {
m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1);
}else{
for (int i = 0; i < itNames->second.size(); i++) {
taxaSum->addSeqToTree(itNames->second[i], taxon); //add it as many times as there are identical seqs
}
itNames->second.clear();
nameMap.erase(itNames->first);
if (itNames == nameMap.end()) {
m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1);
}else{
for (int i = 0; i < itNames->second.size(); i++) {
taxaSum->addSeqToTree(itNames->second[i], newTax); //add it as many times as there are identical seqs
}
itNames->second.clear();
nameMap.erase(itNames->first);
}
in.close();
}else {
taxaSum->addSeqToTree(name, newTax);
}
}
m->mothurRemove(tempTaxonomyFile);

if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; }

//print summary file
ofstream outTaxTree;
m->openOutputFile(taxSummary, outTaxTree);
taxaSum->print(outTaxTree, output);
outTaxTree.close();

//output taxonomy with the unclassified bins added
ifstream inTax;
m->openInputFile(newTaxonomyFile, inTax);

ofstream outTax;
string unclass = newTaxonomyFile + ".unclass.temp";
m->openOutputFile(unclass, outTax);

//get maxLevel from phylotree so you know how many 'unclassified's to add
int maxLevel = taxaSum->getMaxLevel();

//read taxfile - this reading and rewriting is done to preserve the confidence scores.
string name, taxon;
while (!inTax.eof()) {
if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete taxaSum; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } m->mothurRemove(unclass); delete classify; return 0; }

inTax >> name >> taxon; m->gobble(inTax);

string newTax = addUnclassifieds(taxon, maxLevel);

outTax << name << '\t' << newTax << endl;
}
inTax.close();
outTax.close();

inTax.close();
outTax.close();

m->mothurRemove(newTaxonomyFile);
rename(unclass.c_str(), newTaxonomyFile.c_str());

if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; }

//print summary file
ofstream outTaxTree;
m->openOutputFile(taxSummary, outTaxTree);
taxaSum->print(outTaxTree, output);
outTaxTree.close();

if (ct != NULL) { delete ct; }
if (groupMap != NULL) { delete groupMap; } delete taxaSum;
m->mothurRemove(newTaxonomyFile);
rename(unclass.c_str(), newTaxonomyFile.c_str());

m->mothurOutEndLine();
m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create the summary file for " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
m->mothurRemove(tempTaxonomyFile);

m->mothurOutEndLine();
m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create the summary file for " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();

}
delete classify;
Expand Down Expand Up @@ -850,9 +836,12 @@ string ClassifySeqsCommand::addUnclassifieds(string tax, int maxlevel) {
level++;
}

m->removeConfidences(taxon);
string cTax = taxon.substr(0, taxon.length()-1) + "_unclassified;";

//add "unclassified" until you reach maxLevel
while (level < maxlevel) {
newTax += "unclassified;";
newTax += cTax;
level++;
}

Expand Down

0 comments on commit 5615dc0

Please sign in to comment.