Permalink
Browse files

Adds group selection to countParser

  • Loading branch information...
1 parent 924c670 commit 989ed99ea005cd8a8d8b31304d48fbd3d570f9b7 @mothur-westcott mothur-westcott committed Oct 3, 2016
@@ -520,7 +520,8 @@ int ChimeraPerseusCommand::execute(){
ct->readTable(nameFile, true, false);
if (ct->hasGroupInfo()) {
- cparser = new SequenceCountParser(fastaFileNames[s], *ct);
+ vector<string> temp;
+ cparser = new SequenceCountParser(fastaFileNames[s], *ct, temp);
variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(nameFile));
newCountFile = getOutputFileName("count", variables);
@@ -1088,7 +1088,8 @@ int ChimeraSlayerCommand::setUpForSelfReference(SequenceCountParser*& parser, ma
fileGroup[fastaFileNames[s]] = "noGroup";
}else {
//Parse sequences by group
- parser = new SequenceCountParser(nameFile, fastaFileNames[s]);
+ vector<string> temp;
+ parser = new SequenceCountParser(nameFile, fastaFileNames[s], temp);
vector<string> groups = parser->getNamesOfGroups();
for (int i = 0; i < groups.size(); i++) {
@@ -690,7 +690,8 @@ int ChimeraUchimeCommand::execute(){
vector<string> groups;
map<string, string> uniqueNames;
if (hasCount) {
- cparser = new SequenceCountParser(nameFile, fastaFileNames[s]);
+ vector<string> temp;
+ cparser = new SequenceCountParser(nameFile, fastaFileNames[s], temp);
groups = cparser->getNamesOfGroups();
uniqueNames = cparser->getAllSeqsMap();
}else{
@@ -643,7 +643,8 @@ int ChimeraVsearchCommand::execute(){
vector<string> groups;
map<string, string> uniqueNames;
if (hasCount) {
- cparser = new SequenceCountParser(nameFile, fastaFileNames[s]);
+ vector<string> temp;
+ cparser = new SequenceCountParser(nameFile, fastaFileNames[s], temp);
groups = cparser->getNamesOfGroups();
uniqueNames = cparser->getAllSeqsMap();
}else{
@@ -299,25 +299,11 @@ int PreClusterCommand::execute(){
ofstream outNames; m->openOutputFile(newNamesFile, outNames); outNames.close();
newMapFile = fileroot + "precluster.";
- //parse fasta and name file by group
- vector<string> groups;
- if (countfile != "") {
- cparser = new SequenceCountParser(countfile, fastafile);
- groups = cparser->getNamesOfGroups();
- }else {
- if (namefile != "") { parser = new SequenceParser(groupfile, fastafile, namefile); }
- else { parser = new SequenceParser(groupfile, fastafile); }
- groups = parser->getNamesOfGroups();
- }
-
- if(processors == 1) { driverGroups(newFastaFile, newNamesFile, newMapFile, 0, groups.size(), groups); }
- else { createProcessesGroups(newFastaFile, newNamesFile, newMapFile, groups); }
+ createProcessesGroups(newFastaFile, newNamesFile, newMapFile);
if (countfile != "") {
mergeGroupCounts(newCountFile, newNamesFile, newFastaFile);
- delete cparser;
- }else {
- delete parser;
+ }else {
//run unique.seqs for deconvolute results
string inputString = "fasta=" + newFastaFile;
if (namefile != "") { inputString += ", name=" + newNamesFile; }
@@ -400,14 +386,19 @@ int PreClusterCommand::execute(){
}
}
/**************************************************************************************************/
-int PreClusterCommand::createProcessesGroups(string newFName, string newNName, string newMFile, vector<string> groups) {
+int PreClusterCommand::createProcessesGroups(string newFName, string newNName, string newMFile) {
try {
vector<int> processIDS;
int process = 1;
int num = 0;
bool recalc = false;
+ //parse fasta and name file by group
+ vector<string> groups;
+ if (countfile != "") { CountTable ct; ct.testGroups(countfile, groups); }
+ else { GroupMap gp; gp.readMap(groupfile); groups = gp.getNamesOfGroups(); }
+
//sanity check
if (groups.size() < processors) { processors = groups.size(); }
@@ -603,15 +594,25 @@ int PreClusterCommand::createProcessesGroups(string newFName, string newNName, s
/**************************************************************************************************/
int PreClusterCommand::driverGroups(string newFFile, string newNFile, string newMFile, int start, int end, vector<string> groups){
try {
-
+ vector<string> subsetGroups;
+ for (int i = start; i < end; i++) { subsetGroups.push_back(groups[i]); }
+
+ //parse fasta and name file by group
+ if (countfile != "") {
+ cparser = new SequenceCountParser(countfile, fastafile, subsetGroups);
+ }else {
+ if (namefile != "") { parser = new SequenceParser(groupfile, fastafile, namefile); }
+ else { parser = new SequenceParser(groupfile, fastafile); }
+ }
+
int numSeqs = 0;
//precluster each group
for (int i = start; i < end; i++) {
start = time(NULL);
- if (m->control_pressed) { return 0; }
+ if (m->control_pressed) { if (countfile != "") { delete cparser; }else { delete parser; } return 0; }
m->mothurOutEndLine(); m->mothurOut("Processing group " + groups[i] + ":"); m->mothurOutEndLine();
@@ -644,6 +645,8 @@ int PreClusterCommand::driverGroups(string newFFile, string newNFile, string new
}
+ if (countfile != "") { delete cparser; }else { delete parser; }
+
return numSeqs;
}
catch(exception& e) {
@@ -92,8 +92,8 @@ class PreClusterCommand : public Command {
void printData(string, string, string); //fasta filename, names file name
int process(string);
int loadSeqs(map<string, string>&, vector<Sequence>&, string);
- int driverGroups(string, string, string, int, int, vector<string> groups);
- int createProcessesGroups(string, string, string, vector<string>);
+ int driverGroups(string, string, string, int, int, vector<string>);
+ int createProcessesGroups(string, string, string);
int mergeGroupCounts(string, string, string);
int filterSeqs();
};
@@ -9,100 +9,87 @@
#include "sequencecountparser.h"
/************************************************************/
-SequenceCountParser::SequenceCountParser(string countfile, string fastafile) {
+SequenceCountParser::SequenceCountParser(string countfile, string fastafile, vector<string> groupsSelected) {
try {
m = MothurOut::getInstance();
//read count file
CountTable countTable;
countTable.readTable(countfile, true, false);
+ vector<string> allNames = countTable.getNamesOfGroups();
//initialize maps
- namesOfGroups = countTable.getNamesOfGroups();
+ if (groupsSelected.size() == 0) { //select all
+ namesOfGroups = countTable.getNamesOfGroups();
+ for (int i = 0; i < allNames.size(); i++) { indexes.push_back(i); }
+ }else{
+ namesOfGroups = groupsSelected;
+ map<string, int> temp;
+ for (int i = 0; i < allNames.size(); i++) {
+ for (int j = 0; j < groupsSelected.size(); j++) {
+ if (allNames[i] == groupsSelected[j]) {
+ temp[groupsSelected[j]] = i;
+ }
+ }
+ }
+ for (map<string, int>::iterator it = temp.begin(); it != temp.end(); it++) {
+ indexes.push_back(it->second);
+ }
+ }
+
for (int i = 0; i < namesOfGroups.size(); i++) {
vector<Sequence> temp;
map<string, int> tempMap;
seqs[namesOfGroups[i]] = temp;
countTablePerGroup[namesOfGroups[i]] = tempMap;
}
-
- //read fasta file making sure each sequence is in the group file
- ifstream in;
- m->openInputFile(fastafile, in);
-
- //int fastaCount = 0;
- while (!in.eof()) {
-
- if (m->control_pressed) { break; }
-
- Sequence seq(in); m->gobble(in);
- //fastaCount++;
- //if (m->debug) { if((fastaCount) % 1000 == 0){ m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n."); } }
-
- if (seq.getName() != "") {
-
- //allSeqsMap[seq.getName()] = seq.getName();
- vector<int> groupCounts = countTable.getGroupCounts(seq.getName());
-
- for (int i = 0; i < namesOfGroups.size(); i++) {
- if (groupCounts[i] != 0) {
- seqs[namesOfGroups[i]].push_back(seq);
- countTablePerGroup[namesOfGroups[i]][seq.getName()] = groupCounts[i];
- }
- }
- }
- }
- in.close();
- }
+
+ readFasta(fastafile, countTable);
+ }
catch(exception& e) {
m->errorOut(e, "SequenceCountParser", "SequenceCountParser");
exit(1);
}
}
/************************************************************/
-SequenceCountParser::SequenceCountParser(string fastafile, CountTable& countTable) {
+SequenceCountParser::SequenceCountParser(string fastafile, CountTable& countTable, vector<string> groupsSelected) {
try {
m = MothurOut::getInstance();
//initialize maps
if (countTable.hasGroupInfo()) {
- namesOfGroups = countTable.getNamesOfGroups();
+ vector<string> allNames = countTable.getNamesOfGroups();
+
+ //initialize maps
+ if (groupsSelected.size() == 0) { //select all
+ namesOfGroups = countTable.getNamesOfGroups();
+ for (int i = 0; i < allNames.size(); i++) { indexes.push_back(i); }
+ }else{
+ namesOfGroups = groupsSelected;
+ map<string, int> temp;
+ for (int i = 0; i < allNames.size(); i++) {
+ for (int j = 0; j < groupsSelected.size(); j++) {
+ if (allNames[i] == groupsSelected[j]) {
+ temp[groupsSelected[j]] = i;
+ }
+ }
+ }
+ for (map<string, int>::iterator it = temp.begin(); it != temp.end(); it++) {
+ indexes.push_back(it->second);
+ }
+ }
+
for (int i = 0; i < namesOfGroups.size(); i++) {
vector<Sequence> temp;
map<string, int> tempMap;
seqs[namesOfGroups[i]] = temp;
countTablePerGroup[namesOfGroups[i]] = tempMap;
}
- //read fasta file making sure each sequence is in the group file
- ifstream in;
- m->openInputFile(fastafile, in);
+ readFasta(fastafile, countTable);
- int fastaCount = 0;
- while (!in.eof()) {
-
- if (m->control_pressed) { break; }
-
- Sequence seq(in); m->gobble(in);
- fastaCount++;
- if (m->debug) { if((fastaCount) % 1000 == 0){ m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n."); } }
-
- if (seq.getName() != "") {
-
- //allSeqsMap[seq.getName()] = seq.getName();
- vector<int> groupCounts = countTable.getGroupCounts(seq.getName());
-
- for (int i = 0; i < namesOfGroups.size(); i++) {
- if (groupCounts[i] != 0) {
- seqs[namesOfGroups[i]].push_back(seq);
- countTablePerGroup[namesOfGroups[i]][seq.getName()] = groupCounts[i];
- }
- }
- }
- }
- in.close();
}else { m->control_pressed = true; m->mothurOut("[ERROR]: cannot parse fasta file by group with a count table that does not include group data, please correct.\n"); }
}
@@ -112,6 +99,45 @@ SequenceCountParser::SequenceCountParser(string fastafile, CountTable& countTabl
}
}
/************************************************************/
+int SequenceCountParser::readFasta(string fastafile, CountTable& countTable) {
+ try {
+
+
+ ifstream in;
+ m->openInputFile(fastafile, in);
+
+ int fastaCount = 0;
+ while (!in.eof()) {
+
+ if (m->control_pressed) { break; }
+
+ Sequence seq(in); m->gobble(in);
+ fastaCount++;
+ if (m->debug) { if((fastaCount) % 1000 == 0){ m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n."); } }
+
+ if (seq.getName() != "") {
+
+ //allSeqsMap[seq.getName()] = seq.getName();
+ vector<int> groupCounts = countTable.getGroupCounts(seq.getName());
+
+ for (int i = 0; i < namesOfGroups.size(); i++) {
+ if (groupCounts[indexes[i]] != 0) {
+ seqs[namesOfGroups[i]].push_back(seq);
+ countTablePerGroup[namesOfGroups[i]][seq.getName()] = groupCounts[i];
+ }
+ }
+ }
+ }
+ in.close();
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SequenceCountParser", "readFasta");
+ exit(1);
+ }
+}
+/************************************************************/
SequenceCountParser::~SequenceCountParser(){ }
/************************************************************/
int SequenceCountParser::getNumGroups(){ return namesOfGroups.size(); }
@@ -26,8 +26,8 @@ class SequenceCountParser {
public:
- SequenceCountParser(string, string); //count, fasta - file mismatches will set m->control_pressed = true
- SequenceCountParser(string, CountTable&); //fasta, counttable - file mismatches will set m->control_pressed = true
+ SequenceCountParser(string, string, vector<string>); //count, fasta - file mismatches will set m->control_pressed = true
+ SequenceCountParser(string, CountTable&, vector<string>); //fasta, counttable - file mismatches will set m->control_pressed = true
~SequenceCountParser();
//general operations
@@ -48,10 +48,12 @@ class SequenceCountParser {
MothurOut* m;
int numSeqs;
+ vector<int> indexes;
//map<string, string> allSeqsMap;
map<string, vector<Sequence> > seqs; //a vector for each group
map<string, map<string, int> > countTablePerGroup; //countTable for each group
vector<string> namesOfGroups;
+ int readFasta(string fastafile, CountTable&);
};

0 comments on commit 989ed99

Please sign in to comment.