Permalink
Browse files

Merge branch 'precluster_220c' into 1.39.0

  • Loading branch information...
2 parents 3f9606a + d8b2ea8 commit 278f690e8a3bb6da2282a1562e77e0703d1dd445 @mothur-westcott mothur-westcott committed Sep 26, 2016
@@ -3410,7 +3410,7 @@
GCC_ENABLE_SSE3_EXTENSIONS = NO;
GCC_ENABLE_SSE41_EXTENSIONS = NO;
GCC_ENABLE_SSE42_EXTENSIONS = NO;
- GCC_OPTIMIZATION_LEVEL = 0;
+ GCC_OPTIMIZATION_LEVEL = 3;
GCC_PREPROCESSOR_DEFINITIONS = (
"MOTHUR_FILES=\"\\\"/Users/sarahwestcott/desktop/release\\\"\"",
"VERSION=\"\\\"1.36.0\\\"\"",
@@ -3456,7 +3456,7 @@
GCC_C_LANGUAGE_STANDARD = c11;
GCC_GENERATE_DEBUGGING_SYMBOLS = NO;
GCC_MODEL_TUNING = "";
- GCC_OPTIMIZATION_LEVEL = 0;
+ GCC_OPTIMIZATION_LEVEL = 3;
GCC_PREPROCESSOR_DEFINITIONS = (
"VERSION=\"\\\"1.37.6\\\"\"",
"RELEASE_DATE=\"\\\"06/20/2016\\\"\"",
@@ -62,6 +62,18 @@ class Filters {
}
}
+
+ void doVerticalAllBases() {
+
+ for(int i=0;i<alignmentLength;i++){
+ if(gap[i] == numSeqs) { filter[i] = '0'; }
+ else if(a[i] == numSeqs) { filter[i] = '0'; }
+ else if(t[i] == numSeqs) { filter[i] = '0'; }
+ else if(c[i] == numSeqs) { filter[i] = '0'; }
+ else if(g[i] == numSeqs) { filter[i] = '0'; }
+ }
+
+ }
void doTrump(Sequence seq) {
@@ -968,7 +968,7 @@ int GetSeqsCommand::readGroup(){
if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
- in >> name; //read from first column
+ in >> name; m->gobble(in); //read from first column
in >> group; //read from second column

Large diffs are not rendered by default.

Oops, something went wrong.
@@ -10,6 +10,7 @@
#include "preclustercommand.h"
#include "deconvolutecommand.h"
+
//**********************************************************************************************************************
vector<string> PreClusterCommand::setParameters(){
try {
@@ -342,7 +343,6 @@ int PreClusterCommand::execute(){
}else {
if (processors != 1) { m->mothurOut("When using running without group information mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; }
- if (namefile != "") { readNameFile(); }
//reads fasta file and return number of seqs
int numSeqs = readFASTA(); //fills alignSeqs and makes all seqs active
@@ -679,7 +679,9 @@ int PreClusterCommand::process(string newMapFile){
if (alignSeqs[j].active) { //this sequence has not been merged yet
//are you within "diff" bases
- int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned());
+ int mismatch = length;
+ if (method == "unaligned") { mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned()); }
+ else { mismatch = calcMisMatches(alignSeqs[i].filteredSeq, alignSeqs[j].filteredSeq); }
if (mismatch <= diffs) {
//merge
@@ -720,7 +722,9 @@ int PreClusterCommand::process(string newMapFile){
if (originalCount[j] > originalCount[i]) { //this sequence is more abundant than I am
//are you within "diff" bases
- int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned());
+ int mismatch = length;
+ if (method == "unaligned") { mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned()); }
+ else { mismatch = calcMisMatches(alignSeqs[i].filteredSeq, alignSeqs[j].filteredSeq); }
if (mismatch <= diffs) {
//merge
@@ -762,7 +766,10 @@ int PreClusterCommand::process(string newMapFile){
/**************************************************************************************************/
int PreClusterCommand::readFASTA(){
try {
- //ifstream inNames;
+ map<string, string> nameMap;
+ map<string, string>::iterator it;
+ if (namefile != "") { m->readNames(namefile, nameMap); }
+
ifstream inFasta;
m->openInputFile(fastafile, inFasta);
@@ -776,11 +783,12 @@ int PreClusterCommand::readFASTA(){
if (seq.getName() != "") { //can get "" if commented line is at end of fasta file
if (namefile != "") {
- itSize = sizes.find(seq.getName());
+ it = nameMap.find(seq.getName());
- if (itSize == sizes.end()) { m->mothurOut(seq.getName() + " is not in your names file, please correct."); m->mothurOutEndLine(); exit(1); }
+ if (it == nameMap.end()) { m->mothurOut(seq.getName() + " is not in your names file, please correct."); m->mothurOutEndLine(); exit(1); }
else{
- seqPNode tempNode(itSize->second, seq, names[seq.getName()]);
+ string second = it->second;
+ seqPNode tempNode(m->getNumNames(second), seq, second);
alignSeqs.push_back(tempNode);
lengths.insert(seq.getAligned().length());
}
@@ -796,7 +804,7 @@ int PreClusterCommand::readFASTA(){
inFasta.close();
if (lengths.size() > 1) { method = "unaligned"; }
- else if (lengths.size() == 1) { method = "aligned"; }
+ else if (lengths.size() == 1) { method = "aligned"; filterSeqs(); }
length = *(lengths.begin());
@@ -813,7 +821,6 @@ int PreClusterCommand::loadSeqs(map<string, string>& thisName, vector<Sequence>&
try {
set<int> lengths;
alignSeqs.clear();
- map<string, string>::iterator it;
bool error = false;
map<string, int> thisCount;
if (countfile != "") { thisCount = cparser->getCountTable(group); }
@@ -823,7 +830,7 @@ int PreClusterCommand::loadSeqs(map<string, string>& thisName, vector<Sequence>&
if (m->control_pressed) { return 0; }
if (namefile != "") {
- it = thisName.find(thisSeqs[i].getName());
+ map<string, string>::iterator it = thisName.find(thisSeqs[i].getName());
//should never be true since parser checks for this
if (it == thisName.end()) { m->mothurOut(thisSeqs[i].getName() + " is not in your names file, please correct."); m->mothurOutEndLine(); error = true; }
@@ -853,16 +860,16 @@ int PreClusterCommand::loadSeqs(map<string, string>& thisName, vector<Sequence>&
}
}
- if (lengths.size() > 1) { method = "unaligned"; }
- else if (lengths.size() == 1) { method = "aligned"; }
-
length = *(lengths.begin());
+ if (lengths.size() > 1) { method = "unaligned"; }
+ else if (lengths.size() == 1) { method = "aligned"; filterSeqs(); }
+
//sanity check
if (error) { m->control_pressed = true; }
thisSeqs.clear();
-
+
return alignSeqs.size();
}
@@ -1027,30 +1034,39 @@ void PreClusterCommand::printData(string newfasta, string newname, string group)
}
/**************************************************************************************************/
-void PreClusterCommand::readNameFile(){
- try {
- ifstream in;
- m->openInputFile(namefile, in);
- string firstCol, secondCol;
-
- while (!in.eof()) {
- in >> firstCol >> secondCol; m->gobble(in);
-
- m->checkName(firstCol);
- m->checkName(secondCol);
- int size = m->getNumNames(secondCol);
-
- names[firstCol] = secondCol;
- sizes[firstCol] = size;
- }
- in.close();
- }
- catch(exception& e) {
- m->errorOut(e, "PreClusterCommand", "readNameFile");
- exit(1);
- }
+int PreClusterCommand::filterSeqs(){
+ try {
+ string filterString = "";
+ Filters F;
+
+ F.setLength(length);
+ F.initialize();
+ F.setFilter(string(length, '1'));
+
+ for (int i = 0; i < alignSeqs.size(); i++) {
+ F.getFreqs(alignSeqs[i].seq);
+ }
+
+ F.setNumSeqs(alignSeqs.size());
+ F.doVerticalAllBases();
+ filterString = F.getFilter();
+
+ //run filter
+ for (int i = 0; i < alignSeqs.size(); i++) {
+ alignSeqs[i].filteredSeq = "";
+ string align = alignSeqs[i].seq.getAligned();
+ for(int j=0;j<length;j++){
+ if(filterString[j] == '1'){ alignSeqs[i].filteredSeq += align[j]; }
+ }
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "PreClusterCommand", "filterSeqs");
+ exit(1);
+ }
}
-
/**************************************************************************************************/
@@ -21,33 +21,28 @@
#include "needlemanoverlap.hpp"
#include "blastalign.hpp"
#include "noalign.hpp"
-
+#include "filters.h"
/************************************************************/
struct seqPNode {
int numIdentical;
Sequence seq;
+ string filteredSeq;
string names;
bool active;
int diffs;
seqPNode() {}
- seqPNode(int n, Sequence s, string nm) : numIdentical(n), seq(s), names(nm), active(1) { diffs = 0; }
+ seqPNode(int n, Sequence s, string nm) : numIdentical(n), seq(s), names(nm), active(1) { diffs = 0; filteredSeq = "";}
~seqPNode() {}
};
/************************************************************/
inline bool comparePriorityTopDown(seqPNode first, seqPNode second) {
if (first.numIdentical > second.numIdentical) { return true; }
- else if (first.numIdentical == second.numIdentical) {
- if (first.seq.getName() > second.seq.getName()) { return true; }
- }
return false;
}
/************************************************************/
inline bool comparePriorityDownTop(seqPNode first, seqPNode second) {
if (first.numIdentical < second.numIdentical) { return true; }
- else if (first.numIdentical == second.numIdentical) {
- if (first.seq.getName() > second.seq.getName()) { return true; }
- }
return false;
}
//************************************************************/
@@ -83,22 +78,18 @@ class PreClusterCommand : public Command {
bool abort, bygroup, topdown;
string fastafile, namefile, outputDir, groupfile, countfile, method, align;
vector<seqPNode> alignSeqs; //maps the number of identical seqs to a sequence
- map<string, string> names; //represents the names file first column maps to second column
- map<string, int> sizes; //this map a seq name to the number of identical seqs in the names file
- map<string, int>::iterator itSize;
-// map<string, bool> active; //maps sequence name to whether it has already been merged or not.
vector<string> outputNames;
int readFASTA();
void readNameFile();
- //int readNamesFASTA();
int calcMisMatches(string, string);
void printData(string, string, string); //fasta filename, names file name
int process(string);
int loadSeqs(map<string, string>&, vector<Sequence>&, string);
int driverGroups(string, string, string, int, int, vector<string> groups);
int createProcessesGroups(string, string, string, vector<string>);
int mergeGroupCounts(string, string, string);
+ int filterSeqs();
};
/**************************************************************************************************/
@@ -250,10 +241,38 @@ static DWORD WINAPI MyPreclusterThreadFunction(LPVOID lpParam){
}
}
+ length = *(lengths.begin());
+
if (lengths.size() > 1) { pDataArray->method = "unaligned"; }
- else if (lengths.size() == 1) { pDataArray->method = "aligned"; }
+ else if (lengths.size() == 1) {
+ pDataArray->method = "aligned";
+ ////////////////////////////////////////////////////
+ //filterSeqs();
+
+ string filterString = "";
+ Filters F;
+
+ F.setLength(length);
+ F.initialize();
+ F.setFilter(string(length, '1'));
+
+ for (int i = 0; i < alignSeqs.size(); i++) { F.getFreqs(alignSeqs[i].seq);}
+
+ F.setNumSeqs(alignSeqs.size());
+ F.doVerticalAllBases();
+ filterString = F.getFilter();
+
+ //run filter
+ for (int i = 0; i < alignSeqs.size(); i++) {
+ alignSeqs[i].filteredSeq = "";
+ string align = alignSeqs[i].seq.getAligned();
+ for(int j=0;j<length;j++){
+ if(filterString[j] == '1'){ alignSeqs[i].filteredSeq += align[j]; }
+ }
+ }
+ ////////////////////////////////////////////////////
+ }
- length = *(lengths.begin());
//sanity check
if (error) { pDataArray->m->control_pressed = true; }
@@ -323,9 +342,9 @@ static DWORD WINAPI MyPreclusterThreadFunction(LPVOID lpParam){
if (mismatch > pDataArray->diffs) { mismatch = length; break; } //to far to cluster
}
}else {
- for (int k = 0; k < alignSeqs[i].seq.getAligned().length(); k++) {
+ for (int k = 0; k < alignSeqs[i].filteredSeq.length(); k++) {
//do they match
- if (alignSeqs[i].seq.getAligned()[k] != alignSeqs[j].seq.getAligned()[k]) { mismatch++; }
+ if (alignSeqs[i].filteredSeq[k] != alignSeqs[j].filteredSeq[k]) { mismatch++; }
if (mismatch > pDataArray->diffs) { mismatch = length; break; } //to far to cluster
}
}
@@ -396,9 +415,9 @@ static DWORD WINAPI MyPreclusterThreadFunction(LPVOID lpParam){
}
}else {
- for (int k = 0; k < alignSeqs[i].seq.getAligned().length(); k++) {
+ for (int k = 0; k < alignSeqs[i].filteredSeq.length(); k++) {
//do they match
- if (alignSeqs[i].seq.getAligned()[k] != alignSeqs[j].seq.getAligned()[k]) { mismatch++; }
+ if (alignSeqs[i].filteredSeq[k] != alignSeqs[j].filteredSeq[k]) { mismatch++; }
if (mismatch > pDataArray->diffs) { mismatch = length; break; } //to far to cluster
}
}
Oops, something went wrong.

0 comments on commit 278f690

Please sign in to comment.