Permalink
Browse files

Failed attempt to make phylip read faster

  • Loading branch information...
1 parent 20a419b commit a36a23591cefa90a1798eea08e55a175f1519e87 @mothur-westcott mothur-westcott committed Nov 30, 2016
Showing with 127 additions and 100 deletions.
  1. +2 −6 source/commands/clustercommand.cpp
  2. +96 −94 source/datastructures/optimatrix.cpp
  3. +28 −0 source/mothurout.cpp
  4. +1 −0 source/mothurout.h
@@ -308,10 +308,6 @@ ClusterCommand::ClusterCommand(string option) {
m->mothurOut("[WARNING]: You can only use the processors option when using the agc or dgc clustering methods. Using 1 processor.\n.");
}
- temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
- m->setProcessors(temp);
- m->mothurConvert(temp, processors);
-
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#else
if ((method == "agc") || (method == "dgc")) { m->mothurOut("[ERROR]: The agc and dgc clustering methods are not available for Windows, aborting\n."); abort = true; }
@@ -856,11 +852,11 @@ int ClusterCommand::runOptiCluster(){
string distfile = columnfile;
if (format == "phylip") { distfile = phylipfile; }
- int rstart = time(NULL);
+ //int rstart = time(NULL);
OptiMatrix matrix(distfile, thisNamefile, nameOrCount, format, cutoff, false);
- m->mothurOut("It took " + toString(time(NULL) - rstart) + " seconds to read and process matrix"); m->mothurOutEndLine();
+ //m->mothurOut("It took " + toString(time(NULL) - rstart) + " seconds to read and process matrix"); m->mothurOutEndLine();
OptiCluster cluster(&matrix, metric, 0);
tag = cluster.getTag();
@@ -139,105 +139,147 @@ int OptiMatrix::readPhylip(){
float distance;
int square, nseqs;
string name;
- int count = 0;
ifstream fileHandle;
string numTest;
m->openInputFile(distFile, fileHandle);
fileHandle >> numTest >> name;
+ nameMap.push_back(name);
if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); }
else { convert(numTest, nseqs); }
- closeness.resize(nseqs);
-
- //map shorten name to real name - space saver
- nameMap.push_back(name);
-
//square test
char d;
while((d=fileHandle.get()) != EOF){
+ if(isalnum(d)){ square = 1; fileHandle.putback(d); for(int i=0;i<nseqs;i++){ fileHandle >> distance; } break; }
+ if(d == '\n'){ square = 0; break; }
+ }
+
+ map<int, int> singletonIndexSwap;
+ vector<bool> singleton; singleton.resize(nseqs, true);
+ ///////////////////// Read to eliminate singletons ///////////////////////
+ if(square == 0){
- if(isalnum(d)){
- square = 1;
- fileHandle.putback(d);
- for(int i=0;i<nseqs;i++){
+ for(int i=1;i<nseqs;i++){
+ if (m->control_pressed) { fileHandle.close(); return 0; }
+
+ fileHandle >> name; nameMap.push_back(name);
+
+ for(int j=0;j<i;j++){
+
fileHandle >> distance;
+
+ if (distance == -1) { distance = 1000000; } else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
+
+ if(distance < cutoff){
+ singleton[i] = false;
+ singleton[j] = false;
+ singletonIndexSwap[i] = i;
+ singletonIndexSwap[j] = j;
+
+ }
}
- break;
}
- if(d == '\n'){
- square = 0;
- break;
+ }else{
+ for(int i=1;i<nseqs;i++){
+ if (m->control_pressed) { fileHandle.close(); return 0; }
+
+ fileHandle >> name; nameMap.push_back(name);
+
+ for(int j=0;j<nseqs;j++){
+ fileHandle >> distance;
+
+ if (distance == -1) { distance = 1000000; } else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
+
+ if(distance < cutoff && j < i){
+ singleton[i] = false;
+ singleton[j] = false;
+ singletonIndexSwap[i] = i;
+ singletonIndexSwap[j] = j;
+ }
+ }
}
}
+ fileHandle.close();
+ //////////////////////////////////////////////////////////////////////////
+
+ int nonSingletonCount = 0;
+ for (int i = 0; i < singleton.size(); i++) {
+ if (!singleton[i]) { //if you are NOT a singleton (at least one of your distances is below the cutoff)
+ singletonIndexSwap[i] = nonSingletonCount;
+ nonSingletonCount++;
+ }else { singletons.push_back(nameMap[i]); }
+ }
+ singleton.clear();
+
+ closeness.resize(nonSingletonCount);
Progress* reading;
-
+ ifstream in;
+
+ m->openInputFile(distFile, in);
+ in >> nseqs >> name;
+
+ string line = "";
if(square == 0){
reading = new Progress("Reading matrix: ", nseqs * (nseqs - 1) / 2);
-
int index = 0;
for(int i=1;i<nseqs;i++){
- if (m->control_pressed) { fileHandle.close(); delete reading; return 0; }
+ if (m->control_pressed) { in.close(); delete reading; return 0; }
- fileHandle >> name;
- nameMap.push_back(name);
+ in >> name; line = m->getline(in); m->gobble(in);
+ vector<float> dists; m->splitWhiteSpace(line, dists, i);
for(int j=0;j<i;j++){
- if (m->control_pressed) { delete reading; fileHandle.close(); return 0; }
+ //in >> distance;
+ distance = dists[j];
- fileHandle >> distance;
-
- if (distance == -1) { distance = 1000000; }
- else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
+ if (distance == -1) { distance = 1000000; } else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
if(distance < cutoff){
- closeness[j].insert(i);
- closeness[i].insert(j);
+ int newB = singletonIndexSwap[j];
+ int newA = singletonIndexSwap[i];
+ closeness[newA].insert(newB);
+ closeness[newB].insert(newA);
}
- index++;
- reading->update(index);
+ index++; reading->update(index);
}
}
- }
- else{
-
+ }else{
reading = new Progress("Reading matrix: ", nseqs * nseqs);
-
int index = nseqs;
for(int i=1;i<nseqs;i++){
- fileHandle >> name;
-
- nameMap.push_back(name);
+ if (m->control_pressed) { in.close(); delete reading; return 0; }
- //list->push_back(toString(i));
+ in >> name;
for(int j=0;j<nseqs;j++){
- fileHandle >> distance;
-
- if (m->control_pressed) { fileHandle.close(); delete reading; return 0; }
-
- if (distance == -1) { distance = 1000000; }
- else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
+ in >> distance;
+
+ if (distance == -1) { distance = 1000000; } else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
if(distance < cutoff && j < i){
- closeness[j].insert(i);
- closeness[i].insert(j);
+ int newB = singletonIndexSwap[j];
+ int newA = singletonIndexSwap[i];
+ closeness[newA].insert(newB);
+ closeness[newB].insert(newA);
}
- index++;
- reading->update(index);
+ index++; reading->update(index);
}
}
}
-
- map<string, string> names;
+ in.close();
+ reading->finish();
+ delete reading;
+
if (namefile != "") {
+ map<string, string> names;
m->readNames(namefile, names);
//update nameMap
for (int i = 0; i < nameMap.size(); i++) {
@@ -247,49 +289,12 @@ int OptiMatrix::readPhylip(){
names.clear();
}
- map<int, int> singletonIndexSwap;
- for (int i = 0; i < closeness.size(); i++) { singletonIndexSwap[i] = i; }
- for (int i = 0; i < closeness.size(); i++) {
- if (closeness[i].size() == 0) {
- singletons.push_back(nameMap[i]);
-
- //update indexSwap
- for (map<int, int>::iterator it = singletonIndexSwap.begin(); it != singletonIndexSwap.end(); it++) { if (it->first > i) { it->second++; } }
- }
-
- }
-
- //update matrix indexes
for (int i = 0; i < closeness.size(); i++) {
- set<int> newIndexes;
- for (set<int>::iterator it = closeness[i].begin(); it != closeness[i].end(); it++) { newIndexes.insert(singletonIndexSwap[*it]); }
- closeness[i] = newIndexes;
- }
-
- for (int i = 0; i < closeness.size(); i++) {
- //if you are a singleton we don't include you in the matrix. You are added to the list later
- if (closeness[i].size() == 0) {
-
- //remove row from
- closeness.erase (closeness.begin()+i);
- }else {
- //update namemap
- string newName = nameMap[i];
- int newIndex = singletonIndexSwap[i];
- nameMap[newIndex] = newName;
- }
+ string newName = nameMap[i];
+ int newIndex = singletonIndexSwap[i];
+ nameMap[newIndex] = newName;
}
- return 1;
-
- if (m->control_pressed) { fileHandle.close(); delete reading; return 0; }
-
- reading->finish();
- delete reading;
-
- //list->setLabel("0");
- fileHandle.close();
-
return 0;
}
@@ -347,10 +352,9 @@ int OptiMatrix::readColumn(){
singletonIndexSwap[indexA] = indexA;
singletonIndexSwap[indexB] = indexB;
}
-
-
}
fileHandle.close();
+ //////////////////////////////////////////////////////////////////////////
int nonSingletonCount = 0;
for (int i = 0; i < singleton.size(); i++) {
@@ -359,7 +363,8 @@ int OptiMatrix::readColumn(){
nonSingletonCount++;
}else { singletons.push_back(nameMap[i]); }
}
- //////////////////////////////////////////////////////////////////////////
+ singleton.clear();
+
ifstream in;
m->openInputFile(distFile, in);
@@ -410,9 +415,6 @@ int OptiMatrix::readColumn(){
for (int i = 0; i < closeness.size(); i++) {
-
- if (m->control_pressed) { break; }
-
string newName = nameMap[i];
int newIndex = singletonIndexSwap[i];
nameMap[newIndex] = newName;
View
@@ -2419,6 +2419,34 @@ vector<string> MothurOut::splitWhiteSpace(string input){
}
}
/***********************************************************************/
+// Parses the leading whitespace-separated values of one text line into floats.
+//   input  - the line to split; values are separated by one or more whitespace characters
+//   pieces - cleared on entry, then filled with each value converted by mothurConvert
+//   index  - stop threshold: after the whitespace run that follows a value, parsing stops
+//            as soon as more than `index` values are stored, so for index >= 0 at most
+//            index+1 values are returned
+// Always returns 0. If an exception is thrown it is reported through errorOut and the program exits.
+int MothurOut::splitWhiteSpace(string input, vector<float>& pieces, int index){
+    try {
+        pieces.clear();
+
+        const size_t len = input.length();
+
+        //a line of len characters holds at most (len+1)/2 values, and we never keep more than index+1
+        if (index >= 0) {
+            const size_t wanted = static_cast<size_t>(index) + 1;
+            const size_t possible = (len + 1) / 2;
+            pieces.reserve((wanted < possible) ? wanted : possible);
+        }
+
+        size_t pos = 0;
+        int count = 0;
+
+        while (pos < len) {
+            //collect the run of non-whitespace characters starting at pos, if there is one
+            //isspace requires a value representable as unsigned char, so cast before calling it
+            const size_t start = pos;
+            while ((pos < len) && !isspace(static_cast<unsigned char>(input[pos]))) { pos++; }
+
+            if (pos > start) {
+                string token = input.substr(start, pos - start);
+                float tdist = 0; mothurConvert(token, tdist);
+                pieces.push_back(tdist); count++;
+            }
+
+            if (pos >= len) { break; } //the value ran to the end of the line
+
+            //gobble white space
+            while ((pos < len) && isspace(static_cast<unsigned char>(input[pos]))) { pos++; }
+
+            if (count > index) { return 0; } //enough values collected, ignore the rest of the line
+        }
+
+        return 0;
+    }
+    catch(exception& e) {
+        errorOut(e, "MothurOut", "splitWhiteSpace");
+        exit(1);
+    }
+}
+/***********************************************************************/
vector<string> MothurOut::splitWhiteSpaceWithQuotes(string input){
try {
vector<string> pieces;
View
@@ -125,6 +125,7 @@ class MothurOut {
void zapGremlins(istringstream&);
vector<string> splitWhiteSpace(string& rest, char[], int);
vector<string> splitWhiteSpace(string);
+ int splitWhiteSpace(string, vector<float>&, int);
set<string> readAccnos(string);
int readAccnos(string, vector<string>&);
int readAccnos(string, vector<string>&, string);

0 comments on commit a36a235

Please sign in to comment.