Permalink
Browse files

1. show output size after each split/merge

2. merge first before counting new splits to save disk space
  • Loading branch information...
1 parent a6621e9 commit b74ebaafd86c0d0e9a2132a95f3171dc45ea0da9 @mpacula committed Dec 26, 2011
Showing with 28 additions and 15 deletions.
  1. +28 −15 src/collocations/Collocations.cpp
View
43 src/collocations/Collocations.cpp
@@ -148,7 +148,14 @@ void copyStream(FILE* in, FILE* out)
delete[] buf;
}
-void reportSplitDone()
+double fileSizeGB(FILE* f)
+{
+ struct stat info;
+ fstat(fileno(f), &info);
+ return (double)info.st_size / (1024*1024*1024);
+}
+
+void reportSplitDone(FILE* result)
{
progress.mtx.lock();
@@ -157,15 +164,16 @@ void reportSplitDone()
eta(progress.startTime, progress.splitsComplete, progress.totalSplits,
&h, &m, &s);
- VERBOSE(fprintf(stderr, "%lu/%lu (%.2f%%) splits complete. eta: %uh %02um %02us\n",
+ VERBOSE(fprintf(stderr, "%lu/%lu (%.2f%%) splits complete. output: %.2fGB. eta: %uh %02um %02us\n",
progress.splitsComplete, progress.totalSplits,
100.0 * progress.splitsComplete / progress.totalSplits,
+ fileSizeGB(result),
h, m, s));
progress.mtx.unlock();
}
-void reportMergeDone()
+void reportMergeDone(FILE* result)
{
progress.mtx.lock();
@@ -174,9 +182,10 @@ void reportMergeDone()
eta(progress.startTime, progress.mergesComplete, progress.totalMerges,
&h, &m, &s);
- VERBOSE(fprintf(stderr, "%lu/%lu (%.2f%%) merges complete. eta: %uh %02um %02us\n",
+ VERBOSE(fprintf(stderr, "%lu/%lu (%.2f%%) merges complete. output: %.2fGB. eta: %uh %02um %02us\n",
progress.mergesComplete, progress.totalMerges,
100.0 * progress.mergesComplete / progress.totalMerges,
+ fileSizeGB(result),
h, m, s));
progress.mtx.unlock();
@@ -296,7 +305,7 @@ FILE* splitCounts(const char* filePath, FileSplit split)
}
rewind(out);
- reportSplitDone();
+ reportSplitDone(out);
return out;
}
@@ -436,27 +445,31 @@ bool claimMerge(FILE*& f1, FILE*& f2)
void workerTask()
{
while(!progress.done()) {
- FileSplit split;
- if(claimSplit(split)) {
- // we got ourselves a split task
- FILE* out = splitCounts(options.filePath.c_str(), split);
- scheduleMerge(out);
- continue;
- }
-
- // we didn't get a split, but maybe we'll get a merge?
+ // For large datasets it is important to do merges
+ // first. Otherwsise we'll end up with thousands of unmerged
+ // splits and run out of disk space.
+
FILE* f1, *f2;
if(claimMerge(f1, f2)) {
// we got a merge task!
FILE* out = tmpfile();
mergeCounts(f1, f2, out);
- reportMergeDone();
+ reportMergeDone(out);
fclose(f1);
fclose(f2);
scheduleMerge(out);
continue;
}
+ FileSplit split;
+ if(claimSplit(split)) {
+ // we got ourselves a split task
+ FILE* out = splitCounts(options.filePath.c_str(), split);
+ scheduleMerge(out);
+ continue;
+ }
+
+
// We didn't get a split or a merge,. since none are available.
// Wait a short period of time and retry;
sleep(1);

0 comments on commit b74ebaa

Please sign in to comment.