Skip to content

Commit

Permalink
Fixed another bug in tar_filter
Browse files: browse the repository at this point in the history
  • Loading branch information
Mark Granroth-Wilding committed Apr 6, 2016
1 parent ff4ec6d commit e378335
Showing 1 changed file with 3 additions and 2 deletions.
5 changes: 3 additions & 2 deletions src/python/pimlico/modules/corpora/tar_filter/info.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -63,6 +63,8 @@ def archive_iter(self, subsample=None, start_after=None):
started = False

for file_num, (doc_name, doc) in enumerate(self.input_datatype):
current_archive_count += 1

# Allow the first portion of the corpus to be skipped
if not started:
if (type(start_after) is int and file_num == start_after) or \
Expand All @@ -79,10 +81,9 @@ def archive_iter(self, subsample=None, start_after=None):

# Check whether we've put enough files in the current archive to move onto the next
if current_archive_count == self.archive_size:
current_archive += 1
current_archive = min(len(tarballs), current_archive+1)
current_archive_count = 0

current_archive_count += 1
yield tarballs[current_archive], doc_name, doc

def list_archive_iter(self):
Expand Down

0 comments on commit e378335

Please sign in to comment.