Skip to content

Commit

Permalink
Fixed extracting files from grouped archives
Browse files (browse the repository at this point in the history)
  • Loading branch information
markgw committed Jan 9, 2020
1 parent 4cba406 commit 32497ac
Showing 1 changed file with 2 additions and 1 deletion.
3 changes: 2 additions & 1 deletion src/python/pimlico/datatypes/corpora/grouped.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ def __init__(self, *args, **kwargs):
self.archive_filenames = self.setup._get_archive_filenames(self.data_dir)
self.archive_filenames.sort()
self.archives = [os.path.splitext(os.path.basename(f))[0] for f in self.archive_filenames]
+ self.archive_to_archive_filename = dict(zip(self.archives, self.archive_filenames))

def extract_file(self, archive_name, filename):
"""
Expand All @@ -94,7 +95,7 @@ def extract_file(self, archive_name, filename):
iterate over its files, which is much faster.
"""
- with tarfile.open(os.path.join(self.data_dir, archive_name)) as archive:
+ with tarfile.open(os.path.join(self.data_dir, self.archive_to_archive_filename[archive_name])) as archive:
return archive.extractfile(filename).read()

def __iter__(self):
Expand Down

0 comments on commit 32497ac

Please sign in to comment.