Cmdline tool to convert grouped corpora to prc
markgw committed Apr 1, 2020
1 parent bd75187 commit 93ed7f4
Showing 2 changed files with 143 additions and 3 deletions.
121 changes: 121 additions & 0 deletions src/python/pimlico/cli/pimarc.py
@@ -0,0 +1,121 @@
from __future__ import print_function

import os
from tarfile import TarFile

from pimlico.cli.subcommands import PimlicoCLISubcommand
from pimlico.core.modules.base import satisfies_typecheck
from pimlico.datatypes import GroupedCorpus
from pimlico.datatypes.base import DataNotReadyError
from pimlico.datatypes.corpora.data_points import RawDocumentType


# This file is part of Pimlico
# Copyright (C) 2016 Mark Granroth-Wilding
# Licensed under the GNU GPL v3.0 - http://www.gnu.org/licenses/gpl-3.0.en.html
from pimlico.utils.pimarc import PimarcWriter


class Tar2PimarcCmd(PimlicoCLISubcommand):
    """
    Convert grouped corpora from the old tar-based storage format to Pimarc
    archives.
    """
    command_name = "tar2pimarc"
    command_help = "Convert grouped corpora from the old tar-based storage format to pimarc"

    def add_arguments(self, parser):
        parser.add_argument("outputs", nargs="*",
                            help="Specification of module outputs to convert. Specific datasets can "
                                 "be given as 'module_name.output_name'. All grouped corpus outputs "
                                 "of a module can be converted by just giving 'module_name'. Or, if "
                                 "nothing's given, all outputs of all modules are converted")
        parser.add_argument("--dry", "--check", action="store_true",
                            help="Just check what format the corpora use, don't run conversion")

    def run_command(self, pipeline, opts):
        dry = opts.dry
        if dry:
            print("DRY: Not running any conversions, just checking formats")
        output_specs = opts.outputs
        if output_specs is None or len(output_specs) == 0:
            # Nothing given: convert all modules
            outputs = []
            for module_name in pipeline.module_order:
                # Check module for any grouped corpus outputs
                module = pipeline[module_name]
                grouped_outputs = [
                    name for name in module.output_names
                    if satisfies_typecheck(module.get_output_datatype(name)[1], GroupedCorpus())
                ]
                outputs.extend([(module_name, output) for output in grouped_outputs])
        else:
            outputs = []
            for output_spec in output_specs:
                if "." in output_spec:
                    module_name, __, output_name = output_spec.partition(".")
                    module = pipeline[module_name]
                    # Check this output is a grouped corpus
                    if not satisfies_typecheck(module.get_output_datatype(output_name)[1], GroupedCorpus()):
                        print("Skipping {}: not a grouped corpus".format(output_spec))
                    else:
                        outputs.append((module_name, output_name))
                else:
                    # Just module name: add all outputs that are grouped corpora
                    module_name = output_spec
                    module = pipeline[module_name]
                    grouped_outputs = [
                        name for name in module.output_names
                        if satisfies_typecheck(module.get_output_datatype(name)[1], GroupedCorpus())
                    ]
                    outputs.extend([(module_name, output) for output in grouped_outputs])

        if len(outputs) == 0:
            print("No corpora to convert")

        for module_name, output_name in outputs:
            module = pipeline[module_name]
            try:
                corpus = module.get_output(output_name)
            except DataNotReadyError:
                print("Skipping {}.{} as data is not ready to read".format(module_name, output_name))
            else:
                # Check the format of the stored data
                if corpus.uses_tar:
                    # This is an old tar-based corpus
                    # Look for all the tar files
                    tar_paths = [
                        os.path.join(corpus.data_dir, fn) for fn in corpus.archive_filenames
                    ]
                    if dry:
                        print("Would convert {}.{} from tar to prc".format(module_name, output_name))
                        for tp in tar_paths:
                            print(" {}".format(tp))
                    else:
                        print("Converting tar files in {}".format(corpus.data_dir))
                        tar_to_pimarc(tar_paths)
                        # Remove the tar files
                        for tp in tar_paths:
                            os.remove(tp)
                else:
                    print("Already stored using prc: {}.{}".format(module_name, output_name))


def tar_to_pimarc(in_tar_paths):
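    """
    Convert each of the given tar archives to a Pimarc (.prc) archive, written
    to the same directory with the same basename.
    """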
    for tar_path in in_tar_paths:
        tar_path = os.path.abspath(tar_path)

        # Work out where to put the converted file
        out_filename = "{}.prc".format(os.path.splitext(os.path.basename(tar_path))[0])
        out_path = os.path.join(os.path.dirname(tar_path), out_filename)

        # Create a writer to add files to
        with PimarcWriter(out_path) as arc:
            # Read in the tar file, making sure it gets closed when we're done
            with TarFile.open(tar_path, "r:") as tarfile:
                for tarinfo in tarfile:
                    if not tarinfo.isfile():
                        # Skip anything other than regular files (e.g. directory entries)
                        continue
                    name = tarinfo.name
                    with tarfile.extractfile(tarinfo) as tar_member:
                        data = tar_member.read()
                    arc.write_file(data, name)
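As a quick usage sketch of the new subcommand (assuming the usual pimlico.sh entry script; the pipeline config and 'mymodule.documents' names are illustrative, not from this commit):

    ./pimlico.sh mypipeline.conf tar2pimarc mymodule.documents --dry
    ./pimlico.sh mypipeline.conf tar2pimarc mymodule.documents

The first call only reports which corpora are still tar-based; the second runs the conversion and, as the code above shows, deletes the original tar files afterwards.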
25 changes: 22 additions & 3 deletions src/python/pimlico/datatypes/corpora/grouped.py
@@ -62,11 +62,28 @@ def _iter_archive_filenames(cls, data_dir):
        if data_dir is None:
            return
        else:
            # Check for any .prc files: if even one is found, we look only at .prc files
            ext = ".prc" if cls._uses_prc(data_dir) else ".tar"
            for root, dirs, files in os.walk(data_dir):
                for filename in files:
                    if filename.endswith(ext):
                        yield os.path.join(root, filename)

    @classmethod
    def _uses_prc(cls, data_dir):
        found_tar = False
        for root, dirs, files in os.walk(data_dir):
            for filename in files:
                if filename.endswith(".prc"):
                    # Found one prc file, so use prc
                    return True
                elif filename.endswith(".tar"):
                    # Found one tar file: if no prc files are found, we're clearly using tar
                    found_tar = True
        # No prc archives found:
        #   if tars were found, use them
        #   otherwise, assume we're using prc but don't have any files yet
        return not found_tar

    def _has_archives(self, data_dir):
        # Return True if there's at least 1 archive in the dir
@@ -84,6 +101,8 @@ def __init__(self, *args, **kwargs):
        self.archive_filenames.sort()
        self.archives = [os.path.splitext(os.path.basename(f))[0] for f in self.archive_filenames]
        self.archive_to_archive_filename = dict(zip(self.archives, self.archive_filenames))
        # Whether this corpus uses Pimarc (prc) files or tar
        self.uses_tar = not self.setup._uses_prc(self.data_dir)

        # Cache the last-used archive
        self._last_used_archive = None
