Cmdline tool to convert grouped corpora to prc
markgw committed Apr 1, 2020
1 parent bd75187 commit 93ed7f4
Showing 2 changed files with 143 additions and 3 deletions.
121 changes: 121 additions & 0 deletions src/python/pimlico/cli/pimarc.py
@@ -0,0 +1,121 @@
from __future__ import print_function

import os
from tarfile import TarFile

from pimlico.cli.subcommands import PimlicoCLISubcommand
from pimlico.core.modules.base import satisfies_typecheck
from pimlico.datatypes import GroupedCorpus
from pimlico.datatypes.base import DataNotReadyError
from pimlico.datatypes.corpora.data_points import RawDocumentType


# This file is part of Pimlico
# Copyright (C) 2016 Mark Granroth-Wilding
# Licensed under the GNU GPL v3.0 - http://www.gnu.org/licenses/gpl-3.0.en.html
from pimlico.utils.pimarc import PimarcWriter


class Tar2PimarcCmd(PimlicoCLISubcommand):
    """
    Convert grouped corpora from the old tar-based storage format to Pimarc
    archives.
    """
    command_name = "tar2pimarc"
    command_help = "Convert grouped corpora from the old tar-based storage format to pimarc"

    def add_arguments(self, parser):
        parser.add_argument("outputs", nargs="*",
                            help="Specification of module outputs to convert. Specific datasets can "
                                 "be given as 'module_name.output_name'. All grouped corpus outputs "
                                 "of a module can be converted by just giving 'module_name'. Or, if "
                                 "nothing's given, all outputs of all modules are converted")
        parser.add_argument("--dry", "--check", action="store_true",
                            help="Just check what format the corpora use, don't run conversion")

    def run_command(self, pipeline, opts):
        dry = opts.dry
        if dry:
            print("DRY: Not running any conversions, just checking formats")
        output_specs = opts.outputs
        if output_specs is None or len(output_specs) == 0:
            # Nothing given: convert all modules
            outputs = []
            for module_name in pipeline.module_order:
                # Check module for any grouped corpus outputs
                module = pipeline[module_name]
                grouped_outputs = [
                    name for name in module.output_names
                    if satisfies_typecheck(module.get_output_datatype(name)[1], GroupedCorpus())
                ]
                outputs.extend([(module_name, output) for output in grouped_outputs])
        else:
            outputs = []
            for output_spec in output_specs:
                if "." in output_spec:
                    module_name, __, output_name = output_spec.partition(".")
                    module = pipeline[module_name]
                    # Check this output is a grouped corpus
                    if not satisfies_typecheck(module.get_output_datatype(output_name)[1], GroupedCorpus()):
                        print("Skipping {}: not a grouped corpus".format(output_spec))
                    else:
                        outputs.append((module_name, output_name))
                else:
                    # Just module name: add all outputs that are grouped corpora
                    module_name = output_spec
                    module = pipeline[module_name]
                    grouped_outputs = [
                        name for name in module.output_names
                        if satisfies_typecheck(module.get_output_datatype(name)[1], GroupedCorpus())
                    ]
                    outputs.extend([(module_name, output) for output in grouped_outputs])

        if len(outputs) == 0:
            print("No corpora to convert")

        for module_name, output_name in outputs:
            module = pipeline[module_name]
            try:
                corpus = module.get_output(output_name)
            except DataNotReadyError:
                print("Skipping {}.{} as data is not ready to read".format(module_name, output_name))
            else:
                # Check the format of the stored data
                if corpus.uses_tar:
                    # This is an old tar-based corpus
                    # Look for all the tar files
                    tar_paths = [
                        os.path.join(corpus.data_dir, fn) for fn in corpus.archive_filenames
                    ]
                    if dry:
                        print("Would convert {}.{} from tar to prc".format(module_name, output_name))
                        for tp in tar_paths:
                            print(" {}".format(tp))
                    else:
                        print("Converting tar files in {}".format(corpus.data_dir))
                        tar_to_pimarc(tar_paths)
                        # Remove the tar files
                        for tp in tar_paths:
                            os.remove(tp)
                else:
                    print("Already stored using prc: {}.{}".format(module_name, output_name))


def tar_to_pimarc(in_tar_paths):
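    """
    Convert each of the given tar archives to a Pimarc (.prc) archive, written
    to the same directory with the same basename.
    """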
    for tar_path in in_tar_paths:
        tar_path = os.path.abspath(tar_path)

        # Work out where to put the converted file
        out_filename = "{}.prc".format(os.path.splitext(os.path.basename(tar_path))[0])
        out_path = os.path.join(os.path.dirname(tar_path), out_filename)

        # Create a writer to add files to
        with PimarcWriter(out_path) as arc:
            # Read in the tar file, making sure it gets closed when we're done
            with TarFile.open(tar_path, "r:") as tarfile:
                for tarinfo in tarfile:
                    if not tarinfo.isfile():
                        # Skip anything other than regular files (e.g. directory entries)
                        continue
                    name = tarinfo.name
                    with tarfile.extractfile(tarinfo) as tar_member:
                        data = tar_member.read()
                    arc.write_file(data, name)
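As a quick usage sketch of the new subcommand (assuming the usual pimlico.sh entry script; the pipeline config and 'mymodule.documents' names are illustrative, not from this commit):

    ./pimlico.sh mypipeline.conf tar2pimarc mymodule.documents --dry
    ./pimlico.sh mypipeline.conf tar2pimarc mymodule.documents

The first call only reports which corpora are still tar-based; the second runs the conversion and, as the code above shows, deletes the original tar files afterwards.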
25 changes: 22 additions & 3 deletions src/python/pimlico/datatypes/corpora/grouped.py
@@ -62,11 +62,28 @@ def _iter_archive_filenames(cls, data_dir):
        if data_dir is None:
            return
        else:
            # Check for any .prc files: if even one is found, we look only at .prc files
            ext = ".prc" if cls._uses_prc(data_dir) else ".tar"
            for root, dirs, files in os.walk(data_dir):
                for filename in files:
                    if filename.endswith(ext):
                        yield os.path.join(root, filename)

    @classmethod
    def _uses_prc(cls, data_dir):
        found_tar = False
        for root, dirs, files in os.walk(data_dir):
            for filename in files:
                if filename.endswith(".prc"):
                    # Found one prc file, so use prc
                    return True
                elif filename.endswith(".tar"):
                    # Found one tar file: if no prc files are found, we're clearly using tar
                    found_tar = True
        # No prc archives found:
        #   if tars were found, use them
        #   otherwise, assume we're using prc but don't have any files yet
        return not found_tar

    def _has_archives(self, data_dir):
        # Return True if there's at least 1 archive in the dir
@@ -84,6 +101,8 @@ def __init__(self, *args, **kwargs):
        self.archive_filenames.sort()
        self.archives = [os.path.splitext(os.path.basename(f))[0] for f in self.archive_filenames]
        self.archive_to_archive_filename = dict(zip(self.archives, self.archive_filenames))
        # Whether this corpus uses Pimarc (prc) files or tar
        self.uses_tar = not self.setup._uses_prc(self.data_dir)

        # Cache the last-used archive
        self._last_used_archive = None
