Skip to content

Commit

Permalink
Added tool to convert module output to pimarc
Browse files (browse the repository at this point in the history)
  • Loading branch information
markgw committed Apr 1, 2020
1 parent 93ed7f4 commit 1182363
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 14 deletions.
3 changes: 2 additions & 1 deletion src/python/pimlico/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from traceback import print_exc

from pimlico.cli.jupyter import JupyterCmd
from pimlico.cli.pimarc import Tar2PimarcCmd

if __name__ == "__main__":
from pimlico import install_core_dependencies
Expand Down Expand Up @@ -138,7 +139,7 @@ def run_command(self, pipeline, opts):
StatusCmd, VariantsCmd, RunCmd, RecoverCmd, FixLengthCmd, BrowseCmd, ShellCLICmd, PythonShellCmd, ResetCmd, CleanCmd,
ListStoresCmd, MoveStoresCmd, UnlockCmd,
DumpCmd, LoadCmd, DepsCmd, InstallCmd, InputsCmd, OutputCmd, NewModuleCmd, VisualizeCmd, EmailCmd,
JupyterCmd,
JupyterCmd, Tar2PimarcCmd,
]


Expand Down
44 changes: 31 additions & 13 deletions src/python/pimlico/cli/pimarc.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,15 @@ def add_arguments(self, parser):
"be given as 'module_name.output_name'. All grouped corpus outputs "
"of a module can be converted by just giving 'module_name'. Or, if "
"nothing's given, all outputs of all modules are converted")
parser.add_argument("--dry", "--check", action="store_true",
help="Just check what format the corpora use, don't run conversion")
parser.add_argument("--run", action="store_true",
help="Run conversion. Without this option, just checks "
"what format the corpora use")

def run_command(self, pipeline, opts):
dry = opts.dry
if dry:
run = opts.run
if not run:
print("DRY: Not running any conversions, just checking formats")

output_specs = opts.outputs
if output_specs is None or len(output_specs) == 0:
# Nothing given: convert all modules
Expand All @@ -49,7 +51,12 @@ def run_command(self, pipeline, opts):
name for name in module.output_names
if satisfies_typecheck(module.get_output_datatype(name)[1], GroupedCorpus())
]
outputs.extend([(module_name, output) for output in grouped_outputs])
module_outputs = [(module_name, output) for output in grouped_outputs]
if len(module_outputs):
print("Including: {}".format(", ".join("{}.{}".format(mn, on) for (mn, on) in module_outputs)))
outputs.extend(module_outputs)
else:
print("No grouped corpus outputs from {}".format(module_name))
else:
outputs = []
for output_spec in output_specs:
Expand All @@ -61,6 +68,7 @@ def run_command(self, pipeline, opts):
print("Skipping {}: not a grouped corpus".format(output_spec))
else:
outputs.append((module_name, output_name))
print("Including: {}.{}".format(module_name, output_name))
else:
# Just module name: add all outputs that are grouped corpora
module_name = output_spec
Expand All @@ -69,7 +77,12 @@ def run_command(self, pipeline, opts):
name for name in module.output_names
if satisfies_typecheck(module.get_output_datatype(name)[1], GroupedCorpus())
]
outputs.extend([(module_name, output) for output in grouped_outputs])
module_outputs = [(module_name, output) for output in grouped_outputs]
if len(module_outputs):
print("Including: {}".format(", ".join("{}.{}".format(mn, on) for (mn, on) in module_outputs)))
outputs.extend(module_outputs)
else:
print("No grouped corpus outputs from {}".format(module_name))

if len(outputs) == 0:
print("No corpora to convert")
Expand All @@ -81,23 +94,28 @@ def run_command(self, pipeline, opts):
except DataNotReadyError:
print("Skipping {}.{} as data is not ready to read".format(module_name, output_name))
else:
# Check the format of the stored data
if corpus.uses_tar:
if not isinstance(corpus, GroupedCorpus.Reader):
# This corpus uses a reader other than the standard grouped corpus reader
# This probably means it's produced on the fly, or stored some other way
# We therefore can't do any kind of conversion
print("Skipping {}.{} which reads its data using {}".format(module_name, output_name, type(corpus)))
elif corpus.uses_tar:
# Check the format of the stored data
# This is an old tar-based corpus
# Look for all the tar files
tar_paths = [
os.path.join(corpus.data_dir, fn) for fn in corpus.archive_filenames
]
if dry:
print("Would convert {}.{} from tar to prc".format(module_name, output_name))
for tp in tar_paths:
print(" {}".format(tp))
else:
if run:
print("Converting tar files in {}".format(corpus.data_dir))
tar_to_pimarc(tar_paths)
# Remove the tar files
for tp in tar_paths:
os.remove(tp)
else:
print("Would convert {}.{} from tar to prc".format(module_name, output_name))
for tp in tar_paths:
print(" {}".format(tp))
else:
print("Already stored using prc: {}.{}".format(module_name, output_name))

Expand Down

0 comments on commit 1182363

Please sign in to comment.