New tools for pimarcs

Adds a tool to rebuild the index from the data file. Also includes some fixes to existing tools and makes writing a little less error-prone.
markgw · Apr 1, 2020 · bd75187 · bd75187
1 parent 838d623
commit bd75187
Show file tree

Hide file tree

Showing 6 changed files with 160 additions and 45 deletions.
diff --git a/src/python/pimlico/utils/pimarc/__init__.py b/src/python/pimlico/utils/pimarc/__init__.py
@@ -19,6 +19,10 @@
 JSON metadata dictionary for each file, so metadata like this can be stored as
 necessary.
 
+Pimarcs do not store any directory structures, just a flat collection of files.
+This is all that is needed for storing Pimlico datasets, so it's best for this purpose
+to keep the format as simple as possible.
+
 Iterating over files in order is still likely to be substantially faster than random
 access (depending on the underlying storage), so it is recommended to add files to
 the archive in the sequential order that they are used in. This is the typical use

diff --git a/src/python/pimlico/utils/pimarc/index.py b/src/python/pimlico/utils/pimarc/index.py
@@ -1,7 +1,10 @@
+import json
 import os
 from collections import OrderedDict
 from builtins import *
 
+from .utils import _read_var_length_data, _skip_var_length_data
+
 
 class PimarcIndex(object):
     """
@@ -123,6 +126,45 @@ def flush(self):
         os.fsync(self.fileobj.fileno())
 
 
+def reindex(pimarc_path):
+    """
+    Rebuild the index of a Pimarc archive from its data file (.prc).
+
+    Stores the new index in the correct location (.prci), overwriting any existing index.
+
+    :param pimarc_path: path to the .prc file
+    :return: the PimarcIndex
+    """
+    if not pimarc_path.endswith(".prc"):
+        raise IndexWriteError("input pimarc path does not have the correct extension (.prc)")
+    index_path = "{}i".format(pimarc_path)
+
+    # Create an empty index
+    index = PimarcIndex()
+    # Read in each file in turn, reading the metadata to get the name and skipping the file content
+    with open(pimarc_path, "rb") as data_file:
+        try:
+            while True:
+                # Check where the metadata starts
+                metadata_start_byte = data_file.tell()
+                # First read the file's metadata block
+                metadata = json.loads(_read_var_length_data(data_file).decode("utf-8"))
+                # From that we can get the name
+                filename = metadata["name"]
+                # Now we're at the start of the file data
+                data_start_byte = data_file.tell()
+                # Skip over the data: we don't need to read that
+                _skip_var_length_data(data_file)
+                # Now add the entry to the index, with pointers to the start bytes
+                index.append(filename, metadata_start_byte, data_start_byte)
+        except EOFError:
+            # Reached the end of the file
+            pass
+
+    index.save(index_path)
+    return index
+
+
 class FilenameNotInArchive(Exception):
     def __init__(self, filename):
         super().__init__(u"filename '{}' not found in archive".format(filename))

diff --git a/src/python/pimlico/utils/pimarc/reader.py b/src/python/pimlico/utils/pimarc/reader.py
@@ -2,8 +2,8 @@
 
 from builtins import super, bytes
 
+from .utils import _read_var_length_data, _skip_var_length_data
 from .index import PimarcIndex
-from pimlico.utils.varint import decode_stream
 
 
 class PimarcReader(object):
@@ -201,27 +201,5 @@ def decode(self):
     items = metadata_decode_decorator(dict.items)
 
 
-def _read_var_length_data(reader):
-    """
-    Read some data from a file-like object by first reading a varint that says how many
-    bytes are in the data and then reading the data immediately following.
-
-    """
-    # Get a single varint from the reader stream
-    data_length = decode_stream(reader)
-    # Read the data as a bytes array
-    return reader.read(data_length)
-
-
-def _skip_var_length_data(reader):
-    """
-    Like read_var_length_data, but doesn't actually read the data. Just reads the length
-    indicator and seeks to the end of the data.
-
-    """
-    data_length = decode_stream(reader)
-    reader.seek(data_length, 1)
-
-
 class StartAfterFilenameNotFound(KeyError):
     pass
diff --git a/src/python/pimlico/utils/pimarc/tools.py b/src/python/pimlico/utils/pimarc/tools.py
@@ -6,7 +6,10 @@
 import os
 from tarfile import TarFile
 
+import sys
+
 from pimlico.utils.pimarc import PimarcReader, PimarcWriter
+from .index import reindex
 
 
 def list_files(opts):
@@ -25,12 +28,48 @@ def extract_file(opts):
     path = opts.path
     filenames = opts.filenames
 
-    reader = PimarcReader(path)
-    for filename in filenames:
-        print("Extracting {}".format(filename))
-        metadata, data = reader[filename]
-        with open(os.path.join(out_path, filename), "wb") as f:
-            f.write(data)
+    with PimarcReader(path) as reader:
+        for filename in filenames:
+            print("Extracting {}".format(filename))
+            metadata, data = reader[filename]
+            with open(os.path.join(out_path, filename), "wb") as f:
+                f.write(data)
+
+
+def append_file(opts):
+    path = os.path.abspath(opts.path)
+    if not path.endswith(".prc"):
+        print("Pimarc data file must use extension '.prc'")
+    paths_to_add = opts.files
+    # Append if the file already exists, otherwise create a new archive
+    append = os.path.exists(path)
+    if append:
+        print("Appending files to existing pimarc {}".format(path))
+    else:
+        print("Creating new pimarc {}".format(path))
+
+    with PimarcWriter(path, mode="a" if append else "w") as writer:
+        # First check that all files can be added
+        for path_to_add in paths_to_add:
+            path_to_add = os.path.abspath(path_to_add)
+            if not os.path.exists(path_to_add):
+                print("Cannot add {}: file does not exist".format(path_to_add))
+                sys.exit(1)
+            filename = os.path.basename(path_to_add)
+            if filename in writer.index:
+                print("Cannot add {}: filename '{}' already exists in archive".format(path_to_add, filename))
+                sys.exit(1)
+
+        # Now add the files
+        for path_to_add in paths_to_add:
+            path_to_add = os.path.abspath(path_to_add)
+            # Just use the basename when adding the file: no paths are stored in pimarcs
+            filename = os.path.basename(path_to_add)
+            print("  Adding {} as {}".format(path_to_add, filename))
+            # Read in the file's data
+            with open(path_to_add, "rb") as f:
+                data = f.read()
+            writer.write_file(data, filename)
 
 
 def from_tar(opts):
@@ -64,6 +103,17 @@ def from_tar(opts):
             os.remove(tar_path)
 
 
+def reindex_pimarcs(opts):
+    if not all(path.endswith(".prc") for path in opts.paths):
+        print("Pimarc files must have correct extension: .prc")
+        sys.exit(1)
+
+    for pimarc_path in opts.paths:
+        print("Rebuilding index for {}".format(pimarc_path))
+        reindex(pimarc_path)
+        print("  Success")
+
+
 def no_subcommand(opts):
     print("Specify a subcommand: list, ...")
 
@@ -83,6 +133,12 @@ def run():
     subparser.add_argument("filenames", nargs="+", help="Filename(s) to extract")
     subparser.add_argument("--out", "-o", help="Output dir (default CWD)")
 
+    subparser = subparsers.add_parser("append", help="Append a file to a pimarc. "
+                                                     "If the pimarc doesn't exist, it is created")
+    subparser.set_defaults(func=append_file)
+    subparser.add_argument("path", help="Path to the pimarc (.prc file)")
+    subparser.add_argument("files", nargs="+", help="Path(s) to add")
+
     subparser = subparsers.add_parser("fromtar",
                                       help="Create a Pimarc containing all the same files as a given tar. "
                                            "Outputs to the same filename as input, with '.tar' replaced by '.prc'")
@@ -91,6 +147,13 @@ def run():
     subparser.add_argument("--out-path", "-o", help="Directory to output files to. Defaults to same as input")
     subparser.add_argument("--delete", "-d", action="store_true", help="Delete the tar files after creating pimarcs")
 
+    subparser = subparsers.add_parser("reindex",
+                                      help="Rebuild a pimarc's index (the .prci file) from its data (the .prc file). "
+                                           "This can be necessary if the index has become corrupted or something when "
+                                           "wrong during writing of the archive")
+    subparser.set_defaults(func=reindex_pimarcs)
+    subparser.add_argument("paths", nargs="+", help="Path to the pimarc(s) - .prc files")
+
     opts = parser.parse_args()
     opts.func(opts)
 

diff --git a/src/python/pimlico/utils/pimarc/utils.py b/src/python/pimlico/utils/pimarc/utils.py
@@ -0,0 +1,36 @@
+from pimlico.utils.varint import decode_stream, encode
+
+
+def _read_var_length_data(reader):
+    """
+    Read some data from a file-like object by first reading a varint that says how many
+    bytes are in the data and then reading the data immediately following.
+
+    """
+    # Get a single varint from the reader stream
+    data_length = decode_stream(reader)
+    # Read the data as a bytes array
+    return reader.read(data_length)
+
+
+def _skip_var_length_data(reader):
+    """
+    Like read_var_length_data, but doesn't actually read the data. Just reads the length
+    indicator and seeks to the end of the data.
+
+    """
+    data_length = decode_stream(reader)
+    reader.seek(data_length, 1)
+
+
+def _write_var_length_data(writer, data):
+    """
+    Write some data to a file-like object by first writing a varint that says how many
+    bytes are in the data and then writing the data immediately following.
+
+    """
+    # Store the length of the data in bytes
+    data_length = len(data)
+    writer.write(encode(data_length))
+    # Write the data as a bytes array
+    return writer.write(data)
diff --git a/src/python/pimlico/utils/pimarc/writer.py b/src/python/pimlico/utils/pimarc/writer.py
@@ -3,9 +3,9 @@
 
 from future.utils import raise_from
 
-from pimlico.utils.pimarc.index import PimarcIndexAppender
-from pimlico.utils.varint import encode
-from .index import PimarcIndex
+from pimlico.utils.pimarc.index import DuplicateFilename
+from .utils import _write_var_length_data
+from .index import PimarcIndexAppender
 
 
 class PimarcWriter(object):
@@ -69,6 +69,11 @@ def write_file(self, data, name=None, metadata=None):
                 filename = metadata["name"]
             except KeyError:
                 raise MetadataError("metadata should include 'name' key")
+
+        # Check before we write anything that the filename isn't already used
+        if filename in self.index:
+            raise DuplicateFilename("cannot add {} to pimarc: filename already exists".format(filename))
+
         # Check where we're up to in the file
         # This tells us where the metadata starts, which will be stored in the index
         metadata_start = self.archive_file.tell()
@@ -103,18 +108,5 @@ def flush(self):
         self.index.flush()
 
 
-def _write_var_length_data(writer, data):
-    """
-    Write some data to a file-like object by first writing a varint that says how many
-    bytes are in the data and then writing the data immediately following.
-
-    """
-    # Store the length of the data in bytes
-    data_length = len(data)
-    writer.write(encode(data_length))
-    # Write the data as a bytes array
-    return writer.write(data)
-
-
 class MetadataError(Exception):
     pass