Skip to content

Commit

Permalink
New tools for pimarcs
Browse files Browse the repository at this point in the history
Rebuild index from data file.

Fixed some issues in existing tools and made writing a little less error-prone.
  • Loading branch information
markgw committed Apr 1, 2020
1 parent 838d623 commit bd75187
Show file tree
Hide file tree
Showing 6 changed files with 160 additions and 45 deletions.
4 changes: 4 additions & 0 deletions src/python/pimlico/utils/pimarc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@
JSON metadata dictionary for each file, so metadata like this can be stored as
necessary.
Pimarcs do not store any directory structures, just a flat collection of files.
This is all that is needed for storing Pimlico datasets, so it's best for this purpose
to keep the format as simple as possible.
Iterating over files in order is still likely to be substantially faster than random
access (depending on the underlying storage), so it is recommended to add files to
the archive in the sequential order that they are used in. This is the typical use
Expand Down
42 changes: 42 additions & 0 deletions src/python/pimlico/utils/pimarc/index.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import json
import os
from collections import OrderedDict
from builtins import *

from .utils import _read_var_length_data, _skip_var_length_data


class PimarcIndex(object):
"""
Expand Down Expand Up @@ -123,6 +126,45 @@ def flush(self):
os.fsync(self.fileobj.fileno())


def reindex(pimarc_path):
    """
    Regenerate the index of a Pimarc archive by scanning its data file (.prc).

    The data file is walked entry by entry. For each entry the metadata block is
    read and decoded (UTF-8 JSON) to get the file's name, and the data block is
    skipped without being read. The byte offsets of both blocks are recorded in a
    fresh index, which is then saved to the index path (the data path with "i"
    appended, i.e. .prci), overwriting any existing index.

    :param pimarc_path: path to the .prc file
    :return: the PimarcIndex
    """
    if not pimarc_path.endswith(".prc"):
        raise IndexWriteError("input pimarc path does not have the correct extension (.prc)")
    index_path = "{}i".format(pimarc_path)

    # Begin with an empty index and fill it in while scanning the archive
    rebuilt_index = PimarcIndex()
    with open(pimarc_path, "rb") as archive:
        while True:
            # Offset at which this entry's metadata block begins
            metadata_offset = archive.tell()
            try:
                raw_metadata = _read_var_length_data(archive)
                # The metadata is a JSON dictionary that includes the file's name
                entry_name = json.loads(raw_metadata.decode("utf-8"))["name"]
                # The file's data begins immediately after the metadata
                data_offset = archive.tell()
                # The content itself is not needed, so jump straight over it
                _skip_var_length_data(archive)
                rebuilt_index.append(entry_name, metadata_offset, data_offset)
            except EOFError:
                # Reached the end of the file: nothing more to index
                break

    rebuilt_index.save(index_path)
    return rebuilt_index


class FilenameNotInArchive(Exception):
    """
    Exception whose message reports that the given filename was not found
    in the archive.
    """
    def __init__(self, filename):
        message = u"filename '{}' not found in archive".format(filename)
        super().__init__(message)
Expand Down
24 changes: 1 addition & 23 deletions src/python/pimlico/utils/pimarc/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

from builtins import super, bytes

from .utils import _read_var_length_data, _skip_var_length_data
from .index import PimarcIndex
from pimlico.utils.varint import decode_stream


class PimarcReader(object):
Expand Down Expand Up @@ -201,27 +201,5 @@ def decode(self):
items = metadata_decode_decorator(dict.items)


def _read_var_length_data(reader):
    """
    Read one length-prefixed chunk of data from a file-like object.

    The chunk is stored as a varint giving the number of bytes in the data,
    followed immediately by the data itself.
    """
    # The varint prefix says how many bytes make up the data
    num_bytes = decode_stream(reader)
    # Hand back whatever the reader gives us for that many bytes
    payload = reader.read(num_bytes)
    return payload


def _skip_var_length_data(reader):
    """
    Skip over one length-prefixed chunk of data without reading it.

    Works like read_var_length_data, except that only the varint length
    indicator is read: the reader is then moved to the end of the data.
    """
    num_bytes = decode_stream(reader)
    # whence=1: seek relative to the current position, i.e. jump over the data
    reader.seek(num_bytes, 1)


class StartAfterFilenameNotFound(KeyError):
    """
    KeyError subclass with no extra behaviour.

    NOTE(review): presumably raised when a "start after" filename is not in
    the archive - confirm against the reader code.
    """
75 changes: 69 additions & 6 deletions src/python/pimlico/utils/pimarc/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@
import os
from tarfile import TarFile

import sys

from pimlico.utils.pimarc import PimarcReader, PimarcWriter
from .index import reindex


def list_files(opts):
Expand All @@ -25,12 +28,48 @@ def extract_file(opts):
path = opts.path
filenames = opts.filenames

reader = PimarcReader(path)
for filename in filenames:
print("Extracting {}".format(filename))
metadata, data = reader[filename]
with open(os.path.join(out_path, filename), "wb") as f:
f.write(data)
with PimarcReader(path) as reader:
for filename in filenames:
print("Extracting {}".format(filename))
metadata, data = reader[filename]
with open(os.path.join(out_path, filename), "wb") as f:
f.write(data)


def append_file(opts):
    """
    Append one or more files to a pimarc, creating the archive if it doesn't exist.

    Files are stored under their basename only, since pimarcs do not store paths.
    All inputs are checked before any file is written: the archive path must end
    in ".prc", every input file must exist, and no basename may be given twice or
    already be present in the archive. On any failed check a message is printed
    and the process exits with status 1.

    :param opts: parsed command-line options, providing ``path`` (the .prc file)
        and ``files`` (paths of the files to add)
    """
    path = os.path.abspath(opts.path)
    if not path.endswith(".prc"):
        print("Pimarc data file must use extension '.prc'")
        # Stop here rather than carrying on with an invalid archive path
        sys.exit(1)

    # Validate the inputs before the archive is opened (and possibly created),
    # so that bad input is rejected without touching the archive
    files_to_add = []
    seen_filenames = set()
    for path_to_add in opts.files:
        path_to_add = os.path.abspath(path_to_add)
        if not os.path.exists(path_to_add):
            print("Cannot add {}: file does not exist".format(path_to_add))
            sys.exit(1)
        # Just use the basename when adding the file: no paths are stored in pimarcs
        filename = os.path.basename(path_to_add)
        if filename in seen_filenames:
            # Two inputs with the same basename would collide inside the archive
            print("Cannot add {}: filename '{}' is given more than once".format(path_to_add, filename))
            sys.exit(1)
        seen_filenames.add(filename)
        files_to_add.append((path_to_add, filename))

    # Append if the file already exists, otherwise create a new archive
    append = os.path.exists(path)
    if append:
        print("Appending files to existing pimarc {}".format(path))
    else:
        print("Creating new pimarc {}".format(path))

    with PimarcWriter(path, mode="a" if append else "w") as writer:
        # First check that none of the names clash with what is already archived
        for path_to_add, filename in files_to_add:
            if filename in writer.index:
                print("Cannot add {}: filename '{}' already exists in archive".format(path_to_add, filename))
                sys.exit(1)

        # Now add the files
        for path_to_add, filename in files_to_add:
            print(" Adding {} as {}".format(path_to_add, filename))
            # Read in the file's data
            with open(path_to_add, "rb") as f:
                data = f.read()
            writer.write_file(data, filename)


def from_tar(opts):
Expand Down Expand Up @@ -64,6 +103,17 @@ def from_tar(opts):
os.remove(tar_path)


def reindex_pimarcs(opts):
    """
    Rebuild the index of each pimarc listed in ``opts.paths``.

    Every path must end in ".prc". If any does not, a message is printed and the
    process exits with status 1 before any archive is reindexed.

    :param opts: parsed command-line options, providing ``paths``
    """
    has_bad_extension = any(not path.endswith(".prc") for path in opts.paths)
    if has_bad_extension:
        print("Pimarc files must have correct extension: .prc")
        sys.exit(1)

    for pimarc_path in opts.paths:
        progress_message = "Rebuilding index for {}".format(pimarc_path)
        print(progress_message)
        reindex(pimarc_path)
        print(" Success")


def no_subcommand(opts):
    """
    Print a reminder that a subcommand must be specified.

    :param opts: parsed command-line options (not used)
    """
    reminder = "Specify a subcommand: list, ..."
    print(reminder)

Expand All @@ -83,6 +133,12 @@ def run():
subparser.add_argument("filenames", nargs="+", help="Filename(s) to extract")
subparser.add_argument("--out", "-o", help="Output dir (default CWD)")

subparser = subparsers.add_parser("append", help="Append a file to a pimarc. "
"If the pimarc doesn't exist, it is created")
subparser.set_defaults(func=append_file)
subparser.add_argument("path", help="Path to the pimarc (.prc file)")
subparser.add_argument("files", nargs="+", help="Path(s) to add")

subparser = subparsers.add_parser("fromtar",
help="Create a Pimarc containing all the same files as a given tar. "
"Outputs to the same filename as input, with '.tar' replaced by '.prc'")
Expand All @@ -91,6 +147,13 @@ def run():
subparser.add_argument("--out-path", "-o", help="Directory to output files to. Defaults to same as input")
subparser.add_argument("--delete", "-d", action="store_true", help="Delete the tar files after creating pimarcs")

subparser = subparsers.add_parser("reindex",
help="Rebuild a pimarc's index (the .prci file) from its data (the .prc file). "
"This can be necessary if the index has become corrupted or something when "
"wrong during writing of the archive")
subparser.set_defaults(func=reindex_pimarcs)
subparser.add_argument("paths", nargs="+", help="Path to the pimarc(s) - .prc files")

opts = parser.parse_args()
opts.func(opts)

Expand Down
36 changes: 36 additions & 0 deletions src/python/pimlico/utils/pimarc/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from pimlico.utils.varint import decode_stream, encode


def _read_var_length_data(reader):
    """
    Read one length-prefixed chunk of data from a file-like object.

    The chunk is stored as a varint giving the number of bytes in the data,
    followed immediately by the data itself.
    """
    # The varint prefix says how many bytes make up the data
    num_bytes = decode_stream(reader)
    # Hand back whatever the reader gives us for that many bytes
    payload = reader.read(num_bytes)
    return payload


def _skip_var_length_data(reader):
    """
    Skip over one length-prefixed chunk of data without reading it.

    Works like read_var_length_data, except that only the varint length
    indicator is read: the reader is then moved to the end of the data.
    """
    num_bytes = decode_stream(reader)
    # whence=1: seek relative to the current position, i.e. jump over the data
    reader.seek(num_bytes, 1)


def _write_var_length_data(writer, data):
    """
    Write one length-prefixed chunk of data to a file-like object.

    A varint giving the number of bytes in the data is written first, followed
    immediately by the data itself. Returns the result of the final write call.
    """
    # Prefix: the length of the data in bytes, encoded as a varint
    length_prefix = encode(len(data))
    writer.write(length_prefix)
    # Payload: the data as a bytes array
    return writer.write(data)
24 changes: 8 additions & 16 deletions src/python/pimlico/utils/pimarc/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@

from future.utils import raise_from

from pimlico.utils.pimarc.index import PimarcIndexAppender
from pimlico.utils.varint import encode
from .index import PimarcIndex
from pimlico.utils.pimarc.index import DuplicateFilename
from .utils import _write_var_length_data
from .index import PimarcIndexAppender


class PimarcWriter(object):
Expand Down Expand Up @@ -69,6 +69,11 @@ def write_file(self, data, name=None, metadata=None):
filename = metadata["name"]
except KeyError:
raise MetadataError("metadata should include 'name' key")

# Check before we write anything that the filename isn't already used
if filename in self.index:
raise DuplicateFilename("cannot add {} to pimarc: filename already exists".format(filename))

# Check where we're up to in the file
# This tells us where the metadata starts, which will be stored in the index
metadata_start = self.archive_file.tell()
Expand Down Expand Up @@ -103,18 +108,5 @@ def flush(self):
self.index.flush()


def _write_var_length_data(writer, data):
    """
    Write one length-prefixed chunk of data to a file-like object.

    A varint giving the number of bytes in the data is written first, followed
    immediately by the data itself. Returns the result of the final write call.
    """
    # Prefix: the length of the data in bytes, encoded as a varint
    length_prefix = encode(len(data))
    writer.write(length_prefix)
    # Payload: the data as a bytes array
    return writer.write(data)


class MetadataError(Exception):
    """
    Exception subclass with no extra behaviour.

    NOTE(review): presumably signals a problem with a file's metadata, such as
    a missing 'name' key - confirm against the writer code.
    """

0 comments on commit bd75187

Please sign in to comment.