Skip to content

Commit

Permalink
Added metadata storage to grouped corpus
Browse files Browse the repository at this point in the history
Pimarc format now supports the storage of arbitrary metadata for each document in the archive, but until now this wasn't accessible from the grouped corpus reader or writer.

Metadata can now be passed to the writer when adding a document, and the reader makes each document's metadata available as an attribute of the document object.
  • Loading branch information
markgw committed May 26, 2020
1 parent a7175ed commit 36298f6
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 14 deletions.
6 changes: 0 additions & 6 deletions src/python/pimlico/core/modules/map/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,6 @@
on the fly and yields the results in order, providing a new
iterable (grouped) corpus for the next module, without storing anything.
.. todo::
During Python 2-3 conversion, an ``object`` base class was added to
``FilterModuleOutputReader.Setup``, since this is required/implicit in Python 3.
Check that this still works as it used to.
"""
from builtins import object

Expand Down
5 changes: 3 additions & 2 deletions src/python/pimlico/datatypes/corpora/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,11 +179,12 @@ def __iter__(self):
"""
raise NotImplementedError

def data_to_document(self, data):
def data_to_document(self, data, metadata=None):
"""
Applies the corpus' datatype's processing to the raw data, given as a
bytes object, and produces a document instance.
:param metadata: dict containing doc metadata (optional)
:param data: bytes raw data
:return: document instance
"""
Expand All @@ -194,7 +195,7 @@ def data_to_document(self, data):
# Apply subclass-specific post-processing if we've not been asked to yield just the raw data
try:
# Produce a document instance of the appropriate type
document = self.datatype.data_point_type(raw_data=data)
document = self.datatype.data_point_type(raw_data=data, metadata=metadata)
except BaseException as e:
# If there's any problem reading in the document, yield an invalid doc with the error
document = invalid_document(
Expand Down
4 changes: 3 additions & 1 deletion src/python/pimlico/datatypes/corpora/data_points.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ class Document:
#: The internal data fields corresponding to these can be accessed as attributes of the document
keys = []

def __init__(self, data_point_type, raw_data=None, internal_data=None):
def __init__(self, data_point_type, raw_data=None, internal_data=None, metadata=None):
self.data_point_type = data_point_type

if raw_data is None and internal_data is None:
Expand All @@ -244,6 +244,8 @@ def __init__(self, data_point_type, raw_data=None, internal_data=None):
raise DataPointError("only one of raw_data and internal_data may be given when "
"instantiating a document")

self.metadata = metadata

self._raw_data = raw_data
self._internal_data = internal_data

Expand Down
13 changes: 8 additions & 5 deletions src/python/pimlico/datatypes/corpora/grouped.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ def __init__(self, *args, **kwargs):
self.metadata["length"] = self._count_written_docs()
self.doc_count = self.metadata["length"]

def add_document(self, archive_name, doc_name, doc):
def add_document(self, archive_name, doc_name, doc, metadata=None):
"""
Add a document to the named archive. All docs should be added to a single archive
before moving onto the next. If the archive name is the same as the previous
Expand All @@ -362,7 +362,7 @@ def add_document(self, archive_name, doc_name, doc):
:param archive_name: archive name
:param doc_name: name of document
:param doc: document instance
:param doc: document instance or bytes object containing document's raw data
"""
# A document instance provides access to the raw data for a document as a bytes (Py3) or string (Py2)
# If it's not directly available, it will be converted when we try to retrieve the raw data
Expand All @@ -371,8 +371,11 @@ def add_document(self, archive_name, doc_name, doc):
except AttributeError:
# Instead of type-checking every document, we assume that if it has a raw_data attr, this
# is the right thing to use
# If not, we kick up a fuss, as we've presumably been given something that's not a valid document
if not isinstance(doc, DataPointType.Document):
# If a bytes object is given, we assume that's the doc's raw data
if isinstance(doc, bytes):
data = doc
elif not isinstance(doc, DataPointType.Document):
# If not, we kick up a fuss, as we've presumably been given something that's not a valid document
raise TypeError("documents added to a grouped corpus should be instances of the data point type's "
"document class. Data point type is {}. Got {}".format(
self.datatype.data_point_type.name, type(doc).__name__
Expand Down Expand Up @@ -428,7 +431,7 @@ def add_document(self, archive_name, doc_name, doc):
filename = doc_name

# Append this document's data to the Pimarc
self.current_archive.write_file(data, name=filename)
self.current_archive.write_file(data, name=filename, metadata=metadata)
self.flush()

# Keep a count of how many we've added so we can write metadata
Expand Down

0 comments on commit 36298f6

Please sign in to comment.