Added metadata storage to grouped corpus

The Pimarc format now supports storing arbitrary metadata for each document in the archive, but until now this was not accessible from the grouped corpus reader or writer. The writer now accepts optional metadata when a document is added and stores it in the archive. The reader now makes the metadata available as an attribute of each document object.
markgw · May 26, 2020 · 36298f6 · 36298f6
1 parent a7175ed
commit 36298f6
Show file tree

Hide file tree

Showing 4 changed files with 14 additions and 14 deletions.
diff --git a/src/python/pimlico/core/modules/map/filter.py b/src/python/pimlico/core/modules/map/filter.py
@@ -11,12 +11,6 @@
 on the fly and yields the results in order, providing a new
 iterable (grouped) corpus for the next module, without storing anything.
 
-.. todo::
-
-   During Python 2-3 conversion, an ``object`` base class was added to
-   ``FilterModuleOutputReader.Setup``, since this is required/implicit in Python 3.
-   Check that this stills work as it used to.
-
 """
 from builtins import object
 

diff --git a/src/python/pimlico/datatypes/corpora/base.py b/src/python/pimlico/datatypes/corpora/base.py
@@ -179,11 +179,12 @@ def __iter__(self):
             """
             raise NotImplementedError
 
-        def data_to_document(self, data):
+        def data_to_document(self, data, metadata=None):
             """
             Applies the corpus' datatype's processing to the raw data, given as a
             bytes object, and produces a document instance.
 
+            :param metadata: dict containing doc metadata (optional)
             :param data: bytes raw data
             :return: document instance
             """
@@ -194,7 +195,7 @@ def data_to_document(self, data):
             # Apply subclass-specific post-processing if we've not been asked to yield just the raw data
             try:
                 # Produce a document instance of the appropriate type
-                document = self.datatype.data_point_type(raw_data=data)
+                document = self.datatype.data_point_type(raw_data=data, metadata=metadata)
             except BaseException as e:
                 # If there's any problem reading in the document, yield an invalid doc with the error
                 document = invalid_document(

diff --git a/src/python/pimlico/datatypes/corpora/data_points.py b/src/python/pimlico/datatypes/corpora/data_points.py
@@ -235,7 +235,7 @@ class Document:
         #: The internal data fields corresponding to these can be accessed as attributes of the document
         keys = []
 
-        def __init__(self, data_point_type, raw_data=None, internal_data=None):
+        def __init__(self, data_point_type, raw_data=None, internal_data=None, metadata=None):
             self.data_point_type = data_point_type
 
             if raw_data is None and internal_data is None:
@@ -244,6 +244,8 @@ def __init__(self, data_point_type, raw_data=None, internal_data=None):
                 raise DataPointError("only one of raw_data and internal_data may be given when "
                                      "instantiating a document")
 
+            self.metadata = metadata
+
             self._raw_data = raw_data
             self._internal_data = internal_data
 

diff --git a/src/python/pimlico/datatypes/corpora/grouped.py b/src/python/pimlico/datatypes/corpora/grouped.py
@@ -347,7 +347,7 @@ def __init__(self, *args, **kwargs):
                     self.metadata["length"] = self._count_written_docs()
             self.doc_count = self.metadata["length"]
 
-        def add_document(self, archive_name, doc_name, doc):
+        def add_document(self, archive_name, doc_name, doc, metadata=None):
             """
             Add a document to the named archive. All docs should be added to a single archive
             before moving onto the next. If the archive name is the same as the previous
@@ -362,7 +362,7 @@ def add_document(self, archive_name, doc_name, doc):
 
             :param archive_name: archive name
             :param doc_name: name of document
-            :param doc: document instance
+            :param doc: document instance or bytes object containing document's raw data
             """
             # A document instance provides access to the raw data for a document as a bytes (Py3) or string (Py2)
             # If it's not directly available, it will be converted when we try to retrieve the raw data
@@ -371,8 +371,11 @@ def add_document(self, archive_name, doc_name, doc):
             except AttributeError:
                 # Instead of type-checking every document, we assume that if it has a raw_data attr, this
                 # is the right thing to use
-                # If not, we kick up a fuss, as we've presumably been given something that's not a valid document
-                if not isinstance(doc, DataPointType.Document):
+                # If a bytes object is given, we assume that's the doc's raw data
+                if isinstance(doc, bytes):
+                    data = doc
+                elif not isinstance(doc, DataPointType.Document):
+                    # If not, we kick up a fuss, as we've presumably been given something that's not a valid document
                     raise TypeError("documents added to a grouped corpus should be instances of the data point type's "
                                     "document class. Data point type is {}. Got {}".format(
                         self.datatype.data_point_type.name, type(doc).__name__
@@ -428,7 +431,7 @@ def add_document(self, archive_name, doc_name, doc):
                 filename = doc_name
 
             # Append this document's data to the Pimarc
-            self.current_archive.write_file(data, name=filename)
+            self.current_archive.write_file(data, name=filename, metadata=metadata)
             self.flush()
 
             # Keep a count of how many we've added so we can write metadata