Skip to content

Commit

Permalink
Allow input readers to produce dict doc output
Browse files Browse the repository at this point in the history
Follows the same practice now allowed with a process_document() function for map modules. A dict gets automatically wrapped as internal data for a document object. A bytes object gets wrapped as raw data for a doc of the appropriate type. Allow input readers to produce dict doc output.
  • Loading branch information
markgw committed Mar 19, 2020
1 parent d3feb50 commit bb86ad2
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 0 deletions.
5 changes: 5 additions & 0 deletions src/python/pimlico/core/modules/inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from pimlico.core.modules.base import BaseModuleExecutor
from pimlico.core.modules.execute import ModuleExecutionError
from pimlico.core.modules.map import output_to_document
from pimlico.datatypes import PimlicoDatatype, GroupedCorpus
from pimlico.datatypes.base import PimlicoDatatypeReaderMeta
from pimlico.datatypes.corpora import IterableCorpus
Expand Down Expand Up @@ -76,6 +77,9 @@ def __iter__(self):
# of the factory function has made a mistake in building the iter_fn
it = iter(self.iterate())
doc_name, doc = next(it)
dp_type = self.datatype.data_point_type
# Like process_document() on doc map modules, we allow raw data or dicts to be returned
doc = output_to_document(doc, dp_type)
if not self.datatype.data_point_type.is_type_for_doc(doc):
raise TypeError("data iterator for input reader yielded the wrong type of document. Expected "
"a document of data point type {}, but got {}".format(
Expand All @@ -85,6 +89,7 @@ def __iter__(self):
yield doc_name, doc
# Just iterate over the rest
for doc_name, doc in it:
doc = output_to_document(doc, dp_type)
yield doc_name, doc

def process_setup(self):
Expand Down
8 changes: 8 additions & 0 deletions src/python/pimlico/datatypes/corpora/data_points.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,15 @@ def is_type_for_doc(self, doc):
"""
Check whether the given document is of this type, or a subclass of this one.
If the object is not a document instance (or, more precisely, doesn't have a
data_point_type attr), this will always return False.
"""
if not hasattr(doc, "data_point_type"):
# Sometimes things other than document instances will turn up here, e.g. when
# a doc map module's process_document() produces a dict or raw data output.
# That's fine: we simply return False
return False
return isinstance(doc.data_point_type, type(self))

def reader_init(self, reader):
Expand Down

0 comments on commit bb86ad2

Please sign in to comment.