Updated XML input module
Reader working.

Added test pipeline. Added small subset of Estonian Reference Corpus to use as example input.
markgw committed Aug 5, 2019
1 parent c50ae83 commit 4f6b25d
Showing 23 changed files with 62,638 additions and 102 deletions.
4 changes: 0 additions & 4 deletions src/python/pimlico/modules/input/xml/__init__.py
@@ -1,4 +0,0 @@

# Tell module doc generator to skip this module
SKIP_MODULE_DOCS = True
AWAITING_UPDATE = True
178 changes: 80 additions & 98 deletions src/python/pimlico/modules/input/xml/info.py
@@ -1,30 +1,26 @@
# This file is part of Pimlico
# Copyright (C) 2016 Mark Granroth-Wilding
# Licensed under the GNU GPL v3.0 - http://www.gnu.org/licenses/gpl-3.0.en.html

"""
Input reader for XML file collections. Gigaword, for example, is stored in this way.
The data retrieved from the files is plain unicode text.
.. todo::
Update to new datatypes system and add test pipeline
.. todo::
Currently skipped from module doc generator, until updated
Add test pipeline
"""

import fnmatch
import io
import os
from glob import glob, iglob
from glob import iglob, glob
from gzip import GzipFile

from pimlico.core.dependencies.python import beautiful_soup_dependency, safe_import_bs4
from pimlico.core.modules.inputs import iterable_input_reader_factory, ReaderOutputType
from pimlico.core.modules.options import comma_separated_strings
from pimlico.old_datatypes.documents import RawTextDocumentType
from pimlico.utils.core import cached_property
from pimlico.core.modules.inputs import iterable_input_reader
from pimlico.core.modules.options import comma_separated_strings, opt_type_help
from pimlico.datatypes.corpora.data_points import RawTextDocumentType


def get_paths_from_options(options, error_on_missing=False):
@@ -61,7 +57,7 @@ def get_paths_from_options(options, error_on_missing=False):
return paths


def check_paths_from_options(options):
def data_ready(options):
"""
Like get_paths_from_options, but faster (in some cases), as it just checks whether there are
any files for each path/glob.
@@ -105,102 +101,87 @@ def get_doc_nodes(filename, document_node_type, encoding, attr_constraints=None)
# Permit gzip files by opening them using the gzip library
with GzipFile(filename, fileobj=open(filename, mode="rb")) as f:
data = f.read()
data = data.decode(encoding)
else:
with open(filename, mode="r") as f:
with io.open(filename, mode="r", encoding=encoding) as f:
data = f.read()

data = data.decode(encoding)

# Read the XML using Beautiful Soup, so we can handle messy XML in a tolerant fashion
# We specify XML, instead of HTML
soup = BeautifulSoup(data, "xml")
# Look for the type of XML node that documents are stored in and count them
return soup.find_all(document_node_type, attrs=attr_constraints)
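
As a quick illustration (not part of the commit), a minimal sketch of calling get_doc_nodes, assuming a Gigaword-style file where each document is a DOC node; the file name and attribute values are invented:

# Hypothetical usage of get_doc_nodes(): parse one gzipped file and keep
# only the <DOC> nodes whose "type" attribute is "story"
nodes = get_doc_nodes("nyt_200001.xml.gz", "DOC", "utf-8",
                      attr_constraints={"type": "story"})
for node in nodes:
    # Each result is a BeautifulSoup tag: get() reads an attribute,
    # .text gives the node's text content
    print(node.get("id"), len(node.text))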


class XMLOutputType(ReaderOutputType):
"""
Output type used by reader to read the documents straight from external files using the input
options.
May be overridden to provide readers that do some processing of the text, by overriding
``filter_text()``
"""
data_point_type = RawTextDocumentType

def filter_text(self, data):
"""
May be overridden by subclasses to perform postprocessing of document data.
Otherwise, the document type's process_document() method is used.
"""
return self.data_point_type_instance.process_document(data)

def data_ready(self, **kwargs):
return check_paths_from_options(self.reader_options)

@cached_property
def attr_constraints(self):
return dict(key_val.split("=") for key_val in self.reader_options["filter_on_doc_attr"]) \
if self.reader_options.get("filter_on_doc_attr", None) is not None else None

def count_documents(self):
""" Faster iteration over doc nodes, just to count up how many there are """
options = self.reader_options

encoding = options["encoding"]
# Special case: use filename as doc name
doc_node_type = options["document_node_type"]

# Use the file basenames as doc names where possible, but make sure they're unique
paths = get_paths_from_options(options)
count = 0
if len(paths):
for path in paths:
count += len(list(get_doc_nodes(path, doc_node_type, encoding, self.attr_constraints)))
return count

def __iter__(self):
options = self.reader_options

encoding = options["encoding"]
doc_name_attr = options["document_name_attr"]
# Special case: use filename as doc name
doc_name_from_fn = doc_name_attr == "filename"
doc_node_type = options["document_node_type"]

# Use the file basenames as doc names where possible, but make sure they're unique
used_doc_names = set()
paths = get_paths_from_options(options)
if len(paths):
for path in paths:
def corpus_len(options):
""" Faster iteration over doc nodes, just to count up how many there are """
encoding = options["encoding"]
# Special case: use filename as doc name
doc_node_type = options["document_node_type"]
attr_constraints = options["filter_on_doc_attr"]

# Use the file basenames as doc names where possible, but make sure they're unique
paths = get_paths_from_options(options)
count = 0
if len(paths):
for path in paths:
count += len(list(get_doc_nodes(path, doc_node_type, encoding, attr_constraints)))
return count


def iter_files(reader):
options = reader.options

encoding = options["encoding"]
doc_name_attr = options["document_name_attr"]
# Special case: use filename as doc name
doc_name_from_fn = doc_name_attr == "filename"
doc_node_type = options["document_node_type"]
attr_constraints = options["filter_on_doc_attr"]

# Use the file basenames as doc names where possible, but make sure they're unique
used_doc_names = set()
paths = get_paths_from_options(options)
if len(paths):
for path in paths:
if doc_name_from_fn:
base_doc_name = os.path.basename(path)
distinguish_id = 0
# Keep increasing the distinguishing ID until we have a unique name
while base_doc_name in used_doc_names:
base, ext = os.path.splitext(base_doc_name)
base_doc_name = "%s-%d%s" % (base, distinguish_id, ext)
distinguish_id += 1
used_doc_names.add(base_doc_name)

for file_doc_id, doc_node in enumerate(
get_doc_nodes(path, doc_node_type, encoding, attr_constraints)):
if doc_name_from_fn:
base_doc_name = os.path.basename(path)
distinguish_id = 0
# Keep increasing the distinguishing ID until we have a unique name
while base_doc_name in used_doc_names:
base, ext = os.path.splitext(base_doc_name)
base_doc_name = "%s-%d%s" % (base, distinguish_id, ext)
distinguish_id += 1
used_doc_names.add(base_doc_name)

for file_doc_id, doc_node in enumerate(
get_doc_nodes(path, doc_node_type, encoding, self.attr_constraints)):
if doc_name_from_fn:
if file_doc_id == 0:
doc_name = base_doc_name
else:
doc_name = u"{}-{}".format(base_doc_name, file_doc_id)
if file_doc_id == 0:
doc_name = base_doc_name
else:
# The node should supply us with the document name, using the attribute name specified
doc_name = doc_node.get(doc_name_attr)
# Pull the text out of the document node
doc_text = doc_node.text
yield doc_name, self.filter_text(doc_text)
doc_name = u"{}-{}".format(base_doc_name, file_doc_id)
else:
# The node should supply us with the document name, using the attribute name specified
doc_name = doc_node.get(doc_name_attr)
# Pull the text out of the document node
doc_text = doc_node.text
yield doc_name, doc_text


def corpus_iter(reader):
for doc_name, data in iter_files(reader):
yield doc_name, reader.datatype.data_point_type(text=data)
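
To make the filename-based naming concrete (an invented example, not from the commit), here is the dedup loop from iter_files rerun standalone on clashing basenames:

import os

# The same uniquifying logic as iter_files(), on three invented paths
used_doc_names = set()
for path in ["a/doc.xml", "b/doc.xml", "c/doc.xml"]:
    base_doc_name = os.path.basename(path)
    distinguish_id = 0
    while base_doc_name in used_doc_names:
        base, ext = os.path.splitext(base_doc_name)
        base_doc_name = "%s-%d%s" % (base, distinguish_id, ext)
        distinguish_id += 1
    used_doc_names.add(base_doc_name)
    print(base_doc_name)
# Prints doc.xml, doc-0.xml, doc-0-1.xml: each clash re-splits the
# already-suffixed name, so suffixes accumulate rather than increment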


ModuleInfo = iterable_input_reader_factory(
@opt_type_help("comma-separated list of key=value constraints")
def key_vals(text):
if text is None:
return None
return dict(key_val.strip().split("=") for key_val in text.split(","))
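
For example (option values invented), key_vals turns the option string straight into the attr_constraints dict that get_doc_nodes expects:

# Whitespace around each pair is stripped before splitting on "="
constraints = key_vals("type=artikkel, lang=et")
assert constraints == {"type": "artikkel", "lang": "et"}
# An unset option passes through as None
assert key_vals(None) is None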


ModuleInfo = iterable_input_reader(
{
"files": {
"help": "Comma-separated list of absolute paths to files to include in the collection. Paths may include "
@@ -236,12 +217,13 @@ def __iter__(self):
"filter_on_doc_attr": {
"help": "Comma-separated list of key=value constraints. If given, only docs with the attribute 'key' on "
"their doc node and the attribute value 'value' will be included",
"type": comma_separated_strings,
"type": key_vals,
},
},
XMLOutputType,
module_type_name="xml_reader",
module_readable_name="XML documents",
RawTextDocumentType(),
data_ready, corpus_len, corpus_iter,
module_type_name="raw_text_files_reader",
module_readable_name="Raw text files",
software_dependencies=[beautiful_soup_dependency],
execute_count=True,
)
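
Putting the pieces together, a sketch (not from the commit; option values are invented and defaults may differ) of the options dict the reader functions receive once option types have been applied:

# Keys match the options defined above; filter_on_doc_attr has already
# been parsed into a dict by key_vals
options = {
    "files": ["/data/est_ref_corpus/*.xml"],
    "encoding": "utf-8",
    "document_node_type": "doc",
    "document_name_attr": "id",
    "filter_on_doc_attr": {"type": "artikkel"},
}
# data_ready() and corpus_len() take the options dict directly;
# corpus_iter() reaches the same dict through reader.options
if data_ready(options):
    print("Corpus contains %d documents" % corpus_len(options))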
7 changes: 7 additions & 0 deletions src/python/pimlico/test/pipeline.py
@@ -78,6 +78,13 @@ def test_all_modules(self):
self.log.info("Processing module '{}'".format(module_name))
module = self.pipeline[module_name]
if isinstance(module, InputModuleInfo):
# If an input module has execute_count=True, it's actually an executable
# module, since it needs to be executed before its data is ready to read
if module.module_executable:
self.log.info("Input module '{}' needs to be executed before data is ready: "
"running".format(module_name))
self.test_module_execution(module_name)
self.log.info("Data preparation executed. Testing input module")
self.test_input_module(module_name)
else:
self.test_module_execution(module_name)
