Updated text_normalize module

Added test pipeline. Updated list of test pipelines in "all" test suite.
markgw · Aug 5, 2019 · 258bd05 · 258bd05
1 parent b51c11f
commit 258bd05
Show file tree

Hide file tree

Showing 9 changed files with 144 additions and 19 deletions.
diff --git a/docs/modules/pimlico.modules.text.text_normalize.rst b/docs/modules/pimlico.modules.text.text_normalize.rst
@@ -1,9 +1,77 @@
-\!\! text\_normalize
-~~~~~~~~~~~~~~~~~~~~
+Normalize raw text
+~~~~~~~~~~~~~~~~~~
 
 .. py:module:: pimlico.modules.text.text_normalize
 
-.. note::
++------------+-------------------------------------+
+| Path       | pimlico.modules.text.text_normalize |
++------------+-------------------------------------+
+| Executable | yes                                 |
++------------+-------------------------------------+
 
-   This module has not yet been updated to the new datatype system, so cannot be used yet. Soon it will be updated.
+Text normalization for raw text documents.
 
+Similar to the :mod:`~pimlico.modules.text.normalize` module, but operates on raw text,
+not pre-tokenized text, so it provides a slightly different set of tools.
+
+
+Inputs
+======
+
++--------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| Name   | Type(s)                                                                                                                                                        |
++========+================================================================================================================================================================+
+| corpus | :class:`grouped_corpus <pimlico.datatypes.corpora.grouped.GroupedCorpus>` <:class:`TextDocumentType <pimlico.datatypes.corpora.data_points.TextDocumentType>`> |
++--------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+
+
+Outputs
+=======
+
++--------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| Name   | Type(s)                                                                                                                                                              |
++========+======================================================================================================================================================================+
+| corpus | :class:`grouped_corpus <pimlico.datatypes.corpora.grouped.GroupedCorpus>` <:class:`RawTextDocumentType <pimlico.datatypes.corpora.data_points.RawTextDocumentType>`> |
++--------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+
+Options
+=======
+
++-------------+-------------------------------------------------------------------------------------------------------------------------+------------------------+
+| Name        | Description                                                                                                             | Type                   |
++=============+=========================================================================================================================+========================+
+| blank_lines | Remove all blank lines (after whitespace stripping, if requested)                                                       | bool                   |
++-------------+-------------------------------------------------------------------------------------------------------------------------+------------------------+
+| case        | Transform all text to upper or lower case. Choose from 'upper' or 'lower', or leave blank to not perform transformation | 'upper', 'lower' or '' |
++-------------+-------------------------------------------------------------------------------------------------------------------------+------------------------+
+| strip       | Strip whitespace from the start and end of lines                                                                        | bool                   |
++-------------+-------------------------------------------------------------------------------------------------------------------------+------------------------+
+
+Example config
+==============
+
+This is an example of how this module can be used in a pipeline config file.
+
+.. code-block:: ini
+   
+   [my_text_normalize_module]
+   type=pimlico.modules.text.text_normalize
+   input_corpus=module_a.some_output
+   
+
+This example usage includes more options.
+
+.. code-block:: ini
+   
+   [my_text_normalize_module]
+   type=pimlico.modules.text.text_normalize
+   input_corpus=module_a.some_output
+   blank_lines=T
+   case=
+   strip=T
+
+Test pipelines
+==============
+
+This module is used by the following :ref:`test pipelines <test-pipelines>`. They are a further source of examples of the module's usage.
+
+ * :ref:`test-config-text_normalize.conf`
diff --git a/docs/test_config/index.rst b/docs/test_config/index.rst
@@ -19,6 +19,7 @@ Available pipelines
 
    nltk.nist_tokenize.conf.rst
    text.char_tokenize.conf.rst
+   text.text_normalize.conf.rst
    text.normalize.conf.rst
    text.simple_tokenize.conf.rst
    visualization.embeddings_plot.conf.rst

diff --git a/docs/test_config/module_list.tsv b/docs/test_config/module_list.tsv
@@ -1,5 +1,6 @@
 test-config-nist_tokenize.conf	pimlico.modules.nltk.nist_tokenize, pimlico.modules.nltk.nist_tokenize
 test-config-char_tokenize.conf	pimlico.modules.text.char_tokenize
+test-config-text_normalize.conf	pimlico.modules.text.text_normalize
 test-config-normalize.conf	pimlico.modules.text.normalize
 test-config-simple_tokenize.conf	pimlico.modules.text.simple_tokenize
 test-config-embeddings_plot.conf	pimlico.modules.visualization.embeddings_plot

diff --git a/docs/test_config/text.text_normalize.conf.rst b/docs/test_config/text.text_normalize.conf.rst
@@ -0,0 +1,44 @@
+.. _test-config-text_normalize.conf:
+
+normalize
+~~~~~~~~~
+
+
+
+This is one of the test pipelines included in Pimlico's repository.
+See :ref:`test-pipelines` for more details.
+
+Config file
+===========
+
+The complete config file for this test pipeline:
+
+
+.. code-block:: ini
+   
+   [pipeline]
+   name=normalize
+   release=latest
+   
+   # Take input from a prepared Pimlico dataset
+   [europarl]
+   type=pimlico.datatypes.corpora.GroupedCorpus
+   data_point_type=RawTextDocumentType
+   dir=%(test_data_dir)s/datasets/text_corpora/europarl
+   
+   [norm]
+   type=pimlico.modules.text.text_normalize
+   case=lower
+   strip=T
+   blank_lines=T
+
+
+Modules
+=======
+
+
+The following Pimlico module types are used in this pipeline:
+
+ * :mod:`~pimlico.modules.text.text_normalize`
+
+
diff --git a/src/python/pimlico/modules/text/text_normalize/__init__.py b/src/python/pimlico/modules/text/text_normalize/__init__.py
@@ -1 +0,0 @@
-AWAITING_UPDATE = True

diff --git a/src/python/pimlico/modules/text/text_normalize/execute.py b/src/python/pimlico/modules/text/text_normalize/execute.py
@@ -12,7 +12,7 @@ def worker_setup(worker):
 @skip_invalid
 def process_document(worker, archive_name, doc_name, doc):
     # First split into lines, since much works on the line level
-    lines = doc.splitlines()
+    lines = doc.text.splitlines()
 
     if worker.case == "upper":
         lines = [line.upper() for line in lines]
@@ -25,7 +25,7 @@ def process_document(worker, archive_name, doc_name, doc):
     if worker.blank_lines:
         lines = [l for l in lines if len(l) > 0]
 
-    return u"\n".join(lines)
+    return worker.info.document(text=u"\n".join(lines))
 
 
 ModuleExecutor = multiprocessing_executor_factory(process_document, worker_set_up_fn=worker_setup)
diff --git a/src/python/pimlico/modules/text/text_normalize/info.py b/src/python/pimlico/modules/text/text_normalize/info.py
@@ -4,24 +4,22 @@
 """
 Text normalization for raw text documents.
 
-.. todo::
-
-   Update to new datatypes system and add test pipeline
+Similar to the :mod:`~pimlico.modules.text.normalize` module, but operates on raw text,
+not pre-tokenized text, so it provides a slightly different set of tools.
 
 """
 from pimlico.core.modules.map import DocumentMapModuleInfo
 
 from pimlico.core.modules.options import choose_from_list, str_to_bool
-from pimlico.old_datatypes.documents import TextDocumentType
-from pimlico.old_datatypes.tar import TarredCorpusType, RawTextTarredCorpus
-from pimlico.old_datatypes.tar import TarredCorpusWriter
+from pimlico.datatypes import GroupedCorpus
+from pimlico.datatypes.corpora.data_points import TextDocumentType, RawTextDocumentType
 
 
 class ModuleInfo(DocumentMapModuleInfo):
     module_type_name = "text_normalize"
     module_readable_name = "Normalize raw text"
-    module_inputs = [("corpus", TarredCorpusType(TextDocumentType))]
-    module_outputs = [("corpus", RawTextTarredCorpus)]
+    module_inputs = [("corpus", GroupedCorpus(TextDocumentType()))]
+    module_outputs = [("corpus", GroupedCorpus(RawTextDocumentType()))]
     module_options = {
         "case": {
             "help": "Transform all text to upper or lower case. Choose from 'upper' or 'lower', "
@@ -38,7 +36,3 @@ class ModuleInfo(DocumentMapModuleInfo):
             "type": str_to_bool,
         },
     }
-
-    def get_writer(self, output_name, output_dir, append=False):
-        if output_name == "corpus":
-            return TarredCorpusWriter(output_dir, append=append)
diff --git a/test/data/pipelines/text/text_normalize.conf b/test/data/pipelines/text/text_normalize.conf
@@ -0,0 +1,15 @@
+[pipeline]
+name=normalize
+release=latest
+
+# Take input from a prepared Pimlico dataset
+[europarl]
+type=pimlico.datatypes.corpora.GroupedCorpus
+data_point_type=RawTextDocumentType
+dir=%(test_data_dir)s/datasets/text_corpora/europarl
+
+[norm]
+type=pimlico.modules.text.text_normalize
+case=lower
+strip=T
+blank_lines=T
diff --git a/test/suites/all.csv b/test/suites/all.csv
@@ -1,4 +1,5 @@
 pipelines/input/raw_text_files.conf, europarl
+pipelines/input/xml.conf, input
 pipelines/corpora/group.conf, output
 pipelines/corpora/concat.conf, output
 pipelines/corpora/subset.conf, output
@@ -14,6 +15,8 @@ pipelines/corpora/stats.conf, stats
 pipelines/corpora/filter_tokenize.conf, store
 pipelines/text/normalize.conf, norm
 pipelines/text/simple_tokenize.conf, tokenize
+pipelines/text/char_tokenize.conf, tokenize
+pipelines/text/text_normalize.conf, norm
 pipelines/input/fasttext.conf, vectors
 pipelines/input/glove.conf, vectors
 pipelines/embeddings/word2vec.conf, word2vec