Commit
Added spacy parser
Executes full spacy pipeline up to parsing, so includes tokenization,
sentence splitting and POS tagging
markgw committed Oct 19, 2020
1 parent fc9e7ab commit 6c0cbc2
Showing 16 changed files with 335 additions and 47 deletions.
7 changes: 7 additions & 0 deletions docs/api/pimlico.core.dependencies.licenses.rst
@@ -0,0 +1,7 @@
licenses
========

.. automodule:: pimlico.core.dependencies.licenses
:members:
:undoc-members:
:show-inheritance:
1 change: 1 addition & 0 deletions docs/api/pimlico.core.dependencies.rst
@@ -9,6 +9,7 @@ Submodules
pimlico.core.dependencies.base
pimlico.core.dependencies.core
pimlico.core.dependencies.java
pimlico.core.dependencies.licenses
pimlico.core.dependencies.python
pimlico.core.dependencies.versions

3 changes: 3 additions & 0 deletions docs/commands/index.rst
@@ -40,6 +40,8 @@ command line.
+-------------------+----------------------------------------------------------------------------------------------+
| :doc:`jupyter` | Create and start a new Jupyter notebook for the pipeline |
+-------------------+----------------------------------------------------------------------------------------------+
| :doc:`licenses`  | List information about licenses of software dependencies                                     |
+-------------------+----------------------------------------------------------------------------------------------+
| :doc:`load` | Load a module's output data from a tarball previously created by the dump command |
+-------------------+----------------------------------------------------------------------------------------------+
| :doc:`movestores` | Move data between stores |
@@ -101,3 +103,4 @@ command line.
email
jupyter
tar2pimarc
licenses
28 changes: 28 additions & 0 deletions docs/commands/licenses.rst
@@ -0,0 +1,28 @@
.. _command_licenses:

licenses
~~~~~~~~


*Command-line tool subcommand*


Output a list of the licenses for all software depended on.


Usage:

::

pimlico.sh [...] licenses [modules [modules ...]] [-h]


Positional arguments
====================

+-----------------------------+----------------------------------------------------------------------------------------------------------------+
| Arg | Description |
+=============================+================================================================================================================+
| ``[modules [modules ...]]`` | Check dependencies of modules and their datatypes. Use 'all' to list licenses for dependencies for all modules |
+-----------------------------+----------------------------------------------------------------------------------------------------------------+

6 changes: 4 additions & 2 deletions docs/commands/run.rst
@@ -14,7 +14,7 @@ Usage:

::

pimlico.sh [...] run [modules [modules ...]] [-h] [--force-rerun] [--all-deps] [--all] [--dry-run] [--step] [--preliminary] [--exit-on-error] [--email {modend,end}]
pimlico.sh [...] run [modules [modules ...]] [-h] [--force-rerun] [--all-deps] [--all] [--dry-run] [--step] [--preliminary] [--exit-on-error] [--email {modend,end}] [--last-error]


Positional arguments
@@ -44,8 +44,10 @@ Options
+---------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ``--preliminary``, ``--pre`` | Perform a preliminary run of any modules that take multiple datasets into one of their inputs. This means that we will run the module even if not all the datasets are yet available (but at least one is) and mark it as preliminarily completed |
+---------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ``--exit-on-error``, ``-e`` | If an error is encountered while executing a module that causes the whole module execution to fail, output the error and exit. By default, Pimlico will send error output to a file (or print it in debug mode) and continue to execute the next module that can be executed, if any |
| ``--exit-on-error`` | If an error is encountered while executing a module that causes the whole module execution to fail, output the error and exit. By default, Pimlico will send error output to a file (or print it in debug mode) and continue to execute the next module that can be executed, if any |
+---------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ``--email`` | Send email notifications when processing is complete, including information about the outcome. Choose from: 'modend' (send notification after module execution if it fails and a summary at the end of everything), 'end' (send only the final summary). Email sending must be configured: see 'email' command to test |
+---------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ``--last-error``, ``-e`` | Don't execute, just output the error log from the last execution of the given module(s) |
+---------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

80 changes: 80 additions & 0 deletions docs/modules/pimlico.modules.spacy.parse_text.rst
@@ -0,0 +1,80 @@
Text parser
~~~~~~~~~~~

.. py:module:: pimlico.modules.spacy.parse_text
+------------+----------------------------------+
| Path | pimlico.modules.spacy.parse_text |
+------------+----------------------------------+
| Executable | yes |
+------------+----------------------------------+

Parsing using spaCy

Entire parsing pipeline from raw text using the same spaCy model.

The word annotations in the output contain the information from the spaCy parser
and the documents are split into sentences following spaCy's sentence segmentation.

The annotation fields follow those produced by the Malt parser: pos, head and deprel.
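To make the output format concrete, here is a minimal sketch (with hypothetical field values, not taken from the module's actual output) of what one annotated sentence looks like when the fields are ordered word, pos, head, deprel:

```python
# Hypothetical annotated sentence; each token is a list of fields in the
# order: word, pos, head, deprel. "head" is the sentence-internal index
# of the token's syntactic head, stored as a string.
sentence = [
    ["The",   "DET",  "1", "det"],    # head -> "dog" (index 1)
    ["dog",   "NOUN", "2", "nsubj"],  # head -> "barks" (index 2)
    ["barks", "VERB", "2", "ROOT"],   # the root points at itself
]

# Find the root token via its dependency relation:
root = next(tok for tok in sentence if tok[3] == "ROOT")
print(root[0])  # -> barks
```
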


Inputs
======

+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Name | Type(s) |
+======+======================================================================================================================================================================+
| text | :class:`grouped_corpus <pimlico.datatypes.corpora.grouped.GroupedCorpus>` <:class:`RawTextDocumentType <pimlico.datatypes.corpora.data_points.RawTextDocumentType>`> |
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+

Outputs
=======

+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Name | Type(s) |
+========+===========================================================================================================================================================================================+
| parsed | :class:`grouped_corpus <pimlico.datatypes.corpora.grouped.GroupedCorpus>` <:class:`WordAnnotationsDocumentType <pimlico.datatypes.corpora.word_annotations.WordAnnotationsDocumentType>`> |
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+


Options
=======

+---------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+
| Name | Description | Type |
+=========+==================================================================================================================================================================================================================+========+
| model | spaCy model to use. This may be a name of a standard spaCy model or a path to the location of a trained model on disk, if on_disk=T. If it's not a path, the spaCy download command will be run before execution | string |
+---------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+
| on_disk | Load the specified model from a location on disk (the model parameter gives the path) | bool |
+---------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+

Example config
==============

This is an example of how this module can be used in a pipeline config file.

.. code-block:: ini

   [my_spacy_text_parser_module]
   type=pimlico.modules.spacy.parse_text
   input_text=module_a.some_output

This example usage includes more options.

.. code-block:: ini

   [my_spacy_text_parser_module]
   type=pimlico.modules.spacy.parse_text
   input_text=module_a.some_output
   model=en_core_web_sm
   on_disk=T

Test pipelines
==============

This module is used by the following :ref:`test pipelines <test-pipelines>`. They are a further source of examples of the module's usage.

* :ref:`test-config-spacy-parse_text.conf`

1 change: 1 addition & 0 deletions docs/modules/pimlico.modules.spacy.rst
@@ -19,4 +19,5 @@ module type following the same approach.
:maxdepth: 2
:titlesonly:

pimlico.modules.spacy.parse_text
pimlico.modules.spacy.tokenize
1 change: 1 addition & 0 deletions docs/test_config/index.rst
@@ -55,6 +55,7 @@ Available pipelines
input.glove.conf.rst
input.fasttext.conf.rst
spacy.tokenize.conf.rst
spacy.parse_text.conf.rst
utility.collect_files.conf.rst
nltk.nist_tokenize.conf.rst
malt.parse.conf.rst
1 change: 1 addition & 0 deletions docs/test_config/module_list.tsv
@@ -36,6 +36,7 @@ test-config-input-xml.conf
test-config-input-glove.conf pimlico.modules.input.embeddings.glove
test-config-input-fasttext.conf pimlico.modules.input.embeddings.fasttext
test-config-spacy-tokenize.conf pimlico.modules.spacy.tokenize
test-config-spacy-parse_text.conf pimlico.modules.spacy.parse_text
test-config-utility-collect_files.conf pimlico.modules.utility.collect_files
test-config-nltk-nist_tokenize.conf pimlico.modules.nltk.nist_tokenize, pimlico.modules.nltk.nist_tokenize
test-config-malt-parse.conf pimlico.modules.malt
42 changes: 42 additions & 0 deletions docs/test_config/spacy.parse_text.conf.rst
@@ -0,0 +1,42 @@
.. _test-config-spacy-parse_text.conf:

spacy\_parse\_text
~~~~~~~~~~~~~~~~~~



This is one of the test pipelines included in Pimlico's repository.
See :ref:`test-pipelines` for more details.

Config file
===========

The complete config file for this test pipeline:


.. code-block:: ini

   [pipeline]
   name=spacy_parse_text
   release=latest

   # Prepared tarred corpus
   [europarl]
   type=pimlico.datatypes.corpora.GroupedCorpus
   data_point_type=RawTextDocumentType
   dir=%(test_data_dir)s/datasets/text_corpora/europarl

   [tokenize]
   type=pimlico.modules.spacy.parse_text
   model=en_core_web_sm

Modules
=======


The following Pimlico module types are used in this pipeline:

* :mod:`pimlico.modules.spacy.parse_text`


Empty file.
46 changes: 46 additions & 0 deletions src/python/pimlico/modules/spacy/parse_text/execute.py
@@ -0,0 +1,46 @@
# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html
from pimlico.core.modules.map import skip_invalid
from pimlico.core.modules.map.singleproc import single_process_executor_factory
from ..utils import load_spacy_model


def preprocess(worker):
model = worker.info.options["model"]
nlp = load_spacy_model(model, worker.executor.log, local=worker.info.options["on_disk"])

pipeline = ["tagger", "parser"]
for pipe_name in nlp.pipe_names:
if pipe_name not in pipeline:
# Remove any components other than the tagger and parser that might be in the model
nlp.remove_pipe(pipe_name)
worker.nlp = nlp

# Check the order of the fields in the output
output_dt = worker.info.get_output_datatype("parsed")[1]
fields_list = output_dt.data_point_type.fields
# This little function will put the annotations in the right order
def output(token, pos, head, deprel):
fields = {"word": token, "pos": pos, "head": head, "deprel": deprel}
return [fields[field] for field in fields_list]
worker.output_fields = output


@skip_invalid
def process_document(worker, archive, filename, doc):
# Apply tagger and parser to the raw text
doc = worker.nlp(doc.text)
# Now doc.sents contains the separated sentences
# and each word should have a POS tag and head+dep type
return {
"word_annotations": [
[
worker.output_fields(token.text, token.pos_, str(token.head.i - sentence.start), token.dep_)
for token in sentence
] for sentence in doc.sents
]
}


ModuleExecutor = single_process_executor_factory(process_document, worker_set_up_fn=preprocess)
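The only non-obvious step in `process_document` above is `token.head.i - sentence.start`: spaCy's `Token.head.i` is a document-absolute index, while the output stores heads relative to the start of each sentence. A spaCy-free sketch of that conversion (the function name and data are illustrative, not part of the module):

```python
# Sketch of the head-index conversion used in process_document, without spaCy.
def sentence_relative_heads(abs_heads, sent_start):
    """Convert document-absolute head indices to sentence-relative ones,
    mirroring `token.head.i - sentence.start` in the executor above."""
    return [str(h - sent_start) for h in abs_heads]

# Suppose the second sentence of a document starts at absolute token
# index 3 and all three of its tokens attach to the token at absolute
# index 4 (the sentence's second word):
print(sentence_relative_heads([4, 4, 4], 3))  # -> ['1', '1', '1']
```
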
43 changes: 43 additions & 0 deletions src/python/pimlico/modules/spacy/parse_text/info.py
@@ -0,0 +1,43 @@
# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html

"""Parsing using spaCy
Entire parsing pipeline from raw text using the same spaCy model.
The word annotations in the output contain the information from the spaCy parser
and the documents are split into sentences following spaCy's sentence segmentation.
The annotation fields follow those produced by the Malt parser: pos, head and deprel.
"""
from pimlico.core.dependencies.python import spacy_dependency
from pimlico.core.modules.map import DocumentMapModuleInfo
from pimlico.core.modules.options import str_to_bool
from pimlico.datatypes import GroupedCorpus
from pimlico.datatypes.corpora.data_points import RawTextDocumentType
from pimlico.datatypes.corpora.word_annotations import WordAnnotationsDocumentType


class ModuleInfo(DocumentMapModuleInfo):
module_type_name = "spacy_text_parser"
module_readable_name = "Text parser"
module_inputs = [("text", GroupedCorpus(RawTextDocumentType()))]
module_outputs = [("parsed", GroupedCorpus(WordAnnotationsDocumentType(["word", "pos", "head", "deprel"])))]
module_options = {
"model": {
"help": "spaCy model to use. This may be a name of a standard spaCy model or a path to the "
"location of a trained model on disk, if on_disk=T. "
"If it's not a path, the spaCy download command will be run before execution",
"default": "en_core_web_sm",
},
"on_disk": {
"help": "Load the specified model from a location on disk (the model parameter gives the path)",
"type": str_to_bool,
}
}
module_supports_python2 = True

def get_software_dependencies(self):
return super(ModuleInfo, self).get_software_dependencies() + [spacy_dependency]
47 changes: 2 additions & 45 deletions src/python/pimlico/modules/spacy/tokenize/execute.py
@@ -2,35 +2,14 @@
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html

from importlib import reload

import pkg_resources
import spacy
from spacy import about
from spacy.cli.download import get_json, get_compatibility, get_version, download_model

from pimlico.core.modules.execute import ModuleExecutionError
from pimlico.core.modules.map import skip_invalid
from pimlico.core.modules.map.singleproc import single_process_executor_factory
from ..utils import load_spacy_model


def preprocess(executor):
model = executor.info.options["model"]

try:
nlp = spacy.load(model)
except IOError:
# Couldn't load spacy model
if not executor.info.options["on_disk"]:
# If not loading from disk, we need to run the spacy download command
executor.log.info("Downloading the model '{}'".format(model))
if not download(model):
raise ModuleExecutionError("Model could not be downloaded")
else:
raise
# Now the model should be available
nlp = spacy.load(model)

nlp = load_spacy_model(model, executor.log, local=executor.info.options["on_disk"])
executor.tokenizer = nlp.Defaults.create_tokenizer(nlp)
executor.sentencizer = nlp.create_pipe("sentencizer")

@@ -49,25 +28,3 @@ def process_document(worker, archive, filename, doc):


ModuleExecutor = single_process_executor_factory(process_document, preprocess_fn=preprocess)


def download(model):
"""
Replicates what spaCy does in its cmdline interface.
"""
dl_tpl = "{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}"
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
model_name = shortcuts.get(model, model)
compatibility = get_compatibility()
version = get_version(model_name, compatibility)
dl = download_model(dl_tpl.format(m=model_name, v=version))
# Returns 0 if download was successful
if dl != 0:
return False

# Refresh sys.path so we can import the installed package
import site
reload(site)
reload(pkg_resources)
return True
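The deleted load-and-download logic above is now shared between the tokenizer and the new parser via `load_spacy_model` in `..utils` (that file is not shown in this diff). A library-free sketch of the load-then-download-fallback pattern it presumably factors out; all names here are hypothetical stand-ins, not the actual helper's API:

```python
def load_with_download_fallback(load, download, name, local=False):
    """Try to load a model; if loading fails and downloading is allowed,
    download it and retry. Sketch of the pattern the shared
    load_spacy_model helper presumably follows (names hypothetical)."""
    try:
        return load(name)
    except IOError:
        if local:
            # The model was expected at a path on disk: nothing to download
            raise
        if not download(name):
            raise RuntimeError("Model could not be downloaded")
        # The model should now be available
        return load(name)

# Usage with stand-in load/download functions:
store = {}

def fake_load(name):
    if name not in store:
        raise IOError(name)
    return store[name]

def fake_download(name):
    store[name] = "model:" + name
    return True

print(load_with_download_fallback(fake_load, fake_download, "en_core_web_sm"))
# -> model:en_core_web_sm
```
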
