Added spacy NP chunker pipeline

markgw committed Oct 29, 2020
1 parent c2fea5b commit 145a4a1
Showing 9 changed files with 218 additions and 0 deletions.
83 changes: 83 additions & 0 deletions docs/modules/pimlico.modules.spacy.extract_nps.rst
@@ -0,0 +1,83 @@
NP chunk extractor
~~~~~~~~~~~~~~~~~~

.. py:module:: pimlico.modules.spacy.extract_nps

+------------+-----------------------------------+
| Path | pimlico.modules.spacy.extract_nps |
+------------+-----------------------------------+
| Executable | yes |
+------------+-----------------------------------+

Extract NP chunks

Performs the full spaCy pipeline, including tokenization, sentence
segmentation, POS tagging and parsing, and outputs documents containing
only a list of the noun phrase chunks found by the parser.

This functionality is provided very conveniently by spaCy's ``Doc.noun_chunks``
after parsing, so this is a light wrapper around spaCy.

The output is presented as a tokenized document. Each sentence in the
document represents a single NP.
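The conversion from parsed chunks to the tokenized output can be sketched as follows. To keep the example self-contained and runnable without downloading a model, minimal stub classes stand in for a parsed spaCy ``Doc``; with real spaCy, ``nlp(text).noun_chunks`` would provide the spans, and only the final list comprehension mirrors the module's actual logic.

```python
# Stub classes standing in for a parsed spaCy Doc, so the example runs
# without a spaCy model. They imitate only the attributes used below.
class Token:
    def __init__(self, text):
        self.text = text

class Span:
    def __init__(self, words):
        self._tokens = [Token(w) for w in words]

    def __iter__(self):
        return iter(self._tokens)

class Doc:
    def __init__(self, chunks):
        self.noun_chunks = [Span(c) for c in chunks]

doc = Doc([["The", "quick", "brown", "fox"], ["the", "lazy", "dog"]])
# One output "sentence" per NP chunk, matching the module's output format
sentences = [[token.text for token in np] for np in doc.noun_chunks]
print(sentences)  # [['The', 'quick', 'brown', 'fox'], ['the', 'lazy', 'dog']]
```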


Inputs
======

+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Name | Type(s) |
+======+======================================================================================================================================================================+
| text | :class:`grouped_corpus <pimlico.datatypes.corpora.grouped.GroupedCorpus>` <:class:`RawTextDocumentType <pimlico.datatypes.corpora.data_points.RawTextDocumentType>`> |
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+

Outputs
=======

+------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Name | Type(s) |
+======+========================================================================================================================================================================+
| nps | :class:`grouped_corpus <pimlico.datatypes.corpora.grouped.GroupedCorpus>` <:class:`TokenizedDocumentType <pimlico.datatypes.corpora.tokenized.TokenizedDocumentType>`> |
+------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+


Options
=======

+---------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+
| Name | Description | Type |
+=========+==================================================================================================================================================================================================================+========+
| model | spaCy model to use. This may be a name of a standard spaCy model or a path to the location of a trained model on disk, if on_disk=T. If it's not a path, the spaCy download command will be run before execution | string |
+---------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+
| on_disk | Load the specified model from a location on disk (the model parameter gives the path) | bool |
+---------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+

Example config
==============

This is an example of how this module can be used in a pipeline config file.

.. code-block:: ini

   [my_spacy_extract_nps_module]
   type=pimlico.modules.spacy.extract_nps
   input_text=module_a.some_output

This example usage includes more options.

.. code-block:: ini

   [my_spacy_extract_nps_module]
   type=pimlico.modules.spacy.extract_nps
   input_text=module_a.some_output
   model=en_core_web_sm
   on_disk=T

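When loading a model from disk, the ``model`` option holds a filesystem path rather than a model name. A sketch of such a configuration (the path below is purely illustrative):

```ini
[my_spacy_extract_nps_module]
type=pimlico.modules.spacy.extract_nps
input_text=module_a.some_output
# Hypothetical path to a trained model directory on disk
model=/path/to/trained/spacy/model
on_disk=T
```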
Test pipelines
==============

This module is used by the following :ref:`test pipelines <test-pipelines>`. They are a further source of examples of the module's usage.

* :ref:`test-config-spacy-extract_nps.conf`

1 change: 1 addition & 0 deletions docs/modules/pimlico.modules.spacy.rst
@@ -19,5 +19,6 @@ module type following the same approach.
:maxdepth: 2
:titlesonly:

pimlico.modules.spacy.extract_nps
pimlico.modules.spacy.parse_text
pimlico.modules.spacy.tokenize
1 change: 1 addition & 0 deletions docs/test_config/index.rst
@@ -55,6 +55,7 @@ Available pipelines
input.glove.conf.rst
input.fasttext.conf.rst
spacy.tokenize.conf.rst
spacy.extract_nps.conf.rst
spacy.parse_text.conf.rst
utility.collect_files.conf.rst
nltk.nist_tokenize.conf.rst
1 change: 1 addition & 0 deletions docs/test_config/module_list.tsv
@@ -36,6 +36,7 @@ test-config-input-xml.conf
test-config-input-glove.conf pimlico.modules.input.embeddings.glove
test-config-input-fasttext.conf pimlico.modules.input.embeddings.fasttext
test-config-spacy-tokenize.conf pimlico.modules.spacy.tokenize
test-config-spacy-extract_nps.conf pimlico.modules.spacy.extract_nps
test-config-spacy-parse_text.conf pimlico.modules.spacy.parse_text
test-config-utility-collect_files.conf pimlico.modules.utility.collect_files
test-config-nltk-nist_tokenize.conf pimlico.modules.nltk.nist_tokenize, pimlico.modules.nltk.nist_tokenize
42 changes: 42 additions & 0 deletions docs/test_config/spacy.extract_nps.conf.rst
@@ -0,0 +1,42 @@
.. _test-config-spacy-extract_nps.conf:

spacy\_extract\_nps
~~~~~~~~~~~~~~~~~~~



This is one of the test pipelines included in Pimlico's repository.
See :ref:`test-pipelines` for more details.

Config file
===========

The complete config file for this test pipeline:


.. code-block:: ini

   [pipeline]
   name=spacy_extract_nps
   release=latest

   # Prepared tarred corpus
   [europarl]
   type=pimlico.datatypes.corpora.GroupedCorpus
   data_point_type=RawTextDocumentType
   dir=%(test_data_dir)s/datasets/text_corpora/europarl

   [extract_nps]
   type=pimlico.modules.spacy.extract_nps
   model=en_core_web_sm

Modules
=======


The following Pimlico module types are used in this pipeline:

* :mod:`pimlico.modules.spacy.extract_nps`


Empty file.
31 changes: 31 additions & 0 deletions src/python/pimlico/modules/spacy/extract_nps/execute.py
@@ -0,0 +1,31 @@
# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html
from pimlico.core.modules.map import skip_invalid
from pimlico.core.modules.map.singleproc import single_process_executor_factory
from ..utils import load_spacy_model


def preprocess(worker):
model = worker.info.options["model"]
nlp = load_spacy_model(model, worker.executor.log, local=worker.info.options["on_disk"])

pipeline = ["tagger", "parser"]
for pipe_name in nlp.pipe_names:
if pipe_name not in pipeline:
# Remove any components other than the tagger and parser that might be in the model
nlp.remove_pipe(pipe_name)
worker.nlp = nlp


@skip_invalid
def process_document(worker, archive, filename, doc):
# Apply tagger and parser to the raw text
doc = worker.nlp(doc.text)
# Now doc.noun_chunks contains the NP chunks from the parser
return {
"sentences": [[token.text for token in np] for np in doc.noun_chunks]
}


ModuleExecutor = single_process_executor_factory(process_document, worker_set_up_fn=preprocess)
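The pruning loop in ``preprocess`` above can be illustrated with a stub in place of the loaded model. spaCy's real ``Language`` object exposes ``pipe_names`` and ``remove_pipe`` with the same shape; everything else here is a stand-in so the sketch runs on its own.

```python
# Stub mimicking the parts of spaCy's Language API that preprocess() uses.
class StubNLP:
    def __init__(self, names):
        self._names = list(names)

    @property
    def pipe_names(self):
        # Like spaCy, return a fresh list on each access, so iterating
        # over a snapshot while removing components is safe
        return list(self._names)

    def remove_pipe(self, name):
        self._names.remove(name)

nlp = StubNLP(["tagger", "parser", "ner"])
keep = ["tagger", "parser"]
for pipe_name in nlp.pipe_names:
    if pipe_name not in keep:
        # Drop components the NP chunker doesn't need (e.g. NER)
        nlp.remove_pipe(pipe_name)
print(nlp.pipe_names)  # ['tagger', 'parser']
```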
46 changes: 46 additions & 0 deletions src/python/pimlico/modules/spacy/extract_nps/info.py
@@ -0,0 +1,46 @@
# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html

"""Extract NP chunks

Performs the full spaCy pipeline, including tokenization, sentence
segmentation, POS tagging and parsing, and outputs documents containing
only a list of the noun phrase chunks found by the parser.

This functionality is provided very conveniently by spaCy's ``Doc.noun_chunks``
after parsing, so this is a light wrapper around spaCy.

The output is presented as a tokenized document. Each sentence in the
document represents a single NP.
"""
from pimlico.core.dependencies.python import spacy_dependency
from pimlico.core.modules.map import DocumentMapModuleInfo
from pimlico.core.modules.options import str_to_bool
from pimlico.datatypes import GroupedCorpus
from pimlico.datatypes.corpora.data_points import RawTextDocumentType
from pimlico.datatypes.corpora.tokenized import TokenizedDocumentType


class ModuleInfo(DocumentMapModuleInfo):
module_type_name = "spacy_extract_nps"
module_readable_name = "NP chunk extractor"
module_inputs = [("text", GroupedCorpus(RawTextDocumentType()))]
module_outputs = [("nps", GroupedCorpus(TokenizedDocumentType()))]
module_options = {
"model": {
"help": "spaCy model to use. This may be a name of a standard spaCy model or a path to the "
"location of a trained model on disk, if on_disk=T. "
"If it's not a path, the spaCy download command will be run before execution",
"default": "en_core_web_sm",
},
"on_disk": {
"help": "Load the specified model from a location on disk (the model parameter gives the path)",
"type": str_to_bool,
}
}
module_supports_python2 = True

def get_software_dependencies(self):
return super(ModuleInfo, self).get_software_dependencies() + [spacy_dependency]
13 changes: 13 additions & 0 deletions test/data/pipelines/spacy/extract_nps.conf
@@ -0,0 +1,13 @@
[pipeline]
name=spacy_extract_nps
release=latest

# Prepared tarred corpus
[europarl]
type=pimlico.datatypes.corpora.GroupedCorpus
data_point_type=RawTextDocumentType
dir=%(test_data_dir)s/datasets/text_corpora/europarl

[extract_nps]
type=pimlico.modules.spacy.extract_nps
model=en_core_web_sm
