Commit
Added test for spacy tokenizer
Also improved the model downloading routine, including fixing the problem of the model not being found after downloading.
markgw committed May 27, 2020
1 parent 29980aa commit 4298f7c
Showing 2 changed files with 29 additions and 6 deletions.
22 changes: 16 additions & 6 deletions src/python/pimlico/modules/spacy/tokenize/execute.py
@@ -1,5 +1,6 @@
 from importlib import reload
 
+import pkg_resources
 import spacy
 from spacy import about
 from spacy.cli.download import get_json, get_compatibility, get_version, download_model
@@ -11,13 +12,21 @@

 def preprocess(executor):
     model = executor.info.options["model"]
-    if not executor.info.options["on_disk"]:
-        # If not loading from disk, we need to run the spacy download command
-        executor.log.info("Checking the model '{}' is downloaded".format(model))
-        if not download(model):
-            raise ModuleExecutionError("Model could not be downloaded")
 
-    nlp = spacy.load(model)
+    try:
+        nlp = spacy.load(model)
+    except IOError:
+        # Couldn't load spacy model
+        if not executor.info.options["on_disk"]:
+            # If not loading from disk, we need to run the spacy download command
+            executor.log.info("Downloading the model '{}'".format(model))
+            if not download(model):
+                raise ModuleExecutionError("Model could not be downloaded")
+        else:
+            raise
+        # Now the model should be available
+        nlp = spacy.load(model)
 
     executor.tokenizer = nlp.Defaults.create_tokenizer(nlp)
     executor.sentencizer = nlp.create_pipe("sentencizer")

@@ -54,4 +63,5 @@ def download(model):
     # Refresh sys.path so we can import the installed package
     import site
     reload(site)
+    reload(pkg_resources)
     return True
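
Pulled out of the Pimlico executor, the new load-or-download flow looks roughly like the following minimal sketch. It assumes spaCy 2.x (current at the time of this commit) and uses spacy.cli.download in place of the module's own download() helper; the reload calls are what make the freshly installed model package visible without restarting the interpreter.

    # Minimal sketch, not Pimlico's actual code. Assumes spaCy 2.x and
    # substitutes spacy.cli.download for the module's download() helper.
    from importlib import reload
    import site

    import pkg_resources
    import spacy
    import spacy.cli


    def load_or_download(model):
        try:
            return spacy.load(model)
        except IOError:
            # Model package is not installed yet: fetch it with spaCy's downloader
            spacy.cli.download(model)
            # Refresh sys.path and pkg_resources so the freshly pip-installed
            # package can be found without restarting the interpreter
            reload(site)
            reload(pkg_resources)
            # Now the model should be importable
            return spacy.load(model)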
13 changes: 13 additions & 0 deletions test/data/pipelines/spacy/tokenize.conf
@@ -0,0 +1,13 @@
+[pipeline]
+name=spacy_tokenize
+release=latest
+
+# Prepared tarred corpus
+[europarl]
+type=pimlico.datatypes.corpora.GroupedCorpus
+data_point_type=RawTextDocumentType
+dir=%(test_data_dir)s/datasets/text_corpora/europarl
+
+[tokenize]
+type=pimlico.modules.spacy.tokenize
+model=en_core_web_sm
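
The test pipeline above runs the spacy.tokenize module over the europarl raw-text corpus with the en_core_web_sm model. As a rough standalone illustration (assuming spaCy 2.x and that the model is installed), the tokenizer/sentencizer pair that preprocess() sets up is applied to a document like this:

    # Rough standalone illustration of the tokenizer/sentencizer pair from
    # preprocess(); assumes spaCy 2.x and that en_core_web_sm is installed.
    import spacy

    nlp = spacy.load("en_core_web_sm")
    tokenizer = nlp.Defaults.create_tokenizer(nlp)
    sentencizer = nlp.create_pipe("sentencizer")

    # Tokenize the raw text, then mark sentence boundaries
    doc = sentencizer(tokenizer("This is a test. It has two sentences."))
    sentences = [[token.text for token in sent] for sent in doc.sents]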
