Commit
Added test for spacy tokenizer
Also improved the model downloading routine, including fixing the problem of the model not being found after downloading.
markgw committed May 27, 2020
1 parent 29980aa commit 4298f7c
Showing 2 changed files with 29 additions and 6 deletions.
22 changes: 16 additions & 6 deletions src/python/pimlico/modules/spacy/tokenize/execute.py
@@ -1,5 +1,6 @@
 from importlib import reload
 
+import pkg_resources
 import spacy
 from spacy import about
 from spacy.cli.download import get_json, get_compatibility, get_version, download_model
@@ -11,13 +12,21 @@

 def preprocess(executor):
     model = executor.info.options["model"]
-    if not executor.info.options["on_disk"]:
-        # If not loading from disk, we need to run the spacy download command
-        executor.log.info("Checking the model '{}' is downloaded".format(model))
-        if not download(model):
-            raise ModuleExecutionError("Model could not be downloaded")
 
-    nlp = spacy.load(model)
+    try:
+        nlp = spacy.load(model)
+    except IOError:
+        # Couldn't load spacy model
+        if not executor.info.options["on_disk"]:
+            # If not loading from disk, we need to run the spacy download command
+            executor.log.info("Downloading the model '{}'".format(model))
+            if not download(model):
+                raise ModuleExecutionError("Model could not be downloaded")
+        else:
+            raise
+        # Now the model should be available
+        nlp = spacy.load(model)
 
     executor.tokenizer = nlp.Defaults.create_tokenizer(nlp)
     executor.sentencizer = nlp.create_pipe("sentencizer")

@@ -54,4 +63,5 @@ def download(model):
     # Refresh sys.path so we can import the installed package
     import site
     reload(site)
+    reload(pkg_resources)
     return True
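
Pulled out of the Pimlico executor, the new load-or-download flow looks roughly like the following minimal sketch. It assumes spaCy 2.x (current at the time of this commit) and uses spacy.cli.download in place of the module's own download() helper; the reload calls are what make the freshly installed model package visible without restarting the interpreter.

    # Minimal sketch, not Pimlico's actual code. Assumes spaCy 2.x and
    # substitutes spacy.cli.download for the module's download() helper.
    from importlib import reload
    import site

    import pkg_resources
    import spacy
    import spacy.cli


    def load_or_download(model):
        try:
            return spacy.load(model)
        except IOError:
            # Model package is not installed yet: fetch it with spaCy's downloader
            spacy.cli.download(model)
            # Refresh sys.path and pkg_resources so the freshly pip-installed
            # package can be found without restarting the interpreter
            reload(site)
            reload(pkg_resources)
            # Now the model should be importable
            return spacy.load(model)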
13 changes: 13 additions & 0 deletions test/data/pipelines/spacy/tokenize.conf
@@ -0,0 +1,13 @@
+[pipeline]
+name=spacy_tokenize
+release=latest
+
+# Prepared tarred corpus
+[europarl]
+type=pimlico.datatypes.corpora.GroupedCorpus
+data_point_type=RawTextDocumentType
+dir=%(test_data_dir)s/datasets/text_corpora/europarl
+
+[tokenize]
+type=pimlico.modules.spacy.tokenize
+model=en_core_web_sm
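
The test pipeline above runs the spacy.tokenize module over the europarl raw-text corpus with the en_core_web_sm model. As a rough standalone illustration (assuming spaCy 2.x and that the model is installed), the tokenizer/sentencizer pair that preprocess() sets up is applied to a document like this:

    # Rough standalone illustration of the tokenizer/sentencizer pair from
    # preprocess(); assumes spaCy 2.x and that en_core_web_sm is installed.
    import spacy

    nlp = spacy.load("en_core_web_sm")
    tokenizer = nlp.Defaults.create_tokenizer(nlp)
    sentencizer = nlp.create_pipe("sentencizer")

    # Tokenize the raw text, then mark sentence boundaries
    doc = sentencizer(tokenizer("This is a test. It has two sentences."))
    sentences = [[token.text for token in sent] for sent in doc.sents]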
