In [1]:

import spacy

In [2]:

# Load spaCy's pretrained 'en_core_web_sm' pipeline by name.
# NOTE: the model package has to be installed in the kernel's environment
# beforehand (e.g. `python -m spacy download en_core_web_sm`), otherwise
# spacy.load() cannot find it.
nlp_model = spacy.load('en_core_web_sm')

In [3]:

# Sample utterance used throughout the notebook. The adjacent literals are
# joined by the compiler into one string, so the character offsets printed
# by later cells are unaffected by the line breaks here.
corpus_text = (
    "Hello there. "
    "I would like to order a taxi doctor from Musgrove Park Hospital in Zimbabwe. "
    "Ideally on 25 December please."
)

In [4]:

# Run the sample text through the loaded pipeline; the resulting Doc is what
# the following cells iterate over for tokens, sentences and entities.
doc = nlp_model(corpus_text)

In [5]:

# Walk the doc token by token and show each token's text next to
# token.idx, its character offset within the original string.
for token in doc:
    print(f"{token.text} {token.idx}")

Hello 0
there 6
. 11
I 13
would 15
like 21
to 26
order 29
a 35
taxi 37
doctor 42
from 49
Musgrove 54
Park 63
Hospital 68
in 77
Zimbabwe 80
. 88
Ideally 90
on 98
25 101
December 104
please 113
. 119


In [6]:

# List the sentences the pipeline segmented out of the doc, one per line.
for sentence in doc.sents:
    print(sentence.text)

Hello there.
I would like to order a taxi doctor from Musgrove Park Hospital in Zimbabwe.
Ideally on 25 December please.


In [7]:

# Show every named entity the stock pipeline recognised, together with its
# label (labels include ORG, DATE, PRODUCT, TIME, PERSON, GPE, ...).
for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

Musgrove Park Hospital ORG
Zimbabwe GPE
25 December DATE


In [8]:

# Custom named entities are added by extending the processing pipeline of a
# loaded Language object (such as nlp_model); the components in that pipeline
# decide which spans end up as entities.
# Before changing anything, print the stock pipeline for comparison.
print(nlp_model.pipeline) # list of (component name, component object) pairs

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x7fc81ddedec0>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x7fc81ddedde0>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x7fc81e0f5cd0>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x7fc81de35870>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x7fc81e1278c0>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x7fc81e0f59d0>)]


In [9]:

# Build a second, separately loaded pipeline so that nlp_model (and the doc
# created from it) stays untouched and this cell can be re-run safely.
nlp_model2 = spacy.load('en_core_web_sm')

# Rule-based patterns: each entry maps a phrase to a custom entity label.
patterns = [
    {'label': 'SERVICE', 'pattern': 'taxi'},
    {'label': 'DHOKOTA', 'pattern': 'doctor'},
]

# Insert the entity ruler ahead of the 'ner' component, so the rule-based
# spans are already set by the time the statistical recognizer runs.
custom_ruler = nlp_model2.add_pipe('entity_ruler', before='ner')
custom_ruler.add_patterns(patterns)

print(nlp_model2.pipeline)  # extended pipeline, now containing 'entity_ruler'

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x7fc81de17ec0>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x7fc81ea23f30>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x7fc81ddf9950>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x7fc81f9d7c80>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x7fc81f9e3050>), ('entity_ruler', <spacy.pipeline.entityruler.EntityRuler object at 0x7fc821417b40>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x7fc81ea94bd0>)]


In [10]:

# Process the same sample text with the extended pipeline and list the
# entities it yields, custom labels included.
doc2 = nlp_model2(corpus_text)
for entity in doc2.ents:
    print(f"{entity.text} {entity.label_}")

taxi SERVICE
doctor DHOKOTA
Musgrove Park Hospital ORG
Zimbabwe GPE
25 December DATE
