### Linguistic Features

In [None]:
!python -m spacy download en_core_web_md

### POS Tagging

In [2]:
import spacy

nlp = spacy.load('en_core_web_md')

doc = nlp("Alicia and me went to the school by bus")
for token in doc:
    print(token.text, token.pos_, token.tag_, spacy.explain(token.pos_), spacy.explain(token.tag_))

Alicia PROPN NNP proper noun noun, proper singular
and CCONJ CC coordinating conjunction conjunction, coordinating
me PRON PRP pronoun pronoun, personal
went VERB VBD verb verb, past tense
to ADP IN adposition conjunction, subordinating or preposition
the DET DT determiner determiner
school NOUN NN noun noun, singular or mass
by ADP IN adposition conjunction, subordinating or preposition
bus NOUN NN noun noun, singular or mass


In [3]:
doc = nlp("My friend will fly to New York fast and she is staying there for 3 days.")
for token in doc:
    print(token.text, token.pos_, token.tag_, spacy.explain(token.pos_), spacy.explain(token.tag_))

My PRON PRP$ pronoun pronoun, possessive
friend NOUN NN noun noun, singular or mass
will AUX MD auxiliary verb, modal auxiliary
fly VERB VB verb verb, base form
to ADP IN adposition conjunction, subordinating or preposition
New PROPN NNP proper noun noun, proper singular
York PROPN NNP proper noun noun, proper singular
fast ADV RB adverb adverb
and CCONJ CC coordinating conjunction conjunction, coordinating
she PRON PRP pronoun pronoun, personal
is AUX VBZ auxiliary verb, 3rd person singular present
staying VERB VBG verb verb, gerund or present participle
there ADV RB adverb adverb
for ADP IN adposition conjunction, subordinating or preposition
3 NUM CD numeral cardinal number
days NOUN NNS noun noun, plural
. PUNCT . punctuation punctuation mark, sentence closer


In [4]:
doc = nlp("My cat will fish for a fish tomorrow in a fishy way.")
for token in doc:
    print(token.text, token.pos_, token.tag_, spacy.explain(token.pos_), spacy.explain(token.tag_))

My PRON PRP$ pronoun pronoun, possessive
cat NOUN NN noun noun, singular or mass
will AUX MD auxiliary verb, modal auxiliary
fish VERB VB verb verb, base form
for ADP IN adposition conjunction, subordinating or preposition
a DET DT determiner determiner
fish NOUN NN noun noun, singular or mass
tomorrow NOUN NN noun noun, singular or mass
in ADP IN adposition conjunction, subordinating or preposition
a DET DT determiner determiner
fishy ADJ JJ adjective adjective (English), other noun-modifier (Chinese)
way NOUN NN noun noun, singular or mass
. PUNCT . punctuation punctuation mark, sentence closer


In [5]:
doc = nlp("He earned $5.5 million in 2020 and paid %35 tax.")
for token in doc:
    print(token.text, token.pos_, token.tag_, spacy.explain(token.pos_), spacy.explain(token.tag_))

He PRON PRP pronoun pronoun, personal
earned VERB VBD verb verb, past tense
$ SYM $ symbol symbol, currency
5.5 NUM CD numeral cardinal number
million NUM CD numeral cardinal number
in ADP IN adposition conjunction, subordinating or preposition
2020 NUM CD numeral cardinal number
and CCONJ CC coordinating conjunction conjunction, coordinating
paid VERB VBD verb verb, past tense
% NOUN NN noun noun, singular or mass
35 NUM CD numeral cardinal number
tax NOUN NN noun noun, singular or mass
. PUNCT . punctuation punctuation mark, sentence closer


### Dependency

In [6]:
doc = nlp("I counted white sheep.")
for token in doc:
    print(token.text, token.pos_, token.tag_, token.dep_, token.head)

I PRON PRP nsubj counted
counted VERB VBD ROOT counted
white ADJ JJ amod sheep
sheep NOUN NNS dobj counted
. PUNCT . punct counted


In [7]:
from spacy import displacy

displacy.render(doc, jupyter=True, style='dep')

### NER

In [8]:
doc = nlp("The president Donald Trump visited France.")
print(doc.ents)
print(type(doc.ents[1]))


(Donald Trump, France)
<class 'spacy.tokens.span.Span'>


In [9]:
print(spacy.explain("ORG"))

Companies, agencies, institutions, etc.


In [10]:
doc2 = nlp("He worked for NASA")
token = doc2[3]
print(token.text, token.ent_type_, spacy.explain(token.ent_type_))

NASA ORG Companies, agencies, institutions, etc.


In [11]:
doc3 = nlp("“Albert Einstein was born in Ulm on 1987. He studied electronical engineering at ETH Zurich.")
print(doc3.ents)

(Albert Einstein, Ulm, 1987, ETH Zurich)


In [12]:
for token in doc3:
    print(token.text, token.ent_type_, spacy.explain(token.ent_type_))

“  None
Albert PERSON People, including fictional
Einstein PERSON People, including fictional
was  None
born  None
in  None
Ulm GPE Countries, cities, states
on  None
1987 DATE Absolute or relative dates or periods
.  None
He  None
studied  None
electronical  None
engineering  None
at  None
ETH ORG Companies, agencies, institutions, etc.
Zurich ORG Companies, agencies, institutions, etc.
.  None


### Merging-Splitting-Merge

In [13]:
doc = nlp("She lived in New Hampshire.")
print(doc.ents)
print([(token.text, token.i) for token in doc])
print(len(doc))

(New Hampshire,)
[('She', 0), ('lived', 1), ('in', 2), ('New', 3), ('Hampshire', 4), ('.', 5)]
6


In [14]:
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[3:5], attrs={"LEMMA":"new hampshire"})

In [15]:
print(doc.ents)
print([(token.text, token.i) for token in doc])

(New Hampshire,)
[('She', 0), ('lived', 1), ('in', 2), ('New Hampshire', 3), ('.', 4)]


In [16]:
print(len(doc))
print([(token.lemma_) for token in doc])

5
['she', 'live', 'in', 'new hampshire', '.']


In [17]:
doc = nlp("She lived in NewHampshire.")
print(len(doc))
print([(token.text, token.lemma_, token.i) for token in doc])
for token in doc:
    print(token.text, token.pos_, token.tag_, token.dep_)

5
[('She', 'she', 0), ('lived', 'live', 1), ('in', 'in', 2), ('NewHampshire', 'NewHampshire', 3), ('.', '.', 4)]
She PRON PRP nsubj
lived VERB VBD ROOT
in ADP IN prep
NewHampshire PROPN NNP pobj
. PUNCT . punct


In [18]:
with doc.retokenize() as retokenizer:
    heads = [(doc[3], 1), doc[2]]
    attrs = {"TAG":["NNP", "NNP"], "DEP":["compound", "pobj"]}
    retokenizer.split(doc[3], ["New", "Hampshire"], heads=heads, attrs=attrs)

In [19]:
print(len(doc))
print([(token.text, token.lemma_, token.i) for token in doc])
for token in doc:
    print(token.text, token.pos_, token.tag_, token.dep_)

6
[('She', 'she', 0), ('lived', 'live', 1), ('in', 'in', 2), ('New', 'New', 3), ('Hampshire', 'Hampshire', 4), ('.', '.', 5)]
She PRON PRP nsubj
lived VERB VBD ROOT
in ADP IN prep
New PROPN NNP compound
Hampshire PUNCT NNP pobj
. PUNCT . punct


### Rule-Based Matching - Matcher Class

Matching a pattern

In [20]:
from spacy.matcher import Matcher

In [21]:
doc = nlp("Good morning, I want to reserve a ticket.") 

matcher = Matcher(nlp.vocab) # Matcher needs to be intialized with vocabulary object
pattern = [{"LOWER": "good"}, {"LOWER": "morning"}, {"IS_PUNCT": True}] 
matcher.add("morningGreeting", [pattern]) 

matches = matcher(doc) 
for match_id, start, end in matches: 
    m_span = doc[start:end]   
    print(start, end, m_span.text)

0 3 Good morning,


Matching two patterns

In [22]:
doc = nlp("Good morning, I want to reserve a ticket. I will then say good evening!") 
# Initialize the Matcher
matcher = Matcher(nlp.vocab)

# Define the patterns
pattern1 = [{"LOWER": "good"}, {"LOWER": "morning"}, {"IS_PUNCT": True}]
pattern2 = [{"LOWER": "good"}, {"LOWER": "evening"}, {"IS_PUNCT": True}]  

# name and add the patterns
matcher.add("morningGreeting", [pattern1]) 
matcher.add("eveningGreeting", [pattern2]) 

# get the matches
matches = matcher(doc) 

for match_id, start, end in matches: 
    pattern_name = nlp.vocab.strings[match_id]
    m_span = doc[start:end]   
    print(start, end, m_span.text) 

0 3 Good morning,
14 17 good evening!


While matching pattern ORTH and TEXT are similar to LOWER: they mean an exact match of the token text, including the case.

In [23]:
doc = nlp("I bought a pineapple.")

matcher = Matcher(nlp.vocab)

pattern = [{"LENGTH": 1}]

matcher.add("onlyShort",  [pattern])

matches = matcher(doc)

print("no.of matches:", len(matches))

for mid, start, end in matches:
     print(start, end, doc[start:end])

no.of matches: 3
0 1 I
2 3 a
4 5 .


The next block of token attributes is IS_ALPHA, IS_ASCII, and IS_DIGIT. These features are handy for finding number tokens and ordinary words (which do not include any interesting characters). The following pattern matches a sequence of two tokens, a number followed by an ordinary word:

In [24]:
doc1 = nlp("I met him at 2 o'clock.")

matcher = Matcher(nlp.vocab)

pattern = [{"IS_DIGIT": True},{"IS_ALPHA": True}]

matcher.add("numberAndPlainWord",  [pattern])

matches = matcher(doc1)

print(len(matches))

for mid, start, end in matches:
     print(start, end, doc1[start:end])

0


In [25]:
doc2 = nlp("He brought me 2 apples.")

matcher = Matcher(nlp.vocab)

pattern = [{"IS_DIGIT": True},{"IS_ALPHA": True}]

matcher.add("numberAndPlainWord",  [pattern])

matches = matcher(doc2)

print(len(matches))

for mid, start, end in matches:
     print(start, end, doc2[start:end])

1
3 5 2 apples


In the preceding code segment, 2 o'clock didn't match the pattern because o'clock contains an apostrophe, which is not an alphabetic character (alphabetic characters are digits, letters, and the underscore character). 2 apples matched because the token apples consists of letters.

In [26]:
doc = nlp("Take me out of your SPAM list. We never asked you to contact me. If you write again we'll SUE!!!!")

matcher = Matcher(nlp.vocab)

pattern = [{"IS_UPPER": True}]

matcher.add("capitals",  [pattern])

matches = matcher(doc)

for mid, start, end in matches:
     print(start, end, doc[start:end])

5 6 SPAM
22 23 SUE


In [27]:
doc1 = nlp("Can you swim?")

matcher = Matcher(nlp.vocab)

# In here we put two attributes in one  brace
pattern = [{"IS_SENT_START": True, "LOWER": "can"}, {"IS_TITLE": True}]

matcher.add("canThenCapitalized",  [pattern])

matches = matcher(doc1)

for mid, start, end in matches:
     print(start, end, doc1[start:end])

In [28]:
doc2 = nlp("Can Sally swim?")

matcher = Matcher(nlp.vocab)

pattern = [{"IS_SENT_START": True, "LOWER": "can"}, {"IS_TITLE": True}]

matcher.add("canThenCapitalized",  [pattern])

matches = matcher(doc2)

for mid, start, end in matches:
     print(start, end, doc2[start:end])

0 2 Can Sally


LIKE_NUM, LIKE_URL, and LIKE_EMAIL are attributes that are related to token shape.

After seeing the shape attributes, let's see the POS, TAG, DEP, LEMMA, and SHAPE linguistic attributes

In [29]:
doc = nlp("Will you go there?")

matcher = Matcher(nlp.vocab)

pattern = [{"IS_SENT_START": True, "TAG": "MD"}]

matcher.add("sentStart",[pattern])

matches = matcher(doc)

for mid, start, end in matches:
     print(start, end, doc[start:end])

0 1 Will


Extended syntax support - IN, NOT_IN, IS_SUBSET, IS_SUPERSET, INTESECTS and comparison operators.

In [30]:
doc = nlp("Good morning, I'm here. I'll say good evening!!")

matcher = Matcher(nlp.vocab)

pattern = [{"LOWER": "good"},{"LOWER": {"IN": ["morning", "evening"]}},{"IS_PUNCT": True}]

matcher.add("greetings",  [pattern])

matches = matcher(doc)

for mid, start, end in matches:
     print(start, end, doc[start:end])

0 3 Good morning,
10 13 good evening!


In [31]:
doc = nlp("I suffered from Trichotillomania when I was in college. The doctor prescribed me Psychosomatic medicine.")

matcher = Matcher(nlp.vocab)

pattern = [{"LENGTH": {">=" : 10}}]

matcher.add("longWords",  [pattern])

matches = matcher(doc)

for mid, start, end in matches:
     print(start, end, doc[start:end])

3 4 Trichotillomania
12 13 prescribed
14 15 Psychosomatic


Regex-like operators - OP
```
    !	Negate the pattern, by requiring it to match exactly 0 times.
    ?	Make the pattern optional, by allowing it to match 0 or 1 times.
    +	Require the pattern to match 1 or more times.
    *	Allow the pattern to match 0 or more times.
```

In [32]:
doc1 = nlp("Barack Obama visited France.")
doc2 = nlp("Barack Hussein Obama visited France.")

matcher = Matcher(nlp.vocab)

pattern = [{"LOWER": "barack"}, {"LOWER": "hussein", "OP": "?"},{"LOWER": "obama"}]

matcher.add("obamaNames",  [pattern])

print(matcher(doc1))
print(matcher(doc2))

[(9957319642918298529, 0, 2)]
[(9957319642918298529, 0, 3)]


In [33]:
doc1 = nlp("Hello hello hello, how are you?")
doc2 = nlp("Hello, how are you?")
doc3 = nlp("How are you?")

matcher = Matcher(nlp.vocab)

pattern = [{"LOWER": {"IN": ["hello", "hi", "hallo"]}, "OP": "*"}, {"IS_PUNCT": True}]

matcher.add("greetings",  [pattern])

print("**************")

for mid, start, end in matcher(doc1):
     print(start, end, doc1[start:end])

print("**************")

for mid, start, end in matcher(doc2):
     print(start, end, doc1[start:end])

print("**************")

for mid, start, end in matcher(doc3):
     print(start, end, doc1[start:end])

print("**************")

**************
0 4 Hello hello hello,
1 4 hello hello,
2 4 hello,
3 4 ,
7 8 ?
**************
0 2 Hello hello
1 2 hello
5 6 are
**************
3 4 ,
**************


In [34]:
doc1 = nlp("Hello hello hello, how are you?")
doc2 = nlp("Hello, how are you?")
doc3 = nlp("How are you?")

matcher = Matcher(nlp.vocab)

pattern = [{"LOWER": {"IN": ["hello", "hi", "hallo"]}, "OP": "+"}, {"IS_PUNCT": True}]

matcher.add("greetings", [pattern])

print("**************")

for mid, start, end in matcher(doc1):
     print(start, end, doc1[start:end])

print("**************")

for mid, start, end in matcher(doc2):
     print(start, end, doc1[start:end])

print("**************")

for mid, start, end in matcher(doc3):
     print(start, end, doc1[start:end])

print("**************")

**************
2 4 hello,
1 4 hello hello,
0 4 Hello hello hello,
**************
0 2 Hello hello
**************
**************


Regex support - spaCy Matcher offers full support for token-level regex matching

In [35]:
doc1 = nlp("I travelled by bus.") 
doc2 = nlp("She traveled by bike.") 

matcher = Matcher(nlp.vocab)

pattern = [{"POS": "PRON"}, {"TEXT": {"REGEX": "[Tt]ravell?ed"}}] 

matcher.add("travelRegex", [pattern]) 

for mid, start, end in matcher(doc1): 
    print(start, end, doc1[start:end]) 

for mid, start, end in matcher(doc2): 
    print(start, end, doc2[start:end]) 

0 2 I travelled
0 2 She traveled


In [36]:
doc = nlp("I went to Italy; he has been there too. His mother also has told me she wants to visit Rome.")

matcher = Matcher(nlp.vocab)

# Using regex with POS tags
pattern = [{"TAG": {"REGEX": "^V"}}]

matcher.add("verbs",  [pattern])

for mid, start, end in matcher(doc):
    print(start, end, doc1[start:end])

1 2 travelled
6 7 
7 8 
14 15 
15 16 
18 19 
20 21 


We have extracted all the finite verbs (you can think of a finite verb as a non-modal verb). How did we do it? Our token pattern includes the regex ^V, which means all fine-grained POS tags that start with V: VB, VGD, VBG, VBN, VBP, and VBZ. Then we extracted tokens with verbal POS tags.

Wild Card Matching

In [37]:
doc = nlp("My name is Alice and his name was Elliot.")

matcher = Matcher(nlp.vocab)

pattern = [{"LOWER": "name"},{"LEMMA": "be"},{}]

matcher.add("pickName", [pattern])

for mid, start, end in matcher(doc):
     print(start, end, doc[start:end])

1 4 name is Alice
6 9 name was Elliot


In [38]:
doc1 = nlp("I forwarded his email to you.")
doc2 = nlp("I forwarded an email to you.")
doc3 = nlp("I forwarded the email to you.")

matcher = Matcher(nlp.vocab)

pattern = [{"LEMMA": "forward"}, {}, {"LOWER": "email"}]

matcher.add("forwardMail",  [pattern])

print("****************************")

for mid, start, end in matcher(doc1):
     print(start, end, doc1[start:end])

print("****************************")

for mid, start, end in matcher(doc2):
     print(start, end, doc2[start:end])

print("****************************")

for mid, start, end in matcher(doc3):
    print(start, end, doc3[start:end])

print("****************************")

****************************
1 4 forwarded his email
****************************
1 4 forwarded an email
****************************
1 4 forwarded the email
****************************


To check regex and Matcher these sites are useful:

https://regex101.com/  

https://explosion.ai/demos/matcher

Phrase Matcher

In [39]:
from spacy.matcher import PhraseMatcher

In [40]:
doc = nlp("3 EU leaders met in Berlin. German chancellor Angela Merkel first welcomed the US president Donald Trump. The following day Alexis Tsipras joined them in Brandenburg.")

matcher = PhraseMatcher(nlp.vocab) 

terms = ["Angela Merkel", "Donald Trump", "Alexis Tsipras"] 
patterns = [nlp.make_doc(term) for term in terms] 
# make_doc() creates a Doc from every term, and it's quite efficient in terms
# of processing because instead of the whole pipeline, it only calls the Tokenizer
matcher.add("politiciansList", None, *patterns) 

matches = matcher(doc) 

for mid, start, end in matches: 
    print(start, end, doc[start:end]) 

9 11 Angela Merkel
16 18 Donald Trump
22 24 Alexis Tsipras


Example of matching by the LOWER attribute

In [41]:
doc = nlp("During the last decade, derivatives market became an asset class of their own and influenced the financial landscape strongly.")

matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

terms = ["Asset", "Investment", "Derivatives", "Demand",  "Market"]
patterns = [nlp.make_doc(term) for term in terms]
matcher.add("financeTerms", None, *patterns)

matches = matcher(doc)

for mid, start, end in matches:
    print(start, end, doc[start:end])

5 6 derivatives
6 7 market
9 10 asset


Example of matching by the SHAPE attribute

In [42]:
doc = nlp("This log contains the following IP addresses: 192.1.1.1 and 192.12.1.1 and 192.160.1.1 .")

matcher = PhraseMatcher(nlp.vocab, attr="SHAPE")

ip_nums = ["127.0.0.0", "127.256.0.0"]
patterns = [nlp.make_doc(ip) for ip in ip_nums]
matcher.add("IPNums", None, *patterns)

for mid, start, end in matcher(doc):
    print(start, end, doc[start:end])

8 9 192.1.1.1
12 13 192.160.1.1


### Entity Ruler

In [43]:
doc = nlp("Bill Gates visited Berlin.")

matcher = Matcher(nlp.vocab)

pattern = [{"ENT_TYPE": "PERSON"}]
matcher.add("personEnt",  [pattern])

matches = matcher(doc)

for mid, start, end in matches:
    print(start, end, doc[start:end])

0 1 Bill
1 2 Gates


In [44]:
doc = nlp("Today German chancellor Angela Merkel met with the US president.")

matcher = Matcher(nlp.vocab)

pattern = [{"ENT_TYPE": "PERSON", "OP": "+"}, {"POS" : "VERB"}]
matcher.add("personEntAction",  [pattern])

matches = matcher(doc)

for mid, start, end in matches:
    print(start, end, doc[start:end])

4 6 Merkel met
3 6 Angela Merkel met


spaCy's EntityRuler is the component that allows us to add rules on top of the statistical model and creates an even more powerful NER model.

EntityRuler is not a matcher, it's a pipeline component that we can add to our pipeline via nlp.add_pipe. When it finds a match, the match is appended to doc.ents and ent_type will be the label we pass in the pattern

In [45]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [46]:
doc = nlp("I have an acccount with chime since 2017")

for ent in doc.ents:
  print(ent.text, ent.label_)


2017 DATE


In [47]:
ruler = nlp.add_pipe('entity_ruler')

In [48]:
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

In [49]:
patterns = [{"label": "ORG", "pattern": [{"LOWER": "chime"}]}]

In [50]:
ruler.add_patterns(patterns)

In [51]:
doc2 = nlp("I have an acccount with chime since 2017")
print(doc2.ents)
print(doc2[5].ent_type_)

(chime, 2017)
ORG


### Combining spaCy models and matchers
Extracting IBAN and account numbers

In [52]:
doc = nlp("My IBAN number is BE71 0961 2345 6769, please send the money there.")
doc1 = nlp("My IBAN number is FR76 3000 6000 0112 3456 7890 189, please send the money there.")

matcher = Matcher(nlp.vocab)

pattern = [{"SHAPE": "XXdd"}, {"TEXT": {"REGEX": "\d{1,4}"}, "OP":"+"}]
matcher.add("ibanNum", [pattern])

for mid, start, end in matcher(doc):
    print(start, end, doc[start:end])

print("**************************************")

for mid, start, end in matcher(doc1):
    print(start, end, doc1[start:end])

4 6 BE71 0961
4 7 BE71 0961 2345
4 8 BE71 0961 2345 6769
**************************************
4 6 FR76 3000
4 7 FR76 3000 6000
4 8 FR76 3000 6000 0112
4 9 FR76 3000 6000 0112 3456
4 10 FR76 3000 6000 0112 3456 7890
4 11 FR76 3000 6000 0112 3456 7890 189


In [53]:
doc = nlp("My account number is 8921273.")

matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "account"},{"LOWER": {"IN": ["num", "number"]}},{},{"IS_DIGIT": True}]
matcher.add("accountNum", [pattern])

for mid, start, end in matcher(doc):
    print(start, end, doc[start:end])

1 5 account number is 8921273


Extracting phone number

In [54]:
doc1 = nlp("You can call my office on +1 (221) 102-2423 or email me directly.")
doc2 = nlp("You can call me on (221) 102 2423 or text me.")

matcher = Matcher(nlp.vocab)
pattern = [{"TEXT": "+1", "OP": "?"}, {"TEXT": "("}, {"SHAPE": "ddd"}, {"TEXT": ")"}, {"SHAPE": "ddd"}, {"TEXT": "-", "OP": "?"}, {"SHAPE": "dddd"}]
matcher.add("usPhonNum", [pattern])

for mid, start, end in matcher(doc1):
    print(start, end, doc1[start:end])

print("**************************************")

for mid, start, end in matcher(doc2):
    print(start, end, doc2[start:end])

6 13 +1 (221) 102-2423
7 13 (221) 102-2423
**************************************
5 10 (221) 102 2423




Extracting mentions  
```
pattern = [{"ENT_TYPE": "ORG"}, {"LEMMA": "be"}, {"POS": "ADV", "OP":"*" {"POS": "ADJ"}]
```
Hashtag and emoji extraction

Expanding named entities

In [55]:
doc = nlp("Ms. Smith left her house 2 hours ago.")
doc.ents

(Smith, 2 hours ago)

In [56]:
doc = nlp("Ms. Smith left her house")

patterns = [{"label": "TITLE", "pattern": [{"LOWER": {"IN": ["ms.", "mr.", "mrs.", "prof.", "dr."]}}]}]
ruler.add_patterns(patterns)

print([(ent.text, ent.label_) for ent in doc.ents])

[('Smith', 'PERSON')]


Combining linguistic features and named entities

In [57]:
doc = nlp("Einstein lived in Zurich.")
print([(ent.text, ent.label_) for ent in doc.ents])


[('Einstein', 'PERSON'), ('Zurich', 'GPE')]


In [58]:
person_ents = [ent for ent in doc.ents if ent.label_ == "PERSON"]

for person_ent in person_ents:
    # We use head of the entity's last token
    head = person_ent[-1].head
    if head.lemma_ == "live":
    #Check if the children of live contains prepositional attachment
        preps = [token for token in head.children if token.dep_ == "prep"]
    
    for prep in preps:
        places = [token for token in prep.children if token.ent_type_ == "GPE"]
        # Verb is in past or present tense
        print({'person': person_ent, 'city': places, 'past': head.tag_ == "VBD"})

{'person': Einstein, 'city': [Zurich], 'past': True}
