In [3]:
import spacy
nlp = spacy.load("en_core_web_md")

### What is POS Tagging?

In [4]:
# Explain can be used to report part of speech tag
spacy.explain("NNS")

'noun, plural'

In [5]:
doc = nlp("I saw flowers.")
token = doc[2]
token.text, token.tag_, spacy.explain(token.tag_)

('flowers', 'NNS', 'noun, plural')

In [6]:
# Example of POS tagging in action
doc = nlp("Alicia and me went to the school by bus.")
for token in doc:
    print(token.text, token.pos_, token.tag_, spacy.explain(token.pos_), spacy.explain(token.tag_))

Alicia PROPN NNP proper noun noun, proper singular
and CCONJ CC coordinating conjunction conjunction, coordinating
me PRON PRP pronoun pronoun, personal
went VERB VBD verb verb, past tense
to ADP IN adposition conjunction, subordinating or preposition
the DET DT determiner determiner
school NOUN NN noun noun, singular or mass
by ADP IN adposition conjunction, subordinating or preposition
bus NOUN NN noun noun, singular or mass
. PUNCT . punctuation punctuation mark, sentence closer


In [7]:
# Another example of POS tagging in action
doc = nlp("My friend will fly to New York fast and she is staying there for 3 days.")
for token in doc:
    print(token.text, token.pos_, token.tag_, spacy.explain(token.pos_),spacy.explain(token.tag_))

My PRON PRP$ pronoun pronoun, possessive
friend NOUN NN noun noun, singular or mass
will AUX MD auxiliary verb, modal auxiliary
fly VERB VB verb verb, base form
to ADP IN adposition conjunction, subordinating or preposition
New PROPN NNP proper noun noun, proper singular
York PROPN NNP proper noun noun, proper singular
fast ADV RB adverb adverb
and CCONJ CC coordinating conjunction conjunction, coordinating
she PRON PRP pronoun pronoun, personal
is AUX VBZ auxiliary verb, 3rd person singular present
staying VERB VBG verb verb, gerund or present participle
there ADV RB adverb adverb
for ADP IN adposition conjunction, subordinating or preposition
3 NUM CD numeral cardinal number
days NOUN NNS noun noun, plural
. PUNCT . punctuation punctuation mark, sentence closer


In [8]:
# One more example of POS tagging in action (with context dependent POS verb/noun)
doc = nlp("I will ship the package tomorrow")
for token in doc:
    print('EX 1: ', token.text, token.tag_, spacy.explain(token.tag_))

doc_2 = nlp("I saw a red ship.")
for token in doc_2:
    print('EX 2: ', token.text, token.tag_, spacy.explain(token.tag_))

EX 1:  I PRP pronoun, personal
EX 1:  will MD verb, modal auxiliary
EX 1:  ship VB verb, base form
EX 1:  the DT determiner
EX 1:  package NN noun, singular or mass
EX 1:  tomorrow NN noun, singular or mass
EX 2:  I PRP pronoun, personal
EX 2:  saw VBD verb, past tense
EX 2:  a DT determiner
EX 2:  red JJ adjective (English), other noun-modifier (Chinese)
EX 2:  ship NN noun, singular or mass
EX 2:  . . punctuation mark, sentence closer


In [9]:
# But what about tricky sentences?
doc = nlp("My cat will fish for a fish tomorrow in a fishy way.")
for token in doc:
    print(token.text, token.pos_, token.tag_, spacy.explain(token.pos_), spacy.explain(token.tag_))

My PRON PRP$ pronoun pronoun, possessive
cat NOUN NN noun noun, singular or mass
will AUX MD auxiliary verb, modal auxiliary
fish VERB VB verb verb, base form
for ADP IN adposition conjunction, subordinating or preposition
a DET DT determiner determiner
fish NOUN NN noun noun, singular or mass
tomorrow NOUN NN noun noun, singular or mass
in ADP IN adposition conjunction, subordinating or preposition
a DET DT determiner determiner
fishy ADJ JJ adjective adjective (English), other noun-modifier (Chinese)
way NOUN NN noun noun, singular or mass
. PUNCT . punctuation punctuation mark, sentence closer


### Verb Tense & Aspect

In [10]:
# Print words from docs if word POS is a present progressive verb or base/infinitive verb
sent1 = "I flew to Rome."
sent2 = "I'm flying to Rome."
sent3 = "I will fly to Rome."

doc1 = nlp(sent1)
doc2 = nlp(sent2)
doc3 = nlp(sent3)

for doc in [doc1, doc2, doc3]:
    print([(w.text, w.lemma_) for w in doc if w.tag_== 'VBG' or w.tag_ == 'VB'])

[]
[('flying', 'fly')]
[('fly', 'fly')]


### Number, symbol, and punctuation tags

In [11]:
doc = nlp("He earned $5.5 million in 2020 and paid %35 tax.")
for token in doc:
    print(token.text, token.tag_, spacy.explain(token.tag_))

He PRP pronoun, personal
earned VBD verb, past tense
$ $ symbol, currency
5.5 CD cardinal number
million CD cardinal number
in IN conjunction, subordinating or preposition
2020 CD cardinal number
and CC conjunction, coordinating
paid VBD verb, past tense
% NN noun, singular or mass
35 CD cardinal number
tax NN noun, singular or mass
. . punctuation mark, sentence closer


### Dependency Parsing

In [12]:
doc = nlp("blue flower")
for token in doc:
    print(token.text, token.dep_)

blue compound
flower ROOT


In [13]:
# ROOT token is only one without a parent
doc = nlp("I counted white sheep.")
for token in doc:
    print(token.text, token.pos, token.dep_)

I 95 nsubj
counted 100 ROOT
white 84 amod
sheep 92 dobj
. 97 punct


In [14]:
# Token.head can be used to identify dependency heads
doc = nlp("I counted white sheep.")
for token in doc:
    print(token.text, token.tag_, token.dep_, token.head)

I PRP nsubj counted
counted VBD ROOT counted
white JJ amod sheep
sheep NN dobj counted
. . punct counted


In [15]:
# Xcomp relation o a verb is a clause without its own subject (open complement)
doc = nlp("We are trying to understand the difference.")
for token in doc:
    print(token.text, token.tag_, token.dep_, token.head)

We PRP nsubj trying
are VBP aux trying
trying VBG ROOT trying
to TO aux understand
understand VB xcomp trying
the DT det difference
difference NN dobj understand
. . punct trying


In [16]:
# Observing relations within sentences with subsentences
doc = nlp("Queen Katherine, who was the mother of Mary Tudor, died at 1536.")
for token in doc:
    print(token.text, token.tag_, token.dep_, token.head)

Queen NNP compound Katherine
Katherine NNP nsubj died
, , punct Katherine
who WP nsubj was
was VBD relcl Katherine
the DT det mother
mother NN attr was
of IN prep mother
Mary NNP compound Tudor
Tudor NNP pobj of
, , punct Katherine
died VBD ROOT died
at IN prep died
1536 CD pobj at
. . punct died


### Named Entity Tagger (NER)
A named entity is a real world object that we can refer to by a proper name or quantity of interest. It can be a person, a place, an organization, a company, a product, dates, times, percentages, monetary amounts, a drug, or a disease name.

In [17]:
doc = nlp("The president Donald Trump visited France.")
doc.ents

(Donald Trump, France)

In [20]:
doc = nlp("He worked for NASA.")
token = doc[3]
token.ent_type, spacy.explain(token.ent_type_), spacy.explain("ORG")


(383,
 'Companies, agencies, institutions, etc.',
 'Companies, agencies, institutions, etc.')

In [23]:
doc = nlp("Albert Einstein was born in Ulm on 1879. He studied electronical engineering at ETH Zurich.")
print(doc.ents)
for token in doc:
    print(token.text, token.ent_type_, spacy.explain(token.ent_type_))

(Albert Einstein, Ulm, 1879, ETH Zurich)
Albert PERSON People, including fictional
Einstein PERSON People, including fictional
was  None
born  None
in  None
Ulm GPE Countries, cities, states
on  None
1879 DATE Absolute or relative dates or periods
.  None
He  None
studied  None
electronical  None
engineering  None
at  None
ETH ORG Companies, agencies, institutions, etc.
Zurich ORG Companies, agencies, institutions, etc.
.  None


In [24]:
doc = nlp("Jean-Michel Basquiat was an American artist of Haitian and Puerto Rican descent who gained fame with his graffiti and street art work")
print(doc.ents)
for ent in doc.ents:
    print(ent, ent.label_,spacy.explain(ent.label_))

(Jean-Michel Basquiat, American, Haitian, Puerto Rican)
Jean-Michel Basquiat PERSON People, including fictional
American NORP Nationalities or religious or political groups
Haitian GPE Countries, cities, states
Puerto Rican NORP Nationalities or religious or political groups


### Example application

In [25]:
from bs4 import BeautifulSoup
import requests
import spacy

def url_text(url_string):
    res = requests.get(url_string)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script","style", 'aside']):
        script.extract()
    text = soup.get_text()
    return " ".join(text.split())

ny_art = url_text("https://www.nytimes.com/2021/01/12/opinion/trump-america-allies.html")

nlp = spacy.load("en_core_web_md")
doc = nlp(ny_art)

FeatureNotFound: Couldn't find a tree builder with the features you requested: html5lib. Do you need to install a parser library?

In [None]:
from collections import Counter
labels = [ent.label_ for ent in doc.ents]
Counter(labels)

### Merging and splitting tokens

In [30]:
doc = nlp("She lived in New Hampshire.")
print(doc.ents)
print([(token.text, token.i) for token in doc])
print(len(doc))

(New Hampshire,)
[('She', 0), ('lived', 1), ('in', 2), ('New', 3), ('Hampshire', 4), ('.', 5)]
6


In [31]:
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[3:5], attrs={"LEMMA": "new hampshire"})

In [32]:
print(doc.ents)
print([(token.text, token.i) for token in doc])
print(len(doc))

(New Hampshire,)
[('She', 0), ('lived', 1), ('in', 2), ('New Hampshire', 3), ('.', 4)]
5


In [33]:
print([(token.lemma_) for token in doc])

['she', 'live', 'in', 'new hampshire', '.']


In [34]:
doc = nlp("She lived in NewHampshire")
print(len(doc))
print([(token.text, token.lemma_, token.i) for token in doc])

4
[('She', 'she', 0), ('lived', 'live', 1), ('in', 'in', 2), ('NewHampshire', 'NewHampshire', 3)]


In [36]:
with doc.retokenize() as retokenizer:
    heads = [(doc[3], 1), doc[2]]
    attrs = {"TAG": ["NNP", "NNP"],"DEP": ["compound","pobj"]}
    retokenizer.split(doc[3], ["New", "Hampshire"], heads=heads, attrs=attrs)

print([(token.text, token.lemma_, token.i) for token in doc])

ValueError: [E117] The newly split tokens must match the text of the original token. New orths: NewHampshire. Old text: New.