# NLP 2

In [5]:
## Lemmatization

In [1]:
import spacy

In [2]:
# Load the 'en_core_web_sm' pipeline once; every later cell reuses this `nlp` object.
nlp = spacy.load('en_core_web_sm')

In [3]:
doc1 = nlp(u'I am a runner running in a race because I love to run since I ran today.')

In [4]:
# One row per token: text, part-of-speech tag, lemma hash, lemma string.
# sep=' \t ' reproduces the original "value <space> TAB <space> value" layout.
for tok in doc1:
    print(tok.text, tok.pos_, tok.lemma, tok.lemma_, sep=' \t ')

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today
. 	 PUNCT 	 12646065887601541794 	 .


In [6]:
## Stop words

In [8]:
print(nlp.Defaults.stop_words)

{'on', 'it', 'and', 'the', 'out', 'amongst', 'at', 'either', 'why', 'just', 'not', 'sometimes', 'thence', 'were', 'this', 'we', 'together', 'back', 'really', 'before', 'ten', 'must', 'using', 'except', 'towards', 'eight', 'namely', 'to', 'me', 'somewhere', 'twelve', 'meanwhile', 'move', 'noone', 'anyhow', 'how', 'those', 'without', 'against', 'ourselves', '’m', 'get', 'seems', 'whereas', 'a', 'any', 'he', 'that', 'what', 'else', 'front', 'with', 'may', 'was', '’ll', 'every', 'or', 'next', 'thru', 'under', 'sixty', 'doing', 'beyond', 'can', 'become', 'even', 'down', 'above', 'his', 'put', 'around', 'another', 'already', 'formerly', 'make', "'d", 'although', "'ll", 'through', 'hers', 'wherein', 'did', "'m", '’ve', 'itself', 'anyone', 'amount', 'herein', 'for', 'is', 'there', '‘m', 'n’t', 'than', 'along', 'own', 'three', 'beside', 'wherever', 'somehow', 'besides', 'four', '‘s', 'him', 'such', 'serious', 'various', 'so', 'n‘t', 'becoming', 'whenever', 'whither', 'an', 'due', 'hundred', 'no

In [9]:
len(nlp.Defaults.stop_words)

326

In [13]:
nlp.vocab['omg'].is_stop

False

In [14]:
# Add a stop word
# This only extends the language defaults' stop-word set; the is_stop flag on the
# vocab entry for 'btw' is set separately in the next cell.
nlp.Defaults.stop_words.add('btw')

In [15]:
nlp.vocab['btw'].is_stop = True

In [16]:
len(nlp.Defaults.stop_words)

327

In [17]:
nlp.vocab['btw'].is_stop

True

In [18]:
# Remove a stop word from the defaults.
# discard() rather than remove(): once 'beyond' is gone, re-running this cell
# with remove() raises KeyError, whereas discard() is a no-op.
nlp.Defaults.stop_words.discard('beyond')

In [19]:
nlp.vocab['beyond'].is_stop = False

In [20]:
nlp.vocab['beyond'].is_stop

False

In [21]:
## Phrase Matching and Vocabulary

In [22]:
# Import Matcher Library
from spacy.matcher import Matcher

In [23]:
matcher = Matcher(nlp.vocab)

In [25]:
# Token patterns covering three spellings of the same term:
# SolarPower   -> a single token
# Solar-power  -> word, punctuation token, word
# Solar power  -> two consecutive words
# 'LOWER' is compared against the lowercased token text, so the spellings above
# match regardless of capitalisation.
pattern1 = [{'LOWER':'solarpower'}]
pattern2 = [{'LOWER':'solar'}, {'IS_PUNCT': True}, {'LOWER':'power'}]
pattern3 = [{'LOWER':'solar'}, {'LOWER':'power'}]

In [26]:
# Register all three patterns under one rule name.
# The patterns are passed as a single list: this signature is accepted from
# spaCy v2.2.2 onward and is the only one spaCy v3 supports. The previous
# add(key, None, *patterns) form fails on v3.
matcher.add('SolarPower', [pattern1, pattern2, pattern3])

In [28]:
doc = nlp(u'The Solar Power industry continues to grow as solarpower increases. Solar-power is amazing!')

In [29]:
found_matches = matcher(doc)

In [30]:
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]


In [31]:
matcher.remove('SolarPower')

In [33]:
# solarpower SolarPower
# pattern1: the one-token spelling.
# pattern2: 'solar' and 'power' separated by zero or more punctuation tokens
# ('OP': '*'), so it covers "solar power", "solar-power" and "solar--power".
pattern1 = [{'LOWER':'solarpower'}]
pattern2 = [{'LOWER':'solar'}, {'IS_PUNCT':True, 'OP':'*'}, {'LOWER':'power'}]

In [34]:
# Re-register the rule with the two updated patterns.
# Patterns go in as one list (accepted from spaCy v2.2.2, required by v3);
# the add(key, None, *patterns) form fails on v3.
matcher.add('SolarPower', [pattern1, pattern2])

In [35]:
doc2 = nlp(u'Solar--power is solarpower yay!')

In [36]:
found_matches = matcher(doc2)

In [37]:
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]


In [38]:
from spacy.matcher import PhraseMatcher

In [39]:
matcher = PhraseMatcher(nlp.vocab)

In [41]:
# Read the whole article first, then parse it once the file handle is closed.
with open('./TextFiles/reaganomics.txt', encoding="ISO-8859-1") as text_file:
    article_text = text_file.read()
doc3 = nlp(article_text)

In [42]:
# list of phrases
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

In [43]:
# Turn each phrase into a Doc to use as a PhraseMatcher pattern.
# The matcher above was created with default settings, which compare token text
# only, so tokenizing with nlp.make_doc is sufficient; running the tagger,
# parser and NER on every phrase is unnecessary work.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

In [44]:
# Register every phrase Doc under one rule name.
# The Docs are passed as a single list (accepted from spaCy v2.2.2, required
# by v3); the add(key, None, *docs) form fails on v3.
matcher.add('EconMatcher', phrase_patterns)

In [45]:
found_matches = matcher(doc3)

In [46]:
found_matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 49, 53),
 (3680293220734633682, 54, 56),
 (3680293220734633682, 61, 65),
 (3680293220734633682, 673, 677),
 (3680293220734633682, 2986, 2990)]

In [48]:
# Each match is (rule hash, start token index, end token index).
for rule_hash, begin, stop in found_matches:
    rule_name = nlp.vocab.strings[rule_hash]   # hash -> rule name
    matched_text = doc3[begin:stop].text       # text of the matched token span
    print(rule_hash, rule_name, begin, stop, matched_text)

3680293220734633682 EconMatcher 41 45 supply-side economics
3680293220734633682 EconMatcher 49 53 trickle-down economics
3680293220734633682 EconMatcher 54 56 voodoo economics
3680293220734633682 EconMatcher 61 65 free-market economics
3680293220734633682 EconMatcher 673 677 supply-side economics
3680293220734633682 EconMatcher 2986 2990 trickle-down economics


### Part of Speech Tagging

In [53]:
# Create a doc
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [54]:
print(doc.text)

The quick brown fox jumped over the lazy dog's back.


In [55]:
print(doc[4].tag_)

VBD


In [56]:
print(doc[4].pos_)

VERB


In [58]:
# Fixed-width table: token text, coarse POS, fine-grained tag, tag description.
for tok in doc:
    tag_meaning = spacy.explain(tok.tag_)
    print(f"{tok.text:10} {tok.pos_:10} {tok.tag_:10} {tag_meaning}")

The        DET        DT         determiner
quick      ADJ        JJ         adjective
brown      ADJ        JJ         adjective
fox        PROPN      NNP        noun, proper singular
jumped     VERB       VBD        verb, past tense
over       ADP        IN         conjunction, subordinating or preposition
the        DET        DT         determiner
lazy       ADJ        JJ         adjective
dog        NOUN       NN         noun, singular or mass
's         PART       POS        possessive ending
back       NOUN       NN         noun, singular or mass
.          PUNCT      .          punctuation mark, sentence closer


In [59]:
doc = nlp(u"I read books on NLP.")

In [60]:
word = doc[1]

In [61]:
word.text

'read'

In [63]:
# Same fixed-width row as the table above, for the single token held in `word`.
token = word
tag_meaning = spacy.explain(token.tag_)
print(f"{token.text:10} {token.pos_:10} {token.tag_:10} {tag_meaning}")

read       VERB       VBD        verb, past tense


In [64]:
doc = nlp(u"I read a book on NLP.")

In [65]:
# Take the second token of the new sentence and print its POS row.
token = word = doc[1]
tag_meaning = spacy.explain(token.tag_)
print(f"{token.text:10} {token.pos_:10} {token.tag_:10} {tag_meaning}")

read       VERB       VBD        verb, past tense


In [66]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [68]:
# Count the parts of speech
# count_by returns a dict mapping the integer ID of each POS value to how many
# tokens in `doc` carry it; the IDs can be turned back into names through doc.vocab.
POS_counts = doc.count_by(spacy.attrs.POS)

In [69]:
POS_counts

{90: 2, 84: 3, 96: 1, 100: 1, 85: 1, 92: 2, 94: 1, 97: 1}

In [70]:
# look up numerical identifier
doc.vocab[84].text

'ADJ'

In [73]:
doc[2].pos

84

In [79]:
# Frequency of each dependency label in `doc`, keyed by the label's integer ID.
DEP_counts = doc.count_by(spacy.attrs.DEP)

# Print in ascending ID order: "<id>. <label name> <count>".
for dep_id in sorted(DEP_counts):
    dep_name = doc.vocab[dep_id].text
    print(f"{dep_id}. {dep_name:5} {DEP_counts[dep_id]}")

402. amod  3
415. det   2
429. nsubj 1
439. pobj  1
440. poss  1
443. prep  1
445. punct 1
8110129090154140942. case  1
8206900633647566924. ROOT  1


In [77]:
len(doc.vocab)

1973

### Visualizing parts of speech and syntactic dependencies

In [80]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [81]:
from spacy import displacy

In [82]:
displacy.render(doc, style='dep', jupyter = True)

In [83]:
# displaCy rendering options for the dependency visualizer.
# 'compact' is a boolean switch: the string 'True' only worked because any
# non-empty string is truthy (the string 'False' would have enabled it too).
options = {'distance': 100, 'compact': True, 'color': 'yellow', 'bg': '#09a3d5', 'font': 'Times'}

In [85]:
displacy.render(doc, style='dep',options = options, jupyter = True)

In [87]:
doc2 = nlp(u"This is a sentence. This is another sentence. This is another sentence, possibly longer.")

In [88]:
spans = list(doc2.sents)

In [90]:
# NOTE(review): unlike displacy.render, this starts a web server (see the
# "Serving on ..." output below) instead of drawing inline. The cell presumably
# stays busy until the server is stopped — confirm before using Run All.
displacy.serve(spans, style='dep', options={'distance': 110})




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [27/Mar/2021 23:28:03] "GET / HTTP/1.1" 200 11215
127.0.0.1 - - [27/Mar/2021 23:28:08] "GET /favicon.ico HTTP/1.1" 200 11215


Shutting down server on port 5000.


### Named Entity Recognition

In [97]:
# Write a function to display basic entity info:
def show_ents(doc):
    """Print one line per named entity in ``doc``.

    Each line has the form ``<text> - <label> - <label explanation>``. When
    ``doc.ents`` is empty, prints 'No named entities found.' instead.
    """
    if not doc.ents:
        print('No named entities found.')
        return
    for entity in doc.ents:
        # str() because the explanation lookup may not return a string.
        description = str(spacy.explain(entity.label_))
        print(' - '.join((entity.text, entity.label_, description)))

In [98]:
doc = nlp(u"Hi how are you?")

In [99]:
show_ents(doc)

No named entities found.


In [101]:
doc = nlp(u"May I go to Washington D.C., next May to see the Washington monument.")
show_ents(doc)

Washington D.C. - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
Washington - GPE - Countries, cities, states


In [102]:
ORG = doc.vocab.strings[u"ORG"]

In [103]:
ORG

383

In [105]:
from spacy.tokens import Span

In [107]:
# Label the first token (doc[0:1]) as an ORG entity.
new_ent = Span(doc, 0, 1, label=ORG)
# doc.ents must be reassigned as a whole: existing entities plus the new span.
doc.ents = [*doc.ents, new_ent]

In [108]:
show_ents(doc)

May - ORG - Companies, agencies, institutions, etc.
Washington D.C. - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
Washington - GPE - Countries, cities, states


In [110]:
doc = nlp(u'Our company created a brand new vacuum cleaner. '
          u'This new vacuum-cleaner is the best in show.')

show_ents(doc)

No named entities found.


In [111]:
from spacy.matcher import PhraseMatcher

In [112]:
matcher = PhraseMatcher(nlp.vocab)

In [113]:
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']

In [114]:
# Turn each phrase into a Doc to use as a PhraseMatcher pattern.
# The matcher was created with default settings, which compare token text only,
# so nlp.make_doc (tokenizer only) is enough; the full pipeline is unnecessary here.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

In [115]:
# Register both spellings under one rule name.
# The Docs are passed as a single list (accepted from spaCy v2.2.2, required
# by v3); the add(key, None, *docs) form fails on v3.
matcher.add('newproduct', phrase_patterns)

In [117]:
found_matches = matcher(doc)

In [118]:
from spacy.tokens import Span

In [120]:
PROD = doc.vocab.strings[u"PRODUCT"]

In [121]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [122]:
# One PRODUCT-labelled Span per match; each match is (rule hash, start, end).
new_ents = [
    Span(doc, start, end, label=PROD)
    for _rule_hash, start, end in found_matches
]

In [123]:
# Reassign doc.ents as the existing entities followed by the new PRODUCT spans.
doc.ents = [*doc.ents, *new_ents]

In [124]:
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [125]:
doc = nlp(u"Originally I paid $29.95 for this car toy, but now it is marked down by $10")

In [126]:
[ent for ent in doc.ents if ent.label_ == "MONEY"]

[29.95, 10]

In [127]:
# Number of entities labelled MONEY, counted without building an intermediate list.
sum(1 for ent in doc.ents if ent.label_ == "MONEY")

2

In [128]:
## Visualizing NER

In [133]:
# Two sentences joined by implicit string concatenation. The first literal must
# end with a space: without it the text reads "...$6 million.By contrast, ..."
# and the two sentences are fused into one stretch of text.
doc = nlp(u"Over the last quarter, Apple sold nearly 20 thousand Ipods for a profit of $6 million. "
          u"By contrast, Sony only sold 8 thousand Walkman music players")

In [134]:
displacy.render(doc, style = 'ent', jupyter = True)

In [135]:
# Render entities sentence by sentence; each sentence's text is re-parsed as its own Doc.
for sentence in doc.sents:
    sentence_doc = nlp(sentence.text)
    displacy.render(sentence_doc, style='ent', jupyter=True)

In [136]:
options = {'ents':["PRODUCT", "ORG"]}

In [137]:
displacy.render(doc, style = 'ent', jupyter = True, options = options)

In [138]:
colors = {'ORG':'red'}
options = {'ents':["PRODUCT", "ORG"], 'colors':colors}

In [139]:
displacy.render(doc, style = 'ent', jupyter = True, options = options)

In [None]:
# NOTE(review): this starts a web server (see the "Serving on ..." output below)
# instead of rendering inline like the displacy.render cells above. The cell
# presumably stays busy until the server is stopped — confirm before using Run All.
displacy.serve(doc, style='ent', options=options)




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [28/Mar/2021 00:07:10] "GET / HTTP/1.1" 200 1545
127.0.0.1 - - [28/Mar/2021 00:07:11] "GET /favicon.ico HTTP/1.1" 200 1545


In [None]:
doc = nlp(u'This is the first sentence. This is another sentence. This is the last sentence')