In [35]:
pip install -U spacy

Note: you may need to restart the kernel to use updated packages.


In [2]:
import spacy 

In [3]:
import en_core_web_sm

In [31]:
!python3 -m spacy download en

[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
Collecting en-core-web-sm==3.5.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


Part 1: Understanding spaCy

In [5]:
# Load the small English pipeline once; the following cells reuse `nlp`.
nlp = spacy.load('en_core_web_sm')

# Parse a sample sentence into a Doc of tokens.
doc = nlp(
    'Mark Zuckerberg born May 14, 1984 in New York is an American technology entrepreneur and philanthropist best known for co-founding and leading Facebook as its chairman and CEO.'
)

# One output row per token, columns in this order:
# text, coarse POS, lemma, fine-grained tag, dependency label, shape,
# is-alphabetic flag, is-stop-word flag.
for token in doc:
    print(
        token.text,
        token.pos_,
        token.lemma_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
    )

Mark PROPN Mark NNP compound Xxxx True False
Zuckerberg PROPN Zuckerberg NNP nsubj Xxxxx True False
born VERB bear VBN csubj xxxx True False
May PROPN May NNP npadvmod Xxx True True
14 NUM 14 CD nummod dd False False
, PUNCT , , punct , False False
1984 NUM 1984 CD nummod dddd False False
in ADP in IN prep xx True True
New PROPN New NNP compound Xxx True False
York PROPN York NNP pobj Xxxx True False
is AUX be VBZ ROOT xx True True
an DET an DT det xx True True
American ADJ american JJ amod Xxxxx True False
technology NOUN technology NN compound xxxx True False
entrepreneur NOUN entrepreneur NN attr xxxx True False
and CCONJ and CC cc xxx True True
philanthropist NOUN philanthropist NN conj xxxx True False
best ADV well RBS advmod xxxx True False
known VERB know VBN acl xxxx True False
for ADP for IN prep xxx True True
co NOUN co NN pobj xx True False
- ADJ - JJ pobj - False False
founding ADJ founding JJ pobj xxxx True False
and CCONJ and CC cc xxx True True
leading VERB lead VBG conj

Stemming and lemmatization:

In [6]:
# Lemmatization: show every token's surface form next to the lemma spaCy
# assigned to it (e.g. "born" -> "bear", "is" -> "be").
for token in doc:
    surface, lemma = token.text, token.lemma_
    print(surface, lemma)

Mark Mark
Zuckerberg Zuckerberg
born bear
May May
14 14
, ,
1984 1984
in in
New New
York York
is be
an an
American american
technology technology
entrepreneur entrepreneur
and and
philanthropist philanthropist
best well
known know
for for
co co
- -
founding founding
and and
leading lead
Facebook Facebook
as as
its its
chairman chairman
and and
CEO CEO
. .


In [7]:
import nltk

In [8]:
from nltk.stem.porter import *

In [9]:
from nltk.stem.snowball import SnowballStemmer

In [10]:
# Build the two NLTK stemmers that are being compared.
porter_stemmer = PorterStemmer()
snowball_stemmer = SnowballStemmer("english")

# Stem the same word with each one, Porter first, then Snowball.
for stemmer in (porter_stemmer, snowball_stemmer):
    print(stemmer.stem('faster'))

faster
faster


In [11]:
# Named entities recognised in the Mark Zuckerberg sentence.
# The parsed sentence lives in `doc`; `doc1` is not defined anywhere in this
# notebook and raised NameError on a fresh kernel.
for ent in doc.ents:
    print(ent.text, ent.label_)

Mark Zuckerberg PERSON
May 14, 1984 DATE
New York GPE
American NORP


In [12]:
from spacy.lang.en.stop_words import STOP_WORDS

In [13]:
# Print every entry of spaCy's English STOP_WORDS set in one go.
print(STOP_WORDS)

{'whereafter', 'however', 'whole', 'out', 'someone', 'an', "'re", 'your', 'none', '’ll', 'whatever', 'her', 'sometime', 'all', 'own', 'thus', 'is', 'done', 'what', 'least', 'ourselves', 'hers', 'sixty', 'first', 'was', 'twelve', 'never', 'thereby', 'i', 'between', 'perhaps', 'to', 'somewhere', 'unless', 'no', 'can', 'much', 'he', 'part', 'eleven', 'wherever', 'might', 'seemed', 'except', 'any', 'not', 'often', 'n’t', 'otherwise', 'if', 'am', 'upon', 'forty', 'along', 'every', 'this', 'became', 'those', 'by', 'under', 'nothing', 'something', 'front', 'beyond', 'four', 'after', 'the', 'three', 'same', 'once', 'mostly', 'make', 'namely', 'last', 'some', 'again', 'we', 'over', 'whither', 'than', 'should', 'so', 'while', 'up', "'ve", 'anything', 'due', 'noone', 'before', 'beside', 'somehow', 'throughout', 'neither', '‘d', 'indeed', 'eight', 'with', 'doing', 'will', 'top', 'give', 'since', 'yet', 'n‘t', 'many', 'mine', 'his', 'next', 'just', 're', 'be', 'ca', 'here', 'onto', 'besides', '‘s',

In [14]:
# Parse a booking request and pick out the two city tokens by position.
doc = nlp('Book a flight from hyderabad to LA')
hyd = doc[4]   # "hyderabad"
losa = doc[6]  # "LA"

# Every token above "hyderabad" in the dependency tree.
list(hyd.ancestors)

[from, flight, Book]

In [15]:
# Syntactic ancestors of the "LA" token (losa = doc[6]).
list(losa.ancestors)

[to, flight, Book]

Ancestors in dependency parsing are the tokens above a given token in the syntactic tree: its head (parent), the head's head (grandparent), and so on up to the root.

In [16]:
list(doc[4].ancestors) # same lookup without a named variable: doc[4] is the "hyderabad" token

[from, flight, Book]

In [17]:
# Check whether one token is a syntactic ancestor of another token:
# here doc[2] ("flight") is tested against doc[4] ("hyderabad").

doc[2].is_ancestor(doc[4])

True

In [18]:
# Several booking requests in one sentence: link each target to its task.

doc = nlp("Book a table at the restaurent and the taxi to the hotel")

tasks = (doc[2], doc[8])          # "table" and "taxi": the things to book
tasks_target = (doc[5], doc[11])  # "restaurent" and "hotel": where they apply

for target in tasks_target:
    # Climb the dependency tree from the target; the first ancestor that is
    # one of the tasks is the booking this target belongs to.
    owner = next((anc for anc in target.ancestors if anc in tasks), None)
    if owner is not None:
        print("Booking of {} belongs to {}".format(owner, target))
    

Booking of table belongs to restaurent
Booking of taxi belongs to hotel


Children in dependency parsing are immediate syntactic dependents of the token.

In [19]:
# Immediate syntactic dependents of doc[5], the "restaurent" token.
list(doc[5].children)

[the, and, taxi]

In [20]:
# Immediate syntactic dependents of doc[2], the "table" token.
list(doc[2].children)

[a, at]

In [21]:
#understanding dependency parser visually
from spacy import displacy

In [22]:
# Re-parse the booking sentence and draw its dependency parse with displaCy
# (style='dep' selects the dependency visualisation).
doc = nlp("Book a table at the restaurent and the taxi to the hotel")
displacy.render(doc, style='dep')


In [25]:
# Work out which action the user attaches to each place name.
doc = nlp("What are some places to visit in Berlin and stay in Lubec")
places = (doc[7], doc[11])   # "Berlin", "Lubec"
actions = (doc[5], doc[9])   # "visit", "stay"

for place in places:
    # Climb from the place towards the root; the first action met on the way
    # up is the one this place is paired with.
    action = next((anc for anc in place.ancestors if anc in actions), None)
    if action is not None:
        print("user is refering {} to {}".format(place, action))
        

user is refering Berlin to visit
user is refering Lubec to stay


Noun chunks: these are base noun phrases, i.e. flat phrases that have a noun as their head.

In [26]:
# Noun chunks: parse a sentence, then list the noun phrases spaCy finds in it.

doc = nlp("Boston Dynamics is gearing up to produce thousands of robot dogs ")

list(doc.noun_chunks)

[Boston Dynamics, thousands, robot dogs]

In [27]:
# For every noun chunk print four columns: the chunk text, its root word,
# the root's dependency label, and the head word that the root attaches to.
doc2 = nlp("DeepLearning cracks the code of messenger RNA's and protien coding potential")

for chunk in doc2.noun_chunks:
    root = chunk.root
    print(chunk.text, root.text, root.dep_, root.head.text)

DeepLearning DeepLearning nsubj cracks
the code code dobj cracks
messenger RNA RNA pobj of
coding potential potential dobj cracks


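Exploring word vectors to understand similarity between two words. Note: the `en_core_web_sm` pipeline loaded above ships without static (GloVe-style) word vectors, so the vectors and similarity scores below come from the pipeline's context tensors; load `en_core_web_md` or `en_core_web_lg` for true word-vector similarity.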

In [32]:
# Peek at the first five components of each token's vector.
doc = nlp("How good is google's new A.I bot BRAD")
for token in doc:
    preview = token.vector[:5]
    print(token.text, preview)

How [ 1.6143672  -0.4711746   0.39407504  2.1231742   0.16744262]
good [ 1.0237651  -1.3570685   0.04195169  0.84736043 -0.49153998]
is [-0.49836892  0.70540786 -0.3299709   0.62666774 -0.52085745]
google [-0.48117828 -1.0377525   0.27524015  1.2639378   0.04486725]
's [2.0742245  2.1592314  0.17449465 0.7083967  0.3253508 ]
new [ 0.6110381  -1.0594106   0.90315354  1.34445     0.14536023]
A.I [ 0.02554996 -1.473116    0.6940452   0.57291913  0.22437394]
bot [-0.5032579  -1.3603683  -0.5172367   0.09713605  0.9227977 ]
BRAD [-0.85609686 -0.7711649   0.64675206  0.00531679  0.12968323]


In [33]:
# Compare "hello" against a synonym ("hi") and a look-alike ("hella").
# NOTE(review): `nlp` is en_core_web_sm here, which presumably has no static
# word vectors, so these scores may not reflect true word similarity.
# Confirm, and consider a md/lg pipeline.
hello_doc, hi_doc, hella_doc = (nlp(word) for word in ("hello", "hi", "hella"))

for other_doc in (hi_doc, hella_doc):
    print(hello_doc.similarity(other_doc))

0.7161105600799103
0.596996698990903


  print(hello_doc.similarity(hi_doc))
  print(hello_doc.similarity(hella_doc))


In [34]:
# Similarity between two whole sentences: two phrasings of the same question.
got_str1 = nlp("When Will next season of Game of Thrones be releasing?")
got_str2 = nlp("Game of Thrones next season release date?")

sentence_similarity = got_str1.similarity(got_str2)
print(sentence_similarity)

0.49713212617227265


  print(got_str1.similarity(got_str2))
