In [2]:
#importing packages

import spacy as sp
import pandas as pd
from spacy import displacy

In [3]:
!python -m spacy download en_core_web_lg 

Collecting en_core_web_lg==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9MB)
[K     |████████████████████████████████| 827.9MB 1.2MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-cp36-none-any.whl size=829180944 sha256=5ec0c96351a3bd9f871e2086830a054acd568c81d245465dbda23234beed586d
  Stored in directory: /tmp/pip-ephem-wheel-cache-7hp1ueu9/wheels/2a/c1/a6/fc7a877b1efca9bc6a089d6f506f16d3868408f9ff89f8dbfc
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [3]:
#loading pre-trained model

nlp = sp.load('en_core_web_lg')

**Performing Word Tokenization**

In [4]:
# Creating a sentence
doc=nlp('the punishment assigned to a defendant found guilty by a court, or fixed by law for a particular offence.')

In [5]:
# Performing the Word Tokenization for the sentence
for i,Token in enumerate(doc):
    print(i,Token.text)

0 the
1 punishment
2 assigned
3 to
4 a
5 defendant
6 found
7 guilty
8 by
9 a
10 court
11 ,
12 or
13 fixed
14 by
15 law
16 for
17 a
18 particular
19 offence
20 .


**Performing Sentence Tokenization**

In [6]:
# Creating a paragraph
sentence_doc = nlp('Quantum computing is the use of quantum-mechanical phenomena such as superposition and entanglement to perform computation. Computers that perform quantum computations are known as quantum computers.Quantum computers are believed to be able to solve certain computational problems, such as integer factorization (which underlies RSA encryption), substantially faster than classical computers. The study of quantum computing is a subfield of quantum information science.')

In [7]:
# Performing Sentence Tokenization for the paragraph
for i,x in enumerate(sentence_doc.sents):
    print(i,x)

0 Quantum computing is the use of quantum-mechanical phenomena such as superposition and entanglement to perform computation.
1 Computers that perform quantum computations are known as quantum computers.
2 Quantum computers are believed to be able to solve certain computational problems, such as integer factorization (which underlies RSA encryption), substantially faster than classical computers.
3 The study of quantum computing is a subfield of quantum information science.


**Performing Lemmatization**

In [8]:
# Creating sentence for Lemmatization
lemma_doc = nlp('Her evil stepmother and stepsisters would not let her go to the ball, but her fairy godmother made it happen with magic. She danced with the prince and they fell in love. Since she had to leave by midnight, she ran and lost one slipper. The prince found the slipper and searched for her. After he found her, they were married and lived happily ever after')

In [9]:
# Performing Lemmatization
for token in lemma_doc:
    print(token.text,token.lemma_,token.lemma_.lower().strip())

Her -PRON- -pron-
evil evil evil
stepmother stepmother stepmother
and and and
stepsisters stepsister stepsister
would would would
not not not
let let let
her -PRON- -pron-
go go go
to to to
the the the
ball ball ball
, , ,
but but but
her -PRON- -pron-
fairy fairy fairy
godmother godmother godmother
made make make
it -PRON- -pron-
happen happen happen
with with with
magic magic magic
. . .
She -PRON- -pron-
danced dance dance
with with with
the the the
prince prince prince
and and and
they -PRON- -pron-
fell fall fall
in in in
love love love
. . .
Since since since
she -PRON- -pron-
had have have
to to to
leave leave leave
by by by
midnight midnight midnight
, , ,
she -PRON- -pron-
ran run run
and and and
lost lose lose
one one one
slipper slipper slipper
. . .
The the the
prince prince prince
found find find
the the the
slipper slipper slipper
and and and
searched search search
for for for
her -PRON- -pron-
. . .
After after after
he -PRON- -pron-
found find find
her -PRON- -pron-
, ,

**Creating dataframe for Lemmatization**

In [10]:
# Creating list for Lemmatization tokens
lemma=[]
for x,token in enumerate(lemma_doc):
    lemma.append([x,token.text,token.lemma_])

In [11]:
# Creating dataframe of Lemma tokens
lemma = pd.DataFrame(lemma,columns=['Index','Token','Lemma'])
lemma

Unnamed: 0,Index,Token,Lemma
0,0,Her,-PRON-
1,1,evil,evil
2,2,stepmother,stepmother
3,3,and,and
4,4,stepsisters,stepsister
...,...,...,...
68,68,and,and
69,69,lived,live
70,70,happily,happily
71,71,ever,ever


**Performing parts of speech Tagging**

In [12]:
# Creating sentence for Parts of Speech Tagging
POS_doc = nlp('A shoemaker and his wife were very poor. One day they ran out of leather so they went to bed. In the morning, they found a pair of shoes and a passerby bought them.')

In [13]:
# Creating list for the words of the Sentence and its corresponding Parts of Speech 
# and then creating dataframe
pos=[]
for i in POS_doc:
    pos.append([i.text,i.pos_])
POS_df = pd.DataFrame(pos,columns=['Words','Pos'])
POS_df

Unnamed: 0,Words,Pos
0,A,DET
1,shoemaker,NOUN
2,and,CCONJ
3,his,DET
4,wife,NOUN
5,were,AUX
6,very,ADV
7,poor,ADJ
8,.,PUNCT
9,One,NUM


**Performing Parts of Speech tagging using TAGGER**

In [14]:
# Creating list for the words of the Sentence and its corresponding Parts of Speech got from TAGGER
# and creating a dataframe
tag=[]
for i in POS_doc:
    tag.append([i.text,i.tag_])
TAG_df = pd.DataFrame(tag,columns=['Words','Pos'])
TAG_df

Unnamed: 0,Words,Pos
0,A,DT
1,shoemaker,NN
2,and,CC
3,his,PRP$
4,wife,NN
5,were,VBD
6,very,RB
7,poor,JJ
8,.,.
9,One,CD


**Explaining some of the Parts of Speech**

In [16]:
# Explain the Parts of Speech 
sp.explain('PRP$')

'pronoun, possessive'

In [17]:
sp.explain('CD')

'cardinal number'

In [18]:
sp.explain('JJ')

'adjective'

In [19]:
sp.explain('IN')

'conjunction, subordinating or preposition'

**Performing Dependency Parsing**

In [20]:
# Creating a Sentence for Parsing
parse_doc = nlp('A vain emperor hired two people to make him some new clothes. The tricked him, telling him the cloth was not visible to people unfit for his position or who were very stupid.')

In [21]:
# Creating dataframe for each words and its dependent word
PARSE_df = pd.DataFrame([(i,i.dep_) for i in parse_doc])
PARSE_df

Unnamed: 0,0,1
0,A,det
1,vain,amod
2,emperor,nsubj
3,hired,ROOT
4,two,nummod
5,people,dobj
6,to,aux
7,make,xcomp
8,him,nsubj
9,some,det


**Visualizing the Parsing model**

In [22]:
# Setting the options for visualization
options={'compact':True,'bg':'seagreen','color':'#fff','font':'Sans Sarif'}

In [23]:
# Displaying the tree
displacy.render(parse_doc,style='dep',jupyter=True,options=options)

In [24]:
# Explaining
sp.explain('aux')

'auxiliary'

In [25]:
# Explaining
sp.explain('xcomp')

'open clausal complement'

**Performing Stop Words analysis**

In [26]:
# Importing Stop Words from library
from spacy.lang.en import STOP_WORDS

In [27]:
# Getting the shape of Stop Words
len(STOP_WORDS)

326

In [28]:
# Checking if it is Stop Word
nlp.vocab['ultra'].is_stop

False

In [29]:
# Checking if it is Stop Word
nlp.vocab['down'].is_stop

True

**Removing Stop Words from a Sentence**

In [30]:
# Removing the Stop Words from the Sentence and printing it
stop_doc=nlp('A frog asked a princess why she was crying and she replied that she had dropped her golden ball into the pond. She promised anything if he would get it for her.')
for i in stop_doc:
  if i.is_stop!=True:
    print(i)

frog
asked
princess
crying
replied
dropped
golden
ball
pond
.
promised
.


In [31]:
# Getting the Stop Words from the Sentence and printing it
for i in stop_doc:
  if i.is_stop==True:
    print(i)

A
a
why
she
was
and
she
that
she
had
her
into
the
She
anything
if
he
would
get
it
for
her


**Customizing Stop Words from a text**

In [32]:
# Creating a sentence for customizing it 
stop_cust_doc = 'A little old woman baked a gingerbread man and when she took him out of the oven, he ran away. The woman and her husband chased him, as well as the pig, cow and horse.'

In [33]:
added_stpw=['little','well']
for word in added_stpw:
  nlp.vocab[word].is_stop==True
ex2=nlp(stop_cust_doc)
new_text=[word.text for word in ex2 if word.is_stop==False]
print('Actual text \n',stop_cust_doc)
print('Filtered text \n',new_text)

Actual text 
 A little old woman baked a gingerbread man and when she took him out of the oven, he ran away. The woman and her husband chased him, as well as the pig, cow and horse.
Filtered text 
 ['little', 'old', 'woman', 'baked', 'gingerbread', 'man', 'took', 'oven', ',', 'ran', 'away', '.', 'woman', 'husband', 'chased', ',', 'pig', ',', 'cow', 'horse', '.']


**Finding out Token Similarity**

In [34]:
# Assigning tokens and getting the similarity percentage between them
token = nlp('cat tiger lion')

for t1 in token:
    for t2 in token:
        print(t1.text.lower(), t2.text, t1.similarity(t2))

cat cat 1.0
cat tiger 0.541339
cat lion 0.5265438
tiger cat 0.541339
tiger tiger 1.0
tiger lion 0.7359829
lion cat 0.5265438
lion tiger 0.7359829
lion lion 1.0


**Merging and Splitting tokens**

In [36]:
# Merging the tokens as one
nlp = sp.load("en_core_web_sm")
docs = nlp("He went trekking to Mount Rushmore during his summer holidays")
print("Before:", [token.text for token in docs])

with docs.retokenize() as retokenizer:
    retokenizer.merge(docs[4:6], attrs={"LEMMA": "mount rushmore"})
print("After:", [token.text for token in docs])

Before: ['He', 'went', 'trekking', 'to', 'Mount', 'Rushmore', 'during', 'his', 'summer', 'holidays']
After: ['He', 'went', 'trekking', 'to', 'Mount Rushmore', 'during', 'his', 'summer', 'holidays']


In [38]:
# Checking which are alphabets in the sentence
for token in stop_doc:
  print(token.text,token.is_alpha)

A True
frog True
asked True
a True
princess True
why True
she True
was True
crying True
and True
she True
replied True
that True
she True
had True
dropped True
her True
golden True
ball True
into True
the True
pond True
. False
She True
promised True
anything True
if True
he True
would True
get True
it True
for True
her True
. False


In [39]:
# Creating a sentence to perform Entity Recognition 
sent = nlp('Shawn Micheals at 7PM went to meet his master at the Shaolin Temple present at China and trained harder so that he will be able to join in the Google organization')

In [40]:
# Performing Entity Recognition in the sentence
for i in sent.ents:
  print(i.text,i.label_)

Shawn Micheals PERSON
7PM CARDINAL
China GPE
Google ORG


In [41]:
# Highlighting the Entities
displacy.render(sent,style='ent',jupyter=True)

In [43]:
# Explaining the entity
sp.explain('GPE')

'Countries, cities, states'

**Performing Entity Annotation**

In [44]:
from spacy.tokens import Span

In [45]:
doc = nlp('Sekiro at 7PM went to meet his master at the Shaolin Temple present at China and trained harder so that he will be able to join in the KungFu.AI organization')
first=[(i.text,i.start_char,i.end_char,i.label_) for i in doc.ents]
print(first)
tn_ent=Span(doc,0,1,label='Person')
doc_ents1=list(doc.ents)+[tn_ent]
[(i.text,i.label_) for i in doc_ents1 ]

[('7PM', 10, 13, 'CARDINAL'), ('China', 71, 76, 'GPE'), ('KungFu', 135, 141, 'CARDINAL')]


[('7PM', 'CARDINAL'),
 ('China', 'GPE'),
 ('KungFu', 'CARDINAL'),
 ('Sekiro', 'Person')]