In [44]:
import spacy
import pandas as pd
from spacy import displacy

In [45]:
nlp = spacy.load('en')

In [46]:
nlp = spacy.load("en_core_web_sm")

Tokenization

Word Tokenization

In [47]:
doc1=nlp('Sophie went to Istanbul to meet Rajeev over some drinks')

In [48]:
for x,token in enumerate(doc1):
  print(x,token.text)

0 Sophie
1 went
2 to
3 Istanbul
4 to
5 meet
6 Rajeev
7 over
8 some
9 drinks


Sentence Tokenization

In [49]:
sent_doc='Machine learning is an application of artificial intelligence (AI) that provides systems the ability to automatically learn and improve from experience without being explicitly programmed. Machine learning focuses on the development of computer programs that can access data and use it learn for themselves.The process of learning begins with observations or data, such as examples, direct experience, or instruction, in order to look for patterns in data and make better decisions in the future based on the examples that we provide. The primary aim is to allow the computers learn automatically without human intervention or assistance and adjust actions accordingly.'

In [50]:
sentence_doc1=nlp(sent_doc)

In [51]:
for x,i in enumerate(sentence_doc1.sents):
  print(x,i)

0 Machine learning is an application of artificial intelligence (AI) that provides systems the ability to automatically learn and improve from experience without being explicitly programmed.
1 Machine learning focuses on the development of computer programs that can access data and use it learn for themselves.
2 The process of learning begins with observations or data, such as examples, direct experience, or instruction, in order to look for patterns in data and make better decisions in the future based on the examples that we provide.
3 The primary aim is to allow the computers learn automatically without human intervention or assistance and adjust actions accordingly.


Lemmatization<br>
Lemmatization helps to  find the actual words in indefinite tense

In [52]:
doc_lemma=nlp('He was crying incessantly in order to douse the flame of the pain caused to him. Charles cried because, he lost his mother in a horrible accident')

In [53]:
for token in doc_lemma:
  print(token.text, token.lemma_, token.lemma_.lower().strip())

He -PRON- -pron-
was be be
crying cry cry
incessantly incessantly incessantly
in in in
order order order
to to to
douse douse douse
the the the
flame flame flame
of of of
the the the
pain pain pain
caused cause cause
to to to
him -PRON- -pron-
. . .
Charles Charles charles
cried cry cry
because because because
, , ,
he -PRON- -pron-
lost lose lose
his -PRON- -pron-
mother mother mother
in in in
a a a
horrible horrible horrible
accident accident accident


**If we want to get in tabular form**

In [54]:
lemma = []
for x,token in enumerate(doc_lemma):
  lemma.append([x, token.text, token.lemma_])

In [55]:
pd.DataFrame(lemma, columns = ['Index', 'Token', 'Lemma'])

Unnamed: 0,Index,Token,Lemma
0,0,He,-PRON-
1,1,was,be
2,2,crying,cry
3,3,incessantly,incessantly
4,4,in,in
5,5,order,order
6,6,to,to
7,7,douse,douse
8,8,the,the
9,9,flame,flame


Parts of Speech Tagging

In [56]:
doc_lemma1 = nlp('Sherry founded the building where she found the box of diamonds')

In [57]:
pos = []
for i in doc_lemma1:
  pos.append([i.text, i.pos_])
pd.DataFrame(pos, columns = ['Words', 'POS'])

Unnamed: 0,Words,POS
0,Sherry,PROPN
1,founded,VERB
2,the,DET
3,building,NOUN
4,where,ADV
5,she,PRON
6,found,VERB
7,the,DET
8,box,NOUN
9,of,ADP


##Tagger<br/>
It returns Parts of Speech in Tree bank format(More detailed POS about words)

In [79]:
tag=[]
for i in doc_lemma1:
  tag.append([i.text,i.tag_])
pd.DataFrame(tag,columns=['Words','Pos'])

Unnamed: 0,Words,Pos
0,Sherry,NNP
1,founded,VBD
2,the,DT
3,building,NN
4,where,WRB
5,she,PRP
6,found,VBD
7,the,DT
8,box,NN
9,of,IN


Let's check what each  tag denotes

In [80]:
spacy.explain('NNS')

'noun, plural'

In [81]:
spacy.explain('WRB')

'wh-adverb'

In [82]:
spacy.explain('DT')

'determiner'

In [83]:
spacy.explain('IN')

'conjunction, subordinating or preposition'

## Dependency parsing
It shows us which words are dependent on which other words<br/>
**Syntactic Dependency**<br/>
It helps us to know the relation betweeen the tokens

In [63]:
doc_dep1=nlp('Debesh ferried two thousand passengers having sailed even during heavy storm, but he conceded to the storm held last week')

In [64]:
pd.DataFrame([(word,word.dep_) for word in doc_dep1])

Unnamed: 0,0,1
0,Debesh,nsubj
1,ferried,ROOT
2,two,compound
3,thousand,nummod
4,passengers,nsubj
5,having,aux
6,sailed,ccomp
7,even,advmod
8,during,prep
9,heavy,amod


Let's visualize the entire model

In [65]:
options={'compact':True,'bg':'seagreen','color':'#fff','font':'Sans Sarif'}

In [66]:
displacy.render(doc_dep1,style='dep',jupyter=True,options=options)

Ask spaCy to exlain if you detail

In [67]:
spacy.explain('ccomp')

'clausal complement'

**Let's take another example**

In [68]:
doc_dep2=nlp('Kolkata shows its true color during corporation polls. Many were victimized of tyrannical democracy  ')

In [69]:
for x,token in enumerate(doc_dep2):
  print(x,token.text,token.dep_,token.head.text)

0 Kolkata nsubj shows
1 shows ROOT shows
2 its poss color
3 true amod color
4 color dobj shows
5 during prep shows
6 corporation compound polls
7 polls pobj during
8 . punct shows
9 Many nsubjpass victimized
10 were auxpass victimized
11 victimized ROOT victimized
12 of prep victimized
13 tyrannical amod democracy
14 democracy pobj of
15    democracy


In [70]:
displacy.render(doc_dep2,style='dep',jupyter=True,options=options)

##Stopwords<br/>

In [71]:
from spacy.lang.en import STOP_WORDS

In [72]:
len(STOP_WORDS)

327

The default number of stopwords in English language listed by spaCy is 326

Let's see, how to check stopwords

In [73]:
nlp.vocab['bottom'].is_stop

True

In [74]:
nlp.vocab['bench'].is_stop

False

**Straining stopwords from text**

In [75]:
ex1=nlp('The door was open till 7PM, Abhishek fought to visit Croatia but postponed due to 24 hrs meeting')
for i in ex1:
  if i.is_stop!=True:
    print(i)

door
till
7PM
,
Abhishek
fought
visit
Croatia
postponed
24
hrs
meeting


In [76]:
#Finding stop word in text

for i in ex1:
  if i.is_stop==True:
    print(i)

The
was
open
to
but
due
to


We can add stopwords . We add **open** as our stop word for this text

In [77]:
STOP_WORDS.add('open')
#STOP_WORDS.add('visit')

In [78]:
nlp.vocab['open'].is_stop

True

**Customize stopwords from text**

In [84]:
doc2='The door was open till 7PM, Abhishek fought to visit Croatia but postponed due to 24 hrs meeting'

In [85]:
added_stpw=['open','visit']
for word in added_stpw:
  nlp.vocab[word].is_stop==True
ex2=nlp(doc2)
new_text=[word.text for word in ex2 if word.is_stop==False]
print('Actual text \n',doc2)
print('Filtered text \n',new_text)

Actual text 
 The door was open till 7PM, Abhishek fought to visit Croatia but postponed due to 24 hrs meeting
Filtered text 
 ['door', 'till', '7PM', ',', 'Abhishek', 'fought', 'visit', 'Croatia', 'postponed', '24', 'hrs', 'meeting']


**Token Similarity**

In [86]:
tokens = nlp(u'fish meat water')

for token1 in tokens:
    for token2 in tokens:
        print(token1.text.lower(), token2.text, token1.similarity(token2))


fish fish 1.0
fish meat 0.48889259
fish water 0.23152366
meat fish 0.48889259
meat meat 1.0
meat water 0.42198178
water fish 0.23152366
water meat 0.42198178
water water 1.0


  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


**Merging and splitting**

In [87]:
nlp = spacy.load("en_core_web_sm")
docs = nlp("I live in New York")
print("Before:", [token.text for token in docs])

with docs.retokenize() as retokenizer:
    retokenizer.merge(docs[3:5], attrs={"LEMMA": "new york"})
print("After:", [token.text for token in docs])

Before: ['I', 'live', 'in', 'New', 'York']
After: ['I', 'live', 'in', 'New York']


**Checking if a token is alphabet**

In [88]:
for token in ex1:
  print(token.text,token.is_alpha)

The True
door True
was True
open True
till True
7PM False
, False
Abhishek True
fought True
to True
visit True
Croatia True
but True
postponed True
due True
to True
24 False
hrs True
meeting True


## Entity recognition<br/>
This is very important in te space of NLP. It helps us know the domain or entity a word belongs to

In [89]:
ex1=nlp('The door was open till 7PM, Abhishek fought to visit Croatia but postponed due to 24 hrs meeting')

In [91]:
for i in ex1.ents:
  print(i.text,i.label_)

7PM CARDINAL
Abhishek PERSON
Croatia ORG
24 CARDINAL


7PM is cardinal i.e numeric, Abhishek is a person, Croatia is a country(**hence GPE[Country, City etc]**)

In [92]:
displacy.render(ex1,style='ent',jupyter=True)

In [93]:
spacy.explain('CARDINAL')

'Numerals that do not fall under another type'

**Let's take another example**

In [94]:
ent_doc='Steven Flemming captained in 2007 World Cup for New Zealand'
ent_doc_processed=nlp(ent_doc)
[(token.text,token.label_) for token in ent_doc_processed.ents]

[('Steven Flemming', 'PERSON'),
 ('2007', 'DATE'),
 ('World Cup', 'EVENT'),
 ('New Zealand', 'GPE')]

**Entity annotation**<br/>
When no entity is assigned, we can assign them a particular entity

In [97]:
doc = nlp('RBI is hiring some people')
first=[(i.text,i.start_char,i.end_char,i.label_) for i in doc.ents]
print(first)
tn_ent=Span(doc,0,1,label='GPE')

  doc_ents1=list(doc.ents)+[tn_ent]
[(i.text,i.label_) for i in doc_ents1 ]

IndentationError: ignored