In [0]:
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [0]:
nlp=spacy.load('en')

In [247]:
#words.shape_: determine shape of the word like whether capital or small. if X: capital, if x: small
#words.is_alpha: determine whether it is 
docu=nlp("Hello hello HELLO heLLO 123 *$% heLLo I am Nishith Kotak: a student of PhD")
for words in docu:
  print("token: ",words.text, " ; ", "shape: ",words.shape_," ; ","alphabet: ",words.is_alpha, " ; numeric: ", words.is_digit, " ; stop word: ", words.is_stop)


token:  Hello  ;  shape:  Xxxxx  ;  alphabet:  True  ; numeric:  False  ; stop word:  False
token:  hello  ;  shape:  xxxx  ;  alphabet:  True  ; numeric:  False  ; stop word:  False
token:  HELLO  ;  shape:  XXXX  ;  alphabet:  True  ; numeric:  False  ; stop word:  False
token:  heLLO  ;  shape:  xxXXX  ;  alphabet:  True  ; numeric:  False  ; stop word:  False
token:  123  ;  shape:  ddd  ;  alphabet:  False  ; numeric:  True  ; stop word:  False
token:  *  ;  shape:  *  ;  alphabet:  False  ; numeric:  False  ; stop word:  False
token:  $  ;  shape:  $  ;  alphabet:  False  ; numeric:  False  ; stop word:  False
token:  %  ;  shape:  %  ;  alphabet:  False  ; numeric:  False  ; stop word:  False
token:  heLLo  ;  shape:  xxXXx  ;  alphabet:  True  ; numeric:  False  ; stop word:  False
token:  I  ;  shape:  X  ;  alphabet:  True  ; numeric:  False  ; stop word:  True
token:  am  ;  shape:  xx  ;  alphabet:  True  ; numeric:  False  ; stop word:  True
token:  Nishith  ;  shape:  Xxx

In [0]:
#part of speech (POS) tagging 

In [0]:
eg_pos=nlp("He drinks a drink")

In [250]:
#part of speech with tagging the word
for word in eg_pos:
  print (word.text, " ",word.pos_," ",word.tag_)

He   PRON   PRP
drinks   VERB   VBZ
a   DET   DT
drink   NOUN   NN


In [251]:
spacy.explain('VBZ') #to know meaning of tag

'verb, 3rd person singular present'

In [0]:
#syntactic dependency: it helps us to know the relation between tokens

In [253]:
eg_pos=nlp("He drinks a drink")
for word in eg_pos:
  print (word.text, " ",word.pos_," ",word.tag_," ",word.dep_)



He   PRON   PRP   nsubj
drinks   VERB   VBZ   ROOT
a   DET   DT   det
drink   NOUN   NN   dobj


In [254]:
#display the visualization of dependencies
from spacy import displacy
displacy.render(eg_pos,style='dep',jupyter=True)

In [255]:
for chunk in eg_pos.noun_chunks:
    print(chunk.text, " -> ",chunk.root.text , " -> ",chunk.root.dep_, " -> ", chunk.root.head.text)

He  ->  He  ->  nsubj  ->  drinks
a drink  ->  drink  ->  dobj  ->  drinks


In [256]:
#shows the noun phrases list and the root noun word
# extracting the noun phrase: word that is directly associated with the noun
# The rule states that whenever the chunk finds an optional determiner (DT) followed by any number of adjectives (JJ) 
# and then a noun (NN) then the Noun Phrase(NP) chunk should be formed.   (link: https://spacy.io/usage/linguistic-features)
eg_pos1=nlp("the little yellow dog barked at the cat")
for chunk in eg_pos1.noun_chunks:
    print("noun phrase: ",chunk.text," | root noun text: ",chunk.root.text, " | connector text: ",chunk.root.head.text)

noun phrase:  the little yellow dog  | root noun text:  dog  | connector text:  barked
noun phrase:  the cat  | root noun text:  cat  | connector text:  at


In [0]:
#Lemmatization: Convert the words into the root word

In [258]:
ex_lemma=nlp("I am studying and learning NLP using Spacy which helps me to study for PhD which is better than any other tools")
for word in ex_lemma:
  print(word.text," ---> ",word.lemma_)


I  --->  -PRON-
am  --->  be
studying  --->  study
and  --->  and
learning  --->  learn
NLP  --->  NLP
using  --->  use
Spacy  --->  Spacy
which  --->  which
helps  --->  help
me  --->  -PRON-
to  --->  to
study  --->  study
for  --->  for
PhD  --->  phd
which  --->  which
is  --->  be
better  --->  well
than  --->  than
any  --->  any
other  --->  other
tools  --->  tool


In [259]:
#to extract verb phrase and its lemma
a=nlp("Ram, mohan and Shyam are studying physics. Ram ranked first in Physics, Shyam ranked Second in Physics")
for word in a:
  if(word.pos_ =="VERB"):
    if(nlp.vocab[str(word)].is_stop==False):
      print(word.text, " --->", word.lemma_)

studying  ---> study
ranked  ---> rank
ranked  ---> rank


In [0]:
#Named Entity Recognition: takes the sentence and identifies the noun words in the sentence as the name of person, place, organisation, geological place, etc.

In [261]:
ex_ner=nlp("Digital India is a campaign launched by the Government of India at New Delhi on 01 July 2015 in order to ensure the Government's services are made available to citizens electronically by improved online infrastructure and by increasing Internet connectivity or making the country digitally empowered in the field of technology for 51% growth to increase 50 billion in 50 billion rupees.")
print(ex_ner)
for words in ex_ner.ents:
  print(words.text, " ---> ",words.label_)

Digital India is a campaign launched by the Government of India at New Delhi on 01 July 2015 in order to ensure the Government's services are made available to citizens electronically by improved online infrastructure and by increasing Internet connectivity or making the country digitally empowered in the field of technology for 51% growth to increase 50 billion in 50 billion rupees.
Digital India  --->  ORG
the Government of India  --->  ORG
New Delhi  --->  GPE
01 July 2015  --->  DATE
Government  --->  ORG
51%  --->  PERCENT
50 billion  --->  CARDINAL
50 billion rupees  --->  MONEY


In [262]:
spacy.explain('GPE')

'Countries, cities, states'

In [263]:
displacy.render(ex_ner,style='ent',jupyter=True)

In [264]:
ex2_ner=nlp("C programming language was developed in 1972 by Dennis Ritchie-an American Inventor at bell laboratories of AT&T (American Telephone & Telegraph), located in the U.S.A.Dennis Ritchie is known as the founder of the c language.")
for words in ex2_ner.ents:
  print(words.text, " ---> ",words.label_)


1972  --->  DATE
Dennis Ritchie-an  --->  PERSON
American  --->  NORP
AT&T  --->  ORG
American Telephone & Telegraph  --->  ORG


In [265]:
spacy.explain('NORP')

'Nationalities or religious or political groups'

In [0]:
#similarity between the sentences
doc1=nlp("I am smart")
doc2=nlp("Cat drinks milk")

In [267]:
doc1.similarity(doc2)

  "__main__", mod_spec)


0.22010503239532325

In [268]:
#Stopwords
from spacy.lang.en.stop_words import STOP_WORDS
print(STOP_WORDS)

{'wherever', 'former', 'being', 'it', 'n’t', 'where', '’s', 'me', 'via', 'no', 'hence', 'am', 'their', "'ll", 'her', 'everyone', 'neither', 'none', 'myself', 'thence', 'three', 'why', 'such', 'everywhere', 'so', 'although', '‘ll', 'between', 'across', 'behind', 'enough', 'seem', 'all', 'towards', 'while', 'somehow', 'latter', 'whereby', 'make', 'name', 'him', 'only', 'something', 'were', '‘m', 'whether', 'meanwhile', 'have', 'well', 'fifteen', '’ll', 'or', 'is', 'on', 'some', 'thru', 'made', 'also', 'mostly', 'mine', 'upon', 'beyond', 'except', 'over', 'we', 'next', 'n‘t', 'every', 'sometime', 'for', 'hundred', 'whereupon', '’d', 'his', 'toward', 'down', 'however', 'nobody', "'m", 'regarding', 'hereby', 'which', '‘d', 'four', 'nine', 'several', 'lol', 'hers', 'does', 'onto', '‘s', 'became', '’m', 'third', 'else', 'perhaps', 'they', 'those', 'twelve', 'you', 'always', 'anyhow', 'back', 'besides', 'everything', 'say', 'already', 'that', 'anyone', 'nevertheless', 'after', 'at', 'either', 

In [269]:
#to check if a word is stop word or not
nlp.vocab["Nishith"].is_stop

False

In [270]:
#Assignment 3

from urllib.request import urlopen
document = urlopen('https://s3-ap-south-1.amazonaws.com/av-blog-media/wp-content/uploads/2017/04/04080929/Tripadvisor_hotelreviews_Shivambansal.txt').read().decode('utf_8')
print(document)



Nice place Better than some reviews give it credit for. Overall, the rooms were a bit small but nice. Everything was clean, the view was wonderful and it is very well located (the Prudential Center makes shopping and eating easy and the T is nearby for jaunts out and about the city). Overall, it was a good experience and the staff was quite friendly. 
what a surprise What a surprise the Sheraton was after reading some of the reviews. it would appear there is a massive difference in the rooms, the South tower being the best. Check in was very efficient and the room was lovely, very large with the most comfortable beds ever. The hotel as stated is in a fantastic location and the Wrentham Village outlet is well worth a visit for bargain shopping ( the bus picks up outside). The hotel bar is a little pricey ( not helped by the current dollar rate) but is a nice place to relax after a busy day shopping. There is a number of restaurants close by. A cab from the airport to the hotel can be c

In [0]:
#sentence tokenisation


In [271]:
document_file=nlp(document)
document_file

Nice place Better than some reviews give it credit for. Overall, the rooms were a bit small but nice. Everything was clean, the view was wonderful and it is very well located (the Prudential Center makes shopping and eating easy and the T is nearby for jaunts out and about the city). Overall, it was a good experience and the staff was quite friendly. 
what a surprise What a surprise the Sheraton was after reading some of the reviews. it would appear there is a massive difference in the rooms, the South tower being the best. Check in was very efficient and the room was lovely, very large with the most comfortable beds ever. The hotel as stated is in a fantastic location and the Wrentham Village outlet is well worth a visit for bargain shopping ( the bus picks up outside). The hotel bar is a little pricey ( not helped by the current dollar rate) but is a nice place to relax after a busy day shopping. There is a number of restaurants close by. A cab from the airport to the hotel can be ch

In [272]:
for sentences in document_file.sents:
  print(sentences)

Nice place Better than some reviews give it credit for.
Overall, the rooms were a bit small but nice.
Everything was clean, the view was wonderful and it is very well located (the Prudential Center makes shopping and eating easy and the T is nearby for jaunts out and about the city).
Overall, it was a good experience and the staff was quite friendly. 

what a surprise
What a surprise the Sheraton was after reading some of the reviews.
it would appear there is a massive difference in the rooms, the South tower being the best.
Check in was very efficient and the room was lovely, very large with the most comfortable beds ever.
The hotel as stated is in a fantastic location and the Wrentham Village outlet is well worth a visit for bargain shopping ( the bus picks up outside).
The hotel bar is a little pricey ( not helped by the current dollar rate) but is a nice place to relax after a busy day shopping.
There is a number of restaurants close by.
A cab from the airport to the hotel can be 

In [273]:
for num,sentences in enumerate(document_file.sents):
  print(f'{num}:{sentences}')

0:Nice place Better than some reviews give it credit for.
1:Overall, the rooms were a bit small but nice.
2:Everything was clean, the view was wonderful and it is very well located (the Prudential Center makes shopping and eating easy and the T is nearby for jaunts out and about the city).
3:Overall, it was a good experience and the staff was quite friendly. 

4:what a surprise
5:What a surprise the Sheraton was after reading some of the reviews.
6:it would appear there is a massive difference in the rooms, the South tower being the best.
7:Check in was very efficient and the room was lovely, very large with the most comfortable beds ever.
8:The hotel as stated is in a fantastic location and the Wrentham Village outlet is well worth a visit for bargain shopping ( the bus picks up outside).
9:The hotel bar is a little pricey ( not helped by the current dollar rate) but is a nice place to relax after a busy day shopping.
10:There is a number of restaurants close by.
11:A cab from the ai

In [274]:
print (num) #shows number of sentences

3802


In [0]:
#word tokenisation

In [276]:
for numwords,words in enumerate(document_file):
  print(f'{numwords}:{words.text}')

0:Nice
1:place
2:Better
3:than
4:some
5:reviews
6:give
7:it
8:credit
9:for
10:.
11:Overall
12:,
13:the
14:rooms
15:were
16:a
17:bit
18:small
19:but
20:nice
21:.
22:Everything
23:was
24:clean
25:,
26:the
27:view
28:was
29:wonderful
30:and
31:it
32:is
33:very
34:well
35:located
36:(
37:the
38:Prudential
39:Center
40:makes
41:shopping
42:and
43:eating
44:easy
45:and
46:the
47:T
48:is
49:nearby
50:for
51:jaunts
52:out
53:and
54:about
55:the
56:city
57:)
58:.
59:Overall
60:,
61:it
62:was
63:a
64:good
65:experience
66:and
67:the
68:staff
69:was
70:quite
71:friendly
72:.
73:

74:what
75:a
76:surprise
77:What
78:a
79:surprise
80:the
81:Sheraton
82:was
83:after
84:reading
85:some
86:of
87:the
88:reviews
89:.
90:it
91:would
92:appear
93:there
94:is
95:a
96:massive
97:difference
98:in
99:the
100:rooms
101:,
102:the
103:South
104:tower
105:being
106:the
107:best
108:.
109:Check
110:in
111:was
112:very
113:efficient
114:and
115:the
116:room
117:was
118:lovely
119:,
120:very
121:large
122:with
123:

In [277]:
print (numwords) #shows number of words

61461


In [278]:
#List of word tokens
[words.text for words in document_file]

['Nice',
 'place',
 'Better',
 'than',
 'some',
 'reviews',
 'give',
 'it',
 'credit',
 'for',
 '.',
 'Overall',
 ',',
 'the',
 'rooms',
 'were',
 'a',
 'bit',
 'small',
 'but',
 'nice',
 '.',
 'Everything',
 'was',
 'clean',
 ',',
 'the',
 'view',
 'was',
 'wonderful',
 'and',
 'it',
 'is',
 'very',
 'well',
 'located',
 '(',
 'the',
 'Prudential',
 'Center',
 'makes',
 'shopping',
 'and',
 'eating',
 'easy',
 'and',
 'the',
 'T',
 'is',
 'nearby',
 'for',
 'jaunts',
 'out',
 'and',
 'about',
 'the',
 'city',
 ')',
 '.',
 'Overall',
 ',',
 'it',
 'was',
 'a',
 'good',
 'experience',
 'and',
 'the',
 'staff',
 'was',
 'quite',
 'friendly',
 '.',
 '\r\n',
 'what',
 'a',
 'surprise',
 'What',
 'a',
 'surprise',
 'the',
 'Sheraton',
 'was',
 'after',
 'reading',
 'some',
 'of',
 'the',
 'reviews',
 '.',
 'it',
 'would',
 'appear',
 'there',
 'is',
 'a',
 'massive',
 'difference',
 'in',
 'the',
 'rooms',
 ',',
 'the',
 'South',
 'tower',
 'being',
 'the',
 'best',
 '.',
 'Check',
 'in',
 

In [279]:
#1. Noun phrase extraction from item description
for chunk in document_file.noun_chunks:
    print("noun phrase: ",chunk.text," | root noun text: ",chunk.root.text, " | connector text: ",chunk.root.head.text)


noun phrase:  Nice place  | root noun text:  place  | connector text:  place
noun phrase:  some reviews  | root noun text:  reviews  | connector text:  give
noun phrase:  it  | root noun text:  it  | connector text:  give
noun phrase:  credit  | root noun text:  credit  | connector text:  give
noun phrase:  the rooms  | root noun text:  rooms  | connector text:  were
noun phrase:  Everything  | root noun text:  Everything  | connector text:  was
noun phrase:  the view  | root noun text:  view  | connector text:  was
noun phrase:  it  | root noun text:  it  | connector text:  is
noun phrase:  the Prudential Center  | root noun text:  Center  | connector text:  makes
noun phrase:  shopping  | root noun text:  shopping  | connector text:  makes
noun phrase:  the T  | root noun text:  T  | connector text:  is
noun phrase:  jaunts  | root noun text:  jaunts  | connector text:  for
noun phrase:  the city  | root noun text:  city  | connector text:  about
noun phrase:  it  | root noun text:  

In [280]:
#Lemmatization of all the words
for word in document_file:
  print(word.text," ---> ",word.lemma_)

Nice  --->  nice
place  --->  place
Better  --->  better
than  --->  than
some  --->  some
reviews  --->  review
give  --->  give
it  --->  -PRON-
credit  --->  credit
for  --->  for
.  --->  .
Overall  --->  overall
,  --->  ,
the  --->  the
rooms  --->  room
were  --->  be
a  --->  a
bit  --->  bit
small  --->  small
but  --->  but
nice  --->  nice
.  --->  .
Everything  --->  Everything
was  --->  be
clean  --->  clean
,  --->  ,
the  --->  the
view  --->  view
was  --->  be
wonderful  --->  wonderful
and  --->  and
it  --->  -PRON-
is  --->  be
very  --->  very
well  --->  well
located  --->  locate
(  --->  (
the  --->  the
Prudential  --->  Prudential
Center  --->  Center
makes  --->  make
shopping  --->  shopping
and  --->  and
eating  --->  eat
easy  --->  easy
and  --->  and
the  --->  the
T  --->  t
is  --->  be
nearby  --->  nearby
for  --->  for
jaunts  --->  jaunt
out  --->  out
and  --->  and
about  --->  about
the  --->  the
city  --->  city
)  --->  )
.  --->  .
Overall

In [281]:
#2.Lemmatization of verb phrases (if the POS is verb and it is not the stop word then find the lemma)
for word in document_file:
  if(word.pos_ =="VERB"):
    if(nlp.vocab[str(word)].is_stop==False):
      print(word.text, " --->", word.lemma_)

located  ---> locate
makes  ---> make
eating  ---> eat
reading  ---> read
appear  ---> appear
Check  ---> check
stated  ---> state
picks  ---> pick
helped  ---> help
relax  ---> relax
depending  ---> depend
Find  ---> find
stayed  ---> stay
felt  ---> feel
staying  ---> stay
pay  ---> pay
expect  ---> expect
coming  ---> come
felt  ---> feel
staying  ---> stay
stayed  ---> stay
serve  ---> serve
horrified  ---> horrify
charged  ---> charge
load  ---> load
called  ---> call
learn  ---> learn
patched  ---> patch
reactivated  ---> reactivate
having  ---> have
shipped  ---> ship
told  ---> tell
involved  ---> involve
receive  ---> receive
charges  ---> charge
based  ---> base
shipped  ---> ship
charged  ---> charge
receive  ---> receive
claims  ---> claim
stopped  ---> stop
positioned  ---> position
push  ---> push
hit  ---> hit
pushed  ---> push
opened  ---> open
blocked  ---> block
limited  ---> limit
went  ---> go
watching  ---> watch
argued  ---> argue
stay  ---> stay
Got  ---> get
wen

In [282]:
# 3. Named Entity extraction and recognition
for words in document_file.ents:
  print(words.text, " ---> ",words.label_)

displacy.render(document_file,style='ent',jupyter=True)


the Prudential Center  --->  ORG
Sheraton  --->  ORG
South  --->  LOC
Wrentham Village  --->  GPE
the day  --->  DATE
just under $30  --->  MONEY
Boston  --->  GPE
17th  --->  DATE
Sheraton  --->  ORG
3 nights  --->  DATE
the day  --->  DATE
evening  --->  TIME
Hilton  --->  PERSON
one and a half  --->  CARDINAL
Sheraton  --->  FAC
Back Bay  --->  LOC
November 21-25  --->  DATE
9.95  --->  MONEY
the third night  --->  TIME
daily  --->  DATE
an hour  --->  TIME
Sprint  --->  GPE
over 30 minutes  --->  TIME
three  --->  CARDINAL
8  --->  CARDINAL
10  --->  CARDINAL
13  --->  CARDINAL
53  --->  MONEY
NEVER  --->  ORG
24th  --->  ORDINAL
four  --->  CARDINAL
five nights  --->  TIME
41  --->  MONEY
Hilton  --->  ORG
One  --->  CARDINAL
October 5 nights  --->  DATE
Sheraton  --->  ORG
Priceline  --->  GPE
16th  --->  ORDINAL
North Tower  --->  FAC
the Charles River  --->  LOC
3  --->  CARDINAL
Two  --->  CARDINAL
the Prudential Centre  --->  ORG
each morning  --->  TIME
the evening  --->  TI