<a href="https://colab.research.google.com/github/mr-alamdari/NLP-Some-Libraries-Practice-Beginner/blob/main/NLP_Some_Libraries_Practice_Beginner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [spaCy](https://spacy.io/api)

In [None]:
import spacy
from spacy import displacy
from spacy.matcher import Matcher, PhraseMatcher

### Loading a Model

In [None]:
# Load the 'en_core_web_sm' model once; the later cells reuse this `nlp` object.
nlp = spacy.load('en_core_web_sm')

### Apply the model to a document (Doc object)

In [None]:
# Calling the loaded model on raw text produces the document object explored below.
doc = nlp('The First My spacy code, to learn this awsome library ')

In [None]:
# Show the processed document.
doc

The First My spacy code, to learn this awsome library 

In [None]:
# Iterating over the document yields one token per step.
for token in doc:
  print(token)
  #print(token.text)

The
First
My
spacy
code
,
to
learn
this
awsome
library


In [None]:
# POS = part of speech; `pos` prints as an integer ID, `pos_` as its text label.
for token in doc:
  print(f'{token.text} {token.pos} {token.pos_}')

The 90 DET
First 96 PROPN
My 90 DET
spacy 92 NOUN
code 92 NOUN
, 97 PUNCT
to 94 PART
learn 100 VERB
this 90 DET
awsome 84 ADJ
library 92 NOUN


In [None]:
# dep ===> Syntactic dependency
for token in doc:
  print(token.text, token.dep_)

The det
First amod
My poss
spacy compound
code ROOT
, punct
to aux
learn relcl
this det
awsome compound
library dobj


In [None]:
# List the (name, component) pairs that make up the loaded pipeline.
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7f583595b250>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f5835268f30>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7f5835268d70>)]

In [None]:
# Only the component names of the pipeline.
nlp.pipe_names

['tagger', 'parser', 'ner']

In [None]:
# Sample text with doubled punctuation, a run of spaces and a contraction to exercise the tokenizer.
doc2 = nlp("This is a ?? sentence    to practice. tokenization, It's Awsome")

In [None]:
# Coarse part-of-speech label and dependency label, side by side for every token.
for token in doc2:
  print(f'{token.text} {token.pos_} {token.dep_}')

This DET nsubj
is AUX ROOT
a DET attr
? PUNCT punct
? PUNCT punct
sentence NOUN npadvmod
    SPACE 
to PART aux
practice VERB ROOT
. PUNCT punct
tokenization NOUN npadvmod
, PUNCT punct
It PRON nsubj
's AUX ROOT
Awsome PROPN attr


In [None]:
# tag_ is the detailed tag of each token (e.g. DT, VBZ in the output below).
for token in doc2:
  print(f'{token.text} {token.tag_}')

This DT
is VBZ
a DT
? .
? .
sentence NN
    _SP
to TO
practice VB
. .
tokenization NN
, ,
It PRP
's VBZ
Awsome NNP


In [None]:
# lemma_ is the base form the model assigns to each token (e.g. "is" -> "be").
for token in doc2:
  print(f'{token.text} {token.lemma_}')

This this
is be
a a
? ?
? ?
sentence sentence
       
to to
practice practice
. .
tokenization tokenization
, ,
It -PRON-
's be
Awsome Awsome


In [None]:
# shape_ abstracts a token's characters: X / x stand for upper- / lower-case letters.
for token in doc2:
  print(f'{token.text} {token.shape_}')

This Xxxx
is xx
a x
? ?
? ?
sentence xxxx
       
to xx
practice xxxx
. .
tokenization xxxx
, ,
It Xx
's 'x
Awsome Xxxxx


In [None]:
# Confirm the Python type of every element yielded by the document.
for token in doc2:
  print(type(token))

<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>


### Tokenization

##### **Prefix**: characters at the beginning
##### **Suffix**: characters at the end
##### **Infix**: characters in between
##### Exception


In [None]:
# The text deliberately keeps its leading apostrophe; inside double quotes the
# apostrophe in "We're" needs no backslash escape.
my_str = "'We're moving to L.A.!, are you coming?"

In [None]:
# Re-binds `doc` to the tokenization example; the earlier sentence is no longer reachable by this name.
doc = nlp(my_str)

In [None]:
# One token per line, to see how quotes, contractions and abbreviations are split.
for token in doc:
  print(token.text)

'
We
're
moving
to
L.A.
!
,
are
you
coming
?


In [None]:
# Sentence mixing a URL-like string, an abbreviation, place names and a dollar amount.
doc2 = nlp('Apple helped me create this http://popo@gmail.com gmail!, In L.A. with my freind in Japan so, I can only give you $153.61')

In [None]:
# One token per line for the new sentence.
for token in doc2:
  print(token.text)

Apple
helped
me
create
this
http://popo@gmail.com
gmail
!
,
In
L.A.
with
my
freind
in
Japan
so
,
I
can
only
give
you
$
153.61


In [None]:
# Named entities found in the document: the span, its label, then a readable
# explanation of that label.
for entity in doc2.ents:
  print(entity, entity.label_)
  # The two extra newlines leave a blank gap between consecutive entities.
  print(spacy.explain(entity.label_), end='\n\n\n')

Apple ORG
Companies, agencies, institutions, etc.


L.A. GPE
Countries, cities, states


Japan GPE
Countries, cities, states


153.61 MONEY
Monetary values, including unit




In [None]:
# Two-sentence sample reused for noun chunks and the displaCy visualizations below.
doc3 = nlp('a beautiful butterfly can go beyonds eyes so esi, be carefull. yesterday IBM bought an apple for $5.5 millions')

In [None]:
# noun_chunks yields the noun phrases detected in the document.
for chunk in doc3.noun_chunks:
  print(chunk)

a beautiful butterfly
beyonds eyes
IBM
an apple
$5.5 millions


In [None]:
# Draw the dependency parse inline; `distance` sets the spacing between words.
# Uses the explicitly imported `displacy` instead of relying on `spacy.displacy`
# having been loaded as a side effect of `import spacy`.
displacy.render(doc3, style='dep', jupyter=True, options={'distance': 110})
# Outside a notebook: displacy.serve(doc3, style='dep', options={'distance': 110})

In [None]:
# Highlight the named entities inline.
# Uses the explicitly imported `displacy` instead of relying on `spacy.displacy`
# having been loaded as a side effect of `import spacy`.
displacy.render(doc3, style='ent', jupyter=True)
# Outside a notebook: displacy.serve(doc3, style='ent')

### Stemming

###### A crude method for cataloging related words

In [None]:
import nltk
from nltk.stem.porter import PorterStemmer

In [None]:
# Porter stemmer instance used in the comparison below.
p_stemmer = PorterStemmer()

In [None]:
# Test vocabulary shared by both stemmers, grouped by word family.
words = [
  'runner', 'run', 'ran', 'runs',
  'easy', 'easier', 'easiest',
]

In [None]:
# Show each word next to its Porter stem.
for word in words:
  print(word, '---->', p_stemmer.stem(word))

runner ----> runner
run ----> run
ran ----> ran
runs ----> run
easy ----> easi
easier ----> easier
easiest ----> easiest


In [None]:
from nltk.stem.snowball import SnowballStemmer

In [None]:
# Snowball stemmer configured for English, for comparison with the Porter stemmer.
s_stemmer = SnowballStemmer(language='english')

In [None]:
# Show each word next to its Snowball stem.
for word in words:
  print(word, '---->', s_stemmer.stem(word))

runner ----> runner
run ----> run
ran ----> ran
runs ----> run
easy ----> easi
easier ----> easier
easiest ----> easiest


### Lemmatization
###### A more informative way of reducing words to their root form

In [None]:
# Sentence containing several inflections of "run" to compare their lemmas.
doc1 = nlp('I am a runner and loved runners, so i think running is awsome, so run when you can')

In [None]:
# `lemma` prints as a large integer, `lemma_` as the readable lemma string.
for token in doc1:
  print(f'{token.text} {token.pos_} {token.lemma} {token.lemma_}')

I PRON 561228191312463089 -PRON-
am AUX 10382539506755952630 be
a DET 11901859001352538922 a
runner NOUN 12640964157389618806 runner
and CCONJ 2283656566040971221 and
loved VERB 3702023516439754181 love
runners NOUN 12640964157389618806 runner
, PUNCT 2593208677638477497 ,
so CCONJ 9781598966686434415 so
i PRON 5097672513440128799 i
think VERB 16875814820671380748 think
running VERB 12767647472892411841 run
is AUX 10382539506755952630 be
awsome ADJ 3521391281120521496 awsome
, PUNCT 2593208677638477497 ,
so ADV 9781598966686434415 so
run VERB 12767647472892411841 run
when ADV 15807309897752499399 when
you PRON 561228191312463089 -PRON-
can VERB 6635067063807956629 can


In [None]:
# Text and lemma only, without the integer IDs.
for token in doc1:
  print(f'{token.text} {token.lemma_}')

I -PRON-
am be
a a
runner runner
and and
loved love
runners runner
, ,
so so
i i
think think
running run
is be
awsome awsome
, ,
so so
run run
when when
you -PRON-
can can


In [None]:
def show_lemmas(doc):
  """Print one aligned row per token: text, POS label, lemma ID and lemma string.

  Args:
    doc: an iterable of tokens exposing `text`, `pos_`, `lemma` and `lemma_`.

  The first three columns are padded to 12, 10 and 22 characters (left-aligned)
  so the rows line up; nothing is returned.
  """
  row_template = '{text:12} {pos:10} {lemma_id:<22} {lemma}'
  for tok in doc:
    print(row_template.format(
      text=tok.text,
      pos=tok.pos_,
      lemma_id=tok.lemma,
      lemma=tok.lemma_,
    ))

In [None]:
# Same information as the loop above, now in aligned columns.
show_lemmas(doc1)

I            PRON       561228191312463089     -PRON-
am           AUX        10382539506755952630   be
a            DET        11901859001352538922   a
runner       NOUN       12640964157389618806   runner
and          CCONJ      2283656566040971221    and
loved        VERB       3702023516439754181    love
runners      NOUN       12640964157389618806   runner
,            PUNCT      2593208677638477497    ,
so           CCONJ      9781598966686434415    so
i            PRON       5097672513440128799    i
think        VERB       16875814820671380748   think
running      VERB       12767647472892411841   run
is           AUX        10382539506755952630   be
awsome       ADJ        3521391281120521496    awsome
,            PUNCT      2593208677638477497    ,
so           ADV        9781598966686434415    so
run          VERB       12767647472892411841   run
when         ADV        15807309897752499399   when
you          PRON       561228191312463089     -PRON-
can          VERB       

### Stop words

In [None]:
# Print the default stop-word set of the loaded language (the output is long).
print(nlp.Defaults.stop_words)

{'between', 'namely', 'indeed', 'give', 'myself', 'about', 'anyway', 'enough', 'twelve', 'has', 'otherwise', 'after', 'none', 'n‘t', 'yourself', 'hereafter', 'us', 'had', 'would', 'meanwhile', 'any', 'them', 'whence', 'nor', 'seeming', 'been', 'whom', 'several', 'whole', 'doing', 'move', '‘ll', 'keep', 'without', 'nevertheless', 'whereafter', 'less', 'made', 'why', 'above', 'full', 'down', 'no', 'via', 'then', 'until', 'though', 'anyhow', 'in', '’ll', 'latterly', 'the', 'over', 'if', 'go', 'third', 'but', 'besides', 'beforehand', 'did', 'on', 'of', 'does', 'now', 'being', 'seems', 'take', 'others', 'those', 'five', 'my', 'nothing', 'during', 'either', 'top', 'back', 'as', 'while', 'alone', 'one', 'noone', 'already', 'hundred', 'empty', 'something', 'hers', '’re', 'last', "'m", 'all', 'hereby', 'into', 'except', 'might', 'two', 'whoever', 'an', 'ourselves', 'few', 'nine', 'only', 'please', 'rather', 'unless', 'whose', 'cannot', 'everything', 'with', '‘ve', 'can', 'never', 'least', 'ther

In [None]:
# Number of default stop words.
len(nlp.Defaults.stop_words)

326

In [None]:
# Look the word up in the vocabulary and read its stop-word flag.
nlp.vocab['is'].is_stop

True

###### Add a stop word

In [None]:
s_word = 'btw'
# Two places are updated: the defaults set and the flag on the vocabulary entry.
nlp.Defaults.stop_words.add(s_word)
nlp.vocab[s_word].is_stop = True

In [None]:
# Verify that the flag set above is now reported.
nlp.vocab['btw'].is_stop

True

###### Remove a stop word

In [None]:
s_word = 'beyond'
# `discard` (unlike `remove`) is a no-op when the word is already absent, so
# this cell can be re-run without raising KeyError.
nlp.Defaults.stop_words.discard(s_word)
# Clear the flag on the vocabulary entry as well, mirroring the "add" cell.
nlp.vocab[s_word].is_stop = False

In [None]:
# Verify that the flag cleared above is now reported as False.
nlp.vocab['beyond'].is_stop

False

###### Matching and Vocabulary

In [None]:
# Rule-based token matcher that shares the pipeline's vocabulary.
# Uses the explicitly imported class rather than depending on `spacy.matcher`
# having been loaded as a side effect of `import spacy`.
matcher = Matcher(nlp.vocab)

In [None]:
# Each pattern is a list with one dict of constraints per token.

# "solarpower" written as a single token
pattern1 = [
  {'LOWER': 'solarpower'},
]
# "solar-power": two words with one punctuation token in between
pattern2 = [
  {'LOWER': 'solar'},
  {'IS_PUNCT': True},
  {'LOWER': 'power'},
]
# "solar power": two consecutive words
pattern3 = [
  {'LOWER': 'solar'},
  {'LOWER': 'power'},
]

In [None]:
# Register all three patterns under one key. The list form is accepted from
# spaCy 2.2.2 onwards; the older `add(key, None, *patterns)` signature was
# removed in spaCy 3.
matcher.add('SolarPower', [pattern1, pattern2, pattern3])

In [None]:
# Text containing all three spellings targeted by the patterns.
doc = nlp('The Solar Power industry uses solarpower, so Solar-Power is amazing')

In [None]:
# Calling the matcher on a document returns the matches as (match_id, start, end) tuples.
matches = matcher(doc)

In [None]:
# Raw match tuples: integer match ID plus start/end token indices.
matches

[(8656102463236116519, 1, 3),
 (8656102463236116519, 5, 6),
 (8656102463236116519, 8, 11)]

In [None]:
# Resolve each match: the integer ID maps back to the rule name through the
# vocabulary's string store, and start/end slice the matched span.
# The loop variable is `match_id` rather than `id` so the builtin `id()` is not
# shadowed for the rest of the kernel session.
for match_id, start, end in matches:
  print(match_id, nlp.vocab.strings[match_id], doc[start:end].text)

8656102463236116519 SolarPower Solar Power
8656102463236116519 SolarPower solarpower
8656102463236116519 SolarPower Solar-Power


In [None]:
# Drop the 'SolarPower' rules so the key can be registered again with new patterns.
matcher.remove('SolarPower')

In [None]:
# "solarpower" written as a single token
pattern1 = [
  {'LOWER': 'solarpower'},
]
# "solar" and "power" separated by any number of punctuation tokens
# ('OP': '*' makes the middle token optional and repeatable)
pattern2 = [
  {'LOWER': 'solar'},
  {'IS_PUNCT': True, 'OP': '*'},
  {'LOWER': 'power'},
]

In [None]:
# Re-register the key with the two new patterns. The list form is accepted from
# spaCy 2.2.2 onwards; the older `add(key, None, *patterns)` signature was
# removed in spaCy 3.
matcher.add('SolarPower', [pattern1, pattern2])

In [None]:
# First case: four hyphens between the two words.
doc2 = nlp('Solar----Power is Solar POWER')

In [None]:
# Only one match comes back here, at token indices 2-4.
# NOTE(review): presumably the four-hyphen form is not split into separate tokens — confirm.
matcher(doc2)

[(8656102463236116519, 2, 4)]

In [None]:
# Second case: three hyphens between the two words.
doc2 = nlp('Solar---Power is Solar POWER')

In [None]:
# With three hyphens both occurrences are matched (two tuples are returned).
matcher(doc2)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 6)]

In [None]:
# Phrase matcher that shares the pipeline's vocabulary; its patterns are whole
# documents rather than per-token dicts.
# Uses the explicitly imported class rather than depending on `spacy.matcher`
# having been loaded as a side effect of `import spacy`.
p_matcher = PhraseMatcher(nlp.vocab)

In [None]:
# Phrases to search for, exactly as they are spelled in the sample text below.
phrase_list = [
  'voodoo economics',
  'supply-side economics',
  'trikle-down economics',
]

In [None]:
# Turn each phrase into a Doc to use as a pattern. `nlp.make_doc` only
# tokenizes, which is all the phrase matcher needs here, so the tagger, parser
# and NER are not run on every phrase.
phrase_patterns = [nlp.make_doc(phrase) for phrase in phrase_list]

In [None]:
# Each pattern is a document object, not a plain string.
type(phrase_patterns[0])

spacy.tokens.doc.Doc

In [None]:
# Register every phrase pattern under one key. The list form is accepted from
# spaCy 2.2.2 onwards; the older `add(key, None, *docs)` signature was removed
# in spaCy 3.
p_matcher.add('EcobMatcher', phrase_patterns)

In [None]:
# Sample text containing all three phrases, then the phrase matcher applied to it.
temp = nlp('hey voodoo economics, got to supply-side economics, say trikle-down economics')
p_matches = p_matcher(temp)

In [None]:
# Raw phrase matches: (match_id, start, end) tuples.
p_matches

[(8838797697251217482, 1, 3),
 (8838797697251217482, 6, 10),
 (8838797697251217482, 12, 16)]

In [None]:
# Resolve each phrase match: the integer ID maps back to the rule name through
# the vocabulary's string store, and start/end slice the matched span.
# The loop variable is `match_id` rather than `id` so the builtin `id()` is not
# shadowed for the rest of the kernel session.
for match_id, start, end in p_matches:
  print(match_id, nlp.vocab.strings[match_id], temp[start:end].text)

8838797697251217482 EcobMatcher voodoo economics
8838797697251217482 EcobMatcher supply-side economics
8838797697251217482 EcobMatcher trikle-down economics
