# Import Libraries

In [3]:
import spacy
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Load the small English language model
nlp = spacy.load('en_core_web_sm')

In [5]:
# Create a simple document
doc = nlp(u'Infosys is planning to buy a bangalore startup for $2 core')

In [7]:
# Print every token next to its part-of-speech tag, one pair per line.
for token in doc:
    print(f'{token} {token.pos_}')

Infosys PROPN
is AUX
planning VERB
to PART
buy VERB
a DET
bangalore NOUN
startup NOUN
for ADP
$ SYM
2 NUM
core NOUN


In [8]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x1e539c56828>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x1e548cd4ee8>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x1e548cd4f48>)]

## Tokenization

In [25]:
mystring = '" We\'re moving to Bangalore! you can reach me at pavna@gmail.com if needed, or visit http://www.pavan.com"'

In [26]:
doc1 = nlp(mystring)

In [27]:
for token in doc1:
    print(token.text)

"
We
're
moving
to
Bangalore
!
you
can
reach
me
at
pavna@gmail.com
if
needed
,
or
visit
http://www.pavan.com
"


In [28]:
doc2 = nlp(u'It costs $20.50 to travel St. Louis from my place!')

In [29]:
for token in doc2:
    print(token)

It
costs
$
20.50
to
travel
St.
Louis
from
my
place
!


In [31]:
len(doc2.vocab)

511

In [34]:
doc2[2:6]

$20.50 to travel

In [35]:
# Assignment not allowed: a Doc does not support item assignment.
# The TypeError is the point of this cell, so catch it and display it
# instead of letting it abort a "Restart & Run All".
try:
    doc2[0] = 'test'
except TypeError as err:
    print(f'TypeError: {err}')

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment

In [54]:
doc3 = nlp(u'Apple to build a green energy factory for Rupees $2 million')

In [55]:
for token in doc3:
    print(token.text, end=' | ')

Apple | to | build | a | green | energy | factory | for | Rupees | $ | 2 | million | 

In [59]:
# For each named entity show: the span text, its label, a human-readable
# explanation of the label, and a blank-line separator.
for entity in doc3.ents:
    print(entity, entity.label_, str(spacy.explain(entity.label_)), '\n', sep='\n')

Apple
ORG
Companies, agencies, institutions, etc.


$2 million
MONEY
Monetary values, including unit




In [60]:
doc4 = nlp(u'Electric cars shift insurance liability towards manufacturers!')

In [62]:
for chunk in doc4.noun_chunks:
    print(chunk)

Electric cars
insurance liability
manufacturers


In [63]:
from spacy import displacy

In [66]:
displacy.render(doc3, style='dep', jupyter=True,options={'distance': 100} )

In [67]:
doc5 = nlp(u'Over the last quarter Amazon sold nearly millions of products and made a profit $100 million')

In [70]:
displacy.render(doc5, style='ent', jupyter=True )

## Stemming

In [79]:
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [72]:
# load porter stemmer
stemmer = PorterStemmer()

In [73]:
words = ['run', 'runner', 'rans', 'easily', 'fairly', 'fairness']

In [77]:
# Show each word next to the stem produced by `stemmer`.
for word in words:
    print(word, stemmer.stem(word), sep=' ---> ')

run ---> run
runner ---> runner
rans ---> ran
easily ---> easili
fairly ---> fairli
fairness ---> fair


In [80]:
s_stem = SnowballStemmer(language='english')

In [81]:
# Same word list, this time stemmed with `s_stem` for comparison.
for word in words:
    print(word, s_stem.stem(word), sep=' ---> ')

run ---> run
runner ---> runner
rans ---> ran
easily ---> easili
fairly ---> fair
fairness ---> fair


In [82]:
words = ['generous', 'generation', 'generously', 'generate']

In [83]:
# Stem the current `words` list with `s_stem`.
for word in words:
    print(word, s_stem.stem(word), sep=' --> ')

generous --> generous
generation --> generat
generously --> generous
generate --> generat


## Lemmatization

In [84]:
#to show information in table format
def show_lemas(text):
    """Print one aligned row per token of ``text``.

    ``text`` is any iterable of token-like objects exposing ``text``,
    ``pos``, ``lemma`` and ``lemma_``. Columns, in order: token text
    (width 12), ``pos`` (width 6), ``lemma`` (left-aligned, width 22),
    and ``lemma_``. Returns None; the rows are written to stdout.
    """
    row_template = '{:12} {:6} {:<22} {}'
    for tok in text:
        print(row_template.format(tok.text, tok.pos, tok.lemma, tok.lemma_))

In [87]:
doc6 = nlp(u'I saw 10 cutetest puppies today!')

In [88]:
show_lemas(doc6)

I                95 561228191312463089     -PRON-
saw             100 11925638236994514241   see
10               93 6572986864102252890    10
cutetest         84 11385279200967829043   cutet
puppies          92 10467886468764181784   puppy
today            92 11042482332948150395   today
!                97 17494803046312582752   !


## Stopwords

In [101]:
print(nlp.Defaults.stop_words)
print("Count :", len(nlp.Defaults.stop_words))

{'can', 'elsewhere', 'between', 'really', '‘d', 'she', 'keep', '’ve', 'that', "'ve", 'herein', 'sometime', 'on', 'into', 'formerly', 'seemed', 'i', 'been', 'among', 'already', 'everywhere', 'whether', '‘ve', 'from', 'mine', 'used', 'several', 'no', 'until', 'show', 'ours', 'whence', 'there', 'why', 'them', 'towards', 'hers', 'across', 'get', 'somehow', 'seem', 'herself', 'those', 'are', 'both', 'too', 'nothing', 'itself', 'therefore', 'they', 'whose', 'neither', 'although', 'or', 'he', 'am', 'would', 'in', 'hereby', 'yours', 'indeed', 'were', 'to', 'then', 'may', 'any', 'seems', 'up', 'empty', 'via', 'most', '’ll', 'must', 'hereafter', 'move', 'within', 'put', 'out', 'along', 'whereafter', 'under', 'next', 'only', 'nowhere', 'take', 'will', 'except', 'as', 'eight', '’d', 'toward', 'none', 'you', 'a', 'and', 'say', 'after', 'if', 'rather', 'at', 'onto', 'see', 'but', 'whereupon', 'an', 'noone', 'n‘t', 'regarding', 'hereupon', 'through', 'does', "'d", 'whoever', 'amount', 'cannot', 'ever

In [102]:
nlp.vocab['for'].is_stop

True

In [103]:
# to add a new stopword
nlp.Defaults.stop_words.add('btw')
# Also set the flag on the vocab entry itself, so that is_stop lookups
# (nlp.vocab[...].is_stop / token.is_stop) agree with the updated set
# even if the lexeme was created before the set changed.
nlp.vocab['btw'].is_stop = True

In [106]:
len(nlp.Defaults.stop_words)

327

In [107]:
# to remove a stopword
# discard() instead of remove(): re-running this cell must not raise KeyError
# once 'various' is already gone.
nlp.Defaults.stop_words.discard('various')
# Clear the flag on the vocab entry as well so is_stop lookups agree with the set.
nlp.vocab['various'].is_stop = False
len(nlp.Defaults.stop_words)

326

In [108]:
nlp.vocab['various'].is_stop

False

## Phrase Matching and Vocabulary

In [136]:
from spacy.matcher import Matcher, PhraseMatcher

In [111]:
matcher = Matcher(nlp.vocab)

In [122]:
# Create patterns
# Each pattern is a list with one attribute dict per token to match.

# "SolarPower" written as a single token (compared on the lowercased text)
pattern1 = [{'LOWER': 'solarpower'}]

# "Solar-power": 'solar', then one punctuation token, then 'power'
pattern2 = [{'LOWER': 'solar'},{'IS_PUNCT':True} , {'LOWER': 'power'}]

# "solar power": 'solar' immediately followed by 'power'
pattern3 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]

In [123]:
# Register all three patterns under one key; the second argument is the
# on-match callback slot (None = no callback).
# NOTE(review): this is the spaCy v2 call form; v3 expects
# matcher.add(key, [pattern, ...]) — confirm against the installed version.
matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

In [124]:
doc7 = nlp(u'Here Solar Power industry produces the solarpower and makes th life better and Solar-power is renewable energy')

In [125]:
matcher(doc7)

[(8656102463236116519, 1, 3),
 (8656102463236116519, 6, 7),
 (8656102463236116519, 13, 16)]

In [126]:
# to remove pattern
matcher.remove('SolarPower')

In [128]:
# Redefine the patterns: 'OP': '*' lets the punctuation token occur zero or
# more times, so pattern2 alone covers "solar power", "solar-power" and
# "solar--power".
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'},{'IS_PUNCT':True, 'OP': '*'} , {'LOWER': 'power'}]

In [129]:
# Re-register the key with the two redefined patterns (None = no on-match callback).
# NOTE(review): spaCy v2 call form; v3 expects matcher.add(key, [pattern, ...]) — confirm.
matcher.add('SolarPower', None, pattern1, pattern2)

In [134]:
# NOTE(review): this rebinds doc2, which the Tokenization section already bound
# to a different sentence; re-running those earlier cells after this one will
# show different results. Consider a fresh name if nothing else relies on doc2.
doc2 = nlp(u'check this Solar--power is a solarpower')

In [135]:
matcher(doc2)

[(8656102463236116519, 2, 5), (8656102463236116519, 7, 8)]