In [1]:
import spacy

In [2]:
# Create a blank English pipeline with spacy.blank (no trained package is loaded here).
nlp = spacy.blank("en")

In [5]:
# Display the names of the components currently registered on `nlp`.
nlp.pipe_names

[]

In [10]:
# Run a sample sentence through the pipeline and print every token on its own line.
sentence = "Captain america ate 100$ of samosa. Then he said I can do this all day."
doc = nlp(sentence)

for tok in doc:
    print(tok)

Captain
america
ate
100
$
of
samosa
.
Then
he
said
I
can
do
this
all
day
.


In [6]:
# Load the trained "en_core_web_sm" language-processing pipeline and rebind `nlp` to it.
nlp = spacy.load("en_core_web_sm")

In [8]:
# Display the names of the components in the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [9]:
# Display the pipeline as (name, component object) pairs.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x20ab94fe750>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x20ab94fe3f0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x20ab9510890>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x20ab9793310>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x20ab978bb10>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x20ab8a8e2d0>)]

In [11]:
# Print each token together with its part-of-speech tag (pos_) and its lemma (lemma_).
sentence = "Captain america ate 100$ of samosa. Then he said I can do this all day."
doc = nlp(sentence)

separator = " | "
for tok in doc:
    print(tok, separator, tok.pos_, separator, tok.lemma_)

Captain  |  PROPN  |  Captain
america  |  PROPN  |  america
ate  |  VERB  |  eat
100  |  NUM  |  100
$  |  NUM  |  $
of  |  ADP  |  of
samosa  |  PROPN  |  samosa
.  |  PUNCT  |  .
Then  |  ADV  |  then
he  |  PRON  |  he
said  |  VERB  |  say
I  |  PRON  |  I
can  |  AUX  |  can
do  |  VERB  |  do
this  |  PRON  |  this
all  |  DET  |  all
day  |  NOUN  |  day
.  |  PUNCT  |  .


In [13]:
# Named entity recognition: print every entity found in the sentence with its
# label and spaCy's human-readable explanation of that label.
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

separator = " | "
for entity in doc.ents:
    label = entity.label_
    print(entity, separator, label, separator, spacy.explain(label))

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
$45 billion  |  MONEY  |  Monetary values, including unit


In [14]:
from spacy import displacy

# Visualize the named entities of `doc` using displaCy's "ent" style.
displacy.render(doc, style="ent")

In [15]:
# Stemming and lemmatization: two important text-preprocessing steps

import nltk
import spacy

In [16]:
from nltk import PorterStemmer

# Create the Porter stemmer instance used by the stemming example.
stemmer = PorterStemmer()

In [17]:
# Stem a few sample words with the Porter stemmer and print "word | stem" for each.
words = ["eating", "eats", "eat", "ate", "adjustable", "rafting", "ability", "meeting"]

divider = "|"
for original in words:
    stemmed = stemmer.stem(original)
    print(original, divider, stemmed)

eating | eat
eats | eat
eat | eat
ate | ate
adjustable | adjust
rafting | raft
ability | abil
meeting | meet


In [21]:
# Lemmatize the same sample words with spaCy's trained pipeline and print
# "token | lemma" for each, for comparison with the stemmer output.
nlp = spacy.load("en_core_web_sm")

sample_text = "eating eats eat ate adjustable rafting ability meeting"
doc = nlp(sample_text)

divider = "|"
for tok in doc:
    print(tok, divider, tok.lemma_)

eating | eat
eats | eat
eat | eat
ate | eat
adjustable | adjustable
rafting | raft
ability | ability
meeting | meeting


In [23]:
# Print each token with its lemma as text (lemma_) and as the integer value
# exposed by token.lemma; tokens with the same lemma print the same integer.
doc1 = nlp("Mando talked for 3 hours although talking isn't his thing")

divider = "|"
for tok in doc1:
    print(tok, divider, tok.lemma_, divider, tok.lemma)

Mando | Mando | 7837215228004622142
talked | talk | 13939146775466599234
for | for | 16037325823156266367
3 | 3 | 602994839685422785
hours | hour | 9748623380567160636
although | although | 343236316598008647
talking | talk | 13939146775466599234
is | be | 10382539506755952630
n't | not | 447765159362469301
his | his | 2661093235354845946
thing | thing | 2473243759842082748


In [25]:
# Display the component names again; 'attribute_ruler' is the component fetched with get_pipe further down.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [27]:
# Lemmatize an informal sentence containing slang ("Bro", "Brah", "wanna")
# and print "token | lemma" for each token.
doc2 = nlp("Bro, you wanna go? Brah, don't say no! I am exhausted")

divider = "|"
for tok in doc2:
    print(tok, divider, tok.lemma_)

Bro | bro
, | ,
you | you
wanna | wanna
go | go
? | ?
Brah | Brah
, | ,
do | do
n't | not
say | say
no | no
! | !
I | I
am | be
exhausted | exhaust


In [36]:
# Customize lemmas for slang: tokens whose exact text is "Bro" or "Brah" get the lemma "Brother".
#
# AttributeRuler.add appends a new rule to the component and mutates the shared `nlp`
# pipeline in place, so re-running this cell on the same object would stack another
# copy of the same rule each time. Starting from a freshly loaded pipeline keeps the
# cell idempotent: after any number of runs, `nlp` carries exactly one copy of the rule.
nlp = spacy.load("en_core_web_sm")
ar = nlp.get_pipe('attribute_ruler')

ar.add([[{"TEXT":"Bro"}],[{"TEXT":"Brah"}]],{"LEMMA":"Brother"})

# Re-process the same sentence to confirm the custom lemma is applied.
doc = nlp("Bro, you wanna go? Brah, don't say no! I am exhausted")
for token in doc:
    print(token.text, "|", token.lemma_)

Bro | Brother
, | ,
you | you
wanna | wanna
go | go
? | ?
Brah | Brother
, | ,
do | do
n't | not
say | say
no | no
! | !
I | I
am | be
exhausted | exhaust
