In [1]:
import sys
!{sys.executable} -m pip install spacy
!{sys.executable} -m spacy download en

Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)


2021-07-11 12:16:16.166439: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library cudart64_110.dll


[!] As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the full
pipeline package name 'en_core_web_sm' instead.
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import spacy

In [3]:
# Load the small English pipeline; `english` is the callable used to process every sentence below.
english = spacy.load('en_core_web_sm')

In [4]:
# Sample sentence with a contraction and an abbreviation. The apostrophe needs no
# backslash escape inside a double-quoted literal.
text = "We're moving to U.S next year"

In [5]:
# Show the raw sentence before any tokenization.
print(text)

We're moving to U.S next year


In [6]:
# Naive whitespace split for comparison: the contraction "We're" stays a single item.
text.split()

["We're", 'moving', 'to', 'U.S', 'next', 'year']

In [7]:
# Run the spaCy pipeline on the sentence; the result is iterated token by token.
doc = english(text)

In [8]:
# Print every token next to its lemma, separated by " \t ".
for token in doc:
    print(f"{token.text} \t {token.lemma_}")

We 	 we
're 	 be
moving 	 move
to 	 to
U.S 	 U.S
next 	 next
year 	 year


In [9]:
# Sentence containing an e-mail address and a URL, to see how the tokenizer treats them.
doc2 = english("We're here to help! Can you send us email at support@lwindia.com or visit at http://lwindia.com.")

In [10]:
# One token per line.
print("\n".join(token.text for token in doc2))

We
're
here
to
help
!
Can
you
send
us
email
at
support@lwindia.com
or
visit
at
http://lwindia.com
.


In [11]:
# Sentence with a number glued to its unit ("10km") and a currency amount.
doc3 = english("A 10km mumbai cab ride costs me $10.50")

In [12]:
# One token per line.
print("\n".join(token.text for token in doc3))

A
10
km
mumbai
cab
ride
costs
me
$
10.50


In [13]:
# Sentence with abbreviations that contain periods ("St.", "U.S").
doc4 = english("Let's visit St. Clara in the U.S Valley next year")

In [14]:
# One token per line.
print("\n".join(token.text for token in doc4))

Let
's
visit
St.
Clara
in
the
U.S
Valley
next
year


In [15]:
# Sentence mentioning an organisation, a place and a money amount; also used for the entity listing.
doc5 = english("Microsoft is going to build a Hong Kong factory for $7 million")

In [16]:
# One token per line.
print("\n".join(token.text for token in doc5))

Microsoft
is
going
to
build
a
Hong
Kong
factory
for
$
7
million


In [17]:
# Named entities: each item of doc5.ents may cover several tokens and carries a label.
for ent in doc5.ents:
    print(f"{ent}\t{ent.label_}")

Microsoft	ORG
Hong Kong	GPE
$7 million	MONEY


In [18]:
# Sentence used for both the token listing and the noun-chunk listing.
doc6 = english("Autonomous car's shift the insurance liability towards the manufactures.")

In [19]:
# One token per line.
print("\n".join(token.text for token in doc6))

Autonomous
car
's
shift
the
insurance
liability
towards
the
manufactures
.


In [20]:
# Noun chunks: every item is a phrase rather than a single token.
for chunk in doc6.noun_chunks:
    print(chunk)

Autonomous car's shift
the insurance liability
the manufactures


In [21]:
# Sentence with several inflections of "run", used for the lemmatization listing.
doc7 = english("I am a runner running in a race because I love to run since i ran yesterday.")

In [22]:
# stemming --> not supported by spaCy, because spaCy comes with a limited feature set
# lemmatization --> supported by spaCy
# both convert tokens into their base words
# e.g., running --> run

In [23]:
# Three inflections of the same verb.
words = "ran run running".split()

In [24]:
import nltk

ModuleNotFoundError: No module named 'nltk'

In [None]:
# nltk library supports stemming

In [None]:
from nltk.stem.porter import PorterStemmer

In [None]:
# Porter stemmer instance shared by the stem() calls in the next cells.
pstem = PorterStemmer()

In [None]:
# Base form of the verb.
pstem.stem("run")

In [None]:
# Irregular past tense of the same verb.
pstem.stem("ran")

In [None]:
# "-ing" form of the same verb.
pstem.stem("running")

In [None]:
from nltk.stem.snowball import SnowballStemmer

In [None]:
# Snowball stemmer configured for English, for comparison with the Porter stemmer.
sstem = SnowballStemmer(language="english")

In [None]:
# Base form of the verb.
sstem.stem('run')

In [None]:
# Irregular past tense of the same verb.
sstem.stem('ran')

In [None]:
# "-ing" form of the same verb.
sstem.stem('running')

In [None]:
# An adverb ending in "-ly", as a non-verb example.
sstem.stem('fairly')

In [25]:
# spacy library supports lemmatization

In [29]:
# Token text followed by its lemma, separated by a single space.
for token in doc7:
    print(f"{token.text} {token.lemma_}")

I I
am be
a a
runner runner
running run
in in
a a
race race
because because
I I
love love
to to
run run
since since
i I
ran run
yesterday yesterday
. .


In [30]:
# Sentence with irregular forms ("saw", "mice") to check the lemmatizer.
doc8 = english("I saw eight mice yesterday.")

In [32]:
# Token text followed by its lemma, separated by a single space.
for token in doc8:
    print(f"{token.text} {token.lemma_}")

I I
saw see
eight eight
mice mouse
yesterday yesterday
. .


In [34]:
# This binds a reference to the default stop-word set, not a copy: words added to
# english.Defaults.stop_words later are visible through `stop_words` too.
stop_words = english.Defaults.stop_words

In [38]:
# Dumps the entire set in one go; being a set, the order shown is arbitrary.
# NOTE(review): consider showing only a sorted slice to keep the output short.
print(stop_words)

{'can', 'go', 'towards', 'full', 'unless', 'nobody', 'all', 'over', 'during', 'beside', 'was', 'nothing', 'both', 'last', '‘ll', 'them', 'yourself', 'mostly', 'alone', 'really', 'or', 'enough', 'ourselves', 'yourselves', '’re', '‘ve', 'another', 'whatever', 'not', 're', "n't", 'just', 'using', 'twelve', 'while', '‘m', 'had', 'whether', 'below', 'part', 'anyone', 'once', 'afterwards', 'every', 'elsewhere', 'due', 'to', 'how', 'has', 'however', 'down', 'hereafter', 'have', 'many', 'none', 'amongst', 'sometimes', 'whereafter', 'front', 'did', 'first', 'whence', 'various', 'few', 'very', 'why', 'six', 'hence', 'also', 'either', 'doing', 'there', 'this', 'someone', 'herein', 'when', 'we', 'but', '’m', 'among', 'else', 'four', 'please', 'whereas', 'through', 'further', 'wherein', '‘s', 'everything', 'under', 'whom', 'without', 'get', 'being', 'neither', 'put', '‘d', 'next', 'eight', 'regarding', 'within', 'sometime', 'per', 'ever', 'she', 'from', 'across', 'top', 'toward', 'each', 'perhaps',

In [39]:
# Number of default stop words before anything is added.
len(stop_words)

326

In [44]:
# NOTE(review): unlike doc .. doc8 this is a plain str, not the result of
# english(...) — confirm whether it was meant to be processed by the pipeline.
doc9 = "i celebrated my birthday, lol!"

In [49]:
# Size of the pipeline's vocab object at this point in the session.
len(english.vocab)

816

In [52]:
english.vocab['can'].is_stop   # is 'can' flagged as a stop word?

True

In [55]:
# 'lol' is not flagged as a stop word at this point.
english.vocab['lol'].is_stop

False

In [56]:
# Add 'lol' to the default stop-word set (mutates the set in place).
english.Defaults.stop_words.add('lol')

In [58]:
# Set the flag on the vocab entry directly so that the is_stop lookup reports True.
english.vocab['lol'].is_stop = True

In [59]:
# Confirm that the flag is now set.
english.vocab['lol'].is_stop

True

In [60]:
# One more than before: `stop_words` is the same set object that 'lol' was added to.
len(stop_words)

327