# NLP Python Basics

https://www.analyticsvidhya.com/blog/2017/01/ultimate-guide-to-understand-implement-natural-language-processing-codes-in-python/

https://www.analyticsvidhya.com/blog/2017/04/natural-language-processing-made-easy-using-spacy-%E2%80%8Bin-python/


In [1]:
# Load the spaCy pipeline package 'en_core_web_sm' once and bind it to `nlp`;
# later cells call `nlp(text)` to process their example sentences.
import spacy
nlp = spacy.load('en_core_web_sm')

# Tokenization

1. Tokenization
2. Entities
3. Noun Chunks

In [6]:
# First sentence: for every token print two lines -- the text with the
# integer `pos` value, then the text with the string `pos_` tag and the
# `dep_` dependency label.
doc = nlp('Tesla is looking at buying a US start-up for $6 million')
for tok in doc:
    text = tok.text
    print(text, tok.pos)
    print(text, tok.pos_, tok.dep_)

print('\n----')

# Second sentence: `doc` is rebound here, so the cells that follow see the
# "Apple" document. Token texts are printed on one line, each followed by ' | '.
doc = nlp('Apple to build a Hong Kong factory for $6 million')
for tok in doc:
    print(tok.text, end=' | ')


Tesla 96
Tesla PROPN nsubj
is 100
is VERB aux
looking 100
looking VERB ROOT
at 85
at ADP prep
buying 100
buying VERB pcomp
a 90
a DET det
US 96
US PROPN compound
start 92
start NOUN compound
- 97
- PUNCT punct
up 92
up NOUN dobj
for 85
for ADP prep
$ 99
$ SYM quantmod
6 93
6 NUM compound
million 93
million NUM pobj


In [4]:
# Entities
# Works on whatever `doc` currently holds (set by an earlier cell).
found = doc.ents

# Each entity on one line, its label on the next.
for span in found:
    print(span, span.label_, sep='\n')

print('\n----')

# How many entities were recognised.
print(len(found))

print('\n----')

# One summary line per entity: text - label - spaCy's explanation of the label.
for span in found:
    label = span.label_
    print(' - '.join((span.text, label, str(spacy.explain(label)))))


Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 
----
Apple
ORG
Hong Kong
GPE
$6 million
MONEY

----
3

----
Apple - ORG - Companies, agencies, institutions, etc.
Hong Kong - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [5]:
# Noun Chunks
# Process one sentence and print the text of every noun chunk spaCy yields,
# one chunk per line.
doc9 = nlp("Autonomous cars shift insurance liability toward manufacturers.")

noun_chunks = doc9.noun_chunks
for phrase in noun_chunks:
    print(phrase.text)

Autonomous cars
insurance liability
manufacturers


# Stemming

In [8]:
# Import the toolkit and the Porter stemmer class.
# The class is imported by name rather than with `import *`, so it is obvious
# where `PorterStemmer` comes from and no unrelated names are pulled into the
# notebook namespace.
import nltk
from nltk.stem.porter import PorterStemmer

p_stemmer = PorterStemmer()

# Print each sample word next to the stem the Porter stemmer returns for it.
words = ['run', 'runner', 'running', 'ran', 'runs', 'easily', 'fairly']
for word in words:
    print(f'{word} --> {p_stemmer.stem(word)}')

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fairli


In [10]:
from nltk.stem.snowball import SnowballStemmer

# The Snowball Stemmer requires that you pass a language parameter
s_stemmer = SnowballStemmer(language='english')

# Print each sample word next to the stem the Snowball stemmer returns for it.
# (A first `words` list that was immediately overwritten has been dropped;
# it never reached the loop.)
words = ['generous', 'generation', 'generously', 'generate']
for word in words:
    print(f'{word} --> {s_stemmer.stem(word)}')

generous --> generous
generation --> generat
generously --> generous
generate --> generat


# Lemmatization

In [13]:
# Process one sentence and print, per token, its text, its `pos_` tag and its
# `lemma_`, separated by tab characters (with single spaces between fields).
doc1 = nlp("I am a runner running in a race because I love to run since I ran today")

for tok in doc1:
    fields = (tok.text, '\t', '\t', tok.pos_, '\t', tok.lemma_)
    print(' '.join(fields))

I 	 	 PRON 	 -PRON-
am 	 	 VERB 	 be
a 	 	 DET 	 a
runner 	 	 NOUN 	 runner
running 	 	 VERB 	 run
in 	 	 ADP 	 in
a 	 	 DET 	 a
race 	 	 NOUN 	 race
because 	 	 ADP 	 because
I 	 	 PRON 	 -PRON-
love 	 	 VERB 	 love
to 	 	 PART 	 to
run 	 	 VERB 	 run
since 	 	 ADP 	 since
I 	 	 PRON 	 -PRON-
ran 	 	 VERB 	 run
today 	 	 NOUN 	 today


# Stop Words

In [14]:
# Print the set of spaCy's default stop words (remember that sets are unordered,
# so the order of the words in the output carries no meaning):
print(nlp.Defaults.stop_words)

{'none', 'put', 'hereby', 'various', 'go', 'was', 'does', 'nowhere', 'as', 'behind', 'else', 'when', 'us', 'any', 'who', 'upon', 'neither', 'whereas', 'on', 'during', '’s', 'their', 'he', 'until', 'that', 'used', 'its', 'n‘t', 'though', 'least', 'not', 'three', 'together', 'whereafter', 'well', 'about', 'ca', 'me', 'into', '’re', 'amount', 'even', '’d', 'fifty', 'my', '‘d', 'show', 'throughout', 'yet', 'where', 'five', 'still', 'your', 'whereby', 'whether', 'being', 'whereupon', 'if', 'nothing', 'really', 'last', 'those', 'under', 'enough', 'thence', 'always', 'off', 'becoming', 'indeed', 'just', 'whence', 'had', 'elsewhere', 'own', 'towards', 'may', 'whole', 'somewhere', 'move', 'wherein', 'nine', 'across', 'himself', 'top', 'otherwise', 'these', 'someone', 'ourselves', 'nor', 'each', 'am', 'might', 'two', 'moreover', 'such', 'of', '’m', 'everyone', 'with', 'herein', 'mine', 'whose', 'below', 'call', 'this', 'or', 'him', 'has', 'eight', 'somehow', 'would', 'using', 'however', 'once', 

In [15]:
# Number of entries in the default stop-word set (displayed as the cell result).
len(nlp.Defaults.stop_words)

326

In [16]:
# Look up 'myself' in the pipeline vocabulary and show its `is_stop` flag.
nlp.vocab['myself'].is_stop

True

In [17]:
# Look up 'mystery' in the pipeline vocabulary and show its `is_stop` flag.
nlp.vocab['mystery'].is_stop

False