In [1]:
import spacy

In [2]:
# Load the 'en_core_web_sm' model; `nlp` is called on raw text below to build Doc objects.
nlp = spacy.load('en_core_web_sm')

## spaCy Basics

In [3]:
# NOTE(review): "millon" looks like a typo for "million"; it is left unchanged here
# because the recorded output of the next cell was produced from this exact text.
doc = nlp(u"Tesla is looking at buying U.S. startup for $6 millon")

In [4]:
# One row per token: text, integer POS id, POS label, dependency label.
for tok in doc:
    print("{} {} {} {}".format(tok.text, tok.pos, tok.pos_, tok.dep_))

Tesla 96 PROPN nsubj
is 87 AUX aux
looking 100 VERB ROOT
at 85 ADP prep
buying 100 VERB pcomp
U.S. 96 PROPN compound
startup 92 NOUN dobj
for 85 ADP prep
$ 99 SYM nmod
6 93 NUM nummod
millon 92 NOUN pobj


In [5]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7fda0e2bc690>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7fda0e39c600>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7fda0e39c520>)]

In [6]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [7]:
doc2 = nlp("Tesla wasn't looking for startups anymore.")

In [8]:
# Same four columns as before (text, POS id, POS label, dependency label),
# this time for the contraction example.
for tok in doc2:
    columns = (tok.text, tok.pos, tok.pos_, tok.dep_)
    print(" ".join(map(str, columns)))

Tesla 96 PROPN nsubj
was 87 AUX aux
n't 94 PART neg
looking 100 VERB ROOT
for 85 ADP prep
startups 92 NOUN pobj
anymore 86 ADV advmod
. 97 PUNCT punct


In [9]:
# The sample text is assembled from adjacent string literals (implicit concatenation)
# rather than backslash continuations inside a single literal; the result is one line of text.
# The "commmonly" misspelling is corrected; the word count is unchanged, so the
# doc3[16:30] slice taken afterwards still covers the quoted sentence.
doc3 = nlp(
    'Although commonly attributed to John Lennon from his song "Beautiful Boy", '
    'the phrase "Life is what happens to us while we are making other plans" was written by '
    "cartoonist Allen Saunders and published in Reader's Digest in 1957, when Lennon was 17."
)

In [10]:
# Slicing a Doc by token position yields a Span; tokens 16-29 are the quoted sentence,
# including its opening and closing quotation marks.
life_quote = doc3[16:30]

In [11]:
life_quote

"Life is what happens to us while we are making other plans"

In [12]:
type(life_quote)

spacy.tokens.span.Span

In [13]:
type(doc3)

spacy.tokens.doc.Doc

In [14]:
doc4 = nlp("This is the first sentence. This is another sentence. This is the last sentence")

In [15]:
type(doc4)

spacy.tokens.doc.Doc

In [16]:
# Walk the sentences of doc4 in order, printing each on its own line.
for sent in doc4.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence


In [17]:
# Integer indexing returns a single token; position 6 is the first word of the
# second sentence (positions 0-5 hold "This is the first sentence" plus the period).
doc4[6]

This

## Tokenization

In [18]:
# Sample text whose surrounding double quotes are part of the string itself.
my_string = "\"We're moving to L.A.!\""

In [19]:
print(my_string)

"We're moving to L.A.!"


In [20]:
doc = nlp(my_string)

In [21]:
# Show how the tokenizer split the text: one token per line.
for tok in doc:
    print(tok.text)

"
We
're
moving
to
L.A.
!
"


In [22]:
doc2 = nlp("We're here to help! Send snail-mail, email support@oursite.com or visit us at http://oursite.com")

In [23]:
# One token per line; shows how the hyphenated word, e-mail address and URL are split.
for piece in doc2:
    print(piece.text)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://oursite.com


In [24]:
doc3 = nlp("A 5 km NYC cab ride costs $10.30")

In [25]:
# One token per line for the distance/price example.
for tok in doc3:
    print(tok)

A
5
km
NYC
cab
ride
costs
$
10.30


In [26]:
doc4 = nlp("Let's visit St. Louis in the U.S. next year.")

In [27]:
# One token per line for the abbreviation example ("St.", "U.S.").
for tok in doc4:
    print(tok)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [28]:
len(nlp.vocab)

553

In [29]:
# Short sample sentence used for the indexing and slicing examples that follow.
# Spelling of "receive" corrected (was "reveive"); word positions are unchanged.
doc5 = nlp("It is better to give than receive")

In [30]:
doc5[0]

It

In [31]:
doc5[2:5]

better to give

In [32]:
# Doc objects do not support item assignment. The resulting TypeError is the point
# of this cell, so it is caught and printed rather than left to abort the notebook
# when all cells are run in sequence.
try:
    doc[5] = 'test'
except TypeError as err:
    print("{}: {}".format(type(err).__name__, err))

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment

In [None]:
doc6 = nlp("Apple to build a Hong Kong factory for $6 million")

In [None]:
# Print every token on a single line, each followed by " | ".
for tok in doc6:
    print(tok.text + " | ", end="")

In [None]:
# For each entity found in doc6: the entity, its label, and spaCy's description of that label.
for ent in doc6.ents:
    label = ent.label_
    print("{} {} {}".format(ent, label, spacy.explain(label)))

In [None]:
# Sample sentence for the noun-chunk example.
# "manufactures" corrected to the intended noun "manufacturers".
doc7 = nlp("Autonomous cars shift insurance liability toward manufacturers.")

In [None]:
# One token per line.
for tok in doc7:
    print(tok)

In [None]:
# One noun chunk of doc7 per line.
for noun_chunk in doc7.noun_chunks:
    print(noun_chunk)

In [None]:
from spacy import displacy

In [None]:
doc = nlp("Apple is going to build a U.K. factory for $6 million.")

In [None]:
displacy.render(doc, style='dep', jupyter=True, options={'distance': 110})

In [None]:
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# NOTE(review): displacy.serve starts a standalone web server and does not return until
# it is interrupted, so this cell stalls a sequential run of the notebook; the
# render(..., jupyter=True) calls above already draw inline — confirm this cell is still wanted.
displacy.serve(doc, style='dep')

## Stemming

In [None]:
import nltk

In [None]:
from nltk.stem.porter import PorterStemmer

In [None]:
p_stemmer = PorterStemmer()

In [None]:
# Inputs for the stemmer comparison: several forms of "run" plus two "-ly" adverbs.
words = [
    'run', 'runner', 'ran', 'runs',
    'easily', 'fairly',
]

In [None]:
# Porter stemmer: print "<word> - <stem>" for each sample word.
for w in words:
    print(w, "-", p_stemmer.stem(w))

In [None]:
from nltk.stem.snowball import SnowballStemmer

In [None]:
s_stemmer = SnowballStemmer(language='english')

In [None]:
# Snowball stemmer on the same words, in the same "<word> - <stem>" layout.
for w in words:
    stem = s_stemmer.stem(w)
    print("%s - %s" % (w, stem))

In [None]:
# Second sample: four words that share the "gener" prefix.
words2 = [
    'generous',
    'generation',
    'generously',
    'generate',
]

In [None]:
# Snowball stems for the second word list, as "<word> - <stem>".
for w in words2:
    print(w, "-", s_stemmer.stem(w))

In [None]:
# Porter stems for the second word list, for comparison with the Snowball output.
for w in words2:
    stem = p_stemmer.stem(w)
    print("%s - %s" % (w, stem))

## Lemmatization

In [33]:
doc2 = nlp("I am runner running in a race because I love to run since I ran today.")

In [34]:
# Per token: text, POS label, lemma hash, lemma text, separated by " \t ".
for tok in doc2:
    fields = (tok.text, tok.pos_, str(tok.lemma), tok.lemma_)
    print(" \t ".join(fields))

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 AUX 	 10382539506755952630 	 be
runner 	 PROPN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today
. 	 PUNCT 	 12646065887601541794 	 .


## Stop Words

In [35]:
# Display the default stop-word set attached to the loaded pipeline's language defaults.
# NOTE(review): this shows every entry; len(...) or a short sorted sample would keep the output compact.
nlp.Defaults.stop_words

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron