In [23]:
import spacy
from spacy import displacy
print('spaCy Version: %s' % (spacy.__version__))

spaCy Version: 2.0.16


In [24]:
import en_core_web_sm
spacy_nlp = spacy.load('en_core_web_sm')

In [25]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [26]:
print('Number of stop words: %d' % len(spacy_stopwords))
print('First ten stop words: %s' % list(spacy_stopwords)[:10])

Number of stop words: 305
First ten stop words: ['throughout', 'when', 'six', 'they', 'least', 'over', 'show', 'my', 'what', 'about']


## Tokenization, Lemmetization

In [27]:
nlp = spacy.load('en')
doc = nlp('Hello     World!')
for token in doc:
     print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}".format(
        token.text,
        token.idx,
        token.lemma_,
        token.is_punct,
        token.is_space,
        token.shape_,
        token.pos_,
        token.tag_
    ))

Hello	0	hello	False	False	Xxxxx	INTJ	UH
    	6	    	False	True	    	SPACE	_SP
World	10	world	False	False	Xxxxx	NOUN	NN
!	15	!	True	False	!	PUNCT	.


In [28]:
docx2 = nlp("good goods run  running runner runny was be were")
for word in docx2:
    print(word.text, word.lemma_, word.pos_)

good good ADJ
goods good NOUN
run run NOUN
    SPACE
running run VERB
runner runner NOUN
runny runny ADJ
was be VERB
be be VERB
were be VERB


In [29]:
docx3 = nlp("walking walks walk walker")
for word in docx2:
    print(word.text, word.lemma_, word.pos_)

good good ADJ
goods good NOUN
run run NOUN
    SPACE
running run VERB
runner runner NOUN
runny runny ADJ
was be VERB
be be VERB
were be VERB


## NER

In [30]:
doc = nlp("Next week I'll be in Madrid.")
for ent in doc.ents:
    print(ent.text, ent.label_)
    
    
# display NER
displacy.render(doc, style='ent', jupyter=True)

Next week DATE
Madrid GPE


## STOP WORDS

In [31]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

print('Number of stop words: %d' % len(spacy_stopwords))
print('First ten stop words: %s' % list(spacy_stopwords)[:10])

Number of stop words: 305
First ten stop words: ['throughout', 'when', 'six', 'they', 'least', 'over', 'show', 'my', 'what', 'about']


## Part of speech tagging

In [32]:
doc = nlp("Next week I'll be in Madrid.")
print([(token.text, token.tag_) for token in doc])
 
# [('Next', 'JJ'), ('week', 'NN'), ('I', 'PRP'), ("'ll", 'MD'), ('be', 'VB'), ('in', 'IN'), ('Madrid', 'NNP'), ('.', '.')]

[('Next', 'JJ'), ('week', 'NN'), ('I', 'PRP'), ("'ll", 'MD'), ('be', 'VB'), ('in', 'IN'), ('Madrid', 'NNP'), ('.', '.')]


## Dependency Parser

In [33]:
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')
 
for token in doc:
    print("{0}/{1} <--{2}-- {3}/{4}".format(
        token.text, token.tag_, token.dep_, token.head.text, token.head.tag_))

Wall/NNP <--compound-- Street/NNP
Street/NNP <--compound-- Journal/NNP
Journal/NNP <--nsubj-- published/VBD
just/RB <--advmod-- published/VBD
published/VBD <--ROOT-- published/VBD
an/DT <--det-- piece/NN
interesting/JJ <--amod-- piece/NN
piece/NN <--dobj-- published/VBD
on/IN <--prep-- piece/NN
crypto/JJ <--compound-- currencies/NNS
currencies/NNS <--pobj-- on/IN


In [34]:
#visualize dependencies
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})