In [20]:
import spacy
from spacy import displacy
print('spaCy Version: %s' % (spacy.__version__))

spaCy Version: 2.0.16


In [21]:
import en_core_web_sm
spacy_nlp = spacy.load('en_core_web_sm')

In [22]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [23]:
print('Number of stop words: %d' % len(spacy_stopwords))
print('First ten stop words: %s' % list(spacy_stopwords)[:10])

Number of stop words: 305
First ten stop words: ['becoming', 'make', 'eleven', 'but', 'no', 'see', 'she', 'anyhow', 'first', 'most']


## Tokenization, Lemmetization

In [24]:
nlp = spacy.load('en')
doc = nlp('Hello     World!')
for token in doc:
     print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}".format(
        token.text,
        token.idx,
        token.lemma_,
        token.is_punct,
        token.is_space,
        token.shape_,
        token.pos_,
        token.tag_
    ))

Hello	0	hello	False	False	Xxxxx	INTJ	UH
    	6	    	False	True	    	SPACE	_SP
World	10	world	False	False	Xxxxx	NOUN	NN
!	15	!	True	False	!	PUNCT	.


In [25]:
docx2 = nlp("good goods run  running runner runny was be were")
for word in docx2:
    print(word.text, word.lemma_, word.pos_)

good good ADJ
goods good NOUN
run run NOUN
    SPACE
running run VERB
runner runner NOUN
runny runny ADJ
was be VERB
be be VERB
were be VERB


In [26]:
docx3 = nlp("walking walks walk walker")
for word in docx2:
    print(word.text, word.lemma_, word.pos_)

good good ADJ
goods good NOUN
run run NOUN
    SPACE
running run VERB
runner runner NOUN
runny runny ADJ
was be VERB
be be VERB
were be VERB


## NER

In [27]:
doc = nlp("Next week I'll be in Madrid.")
for ent in doc.ents:
    print(ent.text, ent.label_)
    
    
# display NER
displacy.render(doc, style='ent', jupyter=True)

Next week DATE
Madrid GPE


## STOP WORDS

In [19]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

print('Number of stop words: %d' % len(spacy_stopwords))
print('First ten stop words: %s' % list(spacy_stopwords)[:10])

Number of stop words: 305
First ten stop words: ['becoming', 'make', 'eleven', 'but', 'no', 'see', 'she', 'anyhow', 'first', 'most']


## Part of speech tagging

In [10]:
doc = nlp("Next week I'll be in Madrid.")
print([(token.text, token.tag_) for token in doc])
 
# [('Next', 'JJ'), ('week', 'NN'), ('I', 'PRP'), ("'ll", 'MD'), ('be', 'VB'), ('in', 'IN'), ('Madrid', 'NNP'), ('.', '.')]

[('Next', 'JJ'), ('week', 'NN'), ('I', 'PRP'), ("'ll", 'MD'), ('be', 'VB'), ('in', 'IN'), ('Madrid', 'NNP'), ('.', '.')]


## Dependency Parser

In [11]:
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')
 
for token in doc:
    print("{0}/{1} <--{2}-- {3}/{4}".format(
        token.text, token.tag_, token.dep_, token.head.text, token.head.tag_))

Wall/NNP <--compound-- Street/NNP
Street/NNP <--compound-- Journal/NNP
Journal/NNP <--nsubj-- published/VBD
just/RB <--advmod-- published/VBD
published/VBD <--ROOT-- published/VBD
an/DT <--det-- piece/NN
interesting/JJ <--amod-- piece/NN
piece/NN <--dobj-- published/VBD
on/IN <--prep-- piece/NN
crypto/JJ <--compound-- currencies/NNS
currencies/NNS <--pobj-- on/IN


In [12]:
#visualize dependencies
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

### Lemmitization - Reux

In [13]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

In [14]:
print(lemmatization(list("good goods run  running runner runny was be were")))

['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']


In [15]:
name = "some name, with   wonders! and i like it..."

In [16]:
doc = nlp(name)
for token in doc:
     print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}".format(
        token.text,
        token.idx,
        token.lemma_,
        token.is_punct,
        token.is_space,
        token.shape_,
        token.pos_,
        token.tag_
    ))

some	0	some	False	False	xxxx	DET	DT
name	5	name	False	False	xxxx	NOUN	NN
,	9	,	True	False	,	PUNCT	,
with	11	with	False	False	xxxx	ADP	IN
  	16	  	False	True	  	SPACE	_SP
wonders	18	wonder	False	False	xxxx	NOUN	NNS
!	25	!	True	False	!	PUNCT	.
and	27	and	False	False	xxx	CCONJ	CC
i	31	i	False	False	x	PRON	PRP
like	33	like	False	False	xxxx	VERB	VBP
it	38	-PRON-	False	False	xx	PRON	PRP
...	40	...	True	False	...	PUNCT	.


In [17]:
tokenize_blacklist = ['PUNCT', 'SPACE']
doc = nlp(name)
print("original text = {}".format(name))
for token in doc:
    if(token.pos_ not in tokenize_blacklist):
         print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}".format(
            token.text,
            token.idx,
            token.lemma_,
            token.is_punct,
            token.is_space,
            token.shape_,
            token.pos_,
            token.tag_
        ))

original text = some name, with   wonders! and i like it...
some	0	some	False	False	xxxx	DET	DT
name	5	name	False	False	xxxx	NOUN	NN
with	11	with	False	False	xxxx	ADP	IN
wonders	18	wonder	False	False	xxxx	NOUN	NNS
and	27	and	False	False	xxx	CCONJ	CC
i	31	i	False	False	x	PRON	PRP
like	33	like	False	False	xxxx	VERB	VBP
it	38	-PRON-	False	False	xx	PRON	PRP


In [18]:
doc = nlp(name)
print(name)

tokens = [token.text.lower() for token in doc if token.pos_ not in tokenize_blacklist]
tokens

some name, with   wonders! and i like it...


['some', 'name', 'with', 'wonders', 'and', 'i', 'like', 'it']