In [1]:
# Perform standard imports
import spacy

In [2]:
# Load the small English pipeline once; later cells reuse `nlp`
nlp = spacy.load('en_core_web_sm')

In [5]:
# Build a Doc whose tokens are inspected for part-of-speech tags below.
# NOTE(review): `dog"s` looks like a typo for `dog's` — confirm before changing,
# since the recorded outputs were produced with this exact string.
doc = nlp(u'the quick brown fox jumped over the lazy dog"s back.')


In [6]:
# Fine-grained tag of the token at index 4 ("jumped")
print(doc[4].tag_)

VBD


In [8]:
# Coarse-grained part of speech of the token at index 4 ("jumped")
print(doc[4].pos_)

VERB


In [11]:
# Tabulate text, coarse POS, fine-grained tag and a readable gloss for every token.
# spacy.explain() returns None for terms it does not know; formatting None with
# the `:10` spec raises TypeError, so coerce the gloss to str first.
for token in doc:
    print(f"{token.text:10} {token.pos_:10} {token.tag_:10} {str(spacy.explain(token.tag_)):10}")

the        DET        DT         determiner
quick      ADJ        JJ         adjective 
brown      PROPN      NNP        noun, proper singular
fox        PROPN      NNP        noun, proper singular
jumped     VERB       VBD        verb, past tense
over       ADP        IN         conjunction, subordinating or preposition
the        DET        DT         determiner
lazy       ADJ        JJ         adjective 
dog"s      NOUN       NN         noun, singular or mass
back       ADV        RB         adverb    
.          PUNCT      .          punctuation mark, sentence closer


In [30]:
# The same surface form "read" in two different tenses, to compare assigned tags
doc = nlp(u'I read a book on nlp')
doc1 = nlp(u'I have read books on nlp')

In [31]:
# Inspect "read" in the first sentence (index 1).
# str() protects the `:10` format spec, which raises TypeError if
# spacy.explain() returns None for an unknown tag.
token = doc[1]
print(f"{token.text:10} {token.pos_:10} {token.tag_:10} {str(spacy.explain(token.tag_)):10}")

read       VERB       VBD        verb, past tense


In [32]:
# Inspect "read" in the second sentence (index 2, after "have").
# str() protects the `:10` format spec, which raises TypeError if
# spacy.explain() returns None for an unknown tag.
token = doc1[2]
print(f"{token.text:10} {token.pos_:10} {token.tag_:10} {str(spacy.explain(token.tag_)):10}")

read       VERB       VBN        verb, past participle


In [33]:
# Re-create the fox sentence, because `doc` was reassigned in an earlier cell
doc = nlp(u'the quick brown fox jumped over the lazy dog"s back.')

In [34]:
# Count tokens per coarse POS; the resulting dict is keyed by integer attribute ID
POS_counts = doc.count_by(spacy.attrs.POS)

In [35]:
# Display the raw counts: keys are integer IDs, not readable POS names
POS_counts

{90: 2, 84: 2, 96: 2, 100: 1, 85: 1, 92: 1, 86: 1, 97: 1}

In [38]:
# Resolve an integer ID back to its string form through the vocab
doc.vocab[84].text

'ADJ'

In [53]:
# Print each POS ID, its readable name (padded to 10 columns) and its count,
# ordered by ID.
for pos_id, count in sorted(POS_counts.items()):
    pos_name = doc.vocab[pos_id].text
    print(f"{pos_id} {pos_name:10}{count}")

84 ADJ       2
85 ADP       1
86 ADV       1
90 DET       2
92 NOUN      1
96 PROPN     2
97 PUNCT     1
100 VERB      1


Named Entity Recognition 

In [54]:
def show_ents(doc):
    """Print one line per named entity in ``doc``.

    Each line has the form ``<text> - <label> - <explanation>``, where the
    explanation comes from ``spacy.explain`` and is passed through ``str`` so
    that a missing explanation prints as ``None``. When ``doc.ents`` is empty,
    a single ``no entity found`` line is printed instead.
    """
    entities = doc.ents
    if not entities:
        print('no entity found')
        return
    for entity in entities:
        description = str(spacy.explain(entity.label_))
        print(' - '.join((entity.text, entity.label_, description)))

In [55]:
# A sentence with no named entities, to exercise the fallback message
doc = nlp(u'hi! how are you?')

In [56]:
# List the entities found in the current doc
show_ents(doc)

no entity found


In [66]:
# "May" appears both as a modal verb and as a month; "washington" appears in two casings
doc = nlp(u'May i go to Washington DC next May to see the washington monument?')

In [67]:
# List the entities found in the current doc
show_ents(doc)

Washington DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
washington - GPE - Countries, cities, states


In [74]:
# Sentence containing a monetary amount and a product name
doc = nlp(u'Can i have please $500 of Windows stocks?')
show_ents(doc)

500 - MONEY - Monetary values, including unit
Windows - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [75]:
# Sentence used below to demonstrate adding an entity by hand
doc = nlp(u'Tesla to build UK factory for 6 million dollars')
show_ents(doc)

UK - GPE - Countries, cities, states
6 million dollars - MONEY - Monetary values, including unit


In [81]:
from spacy.tokens import Span

In [82]:
# Look up the integer ID the vocab's string store holds for the 'ORG' label
ORG = doc.vocab.strings[u'ORG']

In [83]:
# Display the integer ID obtained for the 'ORG' label
ORG

383

In [84]:
# Span over tokens [0, 1) of the doc (its first token), labelled as ORG
new_entity = Span(doc, 0, 1, label=ORG)

In [85]:
# doc.ents is assigned as a whole, so rebuild it with the new span appended
doc.ents = [*doc.ents, new_entity]

In [86]:
# List the entities again, now including the manually added span
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
UK - GPE - Countries, cities, states
6 million dollars - MONEY - Monetary values, including unit


In [92]:
# Two sentences passed as adjacent string literals. Python joins adjacent
# literals with nothing in between, so the first literal must end with its own
# sentence terminator and a space; otherwise the text reads "...cleanerthis new...".
doc = nlp(u'our company created a vaccum-cleaner. '
          u"this new vaccum cleaner is the best.")

In [93]:
# List the entities found in the current doc
show_ents(doc)

vaccum - ORG - Companies, agencies, institutions, etc.


Sentence Segmentation

In [98]:
# Three sentences separated only by periods (no spaces between them)
doc = nlp(u"This is the first sentance.This is the second sentance.This is the third sentance.")

In [99]:
# doc.sents yields one span per detected sentence
for sent in doc.sents:
    print(sent)

This is the first sentance.
This is the second sentance.
This is the third sentance.


In [101]:
# Materialise the sentences into a list so they can be displayed together
list(doc.sents)

[This is the first sentance.,
 This is the second sentance.,
 This is the third sentance.]

In [104]:
# A quote whose two clauses are separated by a semicolon rather than a period
doc = nlp(u'"management rules; people work" -vaibhav')

In [105]:
# Print the sentences found by the current pipeline
for sent in doc.sents:
    print(sent)

"management rules; people work" -vaibhav


In [108]:
# Custom segmentation rule: start a new sentence after every semicolon
def set_custom_boundary(doc):
    """Mark the token following each ';' as a sentence start and return ``doc``.

    The last token is left out of the scan (``doc[:-1]``), so a trailing ';'
    never indexes past the end of the doc.
    """
    semicolon_positions = [tok.i for tok in doc[:-1] if tok.text == ';']
    for position in semicolon_positions:
        doc[position + 1].is_sent_start = True
    return doc

In [111]:
# Register the custom boundary rule so it runs before the parser.
# The membership check keeps the cell safe to re-run: without it, a second run
# would try to add a component with the same name again.
# NOTE(review): passing the function directly is the spaCy v2 calling style,
# matching the v2 pipe names recorded in this notebook — confirm if upgrading.
if 'set_custom_boundary' not in nlp.pipe_names:
    nlp.add_pipe(set_custom_boundary, before='parser')

nlp.pipe_names

['tagger', 'set_custom_boundary', 'parser', 'ner']

In [110]:
# Re-process the quote and print the sentences found by the current pipeline
doc = nlp(u'"management rules; people work" -vaibhav')
for sent in doc.sents:
    print(sent)

"management rules;
people work" -vaibhav
