In [10]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [6]:
mystring = 'We\'re moving to L.A.!'

# Tokenization

In [7]:
doc = nlp(mystring)

for token in doc:
    print(token.text, end=",")

We,'re,moving,to,L.A.,!,

In [8]:
len(doc)

6

In [12]:
len(doc.vocab)

765

In [13]:
doc2 = nlp(u'It is better to give than to receive.')

# Retrieve the third token:
doc2[2]

better

In [15]:
doc2[3:6]

to give than

# POS-Part of Speech

In [34]:
for token in doc3:
    print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')

Apple      PROPN    NNP    noun, proper singular
to         PART     TO     infinitival "to"
build      VERB     VB     verb, base form
a          DET      DT     determiner
Hong       PROPN    NNP    noun, proper singular
Kong       PROPN    NNP    noun, proper singular
factory    NOUN     NN     noun, singular or mass
for        ADP      IN     conjunction, subordinating or preposition
$          SYM      $      symbol, currency
6          NUM      CD     cardinal number
million    NUM      CD     cardinal number


In [35]:
doc4 = nlp(u"The quick brown fox jumped over the lazy dog's back.")

# Count the frequencies of different coarse-grained POS tags:
POS_counts = doc4.count_by(spacy.attrs.POS)
POS_counts

{90: 2, 84: 3, 92: 3, 100: 1, 85: 1, 94: 1, 97: 1}

In [37]:
doc4.vocab[84].text

'ADJ'

In [38]:
for k,v in sorted(POS_counts.items()):
    print(f'{k}. {doc4.vocab[k].text:{5}}: {v}')

84. ADJ  : 3
85. ADP  : 1
90. DET  : 2
92. NOUN : 3
94. PART : 1
97. PUNCT: 1
100. VERB : 1


In [21]:
from spacy import displacy
displacy.render(doc3, style='dep', jupyter=True, options={'distance': 110})

In [39]:
options = {'distance': 110, 'compact': 'True', 'color': 'yellow', 'bg': '#09a3d5', 'font': 'Times'}

displacy.render(doc3, style='dep', jupyter=True, options=options)

# NER-Named Entity Recognation

In [16]:
doc3 = nlp(u'Apple to build a Hong Kong factory for $6 million')


for ent in doc3.ents:
    print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))

Apple - ORG - Companies, agencies, institutions, etc.
Hong Kong - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [18]:
list(zip(doc3.ents, ent.label_))

[(Apple, 'M'), (Hong Kong, 'O'), ($6 million, 'N')]

In [40]:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))
    else:
        print('No named entities found.')

In [41]:
doc5 = nlp(u'May I go to Washington, DC next May to see the Washington Monument?')

show_ents(doc5)

Washington, DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.


In [43]:
doc6 = nlp(u'Can I please borrow 500 dollars from you to buy some Microsoft stock?')

for ent in doc6.ents:
    print(ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)

500 dollars 4 6 20 31 MONEY
Microsoft 11 12 53 62 ORG


In [44]:
len([ent for ent in doc6.ents if ent.label_=='MONEY'])

1

In [45]:
doc7 = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '
         u'By contrast, Sony sold only 7 thousand Walkman music players.')

displacy.render(doc7, style='ent', jupyter=True)

In [46]:
for sent in doc7.sents:
    displacy.render(nlp(sent.text), style='ent', jupyter=True)

In [47]:
doc8 = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '
         u'By contrast, my kids sold a lot of lemonade.')

In [48]:
for sent in doc8.sents:
    docx = nlp(sent.text)
    if docx.ents:
        displacy.render(docx, style='ent', jupyter=True)
    else:
        print(docx.text)

By contrast, my kids sold a lot of lemonade.


In [49]:
options = {'ents': ['ORG', 'PRODUCT']}

displacy.render(doc7, style='ent', jupyter=True, options=options)

In [50]:
colors = {'ORG': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)', 'PRODUCT': 'radial-gradient(yellow, green)'}

options = {'ents': ['ORG', 'PRODUCT'], 'colors':colors}

displacy.render(doc7, style='ent', jupyter=True, options=options)

# Lemmas

In [23]:
def show_lemmas(text):
    for token in text:
        print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')

In [25]:
doc4 = nlp(u"I am meeting him tomorrow at the meeting.")
show_lemmas(doc4)

I            PRON   4690420944186131903    I
am           AUX    10382539506755952630   be
meeting      VERB   6880656908171229526    meet
him          PRON   1655312771067108281    he
tomorrow     NOUN   3573583789758258062    tomorrow
at           ADP    11667289587015813222   at
the          DET    7425985699627899538    the
meeting      NOUN   14798207169164081740   meeting
.            PUNCT  12646065887601541794   .


# Stop-words

In [26]:
print(nlp.Defaults.stop_words)

{'whole', 'serious', 'whereas', 'those', 'most', 'though', 'own', 'into', 'three', 'neither', 'this', '‘m', 'give', 'few', 'him', 'up', 'ourselves', 'already', 'hereby', 'elsewhere', 'were', 'doing', 'next', 'everyone', 'your', 'enough', 'whereby', 'nevertheless', "'ve", 'sixty', 'her', 'somehow', 'across', 'during', 'could', 'really', 'some', 'she', 'per', 'top', 'nothing', 'empty', 'here', 'when', 'two', 'a', 'eight', 'namely', 'his', 'because', 'below', 'one', 'ca', 'become', 'nor', 'herself', 'always', 'four', 'done', 'yet', 'whoever', 'very', 'forty', 'name', 'others', 'then', 'somewhere', 'whence', 'mostly', 'beside', 'would', 'get', 'noone', 'out', 'been', 'anything', 'ever', 'in', 'onto', 'no', 'how', 'thereby', 'toward', 'n’t', 'there', 'amongst', 'have', 'alone', 'too', '‘ve', 'we', 'much', 'an', 'until', 'front', "n't", 'did', 'please', 'do', 'after', 'himself', 'together', 'all', 'am', 'call', 'sometimes', 'again', 'anyone', 'i', 'anyway', 'whose', 'show', 'last', 'of', 'ma

In [27]:
len(nlp.Defaults.stop_words)

326

In [28]:
nlp.vocab['myself'].is_stop

True

In [29]:
nlp.vocab['mystery'].is_stop

False

In [30]:
# Add the word to the set of stop words. Use lowercase!
nlp.Defaults.stop_words.add('btw')

# Set the stop_word tag on the lexeme
nlp.vocab['btw'].is_stop = True

In [31]:
nlp.vocab['btw'].is_stop

True

In [32]:
# Remove the word from the set of stop words
nlp.Defaults.stop_words.remove('beyond')

# Remove the stop_word tag from the lexeme
nlp.vocab['beyond'].is_stop = False

In [33]:
nlp.vocab['beyond'].is_stop

False