In [2]:
import spacy
nlp = spacy.load('en_core_web_sm') # load the English model 'en_core_web_sm' and bind the loaded pipeline to the name nlp
# 'en_core_web_sm' is the small English model

In [4]:
# PARTS OF SPEECH TAGGING

doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

# One aligned row per token: text, coarse POS, fine-grained tag, tag description.
for token in doc:
    # spacy.explain() returns None for a tag it has no description for, and
    # None cannot be formatted with a width spec (TypeError), so the value is
    # converted with !s before padding -- the same guard show_ents gets from str().
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)!s:{10}}")

# pos_ is the coarse-grained POS tag
# tag_ is the fine-grained POS tag (more detail)

The        DET        DT         determiner
quick      ADJ        JJ         adjective 
brown      ADJ        JJ         adjective 
fox        NOUN       NN         noun, singular or mass
jumped     VERB       VBD        verb, past tense
over       ADP        IN         conjunction, subordinating or preposition
the        DET        DT         determiner
lazy       ADJ        JJ         adjective 
dog        NOUN       NN         noun, singular or mass
's         PART       POS        possessive ending
back       NOUN       NN         noun, singular or mass
.          PUNCT      .          punctuation mark, sentence closer


In [8]:
# Same verb, two contexts: the rows printed below show which tense spaCy
# assigns to 'read' in each sentence.
doc = nlp(u"I read books on NLP.")
doc2 = nlp(u"I read a book on NLP.") # 'read' in the first sentence could arguably be past tense as well

for parsed in (doc, doc2):
    token = parsed[1] # index 1 is the word 'read' in both sentences
    # !s guards against spacy.explain() returning None for an unknown tag,
    # which would otherwise raise TypeError when padded with a width spec.
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)!s:{10}}")

doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

# count_by() returns {attribute_id: frequency}; the keys are the numeric
# identifiers of the POS tags (e.g. 99), not their names (e.g. VERB).
POS_counts = doc.count_by(spacy.attrs.POS)

# Translate each numeric id back to its tag name through the vocab and print
# the counts in ascending id order.
for k, v in sorted(POS_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

# the same process works for the fine-grained tags by using the TAG attribute instead of POS

read       VERB       VBP        verb, non-3rd person singular present
read       VERB       VBD        verb, past tense
83. ADJ   3
84. ADP   1
89. DET   2
91. NOUN  3
93. PART  1
96. PUNCT 1
99. VERB  1


In [11]:
# VISUALIZING PART OF SPEECH

from spacy import displacy

# Sentence whose dependency parse is drawn below.
doc = nlp(u"Apple will build Brandon's factory for the low low price of $56 billion.")

# Display options handed to displaCy for the dependency view.
dep_options = {
    'distance': 110,
    'compact': True,
    'color': 'yellow',
    'bg': '#006400',
    'font': 'Arial',
}

displacy.render(doc, style='dep', jupyter=True, options=dep_options)

In [5]:
# NAMED ENTITY RECOGNITION:

# CLASSIFY WORDS OR TOKENS INTO PRE-DEFINED CATEGORIES/ENTITIES

# WE CAN ADD OUR OWN CUSTOM ENTITIES!

def show_ents(doc):
    """Print one line per named entity in ``doc``.

    Each line has the form ``<text> - <label> - <label description>``, where
    the description comes from ``spacy.explain``. When ``doc`` has no
    entities, the single line ``No entities.`` is printed instead.
    """
    if not doc.ents:
        print("No entities.")
        return

    for entity in doc.ents:
        label = entity.label_
        print(f"{entity.text} - {label} - {spacy.explain(label)}")

# Parse two sentences, then report the entities of each in turn.
doc = nlp(u'How are you?')
doc2 = nlp(u'May I travel to Horace Mann School, to see Brandon Pae?')

for parsed in (doc, doc2):
    show_ents(parsed)

# spaCy has many different entity types (people, products, organizations, countries, etc.);
# an entity label is basically a way to classify the tokens it covers

No entities.
Horace Mann School - ORG - Companies, agencies, institutions, etc.
Brandon Pae - PERSON - People, including fictional


In [7]:
# we can tell spacy what certain words should be classified as:

from spacy.tokens import Span

doc = nlp(u'Tesla')

# Hash value of the 'ORG' label, looked up in the vocab's string store.
ORG = doc.vocab.strings[u'ORG']

# Span(doc, start, stop, label): token indices 0 up to (not including) 1.
new_ent = Span(doc, 0, 1, label=ORG)

# Keep whatever entities the doc already has and append the new one.
doc.ents = [*doc.ents, new_ent]

show_ents(doc) # 'Tesla' is now listed as an organization

Tesla - ORG - Companies, agencies, institutions, etc.


In [10]:
# if we want to add multiple named entity options (e.g. phrases) into spacy:
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

# Adjacent string literals are concatenated as-is, so the first one ends with a
# space; without it the text would read "...vacuum cleaner.This new...".
doc = nlp(u"Our company created a new vacuum cleaner. "
          u"This new vacuum-cleaner is the best.")

matcher = PhraseMatcher(nlp.vocab)

phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
phrase_patterns = [nlp(text) for text in phrase_list] # the matcher takes its patterns as spaCy docs

matcher.add('newproduct', None, *phrase_patterns) # register both spellings under the key 'newproduct'
found_matches = matcher(doc)

PROD = doc.vocab.strings[u'PRODUCT'] # hash value for the product label in spacy

# Elements 1 and 2 of every match are the start and stop token indices of the
# matched phrase; wrap each matched phrase in a Span labelled as a product.
new_ents = [Span(doc, start, end, label=PROD) for _, start, end in found_matches]

doc.ents = list(doc.ents) + new_ents

show_ents(doc)

# now, let's count the entities of one type in a sentence

doc = nlp(u"Originally, I paid $29.95 for this car, but now it's worth $10.")

# how many times was money mentioned in this sentence?
# (kept as the last expression so the notebook displays the count)
len([ent for ent in doc.ents if ent.label_ == "MONEY"])

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


2

In [14]:
# VISUALIZING NAMED ENTITY RECOGNITION:

from spacy import displacy

doc = nlp(u"Over the last quarter, Apple sold nearly 20,000 iPods for a profit of $2.00. Sony only sold 8,000 tablets.")

# Entity view of the whole document at once.
displacy.render(doc, style='ent', jupyter=True)

# Entity view sentence by sentence: the text of each sentence is parsed on its
# own and rendered separately.
for sentence in doc.sents:
    sentence_doc = nlp(sentence.text)
    displacy.render(sentence_doc, style='ent', jupyter=True)

# Highlight only the listed entity labels.
options = dict(ents=['PRODUCT', 'ORG'])
displacy.render(doc, style='ent', jupyter=True, options=options)

# there is also an option to change the colors of the entities

In [18]:
# Sentence Segmentation (breaking text into sentences with our own rules)

doc = nlp("This is the first sentence. This is another sentence. This is my final sentence.")

# Print the sentences spaCy finds with its current pipeline, one per line.
for sentence in doc.sents:
    print(sentence)

doc = nlp("Hello there; my name is brandon pae; i've come to tell you something.")

# add a segmentation rule

def set_custom_boundaries(doc):
    """Mark the token following every semicolon as the start of a sentence.

    The final token is never inspected, so a trailing semicolon has no
    following token to mark. The same ``doc`` object is returned after its
    tokens have been updated in place.
    """
    last_position = len(doc) - 1

    for position in range(last_position):
        if doc[position].text != ';':
            continue
        # the token right after the semicolon opens a new sentence
        doc[position + 1].is_sent_start = True

    return doc

# Register the custom rule in the pipeline ahead of the parser so that it runs
# before parsing (spaCy 2.x add_pipe API). The guard makes the cell safe to
# re-run: adding a component whose name is already in the pipeline raises.
if 'set_custom_boundaries' not in nlp.pipe_names:
    nlp.add_pipe(set_custom_boundaries, before='parser')

# Parse the text again now that the custom rule is part of the pipeline.
doc2 = nlp("Hello there; my name is brandon pae; i've come to tell you something.")

# Print the sentences of the freshly parsed doc2.
for sent in doc2.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is my final sentence.
Hello there;
my name is brandon pae;
i've come to tell you something.


In [19]:
# if we want to completely change the segmentation rules:

from spacy.pipeline import SentenceSegmenter

nlp = spacy.load('en_core_web_sm') # reload the model so nlp is a fresh pipeline again (back to the default segmentation rules)

# parsed here before the newline segmenter is added; the same text is parsed again further down once it is in the pipeline
doc = nlp(u'This is a sentence. This is a new one.\n\nThis is a new sentence.')

# if we want to split just on newlines instead of whitespace or periods

def split_on_newlines(doc):
    """Yield consecutive slices of ``doc``, cutting after each newline token.

    A token whose text starts with a newline stays at the end of the slice it
    closes; the next token begins the following slice. The final slice always
    runs to the end of ``doc``.
    """
    sent_begin = 0
    break_pending = False

    for token in doc:
        if not break_pending:
            # remember the newline; the cut happens on the next token
            break_pending = token.text.startswith('\n')
            continue

        yield doc[sent_begin:token.i]
        sent_begin, break_pending = token.i, False

    yield doc[sent_begin:]

# Wrap the newline strategy in a SentenceSegmenter component and add it to the pipeline.
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)
nlp.add_pipe(sbd)

# Parse the text with the updated pipeline.
doc = nlp(u'This is a sentence. This is a new one.\n\nThis is a new sentence.')

# One printed entry per sentence.
for sentence in doc.sents:
    print(sentence)

This is a sentence. This is a new one.


This is a new sentence.
