In [1]:
import spacy
from spacy.language import Language
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span, Token
from spacy import displacy

In [2]:
# start from an empty English pipeline to see what "no components" looks like
nlp = spacy.blank('en')
# a blank has no pipeline, just a tokenizer
nlp.pipeline

[]

In [3]:
# rebind nlp to the pretrained 'en_core_web_md' model, replacing the blank one
nlp = spacy.load('en_core_web_md')
# unlike the blank model, this one ships with several pipeline components
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x221d19c0ca0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x221d15436a0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x221d0e72040>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x221d14e5480>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x221d1b5c040>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x221d0df7eb0>)]

In [4]:
# phrase matcher that shares the loaded model's vocab; it starts out with no phrases
p_matcher = PhraseMatcher(nlp.vocab)

In [5]:
# nlp.pipe(texts) turns every string in texts into a Doc using the model nlp.
# It is much faster than [nlp(text) for text in MANY_TEXTS]
caffeine_terms = ['coffee', 'tea', 'cola', 'energy drinks', 'Cola', 'caffeine']
caffeine = nlp.pipe(caffeine_terms)

# register all of the phrase Docs under the single match key 'CAFFEINE'
p_matcher.add('CAFFEINE', caffeine)

## Custom pipelines

In [6]:
# a custom pipeline component that finds caffeinated beverages with
# p_matcher and appends them to the Doc's entity list
@Language.component('caffeinator')
def caffeinator(doc):
    """Append a 'CAFFEINE' entity to `doc.ents` for every phrase match.

    Entities already present on the Doc are kept and the new spans are
    added after them. Returns the same Doc so the pipeline can continue.
    """
    caffeine_spans = []
    # each match is a (match_id, start, end) triple of token offsets
    for _match_id, first, stop in p_matcher(doc):
        caffeine_spans.append(Span(doc, first, stop, 'CAFFEINE'))
    doc.ents = [*doc.ents, *caffeine_spans]
    return doc

nlp.add_pipe('caffeinator')

<function __main__.caffeinator(doc)>

In [7]:
beverage_text = ('I love drinking coffee and tea, but Coca-Cola is too fizzy. '
                 'Energy drinks just make me loopy - too much caffeine!')
# processing this text fails with the caffeinator where it currently sits
# in the pipeline, so catch the exception and show its message instead
try:
    doc = nlp(beverage_text)
    displacy.render(doc, style='ent')
except Exception as err:
    print(err)

[E1010] Unable to set entity information for token 10 which is included in more than one span in entities, blocked, missing or outside.


Because the "Cola" in "Coca-Cola" was already tagged as part of an "ORG" entity, adding an overlapping "CAFFEINE" span for it raises an error.

In [8]:
# take the component out of the pipeline so it can be re-added elsewhere
nlp.remove_pipe('caffeinator')
# add the pipe before the tagger so there aren't any conflicts
# (that also puts it ahead of 'ner', which assigned the clashing ORG entity)
nlp.add_pipe('caffeinator', before='tagger')
# other optional args for add_pipe include "first", "last", and "after"

<function __main__.caffeinator(doc)>

In [9]:
# same text as in the failed attempt; the caffeinator now runs before the tagger
caff = nlp('I love drinking coffee and tea, but Coca-Cola is too fizzy. '
         'Energy drinks just make me loopy - too much caffeine!')
# show the entities found in the text, including the CAFFEINE spans
displacy.render(caff, style='ent')

Note that we still miss "Energy drinks": the PhraseMatcher is case-sensitive by default, and we only added the lowercase phrase "energy drinks" to it.

## set_extension method for custom metadata

In [10]:
# the adjectives that count as emotions
emotions = {'happy', 'sad', 'angry', 'bored', 'silly'}

# give every Token an `._.emotion` attribute that starts out as None.
# force=True means overwrite the attribute if it already exists;
# don't do this unless you're sure you want to!
Token.set_extension('emotion', force=True, default=None)

@Language.component('emotionizer')
def emotionizer(doc):
    """Set `._.emotion` on emotion adjectives and on the nouns they describe.

    The component reads each token's `tag_`, `pos_`, `lemma_` and `head`, so
    the components that fill those in must run earlier in the pipeline.
    The value stored is always the adjective's lemma, i.e. one of the
    entries of `emotions`, regardless of how the adjective was capitalised.

    Args:
        doc: the Doc being processed.

    Returns:
        The same Doc, with `._.emotion` set on the affected tokens.
    """
    # tags starting with 'NN' cover every noun type, proper and non-proper;
    # 'PRP' covers personal pronouns (I, he, she, it)
    nouns = {tok.i: tok for tok in doc
             if tok.tag_.startswith('NN') or tok.tag_ == 'PRP'}
    # we want adjectives to match the entries in emotions even if the case
    # doesn't match. Comparing the lemma to emotions is a good way to do that
    emotion_adjectives = {tok.i: tok for tok in doc
                          if tok.tag_ == 'JJ' and tok.lemma_ in emotions}

    # pass 1: tag each emotion adjective and the next noun that follows it
    for ii, adj in emotion_adjectives.items():
        adj._.emotion = adj.lemma_
        # the next noun after an adjective is probably modified by that
        # adjective, even if there are intervening words, but not if there
        # is a coordinating conjunction or punctuation in between.
        # so in "the angry lazy dog", the dog's emotion is "angry",
        # but in "the man is angry and the dog is bored",
        # the dog's emotion is not "angry"
        for tok in doc[ii + 1:]:
            if tok.tag_.startswith('CC') or tok.pos_ == 'PUNCT':
                break
            if tok.i in nouns:
                tok._.emotion = adj.lemma_
                break

    # pass 2: assume that a noun N with the same syntactic governor as an
    # adjective J is modified by J. That covers constructions like
    # "Mark is sad", which pass 1 cannot reach because the noun comes first
    for noun in nouns.values():
        for adj in emotion_adjectives.values():
            if noun.head.i == adj.head.i:
                # store the lemma (not the surface text) so that "Sad" and
                # "sad" are recorded as the same emotion, as in pass 1
                noun._.emotion = adj.lemma_

    return doc

nlp.add_pipe('emotionizer', last=True)

<function __main__.emotionizer(doc)>

In [11]:
emot = nlp("Angry lazy dog boats bark because I feel sad "
           "and Mom is bored. Rain is falling stupidly.")
# one row per token: its text, the emotion assigned to it, and its head's text
for token in emot:
    print(token.text, token._.emotion, token.head.text)

Angry angry dog
lazy None dog
dog angry boats
boats None bark
bark None bark
because None feel
I sad feel
feel None bark
sad sad feel
and None feel
Mom bored is
is None feel
bored bored is
. None bark
Rain None falling
is None falling
falling None falling
stupidly None falling
. None falling


Above we see that "boats" in "dog boats" is not tagged as "angry", because my emotionizer is imperfect.
Ideally, our emotionizer would also tag a verb with an emotion when the verb is modified by an emotional adverb.

In [12]:
# visualize the dependency parse of the example, i.e. the heads printed above
displacy.render(emot, style='dep')

In [13]:
def get_is_caffeinated(doc):
    """Return True if at least one entity in `doc.ents` is labelled 'CAFFEINE'."""
    for entity in doc.ents:
        if entity.label_ == 'CAFFEINE':
            return True
    return False
# property extensions
# with a getter, doc._.is_caffeinated is computed by calling
# get_is_caffeinated(doc) rather than being a stored value
Doc.set_extension('is_caffeinated', getter=get_is_caffeinated)

In [14]:
# caff mentions coffee, tea and caffeine, so it has CAFFEINE entities
print(caff._.is_caffeinated)
# emot's text contains none of the caffeine phrases
print(emot._.is_caffeinated)

True
False


In [15]:
def get_emotion_count(doc, emotion):
    """Count the tokens in `doc` whose `._.emotion` equals `emotion`."""
    matches = 0
    for token in doc:
        if token._.emotion == emotion:
            matches += 1
    return matches
# method extensions
# registered on both Doc and Span so that a slice of a Doc can be counted too;
# the Doc or Span is what arrives as the function's first argument (`doc`)
Doc.set_extension('emotion_count', method=get_emotion_count)
Span.set_extension('emotion_count', method=get_emotion_count)

In [16]:
# whole Doc: "Angry" and "dog" were both tagged 'angry'
print(emot._.emotion_count('angry'))
# the Span from token 5 ("because") onward comes after both of them
print(emot[5:]._.emotion_count('angry'))

2
0


## Scaling and performance

### nlp.pipe for efficiency and adding metadata

In [17]:
# earlier we saw nlp.pipe at work
# we can do other things with it, like adding metadata to Docs
for attr_name in ('album', 'year'):
    Doc.set_extension(attr_name, default=None)

# each entry pairs a text with the metadata that belongs to it
songs = [
    ('in the land of foo', {'album': 'Foo', 'year': 1000}),
    ('an ancient warrior of bar', {'album': 'Bar', 'year': 2000}),
    ('deadly orks of Baz', {'album': 'Baz', 'year': 3000}),
]

# with as_tuples=True the loop receives each processed Doc together with
# the dict that was paired with its text
for song_doc, meta in nlp.pipe(songs, as_tuples=True):
    song_doc._.year = meta['year']
    song_doc._.album = meta['album']
    print(song_doc._.album)

Foo
Bar
Baz


### speeding processing with nlp.make_doc and nlp.disable

In [18]:
# nlp.make_doc just tokenizes the document, nothing else
toks = nlp.make_doc('foo bar baz')
for token in toks:
    print(token.text)

# no lemmatization because only tokenization was done, so the lemma
# comes back as an empty string
try:
    first_lemma = toks[0].lemma_
    print(repr(first_lemma))
except Exception as err:
    print(err)

foo
bar
baz
''


In [19]:
# one sentence that contains both a caffeine phrase and an emotion adjective
text = "He drinks coffee when he's angry."
# runs all pipes except the caffeinator and the emotionizer
# while inside the `with` block
with nlp.select_pipes(disable=['caffeinator', 'emotionizer']):
    doc = nlp(text)
    
# neither custom component ran on this doc: no CAFFEINE entity, no emotions
print('with caffeinator and emotionizer disabled')
print(doc._.is_caffeinated)
print(doc._.emotion_count('angry'))
# outside the `with` block the full pipeline runs again
print('with caffeinator and emotionizer enabled')
doc = nlp(text)
print(doc._.is_caffeinated)
print([x._.emotion for x in doc])
# enable=[...] is the opposite selection: only the listed pipes run
print('with only caffeinator enabled')
with nlp.select_pipes(enable=['caffeinator']):
    doc = nlp(text)
print(doc._.is_caffeinated)
# the emotionizer did not run, so every token keeps the default None
print([x._.emotion for x in doc])

with caffeinator and emotionizer disabled
False
0
with caffeinator and emotionizer enabled
True
[None, None, None, None, 'angry', None, 'angry', None]
with only caffeinator enabled
True
[None, None, None, None, None, None, None, None]
