## spans, docs, vocab

In [1]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span
from spacy import displacy
# Blank English pipeline; the cells below use it for its tokenizer and vocab.
nlp = spacy.blank('en')

In [2]:
# the vocabulary of a spaCy model is stored in a lookup table
# with a two-way mapping between hashes and strings
# Run some text through the pipeline first.
# NOTE(review): presumably this is what registers 'coffee' in the string
# store so that the hash -> string lookup below succeeds -- confirm.
doc = nlp('I love coffee!!!')
# string -> hash
coffee_hash = nlp.vocab.strings['coffee']
# hash -> string; the `=` specifier echoes each expression next to its value
print(f'{coffee_hash = }, {nlp.vocab.strings[coffee_hash] = }')

coffee_hash = 3197928453018144401, nlp.vocab.strings[coffee_hash] = 'coffee'


In [3]:
# Look up the vocab entry (a lexeme) for the word 'coffee'.
lexeme = nlp.vocab['coffee']
# Show the entry's text, its hash (exposed as `orth`), and its is_alpha flag.
print(f'{lexeme.text} {lexeme.orth} {lexeme.is_alpha}')

coffee 3197928453018144401 True


In [4]:
# Each entry pairs a token's text with whether a space follows it.
token_layout = [("Good", True), ("morning", True), ("USA", False), ("!", False)]
words = [text for text, _ in token_layout]
spaces = [has_trailing_space for _, has_trailing_space in token_layout]

# Build the Doc by hand from the token texts and their spacing flags.
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc)

Good morning USA!


In [5]:
# (start token, end token, label) for each entity to mark on the doc.
entity_specs = [(0, 2, 'GREETING'), (2, 3, 'GPE')]
greet_span, usa_span = (
    Span(doc, start, end, label=label) for start, end, label in entity_specs
)

# Attach the spans as the doc's entities and visualise them.
doc.ents = [greet_span, usa_span]
displacy.render(doc, style='ent')

## word vectors

In [6]:
# spacy.blank and en_core_web_sm don't ship with word vectors,
# so load en_core_web_md for the similarity examples that follow.
# Note: this rebinds `nlp`, replacing the blank pipeline created earlier.
nlp = spacy.load('en_core_web_md')

In [7]:
# Three short documents to compare against one another.
doc1 = nlp('I enjoy caffeinated beverages.')
doc2 = nlp('I LOVE COFFEE')
doc3 = nlp('I hate Coca-Cola \u2639')

# Mark two slices of doc3 as entities: tokens 2-4 and token 5.
coke_span = Span(doc3, 2, 5, label='CONSUMABLE')
emoji_span = Span(doc3, 5, 6, label='EMOJI')
doc3.ents = [coke_span, emoji_span]

# Docs and spans both expose .similarity(), so they can be mixed freely.
# The default metric is cosine similarity, but it can be adjusted.
docs = [doc1, doc2, doc3, coke_span, emoji_span]

# Score every unordered pair exactly once.
for idx, left in enumerate(docs):
    for right in docs[idx + 1:]:
        print(f'"{left.text}" <-> "{right.text}": {left.similarity(right)}')

# Compare a plain-text description against the emoji span.
angry_similarity = nlp("angry face").similarity(emoji_span)
print(f'"angry face" <-> "\u2639": {angry_similarity}')

"I enjoy caffeinated beverages." <-> "I LOVE COFFEE": 0.8751647980586529
"I enjoy caffeinated beverages." <-> "I hate Coca-Cola ☹": 0.6316076821778656
"I enjoy caffeinated beverages." <-> "Coca-Cola": 0.4358014247194589
"I enjoy caffeinated beverages." <-> "☹": -0.032447879410338464
"I LOVE COFFEE" <-> "I hate Coca-Cola ☹": 0.7047962756090318
"I LOVE COFFEE" <-> "Coca-Cola": 0.3882343083308216
"I LOVE COFFEE" <-> "☹": -0.005725138796304899
"I hate Coca-Cola ☹" <-> "Coca-Cola": 0.7712480345362694
"I hate Coca-Cola ☹" <-> "☹": 0.31047273144090287
"Coca-Cola" <-> "☹": -0.0868011936545372
"angry face" <-> "☹": -0.050709066160061725


### PhraseMatcher: match sub-documents in a document

In [8]:
# The phrase matcher matches whole phrases, each supplied as a Doc pattern.
# Patterns are built with nlp.make_doc (tokenizer only) instead of nlp(...):
# the matcher compares token text by default, so running the full pipeline
# over every pattern is wasted work.
# 'forboding' (sic) is kept as-is: it has to match the spelling used in the
# text that is searched with this matcher.
grimdark = [nlp.make_doc(f'{adj} {noun}')
            for adj in ['grim', 'menacing', 'depressing', 'forboding']
            for noun in ['darkness', 'shadows', 'obscurity', 'void']
]

# Register all 16 adjective/noun combinations under a single match key.
phrase_matcher = PhraseMatcher(nlp.vocab)
phrase_matcher.add('GRIM_DARK', grimdark)

In [10]:
# Sample passage to scan for the registered phrases.
doc = nlp('''
Chapter 1:
In the grim darkness of the far future there is only war.
Bojor crept through the menacing shadows, contemplating the 
depressing void of outer space.
Out of the forboding obscurity skulked a Tyranid warrior! 
''')

# Each match is a (match_id, start, end) triple; start/end slice into doc.
matches = phrase_matcher(doc)

# Print the text of every matched span; the match id is not needed here.
for _match_id, start, end in matches:
    print(doc[start:end].text)

grim darkness
menacing shadows
depressing void
forboding obscurity
