# Chapter 2

### Vocab: Stores data shared across multiple documents

In [7]:
import spacy

In [8]:
from spacy.matcher import Matcher

In [9]:
nlp = spacy.blank("en")  # Create a blank English nlp object; its vocab is used by the cells below

In [10]:
# Register "coffee" in the string store; the value displayed below is the hash assigned to it
nlp.vocab.strings.add("coffee")

3197928453018144401

In [11]:
# String -> hash: indexing the string store with the text gives back its hash value
coffee_hash = nlp.vocab.strings["coffee"]

In [12]:
# Hash -> string: indexing the same store with the hash gives back the original text
coffee_string = nlp.vocab.strings[coffee_hash]

In [13]:
# Look the string up by its raw hash literal (the hash shown for "coffee" above)
string = nlp.vocab.strings[3197928453018144401]  # Raises an error if we haven't seen the string before

In [14]:
doc = nlp("I love coffee")

# Look the hash up once and reuse it for the reverse lookup, instead of
# repeating the 19-digit literal (easy to mistype, and opaque to the reader).
coffee_hash = nlp.vocab.strings["coffee"]
print("hash value:", coffee_hash)
print("string value:", nlp.vocab.strings[coffee_hash])

hash value: 3197928453018144401
string value: coffee


In [15]:
# The processed doc also exposes the vocab and its string store
doc = nlp("I love coffee")
hash_from_doc = doc.vocab.strings["coffee"]
print("hash value:", hash_from_doc)

hash value: 3197928453018144401


### A Lexeme object is an entry in the vocabulary

In [16]:
doc = nlp("I love coffee")
# Indexing the vocab with a string returns that word's vocabulary entry (a Lexeme)
lexeme = nlp.vocab["coffee"]

In [17]:
# Show the lexeme's text, its orth value (the same number as the "coffee" hash above) and is_alpha
print(lexeme.text, lexeme.orth, lexeme.is_alpha)

coffee 3197928453018144401 True


### The Doc Object

In [18]:
import spacy
from spacy.tokens import Doc

# Fresh blank English pipeline; its vocab is what the Doc is built on
nlp = spacy.blank("en")

# Token texts, and for each token whether a space follows it
words, spaces = ["Hello", "world", "!"], [True, False, False]

# Assemble the Doc by hand from the vocab, the words and the spacing flags
doc = Doc(nlp.vocab, words=words, spaces=spaces)

### The Span Object

In [19]:
from spacy.tokens import Doc, Span

# Build the doc by hand from its words and trailing-space flags
words, spaces = ["Hello", "world", "!"], [True, False, False]
doc = Doc(nlp.vocab, words=words, spaces=spaces)

# Unlabelled span from token index 0 up to (not including) index 2
span = Span(doc, 0, 2)

# Same token range, this time carrying the label "GREETING"
span_with_label = Span(doc, 0, 2, label="GREETING")

# Register the labelled span as the doc's entities
doc.ents = [span_with_label]

## Comparing semantic similarity

In [20]:
import spacy

# Load the en_core_web_md pipeline by name
nlp = spacy.load("en_core_web_md")

# Score how similar two whole documents are
doc1, doc2 = nlp("I like fast food"), nlp("I like pizza")
doc_similarity = doc1.similarity(doc2)
print(doc_similarity)

0.8627204117787385


In [21]:
doc = nlp("I like pizza and pasta")

# Pick the tokens at index 2 and 4 and compare them with each other
token1, token2 = doc[2], doc[4]
token_similarity = token1.similarity(token2)
print(token_similarity)

0.7369546


According to the word vectors, the tokens "pizza" and "pasta" are fairly similar and receive a similarity score of about 0.74.

In [22]:
# A whole document can be compared with a single token
doc = nlp("I like pizza")
soap_doc = nlp("soap")
token = soap_doc[0]

print(doc.similarity(token))

0.32531983166759537


In [23]:
# Slice tokens 2..4 out of one text and compare that span with another whole doc
pasta_doc = nlp("I like pizza and pasta")
span = pasta_doc[2:5]
doc = nlp("McDonalds sells burgers")

print(span.similarity(doc))

0.6199092090831612


Similarity depends on the application context
- Useful for many applications: recommendation systems, flagging duplicates
- There's no objective definition of "similarity"
- Depends on the context and what the application needs to do


In [24]:
# Two sentences that differ only in the verb ("like" vs "hate")
doc1, doc2 = nlp("I like cats"), nlp("I hate cats")

opposites_similarity = doc1.similarity(doc2)
print(opposites_similarity)

0.9501447503553421


Even though the two statements express opposite sentiments, spaCy rates them as very similar. This makes sense, since both statements are referring to cats and differ in only one word.

# Rule-Based Matching Examples

In [25]:
from spacy.matcher import Matcher

# The matcher is built on the pipeline's shared vocab
matcher = Matcher(nlp.vocab)

# Each pattern is a list of dicts, one dict per token to match.
# "OP": "+" in the second pattern specifies how often that token should be matched.
patterns = {
    "LOVE_CATS": [{"LEMMA": "love", "POS": "VERB"}, {"LOWER": "cats"}],
    "VERY_HAPPY": [{"TEXT": "very", "OP": "+"}, {"TEXT": "happy"}],
}
# Register the patterns in definition order (dicts preserve insertion order)
for rule_name, pattern in patterns.items():
    matcher.add(rule_name, [pattern])

# Calling the matcher on a doc returns a list of (match_id, start, end) tuples
doc = nlp("I love cats and I'm very very happy")
matches = matcher(doc)

# Adding Statistical Predictions

In [26]:
# Match "golden retriever" case-insensitively, then inspect the pipeline's
# predictions (dependency root, head, POS tag) around each match.
matcher = Matcher(nlp.vocab)
matcher.add("DOG", [[{"LOWER": "golden"}, {"LOWER": "retriever"}]])
doc = nlp("I have a Golden Retriever")

for match_id, start, end in matcher(doc):
    span = doc[start:end]
    print("Matched span:", span.text)
    # Get the span's root token and root head token
    print("Root token:", span.root.text)
    print("Root head token:", span.root.head.text)
    # Get the previous token and its POS tag. A match that begins at index 0
    # has no previous token; doc[-1] would silently wrap to the LAST token,
    # so only report it when one actually exists.
    if start > 0:
        previous_token = doc[start - 1]
        print("Previous token:", previous_token.text, previous_token.pos_)

Matched span: Golden Retriever
Root token: Retriever
Root head token: have
Previous token: a DET


# Phrase Matching

In [27]:
from spacy.matcher import PhraseMatcher

# Here the pattern is a processed Doc rather than a list of token dicts
matcher = PhraseMatcher(nlp.vocab)
pattern = nlp("Golden Retriever")
matcher.add("DOG", [pattern])

doc = nlp("I have a Golden Retriever")
phrase_matches = matcher(doc)

# Report the text of every matched span
for match_id, start, end in phrase_matches:
    span = doc[start:end]
    print("Matched span:", span.text)

Matched span: Golden Retriever
