# 1.0 spaCy Linguistic Annotations

## 1.1 Importing spaCy and Loading Data

In [74]:
import spacy
import numpy as np

In [75]:
nlp = spacy.load("en_core_web_sm")

In [76]:
with open('data/wiki_us.txt', 'r') as f:
    text = f.read()

In [77]:
print(text)

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a country primarily located in North America. It consists of 50 states, a federal district, five major unincorporated territories, 326 Indian reservations, and some minor possessions.[j] At 3.8 million square miles (9.8 million square kilometers), it is the world's third- or fourth-largest country by total area.[d] The United States shares significant land borders with Canada to the north and Mexico to the south, as well as limited maritime borders with the Bahamas, Cuba, and Russia.[22] With a population of more than 331 million people, it is the third most populous country in the world. The national capital is Washington, D.C., and the most populous city is New York.

Paleo-Indians migrated from Siberia to the North American mainland at least 12,000 years ago, and European colonization began in the 16th century. The United States emerged from the thirteen British colonies est

## 1.2 Creating a Doc Container

In [78]:
# create doc object
doc = nlp(text) #call nlp model

In [79]:
print(doc)

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a country primarily located in North America. It consists of 50 states, a federal district, five major unincorporated territories, 326 Indian reservations, and some minor possessions.[j] At 3.8 million square miles (9.8 million square kilometers), it is the world's third- or fourth-largest country by total area.[d] The United States shares significant land borders with Canada to the north and Mexico to the south, as well as limited maritime borders with the Bahamas, Cuba, and Russia.[22] With a population of more than 331 million people, it is the third most populous country in the world. The national capital is Washington, D.C., and the most populous city is New York.

Paleo-Indians migrated from Siberia to the North American mainland at least 12,000 years ago, and European colonization began in the 16th century. The United States emerged from the thirteen British colonies est

In [80]:
print(len(text))
print(len(doc))

3525
652


In [81]:
# text is calculated by characters

for token in text[0:10]:
    print(token)

T
h
e
 
U
n
i
t
e
d


In [82]:
# doc is calculated by words and punctuations
for token in doc[0:10]:
    print(token)

The
United
States
of
America
(
U.S.A.
or
USA
)


In [83]:
# the reason why using spacy instead of split()
for token in text.split()[:10]:
    print(token)

The
United
States
of
America
(U.S.A.
or
USA),
commonly
known


## 1.3 Sentence Boundary Detection (SDB)

In [84]:
# split each sentences from the paragraph
for sent in doc.sents:
    print(sent)

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a country primarily located in North America.
It consists of 50 states, a federal district, five major unincorporated territories, 326 Indian reservations, and some minor possessions.[j]
At 3.8 million square miles (9.8 million square kilometers), it is the world's third- or fourth-largest country by total area.[d]
The United States shares significant land borders with Canada to the north and Mexico to the south, as well as limited maritime borders with the Bahamas, Cuba, and Russia.[22] With a population of more than 331 million people, it is the third most populous country in the world.
The national capital is Washington, D.C., and the most populous city is New York.


Paleo-Indians migrated from Siberia to the North American mainland at least 12,000 years ago, and European colonization began in the 16th century.
The United States emerged from the thirteen British colonies es

In [85]:
sentence1 = list(doc.sents)[0]
print(sentence1)

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a country primarily located in North America.


## 1.4 Token Attributes

In [86]:
for token in doc[:10]:
    print(token)

The
United
States
of
America
(
U.S.A.
or
USA
)


In [87]:
token2 = sentence1[2]
print(token2)

States


### 1.4.1 Text

In [88]:
token2.text

'States'

### 1.4.2 Head

In [89]:
token2.head

is

### 1.4.3 Left Edge

In [90]:
token2.left_edge

The

### 1.4.4 Right Edge

In [91]:
token2.right_edge

,

### 1.4.5 Entity Type 

In [92]:
token2.ent_type

384

In [93]:
token2.ent_type_ # return an integer the corresponds to an entity type (give string equivalent) 

'GPE'

### 1.4.6 Ent IOB

In [94]:
# IOB code:
# B = token begins an entity
# I = inside an entity
# O = outside an entity
token2.ent_iob_

'I'

### 1.4.7 Lemma 

In [95]:
# lemma = base form of the token, with no inflection suffixes
token2.lemma_

'States'

In [96]:
sentence1[12].lemma_

'know'

In [97]:
print(sentence1[12])

known


### 1.4.8 Morph 

In [98]:
token2.morph

Number=Sing

In [99]:
sentence1[12].morph

Aspect=Perf|Tense=Past|VerbForm=Part

### 1.4.9 Part of Speech 

In [100]:
token2.pos_

'PROPN'

### 1.4.10 Syntactic Dependency 

In [101]:
token2.dep_

'nsubj'

### 1.4.11 Language 

In [102]:
token2.lang_

'en'

## 1.5 Part of Speech Tagging (POS)

In [103]:
text = 'Mike enjoys playing football.'
doc2 = nlp(text)
print(doc2)

Mike enjoys playing football.


In [104]:
for token in doc2:
    print(token.text, token.pos_, token.dep_)

Mike PROPN nsubj
enjoys VERB ROOT
playing VERB xcomp
football NOUN dobj
. PUNCT punct


In [105]:
from spacy import displacy
displacy.render(doc2, style='dep')

## 1.6 Named Entity Recognition (NER)

In [106]:
for ent in doc.ents:
    print(ent.text, ent.label_)

The United States of America GPE
U.S.A. GPE
USA GPE
the United States GPE
U.S. GPE
US GPE
America GPE
North America LOC
50 CARDINAL
five CARDINAL
326 CARDINAL
Indian NORP
3.8 million square miles QUANTITY
9.8 million square kilometers QUANTITY
fourth ORDINAL
United States GPE
Canada GPE
Mexico GPE
Bahamas GPE
Cuba GPE
more than 331 million CARDINAL
third ORDINAL
Washington GPE
D.C. GPE
New York GPE
Siberia LOC
North American NORP
at least 12,000 years ago DATE
European NORP
the 16th century DATE
The United States GPE
thirteen CARDINAL
British NORP
the East Coast LOC
Great Britain GPE
the American Revolutionary War ORG
1775â€“1783 CARDINAL
the late 18th century DATE
U.S. GPE
North America LOC
Native Americans NORP
1848 DATE
the United States GPE
United States GPE
the second half of the 19th century DATE
the American Civil War ORG
The Spanishâ€“American War and World War EVENT
U.S. GPE
World War II EVENT
the Cold War EVENT
the United States GPE
the Korean War EVENT
the Vietnam War EVENT


In [107]:
displacy.render(doc, style='ent')

# 2.0 Word Vectors and spaCy

In [108]:
nlp = spacy.load('en_core_web_md')

In [109]:
with open('data/wiki_us.txt', 'r') as f:
    text = f.read()

In [110]:
doc = nlp(text)
sentence1 = list(doc.sents)[0]
print(sentence1)

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a country primarily located in North America.


In [111]:
your_word = 'country'
ms = nlp.vocab.vectors.most_similar(np.asarray([nlp.vocab.vectors[nlp.vocab.strings[your_word]]]), n=10)
words = [nlp.vocab.strings[w] for w in ms[0][0]]
distances = ms[2]
print(words)

['country—0,467', 'nationâ\x80\x99s', 'countries-', 'continente', 'Carnations', 'pastille', 'бесплатно', 'Argents', 'Tywysogion', 'Teeters']


In [112]:
doc1 = nlp('I like salty fries and humburgers.')
doc2 = nlp('Fast food tastes very good.')

In [113]:
print(doc1, "<->", doc2, doc1.similarity(doc2))

I like salty fries and humburgers. <-> Fast food tastes very good. 0.6786707181458879


In [114]:
doc3 = nlp('The Empire State Building is in New York.')

In [115]:
print(doc1, "<->", doc3, doc1.similarity(doc3))

I like salty fries and humburgers. <-> The Empire State Building is in New York. 0.1840035410956762


In [116]:
doc4 = nlp('I enjoy oranges.')
doc5 = nlp('I enjoy apples.')

In [117]:
print(doc4, "<->", doc5, doc4.similarity(doc5))

I enjoy oranges. <-> I enjoy apples. 0.9775702131220241


In [118]:
doc6 = nlp('I enjoy burgers.')

In [119]:
print(doc4, "<->", doc6, doc4.similarity(doc6))

I enjoy oranges. <-> I enjoy burgers. 0.9628306772893752


In [120]:
french_fries = nlp('salty fries')
burgers = nlp('hamburgers')
print(french_fries, "<->", burgers, french_fries.similarity(burgers))

salty fries <-> hamburgers 0.6938489398584435


# 3.0 spaCy Pipelines

## 3.1 Attribute Ruler
- Dependency Parser
- EntityLinker
- EntityRecognizer
- EntityRuler
- Lemmatizer
- Morpholog
- SentenceRecognizer
- Sentencizer
- SpanCategorizer
- Tagger
- TextCategorizer
- Tok2Vec
- Tokenizer
- TrainablePipe
- Transform

## 3.2 Matchers
- DependencyMatcher
- Matcher
- PhraseMatcher

## 3.3 How a Add Pipeline

In [121]:
nlp = spacy.blank('en')

In [122]:
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x180cd3f9100>

In [123]:
import requests
from bs4 import BeautifulSoup
s = requests.get("https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt")
soup = BeautifulSoup(s.content).text.replace("-\n", "").replace("\n", " ")
nlp.max_length = 5278439

In [124]:
%%time
doc = nlp(soup)
print (len(list(doc.sents)))

94134
Wall time: 9.57 s


In [125]:
nlp2 = spacy.load("en_core_web_sm")
nlp2.max_length = 5278439

In [126]:
%%time
doc = nlp2(soup)
print (len(list(doc.sents)))

99388
Wall time: 6min 43s


In [127]:
nlp.analyze_pipes()

{'summary': {'sentencizer': {'assigns': ['token.is_sent_start', 'doc.sents'],
   'requires': [],
   'scores': ['sents_f', 'sents_p', 'sents_r'],
   'retokenizes': False}},
 'problems': {'sentencizer': []},
 'attrs': {'doc.sents': {'assigns': ['sentencizer'], 'requires': []},
  'token.is_sent_start': {'assigns': ['sentencizer'], 'requires': []}}}

In [128]:
nlp2.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
  'tagger': [],
  'parser': [],
  'attribute_ruler': [],
  'lemmatizer': [],
  'ner': []},
 'att

# 5.0 spaCy's EntityRuler

In [129]:
nlp = spacy.load('en_core_web_sm')
text = 'West Chestertenfieldville was referenced in Mr. Deeds.'

In [130]:
doc = nlp(text)

In [131]:
for ent in doc.ents:
    print(ent.text, ent.label_)

West Chestertenfieldville LOC
Deeds PERSON


In [132]:
ruler = nlp.add_pipe('entity_ruler')

In [133]:
patterns = [
    {"label": "GPE", "pattern": "West Chestertenfieldville"}
]

In [134]:
ruler.add_patterns(patterns)

In [135]:
doc2 = nlp(text)
for ent in doc2.ents:
    print(ent.text, ent.label_)

West Chestertenfieldville LOC
Deeds PERSON


In [136]:
nlp2 = spacy.load('en_core_web_sm')

In [137]:
ruler = nlp2.add_pipe('entity_ruler', before='ner')

In [138]:
ruler.add_patterns(patterns)

In [139]:
doc = nlp2(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

West Chestertenfieldville GPE
Deeds PERSON


In [140]:
nlp2.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'entity_ruler': {'assigns': ['doc.ents', 'token.ent_type', 'token.ent_iob'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ent

In [141]:
nlp3 = spacy.load('en_core_web_sm')

In [142]:
ruler = nlp3.add_pipe('entity_ruler', before='ner')

In [143]:
patterns = [
    {"label": "GPE", "pattern": "West Chestertenfieldville"},
    {"label": "FILM", "pattern": "Mr. Deeds"},
]

In [144]:
ruler.add_patterns(patterns)

In [145]:
doc = nlp3(text)

In [146]:
for ent in doc.ents:
    print(ent.text, ent.label_)

West Chestertenfieldville GPE
Mr. Deeds FILM


# 6.0 spacy Matcher

In [147]:
from spacy.matcher import Matcher

In [148]:
nlp = spacy.load('en_core_web_sm')

In [149]:
matcher = Matcher(nlp.vocab)
pattern = [{'LIKE_EMAIL': True}]
matcher.add('EMAIL_ADDRESS', [pattern])

In [151]:
doc = nlp("This is an email address: lydia08248@yahoo.com")
matches = matcher(doc)

In [152]:
print(matches)

[(16571425990740197027, 6, 7)]


In [153]:
print(nlp.vocab[matches[0][0]].text)

EMAIL_ADDRESS
