In [None]:
### Basic

In [3]:
import spacy

In [6]:
nlp = spacy.load("en_core_web_sm")

In [None]:
with open ("data/wiki_us.txt", "r") as f:
    text= f.read()
    

In [22]:
#Creating a Doc Container
doc = nlp(text)

In [9]:
print(len(text))

3525


In [10]:
print(len(doc))

652


In [12]:
for token in text[0:10]:
    print(token)

T
h
e
 
U
n
i
t
e
d


In [16]:
for token in doc[:10]:
    print(token)

The
United
States
of
America
(
U.S.A.
or
USA
)


In [18]:
#sentence boundary detection 
for sent in doc.sents:
    print(sent)

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a country primarily located in North America.
It consists of 50 states, a federal district, five major unincorporated territories, 326 Indian reservations, and some minor possessions.[j]
At 3.8 million square miles (9.8 million square kilometers), it is the world's third- or fourth-largest country by total area.[d]
The United States shares significant land borders with Canada to the north and Mexico to the south, as well as limited maritime borders with the Bahamas, Cuba, and Russia.[22] With a population of more than 331 million people, it is the third most populous country in the world.
The national capital is Washington, D.C., and the most populous city is New York.


Paleo-Indians migrated from Siberia to the North American mainland at least 12,000 years ago, and European colonization began in the 16th century.
The United States emerged from the thirteen British colonies es

In [21]:
#Convert generator into list
sen1 = list(doc.sents)[0]
print(sen1)

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a country primarily located in North America.


### Token attributes

In [23]:
for token in doc[:10]:
    print(token)

The
United
States
of
America
(
U.S.A.
or
USA
)


In [28]:
token2 = sen1[2]
print(token2)

States


In [29]:
token2.text

'States'

In [30]:
token2.left_edge

The

In [32]:
token2.right_edge

,

In [34]:
#Entity type
token2.ent_type

384

In [36]:
#iob - entity code, word "States" is inside of larger entity
token2.ent_iob_

'I'

In [37]:
#What the word looks like with no word inflection
token2.lemma_

'States'

In [43]:
#Original form and lemma
print(sen1[12])
sen1[12].lemma_


known


'know'

In [47]:
#Morphology of a word
token2.morph

Number=Sing

In [53]:
#Part of speech
#PROPN - proper noun
token2.pos_

'PROPN'

In [55]:
#Dependency relation
token2.dep_

'nsubj'

In [57]:
#Language of the doc object
token2.lang_

'en'

### Linguistic annotations

In [58]:
text= "Mike enjoys playing football"
doc2 = nlp(text)
print(doc2)

In [59]:
for token in doc2:
    print(token.text, token.pos_, token.dep_)

Mike PROPN nsubj
enjoys VERB ROOT
playing VERB xcomp
football NOUN dobj


In [60]:
from spacy import displacy
displacy.render(doc2, style="dep")

In [None]:
### Named Entity Recognition

In [64]:
#Small model (doesnt contain word vectors), makes some mistakes
for ent in doc.ents:
    print(ent.text, ent.label_)

The United States of America GPE
U.S.A. GPE
USA GPE
the United States GPE
U.S. GPE
US GPE
America GPE
North America LOC
50 CARDINAL
five CARDINAL
326 CARDINAL
Indian NORP
3.8 million square miles QUANTITY
9.8 million square kilometers QUANTITY
fourth ORDINAL
United States GPE
Canada GPE
Mexico GPE
Bahamas GPE
Cuba GPE
more than 331 million CARDINAL
third ORDINAL
Washington GPE
D.C. GPE
New York GPE
Siberia LOC
North American NORP
at least 12,000 years ago DATE
European NORP
the 16th century DATE
The United States GPE
thirteen CARDINAL
British NORP
the East Coast LOC
Great Britain GPE
the American Revolutionary War ORG
1775â€“1783 CARDINAL
the late 18th century DATE
U.S. GPE
North America LOC
Native Americans NORP
1848 DATE
the United States GPE
United States GPE
the second half of the 19th century DATE
the American Civil War ORG
The Spanishâ€“American War and World War EVENT
U.S. GPE
World War II EVENT
the Cold War EVENT
the United States GPE
the Korean War EVENT
the Vietnam War EVENT


In [65]:
#Wikipedia data is usually included in training process
displacy.render(doc, style="ent")

### Word vectors

In [66]:
!python3 -m spacy download en_core_web_md

Collecting en-core-web-md==3.4.0
[0m  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.0/en_core_web_md-3.4.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.4.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [None]:
#Using medium model since it contains word vectors 

In [67]:
sen1[0].vector

array([-4.7417247e-01, -1.2433453e+00, -8.2698214e-01,  5.3927749e-01,
        9.6928227e-01, -1.3288363e+00,  3.0219358e-01,  1.0057645e+00,
       -7.2275376e-01, -5.1134312e-01, -4.3427974e-01,  8.1987673e-01,
       -9.4222236e-01, -7.7374637e-01, -4.1495323e-01,  1.6128494e+00,
        1.0609403e+00, -1.4666574e+00,  1.0022087e+00, -8.1486106e-01,
       -7.1720344e-01,  2.4605086e-01,  1.2093654e-01, -2.0303577e-03,
        9.1325152e-01, -9.6433628e-01,  4.8440075e-01, -8.3424145e-01,
       -1.5584639e-01,  2.3160326e+00, -3.6400270e-01, -2.0954418e-01,
       -4.1756713e-01,  2.2354668e-01,  2.2343770e-01,  1.7631934e+00,
       -7.2438806e-01, -2.4595323e-01,  7.5045541e-02, -5.2581841e-01,
       -8.0298591e-01,  2.1506948e+00, -7.3358738e-01, -3.7619203e-01,
        4.0049046e-01, -1.8522177e+00,  5.2772880e-01,  1.9132565e+00,
        1.0864006e+00, -1.6947467e+00, -1.1881056e+00, -4.3570077e-01,
       -6.2801197e-02, -9.2769229e-01,  5.3763533e-01,  2.8661888e+00,
      

In [4]:
nlp = spacy.load("en_core_web_md")

In [69]:
with open ("data/wiki_us.txt", "r") as f:
    text = f.read();

In [70]:
doc = nlp(text)
sen1 = list(doc.sents)[0]
print(sen1)

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a country primarily located in North America.


In [None]:
#How the word country is similar to other words within our model's word embeddings

In [73]:
import numpy as np
#https://stackoverflow.com/questions/54717449/mapping-word-vector-to-the-most-similar-closest-word-using-spacy
your_word = "dog"

ms = nlp.vocab.vectors.most_similar(
    np.asarray([nlp.vocab.vectors[nlp.vocab.strings[your_word]]]), n=10)
words = [nlp.vocab.strings[w] for w in ms[0][0]]
distances = ms[2]
print(words)

['dogsbody', 'wolfdogs', 'Baeg', 'duppy', 'pet(s', 'postcanine', 'Kebira', 'uppies', 'Toropets', 'moggie']


In [74]:
doc1 = nlp("I like salty fries and hamburgers.")
doc2 = nlp("Fast food tastes very good.")

In [75]:
print(doc1, "<->", doc2, doc1.similarity(doc2))

I like salty fries and hamburgers. <-> Fast food tastes very good. 0.691649353055761


In [76]:
doc3 = nlp("The Empire State Building is in New York")

In [77]:
print(doc1, "<->", doc3, doc1.similarity(doc3))

I like salty fries and hamburgers. <-> The Empire State Building is in New York 0.08360484380921088


In [None]:
https://www.youtube.com/watch?v=dIUTsFT2MeQ&t=3644s

In [21]:
doc4 = nlp("I enjoy apples.")

In [22]:
doc5 = nlp("I enjoy oranges.")

In [23]:
print(doc4, "<->", doc5, doc4.similarity(doc5))

I enjoy apples. <-> I enjoy oranges. 0.977570143948367


### Pipelines

In [24]:
nlp = spacy.blank("en")

In [25]:
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x13ec49c00>

In [30]:
nlp.analyze_pipes()

{'summary': {'sentencizer': {'assigns': ['token.is_sent_start', 'doc.sents'],
   'requires': [],
   'scores': ['sents_f', 'sents_p', 'sents_r'],
   'retokenizes': False}},
 'problems': {'sentencizer': []},
 'attrs': {'token.is_sent_start': {'assigns': ['sentencizer'], 'requires': []},
  'doc.sents': {'assigns': ['sentencizer'], 'requires': []}}}

In [26]:
import requests
from bs4 import BeautifulSoup
s = requests.get("https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt")
soup = BeautifulSoup(s.content).text.replace("-\n", "").replace("\n", " ")
nlp.max_length = 5278439

In [27]:
%%time
doc = nlp(soup)
print (len(list(doc.sents)))

94134
CPU times: user 6.63 s, sys: 52.7 ms, total: 6.69 s
Wall time: 6.69 s


In [31]:
nlp2 = spacy.load("en_core_web_sm")

In [32]:
nlp2.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
  'tagger': [],
  'parser': [],
  'attribute_ruler': [],
  'lemmatizer': [],
  'ner': []},
 'att

### Entity Ruler
Rule based vs machine learning approach

In [5]:
import spacy

In [6]:
nlp = spacy.load("en_core_web_sm")
text = "West Chersterenfieldville was referenced in Mr. Deeds."

In [7]:
doc = nlp(text)

In [8]:
for ent in doc.ents:
    print(ent.text, ent.label_)

West Chersterenfieldville LOC
Deeds PERSON


In [9]:
ruler = nlp.add_pipe("entity_ruler")

In [12]:
patterns = [
    {"label" : "GPE", "pattern" : "West Chersterenfieldville"}
]

In [13]:
ruler.add_patterns(patterns)

In [14]:
doc2 = nlp(text)
for ent in doc2.ents:
    print(ent.text, ent.label_)

West Chersterenfieldville LOC
Deeds PERSON


In [20]:
nlp2 = spacy.load("en_core_web_sm")

In [21]:
ruler.add_patterns(patterns)

In [22]:
for ent in doc.ents:
    print(ent.text, ent.label_)

West Chersterenfieldville LOC
Deeds PERSON


In [23]:
nlp3 = spacy.load("en_core_web_sm")

In [24]:
ruler = nlp3.add_pipe("entity_ruler", before = "ner")

In [26]:
patterns = [
    {"label" : "GPE", "pattern" : "West Chersterenfieldville"},
    {"label" : "FILM", "pattern" : "Mr. Deeds"}
]

In [27]:
ruler.add_patterns(patterns)

In [28]:
for ent in doc.ents:
    print(ent.text, ent.label_)

West Chersterenfieldville LOC
Deeds PERSON


Topoynm resolution

### spaCy Matcher

In [30]:
import spacy
from spacy.matcher import Matcher

In [31]:
nlp = spacy.load("en_core_web_sm")

In [33]:
matcher = Matcher(nlp.vocab)
pattern = [{"LIKE_EMAIL": True}]
matcher.add("EMAIL_ADDRESS", [pattern])

In [39]:
doc = nlp("This is an email address: lukaLB@gmail.com")
matches = matcher(doc)

In [40]:
print(matches)

[(16571425990740197027, 6, 7)]


In [42]:
print(nlp.vocab[matches[0][0]].text)

EMAIL_ADDRESS


In [43]:
with open("data/wiki_mlk.txt", "r") as f:
    text = f.read()

In [44]:
print(text)

Martin Luther King Jr. (born Michael King Jr.; January 15, 1929 – April 4, 1968) was an American Baptist minister and activist, one of the most prominent leaders in the civil rights movement from 1955 until his assassination in 1968. An African American church leader and the son of early civil rights activist and minister Martin Luther King Sr., King advanced civil rights for people of color in the United States through nonviolence and civil disobedience. Inspired by his Christian beliefs and the nonviolent activism of Mahatma Gandhi, he led targeted, nonviolent resistance against Jim Crow laws and other forms of discrimination.

King participated in and led marches for the right to vote, desegregation, labor rights, and other civil rights.[1] He oversaw the 1955 Montgomery bus boycott and later became the first president of the Southern Christian Leadership Conference (SCLC). As president of the SCLC, he led the unsuccessful Albany Movement in Albany, Georgia, and helped organize some

In [45]:
nlp = spacy.load("en_core_web_sm")

In [48]:
matcher = Matcher(nlp.vocab)
pattern = [{"POS" : "PROPN"}]
matcher.add("PROPER_NOUN", [pattern])
doc = nlp(text)
matches = matcher(doc)
print(len(matches))
for match in matches[:10]:
    print(match, doc[match[1]:match[2]])

113
(451313080118390996, 0, 1) Martin
(451313080118390996, 1, 2) Luther
(451313080118390996, 2, 3) King
(451313080118390996, 3, 4) Jr.
(451313080118390996, 6, 7) Michael
(451313080118390996, 7, 8) King
(451313080118390996, 8, 9) Jr.
(451313080118390996, 10, 11) January
(451313080118390996, 15, 16) April
(451313080118390996, 62, 63) Martin


In [50]:
matcher = Matcher(nlp.vocab)
pattern = [{"POS" : "PROPN", "OP" : "+"}]
matcher.add("PROPER_NOUN", [pattern])
doc = nlp(text)
matches = matcher(doc)
print(len(matches))
for match in matches[:10]:
    print(match, doc[match[1]:match[2]])

194
(451313080118390996, 0, 1) Martin
(451313080118390996, 0, 2) Martin Luther
(451313080118390996, 1, 2) Luther
(451313080118390996, 0, 3) Martin Luther King
(451313080118390996, 1, 3) Luther King
(451313080118390996, 2, 3) King
(451313080118390996, 0, 4) Martin Luther King Jr.
(451313080118390996, 1, 4) Luther King Jr.
(451313080118390996, 2, 4) King Jr.
(451313080118390996, 3, 4) Jr.


In [53]:
matcher = Matcher(nlp.vocab)
pattern = [{"POS" : "PROPN", "OP" : "+"}]
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")
doc = nlp(text)
matches = matcher(doc)
matches.sort(key = lambda x: x[1]) 
print(len(matches))
for match in matches[:10]:
    print(match, doc[match[1]:match[2]])

66
(451313080118390996, 0, 4) Martin Luther King Jr.
(451313080118390996, 6, 9) Michael King Jr.
(451313080118390996, 10, 11) January
(451313080118390996, 15, 16) April
(451313080118390996, 62, 67) Martin Luther King Sr.
(451313080118390996, 68, 69) King
(451313080118390996, 78, 80) United States
(451313080118390996, 96, 98) Mahatma Gandhi
(451313080118390996, 106, 108) Jim Crow
(451313080118390996, 116, 117) King


In [54]:
matcher = Matcher(nlp.vocab)
pattern = [{"POS" : "PROPN", "OP" : "+"}, {"POS" : "VERB"}]
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")
doc = nlp(text)
matches = matcher(doc)
matches.sort(key = lambda x: x[1]) 
print(len(matches))
for match in matches[:10]:
    print(match, doc[match[1]:match[2]])

6
(451313080118390996, 68, 70) King advanced
(451313080118390996, 116, 118) King participated
(451313080118390996, 256, 258) SCLC put
(451313080118390996, 310, 315) Director J. Edgar Hoover considered
(451313080118390996, 382, 384) King won
(451313080118390996, 523, 526) United States beginning
