# spaCy Intro

In [130]:
import spacy

nlp = spacy.load("en_core_web_md")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text, token.pos_, token.dep_)

Apple PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj


In [131]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [132]:
tokens = nlp("dog cat banana afskfsd")

for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

dog True 7.0336733 False
cat True 6.6808186 False
banana True 6.700014 False
afskfsd False 0.0 True


## Similarity

In [133]:
tokens = nlp("dog cat banana")

for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))

dog dog 1.0
dog cat 0.80168545
dog banana 0.24327648
cat dog 0.80168545
cat cat 1.0
cat banana 0.28154367
banana dog 0.24327648
banana cat 0.28154367
banana banana 1.0


In [134]:
nyg_article = '''New York Giants defensive lineman Leonard Williams finished the best season of his NFL career by being named NFC Defensive Player of the Week for his dominant effort Sunday against the Dallas Cowboys.

Williams had three sacks, five additional pressures, three tackles for loss and six total stops against Dallas, along with a pass defensed in the Giants’ 23-19 victory.

Williams also won the Defensive Player of the Week honor after a 2.5-sack game against the Seattle Seahawks.

The six-year veteran finished with 11.5 sacks, the first double-digit sack season of his career. Williams’ total is the most by a Giant since Jason Pierre-Paul had 14.5 in 2014.

The Giants finished the season with 40 sacks, their highest total since they had 47 in 2014.

I also like the Dallas Stars.

Williams is heading to free agency after playing the 2020 season on the franchise tag. Best guess is he will be looking for a contract that will put him in the top 10 among defensive linemen, which means an average annual value of at least $17.5 million.'''

In [153]:
nyg_tokens = nlp(nyg_article)

for ent in nyg_tokens.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Dallas Cowboys 185 199 ORG
three 215 220 CARDINAL
five 228 232 CARDINAL
three 255 260 CARDINAL
six 282 285 CARDINAL
Dallas 306 312 GPE
Giants 348 354 ORG
23 356 358 CARDINAL
Seattle Seahawks 463 479 ORG
six-year 486 494 DATE
11.5 517 521 CARDINAL
first 533 538 ORDINAL
Jason Pierre-Paul 624 641 PERSON
14.5 646 650 CARDINAL
2014 654 658 DATE
Giants 665 671 ORG
40 697 699 CARDINAL
47 742 744 CARDINAL
2014 748 752 DATE
Dallas Stars 771 783 ORG
10 950 952 CARDINAL
at least $17.5 million 1017 1039 MONEY


In [94]:
nyg_tokens.ents[0]

New York Giants

In [107]:
train_data = [
("New York Giants defensive lineman Leonard Williams finished the best season of his NFL career by being named NFC Defensive Player of the Week for his dominant effort Sunday against the Dallas Cowboys.", {"entities": [(185,199, "ORG")]}),
("The Dallas Cowboys did not make the playoffs", {"entities": [(4,18, "ORG")]}),
("When will the Dallas Cowboys learn", {"entities": [(14,28, "ORG")]})
]

In [100]:
unlabeled_text = ["New York Giants defensive lineman Leonard Williams finished the best season of his NFL career by being named NFC Defensive Player of the Week for his dominant effort Sunday against the Dallas Cowboys.",
                  "The Dallas Cowboys did not make the playoffs",
                  "When will the Dallas Cowboys learn"]    




In [143]:
sentences = ["The Dallas Cowboys defensive lineman Leonard Williams finished the best season of his NFL career by being named NFC Defensive Player of the Week for his dominant effort Sunday against the Dallas Cowboys."]
ent = "Dallas Cowboys"
label = "Org"

matches = [(sentence, 
            {"entities":
                       [(match.start(), match.end(), label) 
                        for match in re.finditer(ent, sentence)]}) 
           for sentence in sentences]
matches

[('The Dallas Cowboys defensive lineman Leonard Williams finished the best season of his NFL career by being named NFC Defensive Player of the Week for his dominant effort Sunday against the Dallas Cowboys.',
  {'entities': [(4, 18, 'Org'), (188, 202, 'Org')]})]

Create a function that finds that start and stop position of a regular expression to create data labels.

In [144]:
def label_sentences(sentences, ent, label):
        
    labeled_sentences = [(sentence, 
                          {"entities":
                           [(match.start(), match.end(), label) 
                            for match in re.finditer(ent, sentence)]}) 
                         for sentence in sentences]
    return labeled_sentences
        
    

label_sentences(sentences, ent, label)


[('The Dallas Cowboys defensive lineman Leonard Williams finished the best season of his NFL career by being named NFC Defensive Player of the Week for his dominant effort Sunday against the Dallas Cowboys.',
  {'entities': [(4, 18, 'Org'), (188, 202, 'Org')]})]

In [146]:
test_labeled_data = label_sentences(unlabeled_text, ent, "ORG")
test_labeled_data

[('New York Giants defensive lineman Leonard Williams finished the best season of his NFL career by being named NFC Defensive Player of the Week for his dominant effort Sunday against the Dallas Cowboys.',
  {'entities': [(185, 199, 'ORG')]}),
 ('The Dallas Cowboys did not make the playoffs',
  {'entities': [(4, 18, 'ORG')]}),
 ('When will the Dallas Cowboys learn', {'entities': [(14, 28, 'ORG')]})]

In [147]:
test_labeled_data == train_data

False

## Update NER Model

In [148]:
ner = nlp.get_pipe('ner')

In [149]:
optimizer = nlp.entity.create_optimizer()

In [150]:
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

In [151]:
import random
from spacy.util import minibatch, compounding
from pathlib import Path

n_iter = 30

with nlp.disable_pipes(*other_pipes):  # only train NER
    for itn in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        batches = minibatch(train_data, 
                            size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch) 
            # Updating the weights
            nlp.update(texts, annotations, sgd=optimizer, 
                       drop=0.35, losses=losses)
            print('Losses', losses)
           

Losses {'ner': 18.006737530231476}
Losses {'ner': 14.538374602794647}
Losses {'ner': 15.33916187286377}
Losses {'ner': 17.33237385749817}
Losses {'ner': 13.108849048614502}
Losses {'ner': 12.376362681388855}
Losses {'ner': 4.871224594360683}
Losses {'ner': 9.965931415557861}
Losses {'ner': 5.838792573660612}
Losses {'ner': 8.780683264136314}
Losses {'ner': 4.55295510738506}
Losses {'ner': 3.2855260060687215}
Losses {'ner': 0.9110950720496476}
Losses {'ner': 0.05860983821821719}
Losses {'ner': 2.3525918616670083}
Losses {'ner': 0.6757902671224656}
Losses {'ner': 1.4689184177861705}
Losses {'ner': 0.30657236148454103}
Losses {'ner': 1.4869060846461934}
Losses {'ner': 0.004045578414945794}
Losses {'ner': 0.10561149495765676}
Losses {'ner': 0.704708021359238}
Losses {'ner': 0.00020388278187510878}
Losses {'ner': 0.044088090775574074}
Losses {'ner': 0.12327467191153119}
Losses {'ner': 0.085254795008332}
Losses {'ner': 0.0020068665102201955}
Losses {'ner': 0.0003760322191510568}
Losses {'ner

In [152]:
test_doc = nlp("The Dallas Cowboys are the worst team in Dallas.  The other Dallas team is the Dallas Stars")
print("Entities", [(ent.text, ent.label_) for ent in test_doc.ents])

Entities [('Dallas Cowboys', 'ORG'), ('Dallas', 'GPE'), ('Dallas', 'GPE'), ('Dallas Stars', 'ORG')]


In [154]:
output_dir = Path('/content/')
output_dir

WindowsPath('/content')

In [162]:
import os
os.getcwd()

'C:\\Users\\nickr\\Documents\\Projects\\NLP\\spacy'

In [187]:
import os 
  
# Directory 
directory = "content\\team_model"
  
# Parent Directory path 
parent_dir = os.getcwd()
parent_dir  
# Path 
output_dir = os.path.join(parent_dir, directory) 
  
# Create the directory 
# 'GeeksForGeeks' in 
# '/home / User / Documents' 
os.makedirs(output_dir) 

In [184]:
os.path.join(parent_dir, "content/team_model") 

'C:\\Users\\nickr\\Documents\\Projects\\NLP\\spacy\\content/team_model'

In [178]:
output_dir
directory = "\\content\\team_model"
os.path.join(parent_dir, directory) 

'C:\\content\\team_model'

In [188]:
# Save the  model to directory
# output_dir = './content/team_model'
nlp.to_disk(output_dir)
print("Saved model to", output_dir)

# Load the saved model and predict
print("Loading from", output_dir)
nlp_updated = spacy.load(output_dir)
doc = nlp_updated(nyg_article)
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Saved model to C:\Users\nickr\Documents\Projects\NLP\spacy\content\team_model
Loading from C:\Users\nickr\Documents\Projects\NLP\spacy\content\team_model
Entities [('Dallas Cowboys', 'ORG'), ('three', 'CARDINAL'), ('five', 'CARDINAL'), ('three', 'CARDINAL'), ('six', 'CARDINAL'), ('Dallas', 'GPE'), ('Giants', 'ORG'), ('23', 'CARDINAL'), ('Seattle Seahawks', 'ORG'), ('six-year', 'DATE'), ('11.5', 'CARDINAL'), ('first', 'ORDINAL'), ('Jason Pierre-Paul', 'PERSON'), ('14.5', 'CARDINAL'), ('2014', 'DATE'), ('Giants', 'ORG'), ('40', 'CARDINAL'), ('47', 'CARDINAL'), ('2014', 'DATE'), ('Dallas Stars', 'ORG'), ('10', 'CARDINAL'), ('at least $17.5 million', 'MONEY')]
