# 01 Introduction to NER (Named Entity Recognition)

In [1]:
"""
Key Terminology:

    Natural Language Processing (NLP)
    
    Named Entity Recognition (NER)
    
    Information Extraction (IE)
    
    Gazetteer (Rules-Based Method)
    
    Linguistic Ambiguity
    
    Domain Adaptation
    
    Generalize



Key Libraries:

    NLP           ==> spaCy & NLTK
    
    Word Vectors  ==> Gensim
"""

# images
from IPython.display import Image
from IPython.core.display import HTML 
Image(url= "https://miro.medium.com/max/602/1*bx85lgIdG9PWdCCnfNpsjQ.png")

# 02 Gazetteer and NER (Rules-Based NER)

In [2]:
import requests

r = requests.get('''http://www.pauladaunt.com/books/Children's/Harry_Potter1-4/J.%20K.%20Rowling%20-%20Harry%20Potter%201%20-%20Sorcerer's%20Stone.txt''')
hp = r.content.decode('utf-8')
print(hp[:500])

Harry Potter and the Sorcerer's Stone


CHAPTER ONE

THE BOY WHO LIVED

Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say
that they were perfectly normal, thank you very much. They were the last
people you'd expect to be involved in anything strange or mysterious,
because they just didn't hold with such nonsense.

Mr. Dursley was the director of a firm called Grunnings, which made
drills. He was a big, beefy man with hardly any neck, although he did
have a very large mustache


https://query.wikidata.org/#SELECT%20%3Fitem%20%3FitemLabel%20%0AWHERE%20%0A%7B%0A%20%20%3Fitem%20wdt%3AP1441%20wd%3AQ8337.%0A%20%20%3Fitem%20wdt%3AP31%20wd%3AQ3658341.%0A%20%20SERVICE%20wikibase%3Alabel%20%7B%20bd%3AserviceParam%20wikibase%3Alanguage%20%22%5BAUTO_LANGUAGE%5D%2Cen%22.%20%7D%0A%7D

In [3]:
import pandas as pd
#fetching data from Wikidata

##catching empty labels
def try2unpack(x):
    try:
        x=x['value']
    except (KeyError, TypeError):
        pass
    
    try:
        if 'http://www.wikidata.org/entity/Q' in x or 'http://www.wikidata.org/entity/P' in x:
            return x.rsplit('/',1)[1]
        else:
            return x
    except TypeError:
        return x

##convert fetched data to dataframe
def json2pandas(data):
    return pd.DataFrame(data['results']['bindings'], columns=data['head']['vars']).applymap(lambda x: try2unpack(x))

##example query    
url = 'https://query.wikidata.org/sparql'


query = """
SELECT ?item ?itemLabel 
WHERE 
{
  ?item wdt:P1441 wd:Q8337.
  ?item wdt:P31 wd:Q3658341.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
""".strip()
r = requests.get(url, params = {'format': 'json', 'query': query})
if (r.status_code == 414 | r.status_code == 431 ):
    r = requests.post(url, params = {'format': 'json', 'query': query})
status=r.status_code
data = r.json()
characters=json2pandas(data)
characters = characters['itemLabel'].values
characters[:10]

array(['Ron Weasley', 'Hermione Granger', 'Lord Voldemort',
       'Severus Snape', 'Rubeus Hagrid', 'Draco Malfoy', 'Ginny Weasley',
       'Luna Lovegood', 'Neville Longbottom', 'Minerva McGonagall'],
      dtype=object)

In [4]:
import re
text = hp.split('\n\n')[4]

print(text)
text = text.replace("\n", " ").strip()
text = re.sub('[^\w ]+','', text)
text

Mr. Dursley was the director of a firm called Grunnings, which made
drills. He was a big, beefy man with hardly any neck, although he did
have a very large mustache. Mrs. Dursley was thin and blonde and had
nearly twice the usual amount of neck, which came in very useful as she
spent so much of her time craning over garden fences, spying on the
neighbors. The Dursleys had a small son called Dudley and in their
opinion there was no finer boy anywhere.


'Mr Dursley was the director of a firm called Grunnings which made drills He was a big beefy man with hardly any neck although he did have a very large mustache Mrs Dursley was thin and blonde and had nearly twice the usual amount of neck which came in very useful as she spent so much of her time craning over garden fences spying on the neighbors The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere'

In [5]:
words = text.split(' ')
print(words)

['Mr', 'Dursley', 'was', 'the', 'director', 'of', 'a', 'firm', 'called', 'Grunnings', 'which', 'made', 'drills', 'He', 'was', 'a', 'big', 'beefy', 'man', 'with', 'hardly', 'any', 'neck', 'although', 'he', 'did', 'have', 'a', 'very', 'large', 'mustache', 'Mrs', 'Dursley', 'was', 'thin', 'and', 'blonde', 'and', 'had', 'nearly', 'twice', 'the', 'usual', 'amount', 'of', 'neck', 'which', 'came', 'in', 'very', 'useful', 'as', 'she', 'spent', 'so', 'much', 'of', 'her', 'time', 'craning', 'over', 'garden', 'fences', 'spying', 'on', 'the', 'neighbors', 'The', 'Dursleys', 'had', 'a', 'small', 'son', 'called', 'Dudley', 'and', 'in', 'their', 'opinion', 'there', 'was', 'no', 'finer', 'boy', 'anywhere']


In [6]:
for word in words:
    if word in characters:
        print(word)

In [7]:
character_names = []
for character in characters:
    names = character.split()
    for name in names:
        character_names.append(name.strip())
character_names[:5]

['Ron', 'Weasley', 'Hermione', 'Granger', 'Lord']

In [8]:
for word in words:
    if word in character_names:
        print(word)

Dursley
the
Dursley
the
the
The
Dudley


In [9]:
stopwords = ['The', 'the']
i = 0
for word in words:
    if word in character_names and word not in stopwords:
        if words[i-1][0].isupper():
            print(f"""Found Character: {words[i-1]} {word}""")
        else:
            print(f"""Found Character: {word}""")
    i+=1
text

Found Character: Mr Dursley
Found Character: Mrs Dursley
Found Character: Dudley


'Mr Dursley was the director of a firm called Grunnings which made drills He was a big beefy man with hardly any neck although he did have a very large mustache Mrs Dursley was thin and blonde and had nearly twice the usual amount of neck which came in very useful as she spent so much of her time craning over garden fences spying on the neighbors The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere'

In [10]:
text_i = 1
for text in hp.split('\n\n')[:30]:
    text = text.replace("\n", " ").strip()
    text = re.sub('[^\w ]+','', text)
    words = text.split(' ')

    i = 0
    for word in words:
        try:
            if word in character_names and word not in stopwords:
                if words[i-1][0].isupper():
                    print(f"""Found Character in ({text_i}): {words[i-1]} {word}""")
                else:
                    print(f"""Found Character in ({text_i}): {word}""")
        except IndexError:
            pass
        i+=1
    text_i += 1

Found Character in (1): Stone Harry
Found Character in (1): Harry Potter
Found Character in (4): Mrs Dursley
Found Character in (5): Mr Dursley
Found Character in (5): Mrs Dursley
Found Character in (5): Dudley
Found Character in (6): Mrs Potter
Found Character in (6): Mrs Dursley
Found Character in (6): Dudley
Found Character in (7): Mrs Dursley
Found Character in (7): Mr Dursley
Found Character in (7): Mrs Dursley
Found Character in (7): Dudley
Found Character in (9): Mr Dursley
Found Character in (9): Mrs Dursley
Found Character in (9): Dudley
Found Character in (9): Dudley
Found Character in (9): Mr Dursley
Found Character in (10): Mr Dursley
Found Character in (10): Mr Dursley
Found Character in (10): Mr Dursley
Found Character in (10): Mr Dursley
Found Character in (11): Mr Dursley
Found Character in (11): Mr Dursley
Found Character in (11): Mr Dursley
Found Character in (11): Mr Dursley
Found Character in (12): Mr Dursley
Found Character in (12): Mr Dursley
Found Character in (1

# 03 Introduction to Machine Learning NER

In [11]:
text = hp.split('\n\n')[4]
text = text.replace("\n", " ").strip()
#text = re.sub('[^\w ]+','', text)
text

'Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere.'

In [12]:
import spacy
# spacy.load("en_core_web_lg") # large
# spacy.load("en_core_web_sm") # small

#install
#!python3 -m spacy download en_core_web_sm

In [13]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

for ent in doc.ents:
    print(ent.text, ent.label_)

Dursley PERSON
Grunnings ORG
Dursley PERSON
Dursleys PERSON
Dudley PERSON


In [14]:
doc = nlp(text.replace('Mr. Dursley', 'Olympe Maxime'))

for ent in doc.ents:
    print(ent.text, ent.label_)

Grunnings ORG
Dursley PERSON
Dursleys PERSON
Dudley PERSON


In [15]:
doc = nlp(text.replace('Mr. Dursley', 'Nagini'))

for ent in doc.ents:
    print(ent.text, ent.label_)

Nagini ORG
Grunnings ORG
Dursley PERSON
Dursleys PERSON
Dudley PERSON


In [16]:
doc = nlp(text + " Today is Tuesday the 18th. Two is larger than 1.")

for ent in doc.ents:
    print(ent.text, ent.label_)

Dursley PERSON
Grunnings ORG
Dursley PERSON
Dursleys PERSON
Dudley PERSON
Today DATE
Two CARDINAL
1 CARDINAL


In [17]:
doc = nlp(text + " Today is November the 18th. Two is larger than 1.")

for ent in doc.ents:
    print(ent.text, ent.label_)

Dursley PERSON
Grunnings ORG
Dursley PERSON
Dursleys PERSON
Dudley PERSON
Two CARDINAL
1 CARDINAL


In [18]:
#what does spacy?
Image(url= "https://miro.medium.com/max/2400/0*K5a1Ws_nsbEjhbYk.png")

# 04 using spaCy's Named Entity Recognition

In [19]:
characters[:10]

array(['Ron Weasley', 'Hermione Granger', 'Lord Voldemort',
       'Severus Snape', 'Rubeus Hagrid', 'Draco Malfoy', 'Ginny Weasley',
       'Luna Lovegood', 'Neville Longbottom', 'Minerva McGonagall'],
      dtype=object)

In [20]:
import spacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
import json

In [21]:
re.sub("""[Tt]he|[Aa]nd""", "", """The the and And Theodor""")

'    odor'

In [22]:
re.sub("""[Tt]he |[Aa]nd""", "", """The the and And Theodor""")

'  Theodor'

In [23]:
new_characters=[]
for char in characters:
    char = re.sub("""[Tt]he|""", "", char)
    char = re.sub("""\s+""", " ", char)
    new_characters.append(char)

new_characters[:10]

['Ron Weasley',
 'Hermione Granger',
 'Lord Voldemort',
 'Severus Snape',
 'Rubeus Hagrid',
 'Draco Malfoy',
 'Ginny Weasley',
 'Luna Lovegood',
 'Neville Longbottom',
 'Minerva McGonagall']

In [24]:
titles = ["Dr.", "Professor", "Mr.","Mrs.", "Ms.", "Miss", "Aunt", "Uncle", "Mr. and Mrs."]

final_characters = []
for char in new_characters:
    final_characters.append(char.strip())
    final_characters.append(char.split()[0].strip())
    final_characters.append(char.split()[-1].strip())
    for title in titles:
        titled_char = f"{title} {char.split()[-1]}"
        if titled_char not in char:
            final_characters.append(titled_char.strip())
            
final_characters[:10]

['Ron Weasley',
 'Ron',
 'Weasley',
 'Dr. Weasley',
 'Professor Weasley',
 'Mr. Weasley',
 'Mrs. Weasley',
 'Ms. Weasley',
 'Miss Weasley',
 'Aunt Weasley']

In [25]:
def creat_training_data(data, data_type):
    patterns = []
    
    for item in data:
        pattern = {
            "label": data_type,
            "pattern": item           
        }
        patterns.append(pattern)
    return patterns
        
patterns = creat_training_data(data=sorted(final_characters), data_type="PERSON")

print(patterns[:10])

[{'label': 'PERSON', 'pattern': 'Abbott'}, {'label': 'PERSON', 'pattern': 'Aberforth'}, {'label': 'PERSON', 'pattern': 'Aberforth Dumbledore'}, {'label': 'PERSON', 'pattern': 'Abraxas'}, {'label': 'PERSON', 'pattern': 'Abraxas Malfoy'}, {'label': 'PERSON', 'pattern': 'Alastor'}, {'label': 'PERSON', 'pattern': 'Alastor Moody'}, {'label': 'PERSON', 'pattern': 'Albert'}, {'label': 'PERSON', 'pattern': 'Albert Runcorn'}, {'label': 'PERSON', 'pattern': 'Albus'}]


In [26]:
def generate_rules(patterns):
    nlp = English()
    ruler = EntityRuler(nlp)
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    nlp.to_disk('hp_ner')
    
generate_rules(patterns)

In [27]:
nlp = spacy.load('hp_ner')

In [28]:
text

'Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere.'

In [29]:
doc = nlp(text)

for ent in doc.ents:
    print(ent.text, ent.label_)

Mr. Dursley PERSON
Mrs. Dursley PERSON
Dudley PERSON


In [30]:
doc = nlp(text.replace('Mr. Dursley', 'Nagini'))

for ent in doc.ents:
    print(ent.text, ent.label_)

Nagini PERSON
Mrs. Dursley PERSON
Dudley PERSON


# 05 Training a spaCy NER model

In [31]:
import spacy
import random

In [32]:
# TRAIN_DATA[(text, {"entities":[(start_entity, end_entity, label)]})]

In [33]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Nagini 0 6 PERSON
Mrs. Dursley 161 173 PERSON
Dudley 389 395 PERSON


In [34]:
text = hp.split('\n\n')[3]
text = text.replace("\n", " ").strip()
#text = re.sub('[^\w ]+','', text)

doc = nlp(text)
doc

Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense.

In [35]:
from tqdm import tqdm

In [36]:
train_data = []

for text in tqdm(hp.split('\n\n')):
    text = text.replace("\n", " ").strip()
    doc = nlp(text)
    
    
    entities = []

    for ent in doc.ents:
        entities.append( (ent.start_char, ent.end_char, ent.label_) )

    if len(entities)>0:
        result = [text, {"entities": entities}]
        train_data.append(result)        

print(train_data[:5])

100%|██████████| 3032/3032 [00:00<00:00, 3037.94it/s]

[["Harry Potter and the Sorcerer's Stone", {'entities': [(0, 12, 'PERSON')]}], ["Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense.", {'entities': [(0, 20, 'PERSON')]}], ['Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere.', {'entities': [(0, 11, 'PERSON'), (166, 178, 'PERSON'), (394, 400, 'PERSON')]}], ["The Dursleys had everything they wanted, but they also had a secret, and thei




In [37]:

def train_spacy(data, iterations):
    train_data = data
    nlp = spacy.blank("en")
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
        
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
            
            
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itn in tqdm(range(iterations)):
            print("Starting iteration " + str(itn))
            random.shuffle(train_data)
            losses = {}
            for text, annotations in train_data:
                nlp.update(
                    [text],
                    [annotations],
                    drop = 0.2,
                    sgd = optimizer,
                    losses = losses
                )
            print(losses)
    return nlp

#nlp= train_spacy(data = train_data, iterations = 30)
#nlp.to_disk("hp_ner_model")

"""
Starting iteration 0
{'ner': 964.6060144836445}
Starting iteration 1
{'ner': 470.0375200238501}
Starting iteration 2
{'ner': 276.42840668701115}
Starting iteration 3


....


Starting iteration 27
{'ner': 56.020043512604765}
Starting iteration 28
{'ner': 52.933951228180256}
Starting iteration 29
{'ner': 40.17626116715598}
"""

  proc.begin_training(
  proc.begin_training(
  0%|          | 0/30 [00:00<?, ?it/s]

Starting iteration 0


  3%|▎         | 1/30 [01:57<56:54, 117.73s/it]

{'ner': 1190.1212724667828}
Starting iteration 1


  7%|▋         | 2/30 [04:08<58:39, 125.69s/it]

{'ner': 257.4625758977645}
Starting iteration 2


 10%|█         | 3/30 [06:19<57:38, 128.10s/it]

{'ner': 179.01278866814081}
Starting iteration 3


 13%|█▎        | 4/30 [08:30<55:57, 129.12s/it]

{'ner': 127.28947739240309}
Starting iteration 4


 17%|█▋        | 5/30 [10:41<54:07, 129.90s/it]

{'ner': 116.12218869039317}
Starting iteration 5


 20%|██        | 6/30 [12:53<52:14, 130.60s/it]

{'ner': 94.42329238422118}
Starting iteration 6


 23%|██▎       | 7/30 [15:05<50:12, 130.98s/it]

{'ner': 107.01615194412345}
Starting iteration 7


 27%|██▋       | 8/30 [17:17<48:08, 131.32s/it]

{'ner': 112.49989755267833}
Starting iteration 8


 30%|███       | 9/30 [19:30<46:05, 131.70s/it]

{'ner': 58.84788216660837}
Starting iteration 9


 33%|███▎      | 10/30 [21:46<44:24, 133.22s/it]

{'ner': 73.65939048508356}
Starting iteration 10


 37%|███▋      | 11/30 [25:00<48:04, 151.84s/it]

{'ner': 103.4645736179386}
Starting iteration 11


 40%|████      | 12/30 [28:09<48:56, 163.16s/it]

{'ner': 62.571016549120245}
Starting iteration 12


 43%|████▎     | 13/30 [31:15<48:07, 169.85s/it]

{'ner': 81.914285995891}
Starting iteration 13


 47%|████▋     | 14/30 [34:15<46:09, 173.07s/it]

{'ner': 72.33918153999785}
Starting iteration 14


 50%|█████     | 15/30 [36:50<41:51, 167.46s/it]

{'ner': 78.40935126072387}
Starting iteration 15


 53%|█████▎    | 16/30 [39:31<38:38, 165.58s/it]

{'ner': 78.64013137795072}
Starting iteration 16


 57%|█████▋    | 17/30 [42:05<35:09, 162.26s/it]

{'ner': 100.8149738604597}
Starting iteration 17


 60%|██████    | 18/30 [44:52<32:41, 163.45s/it]

{'ner': 37.07076607113968}
Starting iteration 18


 63%|██████▎   | 19/30 [48:08<31:45, 173.25s/it]

{'ner': 77.01631055070682}
Starting iteration 19


 67%|██████▋   | 20/30 [51:26<30:06, 180.69s/it]

{'ner': 46.55958342258336}
Starting iteration 20


 70%|███████   | 21/30 [55:33<30:06, 200.68s/it]

{'ner': 94.88697055921342}
Starting iteration 21


 73%|███████▎  | 22/30 [59:04<27:10, 203.76s/it]

{'ner': 57.82613893190236}
Starting iteration 22


 77%|███████▋  | 23/30 [1:03:11<25:17, 216.72s/it]

{'ner': 60.673335081962755}
Starting iteration 23


 80%|████████  | 24/30 [1:06:58<21:58, 219.69s/it]

{'ner': 82.14718610364297}
Starting iteration 24


 83%|████████▎ | 25/30 [1:10:43<18:26, 221.32s/it]

{'ner': 72.62374652254414}
Starting iteration 25


 87%|████████▋ | 26/30 [1:14:10<14:28, 217.16s/it]

{'ner': 56.91785386652102}
Starting iteration 26


 90%|█████████ | 27/30 [1:17:49<10:52, 217.65s/it]

{'ner': 61.37368787159892}
Starting iteration 27


 93%|█████████▎| 28/30 [1:21:27<07:15, 217.78s/it]

{'ner': 43.98806502874159}
Starting iteration 28


 97%|█████████▋| 29/30 [1:25:01<03:36, 216.57s/it]

{'ner': 60.148592117800035}
Starting iteration 29


100%|██████████| 30/30 [1:28:43<00:00, 177.45s/it]

{'ner': 50.196763980995975}





"\nStarting iteration 0\n{'ner': 964.6060144836445}\nStarting iteration 1\n{'ner': 470.0375200238501}\nStarting iteration 2\n{'ner': 276.42840668701115}\nStarting iteration 3\n\n\n....\n\n\nStarting iteration 27\n{'ner': 56.020043512604765}\nStarting iteration 28\n{'ner': 52.933951228180256}\nStarting iteration 29\n{'ner': 40.17626116715598}\n"

In [38]:
nlp = spacy.load('hp_ner')
doc = nlp("""Gollum was the director of a firm called Grunnings, which made drills.""")

for ent in doc.ents:
    print(ent.text, ent.label_)

In [39]:
doc = nlp("""Mr. Dursley was the director of a firm called Grunnings, which made drills.""")

for ent in doc.ents:
    print(ent.text, ent.label_)

Mr. Dursley PERSON


In [40]:
[char for char in final_characters if "Gollum" in final_characters]

[]

In [41]:
"Gollum" in hp

False

In [42]:
nlp = spacy.load("hp_ner_model")
doc = nlp("""Gollum was the director of a firm called Grunnings, which made drills""")

for ent in doc.ents:
    print(ent.text, ent.label_)

Gollum PERSON


In [46]:
doc = nlp("""Max was the director of a firm called Grunnings, which made drills""")

for ent in doc.ents:
    print(ent.text, ent.label_)

In [43]:
doc = nlp("""Mr. Dursley was the director of a firm called Grunnings, which made drills.""")

for ent in doc.ents:
    print(ent.text, ent.label_)

Mr. Dursley PERSON


In [44]:
doc = nlp("""Somerandomname was the director of a firm called Grunnings, which made drills.""")

for ent in doc.ents:
    print(ent.text, ent.label_)

In [45]:
doc = nlp("""
Harry James Potter was an English half-blood wizard, and one of the most famous wizards of modern times. The only child and son of James and Lily Potter (née Evans), Harry's birth was overshadowed by a prophecy, naming either himself or Neville Longbottom as the one with the power to vanquish Lord Voldemort. After half of the prophecy was reported to Voldemort, courtesy of Severus Snape, Harry was chosen as the target due to his many similarities with the Dark Lord. In turn, this caused the Potter family to go into hiding. Voldemort made his first vain attempt to circumvent the prophecy when Harry was a year and three months old. During this attempt, he murdered Harry's parents as they tried to protect him, but this unsuccessful attempt to kill Harry led to Voldemort's first downfall. This downfall marked the end of the First Wizarding War, and to Harry henceforth being known as "The Boy Who Lived", as he was the only known survivor of the Killing Curse.
""".strip() )

for ent in doc.ents:
    print(ent.text, ent.label_)

Harry James PERSON
Potter PERSON
James PERSON
Lily Potter PERSON
Harry PERSON
Neville Longbottom PERSON
Lord Voldemort PERSON
Voldemort PERSON
Severus Snape PERSON
Harry PERSON
Lord PERSON
Potter PERSON
Voldemort PERSON
Harry PERSON
Harry PERSON
Harry PERSON
Voldemort PERSON
First PERSON
Harry PERSON


# 06 Introduction to Word Vectors

In [9]:
#!python3 -m spacy download en_core_web_lg

In [4]:
import numpy as np
import spacy

nlp = spacy.load("en_core_web_lg")
def spacy_similarity(word):
    ms = nlp.vocab.vectors.most_similar(
        np.asarray([nlp.vocab.vectors[nlp.vocab.strings[word]]]), n=10)
    words = [nlp.vocab.strings[w] for w in ms[0][0]]
    distances = ms[2]
    print(words)
    
spacy_similarity("Harry")

['HARRY', 'Harry', 'harry', 'POTTER', 'Potter', 'potter', 'Hermione', 'HERMIONE', 'hermione', 'Hallows']


In [6]:
spacy_similarity("Gollum")

['gollum', 'Gollum', 'GOLLUM', 'Gandalf', 'GANDALF', 'gandalf', 'frodo', 'FRODO', 'Frodo', 'smeagol']


# 07 Generating Custom Word Vectors in Gensim

In [1]:
import json, re
from gensim.models.word2vec import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
import multiprocessing

In [2]:
import requests

r = requests.get('''http://www.pauladaunt.com/books/Children's/Harry_Potter1-4/J.%20K.%20Rowling%20-%20Harry%20Potter%201%20-%20Sorcerer's%20Stone.txt''')
hp = r.content.decode('utf-8')
hp = hp.replace('\n\n','. ').replace('..','.').replace('\n',' ')
hp = re.sub('\s+',' ', hp)

print(hp[:500])

Harry Potter and the Sorcerer's Stone. CHAPTER ONE. THE BOY WHO LIVED. Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense. Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. 


In [3]:
from spacy.lang.en import English 

def cleaning(text):
    text = str(text)
    text = re.sub("[^\w ]","", text)
    return text

def tokenize(text):
    text = text.split()
    text = [word for word in text if word not in nlp.Defaults.stop_words]
    return text

nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer')) 
doc = nlp(hp)
hp = [tokenize(cleaning(sent)) for sent in doc.sents]
hp[:5]

[['Harry', 'Potter', 'Sorcerers', 'Stone'],
 ['CHAPTER', 'ONE'],
 ['THE', 'BOY', 'WHO', 'LIVED'],
 ['Mr',
  'Mrs',
  'Dursley',
  'number',
  'Privet',
  'Drive',
  'proud',
  'perfectly',
  'normal',
  'thank'],
 ['They',
  'people',
  'youd',
  'expect',
  'involved',
  'strange',
  'mysterious',
  'didnt',
  'hold',
  'nonsense']]

In [4]:
sentences = hp
def training(model_name):
    sentences = hp
    cores = multiprocessing.cpu_count()
    w2v_model = Word2Vec(
        min_count = 5, # min freq of word
        window = 2, # surrounding words of w2v
        vector_size = 500 , # dimensionality of a token
        sample = 6e-5,
        alpha = 0.03,
        min_alpha = 0.0007,
        negative = 20,
        workers = cores-1
    )
      
    w2v_model.build_vocab(sentences)
    w2v_model.train(sentences, total_examples = w2v_model.corpus_count, epochs=30)
    w2v_model.save(f"""word_vectors/{model_name}.model""")
    w2v_model.wv.save_word2vec_format(f"""word_vectors/word2vec_{model_name}.txt""")
    
training("hp_ner_model_01")    

In [5]:
def gen_similarity(word):
    model = KeyedVectors.load_word2vec_format("word_vectors/word2vec_hp_ner_model_01.txt", binary=False)
    results = model.most_similar(positive=[word])
    print(results)

In [6]:
gen_similarity("Harry")

[('shouted', 0.999816358089447), ('noticed', 0.9998037219047546), ('sir', 0.9997997283935547), ('For', 0.9997996091842651), ('knocked', 0.9997986555099487), ('train', 0.9997981190681458), ('Griphook', 0.9997978210449219), ('Even', 0.9997971653938293), ('kicked', 0.9997969269752502), ('sharply', 0.9997965693473816)]


In [7]:
gen_similarity("Gryffindor")

[('points', 0.9997794032096863), ('If', 0.9997761845588684), ('Slytherin', 0.9997742176055908), ('taken', 0.9997738003730774), ('Theres', 0.9997723698616028), ('So', 0.9997720122337341), ('Hell', 0.9997705221176147), ('Its', 0.9997681975364685), ('Gryffindors', 0.9997647404670715), ('Five', 0.9997645616531372)]


# 08 Importing Custom Word Vectors from Gensim into spaCy

In [9]:
import spacy
import subprocess
import sys

word_vectors = "word_vectors/word3vechp_ner_model_01.txt"
model_name = "hp_model_test"

def load_word_vectors(model_name, word_vectors):
    subprocess.run([sys.executable,
                   "-m",
                    "spacy",
                   "init-model",
                   "en",
                   model_name,
                   "--vectors-loc",
                   word_vectors]
    )
load_word_vectors(model_name, word_vectors)