<a href="https://colab.research.google.com/github/osamadev/NER_Using_Spacy/blob/master/Copy_of_NER_Spacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training a New Custom Entity Type "STREET_NAME" Using spaCy


In [0]:
# Load all the required python libraries and modules

from __future__ import unicode_literals, print_function
import numpy as np
import pandas as pd

import spacy
from spacy import displacy
from spacy.util import minibatch, compounding

from sklearn.utils import shuffle
import random
from pathlib import Path
from collections import Counter

from matplotlib import pyplot as plt
%matplotlib inline

import pprint
pp = pprint.PrettyPrinter(indent=4)

import en_core_web_sm as en

In [3]:
# Read the raw address export, turn empty strings into NaN, then randomise the row order.
addresses_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/data/adresser.csv')
addresses_df = shuffle(addresses_df.replace('', np.nan))

# Preview the first ten shuffled rows.
addresses_df.head(n=10)

Unnamed: 0,road code,Street Name,addressing road name,house number,Additional city name,postnr,Postal Code Name,storm recipient zip code,stormodtagerpostnrnavn,municipal code,municipality name,owner name,region code,region name,jordstykke_ejerlavnavn (City Name),Municipality
27075,1472,Sejerbyvej,Sejerbyvej,57,Sejerby,4592,Sejerø,,,326,Kalundborg,"Sejerby By, Sejerø",1085,Region Sjælland,"Sejerby By, Sejerø",Kalundborg
7331,412,Flinterupvej,Flinterupvej,61,,4480,Store Fuglede,,,326,Kalundborg,"Flinterup By, St. Fuglede",1085,Region Sjælland,"Flinterup By, St. Fuglede",Kalundborg
4631,266,Dortheavej,Dortheavej,4,,4270,Høng,,,326,Kalundborg,"Finderup By, Finderup",1085,Region Sjælland,"Finderup By, Finderup",Kalundborg
13423,748,Ingelisevej,Ingelisevej,10,Kaldred,4593,Eskebjerg,,,326,Kalundborg,"Torpe By, Bregninge",1085,Region Sjælland,"Torpe By, Bregninge",Kalundborg
2137,124,Birkelunden,Birkelunden,14,,4270,Høng,,,326,Kalundborg,"Høng By, Finderup",1085,Region Sjælland,"Høng By, Finderup",Kalundborg
5628,334,Elverhøjen,Elverhøjen,24,,4400,Kalundborg,,,326,Kalundborg,"Svenstrup By, Raklev",1085,Region Sjælland,"Svenstrup By, Raklev",Kalundborg
3247,189,Bryggervænget,Bryggervænget,3,,4400,Kalundborg,,,326,Kalundborg,Kalundborg Bygrunde,1085,Region Sjælland,Kalundborg Bygrunde,Kalundborg
2276,138,Bjerge Byvej,Bjerge Byvej,33,Bjerge,4480,Store Fuglede,,,326,Kalundborg,"Bjerge By, Svallerup",1085,Region Sjælland,"Bjerge By, Svallerup",Kalundborg
17819,962,Kystparken,Kystparken,3,,4400,Kalundborg,,,326,Kalundborg,"Nyrup By, Raklev",1085,Region Sjælland,"Nyrup By, Raklev",Kalundborg
12306,676,Hovedgaden,Hovedgaden,34,,4270,Høng,,,326,Kalundborg,"Høng By, Finderup",1085,Region Sjælland,"Høng By, Finderup",Kalundborg


In [4]:
# Sanity check: show the size of the loaded table as (rows, columns).
pp.pprint(addresses_df.shape)

(37288, 16)


In [17]:
# Accumulator that prepare_training_data() appends (text, annotations) tuples to.
training_data = []

# define new entity label for street address (STREET_NAME)
label = 'STREET_NAME'

# Sentence templates used to synthesise training sentences. '{0}' marks where the
# address text is substituted. Templates without '{0}' are kept as-is and become
# examples with an empty entity list.
sentences_templates = ["I live in {0}", "Friend of mine lives in {0}", "I know the address of that street, it is {0}",\
    "Mr. Absalon Adam lived before in {0}", "I like the resturants in {0}", "Check the map to find the directions to {0}", \
    "My friend Aksel will meet me in {0}", "Me and Adrian has a meeting in {0}", "Taxi driver can take you to {0}", \
      "Stay away of {0}", "His address is {0}", "My cousine lives in {0}", "I like shops in {0}", "Let us drive to {0}", \
      "Do you like this street?", "I walk everyday in that place", "{0}, this is my current address", "{0} is awesome place"
]

# prepare the new training dataset based on the data collected from the street addresses in Kalundborg city in Denmark
def _has_value(value):
    """Return True when a CSV cell holds real text (not NaN, '' or the string 'nan')."""
    # `value != np.nan` is always True, so missing cells must be detected with pd.isna.
    return not pd.isna(value) and str(value) not in ('', 'nan')


def prepare_training_data(addresses=None, templates=None, entity_label=None, output=None):
    """Build spaCy-style NER training examples from the address table.

    For every row one sentence template is drawn at random. If the template
    contains the ``'{0}'`` placeholder, it is replaced by
    ``"<Street Name> <road code>[, <Additional city name>][, <Municipality>]"``
    and the character span of that text is annotated with ``entity_label``.
    Templates without a placeholder are emitted unchanged with no entities.

    Parameters
    ----------
    addresses : pandas.DataFrame, optional
        Table with the columns 'Street Name', 'road code',
        'Additional city name' and 'Municipality'. Defaults to the
        notebook-level ``addresses_df``.
    templates : sequence of str, optional
        Sentence templates. Defaults to the notebook-level ``sentences_templates``.
    entity_label : str, optional
        Label written into each annotated span. Defaults to the notebook-level ``label``.
    output : list, optional
        List the examples are appended to. Defaults to the notebook-level
        ``training_data`` so the zero-argument call behaves as before.

    Returns
    -------
    list
        ``output`` itself, holding ``(text, {'entities': [(start, end, label)]})`` tuples.
    """
    # Fall back to the notebook-level objects so `prepare_training_data()` keeps working.
    if addresses is None:
        addresses = addresses_df
    if templates is None:
        templates = sentences_templates
    if entity_label is None:
        entity_label = label
    if output is None:
        output = training_data

    for _, row in addresses.iterrows():
        # One random draw per row, in the same form as before, so seeded runs are unchanged.
        template = templates[np.random.randint(0, len(templates), size=1)[0]]

        if '{0}' not in template:
            # Placeholder-free sentence: keep it as an example without entities.
            output.append((template, {'entities': []}))
            continue

        if not _has_value(row['Street Name']):
            # A missing street name used to raise TypeError (float + str); skip the row.
            continue

        # NOTE(review): the number appended here comes from the 'road code' column,
        # not 'house number' -- confirm that this is intended.
        address_parts = [str(row['Street Name']) + " " + str(row['road code'])]
        if _has_value(row['Additional city name']):
            address_parts.append(str(row['Additional city name']))
        if _has_value(row['Municipality']):
            # Guarded so a missing municipality no longer crashes the concatenation.
            address_parts.append(str(row['Municipality']))
        street_address = ", ".join(address_parts)

        # The entity span covers exactly the substituted address text.
        start_idx = template.find('{0}')
        end_idx = start_idx + len(street_address)
        sentence = template.replace('{0}', street_address)

        output.append((sentence, {'entities': [(start_idx, end_idx, entity_label)]}))

    return output

# Build the full list of (text, annotations) examples from the address table.
dataset = prepare_training_data()

# The triple-quoted block below is a bare string, so none of it executes.
# NOTE(review): if it is ever re-enabled, the 'Turkey' span in the last example
# should be (17, 23), not (18, 24).
'''
# Add few random examples for the existing entity types to our training dataset
dataset.insert(10, ('Who is Shaka Khan?', {
        'entities': [(7, 17, 'PERSON')]
}))

dataset.insert(300, ('Who is Bill Gates?', {
        'entities': [(7, 17, 'PERSON')]
}))

dataset.insert(600, ('Steve Jobs was a genius but work holic.', {
        'entities': [(0, 10, 'PERSON')]
}))

dataset.insert(1200, ('I like London and Berlin.', {
        'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
    }))

dataset.insert(1800, ('I like Cairo and Turkey.', {
        'entities': [(7, 12, 'LOC'), (18, 24, 'LOC')]
    }))
'''

# Only the first 4000 examples are used for training, to keep this demo's training time short.
train_dataset = dataset[:4000]

# Pick 500 instances (positions 10000-10499) as the test dataset.
test_dataset = dataset[10000:10500]

# View 10 sentences from the new training dataset
train_dataset[1000:1010]

[('His address is Eskebjergvej 368, Kalundborg',
  {'entities': [(15, 43, 'STREET_NAME')]}),
 ('His address is Vestervej 1989, Kalundborg',
  {'entities': [(15, 41, 'STREET_NAME')]}),
 ('Stay away of Skippingevej 1496, Særslev, Kalundborg',
  {'entities': [(13, 51, 'STREET_NAME')]}),
 ('Dagliljevej 243, Kalundborg is awesome place',
  {'entities': [(0, 27, 'STREET_NAME')]}),
 ('I like the resturants in Græsmarkvej 538, Kalundborg',
  {'entities': [(25, 52, 'STREET_NAME')]}),
 ('I like shops in Solbakken 1591, Kalundborg',
  {'entities': [(16, 42, 'STREET_NAME')]}),
 ('I like the resturants in Bakkevangen 2107, Kalundborg',
  {'entities': [(25, 53, 'STREET_NAME')]}),
 ('His address is Vestre Havneplads 1992, Kalundborg',
  {'entities': [(15, 49, 'STREET_NAME')]}),
 ('Mr. Absalon Adam lived before in Sofievej 1587, Kalundborg',
  {'entities': [(33, 58, 'STREET_NAME')]}),
 ('Stay away of Alleshavevej 31, Kalundborg',
  {'entities': [(13, 40, 'STREET_NAME')]})]

In [0]:
# Train (or update) the NER model so that it learns the new custom entity type from the new training dataset

def train_NER_model(new_model_name, label_name, model=None, output_dir=None, n_iter=20,
                    train_data=None):
    """Set up the pipeline and entity recognizer, and train the new entity.

    Parameters
    ----------
    new_model_name : str
        Written to ``nlp.meta['name']`` before saving (only when ``output_dir`` is given).
    label_name : str
        Entity label added to the entity recognizer.
    model : str, optional
        Passed to ``spacy.load``. When None, a blank 'en' pipeline is created instead.
    output_dir : str or Path, optional
        When given, the trained pipeline is saved there via ``save_updated_NER_model``.
    n_iter : int
        Number of passes over the training examples.
    train_data : sequence, optional
        ``(text, annotations)`` tuples to train on. Defaults to the notebook-level
        ``train_dataset``.

    Returns
    -------
    The trained ``nlp`` pipeline object.
    """
    if train_data is None:
        train_data = train_dataset
    # Work on a private copy: shuffling the caller's list in place would silently
    # reorder the notebook-level dataset and make earlier cell outputs stale.
    examples = list(train_data)

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(label_name)   # add new entity label to entity recognizer
    if model is None:
        nlp.vocab.vectors.name = 'spacy_pretrained_vectors'
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for _ in range(n_iter):
            random.shuffle(examples)
            losses = {}

            # batch up the examples using spaCy's minibatch
            batches = minibatch(examples, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.5,
                           losses=losses)
            print('Losses', losses)

    # save the trained model to output directory
    if output_dir is not None:
        nlp.meta['name'] = new_model_name
        save_updated_NER_model(nlp, output_dir)

    return nlp

# save the trained model to output directory
def save_updated_NER_model(model, output_dir):
    """Write ``model`` to ``output_dir``, creating the directory when needed.

    Parameters
    ----------
    model
        Object exposing ``to_disk(path)``.
    output_dir : str or Path
        Target directory. Missing parent directories are created as well.
    """
    output_dir = Path(output_dir)
    # parents=True: a nested target no longer fails when its parent is missing.
    # exist_ok=True: replaces the separate exists() check, which was racy.
    output_dir.mkdir(parents=True, exist_ok=True)

    model.to_disk(output_dir)
    print("Final model saved to", output_dir)
    
# load the trained NER model    
def load_trained_NER_model(output_dir):
    """Announce the location, then load and return the pipeline stored in ``output_dir``."""
    print("Loading from", output_dir)
    return spacy.load(output_dir)
  
# test the trained model  
def test_NER_model(nlp_model, document_test, show_entities=True, style_sentence=True):
    document = nlp_model(document_test)
    
    if show_entities:
      for entities in document.ents:
          print(entities.label_, entities.text)
          
    if style_sentence:
      colors = {'STREET_NAME': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)'}
      displacy.render(document, jupyter=True, style='ent', options={'colors': colors})

# Pick a random batch of instances from the test dataset to evaluate the trained NER model against unseen instances.
def test_samples_batch(model, test_set=test_dataset, samples_no=10):
    """Visualise the model's predictions on ``samples_no`` randomly drawn test sentences.

    Each draw picks one ``(text, annotations)`` item from ``test_set`` (with
    replacement) and passes its text to ``test_NER_model`` with entity printing off.
    """
    for _ in range(samples_no):
        picked = np.random.randint(0, len(test_set), size=1)[0]
        sentence = test_set[picked][0]

        test_NER_model(model, sentence, show_entities=False)

In [23]:
# Directory where the trained pipeline is written.
out_dir='/content/drive/My Drive/Colab Notebooks/output'

# No `model` argument is passed, so training starts from a blank 'en' pipeline.
nlp_model = train_NER_model(new_model_name='street_names_model', label_name=label,output_dir=out_dir, n_iter=10)

Created blank 'en' model
Losses {'ner': 47.34154281312178}
Losses {'ner': 1.047594028917558}
Losses {'ner': 1.9571338721843157}
Losses {'ner': 0.49388970918499897}
Losses {'ner': 0.09628244856817397}
Losses {'ner': 0.6681397374638589}
Losses {'ner': 0.08163265883922764}
Losses {'ner': 0.809048720085724}
Losses {'ner': 0.5977721174947469}
Losses {'ner': 0.357674739472754}
Final model saved to /content/drive/My Drive/Colab Notebooks/output


In [40]:
# Reload the pipeline from disk.
nlp_trained_model = load_trained_NER_model(output_dir=out_dir)

# Hand-written sentences used to try out the reloaded model.
sample_docs = ["Mr. Absalon Adam lived before in Sine Olsensvej 1479, Svallerup, Kalundborg",
               "I live currently in Absalonsvej 4, Bjerge Str, Copenhagen City, Denmark",
               "His address is Elledevej 317, Kalundborg",
               "Mr. Charles B. Evans is a consular officer, Department of State, Nouakchott Place 33, Washington DC",
               "I live in Union Square 23, San Francisco."]

# Print and render the entities found in each sample sentence.
for test_doc in sample_docs:
  test_NER_model(nlp_trained_model, test_doc)

Loading from /content/drive/My Drive/Colab Notebooks/output
STREET_NAME Sine Olsensvej 1479, Svallerup, Kalundborg


STREET_NAME Absalonsvej 4, Bjerge Str, Copenhagen City, Denmark


STREET_NAME Elledevej 317, Kalundborg


STREET_NAME Mr. Charles B. Evans is
STREET_NAME State, Nouakchott Place 33, Washington DC


STREET_NAME Union Square 23, San Francisco.


In [49]:
# Check a batch of randomly picked instances from the test dataset
# (samples_no is left at its default of 10).

test_samples_batch(nlp_trained_model, test_dataset)