# Train your NER model using spaCy

**Upload the annotated json1 file that you exported from Doccano to the folder**

## Import Libraries

In [1]:
# Import Libraries
from __future__ import unicode_literals, print_function
import numpy as np
import pandas as pd
import random
from pathlib import Path
import spacy
import re

## Dataset Preparation

**While annotating, your entities may pick up leading and trailing whitespace, which leads to errors when training your model with spaCy. Instead of worrying about this while annotating, we define a function here that trims the whitespace from every entity span.**

In [2]:
# Trim leading and trailing whitespaces from entity spans
def trim_entity_spans(data: list) -> list:
    """Strip leading and trailing whitespace from every entity span.

    Args:
        data: Examples shaped like
            ``(text, {'entities': [(start, end, label), ...]})`` where
            ``start`` and ``end`` are character offsets into ``text``.

    Returns:
        A new list of ``[text, {'entities': [[start, end, label], ...]}]``
        items whose spans no longer begin or end on a whitespace character.
        A span that consists only of whitespace collapses to an empty span
        (``start == end``). The input list is not modified.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    # Each example is (text, annotations); the entity list lives under the
    # 'entities' key of the annotations dict.
    for text, annotations in data:
        valid_entities = []
        for start, end, label in annotations['entities']:
            valid_start = start
            valid_end = end
            # Move the start forward past leading whitespace, but never past
            # the end of the span or of the text.
            while (valid_start < min(valid_end, len(text))
                   and invalid_span_tokens.match(text[valid_start])):
                valid_start += 1
            # Move the end backward past trailing whitespace, but never
            # before the (already trimmed) start.
            while (valid_end > valid_start
                   and invalid_span_tokens.match(text[valid_end - 1])):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])

    return cleaned_data

**spaCy needs the training data in a specific format, so we build it here from our JSON data**

In [3]:
# Build the spaCy training set from the Doccano JSON-lines export.
dataset = pd.read_json('C:/Users/nihar/Desktop/file_soma.json1', lines=True)
dataset.drop(columns=['id', 'meta', 'annotation_approver'], inplace=True)

# itertuples() yields (index, first column, second column, ...); after the
# drop above the first two columns are presumably the text and its label
# spans -- verify against the export.
TRAINING_DATA = [
    (row[1], {"entities": row[2]})
    for row in dataset.itertuples()
]

# Remove whitespace from the edges of every annotated span.
TRAIN_DATA = trim_entity_spans(TRAINING_DATA)

In [4]:
# Display the first cleaned example: [text, {'entities': [[start, end, label], ...]}]
TRAIN_DATA[0]

['Dear Ms. mouse minnie Thank you for applying to Online Admissions at Management Development Institute, Gurgaon. We have received payment from you, details of which are given below. Kindly verify the details and in case of any discrepancy please let us know.  Form No : 1002994 Name : Ms. mouse minnie Payment Type : Credit Card Trans Id : MCIT0990458710 Trans Date : 07-09-2011 22:24:57 Amount : 1750.00  Management Development Institute, Gurgaon Admission Department  The information contained in this electronic message and any attachments to this message are intended for the exclusive use of the addressee(s) and may contain proprietary, confidential or privileged information. If you are not the intended recipient, you should not disseminate, distribute or copy this e-mail. Please notify the sender immediately and destroy all copies of this message and any attachments contained in it.',
 {'entities': [[69, 110, 'Merchant Name'],
   [269, 276, 'Booking/Order ID'],
   [288, 300, 'Customer 

## Train Your Model

### Define a function that trains the NER model, saves it for further training, and tests it on an email

In [5]:
# Sample order email; used as the default `test_text` of train_test_savemodel.
text = " Order Details  Order Number:  NIO186547 Merchant Site URL:  http://www.theschoolofai.in  Received on: Dec, 9 2010  Total Amount : USD 599.99  Card Holder :  Bryan D. MacKenzie  Card Type :  VISA  Billing Address :  (As entered by the Customer)  6427 14ave Edmonton, AB - t6l1y1 Canada  Customer's Phone Number : 1-780-9041372 Customer's E-mail ID : bdmacken@ualberta.ca Customer's IP Address : 96.52.201.31 ( Analyze IP Address )   Recipient's Name : Bryan D. MacKenzie  Shipping Address :  6427 14ave, Edmonton - t6l1y1, AB, CA  Recipient's Phone Number :  1-780-9041372  Special Instruction :     You have 12 days to execute and capture this order amount from your CCAvenue account interface. After you have executed this order, Please log in at http://world.ccavenue.com and click on 'View Pending Orders' to update the records.   ---------------------------------------------------------------------------------------------------------------- * Caution :  Please use your best judgment in deciding whether to fulfill or cancel this order. If this customer is using a stolen credit card, then you will loose money of this order.  ----------------------------------------------------------------------------------------------------------------  Best Regards   CCAvenue Customer Service"

In [6]:
def train_test_savemodel(data= TRAIN_DATA, test_text = text, model=None, new_model_name='mymodel', output_dir='.', n_iter=30):
    """Train (or continue training) a spaCy NER pipe, save it, and try it on a sample.

    Written against the spaCy v2 training API (``nlp.create_pipe`` and
    ``nlp.update`` called with raw texts and annotation dicts).

    Args:
        data: Training examples shaped like
            ``(text, {'entities': [(start, end, label), ...]})``.
        test_text: Text the saved model is run on once training has finished.
        model: Name or path handed to ``spacy.load``. ``None`` starts from a
            blank English pipeline.
        new_model_name: Value stored in ``nlp.meta['name']`` before saving.
        output_dir: Directory the model is written to. ``None`` skips both
            saving and the test run.
        n_iter: Number of passes over ``data``.

    Returns:
        None. Progress, per-iteration losses and the entities found in
        ``test_text`` are printed.
    """
    # Shuffle a private copy so the caller's list (by default the
    # module-level TRAIN_DATA) keeps its order.
    data = list(data)

    # Set up the pipeline and entity recognizer, and train the new entity
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    ner_is_new = 'ner' not in nlp.pipe_names
    if ner_is_new:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')
    print('phase-1 completed')

    # add labels from the train data
    for _, annotations in data:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])
    print('phase-2 completed')

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        if ner_is_new:
            # A freshly created NER pipe has no weights yet.
            optimizer = nlp.begin_training()
        else:
            # begin_training() would re-initialise the weights and discard
            # what the loaded model has already learned, so continue from
            # the existing weights instead (requires spaCy >= 2.1).
            optimizer = nlp.resume_training()
        for itn in range(n_iter):
            random.shuffle(data)
            losses = {}
            for example_text, annotations in data:
                nlp.update([example_text], [annotations], sgd=optimizer,
                           drop=0.35, losses=losses)
            print(losses)

    print('phase-3 completed')

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        # Also creates missing parent directories; a no-op if it exists.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print('Text for test :', test_text)
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        for ent in doc.ents:
            print(ent.label_, ":", ent.text)

### Train the model

In [None]:
# model defaults to None, so this trains from a blank 'en' pipeline and saves it to output_dir.
train_test_savemodel(output_dir='C:/Users/nihar/Documents/NERMODEL')

Created blank 'en' model
phase-1 completed
phase-2 completed


### Retrain the saved model

In [9]:
# Sample order-confirmation text passed as `test_text` when retraining below.
text2 = " Hi John,  Thanks for using Swiggy! Your order has been confirmed and will be delivered shortly.   Look forward to serving you.     Order No:  54100604095  Restaurant:  Chai Point  Track your Order   Item Name  Quantity  Price  Classic Masala Maggi160  Banana Cake169  Cart Subtotal Rs. 129  Packing Charges Rs. 20  GST Rs. 7.45  Grand Total: 118  Delivery Address:  170, 3floor, Amarjyoti Layout, Domlur, Bengaluru, Karnataka 560071, India, Bangalore  Landmark:  Arun Arcade, next to Axis Viviana  Get the App:      Follow us:    © 2017-Swiggy. All rights reserved.   Swiggy,Tower D, 9th Floor, IBC Knowledge Park, Bannerghatta Road, Bangalore - 560029"

In [16]:
# Load the model stored in the current directory ('.'), train it for 20 iterations,
# save it to output_dir and print the entities found in text2.
train_test_savemodel(test_text = text2, model= '.', n_iter=20,output_dir='C:/Users/nihar/Documents/NERMODEL' )

Loaded model '.'
phase-1 completed
phase-2 completed
{'ner': 2738.6748667817387}
{'ner': 2739.475728160789}
{'ner': 2759.9131616103878}
{'ner': 2806.6936698466575}
{'ner': 2800.996897301178}
{'ner': 2970.541562117246}
{'ner': 2967.744069606567}
{'ner': 3047.4730917415445}
{'ner': 2819.6572913517157}
{'ner': 2652.039087250424}
{'ner': 2604.2075583480278}
{'ner': 2695.4105221890536}
{'ner': 3006.310835652553}
{'ner': 2627.6779870515643}
{'ner': 2476.452451731627}
{'ner': 2446.1573565407884}
{'ner': 2630.1894531701446}
{'ner': 2536.349130995529}
{'ner': 2565.8477835438825}
{'ner': 2517.2402357840015}
phase-3 completed
Saved model to C:\Users\nihar\Downloads
Text for test :  Hi John,  Thanks for using Swiggy! Your order has been confirmed and will be delivered shortly.   Look forward to serving you.     Order No:  54100604095  Restaurant:  Chai Point  Track your Order   Item Name  Quantity  Price  Classic Masala Maggi160  Banana Cake169  Cart Subtotal Rs. 129  Packing Charges Rs. 20  GST R

In [14]:
# Sample payment-receipt text used as the test input for this run.
text3= " Order ID 5170473231    May 16, 2018 07:40 AM      Rs 1155Paid Successfully To           Bangalore Electricity Supply Company Ltd. (BESCOM)     Consumer "
# output_dir is not given, so the model is loaded from and saved back to '.' (the default).
train_test_savemodel(test_text = text3, model= '.', n_iter=20, )

Loaded model '.'
phase-1 completed
phase-2 completed
{'ner': 2921.611525464816}
phase-3 completed
Saved model to .
Text for test :  Order ID 5170473231    May 16, 2018 07:40 AM      Rs 1155Paid Successfully To           Bangalore Electricity Supply Company Ltd. (BESCOM)     Consumer 
Loading from .
Booking/Order ID : 5170473231
Booking/Delivery Date : May 16, 2018


In [None]:
def predict(test_text, output_dir='.'):
    """Load a saved spaCy model and print the entities it finds in ``test_text``.

    Args:
        test_text: Text to run the model on.
        output_dir: Model directory (or model name) handed to ``spacy.load``.
            Defaults to ``'.'``, which is also the default save location of
            ``train_test_savemodel``. Previously this name was read from an
            undefined variable, so the function raised ``NameError``.

    Returns:
        None. One ``label : text`` line is printed per recognised entity.
    """
    print('Text for test :', test_text)
    print("Loading from", output_dir)
    nlp2 = spacy.load(output_dir)
    doc = nlp2(test_text)
    for ent in doc.ents:
        print(ent.label_, ":", ent.text)

In [None]:
# Sample order-confirmation text used as the test input for this run.
text2 = " Hi John,  Thanks for using Swiggy! Your order has been confirmed and will be delivered shortly.   Look forward to serving you.     Order No:  54100604095  Restaurant:  Chai Point  Track your Order   Item Name  Quantity  Price  Classic Masala Maggi160  Banana Cake169  Cart Subtotal Rs. 129  Packing Charges Rs. 20  GST Rs. 7.45  Grand Total: 118  Delivery Address:  170, 3floor, Amarjyoti Layout, Domlur, Bengaluru, Karnataka 560071, India, Bangalore  Landmark:  Arun Arcade, next to Axis Viviana  Get the App:      Follow us:    © 2017-Swiggy. All rights reserved.   Swiggy,Tower D, 9th Floor, IBC Knowledge Park, Bannerghatta Road, Bangalore - 560029"
# Load the model from '.', train it for 20 iterations and save the result to the Downloads folder.
train_test_savemodel(test_text = text2, model= '.', n_iter=20,output_dir='C:/Users/nihar/Downloads' )

**Because the training data set is small, the model makes slight errors on the test text.**