## Code to create an NER (named-entity recognition) model for electronic products

### The products used are:

- Laptops
- Monitors
- HardDisks
- Printers
- Shredders

In [11]:
##import libraries
import pandas as pd
import numpy as np
import spacy
import random
import time
import csv
import json
import logging

### Creating training data

In [19]:
# function to load data
def create_trainData (csv_file,product):
    """Generate shuffled product phrases with entity spans and save them to disk.

    Each row of ``csv_file`` describes one product and each column is one
    entity type (the column name is used as the entity label).  For every row
    the column values are joined with single spaces in a random order, and the
    character span of each value inside that phrase is recorded as
    ``(start, end, label)``.

    Two files are written:

    * ``product + '_trainData.csv'`` with columns ``ProdName`` and
      ``Annotations`` (the annotation list is stored as its string form).
    * ``product + '_trainData.json'`` with one JSON object per line.  The CSV
      header row is emitted as the first record as well, because the reader
      is given explicit field names; ``convert_to_spacytrain`` skips that
      first line, so this layout is kept on purpose.

    Args:
        csv_file: Path of the input CSV (anything ``pandas.read_csv`` accepts).
        product: Prefix used for the two output file names.

    Returns:
        None.  The results are only written to disk.
    """
    ## loading data with the column names as entities
    data = pd.read_csv(csv_file, index_col=None)

    ## the entities
    cols = data.columns
    num_ent = len(cols)
    ent_list = list(range(num_ent))

    ## jumble the entity order of every row to create a phrase
    prod = []  # one [phrase, annotations] pair per product
    for i in range(len(data)):
        idx_list = random.sample(ent_list, num_ent)  # shuffled column indices
        cont = []
        ann = []
        ann_idx = 0  # start offset of the next value inside the phrase
        for col_num in idx_list:
            # Non-string cells (e.g. numeric columns) are converted to text so
            # that len() and ' '.join() work; missing cells become 'nan'.
            val = str(data.iloc[i, col_num])
            cont.append(val)
            ann.append((ann_idx, ann_idx + len(val), cols[col_num]))
            ann_idx += len(val) + 1  # +1 for the separating space
        prod.append([' '.join(cont), ann])

    ## creating a dataframe with product names and annotations
    prod_data = pd.DataFrame(prod, columns=['ProdName', 'Annotations'])

    # converting into csv file
    csv_path = product + '_trainData.csv'
    json_path = product + '_trainData.json'
    prod_data.to_csv(csv_path, index=False)

    # convert into json (one object per line); both files are closed on exit
    fieldnames = ('ProdName', 'Annotations')
    with open(csv_path, 'r', newline='', encoding='utf-8') as csvfile, \
            open(json_path, 'w', encoding='utf-8') as jsonfile:
        reader = csv.DictReader(csvfile, fieldnames)
        for row in reader:
            json.dump(row, jsonfile)
            jsonfile.write('\n')

In [25]:
## function to convert json file into spacy training data format
def convert_to_spacytrain(json_file, max_lines=400):
    """Load a line-delimited JSON file into spaCy-style training tuples.

    Every line must be a JSON object with a ``ProdName`` string and an
    ``Annotations`` string holding a Python literal list of
    ``(start, end, label)`` tuples.  The first line is skipped (the file
    produced by ``create_trainData`` carries the CSV header there).

    Args:
        json_file: Path of the line-delimited JSON file.
        max_lines: Only lines with index ``1 .. max_lines - 1`` are used,
            i.e. at most ``max_lines - 1`` products.  The default of 400
            keeps the historical cap; pass ``None`` to read the whole file.

    Returns:
        A list of ``(text, {"entities": [...]})`` tuples, or ``None`` if the
        file could not be read or parsed (the error is logged).
    """
    # Local import so this block does not depend on the file-level imports.
    import ast

    try:
        with open(json_file, 'r') as f:
            lines = f.readlines()

        training_data = []
        for line in lines[1:max_lines]:  # loop for every product
            data = json.loads(line)  # single row
            text = data['ProdName']  # this is the complete phrase
            entities = data['Annotations']
            # SECURITY: this used to be eval(entities), which executes
            # arbitrary code found in the file.  literal_eval only accepts
            # plain Python literals (lists, tuples, ints, strings), which is
            # all the annotation format needs.
            training_data.append((text, {"entities": ast.literal_eval(entities)}))

        return training_data

    except Exception as e:
        # str() keeps the log call itself from failing on non-str paths.
        logging.exception("Unable to process " + str(json_file) + "\n" + "error = " + str(e))

        return None

In [39]:
def train_spacy(data,iterations):
    """Train a blank English spaCy pipeline containing only an NER component.

    NOTE(review): the calls used here (``create_pipe``, ``add_pipe`` with a
    component object, ``begin_training``, ``update`` with text/annotation
    lists) look like the spaCy 2.x training API -- confirm the installed
    version before running.

    Args:
        data: List of ``(text, {"entities": [(start, end, label), ...]})``
            tuples.  The list is copied, so the caller's ordering is left
            untouched by the per-iteration shuffle.
        iterations: Maximum number of passes over the data.  Training stops
            early once the accumulated ``'ner'`` loss of a pass drops
            below 100.

    Returns:
        The trained ``nlp`` object.
    """
    # Shallow copy: random.shuffle below works in place and would otherwise
    # reorder the list owned by the caller.
    TRAIN_DATA = list(data)
    nlp = spacy.blank('en')  # create blank Language class

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        # Keep `ner` bound even if the pipeline already has the component.
        ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)

            print(losses)
            # early stop once the pass loss is small enough
            if losses['ner'] < 100:
                break

    return nlp

In [None]:
%%time
# Train for at most 10 iterations (train_spacy stops early once the 'ner'
# loss falls below 100).  `train_data` is assigned in the driver cells further
# down, so one of those must have been run before this cell.
prdnlp = train_spacy(train_data,10)

### Driver Code

In [22]:
# Writes laptop_trainData.csv and laptop_trainData.json from laptop.csv.
# create_trainData has no return statement, so `json_file` is None here; the
# JSON path is therefore passed explicitly on the next line.
json_file = create_trainData('laptop.csv', 'laptop')
train_data = convert_to_spacytrain('laptop_trainData.json')

In [26]:
# Reload the training tuples from the already generated JSON file.
train_data = convert_to_spacytrain('laptop_trainData.json')

In [40]:
# Short 3-iteration training run; the per-iteration losses are printed below.
test = train_spacy(train_data,3)

Starting iteration 0
{'ner': 1688.0190589409233}
Starting iteration 1
{'ner': 337.61964259671106}
Starting iteration 2
{'ner': 203.93771591093065}


In [None]:
# Regenerate the laptop training files and reload them.  Entity order is drawn
# with random.sample, so each run produces differently shuffled phrases.
json_file = create_trainData('laptop.csv', 'laptop')
train_data = convert_to_spacytrain('laptop_trainData.json')