## Code to create a NER model for electronic products

### The products used are:

- Laptops
- Monitors
- HardDisks
- Printers
- Shredders (not included in this code)

In [1]:
##import libraries
import pandas as pd
import numpy as np
import spacy
import random
import time
import csv
import json
import logging
import os

### Creating training data

In [2]:
# function to load data
def create_trainData (csv_file,product):
    
    ## loading data with the columns names as entities
    data = pd.read_csv(csv_file,index_col=None)
    
    ## the entities
    cols = data.columns
    
    ## number of entities
    num_ent = len(data.columns)
    ent_list = list(np.arange(num_ent))
    
    ## jumble indices of entities to create a phrase
    prod_name = [] # list of all product names
    prod_ann = [] # list of all the annotations
    for i in range(len(data)): # loop for each laptop
        idx_list = random.sample(ent_list,num_ent) # shuffling indices
        cont = []
        ann = []
        ann_idx = 0 # pointer for annotating 
        for j in range(num_ent): # creating the jumbled product name
            col_num = idx_list[j] # column number according jumbled column index
            val = data.iloc[i,col_num] # value of the entity 
            cont.append(str(val)) # appending list of entities into a single list
            ann.append((ann_idx, len(str(val))+ ann_idx, cols[col_num])) # annotations and entity name
            ann_idx = ann_idx + len(str(val)) + 1 # updating the annotation pointer

        prod_name.append( ' '.join(cont)) # complete phrase for each product
        prod_ann.append(ann)
        #print(prod_name)
        
    ## combining content with annotations
    prod =[]
    for i in range(len(data)):
        prod.append([prod_name[i], prod_ann[i]])
        
    ## creating a dataframe with product names and annotations
    prod_data = pd.DataFrame(prod, columns = ['ProdName','Annotations'])
    
    # converting into csv file
    prod_data.to_csv(product+'_trainData.csv', index= None)
    
    # convert into json
    csvfile = open(product+'_trainData.csv', 'r')
    jsonfile = open(product+'_trainData.json', 'w')

    fieldnames = ('ProdName', 'Annotations')
    reader = csv.DictReader( csvfile, fieldnames)

    for row in reader:
        json.dump(row, jsonfile)
        jsonfile.write('\n')

In [3]:
## function to convert json file into spacy traning data format
def convert_to_spacytrain(json_file):
    try:
        training_data = []
        lines=[]
        with open(json_file, 'r') as f:
            lines = f.readlines() 
            
        for line in lines[1:400]: # loop for every product
            data = json.loads(line) # single row
            text = data['ProdName'] #this is complete phrase
            entities = data['Annotations']
            training_data.append((text, {"entities" : eval(entities)}))
            
        return training_data
    
    except Exception as e:
        
        logging.exception("Unable to process " + json_file + "\n" + "error = " + str(e))
        
        return None

In [4]:
def train_spacy(nlp,data,iterations):
    
    TRAIN_DATA = data
       
    # add labels
    for _, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
                ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
#         if nlp is None:
#             optimizer = nlp.begin_training()
#         else:
#             optimizer = nlp.resume_training()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                #print(text, annotations)
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    #drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
                
            print(losses)
            if (losses['ner'] <100):
                break

    return nlp

### Driver Code

In [5]:
## to get current location path
path = os.getcwd() 
path

'/Users/mparvatham/DatasetCreation/spacy-ner-annotator-master/Spacy_NER'

In [6]:

files = []

for file in os.listdir(path+"/catalogue"):
    if file.endswith(".csv"):
        files.append(os.path.join(path+"/catalogue", file))

print(files)

['/Users/mparvatham/DatasetCreation/spacy-ner-annotator-master/Spacy_NER/catalogue/laptop.csv', '/Users/mparvatham/DatasetCreation/spacy-ner-annotator-master/Spacy_NER/catalogue/swiches.csv', '/Users/mparvatham/DatasetCreation/spacy-ner-annotator-master/Spacy_NER/catalogue/HardDisk.csv', '/Users/mparvatham/DatasetCreation/spacy-ner-annotator-master/Spacy_NER/catalogue/printers.csv', '/Users/mparvatham/DatasetCreation/spacy-ner-annotator-master/Spacy_NER/catalogue/monitors.csv']


In [7]:
count= 1
nlp = spacy.blank("en")  # create blank Language class
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
       
for file in files:
    print('prod'+str(count))
    json_file = create_trainData(file, 'prod'+str(count))
    train_data = convert_to_spacytrain('prod'+str(count)+'_trainData.json')
    nlp = train_spacy(nlp,train_data,20)
    count+=1

prod1
Starting iteration 0
{'ner': 1475.4946768784332}
Starting iteration 1
{'ner': 125.85785638970043}
Starting iteration 2
{'ner': 156.76143061454846}
Starting iteration 3
{'ner': 542.1258806540089}
Starting iteration 4
{'ner': 230.27847208136853}
Starting iteration 5
{'ner': 95.33896605580753}
prod2
Starting iteration 0
{'ner': 748.3494689263875}
Starting iteration 1
{'ner': 36.19504030366512}
prod3
Starting iteration 0
{'ner': 470.3095202818697}
Starting iteration 1
{'ner': 0.020973742354482083}
prod4
Starting iteration 0
{'ner': 421.1634865267402}
Starting iteration 1
{'ner': 235.19775699540472}
Starting iteration 2
{'ner': 218.32219859684818}
Starting iteration 3
{'ner': 217.44229563161633}
Starting iteration 4
{'ner': 170.5059630554129}
Starting iteration 5
{'ner': 163.94962442145658}
Starting iteration 6
{'ner': 163.20267752830327}
Starting iteration 7
{'ner': 151.7358513070732}
Starting iteration 8
{'ner': 144.01171361637176}
Starting iteration 9
{'ner': 139.5806510379378}
Sta

In [16]:
test_1 ="HP "


In [17]:
#test_text = input("Enter your testing text: ")
doc = nlp(test_1)

for ent in doc.ents:

    print('Entity: ',ent.text)
    print('Details: ',ent.start_char, ent.end_char, ent.label_)


Entity:  What
Details:  0 4 HDMI
Entity:  is the
Details:  5 11 DisplayType
Entity:  cost
Details:  12 16 Brand
Entity:  of
Details:  17 19 DisplayType
Entity:  Lenovo
Details:  20 26 Brand
Entity:  Ideapad
Details:  27 34 Category
Entity:  ?
Details:  34 35 Brand
