## Creating a Spacy NER Model for laptops

The Brand, Model, Processor, RAM, OS, Disk, Dim are the entities we want to define.

In [1]:
##import libraries
import pandas as pd
import numpy as np
import spacy
import random
import time

### 1. Data


In [2]:
## loading data with the columns names as entities
data = pd.read_csv("catalogue/laptop.csv",index_col=None)
data

Unnamed: 0,Brand,Model,Processor,RAM,OS,Disk,Dim,Category
0,Lenovo,Ideapad,Intel Core i3 Processor (7th Gen),4 GB DDR4,64 bit Windows 10,1 TB HDD,39.62 cm (15.6 inch),Laptop
1,Lenovo,Ideapad,Intel Core i3 Processor (7th Gen),4 GB DDR4,64 bit Windows 10,1 TB HDD,39.62 cm (15.6 inch),Laptop
2,HP,EliteBook,Intel Core i3 Processor (7th Gen),8 GB DDR4,64 bit Windows 10,256 GB SSD,35.56 cm (14 inch),Laptop
3,Dell,Vostro,Intel Core i3 Processor (8th Gen),4 GB DDR4,Linux/Ubuntu,1 TB HDD,35.56 cm (14 inch),Laptop
4,HP,Zbook,Intel Core i5 Processor (8th Gen),8 GB DDR4,64 bit Windows 10,1 TB HDD,35.56 cm (14 inch),Laptop
...,...,...,...,...,...,...,...,...
411,Lenovo,Ideapad,Intel Core i5 Processor (6th Gen),4 GB DDR3,64 bit Windows 10,1 TB HDD,35.56 cm (14 inch),Laptop
412,Lenovo,Ideapad,Intel Core i7 Processor (8th Gen),8 GB DDR4,64 bit Windows 10,1 TB HDD,39.62 cm (15.6 inch),Laptop
413,Lenovo,Ideapad,AMD APU Quad Core A6 Processor,4 GB DDR3,64 bit Windows 10,1 TB HDD,39.62 cm (15.6 inch),Laptop
414,Lenovo,Legion,Intel Pentium Quad Core Processor (4th Gen),4 GB DDR3,D,500 GB HDD,39.62 cm (15.6 inch),Laptop


### 2. Pre-processing

The input data has to be in a particular format.

Steps:
1. Create a phrase with jumbled entities and the annotations for each entity.
2. Convert into json file with the content (phrase above) and the corresponding entities.
3. The json file is converted into a list.


####  a. Creating content for the laptops

In [3]:
## the entities
cols = data.columns
cols

Index(['Brand', 'Model', 'Processor', 'RAM', 'OS', 'Disk', 'Dim', 'Category'], dtype='object')

In [4]:
## number of entities
num_ent = len(data.columns)
ent_list = list(np.arange(num_ent))
ent_list

[0, 1, 2, 3, 4, 5, 6, 7]

In [5]:
# sample of an entity
data.iloc[0,2]

'Intel Core i3 Processor (7th Gen)'

In [6]:
## jumble indices of entities to create a phrase
prod_name = [] # list of all product names
prod_ann = [] # list of all the annotations
for i in range(len(data)): # loop for each laptop
    idx_list = random.sample(ent_list,num_ent) # shuffling indices
    cont = []
    ann = []
    ann_idx = 0 # pointer for annotating 
    for j in range(num_ent): # creating the jumbled product name
        col_num = idx_list[j] # column number according jumbled column index
        val = data.iloc[i,col_num] # value of the entity 
        cont.append(val) # appending list of entities into a single list
        ann.append((ann_idx, len(val)+ ann_idx, cols[col_num])) # annotations and entity name
        ann_idx = ann_idx + len(val) + 1 # updating the annotation pointer
        
    prod_name.append( ' '.join(cont)) # complete phrase for each laptop
    prod_ann.append(ann) 


    

In [7]:
## Example
## This is the content
sample_prod_name = prod_name[8]
sample_prod_name

'512 GB SSD 8 GB DDR4  35.56 cm (14 inch)  64 bit Windows 10 Modern Intel Core i5 Processor (10th Gen) Laptop MSI'

In [8]:
## entities in it with their annotations
sample_prod_ent = prod_ann[8]
sample_prod_ent

[(0, 10, 'Disk'),
 (11, 21, 'RAM'),
 (22, 41, 'Dim'),
 (42, 59, 'OS'),
 (60, 66, 'Model'),
 (67, 101, 'Processor'),
 (102, 108, 'Category'),
 (109, 112, 'Brand')]

In [9]:
## take an entity in the prod 
st_id = sample_prod_ent[7][0]
end_id = sample_prod_ent[7][1]
ent = sample_prod_ent[7][2]
print(ent,':', sample_prod_name[st_id : end_id])


Brand : MSI


In [10]:
prod =[]
for i in range(len(data)):
    prod.append([prod_name[i], prod_ann[i]])

prod[4]

['HP 64 bit Windows 10 Zbook 1 TB HDD Laptop 35.56 cm (14 inch)  8 GB DDR4  Intel Core i5 Processor (8th Gen)',
 [(0, 2, 'Brand'),
  (3, 20, 'OS'),
  (21, 26, 'Model'),
  (27, 35, 'Disk'),
  (36, 42, 'Category'),
  (43, 62, 'Dim'),
  (63, 73, 'RAM'),
  (74, 107, 'Processor')]]

In [11]:
## creating a dataframe with product names and annotations
prod_data = pd.DataFrame(prod, columns = ['ProdName','Annotations'])
prod_data.head()

Unnamed: 0,ProdName,Annotations
0,4 GB DDR4 1 TB HDD Laptop 39.62 cm (15.6 inch...,"[(0, 10, RAM), (11, 19, Disk), (20, 26, Catego..."
1,Lenovo 39.62 cm (15.6 inch) Laptop Intel Core...,"[(0, 6, Brand), (7, 28, Dim), (29, 35, Categor..."
2,256 GB SSD 64 bit Windows 10 Laptop EliteBook ...,"[(0, 10, Disk), (11, 28, OS), (29, 35, Categor..."
3,4 GB DDR4 Linux/Ubuntu Vostro 35.56 cm (14 in...,"[(0, 10, RAM), (11, 23, OS), (24, 30, Model), ..."
4,HP 64 bit Windows 10 Zbook 1 TB HDD Laptop 35....,"[(0, 2, Brand), (3, 20, OS), (21, 26, Model), ..."


In [12]:
# converting into csv file
prod_data.to_csv('laptop_prodNames.csv', index= None)

#### b. Creating json file

In [13]:
# converting into json format
import csv
import json

csvfile = open('laptop_prodNames.csv', 'r')
jsonfile = open('laptop_prodNames.json', 'w')

fieldnames = ('ProdName', 'Annotations')
reader = csv.DictReader( csvfile, fieldnames)

for row in reader:
    json.dump(row, jsonfile)
    jsonfile.write('\n')

#### c. json to list (spacy format)

In [14]:
## function to convert json file into spacy traning data format
def convert_to_spacytrain(json_file):
    try:
        
        training_data = []
        lines=[]
        with open(json_file, 'r') as f:
            lines = f.readlines() # this has 416 lines
            
        for line in lines[1:400]: # loop for every product
            data = json.loads(line) # single row
            text = data['ProdName'] #this is complete phrase
            entities = data['Annotations']
            training_data.append((text, {"entities" : eval(entities)}))
            
        return training_data
    
    except Exception as e:
        
        logging.exception("Unable to process " + json_file + "\n" + "error = " + str(e))
        
        return None

In [15]:
train_data = convert_to_spacytrain('laptop_prodNames.json')
train_data

[('4 GB DDR4  1 TB HDD Laptop 39.62 cm (15.6 inch)  64 bit Windows 10 Lenovo Intel Core i3 Processor (7th Gen) Ideapad',
  {'entities': [(0, 10, 'RAM'),
    (11, 19, 'Disk'),
    (20, 26, 'Category'),
    (27, 48, 'Dim'),
    (49, 66, 'OS'),
    (67, 73, 'Brand'),
    (74, 107, 'Processor'),
    (108, 115, 'Model')]}),
 ('Lenovo 39.62 cm (15.6 inch)  Laptop Intel Core i3 Processor (7th Gen) 4 GB DDR4  1 TB HDD 64 bit Windows 10 Ideapad',
  {'entities': [(0, 6, 'Brand'),
    (7, 28, 'Dim'),
    (29, 35, 'Category'),
    (36, 69, 'Processor'),
    (70, 80, 'RAM'),
    (81, 89, 'Disk'),
    (90, 107, 'OS'),
    (108, 115, 'Model')]}),
 ('256 GB SSD 64 bit Windows 10 Laptop EliteBook Intel Core i3 Processor (7th Gen) HP 8 GB DDR4  35.56 cm (14 inch) ',
  {'entities': [(0, 10, 'Disk'),
    (11, 28, 'OS'),
    (29, 35, 'Category'),
    (36, 45, 'Model'),
    (46, 79, 'Processor'),
    (80, 82, 'Brand'),
    (83, 93, 'RAM'),
    (94, 113, 'Dim')]}),
 ('4 GB DDR4  Linux/Ubuntu Vostro 35.56 cm 

### 3. Training NER model

In [16]:
def train_spacy(data,iterations):
    
    TRAIN_DATA = data
    nlp = spacy.blank('en')  # create blank Language class
    
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
       

    # add labels
    for _, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
                ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                #print(text, annotations)
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
                
            if (losses['ner'] <100):
                    break
            print(losses)
    return nlp

In [17]:
# can plot losses and choose best model

In [18]:
%%time
prdnlp = train_spacy(train_data,10)

Starting iteration 0
{'ner': 1616.534688581056}
Starting iteration 1
{'ner': 296.74486782755986}
Starting iteration 2
{'ner': 227.66339603823027}
Starting iteration 3
{'ner': 388.8967474150053}
Starting iteration 4
{'ner': 253.16708756021092}
Starting iteration 5
{'ner': 169.10252932106346}
Starting iteration 6
{'ner': 245.55921900603008}
Starting iteration 7
{'ner': 260.9601010909793}
Starting iteration 8
{'ner': 425.2567879593142}
Starting iteration 9
CPU times: user 12min 54s, sys: 10.7 s, total: 13min 4s
Wall time: 3min 18s


In [19]:
prdnlp_1 = train_spacy(train_data,20)

Starting iteration 0
{'ner': 1976.1569668445497}
Starting iteration 1
{'ner': 497.76540309684856}
Starting iteration 2
{'ner': 188.1025777314755}
Starting iteration 3
{'ner': 161.8841822449491}
Starting iteration 4


### 4. Testing the model

In [20]:
test_text = input("Enter your testing text: ")
doc = prdnlp(test_text)

for ent in doc.ents:

    print('Entity: ',ent.text)
    print('Details: ',ent.start_char, ent.end_char, ent.label_)

Enter your testing text: I have a Lenovo Ideapad laptop
Entity:  I
Details:  0 1 Model
Entity:  have
Details:  2 6 Brand
Entity:  Lenovo
Details:  9 15 Brand
Entity:  Ideapad
Details:  16 23 Model
Entity:  laptop
Details:  24 30 Category


In [21]:
test_text = input("Enter your testing text: ")
doc = prdnlp_1(test_text)

for ent in doc.ents:

    print('Entity: ',ent.text)
    print('Details: ',ent.start_char, ent.end_char, ent.label_)

Enter your testing text: 256 GB SSD and Intel Core i7 Processor (8th Gen) HP?
Entity:  256 GB SSD
Details:  0 10 Disk
Entity:  Intel Core i7 Processor (8th Gen)
Details:  15 48 Processor
Entity:  HP
Details:  49 51 Brand
Entity:  ?
Details:  51 52 Brand
