# Evaluating custom NER model
Now that I have built a custom NER model and tried it out on some unseen text, I want to compute some evaluation metrics to see exactly how the model performs.

## First need to produce an annotated test set (unseen data)

In [1]:
import json
import random
import pandas as pd
import numpy as np

In [2]:
# Function to load in json file from LightTag
def load_json_to_df(file):
    
    results = json.load(open(file))
    
    # Create list of only those posts which have been annotated
    annotated = []
    for example in results['examples']:
        if example['annotations'] != []:
            annotated.append(example)
            
    return pd.DataFrame(annotated)


# Function to convert data into spacy format
def convert_to_spacy_format(df):
    
    TRAIN_DATA = []
    
    for index, row in df.iterrows():
        
        sentence = row['content']
        annotations_input_list = row['annotations']
        
        annotations_output_list = []
        ing_dict = {}
        
        for annotation in annotations_input_list:
            
            annotations_output_list.append((annotation['start'], annotation['end'], annotation['tag']))
        
        ing_dict['entities'] = annotations_output_list
        TRAIN_DATA.append((sentence, ing_dict))
        
    return TRAIN_DATA

In [3]:
TRAIN_DATA = convert_to_spacy_format(load_json_to_df('ingredient-tagger_annotations.json'))

In [20]:
for post, entities in TRAIN_DATA:
    if 'hearty zesty treat' in post:
        print(post)
        print("\n-------\n")
        print(entities)

In [22]:
TEST_DATA = convert_to_spacy_format(load_json_to_df('ingredient-tagger_annotations_2.json'))

In [28]:
test_set = []
for post, entities in TEST_DATA:
    if post not in [TRAIN_DATA[i][0] for i in range(len(TRAIN_DATA))]:
        test_set.append((post, entities))

In [30]:
len(test_set)

14

In [31]:
# Converting test_set to docbin spacy file

import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

def convert_to_spacy_docbin(dataset, file_location):
    
    nlp = spacy.blank("en") # load a new spacy model
    db = DocBin() # create a DocBin object

    for text, annot in tqdm(dataset):
        doc = nlp.make_doc(text) 
        ents = []
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print(f"Skipping entity: {(start,end,label)}")
            else:
                ents.append(span)
        print(ents)
        try:
            doc.ents = ents
        except:
            print(f"Error with document")
        db.add(doc)

    db.to_disk(file_location) # save the docbin object


convert_to_spacy_docbin(test_set, "./test.spacy")

100%|██████████████████████████████████████████| 14/14 [00:00<00:00, 104.51it/s]


Skipping entity: (136, 143, 'INGREDIENT')
Skipping entity: (103, 109, 'INGREDIENT')
Skipping entity: (122, 126, 'INGREDIENT')
Skipping entity: (70, 71, 'QUANTITY')
Skipping entity: (128, 134, 'INGREDIENT')
Skipping entity: (82, 83, 'QUANTITY')
[2, 45, ham, potatoes, 5, oil, potato, 1]
[red onion, cup, tomatoes, 1, 1 /2, 2, cup, grape, honey, lime juice, cup, 1/4, 1/2, black pepper, basil, 1, watermelon, 5, 1, tb, cup]
[basil, olive oil, basil, pasta, mozzarella, pasta, vinegar, chicken, tomatoes, sausage]
[cream, dash, water, cup, 1/2, 1, 1, chia seeds, 3, vanilla protein powder, cup, oats, vanilla, egg white, teaspoon, egg, 1, 5, banana, 1/2, tablespoon, 1]
Error with document
[ml, 1, 2, 2, 70, 10, 4, flour, 3, eggs, self-raising flour, 5, cup, cup, cup, oranges, teaspoon, 2, 6, sugar, 4, oil, eggs, baking powder, orange juice, 2, oil, cup, sugar, 1, 1, half, sugar, cup, orange juice, half, baking powder, 8, 1, flour]
Error with document
Skipping entity: (307, 309, 'QUANTITY')
[cream 

## Now time to evaluate the model

In [36]:
! python -m spacy evaluate ./model-best ./test.spacy --output ./evaluation_metrics.json

[38;5;4mℹ Using CPU[0m
[1m

TOK     100.00
NER P   85.21 
NER R   77.07 
NER F   80.94 
SPEED   12296 

[1m

                  P       R       F
INGREDIENT    78.85   68.72   73.43
QUANTITY      92.94   85.87   89.27
MEASUREMENT   93.02   93.02   93.02

[38;5;2m✔ Saved results to evaluation_metrics.json[0m


In [37]:
json.load(open("evaluation_metrics.json"))

{'token_acc': 1.0,
 'token_p': 1.0,
 'token_r': 1.0,
 'token_f': 1.0,
 'ents_p': 0.8521126761,
 'ents_r': 0.7707006369,
 'ents_f': 0.8093645485,
 'ents_per_type': {'INGREDIENT': {'p': 0.7884615385,
   'r': 0.687150838,
   'f': 0.7343283582},
  'QUANTITY': {'p': 0.9294117647, 'r': 0.8586956522, 'f': 0.8926553672},
  'MEASUREMENT': {'p': 0.9302325581, 'r': 0.9302325581, 'f': 0.9302325581}},
 'speed': 12295.7972191879}

In [42]:
! python -m spacy evaluate ./model-best ./test.spacy -dp ./

[38;5;4mℹ Using CPU[0m
[1m

TOK     100.00
NER P   85.21 
NER R   77.07 
NER F   80.94 
SPEED   12622 

[1m

                  P       R       F
INGREDIENT    78.85   68.72   73.43
QUANTITY      92.94   85.87   89.27
MEASUREMENT   93.02   93.02   93.02

[38;5;2m✔ Generated 25 parses as HTML[0m
.
