# Building NER model using annotated dataset

I have annotated ~40 post descriptions. I shall use these to train an NER tagger and analyse the results.

In [48]:
import json
import random
import pandas as pd
import numpy as np
import inflect

In [1]:
# Function to load in json file from LightTag
def load_json_to_df(file):
    
    results = json.load(open(file))
    
    # Create list of only those posts which have been annotated
    annotated = []
    for example in results['examples']:
        if example['annotations'] != []:
            annotated.append(example)
            
    return pd.DataFrame(annotated)


# Function to convert data into spacy format
def convert_to_spacy_format(df):
    
    TRAIN_DATA = []
    
    for index, row in df.iterrows():
        
        sentence = row['content']
        annotations_input_list = row['annotations']
        
        annotations_output_list = []
        ing_dict = {}
        
        for annotation in annotations_input_list:
            
            annotations_output_list.append((annotation['start'], annotation['end'], annotation['tag']))
        
        ing_dict['entities'] = annotations_output_list
        TRAIN_DATA.append((sentence, ing_dict))
        
    return TRAIN_DATA

In [5]:
TRAIN_DATA = convert_to_spacy_format(load_json_to_df('ingredient-tagger_annotations.json'))

In [8]:
# Splitting into train and dev set
random.seed(23)
random.shuffle(TRAIN_DATA)
split_integer = int(len(TRAIN_DATA)*0.8)
train_set = TRAIN_DATA[:split_integer]
dev_set = TRAIN_DATA[split_integer:]

In [14]:
# Converting TRAIN_DATA to docbin spacy file

import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

def convert_to_spacy_docbin(dataset, file_location):
    
    nlp = spacy.blank("en") # load a new spacy model
    db = DocBin() # create a DocBin object

    for text, annot in tqdm(dataset):
        doc = nlp.make_doc(text) 
        ents = []
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print(f"Skipping entity: {(start,end,label)}")
            else:
                ents.append(span)
        print(ents)
        try:
            doc.ents = ents
        except:
            print(f"Error with document")
        db.add(doc)

    db.to_disk(file_location) # save the docbin object


convert_to_spacy_docbin(train_set, "./train.spacy")
convert_to_spacy_docbin(dev_set, "./dev.spacy")

100%|██████████████████████████████████████████| 28/28 [00:00<00:00, 135.93it/s]

Skipping entity: (717, 718, 'QUANTITY')
Skipping entity: (728, 732, 'INGREDIENT')
Skipping entity: (786, 791, 'INGREDIENT')
Skipping entity: (715, 716, 'QUANTITY')
[5, tablespoon, 2, parmesan, teaspoon, black, 200, fennel seed, teaspoon, 3/4, 2, 3, 1, oil, 1, olive, fennel, 10, yeast, 25, clove, fennel, garlic, parmesan, teaspoon]
[1/2, 1, block, ricotta, cheese, cheese, salt, 6, container, milk, 2, pasta, 5, 20, boxes, noodles, roll, 2, garlic powder, cheese, 1, italian seasoning, mozzarella, sausage, sausage, 400, tablespoon, basil, 2, jars, 1/2, 4, basil, 20, 1/2, 7, 1, mozzarella, tomato, tablespoon, ricotta, 1, cheese, 3, tablespoon]
Skipping entity: (367, 368, 'QUANTITY')
Skipping entity: (492, 498, 'INGREDIENT')
[vanilla, teaspoon, sugar, 1/4, flour, 1/2, 1, custard, 1, cream, sugar, custard, tablespoon, 1/4, cup, 1, vanilla extract, 1/2, egg, cup, 1/2, tablespoon, 350, sugar, butter, 20, flour, custard, ginger, cinnamon, custard, egg, jam, 1, egg, cup, custard, cup, 4, 1, 2, cu


100%|█████████████████████████████████████████████| 8/8 [00:00<00:00, 81.20it/s]

[863, feta, tablespoon, feta, cheese, orzo, 200, oregano, 1, 2, salt, feta, 1, teaspoon, garlic, onion, 2, 1, tomato, garlic, 30, feta, tomatoes, cheese, pasta, gram, pepper, clove, tomato, herbs, 4, onions, 4, gram, teaspoon, 863, orzo, pasta, 8, tomatoes, chorizo, cheese, chorizo, 1, cheese, basil]
Skipping entity: (441, 446, 'INGREDIENT')
Skipping entity: (328, 333, 'INGREDIENT')
Skipping entity: (917, 924, 'INGREDIENT')
Skipping entity: (384, 385, 'QUANTITY')
Skipping entity: (427, 428, 'QUANTITY')
Skipping entity: (316, 317, 'QUANTITY')
Skipping entity: (347, 351, 'INGREDIENT')
Skipping entity: (1014, 1019, 'INGREDIENT')
Skipping entity: (382, 383, 'QUANTITY')
Skipping entity: (526, 536, 'MEASUREMENT')
Skipping entity: (419, 424, 'INGREDIENT')
Skipping entity: (429, 430, 'QUANTITY')
[flour, cup, eggs, teaspoon, 2, flax, teaspoon, cup, 6, tablespoon, oreos, bi carb, flax, water, cup, eggs, butter, 1, cup, 3, oreo, cup, 1, 180, tablespoon, oreos, oreo, eggs, 1, flaxseed, 1, tablespo




## Running the model

In [21]:
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [22]:
! python -m spacy train config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./dev.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[2022-08-11 11:08:34,702] [INFO] Set up nlp object from config
[2022-08-11 11:08:34,721] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-08-11 11:08:34,736] [INFO] Created vocabulary
[2022-08-11 11:08:34,737] [INFO] Finished initializing nlp object
[2022-08-11 11:08:35,757] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     62.21    1.18    3.45    0.71    0.01
  7     200         97.46   3599.58   80.74   84.17   77.58    0.81
 14     400        112.80    231.63   84.09   89.88   79.00    0.84
 21     600        133.64    119.79   81.83   84.47   79.36    0.82
 28     800         87.56     55.46   84.07   87.64   80.78    

In [23]:
nlp_ner = spacy.load("./model-best")

In [41]:
posts = pd.read_csv('preprocessed_descriptions.csv')

colors = {
    "MEASUREMENT": "#bfeeb7",
    "INGREDIENT": "#feca74",
    "QUANTITY": "#e4e7d2"
}
options = {"ents": list(colors), "colors": colors}

for post in posts['description_preprocessed'][121:130]:
    doc = nlp_ner(post)
    spacy.displacy.render(doc, style="ent", options=options, jupyter=True)
    print("\n------------------------------------------------\n")


------------------------------------------------




------------------------------------------------




------------------------------------------------




------------------------------------------------




------------------------------------------------




------------------------------------------------




------------------------------------------------




------------------------------------------------




------------------------------------------------



## Determining unique ingredients in each post

In [57]:
def get_unique_ingredients(text, model):
    
    nlp = spacy.load(model)
    doc = nlp(text)
    
    ingredients = []
    ingredients_singularized = []
    
    for entity in doc.ents:
        if entity.label_ == 'INGREDIENT':
            ingredients.append(entity.text)

    ingredients = np.unique(ingredients)
    
    # Singularizing ingredients using inflect python package
    p = inflect.engine()
    for word in ingredients:
        if p.singular_noun(word):
            ingredients_singularized.append(p.singular_noun(word))
        else:
            ingredients_singularized.append(word)
            
    return np.unique(ingredients_singularized)

In [62]:
# Adding list of unique ingredients column to dataframe
posts['unique_ingredients'] = posts['description_preprocessed'].apply(get_unique_ingredients)

In [64]:
def include_unique_ingredients(df, model):
    
    nlp = spacy.load(model)
    
    df['unique_ingredients'] = df['description_preprocessed'].apply(get_unique_ingredients)
    
    return df

In [67]:
include_unique_ingredients(pd.read_csv('preprocessed_descriptions.csv'), "./model-best")

Unnamed: 0.1,Unnamed: 0,description_preprocessed,unique_ingredients
0,81,chocolate fudge protein oatmeal \ningredients:...,"[banana, milk, oat, peanut butter, sugar, water]"
1,104,⁣caprese chicken with pesto ⠀\n⠀\nthis dish is...,"[basil, breast, cheese, chicken, mozarella, oi..."
2,169,looking for a fast recipe to make for lunch? t...,"[ginger, hemp seed, oil, olive, platter, water]"
3,172,sweet & savoury mushroom scones~ this easy and...,"[baking powder, butter, cheese, cream, egg, fl..."
4,226,paneer tikka recipe\nsave it to try later\n\ni...,"[black pepper powder, butter, carom seed, cumi..."
...,...,...,...
328,3208,prawn taco bowl\n—————————————————————-\nperfe...,"[avocado, cheese, chilli, coriander, cucumber,..."
329,3389,"salmon, butternut squash, sweet potato and kal...","[black, garlic powder, oil, paprika, pepper, p..."
330,3463,are you feeling lazy? we have just the recipe ...,"[black pepper, bread, butter, cheese, garlic, ..."
331,3556,servings: 1 \n⠀ ⠀\ningredients:\n⠀\n1/4 cup sh...,"[black pepper, cheddar, cheese, egg, flour]"
