# Preprocess datasets and create docbins for training of spacy NER model

In this notebook, I read the datasets (in csv format) that were created in **create_dataset_for_NER.ipynb**. These datasets were annotated manually after their creation. After reading them, I create columns that are needed for the data to be processed by a SpaCy pipeline, with the purpose to recognise if a given ingredient is considered a "main ingredient", based on the specifications given in **create_dataset_for_NER.ipynb**. The final product are to .spacy docbin files, that can be used as input for training my NER Spacy model.

In [39]:
from pathlib import Path

import pandas as pd
import numpy as np

from spacy.tokens import Span, DocBin
import spacy

In [40]:
BASE_DIR = Path().resolve().parent

def data_path(folder, file_name):
    return Path(BASE_DIR) / f"data/{folder}/{file_name}"

def create_doc_columns(df, label='FOOD'):
    nlp = spacy.blank('en')

    df['ingredients'] = df['ingredients'].map(lambda x: x.lower())
    df['main_ingredient'] = df['main_ingredient'].map(lambda x: x.lower())
    df['start_idx'] = df.apply(lambda x: x.ingredients.find(x.main_ingredient), axis=1)
    df['end_idx'] = df.apply(lambda x: x.start_idx + len(x.main_ingredient), axis=1)
    df['doc'] = df.ingredients.map(lambda x: nlp(x))
    df['span'] = df.apply(lambda x: x.doc.char_span(x.start_idx, x.end_idx, label=label) if x.start_idx>-1 else None, axis=1)

    return df

def create_docbin_with_data(df):
    db = DocBin()

    for i in range(df.shape[0]):
        if df['span'].iloc[i] is not None:
            df['doc'].iloc[i].set_ents([df['span'].iloc[i]], default="unmodified")
        else:
            df['doc'].iloc[i].ents = []
        
        db.add(df['doc'].iloc[i])

    return db


### Create the train dataset

In [41]:
# Read train dataset
ingredients_rows_df = pd.read_csv(data_path("interim", "selected_ingrendiens_NER_large.csv"), encoding='latin-1', sep='\t')
ingredients_rows_df

Unnamed: 0,ingredients;main_ingredient
0,"1 large red onion, halved lengthways, thinly s..."
1,2 small green jalapenos;
2,2/3 cup rice vinegar;
3,1 tablespoon lime juice;
4,1 heaped teaspoon sea salt;
...,...
1630,3 T olive oil;
1631,2 T balsamic vinegar;
1632,1 t salt;
1633,2 t black pepper;


In [42]:
# Correct the split of the dataset 
ingredients_ner_df = pd.concat([ingredients_rows_df, ingredients_rows_df['ingredients;main_ingredient'].str.split(';', expand=True)], axis=1).drop(['ingredients;main_ingredient'], axis=1)
ingredients_ner_df.rename(columns = {0: 'ingredients', 1: 'main_ingredient'}, inplace=True)

In [43]:
ingredients_ner_df.head(20)

Unnamed: 0,ingredients,main_ingredient
0,"1 large red onion, halved lengthways, thinly s...",
1,2 small green jalapenos,
2,2/3 cup rice vinegar,
3,1 tablespoon lime juice,
4,1 heaped teaspoon sea salt,
5,1/2 cup mayonnaise,
6,1/2 cup sour cream,
7,"2 teaspoons lime juice, plus extra to taste",
8,1 teaspoon finely grated lime zest,
9,Pinch of sea salt,


In [44]:
ingredients_ner_df = create_doc_columns(ingredients_ner_df)

In [45]:
ingredients_ner_df.head(20)

Unnamed: 0,ingredients,main_ingredient,start_idx,end_idx,doc,span
0,"1 large red onion, halved lengthways, thinly s...",,0,0,"(1, large, red, onion, ,, halved, lengthways, ...",
1,2 small green jalapenos,,0,0,"(2, small, green, jalapenos)",
2,2/3 cup rice vinegar,,0,0,"(2/3, cup, rice, vinegar)",
3,1 tablespoon lime juice,,0,0,"(1, tablespoon, lime, juice)",
4,1 heaped teaspoon sea salt,,0,0,"(1, heaped, teaspoon, sea, salt)",
5,1/2 cup mayonnaise,,0,0,"(1/2, cup, mayonnaise)",
6,1/2 cup sour cream,,0,0,"(1/2, cup, sour, cream)",
7,"2 teaspoons lime juice, plus extra to taste",,0,0,"(2, teaspoons, lime, juice, ,, plus, extra, to...",
8,1 teaspoon finely grated lime zest,,0,0,"(1, teaspoon, finely, grated, lime, zest)",
9,pinch of sea salt,,0,0,"(pinch, of, sea, salt)",


Now process the ottolenghi data

In [46]:
ingredients_rows_ottolenghi_df = pd.read_csv(data_path("interim", "ottolenghi_train_set.csv"), encoding='latin-1', sep='\t')
ingredients_rows_ottolenghi_df

Unnamed: 0,ingredients;main_ingredient
0,unsalted butter;
1,"shallot, finely diced (60g);"
2,"garlic cloves, crushed;"
3,"rainbow chard, stems and leaves separated, st..."
4,flour;
...,...
278,lemon juice;
279,rice (or white) wine vinegar ;
280,light soy sauce;
281,mirin;


In [47]:
# Correct the split of the dataset 
ingredients_rows_ottolenghi_df = pd.concat([ingredients_rows_ottolenghi_df, ingredients_rows_ottolenghi_df['ingredients;main_ingredient'].str.split(';', expand=True)], axis=1).drop(['ingredients;main_ingredient'], axis=1)
ingredients_rows_ottolenghi_df.rename(columns = {0: 'ingredients', 1: 'main_ingredient'}, inplace=True)
ingredients_rows_ottolenghi_df.head()

Unnamed: 0,ingredients,main_ingredient
0,unsalted butter,
1,"shallot, finely diced (60g)",
2,"garlic cloves, crushed",
3,"rainbow chard, stems and leaves separated, st...",rainbow chard
4,flour,


In [48]:
ingredients_rows_ottolenghi_df = create_doc_columns(ingredients_rows_ottolenghi_df)
ingredients_rows_ottolenghi_df.head()

Unnamed: 0,ingredients,main_ingredient,start_idx,end_idx,doc,span
0,unsalted butter,,0,0,"( , unsalted, butter)",
1,"shallot, finely diced (60g)",,0,0,"( , shallot, ,, finely, diced, (, 60, g, ))",
2,"garlic cloves, crushed",,0,0,"( , garlic, cloves, ,, crushed)",
3,"rainbow chard, stems and leaves separated, st...",rainbow chard,1,14,"( , rainbow, chard, ,, stems, and, leaves, sep...","(rainbow, chard)"
4,flour,,0,0,"( , flour)",


Merge the two datasets

In [49]:
train_data_df = pd.concat([ingredients_ner_df, ingredients_rows_ottolenghi_df], ignore_index=True)
train_data_df.head()

Unnamed: 0,ingredients,main_ingredient,start_idx,end_idx,doc,span
0,"1 large red onion, halved lengthways, thinly s...",,0,0,"(1, large, red, onion, ,, halved, lengthways, ...",
1,2 small green jalapenos,,0,0,"(2, small, green, jalapenos)",
2,2/3 cup rice vinegar,,0,0,"(2/3, cup, rice, vinegar)",
3,1 tablespoon lime juice,,0,0,"(1, tablespoon, lime, juice)",
4,1 heaped teaspoon sea salt,,0,0,"(1, heaped, teaspoon, sea, salt)",


In [50]:
train_data_df.shape

(1918, 6)

In [51]:
# Save the train set

train_data_df.to_csv(data_path('interim', 'train_set.csv'), columns=['ingredients', 'main_ingredient'], index=False, sep=';')

Create the train docbin

In [52]:
db = create_docbin_with_data(train_data_df)

In [58]:
print(db.__len__())
db.to_disk("train_large_ottolenghi.spacy") 

272


### Create the evaluation dataset

In [53]:
# Read the annotated dev dataset

dev_set_df =pd.read_csv(Path(BASE_DIR) / f"data/interim/dev_set.csv", encoding='latin-1', sep='\t')
dev_set_df

Unnamed: 0,ingredients;main_ingredient
0,"large sweet potato (250g), cut into 2Â½cm-thi..."
1,olive oil;
2,salt and black pepper;
3,"garlic cloves, peeled and thinly sliced;"
4,tomato passata;tomato
...,...
267,cumin seedsÂ ;
268,"red chillies, finely sliced on an angle (dese..."
269,"spring onions, finely sliced on an angle;"
270,"coriander, leaves and stalks separated;"


In [54]:
dev_set_df = pd.concat([dev_set_df, dev_set_df['ingredients;main_ingredient'].str.split(';', expand=True)], axis=1).drop(['ingredients;main_ingredient'], axis=1)
dev_set_df.rename(columns = {0: 'ingredients', 1: 'main_ingredient'}, inplace=True)

dev_set_df.head()

Unnamed: 0,ingredients,main_ingredient
0,"large sweet potato (250g), cut into 2Â½cm-thi...",sweet potato
1,olive oil,
2,salt and black pepper,
3,"garlic cloves, peeled and thinly sliced",
4,tomato passata,tomato


In [55]:
dev_set_df = create_doc_columns(dev_set_df)
dev_set_df.head()

Unnamed: 0,ingredients,main_ingredient,start_idx,end_idx,doc,span
0,"large sweet potato (250g), cut into 2â½cm-thi...",sweet potato,7,19,"( , large, sweet, potato, (, 250, g, ), ,, cut...","(sweet, potato)"
1,olive oil,,0,0,"( , olive, oil)",
2,salt and black pepper,,0,0,"( , salt, and, black, pepper)",
3,"garlic cloves, peeled and thinly sliced",,0,0,"( , garlic, cloves, ,, peeled, and, thinly, sl...",
4,tomato passata,tomato,1,7,"( , tomato, passata)",(tomato)


In [56]:
db = create_docbin_with_data(dev_set_df)

In [57]:
print(db.__len__())
db.to_disk("eval.spacy") 

272
