# Preprocess datasets and create DocBins for training a spaCy NER model

In this notebook, I read the datasets (in CSV format) that were created in **create_dataset_for_NER.ipynb** and annotated manually afterwards. After reading them, I create the columns that are needed for the data to be processed by a spaCy pipeline, with the purpose of recognising whether a given ingredient is considered a "main ingredient", based on the specifications given in **create_dataset_for_NER.ipynb**. The final products are two `.spacy` DocBin files that can be used as input for training my spaCy NER model.

In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

import spacy
from spacy.tokens import Span, DocBin

In [8]:
# Project root: the parent of the (resolved) current working directory.
BASE_DIR = Path.cwd().resolve().parent

def data_path(folder, file_name):
    """Return the path ``BASE_DIR/data/<folder>/<file_name>``.

    Parameters
    ----------
    folder : name of the sub-folder of ``data`` (e.g. ``"interim"``).
    file_name : name of the file inside that sub-folder.

    Returns
    -------
    pathlib.Path built on top of the module-level ``BASE_DIR``.
    """
    relative_location = "data/{}/{}".format(folder, file_name)
    return Path(BASE_DIR).joinpath(relative_location)

def create_doc_columns(df, label='FOOD'):
    """Add the columns needed to build spaCy NER examples from annotated rows.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : DataFrame with an ``ingredients`` column (raw ingredient text) and a
        ``main_ingredient`` column (the annotated sub-string; empty or missing
        when the row has no annotation).
    label : entity label given to every created span.

    Returns
    -------
    The same DataFrame with these columns set:

    * ``ingredients`` / ``main_ingredient`` - lower-cased text, missing values
      replaced by ``''``; the annotation is additionally stripped of
      surrounding whitespace.
    * ``start_idx`` - result of ``str.find`` of the annotation in the text
      (``-1`` when not found, ``0`` for an empty annotation).
    * ``end_idx`` - ``start_idx`` plus the length of the annotation.
    * ``doc`` - the text tokenized by a blank English pipeline.
    * ``span`` - ``doc.char_span(start_idx, end_idx, label=label)`` for rows
      with a non-empty annotation that was found, otherwise ``None``.
      ``char_span`` itself may also return ``None`` (per the spaCy docs, when
      the offsets do not line up with token boundaries).
    """
    nlp = spacy.blank('en')

    # A row without the ';' separator (or an empty cell) yields None/NaN;
    # normalise those to '' so the string operations below cannot raise.
    for column in ('ingredients', 'main_ingredient'):
        df[column] = df[column].fillna('').astype(str).str.lower()
    # Stray whitespace around a manual annotation would make `find` miss it.
    df['main_ingredient'] = df['main_ingredient'].str.strip()

    df['start_idx'] = [
        text.find(target)
        for text, target in zip(df['ingredients'], df['main_ingredient'])
    ]
    df['end_idx'] = df['start_idx'] + df['main_ingredient'].str.len()
    df['doc'] = df['ingredients'].map(nlp)

    # `''.find('')`-style lookups return 0, so `start > -1` alone does not
    # identify unannotated rows; check for an empty annotation explicitly.
    spans = [
        doc.char_span(start, end, label=label) if target and start > -1 else None
        for doc, start, end, target in zip(
            df['doc'], df['start_idx'], df['end_idx'], df['main_ingredient']
        )
    ]
    # dtype=object keeps each Span (a sequence-like object) as a single cell.
    df['span'] = pd.Series(spans, index=df.index, dtype=object)

    return df

def create_docbin_with_data(df):
    """Collect the docs of ``df`` into a ``DocBin``, attaching their entities.

    Parameters
    ----------
    df : DataFrame with a ``doc`` column and a ``span`` column (``None`` when
        the row has no entity).

    Returns
    -------
    DocBin containing every doc of ``df['doc']`` in row order. Each doc is
    modified: its entities are set to the row's span, or cleared when the
    span is ``None``.
    """
    doc_bin = DocBin()

    for doc, span in zip(df['doc'], df['span']):
        if span is None:
            # No annotation for this row: make sure the doc carries no entities.
            doc.ents = []
        else:
            doc.set_ents([span], default="unmodified")
        doc_bin.add(doc)

    return doc_bin


### Create the train dataset

In [12]:
# Load the manually annotated training rows.
# NOTE(review): the frame loads as a single 'ingredients;main_ingredient'
# column, so the file looks ';'-delimited rather than tab-delimited - confirm.
# The following cell splits that column by hand.
ingredients_rows_df = pd.read_csv(
    data_path("interim", "selected_ingrendiens_NER.csv"),
    sep='\t',
    encoding='latin-1',
)
ingredients_rows_df

Unnamed: 0,ingredients;main_ingredient
0,ROSEMARY-CORNMEAL CRUST:;
1,3/4 cup all-purpose flour;
2,1/2 cup plain white cornmeal;
3,1/4 cup powdered sugar;
4,2 teaspoons chopped fresh rosemary;
...,...
1966,"coriander, leaves and stalks separated;coriander"
1967,1 tbs celery seeds;
1968,celery seeds;
1969,"fennel seeds, toasted;"


In [13]:
# Split the single 'ingredients;main_ingredient' column into its two fields.
# rsplit(n=1) cuts only at the LAST ';' (the annotation separator), so an
# ingredient text that itself contains ';' stays intact instead of being
# truncated and spilling into extra unnamed columns.
ingredients_ner_df = pd.concat(
    [
        ingredients_rows_df.drop(columns=['ingredients;main_ingredient']),
        ingredients_rows_df['ingredients;main_ingredient'].str.rsplit(';', n=1, expand=True),
    ],
    axis=1,
).rename(columns={0: 'ingredients', 1: 'main_ingredient'})

In [5]:
# Preview the first rows after the column split.
ingredients_ner_df.head(n=5)

Unnamed: 0,ingredients,main_ingredient
0,ROSEMARY-CORNMEAL CRUST:,
1,3/4 cup all-purpose flour,
2,1/2 cup plain white cornmeal,
3,1/4 cup powdered sugar,
4,2 teaspoons chopped fresh rosemary,


In [6]:
# Add the offset, doc and span columns (default label 'FOOD').
ingredients_ner_df = ingredients_ner_df.pipe(create_doc_columns)

In [7]:
# Preview the first rows with the added doc/span columns.
ingredients_ner_df.head(n=5)

Unnamed: 0,ingredients,main_ingredient,start_idx,end_idx,doc,span
0,rosemary-cornmeal crust:,,0,0,"(rosemary, -, cornmeal, crust, :)",
1,3/4 cup all-purpose flour,,0,0,"(3/4, cup, all, -, purpose, flour)",
2,1/2 cup plain white cornmeal,,0,0,"(1/2, cup, plain, white, cornmeal)",
3,1/4 cup powdered sugar,,0,0,"(1/4, cup, powdered, sugar)",
4,2 teaspoons chopped fresh rosemary,,0,0,"(2, teaspoons, chopped, fresh, rosemary)",


In [10]:
# Pack the training docs (with their entities) into a DocBin.
db = create_docbin_with_data(df=ingredients_ner_df)

In [11]:
# Report how many docs were collected, then write the training DocBin
# to the current working directory.
print(len(db))
db.to_disk("train.spacy")

1971


### Create the evaluation dataset

In [14]:
# Read the annotated dev dataset (same location helper as the train dataset).
# NOTE(review): the displayed rows contain mojibake such as 'Â½', which
# suggests a UTF-8 file being decoded as latin-1 - confirm the file's real
# encoding before changing it here.
dev_set_df = pd.read_csv(data_path("interim", "dev_set.csv"), encoding='latin-1', sep='\t')
dev_set_df

Unnamed: 0,ingredients;main_ingredient
0,"large sweet potato (250g), cut into 2Â½cm-thi..."
1,olive oil;
2,salt and black pepper;
3,"garlic cloves, peeled and thinly sliced;"
4,tomato passata;
...,...
267,cumin seedsÂ ;
268,"red chillies, finely sliced on an angle (dese..."
269,"spring onions, finely sliced on an angle;"
270,"coriander, leaves and stalks separated;coriander"


In [15]:
# Split the single 'ingredients;main_ingredient' column into its two fields.
# rsplit(n=1) cuts only at the LAST ';' (the annotation separator), so an
# ingredient text that itself contains ';' stays intact instead of being
# truncated and spilling into extra unnamed columns.
dev_set_df = pd.concat(
    [
        dev_set_df.drop(columns=['ingredients;main_ingredient']),
        dev_set_df['ingredients;main_ingredient'].str.rsplit(';', n=1, expand=True),
    ],
    axis=1,
).rename(columns={0: 'ingredients', 1: 'main_ingredient'})

In [16]:
# Add the offset, doc and span columns to the dev rows, then preview them.
dev_set_df = dev_set_df.pipe(create_doc_columns)
dev_set_df.head(n=5)

Unnamed: 0,ingredients,main_ingredient,start_idx,end_idx,doc,span
0,"large sweet potato (250g), cut into 2â½cm-thi...",sweet potato,7,19,"( , large, sweet, potato, (, 250, g, ), ,, cut...","(sweet, potato)"
1,olive oil,,0,0,"( , olive, oil)",
2,salt and black pepper,,0,0,"( , salt, and, black, pepper)",
3,"garlic cloves, peeled and thinly sliced",,0,0,"( , garlic, cloves, ,, peeled, and, thinly, sl...",
4,tomato passata,,0,0,"( , tomato, passata)",


In [17]:
# Pack the dev docs (with their entities) into a DocBin.
db = create_docbin_with_data(df=dev_set_df)

In [18]:
# Report how many docs were collected, then write the evaluation DocBin
# to the current working directory.
print(len(db))
db.to_disk("eval.spacy")

272
