# NERfR (Named Entity Recognition for Recipes)

In [1]:
# Installations and Imports
import spacy
import sys
from tasteset_utils import prepare_data, ENTITIES
import json
# json_path = '/home/pgajo/working/food/data/TASTEset/data/TASTEset_semicolon_formatted_en-it_unaligned_aligned_model=mdeberta-v3-base-xl-wa_recipe_aligner_5epochs_error_rate=0.0119_pruned.json'
json_path = '/home/pgajo/working/food/data/TASTEset/data/TASTEset_semicolon_formatted_en-it_itemwise.json'

# Read the annotations explicitly as UTF-8: the dataset includes Italian text,
# which may contain non-ASCII characters, and the platform default encoding is
# not guaranteed to be UTF-8.
with open(json_path, 'r', encoding='utf-8') as f:
    training_data = json.load(f)
# Show the first annotated example as a quick check of the data layout.
print(training_data['annotations'][0])

  from .autonotebook import tqdm as notebook_tqdm


{'text_en': '5 ounces rum;4 ounces triple sec;3 ounces Tia Maria;20 ounces orange juice', 'entities_en': [[0, 1, 'QUANTITY'], [2, 8, 'UNIT'], [9, 12, 'FOOD'], [13, 14, 'QUANTITY'], [15, 21, 'UNIT'], [22, 32, 'FOOD'], [33, 34, 'QUANTITY'], [35, 41, 'UNIT'], [42, 51, 'FOOD'], [52, 54, 'QUANTITY'], [55, 61, 'UNIT'], [62, 74, 'FOOD']], 'text_it': "5 once rum;4 once triple sec;3 once Tia Maria;20 once succo d'arancia", 'entities_it': [[0, 1, 'QUANTITY'], [2, 6, 'UNIT'], [7, 10, 'FOOD'], [11, 12, 'QUANTITY'], [13, 17, 'UNIT'], [18, 28, 'FOOD'], [29, 30, 'QUANTITY'], [31, 35, 'UNIT'], [36, 45, 'FOOD'], [46, 48, 'QUANTITY'], [49, 53, 'UNIT'], [54, 69, 'FOOD']]}


In [None]:
# make spacy dataset
from spacy.tokens import DocBin
import os
import re

def doc_from_annotations(training_annotations, languages=('en',)):
  """
  Build a spaCy DocBin with gold entity spans from the recipe annotations.

  Args:
    training_annotations: sequence of dicts holding, for every language code
      `lang` in `languages`, a `text_{lang}` string and an `entities_{lang}`
      list of `[start_char, end_char, label]` entries.
    languages: language codes to process. A blank spaCy pipeline is created
      per code and every example is added once per language.

  Returns:
    A DocBin with one Doc per (language, example) pair. Entities whose
    character offsets do not line up exactly with token boundaries are left
    out; how many were dropped is printed at the end.
  """
  # Compiled once instead of on every example.
  hyphen_before_number = re.compile(r'-(\d+)')
  num_of_entities = 0
  num_skipped = 0
  doc_bin = DocBin()
  for lang in languages:
    nlp = spacy.blank(lang)
    text_key = f'text_{lang}'
    ents_key = f'entities_{lang}'
    for idx, example in enumerate(training_annotations):
      # Both substitutions swap one character for one character, so the
      # annotated character offsets remain valid for the cleaned text.
      text = hyphen_before_number.sub(r' \1', example[text_key].replace(';', ' '))
      doc = nlp.make_doc(text)
      print(idx, 'doc:', doc)
      ents = []
      for entity in example[ents_key]:
        # 'strict' yields None when the offsets fall inside a token.
        span = doc.char_span(*entity, alignment_mode='strict')
        if span is None:
          # Skip it, but keep count so the data loss is visible.
          num_skipped += 1
          continue
        ents.append(span)
        num_of_entities += 1

      doc.ents = ents
      doc_bin.add(doc)

  print('num_of_entities:', num_of_entities)
  if num_skipped:
    print('num_skipped_entities:', num_skipped)
  return doc_bin

# 80/20 split, taken in the order of the annotation list (no shuffling).
train_len = int(0.8*len(training_data['annotations'])) # 80/20 split
languages = ['it']
# Joined language codes are used as the file-name prefix of the saved corpora.
lang_id = '-'.join(languages)
train_bin = doc_from_annotations(training_data['annotations'][:train_len], languages = languages)
print('train_bin length:', len(train_bin))
dev_bin = doc_from_annotations(training_data['annotations'][train_len:], languages = languages)
print('dev_bin length:', len(dev_bin))
spacy_dir = '/home/pgajo/working/food/data/TASTEset/data/spacy'
# Make sure the target directory exists before serializing into it.
os.makedirs(spacy_dir, exist_ok=True)
train_path = os.path.join(spacy_dir, f"{lang_id}_train.spacy")
dev_path = os.path.join(spacy_dir, f"{lang_id}_dev.spacy")
train_bin.to_disk(train_path)
dev_bin.to_disk(dev_path)

In [None]:
# Transformer config
# Name of the transformer model; it is substituted for the `model_name`
# placeholder inside the template below.
model_name = 'bert-base-multilingual-cased'
# Partial spaCy training config (transformer + NER pipeline), kept as a
# plain-text template. The bare words `train_path`, `dev_path` and `model_name`
# inside it are placeholders that are swapped for the real values with
# `str.replace` right after this definition.
# NOTE(review): `lang = "it"` is hard-coded in the template and does not follow
# the `languages` list used to build the corpora - confirm before training on
# another language.
BASE_CONFIG_TRANFORMER = """
# This is an auto-generated partial config. To use it with 'spacy train'
# you can run spacy init fill-config to auto-fill all default settings:
# python -m spacy init fill-config ./base_config.cfg ./config.cfg
[paths]
train = train_path
dev = dev_path
vectors = null
[system]
gpu_allocator = "pytorch"

[nlp]
lang = "it"
pipeline = ["transformer","ner"]
batch_size = 128

[components]

[components.transformer]
factory = "transformer"

[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v3"
name = "model_name"
# name = "microsoft/mdeberta-v3-base"
tokenizer_config = {"use_fast": true}

[components.transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 128
stride = 96

[components.ner]
factory = "ner"

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = false
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0

[components.ner.model.tok2vec.pooling]
@layers = "reduce_mean.v1"

[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0

[training]
accumulate_gradient = 3
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"

[training.optimizer]
@optimizers = "Adam.v1"

[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 5000
initial_rate = 2e-5

[training.batcher]
@batchers = "spacy.batch_by_padded.v1"
discard_oversize = true
size = 2000
buffer = 256

[initialize]
vectors = ${paths.vectors}"""
# Swap the template placeholders for the actual model name and corpus paths.
BASE_CONFIG_TRANFORMER = (
    BASE_CONFIG_TRANFORMER
    .replace('model_name', model_name)
    .replace('train_path', train_path)
    .replace('dev_path', dev_path)
)
print(BASE_CONFIG_TRANFORMER)
# Model ids may contain '/' (e.g. "microsoft/mdeberta-v3-base"); left as-is it
# would be read as a directory separator in the config file name.
base_config_path = f"{model_name.replace('/', '_')}.cfg"
with open(base_config_path, 'w', encoding='utf-8') as f:
  f.write(BASE_CONFIG_TRANFORMER)

In [None]:
# This command fills in your config with from the base_config you generated. The
# last argument is the name of your config. I used "_eff" for "efficiency". Feel
# free to change that
# !python -m spacy init fill-config mbert.cfg config_mbert.cfg
model_config_path = f"config_{model_name}.cfg"
!python -m spacy init fill-config "$base_config_path" "$model_config_path"

## Training

Run the following code to train! Note that you'll have to change the path and name of the `.cfg` file as necessary. The last argument is a folder that'll contain your pipeline. Feel free to prefix it with a path to a more useful location. Also have some fun with the name!

You'll get periodic updates with the `loss`, `F1`, `precision`, `recall` for the NER model over time. They also give you a `SCORE`, which is helpful when training multiple components, but in our case, the `SCORE` is just the `F1` score for the NER model.

In [None]:
# Free-form tag appended to the output folder name to tell runs apart.
suffix = 'item-wise'
# The output folder name combines the model name, the language id and the tag.
output_path = f"output_{model_name}_{lang_id}_{suffix}"
# Train with the filled-in config on GPU 0 (`-g 0`), writing the pipeline to `output_path`.
!python -m spacy train "$model_config_path" --output "$output_path" -g 0

## Results

The training outputs a `meta.json` file in the `model-best` subfolder of the output folder (`output_path` in our case). We can use this to check a number of metrics, including the performance of each entity class.

In [None]:
import json
import pandas as pd

# grab the performance dict from within the meta file
meta_path = f"{output_path}/model-best/meta.json"
print(meta_path)
# Use a context manager so the file handle is closed after reading.
with open(meta_path, 'r', encoding='utf-8') as meta_file:
    performance = json.load(meta_file)['performance']
performance_by_ent = performance['ents_per_type']

# One column per entity type, plus the overall scores as a TOTAL column.
perf_df = pd.DataFrame(performance_by_ent)
perf_df["TOTAL"] = [performance['ents_p'], performance['ents_r'], performance['ents_f']]
# sort by header
perf_df = perf_df.reindex(sorted(perf_df.columns), axis=1)

# display df with the cell color corresponding to the value (dark=high; light=low)
# `vmin`/`vmax` pin the colormap to actual score values. `low`/`high` are not
# value bounds in pandas; they are fractions by which the color range is padded.
perf_df.style.background_gradient(
    axis=1, vmin=perf_df.min().min(), vmax=1, cmap='YlOrBr'
    )

Here we've got the precision (p), recall (r), and F1 (f) score by entity. It seems like the best performing entities are the ones we care the most about. Only 40% of *PART* entities are being turned up. I can live with that.

## Getting the Confusion Matrix

We're going to be plotting a [confusion matrix](https://en.wikipedia.org/wiki/Confusion_matrix) on the same dev set that was used for evaluation during training. At a high level, this entails running each sample through the trained model, and, for each token, storing the entity the model predicted for that token, as well as the ground truth entity (as labeled by the dataset authors).

In [None]:
import spacy
# load the model and test set. Again, change the paths as required
# `model-best` is read from the training output folder.
nlp = spacy.load(f"{output_path}/model-best")
# Deserialize the dev split saved at `dev_path` into Doc objects that use the
# loaded pipeline's vocab.
test_set = list(DocBin().from_disk(dev_path).get_docs(nlp.vocab))

In [None]:
# Flat, token-aligned label lists: one entry per token over the whole test set.
pred_ents = []
true_ents = []

for recipe in test_set:
  # Gold labels come straight from the stored Doc. Reading `ent_type_` token by
  # token (rather than iterating over entity spans) keeps both lists the same
  # length.
  true_ents.extend(tok.ent_type_ for tok in recipe)
  # Predictions: feed the raw text back through the pipeline, since `recipe`
  # itself already carries the gold entity labels.
  pred_ents.extend(tok.ent_type_ for tok in nlp(recipe.text))

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# create and display the confusion matrix
# Rows are the gold labels, columns the predicted labels, both in the order
# given by ENTITIES.
# NOTE(review): tokens outside any entity have an empty `ent_type_`; unless ''
# is part of ENTITIES they are presumably left out of the matrix - confirm.
cm = confusion_matrix(true_ents, pred_ents, labels=ENTITIES)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=ENTITIES)

disp.plot()
# Rotate the x tick labels so long entity names do not overlap.
plt.xticks(rotation=70)
plt.show()

Unfortunately, there isn't quite enough data for the color mapping to show fine-grained differences. Nonetheless, we can get a sense for the more common mislabelings.

## Inference

Let's test this baby out!

You'll notice I load the recipes as lists of ingredient lines and run each ingredient line through the pipeline separately, even though we trained the models on recipes.
I found it performed better this way, despite the formatting mismatch.

In [None]:
!pip install recipe-scrapers

In [None]:
from recipe_scrapers import scrape_me

# Recipe page to run inference on; the commented line is an alternative URL.
# RECIPE_URL = "https://ricette.giallozafferano.it/Trota-salmonata-in-crosta-di-pistacchi.html"
RECIPE_URL = "https://cucchiaio.it/ricetta/torta-con-farina-di-mandorle/"
scraper = scrape_me(RECIPE_URL)
# Last expression of the cell, so the scraped ingredient lines are displayed.
scraper.ingredients()

### Text Pre-Processing Function

Note the float representations of ingredient quantities, in spite of the fact that the website shows them in mixed numbers.

In [None]:
from fractions import Fraction
import re


def fraction_to_mixed_number(fraction: Fraction) -> str:
  """
  Format a Fraction as a mixed number, e.g. Fraction(3, 2) -> "1 1/2".

  Proper fractions are returned as "n/d" and whole values as a plain integer.
  Improper fractions become "<whole> <remainder>/<denominator>". Negative
  values carry a single leading minus sign, e.g. Fraction(-3, 2) -> "-1 1/2".
  """
  sign = "-" if fraction < 0 else ""
  # Work on the magnitude so negative improper fractions are split the same
  # way as positive ones.
  whole, remainder = divmod(abs(fraction.numerator), fraction.denominator)
  if whole == 0:
    # Proper fraction (or zero): the plain string form is already right.
    return str(fraction)
  if remainder == 0:
    return f"{sign}{whole}"
  return f"{sign}{whole} {Fraction(remainder, fraction.denominator)}"


def convert_floats_to_fractions(text: str) -> str:
    """
    Replace decimal numbers in *text* with (mixed) fractions, e.g. "1.5" -> "1 1/2".

    Only the unsigned digits are rewritten and any preceding "-" is left in
    place. A hyphen directly after another number is a range separator
    ("1-1.5 cups"), so it must not be read as a minus sign: that would turn the
    range into "1-3/2 cups" instead of "1-1 1/2 cups".
    """
    return re.sub(
        r'\b\d+\.\d+\b',
        lambda match: fraction_to_mixed_number(
            Fraction(float(match.group())).limit_denominator()),
        text,
    )


def process_text(text, model=None):
  """
  A wrapper function to pre-process text and run it through our pipeline.

  Args:
    text: raw ingredient text; decimal quantities are converted to fractions
      before the pipeline sees it.
    model: pipeline to call on the pre-processed text. When omitted, the
      notebook-level `nlp` that is current at call time is used, so re-loading
      `nlp` later is picked up without redefining this function.

  Returns:
    Whatever the pipeline returns for the pre-processed text.
  """
  # Honour the explicitly passed pipeline instead of always using the global.
  pipeline = nlp if model is None else model
  return pipeline(convert_floats_to_fractions(text))

In [None]:
# Let's have a look at our processing function at work
import re

# Single-character (vulgar) fractions and their ASCII "n/d" spelling.
fraction_mapping = {
        '½': '1/2', '¼': '1/4', '¾': '3/4',
        '⅓': '1/3', '⅔': '2/3', '⅕': '1/5',
        '⅖': '2/5', '⅗': '3/5', '⅘': '4/5',
        '⅙': '1/6', '⅚': '5/6', '⅛': '1/8',
        '⅜': '3/8', '⅝': '5/8', '⅞': '7/8',
    }


def convert_single_char_fractions(text):
    """
    Replace single-character fractions in *text* with their "n/d" spelling.

    A fraction glued to a preceding digit is a mixed number ("1½"), so a space
    is inserted to give "1 1/2" rather than the misleading "11/2".
    """
    for key, value in fraction_mapping.items():
        # Mixed numbers first: keep the whole part separate from the fraction.
        text = re.sub(rf'(?<=\d){re.escape(key)}', f' {value}', text)
        # Remaining stand-alone occurrences.
        text = text.replace(key, value)
    return text

# Preview the pre-processing on the scraped ingredient lines: decimals are
# converted first, then single-character fractions are spelled out.
[convert_single_char_fractions(convert_floats_to_fractions(line)) for line in scraper.ingredients()]

### Running Inference with Processing

In [None]:
import spacy

In [None]:
# Load the model again for good measure
nlp = spacy.load(f"{output_path}/model-best/")

from spacy import displacy
# process the recipe, line-by-line
# Each scraped ingredient line becomes its own Doc; the freshly loaded
# pipeline is passed explicitly.
docs = [process_text(line, model = nlp) for line in scraper.ingredients()]

# Render the entities of every line inline in the notebook.
displacy.render(docs, style="ent", jupyter=True)

In [None]:
# Second recipe page to try the pipeline on.
# NOTE(review): this page looks like an English-language recipe while the
# training config sets lang = "it" - confirm that this is intended.
PREPPY_URL = 'https://preppykitchen.com/coffee-cake/'
# NOTE(review): `wild_mode=True` presumably lets recipe_scrapers handle sites
# without a dedicated scraper - confirm against the library documentation.
scraper = scrape_me(PREPPY_URL, wild_mode=True)
# NOTE(review): the result of this bare call is discarded; only the last
# expression of a cell is displayed.
scraper.ingredients()
# process the recipe, line-by-line
docs_preppy = [process_text(line) for line in scraper.ingredients()]

# Render the entities of every line inline in the notebook.
displacy.render(docs_preppy, style="ent", jupyter=True)