# NERfR (Named Entity Recognition for Recipes)

In [None]:
# Installations and Imports
import spacy
import sys
from tasteset_utils import prepare_data, ENTITIES
import json
# json_path = '/home/pgajo/working/food/data/TASTEset/data/TASTEset_semicolon_formatted_en-it_unaligned_aligned_model=mdeberta-v3-base-xl-wa_recipe_aligner_5epochs_error_rate=0.0119_pruned.json'
json_path = '/home/pgajo/working/food/data/TASTEset/data/TASTEset_semicolon_formatted_en-it_itemwise.json'

# Decode explicitly as UTF-8 so that the non-ASCII recipe text is read the same
# way regardless of the platform's default locale encoding.
with open(json_path, 'r', encoding='utf-8') as f:
    training_data = json.load(f)
# Peek at the first annotation record as a quick sanity check.
print(training_data['annotations'][0])

In [None]:
# make spacy dataset from tasteset format
from spacy.tokens import DocBin
import os
import re

def tasteset_to_spacy_formatter(training_annotations, languages = ('en',)):
  """Convert TASTEset-style annotation records into a spaCy ``DocBin``.

  For every language code in ``languages`` and every record, the text stored
  under ``text_<lang>`` is tokenized with a blank spaCy pipeline and the items
  of ``ents_<lang>`` are attached to the resulting doc as entity spans.

  Args:
    training_annotations: Iterable of records indexed by the keys
      ``text_<lang>`` and ``ents_<lang>``. Each entity item is unpacked
      positionally into ``Doc.char_span`` (presumably
      ``(start_char, end_char, label)`` -- confirm against the data file).
    languages: Iterable of language codes understood by ``spacy.blank``.

  Returns:
    A ``DocBin`` holding one doc per (language, record) pair.
  """
  # Both text substitutions below swap a single character for a space, so the
  # character offsets of the annotations remain valid on the cleaned text.
  hyphen_before_digits = re.compile(r'-(\d+)')
  num_of_entities = 0
  doc_bin = DocBin()
  for lang in languages:
    nlp = spacy.blank(lang)
    text_key = f'text_{lang}'
    ents_key = f'ents_{lang}'
    for idx, example in enumerate(training_annotations):
      cleaned_text = hyphen_before_digits.sub(r' \1', example[text_key].replace(';', ' '))
      doc = nlp.make_doc(cleaned_text)
      print(idx, 'doc:', doc)
      ents = []
      for entity in example[ents_key]:
        span = doc.char_span(*entity, alignment_mode='strict')
        # char_span yields None when the span cannot be built from the given
        # offsets; such entities are skipped instead of being added to the doc.
        if span is None:
          continue
        ents.append(span)

      doc.ents = ents
      num_of_entities += len(ents)
      doc_bin.add(doc)

  print('num_of_entities:', num_of_entities)
  return doc_bin

# Positional 80/20 split: the first 80% of the records become the training
# set and the remainder the dev set (no shuffling is applied here).
train_len = int(0.8*len(training_data['annotations'])) # 80/20 split
languages = ['it']
lang_id = '-'.join(languages)
train_bin = tasteset_to_spacy_formatter(training_data['annotations'][:train_len], languages = languages)
print('train_bin length:', len(train_bin))
dev_bin = tasteset_to_spacy_formatter(training_data['annotations'][train_len:], languages = languages)
print('dev_bin length:', len(dev_bin))
spacy_dir = '/home/pgajo/working/food/data/TASTEset/data/spacy'
# Make sure the target directory exists before serializing the corpora.
os.makedirs(spacy_dir, exist_ok=True)
train_path = os.path.join(spacy_dir, f"{lang_id}_train.spacy")
dev_path = os.path.join(spacy_dir, f"{lang_id}_dev.spacy")
train_bin.to_disk(train_path)
dev_bin.to_disk(dev_path)

In [None]:
# Transformer config
# Model identifier that is substituted into the template below; it is also
# used to build the names of the generated .cfg files.
model_name = 'bert-base-multilingual-cased'
# Partial spaCy training config kept as a template string. The bare words
# `train_path`, `dev_path` and `model_name` inside it are placeholders that
# are replaced with the corresponding Python values before the file is written.
# NOTE(review): `lang = "it"` is hard-coded in the template and is not derived
# from the `languages` variable -- keep the two in sync by hand.
BASE_CONFIG_TRANFORMER = """
# This is an auto-generated partial config. To use it with 'spacy train'
# you can run spacy init fill-config to auto-fill all default settings:
# python -m spacy init fill-config ./base_config.cfg ./config.cfg
[paths]
train = train_path
dev = dev_path
vectors = null
[system]
gpu_allocator = "pytorch"

[nlp]
lang = "it"
pipeline = ["transformer","ner"]
batch_size = 128

[components]

[components.transformer]
factory = "transformer"

[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v3"
name = "model_name"
# name = "microsoft/mdeberta-v3-base"
tokenizer_config = {"use_fast": true}

[components.transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 128
stride = 96

[components.ner]
factory = "ner"

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = false
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0

[components.ner.model.tok2vec.pooling]
@layers = "reduce_mean.v1"

[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0

[training]
accumulate_gradient = 3
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"

[training.optimizer]
@optimizers = "Adam.v1"

[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 5000
initial_rate = 2e-5

[training.batcher]
@batchers = "spacy.batch_by_padded.v1"
discard_oversize = true
size = 2000
buffer = 256

[initialize]
vectors = ${paths.vectors}"""
# Fill in the template placeholders one after another: the model name first,
# then the training and dev corpus paths.
for placeholder, value in (
    ('model_name', model_name),
    ('train_path', train_path),
    ('dev_path', dev_path),
):
  BASE_CONFIG_TRANFORMER = BASE_CONFIG_TRANFORMER.replace(placeholder, value)
print(BASE_CONFIG_TRANFORMER)

# Write the partial config next to the notebook, named after the model.
base_config_path = f"{model_name}.cfg"
with open(base_config_path, 'w') as cfg_file:
  cfg_file.write(BASE_CONFIG_TRANFORMER)

In [None]:
# This command fills in a complete training config from the base config you
# generated. The last argument is the name of the filled config file; feel
# free to change that.
# !python -m spacy init fill-config mbert.cfg config_mbert.cfg
model_config_path = f"config_{model_name}.cfg"
!python -m spacy init fill-config "$base_config_path" "$model_config_path"

## Training

Run the following code to train! Note that you'll have to change the path and name of the `.cfg` file as necessary. The last argument is a folder that'll contain your pipeline. Feel free to prefix it with a path to a more useful location. Also have some fun with the name!

You'll get periodic updates with the `loss`, `F1`, `precision`, `recall` for the NER model over time. They also give you a `SCORE`, which is helpful when training multiple components, but in our case, the `SCORE` is just the `F1` score for the NER model.

In [None]:
# Free-form tag that becomes part of the output directory name.
suffix = 'item-wise'
# Output directory of the trained pipeline: model name, language id and tag.
output_path = f"output_{model_name}_{lang_id}_{suffix}"
# Train with the filled config and write the pipeline to `output_path`.
!python -m spacy train "$model_config_path" --output "$output_path" -g 0

## Results

The training outputs a `meta.json` file inside the `model-best` subfolder of the output folder (`output_path` in our case). We can use this to check a number of metrics, including the performance of each entity class.

In [None]:
import json
import pandas as pd

# grab the performance dict from within the meta file
meta_path = f"{output_path}/model-best/meta.json"
print(meta_path)
# Read through a context manager so the file handle is closed right away.
with open(meta_path, 'r') as meta_file:
    performance = json.load(meta_file)['performance']
performance_by_ent = performance['ents_per_type']

# One column per entity type; the rows are that type's score entries.
perf_df = pd.DataFrame(performance_by_ent)
# NOTE(review): this assumes the DataFrame rows are ordered p, r, f -- confirm
# against the key order of the per-type score dicts in meta.json.
perf_df["TOTAL"] = [performance['ents_p'], performance['ents_r'], performance['ents_f']]
# sort by header
perf_df = perf_df.reindex(sorted(perf_df.columns), axis=1)

# display df with the cell color corresponding to the value (dark=high; light=low)
# NOTE(review): `low`/`high` may not be absolute data values for the Styler;
# `vmin`/`vmax` might be what was intended here -- confirm in the pandas docs.
perf_df.style.background_gradient(
    axis=1, low=perf_df.min().min(), high=1, cmap='YlOrBr'
    )

Here we've got the precision (p), recall (r), and F1 (f) score by entity. It seems like the best performing entities are the ones we care the most about. Only 40% of *PART* entities are being turned up. I can live with that.

## Getting the Confusion Matrix

We're going to be plotting a [confusion matrix](https://en.wikipedia.org/wiki/Confusion_matrix) on the same dev set that was used for evaluation during training. At a high level, this entails running each sample through the trained model, and, for each token, storing the entity the model predicted for that token, as well as the ground truth entity (as labeled by the dataset authors).

In [None]:
import spacy
# Load the best checkpoint from the training output folder and read the dev
# docs back from disk. Again, change the paths as required.
best_model_dir = f"{output_path}/model-best"
nlp = spacy.load(best_model_dir)
dev_doc_bin = DocBin().from_disk(dev_path)
test_set = list(dev_doc_bin.get_docs(nlp.vocab))

In [None]:
# Flat, token-level label sequences: gold labels and model predictions.
true_ents = []
pred_ents = []

for recipe in test_set:
  # Read the entity type token by token rather than span by span, so that the
  # gold and predicted sequences line up (assuming the pipeline tokenizes
  # `recipe.text` the same way as the stored doc).
  true_ents.extend(tok.ent_type_ for tok in recipe)
  # `recipe` already carries the gold labels, so the prediction is made on its
  # raw text instead.
  pred_ents.extend(tok.ent_type_ for tok in nlp(recipe.text))

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# create and display the confusion matrix
# Rows/columns are restricted to the labels in ENTITIES. NOTE(review): token
# labels outside that list (presumably the empty string of non-entity tokens)
# would then not appear in the matrix -- confirm this is intended.
cm = confusion_matrix(true_ents, pred_ents, labels=ENTITIES)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=ENTITIES)

disp.plot()
# Rotate the x tick labels so that the entity names do not overlap.
plt.xticks(rotation=70)
plt.show()

Unfortunately, there isn't quite enough data for the color mapping to show fine-grained differences. Nonetheless, we can get a sense for the more common mislabelings.

## Evaluation

In [1]:
import json
import spacy
import os
from spacy.tokens import DocBin
from tqdm import tqdm

# make spacy dataset from label-studio format

def label_studio_to_spacy_formatter(label_studio_json_data, languages = ('en',)):
  """Convert Label Studio style annotation records into a spaCy ``DocBin``.

  Each record contributes one doc built from ``record['data']['ingredients_it']``.
  The entities are taken from the first annotation only
  (``record['annotations'][0]['result']``); for every result item the
  ``start``, ``end`` and first ``labels`` entry of its ``value`` are used.

  Args:
    label_studio_json_data: Iterable of records with the layout read above.
    languages: Iterable of language codes understood by ``spacy.blank``; it
      only selects the tokenizer.

  Returns:
    A ``DocBin`` holding one doc per (language, record) pair. A doc whose
    entity assignment fails is still added, but without entities.
  """
  num_of_entities = 0
  doc_bin = DocBin()
  for lang in languages:
    nlp = spacy.blank(lang)
    for idx, example in enumerate(label_studio_json_data):
      # NOTE(review): the text field is always `ingredients_it`, whatever
      # `lang` is -- confirm whether other languages need a different key.
      text = example['data']['ingredients_it']
      # ';' is swapped for a single space, so the annotated character offsets
      # still apply to the cleaned text.
      doc = nlp.make_doc(text.replace(';', ' '))
      ents = []
      for entity_ls in example['annotations'][0]['result']:
        value = entity_ls['value']
        entity = [value['start'], value['end'], value['labels'][0]]
        span = doc.char_span(*entity, alignment_mode='strict')

        if span is None:
          # No span could be built from these offsets: report and skip it.
          print('********************')
          print(idx, entity, text[entity[0]:entity[1]])
          continue
        ents.append(span)

      try:
        doc.ents = ents
      except ValueError as e:
        # Rejected entity set (presumably overlapping spans): dump the
        # offending spans and keep the doc without entities.
        print(idx, 'doc:', doc)
        print("error:", e)
        for ent in ents:
          print(ent.start, ent.end, ent.label_, ent.text)
        print('-------------------')
      else:
        # Count only entities that actually ended up on a stored doc.
        num_of_entities += len(ents)
      doc_bin.add(doc)

  print('num_of_entities:', num_of_entities)
  return doc_bin

test_json_path = '/home/pgajo/working/food/data/GZ/gz-recipes-annotated/gz-recipes-annotated.json'

# Decode explicitly as UTF-8 so that the Italian text is read the same way
# regardless of the platform's default locale encoding.
with open(test_json_path, 'r', encoding='utf-8') as f:
    test_json = json.load(f)

# NOTE(review): `test_len` is computed but not used below; the whole file is
# converted into the test set.
test_len = int(0.8*len(test_json)) # 80/20 split
languages = ['it']
lang_id = '-'.join(languages)
test_bin = label_studio_to_spacy_formatter(test_json, languages = languages)
print('test_bin length:', len(test_bin))

spacy_dir = '/home/pgajo/working/food/data/TASTEset/data/spacy'
# Make sure the target directory exists before serializing the corpus.
os.makedirs(spacy_dir, exist_ok=True)
test_path = os.path.join(spacy_dir, f"{lang_id}_{os.path.basename(test_json_path)}_test.spacy")
print(test_path)
test_bin.to_disk(test_path)

  from .autonotebook import tqdm as notebook_tqdm


num_of_entities: 20319
test_bin length: 597
/home/pgajo/working/food/data/TASTEset/data/spacy/it_gz-recipes-annotated.json_test.spacy


In [28]:
# metrics using seqeval
from seqeval.metrics import classification_report

from tqdm import tqdm
import pandas as pd
model_path_1 = '/home/pgajo/working/food/src/ner/spacy_outputs/output_bert-base-multilingual-cased_it_item-wise/model-best'
model_path_2 = '/home/pgajo/working/food/src/ner/spacy_outputs/output_bert-base-multilingual-cased_it/model-best'

model_path_list = [model_path_1, model_path_2]
stats = []

import spacy
# Label inventory, one label per line; the file is closed as soon as it is read.
with open('/home/pgajo/working/food/data/TASTEset/data/classes') as classes_file:
    ENTITIES = [el.strip() for el in classes_file]
# print(ENTITIES)


def _iob_tags(doc):
    """Return one tag per token of `doc`: '<IOB>-<TYPE>' for entity tokens,
    the bare IOB code otherwise."""
    tags = []
    for tok in doc:
        # An empty IOB code means that no entity annotation is set on the
        # token; treat it as outside ('O') so that no empty tag is emitted.
        iob = tok.ent_iob_ or 'O'
        tags.append('-'.join([iob, tok.ent_type_]) if tok.ent_type_ != '' else iob)
    return tags


for model_path in model_path_list:

    nlp = spacy.load(model_path)
    test_set = list(DocBin().from_disk(test_path).get_docs(nlp.vocab))
    # test_set = test_set[:2]
    y_true = []
    y_pred = []
    for doc in tqdm(test_set, total = len(test_set)):
        # Gold tags come from the stored doc, predicted tags from re-running
        # the loaded pipeline on the doc's raw text.
        y_true.append(_iob_tags(doc))
        y_pred.append(_iob_tags(nlp(doc.text)))

    results_seqeval = classification_report(y_true, y_pred, output_dict=True)
    # Transpose so that each label becomes a row and each metric a column.
    df_results_seqeval = pd.DataFrame(results_seqeval).T
    display(df_results_seqeval)



100%|██████████| 597/597 [00:22<00:00, 26.90it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
COLOR,0.75,0.006342,0.012579,473.0
FOOD,0.577471,0.58229,0.579871,6471.0
PART,0.166667,0.007576,0.014493,264.0
PHYSICAL_QUALITY,0.497925,0.074906,0.130222,1602.0
PROCESS,0.256637,0.119342,0.162921,243.0
PURPOSE,0.681818,0.428571,0.526316,140.0
QUALITY/BRAND,0.0,0.0,0.0,70.0
QUANTITY,0.878615,0.750305,0.809406,6560.0
TASTE,0.0,0.0,0.0,66.0
UNIT,0.977576,0.974266,0.975919,4430.0


100%|██████████| 597/597 [00:21<00:00, 27.21it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
COLOR,0.910064,0.89852,0.904255,473.0
FOOD,0.693579,0.667748,0.680419,6471.0
PART,0.058824,0.003788,0.007117,264.0
PHYSICAL_QUALITY,0.586503,0.298377,0.395532,1602.0
PROCESS,0.232673,0.193416,0.211236,243.0
PURPOSE,0.38806,0.185714,0.251208,140.0
QUALITY/BRAND,0.0,0.0,0.0,70.0
QUANTITY,0.9481,0.821494,0.880268,6560.0
TASTE,0.457143,0.242424,0.316832,66.0
UNIT,0.932135,0.936343,0.934234,4430.0


In [25]:
# Debug: show the type of the most recent seqeval report object.
results_seqeval.__class__

str

In [5]:
# metrics using spacy scorer
from spacy.training import Example
from spacy.scorer import get_ner_prf

from tqdm import tqdm
import pandas as pd
model_path_1 = '/home/pgajo/working/food/src/ner/spacy_outputs/output_bert-base-multilingual-cased_it_item-wise/model-best'
model_path_2 = '/home/pgajo/working/food/src/ner/spacy_outputs/output_bert-base-multilingual-cased_it/model-best'

model_path_list = [model_path_1, model_path_2]
stats = []

import spacy
# Label inventory, one label per line; the file is closed as soon as it is read.
with open('/home/pgajo/working/food/data/TASTEset/data/classes') as classes_file:
    ENTITIES = [el.strip() for el in classes_file]
# print(ENTITIES)

for model_path in model_path_list:

    nlp = spacy.load(model_path)
    test_set = list(DocBin().from_disk(test_path).get_docs(nlp.vocab))
    # test_set = test_set[:2]
    examples = []
    for doc in tqdm(test_set, total = len(test_set)):
        # Gold spans restricted to the known label inventory.
        golds = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents if ent.label_ in ENTITIES]
        # Predictions come from re-running the pipeline on the raw text.
        preds = nlp(doc.text)
        examples.append(Example.from_dict(preds, {'entities': golds}))
    results = get_ner_prf(examples)
    # print(results['ents_f'])

    # Name of the run directory that contains `model-best`. Kept in its own
    # variable so that the notebook-level `model_name` (used to build config
    # and output paths) is not overwritten by this cell.
    run_name = model_path.split('/')[-2]
    # print(run_name)

    # NOTE(review): the per-type scores are presumably None when nothing was
    # scored; fall back to an empty mapping instead of failing on `.items()`.
    per_type = results['ents_per_type'] or {}
    per_entity_stats = [
        [label, run_name, scores['p'], scores['r'], scores['f']]
        for label, scores in per_type.items()
    ]

    # Macro average = unweighted mean of the per-type scores.
    if per_entity_stats:
        n_types = len(per_entity_stats)
        macro_prec = sum(row[2] for row in per_entity_stats) / n_types
        macro_rec = sum(row[3] for row in per_entity_stats) / n_types
        macro_f1 = sum(row[4] for row in per_entity_stats) / n_types
    else:
        macro_prec = macro_rec = macro_f1 = None

    macro_stats_row = ['macro', run_name, macro_prec, macro_rec, macro_f1]
    stats.append(macro_stats_row)

    # The overall scores reported by the scorer are stored as the micro row.
    micro_stats_row = ['micro', run_name, results['ents_p'], results['ents_r'], results['ents_f']]
    stats.append(micro_stats_row)

    stats += per_entity_stats

# print(stats)
df_results = pd.DataFrame(stats, columns=['entity', 'model', 'p', 'r', 'f1'])
display(df_results)

['FOOD', 'QUANTITY', 'UNIT', 'PROCESS', 'PHYSICAL_QUALITY', 'COLOR', 'TASTE', 'PURPOSE', 'PART']


100%|██████████| 597/597 [00:21<00:00, 27.34it/s]
100%|██████████| 597/597 [00:22<00:00, 27.03it/s]


Unnamed: 0,entity,model,p,r,f1
0,macro,output_bert-base-multilingual-cased_it_item-wise,0.531857,0.327067,0.356859
1,micro,output_bert-base-multilingual-cased_it_item-wise,0.777327,0.652872,0.709684
2,FOOD,output_bert-base-multilingual-cased_it_item-wise,0.577471,0.58229,0.579871
3,QUANTITY,output_bert-base-multilingual-cased_it_item-wise,0.878615,0.750305,0.809406
4,UNIT,output_bert-base-multilingual-cased_it_item-wise,0.977576,0.974266,0.975919
5,PHYSICAL_QUALITY,output_bert-base-multilingual-cased_it_item-wise,0.497925,0.074906,0.130222
6,TASTE,output_bert-base-multilingual-cased_it_item-wise,0.0,0.0,0.0
7,PART,output_bert-base-multilingual-cased_it_item-wise,0.166667,0.007576,0.014493
8,PURPOSE,output_bert-base-multilingual-cased_it_item-wise,0.681818,0.428571,0.526316
9,COLOR,output_bert-base-multilingual-cased_it_item-wise,0.75,0.006342,0.012579


In [27]:
# Persist the combined metrics table.
# NOTE(review): no `index=False` is passed here, unlike the per-model
# `*_stats.csv` export in this notebook, so the DataFrame index is presumably
# written as an extra first column -- confirm this is intended.
df_results.to_csv('/home/pgajo/working/food/src/ner/metrics/multilingual_ner_results.csv')

In [None]:
# Quick inspection of the first test document ...
first_doc = test_set[0]
print(first_doc.ents)
print(len(first_doc.ents))
print(len(first_doc))

# ... and of its first token.
first_tok = first_doc[0]
print(first_tok)
print(dir(first_tok))
print(first_tok.__class__)
print(first_tok.ent_iob_)

In [None]:
from sklearn.metrics import classification_report # this doesn't work with classification report, gotta look into how to actually compute metrics for NER

# Scratch cell: print gold vs. predicted per-token entity types for a few docs.
# Both lists stay empty because the appends below are commented out.
true_ents = []
pred_ents = []
# NOTE(review): this rebinds the shared `test_set` name to its first three
# docs, so any cell run afterwards that reads `test_set` only sees those three.
test_set = test_set[:3]
for item in tqdm(test_set, total = len(test_set)):
    # true_ents.append([tok.ent_type_ for tok in item])
    # pred_ents.append([tok.ent_type_ for tok in nlp(item.text)])
    print([tok.ent_type_ for tok in item])
    print([tok.ent_type_ for tok in nlp(item.text)])

# classification_report(true_ents, pred_ents, labels=ENTITIES)

In [None]:
# make confusion matrix
# Flat, token-level label sequences: gold labels and model predictions.
true_ents = []
pred_ents = []

for recipe in tqdm(test_set, total=len(test_set)):
  # Read the entity type token by token rather than span by span, so that the
  # gold and predicted sequences line up (assuming the pipeline tokenizes
  # `recipe.text` the same way as the stored doc).
  true_ents.extend(tok.ent_type_ for tok in recipe)
  # `recipe` already carries the gold labels, so the prediction is made on its
  # raw text instead.
  pred_ents.extend(tok.ent_type_ for tok in nlp(recipe.text))

%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from tasteset_utils import ENTITIES

# create and display the confusion matrix
# Rows/columns are restricted to the labels in ENTITIES. NOTE(review): token
# labels outside that list (presumably the empty string of non-entity tokens)
# would then not appear in the matrix -- confirm this is intended.
cm = confusion_matrix(true_ents, pred_ents, labels=ENTITIES)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=ENTITIES)

disp.plot()
# Rotate the x tick labels so that the entity names do not overlap.
plt.xticks(rotation=70)
plt.show()

In [None]:
# Evaluation metrics
import spacy 
from spacy.tokens import Span
from spacy import displacy
from spacy.training import *
from spacy.scorer import Scorer
from spacy.util import minibatch, compounding
from tqdm import tqdm

model_paths = [
    '/home/pgajo/working/food/src/ner/spacy_outputs/output_bert-base-multilingual-cased_it_item-wise/model-best',
    '/home/pgajo/working/food/src/ner/spacy_outputs/output_bert-base-multilingual-cased_it/model-best',
]

results = []


# evaluate function
def evaluate(ner_model, testing_data):
    """Score `ner_model` on the gold docs in `testing_data` with a spaCy
    Scorer, printing debugging details for every sample along the way."""
    scorer = Scorer()
    examples = []
    for sample in tqdm(testing_data, desc="Evaluating", total=len(testing_data)):
        # Gold spans restricted to the known label inventory.
        gold_entities = [
            (ent.start_char, ent.end_char, ent.label_)
            for ent in sample.ents
            if ent.label_ in ENTITIES
        ]
        doc_gold_text = ner_model.make_doc(sample.text)
        example = Example.from_dict(doc_gold_text, {"entities": gold_entities})
        print('1', example.__class__)
        print('1', example)
        # Attach the pipeline output as the predicted side of the example.
        example.predicted = ner_model(doc_gold_text)
        print('2', example.predicted.__class__)
        print('2', example.predicted)
        for tok in example.predicted:
            print(tok.ent_type_)
        examples.append(example)
        print('*************')

    return scorer.score(examples)


for i, model_path in enumerate(model_paths):
    nlp = spacy.load(model_path)
    test_data = list(test_bin.get_docs(nlp.vocab))

    # print the results (only the first test doc is evaluated here)
    results.append(evaluate(nlp, test_data[:1]))
    print(i, results[i])

In [None]:
# One list of rows per model: the overall scores first, then one row per
# entity type.
stats = []
for model_path, result in zip(model_paths, results):
    run_name = model_path.split('/')[-2]
    # The overall ents_p/ents_r/ents_f are labelled 'micro', matching how the
    # same scores are labelled in the spaCy-scorer metrics cell of this
    # notebook (they are not an unweighted mean over the entity types).
    model_stats = [[run_name, 'micro', result['ents_p'], result['ents_r'], result['ents_f']]]
    for ent, scores in result['ents_per_type'].items():
        model_stats.append([run_name, ent, scores['p'], scores['r'], scores['f']])
    stats.append(model_stats)
for el in stats:
    print(el)

import pandas as pd

metrics_dir = '/home/pgajo/working/food/src/ner/metrics'
# Make sure the export directory exists before writing the CSV files.
os.makedirs(metrics_dir, exist_ok=True)

# One DataFrame (and one CSV file) per model, named after its run directory.
dataframes = []
for model_path, stat in zip(model_paths, stats):
    df = pd.DataFrame(stat, columns=['model', 'entity', 'precision', 'recall', 'f-score'])
    dataframes.append(df)
    display(df)
    df.to_csv(os.path.join(metrics_dir, f'{model_path.split("/")[-2]}_stats.csv'), index=False)

## Inference

In [None]:
model_path = '/home/pgajo/working/food/src/ner/spacy_outputs/output_bert-base-multilingual-cased_it_item-wise/model-best'
# model_path = '/home/pgajo/working/food/src/ner/spacy_outputs/output_bert-base-multilingual-cased_it/model-best'
ner_model = spacy.load(model_path)
docs = list(test_bin.get_docs(ner_model.vocab))
print('len(docs):', len(docs))
print(docs[0].text)
from spacy import displacy
# Run the pipeline once on the first doc's raw text and render that result.
first_prediction = ner_model(docs[0].text)
displacy.render(first_prediction, style="ent", jupyter=True)

In [None]:
!pip install recipe-scrapers

In [None]:
from recipe_scrapers import scrape_me

# RECIPE_URL = "https://ricette.giallozafferano.it/Trota-salmonata-in-crosta-di-pistacchi.html"
RECIPE_URL = "https://cucchiaio.it/ricetta/torta-con-farina-di-mandorle/"
# Build a scraper for the recipe page; the final expression shows the scraped
# ingredient lines as the cell output.
scraper = scrape_me(RECIPE_URL)
scraper.ingredients()

### Text Pre-Processing Function

Note the float representations of ingredient quantities, in spite of the fact that the website shows them in mixed numbers.

In [None]:
from fractions import Fraction
import re


def fraction_to_mixed_number(fraction: Fraction) -> str:
  """Format ``fraction`` as a whole number, a proper fraction or a mixed number.

  Examples: ``Fraction(3, 2)`` -> ``"1 1/2"``, ``Fraction(4, 2)`` -> ``"2"``,
  ``Fraction(1, 2)`` -> ``"1/2"``, ``Fraction(-3, 2)`` -> ``"-1 1/2"``.

  Args:
    fraction: The value to format.

  Returns:
    The formatted string; a negative value keeps a single leading minus sign.
  """
  # Work on the magnitude so that negative improper fractions are split into
  # whole part and remainder just like positive ones.
  whole, remainder = divmod(abs(fraction.numerator), fraction.denominator)
  if whole == 0:
    # Proper fraction (or zero): the plain representation is already right.
    return str(fraction)
  sign = "-" if fraction < 0 else ""
  if remainder == 0:
    return f"{sign}{whole}"
  return f"{sign}{whole} {Fraction(remainder, fraction.denominator)}"


def convert_floats_to_fractions(text: str, max_denominator: int = 1_000_000) -> str:
    """Replace every decimal number in ``text`` with a (mixed) fraction.

    For example ``"1.5 cups"`` becomes ``"1 1/2 cups"`` and a range such as
    ``"1-1.5 cups"`` becomes ``"1-1 1/2 cups"``.

    Args:
        text: Text that may contain decimal numbers such as ``0.25``.
        max_denominator: Upper bound for the denominator of the fraction each
            decimal is approximated by. Smaller values give "kitchen"
            fractions, e.g. ``0.333`` -> ``1/3`` with ``max_denominator=16``.

    Returns:
        ``text`` with each decimal number replaced by its fraction form.
    """
    # Only the unsigned number is matched. A preceding '-' is deliberately not
    # part of the match: right after a word character (as in "1-1.5") it is a
    # range hyphen, not a sign, and must be left in place.
    return re.sub(
        r'\b\d+\.\d+\b',
        lambda match: fraction_to_mixed_number(
            Fraction(float(match.group())).limit_denominator(max_denominator)),
        text,
    )


def process_text(text, model=None):
  """
  A wrapper function to pre-process text and run it through a pipeline.

  Args:
    text: Raw text; decimal numbers in it are rewritten as fractions first.
    model: Callable pipeline applied to the pre-processed text. When omitted,
      the module-level `nlp` is looked up at call time, so a pipeline that is
      (re)loaded after this function was defined is picked up.

  Returns:
    Whatever the pipeline returns for the pre-processed text.
  """
  pipeline = nlp if model is None else model
  return pipeline(convert_floats_to_fractions(text))

In [None]:
# Let's have a look at our processing function at work
# Single-character ("vulgar") fraction glyphs and their ASCII spelling.
fraction_mapping = { 
        '½': '1/2', '¼': '1/4', '¾': '3/4',
        '⅓': '1/3', '⅔': '2/3', '⅕': '1/5',
        '⅖': '2/5', '⅗': '3/5', '⅘': '4/5',
        '⅙': '1/6', '⅚': '5/6', '⅛': '1/8',
        '⅜': '3/8', '⅝': '5/8', '⅞': '7/8',
    }
import re
def convert_single_char_fractions(text):
    """Spell out single-character fraction glyphs in ASCII.

    A glyph that directly follows a digit is separated from it by a space, so
    that ``"1½"`` becomes ``"1 1/2"`` rather than ``"11/2"``.

    Args:
        text: Text that may contain glyphs listed in ``fraction_mapping``
            (every key is expected to be a single character).

    Returns:
        ``text`` with every mapped glyph replaced by its ASCII form.
    """
    # Insert a space at each digit/glyph boundary first (zero-width match).
    glyphs = ''.join(re.escape(key) for key in fraction_mapping)
    text = re.sub(rf'(?<=\d)(?=[{glyphs}])', ' ', text)
    for glyph, ascii_form in fraction_mapping.items():
        text = text.replace(glyph, ascii_form)
    return text

# Cell output: every scraped ingredient line with decimals rewritten as
# fractions and single-character fraction glyphs spelled out in ASCII.
[convert_single_char_fractions(convert_floats_to_fractions(line)) for line in scraper.ingredients()]

### Running Inference with Processing

In [None]:
import spacy

In [None]:
# Load the model again for good measure
nlp = spacy.load(f"{output_path}/model-best/")

from spacy import displacy
# process the recipe, line-by-line
docs = [process_text(line, model = nlp) for line in scraper.ingredients()]

# Render the entity highlighting for all processed lines inline in the notebook.
displacy.render(docs, style="ent", jupyter=True)

In [None]:
# Second example page, scraped with `wild_mode=True`.
PREPPY_URL = 'https://preppykitchen.com/coffee-cake/'
scraper = scrape_me(PREPPY_URL, wild_mode=True)
# This is not the last expression of the cell, so its value is not displayed.
scraper.ingredients()
# process the recipe, line-by-line
docs_preppy = [process_text(line) for line in scraper.ingredients()]

# Render the entity highlighting for all processed lines inline in the notebook.
displacy.render(docs_preppy, style="ent", jupyter=True)