## Inference

In [3]:
from recipe_scrapers import scrape_me

# Recipe pages used in this demo, one per language.
recipe_url_en = "https://www.allrecipes.com/recipe/246868/pecan-sour-cream-coffee-cake/"
recipe_url_it = "https://ricette.giallozafferano.it/Polpette-alla-cacciatora.html"
# The two Spanish pages below are kept for reference only: they do not work,
# because the recipe scraper only supports a fixed number of websites.
recipe_url_es_1 = "https://mahatmarice.com/es/recetas/autentica-paella-espanola-con-marisco/"
recipe_url_es_2 = "https://www.comedera.com/como-hacer-paella-de-marisco/"


# Scrape each page and pull out its raw ingredient lines.
scraper_en = scrape_me(recipe_url_en)
ingredient_list_en = scraper_en.ingredients()
scraper_it = scrape_me(recipe_url_it)
ingredient_list_it = scraper_it.ingredients()

# Print every recipe title followed by its tab-indented ingredient lines.
for recipe, lines in (
    (scraper_en, ingredient_list_en),
    (scraper_it, ingredient_list_it),
):
    print(recipe.title())
    for ingredient in lines:
        print('\t', ingredient)

Pecan Sour Cream Coffee Cake
	 1.5 cups pecans, finely chopped
	 0.33333334326744 cup white sugar
	 0.33333334326744 cup packed light brown sugar
	 3 tablespoons melted butter
	 1 teaspoon cinnamon
	 0.125 teaspoon salt
	 1.875 cups all-purpose flour
	 1 teaspoon baking powder
	 0.75 teaspoon baking soda
	 0.5 teaspoon fine sea salt
	 1 cup white sugar
	 0.5 cup unsalted butter, softened
	 2 large eggs
	 1 cup sour cream or creme fraiche
	 1.5 teaspoons vanilla extract
Polpette alla cacciatora
	 Macinato di vitello 500 g
	 Pangrattato 100 g
	 Grana Padano DOP (da grattugiare) 40 g
	 Uova 3
	 Latte intero 50 g
	 Aglio 1 spicchio
	 Prezzemolo (da tritare) q.b.
	 Sale fino q.b.
	 Passata di pomodoro 500 g
	 Cipolle 1
	 Carote 1
	 Sedano ½ costa
	 Vino rosso 50 g
	 Rosmarino 1 rametto
	 Olio extravergine d'oliva q.b.
	 Sale fino q.b.
	 Pepe nero q.b.


### Text Pre-Processing Function

Note that the scraper returns ingredient quantities as floats (e.g. `0.33333334326744 cup`), even though the website displays them as fractions and mixed numbers. The functions below convert them back.

In [4]:
from fractions import Fraction
import re


def fraction_to_mixed_number(fraction: Fraction) -> str:
    """Format a fraction as a whole number, a proper fraction, or a mixed number.

    Args:
        fraction: The value to format.

    Returns:
        ``"2"`` for whole values, ``"1/3"`` for proper fractions and
        ``"1 7/8"`` for improper ones. Negative values keep a single leading
        minus sign, e.g. ``"-1 1/2"``.
    """
    sign = "-" if fraction < 0 else ""
    # Split the magnitude so that negative improper fractions are handled the
    # same way as positive ones (divmod on a negative numerator floors away
    # from zero, which would give the wrong whole part).
    whole, remainder = divmod(abs(fraction.numerator), fraction.denominator)
    if whole == 0:
        # Proper fraction (or zero): the default rendering is already right.
        return str(fraction)
    if remainder == 0:
        return f"{sign}{whole}"
    return f"{sign}{whole} {Fraction(remainder, fraction.denominator)}"


def convert_floats_to_fractions(text: str, max_denominator: int = 1_000_000) -> str:
    """Rewrite every decimal number in ``text`` as a fraction or mixed number.

    Each ``digits.digits`` token is converted to the closest fraction whose
    denominator does not exceed ``max_denominator`` and then formatted with
    ``fraction_to_mixed_number``.

    A hyphen in front of a number is left untouched as ordinary text, so a
    range such as ``"1-1.5 cups"`` becomes ``"1-1 1/2 cups"``. The previous
    pattern (``\\b-?...``) could only consume the hyphen when it followed a
    word character, i.e. exactly in such ranges, and then treated the second
    number as negative.

    Args:
        text: The text to process.
        max_denominator: Largest denominator allowed when approximating a
            float. The default matches ``Fraction.limit_denominator()``.

    Returns:
        ``text`` with decimal numbers replaced; everything else is unchanged.
    """
    def to_mixed_number(match):
        value = Fraction(float(match.group())).limit_denominator(max_denominator)
        return fraction_to_mixed_number(value)

    return re.sub(r'\b\d+\.\d+\b', to_mixed_number, text)


def process_text(text, model):
    """Pre-process ``text`` and run the result through ``model``.

    Float quantities in ``text`` are first rewritten by
    ``convert_floats_to_fractions``; the converted string is then passed to
    ``model``, which is called with that single argument.

    Returns:
        Whatever ``model`` returns for the converted text.
    """
    converted_text = convert_floats_to_fractions(text)
    return model(converted_text)

In [5]:
# Let's have a look at our processing function at work:
# convert the float quantities in every scraped ingredient line.
ingredient_list_en = list(map(convert_floats_to_fractions, scraper_en.ingredients()))
ingredient_list_it = list(map(convert_floats_to_fractions, scraper_it.ingredients()))
print(ingredient_list_en)
print(ingredient_list_it)

['1 1/2 cups pecans, finely chopped', '1/3 cup white sugar', '1/3 cup packed light brown sugar', '3 tablespoons melted butter', '1 teaspoon cinnamon', '1/8 teaspoon salt', '1 7/8 cups all-purpose flour', '1 teaspoon baking powder', '3/4 teaspoon baking soda', '1/2 teaspoon fine sea salt', '1 cup white sugar', '1/2 cup unsalted butter, softened', '2 large eggs', '1 cup sour cream or creme fraiche', '1 1/2 teaspoons vanilla extract']
['Macinato di vitello 500 g', 'Pangrattato 100 g', 'Grana Padano DOP (da grattugiare) 40 g', 'Uova 3', 'Latte intero 50 g', 'Aglio 1 spicchio', 'Prezzemolo (da tritare) q.b.', 'Sale fino q.b.', 'Passata di pomodoro 500 g', 'Cipolle 1', 'Carote 1', 'Sedano ½ costa', 'Vino rosso 50 g', 'Rosmarino 1 rametto', "Olio extravergine d'oliva q.b.", 'Sale fino q.b.', 'Pepe nero q.b.']


### Running Inference with Processing

In [6]:
import spacy



In [7]:
import os

# Load the model again for good measure.
# The path defaults to the location used when this notebook was written; set
# the TASTESET_MODEL_PATH environment variable to load the pipeline from
# somewhere else without editing this cell.
ROBERTA_MODEL_PATH = os.environ.get(
    'TASTESET_MODEL_PATH',
    '/home/pgajo/working/food/TASTEset/output_transformer_0/model-best',
)
# nlp_cpu = spacy.load('/home/pgajo/working/food/TASTEset/output_eff_cpu/model-best')
nlp_roberta = spacy.load(ROBERTA_MODEL_PATH)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
def print_entities(doc):
    """Print a parsed doc, its entities, and one detail line per entity.

    Each detail line shows the entity text, its start and end character
    offsets, and its label.
    """
    print(doc)
    print(doc.ents)

    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)


# Parse the first ingredient line of each recipe and inspect the entities.
doc_test_en = nlp_roberta(ingredient_list_en[0])
print_entities(doc_test_en)
print()
doc_test_it = nlp_roberta(ingredient_list_it[0])
print_entities(doc_test_it)

1 1/2 cups pecans, finely chopped
(1 1/2, cups, pecans, finely chopped)
1 1/2 0 5 QUANTITY
cups 6 10 UNIT
pecans 11 17 FOOD
finely chopped 19 33 PROCESS

Macinato di vitello 500 g
(500, g)
500 20 23 QUANTITY
g 24 25 UNIT


In [11]:
from spacy import displacy

# Render the entity highlights for the English doc, then the Italian one.
for parsed_doc in (doc_test_en, doc_test_it):
    displacy.render(parsed_doc, style="ent", jupyter=True)

In [None]:
# Process the English recipe, line-by-line.
# This cell previously iterated over `scraper`, which is only defined in a
# later cell, so it failed with a NameError on a fresh "Run All". Use the
# scraper created at the top of the notebook instead (swap in `scraper_it`
# to parse the Italian recipe, or pass another loaded pipeline as `model`).
docs_roberta = [process_text(line, model=nlp_roberta) for line in scraper_en.ingredients()]

displacy.render(docs_roberta, style="ent", jupyter=True)

In [None]:
# Scrape a recipe from a different website and show its raw ingredient lines.
PREPPY_URL = 'https://preppykitchen.com/coffee-cake/'
# NOTE(review): wild_mode=True presumably lets the scraper attempt sites that
# are outside its supported list — confirm against the recipe_scrapers docs.
scraper = scrape_me(PREPPY_URL, wild_mode=True)
# Bare last expression: the ingredient list is displayed as the cell output.
scraper.ingredients()

In [None]:
# process the recipe, line-by-line
docs_roberta = [process_text(line) for line in scraper.ingredients()]

displacy.render(docs_roberta, style="ent", jupyter=True)

And that's all, folks! Feel free to use this code for your own purposes, and happy parsing!