In [168]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [169]:
import ast
import pickle
import time
from typing import List

import numpy as np
import pandas as pd

import sys

sys.path.insert(0, "../")  # needed for using the utils file in the notebook.
from src.utils import nutrition_facts_parser, sort_ingredients_alphabetically
from sentence_transformers import SentenceTransformer, util

from rich import inspect
from tqdm.notebook import tqdm

from transformers import AutoTokenizer, AutoModel
import torch

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

## Merging the recipe titles back in:

In [170]:
# Titles live only in the full recipe dump. Read just the title/link pair
# (title becomes the index) and keep the rows whose link matches allrecipes.com.
recipes_path = "../data/full_dataset.csv"
recipe_title_link = pd.read_csv(
    recipes_path,
    usecols=["title", "link"],
    index_col=0,
    dtype=dict.fromkeys(("title", "link"), "string[pyarrow]"),
).query(" link.str.contains('allrecipes.com') ")

# Scraped recipes; only rows that actually carry nutrition facts are kept.
data = pd.read_csv("all_recipes_data_w_nutrition_facts_unstructured.csv")
data_w_nutrition = data.query(" Nutrition_facts_unstructured.notnull() ")
print(data_w_nutrition.shape)

data_w_nutrition.head()

(41080, 3)


Unnamed: 0,link,NER,Nutrition_facts_unstructured
0,www.allrecipes.com/recipe/241895/deconstructed...,"['orange juice', 'jiggers vodka']",Nutrition Facts\nServings Per Recipe 1\nCalori...
1,www.allrecipes.com/recipe/20808/kettle-corn/,"['vegetable oil', 'white sugar', 'popcorn kern...",Nutrition Facts\nServings Per Recipe 5\nCalori...
2,www.allrecipes.com/recipe/238733/pops-fabulous...,"['eggplants', 'green bell peppers', 'red bell ...",Nutrition Facts\nServings Per Recipe 10\nCalor...
3,www.allrecipes.com/recipe/241254/chipotle-mang...,"['avocados', 'tomatoes', 'mango', 'cilantro', ...",Nutrition Facts\nServings Per Recipe 20\nCalor...
4,www.allrecipes.com/recipe/261696/basic-homemad...,"['milk', 'heavy whipping cream', 'white vinega...",Nutrition Facts\nServings Per Recipe 4\nCalori...


In [171]:
# Attach the title to each recipe via its link. `validate="1:1"` makes pandas
# raise if a link is duplicated on either side, so the row count cannot grow.
data_w_nutrition = pd.merge(
    data_w_nutrition,
    recipe_title_link.reset_index(),  # bring `title` back from the index as a column
    how="left",
    on="link",
    validate="1:1",
    copy=False,
)
print(data_w_nutrition.shape)
data_w_nutrition.head()

(41080, 4)


Unnamed: 0,link,NER,Nutrition_facts_unstructured,title
0,www.allrecipes.com/recipe/241895/deconstructed...,"['orange juice', 'jiggers vodka']",Nutrition Facts\nServings Per Recipe 1\nCalori...,Deconstructed Screwdriver (The Raw Egg)
1,www.allrecipes.com/recipe/20808/kettle-corn/,"['vegetable oil', 'white sugar', 'popcorn kern...",Nutrition Facts\nServings Per Recipe 5\nCalori...,Kettle Corn
2,www.allrecipes.com/recipe/238733/pops-fabulous...,"['eggplants', 'green bell peppers', 'red bell ...",Nutrition Facts\nServings Per Recipe 10\nCalor...,Pop'S Fabulous Ajvar
3,www.allrecipes.com/recipe/241254/chipotle-mang...,"['avocados', 'tomatoes', 'mango', 'cilantro', ...",Nutrition Facts\nServings Per Recipe 20\nCalor...,Chipotle-Mango Guacamole
4,www.allrecipes.com/recipe/261696/basic-homemad...,"['milk', 'heavy whipping cream', 'white vinega...",Nutrition Facts\nServings Per Recipe 4\nCalori...,Basic Homemade Ricotta Cheese


In [172]:
# Compare column sets: `title` is the index of recipe_title_link, so only `link`
# is listed there, while the merged frame now carries `title` as a column.
recipe_title_link.columns, data_w_nutrition.columns

(Index(['link'], dtype='object'),
 Index(['link', 'NER', 'Nutrition_facts_unstructured', 'title'], dtype='object'))

## Sort NER ingredients alphabetically:

In [173]:
# NER arrives from the CSV as the text repr of a Python list (e.g. "['milk', 'salt']").
# ast.literal_eval parses literals only, so unlike eval() it cannot execute code
# embedded in the file. The isinstance guard keeps the cell re-runnable: values
# that were already parsed into lists are passed through unchanged.
data_w_nutrition["NER"] = data_w_nutrition["NER"].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

# Canonical ingredient order, produced by the project helper.
data_w_nutrition["sorted_NER"] = data_w_nutrition["NER"].apply(
    sort_ingredients_alphabetically
)
data_w_nutrition.head()

Unnamed: 0,link,NER,Nutrition_facts_unstructured,title,sorted_NER
0,www.allrecipes.com/recipe/241895/deconstructed...,"[orange juice, jiggers vodka]",Nutrition Facts\nServings Per Recipe 1\nCalori...,Deconstructed Screwdriver (The Raw Egg),"[jiggers vodka, orange juice]"
1,www.allrecipes.com/recipe/20808/kettle-corn/,"[vegetable oil, white sugar, popcorn kernels]",Nutrition Facts\nServings Per Recipe 5\nCalori...,Kettle Corn,"[popcorn kernels, vegetable oil, white sugar]"
2,www.allrecipes.com/recipe/238733/pops-fabulous...,"[eggplants, green bell peppers, red bell peppe...",Nutrition Facts\nServings Per Recipe 10\nCalor...,Pop'S Fabulous Ajvar,"[cooking spray, corn oil, eggplants, garlic, g..."
3,www.allrecipes.com/recipe/241254/chipotle-mang...,"[avocados, tomatoes, mango, cilantro, red onio...",Nutrition Facts\nServings Per Recipe 20\nCalor...,Chipotle-Mango Guacamole,"[avocados, black pepper, cayenne pepper, cilan..."
4,www.allrecipes.com/recipe/261696/basic-homemad...,"[milk, heavy whipping cream, white vinegar, salt]",Nutrition Facts\nServings Per Recipe 4\nCalori...,Basic Homemade Ricotta Cheese,"[heavy whipping cream, milk, salt, white vinegar]"


## Prepare for Sentence Transformer:
- Create a single-ingredient "recipe" for each unique ingredient in the corpus, so that every ingredient gets its own embedding as well.
- Compute an embedding for each recipe from its alphabetically sorted NER ingredients.

In [174]:
# Unique ingredients across all recipes. explode() yields NaN for recipes with an
# empty ingredient list; drop those before building the set. The previous
# `set.remove(np.nan)` raised KeyError whenever no NaN was present (or when the
# NaN was a different float object than `np.nan`).
all_ingredients = set(data_w_nutrition["NER"].explode().dropna())
len(all_ingredients)

9900

In [175]:
# Column names of the recipe frame; the one-ingredient rows built in the next
# cell use exactly these keys.
data_w_nutrition.columns

Index(['link', 'NER', 'Nutrition_facts_unstructured', 'title', 'sorted_NER'], dtype='object')

In [176]:
# Wrap every unique ingredient in its own one-item "recipe" so that each
# ingredient can be embedded exactly like a real recipe. These rows have no
# link and no nutrition facts; the ingredient name doubles as the title.
singular_ingredient_recipes_data = [
    {
        "link": np.nan,
        "NER": [ingredient],
        "Nutrition_facts_unstructured": np.nan,
        "title": ingredient,
        "sorted_NER": [ingredient],
    }
    for ingredient in all_ingredients
]
singular_ingredient_recipes = pd.DataFrame(singular_ingredient_recipes_data)
print(singular_ingredient_recipes.shape)
singular_ingredient_recipes.head()

(9900, 5)


Unnamed: 0,link,NER,Nutrition_facts_unstructured,title,sorted_NER
0,,[strawberry flavored],,strawberry flavored,[strawberry flavored]
1,,[graham crumbs],,graham crumbs,[graham crumbs]
2,,[Falafel],,Falafel,[Falafel]
3,,[serrano chiles],,serrano chiles,[serrano chiles]
4,,[pomelo],,pomelo,[pomelo]


In [177]:
# Stack the one-ingredient pseudo-recipes underneath the real recipes.
# Row labels are kept as-is here, so labels from the two frames repeat.
data_w_nutrition = pd.concat(
    objs=(data_w_nutrition, singular_ingredient_recipes),
    axis=0,
)
print(data_w_nutrition.shape)

(50980, 5)


In [178]:
# Null count per column. The appended one-ingredient rows were built with NaN
# for `link` and `Nutrition_facts_unstructured`, so nulls there are expected.
data_w_nutrition.isnull().sum()

link                            9900
NER                                0
Nutrition_facts_unstructured    9900
title                              0
sorted_NER                         0
dtype: int64

In [179]:
# Flatten each sorted ingredient list into one space-separated string; this
# string is the text that gets embedded.
sorted_ingredient_lists = data_w_nutrition["sorted_NER"]
data_w_nutrition['sorted_joined_NER'] = sorted_ingredient_lists.str.join(" ")

data_w_nutrition.sample()

Unnamed: 0,link,NER,Nutrition_facts_unstructured,title,sorted_NER,sorted_joined_NER
9413,www.allrecipes.com/recipe/255460/apple-fritter...,"[cooking spray, white sugar, butter, eggs, van...",Nutrition Facts\nServings Per Recipe 20\nCalor...,Apple Fritter Bread,"[all-purpose, apple pie filling, baking soda, ...",all-purpose apple pie filling baking soda butt...


In [180]:
# max ingredient count in a recipe:
max_number_ingredients_per_recipe = data_w_nutrition.NER.apply(len).max()
max_number_ingredients_per_recipe

37

In [181]:
# Run on the Apple-silicon GPU backend (MPS) when torch reports it as
# available, otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"


# Mean pooling: average the token embeddings, counting only unmasked tokens.
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dimension 1, weighted by the attention mask.

    Args:
        model_output: Indexable model result whose element 0 holds the token
            embeddings.
        attention_mask: Mask tensor; it is given a trailing dimension and
            expanded to the size of the token embeddings, so positions with
            value 0 contribute nothing to the average.

    Returns:
        The masked sum of the token embeddings over dimension 1, divided by
        the mask sum over dimension 1.
    """
    embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    masked_total = (embeddings * mask).sum(1)
    # Clamp the denominator so a fully masked row divides by 1e-9, not 0.
    mask_total = mask.sum(1).clamp(min=1e-9)
    return masked_total / mask_total

EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# (tokenizer, model) pairs keyed by device. Filled lazily on first use so the
# weights are loaded (and the Hugging Face hub contacted) once, not per call.
_embedding_backends = {}


def _load_embedding_backend():
    """Return the cached (tokenizer, model) pair for the current `device`."""
    if device not in _embedding_backends:
        tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
        model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)
        model.to(device=device)
        _embedding_backends[device] = (tokenizer, model)
    return _embedding_backends[device]


def get_embedding(sentence: str) -> torch.Tensor:
    """Embed `sentence` with all-MiniLM-L6-v2 followed by masked mean pooling.

    The tokenizer and model are loaded on the first call and reused afterwards.
    Previously both were re-instantiated on every call, which is very slow when
    embedding a whole corpus and triggers a hub request for every sentence.

    Args:
        sentence: Text to embed (here: the space-joined, sorted ingredients).

    Returns:
        The mean-pooled embedding tensor, located on `device`.
    """
    tokenizer, model = _load_embedding_backend()

    # NOTE(review): `max_length` is measured in tokens, but the cap used here is
    # the largest *ingredient count* of any recipe. Longer token sequences are
    # truncated - confirm that this limit is intended before changing it, since
    # previously cached embeddings were produced with it.
    encoded_input = tokenizer(
        sentence,
        padding=True,
        truncation=True,
        max_length=max_number_ingredients_per_recipe,
        return_tensors="pt",
    ).to(device=device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Collapse the per-token embeddings into a single sentence embedding.
    return mean_pooling(model_output, encoded_input["attention_mask"])

# Relabel the rows 0..n-1 (the old labels are discarded) so that each row's
# label equals its position; the embedding loop later stores one embedding per
# row under str(label) and looks the row up by that same number.
data_w_nutrition = data_w_nutrition.reset_index(drop=True)
# corpus:
# sentences = data_w_nutrition.sorted_joined_NER.tolist()


# sentences_embeddings = data_w_nutrition.sorted_joined_NER.progress_apply(get_embedding)

In [182]:
# Spot-check the text that will be embedded for the first recipe.
data_w_nutrition.sorted_joined_NER.values[0]

'jiggers vodka orange juice'

In [183]:
# Preview the frame, including the new `sorted_joined_NER` column.
data_w_nutrition.head()

Unnamed: 0,link,NER,Nutrition_facts_unstructured,title,sorted_NER,sorted_joined_NER
0,www.allrecipes.com/recipe/241895/deconstructed...,"[orange juice, jiggers vodka]",Nutrition Facts\nServings Per Recipe 1\nCalori...,Deconstructed Screwdriver (The Raw Egg),"[jiggers vodka, orange juice]",jiggers vodka orange juice
1,www.allrecipes.com/recipe/20808/kettle-corn/,"[vegetable oil, white sugar, popcorn kernels]",Nutrition Facts\nServings Per Recipe 5\nCalori...,Kettle Corn,"[popcorn kernels, vegetable oil, white sugar]",popcorn kernels vegetable oil white sugar
2,www.allrecipes.com/recipe/238733/pops-fabulous...,"[eggplants, green bell peppers, red bell peppe...",Nutrition Facts\nServings Per Recipe 10\nCalor...,Pop'S Fabulous Ajvar,"[cooking spray, corn oil, eggplants, garlic, g...",cooking spray corn oil eggplants garlic green ...
3,www.allrecipes.com/recipe/241254/chipotle-mang...,"[avocados, tomatoes, mango, cilantro, red onio...",Nutrition Facts\nServings Per Recipe 20\nCalor...,Chipotle-Mango Guacamole,"[avocados, black pepper, cayenne pepper, cilan...",avocados black pepper cayenne pepper cilantro ...
4,www.allrecipes.com/recipe/261696/basic-homemad...,"[milk, heavy whipping cream, white vinegar, salt]",Nutrition Facts\nServings Per Recipe 4\nCalori...,Basic Homemade Ricotta Cheese,"[heavy whipping cream, milk, salt, white vinegar]",heavy whipping cream milk salt white vinegar


In [186]:

# Embed every row's joined ingredient string. Results are cached on disk under
# str(row label) so that an interrupted run can resume where it stopped.
EMBEDDINGS_CACHE_PATH = 'recipe_title_embeddings.pkl'

try:
    # NOTE(security): pickle.load can execute arbitrary code. Only load cache
    # files that were written by this notebook.
    with open(EMBEDDINGS_CACHE_PATH, "rb") as fIn:
        recipe_title_embeddings = pickle.load(fIn)
except FileNotFoundError:
    # Fresh run: nothing has been cached yet.
    recipe_title_embeddings = {}

newly_embedded = 0
try:
    # Pair each row label with its own text directly; building a full row
    # Series via `iloc` on every iteration is unnecessary work.
    for index, ner in tqdm(
        zip(
            data_w_nutrition.index.values,
            data_w_nutrition.sorted_joined_NER.values,
        ),
        total=len(data_w_nutrition),
    ):
        title_index = str(index)
        if title_index not in recipe_title_embeddings:
            recipe_title_embeddings[title_index] = get_embedding(sentence=ner)
            newly_embedded += 1
except KeyboardInterrupt:
    # A manual stop is an expected way to pause this long job; swallow it so
    # the cell ends cleanly. Progress is written to disk below.
    pass
finally:
    # Save on *any* exit (interrupt, network error, completion) so finished
    # work is never lost; skip the write when nothing new was computed.
    if newly_embedded:
        with open(EMBEDDINGS_CACHE_PATH, "wb") as fOut:
            pickle.dump(
                recipe_title_embeddings, fOut, protocol=pickle.HIGHEST_PROTOCOL
            )


  0%|          | 0/50980 [00:00<?, ?it/s]

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 3ad351ab-8325-416d-b32c-5e9a6c4343ea)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/config.json
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 6a727b90-3900-4cfc-a277-c020af361c2d)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/tokenizer_config.json


In [187]:
# Write the current embedding dictionary to its final file name.
with open('recipe_title_embeddings_final.pkl', "wb") as final_file:
    pickle.dump(
        recipe_title_embeddings,
        final_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )


Core version: 10.0.0
Pillow version: 9.5.0
[autoreload of PIL.Image failed: Traceback (most recent call last):
  File "/Users/man-top/miniconda3/envs/foodflex/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/Users/man-top/miniconda3/envs/foodflex/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 475, in superreload
    module = reload(module)
             ^^^^^^^^^^^^^^
  File "/Users/man-top/miniconda3/envs/foodflex/lib/python3.11/importlib/__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 621, in _exec
  File "<frozen importlib._bootstrap_external>", line 940, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "/Users/man-top/miniconda3/envs/foodflex/lib/python3.11/site-packages/PIL/Image.py", line 111, in <module>
    raise ImportError(msg)
ImportError: The _imaging

In [None]:
# NOTE(review): `corpus_embeddings` and `query_embeddings` are not assigned in
# any visible cell above; they must be built (presumably from
# `recipe_title_embeddings` - confirm) before this cell can run on a fresh kernel.
corpus_embeddings = corpus_embeddings.to(device)
corpus_embeddings = util.normalize_embeddings(corpus_embeddings)

query_embeddings = query_embeddings.to(device)
# `query_embeddings` is a tensor here (it was just moved with `.to`), so the
# mean over its first dimension is taken with torch. `np.mean` does not work on
# torch tensors: it forwards `out=`/`dtype=` keywords that `Tensor.mean` rejects.
# NOTE(review): `average_query_embedding` is not used by the search below.
average_query_embedding = query_embeddings.mean(dim=0)
query_embeddings = util.normalize_embeddings(query_embeddings)

# Both sides are normalized above, and the search scores with the dot product.
hits = util.semantic_search(
    query_embeddings, corpus_embeddings, score_function=util.dot_score
)