In [1]:
import pandas as pd
from ast import literal_eval
#Several CSV columns hold stringified Python literals; this converter parses them back on load
def generic(x):
    return literal_eval(x)

#Every column listed here is run through `generic` while the CSV is being read
conv = dict.fromkeys(
    [
        'nutrition', 'steps', 'ingredients', 'id_column', 'jaccard_similarity',
        'diff', 'recipes', 'ingredients_original', 'tags',
    ],
    generic,
)

#Recipe table; the 'Unnamed: 0' column (a leftover saved index, presumably) is dropped right away
df = pd.read_csv(
    r"C:\Users\01din\PycharmProjects\thesis\data\cleaned_recipes\recipes_with_JS.csv",
    converters=conv,
).drop(columns=['Unnamed: 0'])

#Ingredient table; 'nutrition' may be missing for some rows, so only non-null cells are parsed
df_ingredients = pd.read_csv(
    r"C:\Users\01din\PycharmProjects\thesis\data\ingredients\ingredients_nutrition.csv"
).drop(columns=['Unnamed: 0'])
df_ingredients['nutrition'] = df_ingredients['nutrition'].apply(
    lambda cell: cell if pd.isnull(cell) else literal_eval(cell)
)

#Survey answers, loaded as-is
survey_df = pd.read_csv(r'C:\Users\01din\PycharmProjects\thesis\data\survey_results\survey_results.csv')

In [2]:
#Add "*_analogy" columns in which the words of an ingredient name are joined by underscores

#Recipe ingredient lists: split each name on whitespace and re-join with '_'
df['ingredients_original_analogy'] = df['ingredients_original'].apply(
    lambda names: ['_'.join(name.split()) for name in names]
)

#Survey pairs and the ingredient table: literal space -> underscore replacement
survey_df['replaced_analogy'] = survey_df['replaced'].str.replace(' ', '_', regex=False)
survey_df['replacement_analogy'] = survey_df['replacement'].str.replace(' ', '_', regex=False)
df_ingredients['ingredient_analogy'] = df_ingredients['ingredient'].str.replace(' ', '_', regex=False)

In [3]:
from gensim.models import Word2Vec
from collections import defaultdict

#Word2Vec model that was trained on all steps + ingredients of the dataset
model_path = r"C:\Users\01din\PycharmProjects\thesis\models\w2v\recipe_word2vec.model"
w2v_model = Word2Vec.load(model_path)

#Keep only the positive survey results (rows whose 'result' is 'Yes')
survey_df = survey_df.loc[survey_df['result'].eq('Yes')]

#One row per distinct (replaced, replacement) pair
unique_pairs = survey_df.drop_duplicates(subset=['replaced_analogy', 'replacement_analogy'])
#Method which finds the most common output of ingredient D
def find_most_common_replacement(ingredient_C, topn=100):
    """Rank substitutes for ``ingredient_C`` via word2vec analogies over the survey pairs.

    For every distinct (replaced, replacement) pair in the module-level ``survey_df``
    the model is queried for ``replacement - replaced + ingredient_C``. The first
    returned neighbour that is a known ingredient and does not share the
    ``Long_Desc`` of ``ingredient_C`` gets one vote.

    Reads the module-level ``df_ingredients``, ``survey_df`` and ``w2v_model``.

    Args:
        ingredient_C: Ingredient to find a substitute for; spaces are converted
            to underscores before any lookup.
        topn: Number of nearest neighbours requested from the model per pair
            (default 100, the value that used to be hard-coded).

    Returns:
        dict mapping candidate ingredient (underscore form) to its vote count,
        ordered by descending count.

    Raises:
        IndexError: if ``ingredient_C`` is not present in
            ``df_ingredients['ingredient_analogy']``.
    """
    ingredient_C = ingredient_C.replace(' ', '_')

    #Build the ingredient -> Long_Desc lookup once per call instead of scanning the
    #dataframe for every candidate; keep='first' mirrors the former `.values[0]` lookups
    long_desc_by_ingredient = (
        df_ingredients
        .drop_duplicates(subset='ingredient_analogy', keep='first')
        .set_index('ingredient_analogy')['Long_Desc']
        .to_dict()
    )

    #Get the Long_Desc of ingredient C, used to exclude trivial solutions
    if ingredient_C not in long_desc_by_ingredient:
        raise IndexError(f"Ingredient {ingredient_C!r} not found in df_ingredients['ingredient_analogy']")
    long_desc_C = long_desc_by_ingredient[ingredient_C]

    #Init dict which will hold replacements
    predicted_replacements = defaultdict(int)

    #Iterate over each unique pair of replaced and replacement from the survey
    #The query vector is: replacement - replaced + C = D
    unique_pairs = survey_df.drop_duplicates(subset=['replaced_analogy', 'replacement_analogy'])

    for replaced, replacement in zip(unique_pairs['replaced_analogy'], unique_pairs['replacement_analogy']):
        #Predict Ingredient D; only the model query is guarded so that an unrelated
        #KeyError further down is never misreported as a vocabulary miss
        try:
            most_similar = w2v_model.wv.most_similar(positive=[replacement, ingredient_C], negative=[replaced], topn=topn)
        except KeyError:
            print(f"One of the ingredients: {replaced}, {replacement}, or {ingredient_C} not in model vocabulary")
            continue

        for ingredient_D, _similarity in most_similar:
            #Skip if ingredient_D is not a known ingredient, as foods might have different names
            #in the instructions but not exist in the ingredient set (no Long_Desc to compare)
            if ingredient_D not in long_desc_by_ingredient:
                continue

            #Skip trivial solutions that share the same Long_Desc (USDA reference ingredient)
            if long_desc_by_ingredient[ingredient_D] == long_desc_C:
                continue

            #Count the vote and stop: only the best valid neighbour per pair is used
            predicted_replacements[ingredient_D] += 1
            break

    #Sort dict by count and return
    sorted_predictions = dict(sorted(predicted_replacements.items(), key=lambda item: item[1], reverse=True))

    return sorted_predictions

In [4]:
import torch
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification
#Use the GPU when torch reports one, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

#Tokenizer comes from the stock 'bert-base-uncased' checkpoint; the classifier weights are loaded from disk
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    r"C:\Users\01din\PycharmProjects\thesis\models\classifier"
)

#Move the classifier to the selected device and show which one is in use
model.to(device)
print(device)

cuda


In [5]:
def test_replacements(df, recipe_id, df_ingredients, ingredient_to_replace, model, tokenizer, device):
    recipe_row = df[df['id'] == recipe_id]

    original_recipe = recipe_row['steps'].apply(' '.join).iloc[0]
    original_ingredient_desc = df_ingredients[df_ingredients.ingredient == ingredient_to_replace].FdGrp_Desc.iloc[0]
    original_ingredient_long_desc = df_ingredients[df_ingredients.ingredient == ingredient_to_replace].Long_Desc.iloc[0]
    all_predictions = []

    for i, row in df_ingredients.iterrows():
        if row['FdGrp_Desc'] == original_ingredient_desc and row['Long_Desc'] != original_ingredient_long_desc:
            replacement_ingredient = row['ingredient']
            if replacement_ingredient != ingredient_to_replace:
                #Format the recipe text
                test_recipe = original_recipe + ' [REPLACED] ' + ingredient_to_replace + ' [REPLACEMENT] ' + replacement_ingredient
                encoding = tokenizer.encode_plus(test_recipe, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
                input_ids = encoding['input_ids'].to(device)
                attention_mask = encoding['attention_mask'].to(device)

                model.eval()
                with torch.no_grad():
                    outputs = model(input_ids, attention_mask=attention_mask)
                    logits = outputs.logits
                    _, predicted_labels = torch.max(logits, dim=1)
                    confidence = torch.nn.functional.softmax(logits, dim=1)[:, 1].item()  #The confidence of label 1

                all_predictions.append({
                    'replacement_ingredient': replacement_ingredient,
                    'predicted_label': predicted_labels.item(),
                    'confidence': confidence
                })

    #Sort by confidence for label 1, or correct substitutions
    all_predictions.sort(key=lambda x: -x['confidence'] if x['predicted_label'] == 1 else 1)

    return all_predictions


In [6]:
#Rank classifier substitutions for 'cashews' in recipe 263659 and display them
results = test_replacements(
    df=df,
    recipe_id=263659,
    df_ingredients=df_ingredients,
    ingredient_to_replace='cashews',
    model=model,
    tokenizer=tokenizer,
    device=device,
)
results

[{'replacement_ingredient': 'pecan pieces',
  'predicted_label': 1,
  'confidence': 0.9585049152374268},
 {'replacement_ingredient': 'pecans',
  'predicted_label': 1,
  'confidence': 0.9544975161552429},
 {'replacement_ingredient': 'unsalted cashews',
  'predicted_label': 1,
  'confidence': 0.9505739808082581},
 {'replacement_ingredient': 'roasted cashews',
  'predicted_label': 1,
  'confidence': 0.9502045512199402},
 {'replacement_ingredient': 'salted cashews',
  'predicted_label': 1,
  'confidence': 0.9492326378822327},
 {'replacement_ingredient': 'ground pecans',
  'predicted_label': 1,
  'confidence': 0.9468134641647339},
 {'replacement_ingredient': 'pistachios',
  'predicted_label': 1,
  'confidence': 0.9320209622383118},
 {'replacement_ingredient': 'pepitas',
  'predicted_label': 1,
  'confidence': 0.9285652041435242},
 {'replacement_ingredient': 'hazelnuts',
  'predicted_label': 1,
  'confidence': 0.9159497618675232},
 {'replacement_ingredient': 'pecan halves',
  'predicted_labe

In [18]:
def print_recipe(recipe):
    """Print the name, original ingredients and instruction steps of a recipe.

    Args:
        recipe: DataFrame with ``name``, ``ingredients_original`` and ``steps``
            columns; only the first row is printed.
    """
    #Column indexing instead of attribute access, and str() on every field so a
    #non-string name (e.g. NaN) is printed instead of raising TypeError
    print('Recipe name: ' + str(recipe['name'].values[0]))
    print('\nIngredients: ' + str(recipe['ingredients_original'].values[0]))
    print('\nInstructions: ' + str(recipe['steps'].values[0]))
def select_non_spice_ingredient(ingredients_list, df_ingredients):
    """Pick a random ingredient whose food group is not 'Spices and Herbs'.

    The eligible ingredients are determined first and one of them is drawn
    uniformly, so the call always terminates (the previous rejection-sampling
    loop never returned when every ingredient was a spice).

    Args:
        ingredients_list: Ingredient names to choose from.
        df_ingredients: Ingredient table with ``ingredient`` and ``FdGrp_Desc``
            columns.

    Returns:
        One element of ``ingredients_list`` that is listed in ``df_ingredients``
        with a food group other than 'Spices and Herbs'.

    Raises:
        IndexError: if no ingredient qualifies (empty list, only spices, or no
            name found in ``df_ingredients``) - the same exception type
            ``random.choice`` raises for an empty sequence.
    """
    #Food group of the first matching row per name, as the former `.values[0]` lookup used
    known = df_ingredients[df_ingredients['ingredient'].isin(ingredients_list)]
    food_group_by_name = (
        known
        .drop_duplicates(subset='ingredient', keep='first')
        .set_index('ingredient')['FdGrp_Desc']
        .to_dict()
    )

    #Names missing from the table are skipped: their food group cannot be checked
    candidates = [
        ingredient
        for ingredient in ingredients_list
        if ingredient in food_group_by_name
        and food_group_by_name[ingredient] != 'Spices and Herbs'
    ]
    if not candidates:
        raise IndexError(f"No non-spice ingredient available in {list(ingredients_list)!r}")

    return random.choice(candidates)

In [19]:
import random
def generate_substitution(n):
    """Print substitution suggestions from both tools for one randomly sampled recipe.

    Samples a single row from the module-level ``df``, picks a non-spice
    ingredient from it, prints the recipe and then prints the first ``n``
    suggestions of the food analogy tool and of the classifier.

    Args:
        n: Maximum number of suggestions shown per tool.
    """
    sampled_recipe = df.sample(1)
    recipe_id = sampled_recipe['id'].values[0]
    recipe_ingredients = sampled_recipe['ingredients_original'].values[0]
    ingredient_to_replace = select_non_spice_ingredient(recipe_ingredients, df_ingredients)

    print_recipe(sampled_recipe)
    print(f"\nIngredient to be replaced: {ingredient_to_replace}")

    #Analogy tool: the returned dict is already ordered by vote count
    analogy_votes = find_most_common_replacement(ingredient_to_replace)
    analogy_solutions = list(analogy_votes)[:n]

    #Classifier: keep only the ingredient names of the first n ranked predictions
    ranked_predictions = test_replacements(df, recipe_id, df_ingredients, ingredient_to_replace, model, tokenizer, device)
    classifier_solutions = [prediction['replacement_ingredient'] for prediction in ranked_predictions[:n]]

    print(f"\nFood analogy tool suggests {analogy_solutions}.\nClassifier suggests {classifier_solutions}.")

In [25]:
#Sample one random recipe and print up to 5 substitution suggestions from each tool
generate_substitution(5)

Recipe name: peppermint patty liqueur

Ingredients: ['vodka', 'whipping cream', 'sweetened condensed milk', 'chocolate syrup', 'vanilla', 'peppermint extract']

Instructions: ['whisk ingredients together in a large measuring cup', 'pour into decorative bottles , seal & refrigerate for up to 2 weeks']

Ingredient to be replaced: sweetened condensed milk

Food analogy tool suggests ['white_chocolate', 'oreo_cookies', 'bittersweet_chocolate_chips', 'semisweet_chocolate', 'maraschino_cherries'].
Classifier suggests ['evaporated milk', 'skim milk', 'evaporated skim milk', 'milk', 'carnation evaporated milk'].


'Vegetables and Vegetable Products'