In [1]:
import pandas as pd
from ast import literal_eval
#Survey results (filtered for positive answers further down)
survey_df = pd.read_csv(r'C:\Users\01din\PycharmProjects\thesis\data\survey_results\survey_results.csv')

#Converter which turns a stringified Python literal from the CSV back into an object
def generic(x):
    return literal_eval(x)

#Every column listed here is parsed with the literal converter while reading
conv = dict.fromkeys(
    [
        'nutrition',
        'steps',
        'ingredients',
        'id_column',
        'jaccard_similarity',
        'diff',
        'recipes',
        'ingredients_original',
        'tags',
    ],
    generic,
)

#Recipes; the leftover index column written by a previous to_csv is removed
df = pd.read_csv(
    r"C:\Users\01din\PycharmProjects\thesis\data\cleaned_recipes\recipes_with_JS.csv",
    converters=conv,
).drop(columns=['Unnamed: 0'])

#Ingredient table; 'nutrition' may be missing, so it is only parsed where a value is present
df_ingredients = pd.read_csv(
    r"C:\Users\01din\PycharmProjects\thesis\data\ingredients\ingredients_nutrition.csv"
).drop(columns=['Unnamed: 0'])
df_ingredients['nutrition'] = df_ingredients['nutrition'].apply(
    lambda raw: raw if pd.isnull(raw) else literal_eval(raw)
)

#Results which are extended with the analogy replacements below
bert_results = pd.read_csv(
    r'C:\Users\01din\PycharmProjects\thesis\data\results_to_evaluate\bert_results.csv',
    converters=conv,
)

Replace spaces with underscores in all ingredient names so that each ingredient name is a single token that can be passed directly to `wv.most_similar`.

In [2]:
#Join the whitespace-separated words of every original ingredient name with underscores
df['ingredients_original'] = df['ingredients_original'].apply(
    lambda names: ['_'.join(name.split()) for name in names]
)

#Replace spaces with underscores in both survey columns and in the ingredient table
for column in ('replaced', 'replacement'):
    survey_df[column] = survey_df[column].str.replace(' ', '_')
df_ingredients['ingredient'] = df_ingredients['ingredient'].str.replace(' ', '_')

#Same replacement for the ingredient which has to be replaced in the results
bert_results['replaced'] = bert_results['replaced'].apply(lambda name: name.replace(' ', '_'))

Method which carries out the Food Analogy formula.
It takes the inputs A, B and C and outputs D.
A and B are chosen such that A is an ingredient which can be replaced by B.
C is the ingredient which needs to be replaced.
D is calculated through vector arithmetic, $D \approx B - A + C$: the vocabulary entries closest to that vector are the candidates for D.

In [3]:
from gensim.models import Word2Vec
from collections import defaultdict

#Word2Vec model which was trained on all steps + ingredients of the dataset
model_path = r"C:\Users\01din\PycharmProjects\thesis\models\w2v\recipe_word2vec.model"
model = Word2Vec.load(model_path)

#Keep only the survey rows with a positive result
is_positive = survey_df['result'] == 'Yes'
survey_df = survey_df[is_positive]

#One row per distinct (replaced, replacement) pair, keeping the first occurrence
unique_pairs = survey_df.drop_duplicates(subset=['replaced', 'replacement'], keep='first')
#Method which finds the most common output of ingredient D
def find_most_common_replacement(ingredient_C):
    """Count which ingredient D the food analogy proposes most often for ``ingredient_C``.

    For every unique (replaced, replacement) pair of ``survey_df`` (A, B) the
    model is queried with ``positive=[B, C]`` and ``negative=[A]``, i.e.
    D ~ B - A + C. The first of the 100 returned candidates that is a known
    ingredient and does not share the ``Long_Desc`` of C gets one vote.

    Reads the module-level ``survey_df``, ``df_ingredients`` and ``model``.

    Parameters
    ----------
    ingredient_C : str
        Ingredient which needs to be replaced (underscore-separated name).

    Returns
    -------
    dict
        Maps each proposed ingredient D to its number of votes, sorted by
        descending vote count (ties keep the order of first appearance).
        Empty if no pair produced a valid candidate.

    Notes
    -----
    If ``ingredient_C`` is not listed in ``df_ingredients`` a message is
    printed and the ``Long_Desc`` filter is skipped, instead of failing with
    an ``IndexError`` as before. Pairs whose words are missing from the model
    vocabulary are reported and skipped.
    """
    #Init dict which will hold replacements
    predicted_replacements = defaultdict(int)

    #Map every ingredient to its Long_Desc once, so each candidate costs one dict lookup
    #instead of a boolean-mask scan over the whole table.
    #drop_duplicates keeps the first row per ingredient, which matches the former `.values[0]`
    known_ingredients = df_ingredients.drop_duplicates(subset='ingredient')
    long_desc_by_ingredient = dict(zip(known_ingredients['ingredient'], known_ingredients['Long_Desc']))

    #Get the Long_Desc of ingredient C, used to exclude trivial solutions
    has_long_desc_C = ingredient_C in long_desc_by_ingredient
    long_desc_C = long_desc_by_ingredient.get(ingredient_C)
    if not has_long_desc_C:
        print(f"{ingredient_C} not in the ingredient table, candidates with the same Long_Desc cannot be excluded")

    #Iterate over each unique pair of replaced and replacement from the survey
    #These are later taken as A and B in the equation B-A+C=D
    unique_pairs = survey_df.drop_duplicates(subset=['replaced', 'replacement'])

    for replaced, replacement in zip(unique_pairs['replaced'], unique_pairs['replacement']):
        #Predict Ingredient D
        #Only the model query is guarded, so a KeyError raised elsewhere is not
        #mistaken for a missing vocabulary entry
        try:
            most_similar = model.wv.most_similar(positive=[replacement, ingredient_C], negative=[replaced], topn=100)
        except KeyError:
            print(f"One of the ingredients: {replaced}, {replacement}, or {ingredient_C} not in model vocabulary")
            continue

        for ingredient_D, _ in most_similar:
            #Skip if ingredient_D is not a known ingredient, as foods might have different names in the instructions but not exist in the ingredient set
            #This then also does not allow the checking for the same Long_Desc so exclude those
            if ingredient_D not in long_desc_by_ingredient:
                continue

            #Check if they have the same Long_Desc (USDA reference ingredient)
            if has_long_desc_C and long_desc_by_ingredient[ingredient_D] == long_desc_C:
                continue

            #Add to dictionary
            predicted_replacements[ingredient_D] += 1
            break  # break the loop as we have found a valid ingredient_D

        #If no valid ingredient_D was found in the loop, print a message
        else:
            print(f"No valid replacement found for {ingredient_C} in the context of {replaced} -> {replacement}")

    #Sort dict by count and return
    sorted_predictions = dict(sorted(predicted_replacements.items(), key=lambda item: item[1], reverse=True))

    return sorted_predictions

#Output test: show the vote counts for a single ingredient
ingredient_to_replace = "steak"  # Replace with your ingredient
predicted_votes = find_most_common_replacement(ingredient_to_replace)
print(predicted_votes)


{'pork': 77, 'chicken': 35, 'salmon': 14, 'meat': 12, 'spareribs': 10, 'lamb': 6, 'halibut': 5, 'tofu': 4, 'tuna': 3, 'eggplant': 3, 'bologna': 2, 'ciabatta': 2, 'boneless_skinless_chicken_breast_halves': 2, 'cube_steaks': 2, 'pork_tenderloin': 2, 'sirloin_steaks': 2, 'fish': 2, 'whole_chicken_breasts': 2, 'sausages': 2, 'barbecue_sauce': 2, 'msg': 2, 'meat_tenderizer': 2, 'crisco': 1, 'boneless_pork_loin_roast': 1, 'spam': 1, 'oregano_leaves': 1, 'shrimp': 1, 'double-acting_baking_powder': 1, 'boneless_chicken_breasts': 1, 'lemongrass': 1, 'beef_bouillon_cubes': 1, 'prosciutto': 1, 'spicy_mustard': 1, 'meatballs': 1, 'turkey_gravy': 1, 'roasting_chicken': 1, 'boneless_pork_chops': 1, 'yukon_gold_potatoes': 1, 'fruit': 1, 'cajun_seasoning': 1, 'avocado': 1, 'tempeh': 1, 'baguette': 1, 'guar_gum': 1, 'biscuits': 1, 'sriracha_sauce': 1, 'hash_brown_potatoes': 1, 'rib_eye_steaks': 1}


In [4]:
#Method which gets the keys with the highest values from the output dict
#NOTE(review): the earlier comment spoke of two keys, the code has always returned five
def get_5_highest_keys(d, n=5):
    """Return the keys of ``d`` with the largest values, best first.

    Parameters
    ----------
    d : dict
        Maps keys to comparable values (here: ingredient -> vote count).
    n : int, optional
        Maximum number of keys to return, 5 by default. Values below zero
        are treated as zero.

    Returns
    -------
    list
        At most ``n`` keys ordered by descending value. Keys with equal values
        keep their order of insertion in ``d``. Empty for an empty dict.
    """
    #sorted is stable, also with reverse=True, so ties keep their insertion order
    ranked = sorted(d.items(), key=lambda item: item[1], reverse=True)
    return [key for key, _ in ranked[:max(n, 0)]]

#Collect the vote counts for every ingredient which has to be replaced ...
analogy_votes = bert_results['replaced'].apply(find_most_common_replacement)
#... and keep the keys returned by get_5_highest_keys as the analogy replacements
bert_results['analogy_replacement'] = analogy_votes.apply(get_5_highest_keys)

In [5]:
#Write the results including the analogy_replacement column, without the index
bert_results.to_csv(
    path_or_buf=r'C:\Users\01din\PycharmProjects\thesis\data\results_to_evaluate\results.csv',
    index=False,
)

In [6]:
#Display the results frame, which now also holds the analogy_replacement column
bert_results

Unnamed: 0,id,replaced,replacements,analogy_replacement
0,263659,cashews,"['pecan pieces', 'pecans', 'raw cashews', 'uns...","[pistachios, peanuts, apricots, almonds, pumpk..."
1,344624,diced_tomatoes,"['diced tomatoes', 'cherry tomatoes', 'tomatoe...","[stewed_tomatoes, crushed_tomatoes, chopped_to..."
2,19881,red_bell_pepper,"['pimientos', 'jalapeno chiles', 'jalapeno', '...","[yellow_bell_pepper, yellow_sweet_pepper, whit..."
3,470964,ground_beef,"['hamburger meat', 'extra lean ground beef', '...","[lean_ground_beef, ground_chuck, hamburger_mea..."
4,168586,worcestershire_sauce,"['sriracha sauce', 'heinz 57 steak sauce', 'ad...","[steak_sauce, chili_sauce, prepared_mustard, t..."
5,34403,sugar,"['unbleached cane sugar', 'demerara sugar', 't...","[honey, molasses, granulated_sugar, white_suga..."
6,448396,dijon_mustard,"['dijon mustard', 'honey dijon mustard', 'grai...","[tarragon_vinegar, prepared_horseradish, white..."
7,253207,unsalted_butter,"['unsalted butter', 'sweet unsalted butter', '...","[salted_butter, light_butter, smart_balance_bu..."
8,256098,vegetable_broth,"['canned chicken broth', 'fat free chicken bro...","[reduced-sodium_chicken_broth, chicken_stock, ..."
9,209474,yellow_onion,"['yellow onion', 'sweet potato', 'vidalia onio...","[sweet_onion, sweet_onions, leek, shallot, low..."


In [9]:
#Inspect the vocabulary entries the model considers most similar to 'sugar'
model.wv.most_similar('sugar')

[('honey', 0.6941251754760742),
 ('splenda', 0.6173053979873657),
 ('molasses', 0.6060909032821655),
 ('sugars', 0.5962899923324585),
 ('sugar&', 0.5771386623382568),
 ('granulated_sugar', 0.5520997047424316),
 ('white_sugar', 0.548149585723877),
 ('nutmeg', 0.5481245517730713),
 ('rum', 0.546095073223114),
 ('cocoa', 0.5360195636749268)]