In [7]:
import ast

import pandas as pd
from ast import literal_eval

generic = lambda x: literal_eval(x)
conv = {'nutrition' : generic, 'steps' : generic, 'ingredients' : generic, 'id_column' : generic, 'jaccard_similarity' : generic, 'diff' : generic, 'recipes' : generic, 'ingredients_original' : generic, 'tags': generic}
df = pd.read_csv(r"C:\Users\01din\PycharmProjects\thesis\data\cleaned_recipes\recipes_with_JS.csv", converters=conv)
df.drop(['Unnamed: 0'], inplace=True, axis=1)
def string_to_list(string_repr):
    return ast.literal_eval(string_repr)
df['tokenized'] = df['tokenized'].apply(lambda x: string_to_list(x) if isinstance(x, str) else x)

df_ingredients = pd.read_csv(r"C:\Users\01din\PycharmProjects\thesis\data\ingredients\ingredients_nutrition.csv")
df_ingredients.drop(['Unnamed: 0'], inplace=True, axis=1)
df_ingredients['nutrition'] = df_ingredients['nutrition'].apply(lambda x: ast.literal_eval(x) if pd.notnull(x) else x)
similar = pd.read_pickle(r'C:\Users\01din\PycharmProjects\thesis\data\similar/similar_sets_pickle.csv')
interactions = pd.read_csv(r"C:\Users\01din\PycharmProjects\thesis/data/raw/interactions/RAW_interactions.csv")
diverse_recipes_df = pd.read_csv(r'C:\Users\01din\PycharmProjects\thesis\data\cleaned_recipes\recipes_to_survey.csv', converters=conv)
diverse_recipes_df.drop(['Unnamed: 0'], inplace=True, axis=1)

Recipes which have more data are more interesting to us, let's sort s.t. the first row is the row with the most substitutions

In [3]:
def replace_ingredient(subs_list, old_value, new_value):
    for d in subs_list:
        if d["ingredient"] == old_value:
            d["ingredient"] = new_value
    return subs_list

# Correction - this was wrongly labeled during the ingredient labeling
old_ingredient = "MORNINGSTAR FARMS Grillers Vegan, frozen, unprepared"
new_ingredient = "Margarine, regular, 80% fat, composite, tub, without salt"

similar["Subs"] = similar["Subs"].apply(lambda x: replace_ingredient(x, old_ingredient, new_ingredient))

In [4]:
similar = similar.sort_values(by='Subs', key=lambda x: x.apply(len), ascending=False)
similar.reset_index(drop=True, inplace=True)
similar

Unnamed: 0,Key,Subs
0,"(Salt, table, Egg, whole, raw, fresh, Wheat fl...","[{'ingredient': 'Vegetable oil, palm kernel', ..."
1,"(Yeast extract spread, Beverages, water, tap, ...","[{'ingredient': 'Oil, corn, peanut, and olive'..."
2,"(Salt, table, Egg, whole, raw, fresh, Wheat fl...","[{'ingredient': 'Snacks, popcorn, oil-popped, ..."
3,"(Egg, whole, raw, fresh, Wheat flour, whole-gr...","[{'ingredient': 'Butter, without salt', 'id': ..."
4,"(Salt, table, Wheat flour, whole-grain, Sugars...","[{'ingredient': 'Seeds, sesame seeds, whole, d..."
...,...,...
9276,"(Vinegar, distilled, Leavening agents, baking ...","[{'ingredient': 'Margarine, regular, 80% fat, ..."
9277,"(Salt, table, Wheat flour, whole-grain, Sugars...","[{'ingredient': 'Cream, fluid, heavy whipping'..."
9278,"(Spices, pepper, red or cayenne, Spinach, raw,...","[{'ingredient': 'Gravy, mushroom, canned', 'id..."
9279,"(Alcoholic beverage, pina colada, prepared-fro...","[{'ingredient': 'Cream, half and half, fat fre..."


Function which finds the base recipe, which is the recipe which will be edited. It is chosen as the recipe with the most ratings from all recipes in that set.

In [9]:
def find_base_recipe(subs, value_counts):
    largest_freq = 0
    id = -1
    for item in subs:
        item_id = item['id']
        freq = value_counts.loc[item_id]
        if freq > largest_freq:
            largest_freq = freq
            id = item_id
    return df[df.id == id]

Some other functions which retrieve the other relevant data

In [10]:
def find_base_instructions(subs, value_counts):
    base_recipe = find_base_recipe(subs, value_counts)
    base_instructions = base_recipe.steps
    return base_instructions.tolist()[0]

def find_ingredient_to_substitute(target_id, subs):
    for item in subs:
        if item['id'] == target_id:
            return item['ingredient']
    return None

Some important methods. Comments in code explain details

In [11]:
from math import sqrt

#Dot product: used in cosine similarity
def dot_product(vec1, vec2):
    return sum(a * b for a, b in zip(vec1, vec2))

#Magnitude: also used in cosine similarity
def magnitude(vec):
    return sqrt(sum(x**2 for x in vec))

#Method which does cosine similarity for ingredients specifically
def cosine_similarity_ingr(desc1, desc2):
    #Make sure both dicts have the same keys, as initially some ingredients had missing nutrition values.
    dict1 = df_ingredients[df_ingredients.Long_Desc == desc1].nutrition.iloc[0]
    dict2 = df_ingredients[df_ingredients.Long_Desc == desc2].nutrition.iloc[0]
    if set(dict1.keys()) != set(dict2.keys()):
        raise AssertionError(f"Dictionaries have different keys.\ndesc1: {desc1}\ndesc2: {desc2}")

    #Turn the nutrition values into lists
    vec1 = [dict1[key] for key in sorted(dict1.keys())]
    vec2 = [dict2[key] for key in sorted(dict2.keys())]

    #Classic cosine similarity calculation
    dot_prod = dot_product(vec1, vec2)
    mag1 = magnitude(vec1)
    mag2 = magnitude(vec2)
    if mag1 == 0 or mag2 == 0:
        return 0
    return dot_prod / (mag1 * mag2)

#Method which finds the best ingredient substitute from the set.
def find_ingredient_substitute(ingredient_to_substitute, subs, base_recipe_id):
    #Holds a ranking
    ingredient_ranking = []
    #Save the food category, used to give foods of the same category priority
    category = df_ingredients[df_ingredients.Long_Desc == ingredient_to_substitute].FdGrp_Desc.iloc[0]
    #Iterate over all possible substitutions
    for item in subs:
        #Save ingredient, recipe id, and category of substitution ingredient
        ingredient = item['ingredient']
        recipe_id = item['id']
        category_other = df_ingredients[df_ingredients.Long_Desc == ingredient].FdGrp_Desc.iloc[0]

        #Skip entry if it is equal to the base recipe or if the recipes are the same (a bit redundant to do both I suppose)
        if recipe_id == base_recipe_id or item['ingredient'] == ingredient_to_substitute:
            continue

        #True if both foods are the same category
        is_same_category = category_other == category

        #Calculate similarity value between nutrition of foods
        similarity = cosine_similarity_ingr(ingredient, ingredient_to_substitute)

        #Add dict to list created at the beginning. Holds the substitute, the recipe ID it comes from, the similarity score, and whether it is the same category.
        ingredient_ranking.append({'ingredient': ingredient, 'id': recipe_id, 'similarity': similarity, 'is_same_category': is_same_category})

    #Return sorted list, sort first by whether the category is the same, then by similarity.
    sorted_ingredient_ranking = sorted(ingredient_ranking, key=lambda x: (x['is_same_category'], x['similarity']), reverse=True)

    return sorted_ingredient_ranking

#Method which finds food.com name of the ingredient from the USDA name
def get_original_ingredient_from_ingredient(recipe, ingredient):
    index = recipe.ingredients.iloc[0].index(ingredient)
    return recipe.ingredients_original.iloc[0][index]

Based on this, we can use the most popular recipe (most frequently rated?) as our base recipe: this is the one that we will find a replacement ingredient for.
Then, we can choose the replacement ingredient through heuristics: food category, and nutritional similarity.

The obvious issue is that the two presented recipes are often two different recipes that just seem to have similar ingredients. To prevent this, we can look at the recipe instructions, and the name of the recipe, and add a ranking based on semantic similarity between the two.

It will lead to two lists: a list of nutritionally similar ingredients
                           a list of semenatically similar recipes
We can simply choose the item which ranks best on both lists combined.

In another notebook, a word2vec model has been trained on text related to each recipe

In [1]:
import numpy as np

#Avg word vector of the recipe
def get_avg_word_vector(tokenized_recipe, model, vector_size):
    word_vectors = np.zeros((len(tokenized_recipe), vector_size))

    for i, word in enumerate(tokenized_recipe):
        word_vectors[i, :] = model.wv[word]

    avg_vector = np.mean(word_vectors, axis=0)
    return avg_vector

from sklearn.metrics.pairwise import cosine_similarity

#Cos sim for simple vectors
def get_cosine_similarity(vector1, vector2):
    return cosine_similarity(vector1.reshape(1, -1), vector2.reshape(1, -1))[0][0]


In [13]:
#Methods similar to the one which ranks the ingredient substitutes, only this one does it based on semantic similarity
def get_semantic_ranking(base_recipe_id, subs, df, model, ingredient_to_substitute):
    semantic_ranking = []

    #Get tokenized text of base recipe and turn it into a vector
    base_recipe_tokenized = df.loc[df['id'] == base_recipe_id, 'tokenized'].iloc[0]
    base_recipe_vector = get_avg_word_vector(base_recipe_tokenized, model, vector_size=100)

    #Iterate over subs
    for item in subs:
        recipe_id = item['id']

        #Skip entry if it is equal to the base recipe or if the recipes are the same (a bit redundant to do both I suppose)
        if recipe_id == base_recipe_id or item['ingredient'] == ingredient_to_substitute:
            continue

        other_recipe = df.loc[df['id'] == recipe_id, 'tokenized'].iloc[0]

        #Get vector of other recipe & calculate cosine similarity
        other_recipe_vector = get_avg_word_vector(other_recipe, model, vector_size=100)
        similarity = get_cosine_similarity(base_recipe_vector, other_recipe_vector)

        #Add to dict
        item['semantic_similarity'] = similarity
        semantic_ranking.append(item)

    #Return sorted on semantic similarity. I suppose I should have also added the categoric boolean here. Would have likely been better.
    semantic_ranking.sort(key=lambda x: x['semantic_similarity'], reverse=True)

    return semantic_ranking


In [14]:
from gensim.models import Word2Vec
model_path = r"C:\Users\01din\PycharmProjects\thesis\models\w2v\w2v4js.model"
model = Word2Vec.load(model_path)

In [15]:
#This is what we discussed, where I combine the two lists based on rank.
def combined_best_ranking(ranking, semantic_ranking, top_n=3):
    #Dict to store combined rank
    combined_dict = {}

    #Iterate over nutrition ranking & save rank
    for i, item in enumerate(ranking):
        rank_score = (i + 1)
        combined_dict[item['id']] = {'item': item, 'rank_score': rank_score}

    #Iterate over semantic ranking & save rank
    for i, item in enumerate(semantic_ranking):
        semantic_rank_score = (i + 1)
        if item['id'] in combined_dict:
            combined_dict[item['id']]['semantic_rank_score'] = semantic_rank_score
        else:
            combined_dict[item['id']] = {'item': item, 'semantic_rank_score': semantic_rank_score}

    #Calculate combined scores.
    combined_list = []
    for key, value in combined_dict.items():
        rank_score = value.get('rank_score', len(ranking) + 1)
        semantic_rank_score = value.get('semantic_rank_score', len(semantic_ranking) + 1)
        combined_item = value['item'].copy()
        combined_item['combined_score'] = (rank_score + semantic_rank_score) / 2
        combined_list.append(combined_item)

    #Sort combined lists and take top n (3 was used)
    combined_list.sort(key=lambda x: x['combined_score'])
    return combined_list[:top_n]


Df of base recipes, also include tags

In [43]:
bases_df = pd.DataFrame()
for i in range(len(similar.index)):
    index = i
    base_recipe = find_base_recipe(similar.Subs.iloc[index], interactions.recipe_id.value_counts())
    bases_df = pd.concat([bases_df, base_recipe], ignore_index=True)

In [45]:
bases_df['tags'] = bases_df['tags'].apply(lambda x: string_to_list(x) if isinstance(x, str) else x)

In [65]:
#Compute tag count for all base recipes.
base_recipe_tags = {index: row.tags for index, row in bases_df.iterrows()}
all_tags = list(set([item for sublist in df.tags for item in sublist]))
tag_counts = {tag: 0 for tag in all_tags}
selected_base_recipes_indices = set()

#Take max 250 unique recipes, with 1-3 replacements for each recipe this should be reasonable
for _ in range(250):
    if not base_recipe_tags:
        break

    #Take tags with lowest count
    min_count = min(tag_counts.values())
    lowest_tags = [tag for tag, count in tag_counts.items() if count == min_count]


    selected_index = None

    #Select a new recipe which has one of the tags which has appeared the least, and which has not been chosen before
    for index, tags in base_recipe_tags.items():
        if index in selected_base_recipes_indices:
            continue
        if any(tag in tags for tag in lowest_tags):
            selected_index = index
            break

    #If none is found break
    if selected_index is None:
        break

    #Update values
    for tag in base_recipe_tags[selected_index]:
        tag_counts[tag] += 1
    selected_base_recipes_indices.add(selected_index)

    #Remove base recipe from list
    del base_recipe_tags[selected_index]

print(list(selected_base_recipes_indices))

[0, 1, 2, 3, 4612, 6, 7, 2056, 5129, 1034, 11, 13, 15, 16, 17, 529, 1039, 20, 21, 2067, 25, 27, 29, 30, 31, 1569, 34, 36, 38, 2086, 40, 41, 3622, 43, 556, 45, 46, 47, 48, 49, 50, 560, 52, 565, 3639, 58, 1083, 60, 5181, 4670, 66, 67, 69, 1608, 1609, 81, 82, 595, 2132, 6226, 87, 3672, 601, 3673, 91, 92, 603, 604, 3674, 609, 2659, 1125, 1638, 3685, 616, 4717, 1647, 2671, 113, 1137, 115, 116, 628, 5743, 7793, 121, 122, 634, 1659, 4735, 1152, 2688, 3202, 133, 1158, 142, 655, 144, 145, 660, 149, 150, 151, 665, 667, 2207, 167, 174, 175, 176, 179, 3251, 181, 695, 2743, 1724, 703, 4291, 196, 3268, 6858, 715, 3790, 6864, 722, 1235, 1752, 2264, 2268, 221, 1758, 6877, 224, 738, 3299, 3301, 1768, 233, 234, 746, 239, 4335, 241, 242, 243, 244, 1266, 246, 2295, 2296, 762, 251, 252, 253, 1275, 768, 261, 774, 1286, 779, 268, 780, 1806, 2319, 6415, 273, 276, 1303, 4892, 285, 286, 288, 289, 3876, 293, 2342, 2859, 300, 7468, 304, 305, 1332, 3892, 1338, 2885, 7493, 840, 842, 3402, 335, 4951, 349, 863, 865, 

All 250 were filled out

In [66]:
len(list(selected_base_recipes_indices))

250

In [6]:
selected_base_recipes_indices = [0, 1, 2, 3, 4612, 6, 7, 2056, 5129, 1034, 11, 13, 15, 16, 17, 529, 1039, 20, 21, 2067, 25, 27, 29, 30, 31, 1569, 34, 36, 38, 2086, 40, 41, 3622, 43, 556, 45, 46, 47, 48, 49, 50, 560, 52, 565, 3639, 58, 1083, 60, 5181, 4670, 66, 67, 69, 1608, 1609, 81, 82, 595, 2132, 6226, 87, 3672, 601, 3673, 91, 92, 603, 604, 3674, 609, 2659, 1125, 1638, 3685, 616, 4717, 1647, 2671, 113, 1137, 115, 116, 628, 5743, 7793, 121, 122, 634, 1659, 4735, 1152, 2688, 3202, 133, 1158, 142, 655, 144, 145, 660, 149, 150, 151, 665, 667, 2207, 167, 174, 175, 176, 179, 3251, 181, 695, 2743, 1724, 703, 4291, 196, 3268, 6858, 715, 3790, 6864, 722, 1235, 1752, 2264, 2268, 221, 1758, 6877, 224, 738, 3299, 3301, 1768, 233, 234, 746, 239, 4335, 241, 242, 243, 244, 1266, 246, 2295, 2296, 762, 251, 252, 253, 1275, 768, 261, 774, 1286, 779, 268, 780, 1806, 2319, 6415, 273, 276, 1303, 4892, 285, 286, 288, 289, 3876, 293, 2342, 2859, 300, 7468, 304, 305, 1332, 3892, 1338, 2885, 7493, 840, 842, 3402, 335, 4951, 349, 863, 865, 2919, 361, 876, 1400, 6520, 892, 1409, 386, 387, 388, 3461, 3463, 5009, 914, 1430, 1942, 6038, 6550, 411, 927, 419, 2468, 422, 2470, 2985, 938, 6069, 6581, 2487, 440, 954, 445, 446, 5061, 454, 971, 4556, 2510, 463, 467, 7126, 471, 1495, 473, 1497, 475, 987, 7639, 478, 3042, 1000, 490, 491, 7673, 507, 1535]

Format all data into one df which can be exported to csv and into google sheets

In [18]:
survey_df = pd.DataFrame(columns=['title', 'id', 'rep_id', 'ingredients', 'instructions', 'ingredient_to_replace', 'replacement'])
for i in selected_base_recipes_indices:
    index = i
    base_recipe = find_base_recipe(similar.Subs.iloc[index], interactions.recipe_id.value_counts())

    base_instructions = find_base_instructions(similar.Subs.iloc[index], interactions.recipe_id.value_counts())

    base_ingredients = similar.Key.iloc[index]

    ingredient_to_substitute = find_ingredient_to_substitute(base_recipe.id.iloc[0], similar.Subs.iloc[index])
    original_ingredient_to_substitute = get_original_ingredient_from_ingredient(base_recipe, ingredient_to_substitute)

    ranking = find_ingredient_substitute(ingredient_to_substitute, similar.Subs.iloc[index], base_recipe.id.iloc[0])

    semantic_ranking = get_semantic_ranking(base_recipe.id.iloc[0], similar.Subs.iloc[index], df, model, ingredient_to_substitute)

    best_dict = combined_best_ranking(ranking, semantic_ranking)
    for i in range(len(best_dict)):
        alt_recipe = df[df.id == best_dict[i]['id']]
        replacement = best_dict[i]['ingredient']
        original_replacement = get_original_ingredient_from_ingredient(alt_recipe, replacement)
        new_entry = {"title": [base_recipe.name.iloc[0]], 'id': [base_recipe.id.iloc[0]], 'rep_id': [alt_recipe.id.iloc[0]], 'ingredients': str(base_recipe.ingredients_original.iloc[0]), 'instructions': [base_instructions], 'ingredient_to_replace': [original_ingredient_to_substitute], 'replacement': [original_replacement]}
        survey_df = pd.concat([survey_df, pd.DataFrame(new_entry)], ignore_index=True)

In [19]:
survey_df.title.value_counts()

quick and easy pizza dough                     9
very vanilla cupcakes                          7
unknownchef86 s very best dinner rolls         6
buttery bread machine rolls                    6
gwen s butter rich dinner rolls                6
                                              ..
light egg whites muffins                       1
garlic    feta lovers rotini pasta for one     1
the best mocha buttercream frosting   icing    1
mama s lemon bars                              1
lu s rum or bourbon balls                      1
Name: title, Length: 227, dtype: int64

Everything after this is saving all relevant dfs etc.

In [None]:
bases_df = pd.to_csv(r'C:\Users\01din\PycharmProjects\thesis\data\cleaned_recipes\bases.csv', converters=conv)
bases_df.drop(['Unnamed: 0'], inplace=True, axis=1)

In [52]:
diverse_recipes_df

In [135]:
bases_df

Unnamed: 0,id,name,n_ingredients,ingredients_original,n_steps,steps,tags,minutes,description,nutrition,ingredients,contributor_id,submitted,similar,combined,tokenized
0,5170,pete s scratch pancakes,7,"[flour, sugar, salt, baking powder, eggs, butt...",4,"[mix the dry items first, combine the eggs and...","[30-minutes-or-less, time-to-make, course, pre...",20,it was the fall of 1987 when pete nyhus walked...,"[209.6, 12.0, 17.0, 13.0, 11.0, 23.0, 9.0]","[Wheat flour, whole-grain, Sugars, granulated,...",1634,11/12/1999,"[35653, 242, 296051, 293049, 203134, 289891, 2...",<name> pete s scratch pancakes <ingredients> f...,"[name, pete, scratch, pancakes, ingredients, f..."
1,13546,beth s pizza crust,6,"[yeast, water, flour, olive oil, sugar, salt]",9,"[combine yeast and warm water, stir until diss...","[30-minutes-or-less, time-to-make, course, pre...",30,this is an excellent crust that requires no ti...,"[1256.3, 19.0, 20.0, 97.0, 70.0, 9.0, 81.0]","[Yeast extract spread, Beverages, water, tap, ...",19832,31/10/2001,"[390932, 436030, 136796, 286865, 67072, 227248...",<name> beth s pizza crust <ingredients> yeast ...,"[name, beth, pizza, crust, ingredients, yeast,..."
2,11763,easy lemon pound cake,8,"[butter, sugar, eggs, lemon juice, salt, flour...",9,"[mix together 1 cup sugar and butter, add eggs...","[weeknight, time-to-make, course, preparation,...",70,i have used this cake for so many different th...,"[338.6, 20.0, 126.0, 10.0, 9.0, 40.0, 16.0]","[Butter, without salt, Sugars, granulated, Egg...",17721,18/09/2001,"[35653, 242, 296051, 293049, 56189, 19509, 154...",<name> easy lemon pound cake <ingredients> but...,"[name, easy, lemon, pound, cake, ingredients, ..."
3,139989,ruhrei mennonite scrambled eggs,5,"[flour, eggs, salt, milk, butter]",7,"[mix the flour and milk into a smooth paste , ...","[15-minutes-or-less, time-to-make, course, mai...",6,this is a different and delicious way to make ...,"[249.2, 25.0, 1.0, 24.0, 29.0, 37.0, 3.0]","[Wheat flour, whole-grain, Egg, whole, raw, fr...",149363,04/10/2005,"[35653, 20238, 378058, 19104, 472363, 187678, ...",<name> ruhrei mennonite scrambled eggs <in...,"[name, ruhrei, mennonite, scrambled, eggs, ing..."
4,5170,pete s scratch pancakes,7,"[flour, sugar, salt, baking powder, eggs, butt...",4,"[mix the dry items first, combine the eggs and...","[30-minutes-or-less, time-to-make, course, pre...",20,it was the fall of 1987 when pete nyhus walked...,"[209.6, 12.0, 17.0, 13.0, 11.0, 23.0, 9.0]","[Wheat flour, whole-grain, Sugars, granulated,...",1634,11/12/1999,"[35653, 242, 296051, 293049, 203134, 289891, 2...",<name> pete s scratch pancakes <ingredients> f...,"[name, pete, scratch, pancakes, ingredients, f..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7573,147243,creamed mashed potatoes with spinach,6,"[boiling potatoes, heavy cream, unsalted butte...",8,[cover potatoes with salted cold water by 1 in...,"[60-minutes-or-less, time-to-make, course, mai...",50,"boy, do we loves potatoes and spinach. when i ...","[383.7, 30.0, 8.0, 26.0, 11.0, 62.0, 15.0]","[Potatoes, raw, skin, Cream, fluid, heavy whip...",102058,05/12/2005,"[514824, 266718, 499776]",<name> creamed mashed potatoes with spinach <i...,"[name, creamed, mashed, potatoes, with, spinac..."
7574,147243,creamed mashed potatoes with spinach,6,"[boiling potatoes, heavy cream, unsalted butte...",8,[cover potatoes with salted cold water by 1 in...,"[60-minutes-or-less, time-to-make, course, mai...",50,"boy, do we loves potatoes and spinach. when i ...","[383.7, 30.0, 8.0, 26.0, 11.0, 62.0, 15.0]","[Potatoes, raw, skin, Cream, fluid, heavy whip...",102058,05/12/2005,"[514824, 266718, 499776]",<name> creamed mashed potatoes with spinach <i...,"[name, creamed, mashed, potatoes, with, spinac..."
7575,60160,southern scalloped potatoes,6,"[idaho potatoes, onions, butter, flour, milk, ...",9,[boil the potatoes until they're halfway done ...,"[60-minutes-or-less, time-to-make, main-ingred...",50,this delicious casserole hasn't been \r\ncompr...,"[226.4, 10.0, 12.0, 3.0, 11.0, 20.0, 12.0]","[Potatoes, raw, skin, Onions, raw, Butter, wit...",61010,18/04/2003,"[275449, 199870, 45646, 133812, 21752, 392733,...",<name> southern scalloped potatoes <ingredient...,"[name, southern, scalloped, potatoes, ingredie..."
7576,320957,mandarin mocha,5,"[coffee, chocolate syrup, orange extract, milk...",6,"[pour coffee into a cup, add chocolate syrup, ...","[15-minutes-or-less, time-to-make, course, pre...",5,"chocolate, coffee, and orange flavors. what c...","[187.1, 9.0, 50.0, 7.0, 9.0, 16.0, 9.0]","[Beverages, coffee, brewed, breakfast blend, B...",327115,24/08/2008,"[139517, 320954, 381696, 279911]",<name> mandarin mocha <ingredients> coffee cho...,"[name, mandarin, mocha, ingredients, coffee, c..."


In [39]:
flattened_list = [item for sublist in df.tags for item in sublist]
len(set(flattened_list))

523

In [20]:
survey_df = survey_df.drop_duplicates(subset = ['id', 'ingredient_to_replace', 'replacement'])

In [22]:
survey_df

Unnamed: 0,title,id,rep_id,ingredients,instructions,ingredient_to_replace,replacement
0,pete s scratch pancakes,5170,37119,"['flour', 'sugar', 'salt', 'baking powder', 'e...","[mix the dry items first, combine the eggs and...",butter,margarine
1,pete s scratch pancakes,5170,212194,"['flour', 'sugar', 'salt', 'baking powder', 'e...","[mix the dry items first, combine the eggs and...",butter,cooking oil
2,pete s scratch pancakes,5170,203678,"['flour', 'sugar', 'salt', 'baking powder', 'e...","[mix the dry items first, combine the eggs and...",butter,oil
3,beth s pizza crust,13546,144943,"['yeast', 'water', 'flour', 'olive oil', 'suga...","[combine yeast and warm water, stir until diss...",olive oil,shortening
4,beth s pizza crust,13546,377904,"['yeast', 'water', 'flour', 'olive oil', 'suga...","[combine yeast and warm water, stir until diss...",olive oil,cooking oil
...,...,...,...,...,...,...,...
540,ooey gooey butter cake,56916,126977,"['yellow cake mix', 'margarine', 'egg', 'pecan...","[mix first four ingredients, mixture will be t...",pecans,brown sugar
541,rochelle s chocolate chip cookies betty crocker,62232,118818,"['shortening', 'butter', 'sugar', 'brown sugar...","[preheat oven to 350 degrees f, mix shortening...",butter,margarine
542,rochelle s chocolate chip cookies betty crocker,62232,390636,"['shortening', 'butter', 'sugar', 'brown sugar...","[preheat oven to 350 degrees f, mix shortening...",butter,nuts
543,rochelle s chocolate chip cookies betty crocker,62232,79666,"['shortening', 'butter', 'sugar', 'brown sugar...","[preheat oven to 350 degrees f, mix shortening...",butter,oatmeal


In [27]:
survey_df = pd.read_csv(r'C:\Users\01din\PycharmProjects\thesis\data\survey\survey_dfs.csv')

In [29]:
survey_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,title,id,rep_id,ingredients,instructions,ingredient_to_replace,replacement
0,0,0,pete s scratch pancakes,5170,37119,"frozenset({'Salt, table', 'Egg, whole, raw, fr...","['mix the dry items first', 'combine the eggs ...",butter,margarine
1,1,1,pete s scratch pancakes,5170,212194,"frozenset({'Salt, table', 'Egg, whole, raw, fr...","['mix the dry items first', 'combine the eggs ...",butter,cooking oil
2,2,2,pete s scratch pancakes,5170,203678,"frozenset({'Salt, table', 'Egg, whole, raw, fr...","['mix the dry items first', 'combine the eggs ...",butter,oil
3,3,3,beth s pizza crust,13546,144943,"frozenset({'Yeast extract spread', 'Beverages,...","['combine yeast and warm water', 'stir until d...",olive oil,shortening
4,4,4,beth s pizza crust,13546,377904,"frozenset({'Yeast extract spread', 'Beverages,...","['combine yeast and warm water', 'stir until d...",olive oil,cooking oil
...,...,...,...,...,...,...,...,...,...
515,583,583,kittencal s creamy vanilla milkshake,376850,240314,"frozenset({'Sugars, granulated', 'Cream, fluid...",['in a blender combine all ingredients startin...,vanilla ice cream,egg yolks
516,584,584,kittencal s creamy vanilla milkshake,376850,458507,"frozenset({'Sugars, granulated', 'Cream, fluid...",['in a blender combine all ingredients startin...,vanilla ice cream,eggs
517,586,586,terrific tuscan vegetable soup ellie krieger,475171,475108,"frozenset({'Spices, pepper, black', 'Tomatoes,...",['in a small bowl mash half of the beans with ...,low sodium chicken broth,chicken
518,587,587,bauernfruhstuck,422978,123467,"frozenset({'Spices, pepper, black', 'Pork, cur...","['fry the bacon until crisp', 'remove and drai...",potatoes,cheddar cheese


In [3]:
new_survey_df = survey_df.drop_duplicates(subset=['ingredient_to_replace', 'replacement', 'title'])

In [31]:
updated_df = survey_df.merge(df[['id', 'ingredients_original']], on='id', how='left')

# Overwrite the 'ingredients' column in 'survey_df' with the 'ingredients_original' column from 'updated_df'
survey_df['ingredients'] = updated_df['ingredients_original']

In [5]:
new_survey_df.to_csv(r'C:\Users\01din\PycharmProjects\thesis\data\survey\survey_dfs.csv')

In [33]:
survey_df.to_csv(r'C:\Users\01din\PycharmProjects\thesis\data\survey\survey_dfs.csv')