In [1]:
import pandas as pd
from ast import literal_eval

# Columns that the CSV stores as text and that are turned back into Python
# objects with ast.literal_eval while the file is being read.
def generic(x):
    """Parse one CSV cell with ast.literal_eval."""
    return literal_eval(x)


conv = {
    column: generic
    for column in (
        'nutrition',
        'steps',
        'ingredients_original',
        'id_column',
        'ingredients',
        'similar',
    )
}

# Recipes table (parsed with the converters above).
df = pd.read_csv(
    r"C:\Users\01din\PycharmProjects\thesis\data\cleaned_recipes\recipes_with_JS.csv",
    converters=conv,
)
# Ingredient lookup table, read as-is.
ingredients = pd.read_csv(
    r"C:\Users\01din\PycharmProjects\thesis\data\ingredients\ingredients_nutrition.csv"
)

The most interesting recipes are those that differ in exactly one ingredient. Let's filter for those.

In [42]:
from tqdm import tqdm

#Function to compare ingredient sets and check if they differ by only one ingredient
def differ_by_one(ingredients1, ingredients2):
    """Return True when two ingredient sets differ by exactly one substitution.

    Parameters
    ----------
    ingredients1, ingredients2 : set
        Ingredient sets to compare (they must support the ``^`` operator).

    Returns
    -------
    bool
        True only if both sets have the same size and each contains exactly
        one ingredient the other lacks.
    """
    # A symmetric difference of size 2 alone is not enough: it is also reached
    # when one set is a subset of the other with two extra ingredients
    # (e.g. {a, b, c} vs {a}). Requiring equal sizes rules that case out.
    return (
        len(ingredients1) == len(ingredients2)
        and len(ingredients1 ^ ingredients2) == 2
    )

#Index every recipe row by its id so a partner recipe can be fetched directly
row_dict = {}
for _, recipe in df.iterrows():
    row_dict[recipe['id']] = recipe

#Replace each stored ingredient list with a set so set operators can be used
for recipe in row_dict.values():
    recipe['ingredients'] = set(recipe['ingredients'])

#Sorted [id, id] pairs of recipes accepted by differ_by_one
result = []

#Unordered id pairs that were already compared, so each pair is tested once
checked_pairs = set()

#Compare every recipe with the recipes listed in its 'similar' column
for recipe in tqdm(row_dict.values(), total=len(df), desc="Processing rows"):
    for other_id in recipe['similar']:
        pair_key = frozenset([recipe['id'], other_id])
        if pair_key in checked_pairs:
            continue
        partner = row_dict[other_id]
        if differ_by_one(recipe['ingredients'], partner['ingredients']):
            result.append(sorted([recipe['id'], partner['id']]))
        checked_pairs.add(pair_key)


100%|██████████| 144188/144188 [02:44<00:00, 875.88it/s] 


In [1]:
import csv

path = r'C:\Users\01din\PycharmProjects\thesis\data\similar\ids_one_difference.csv'

#Write every id pair in `result` as one CSV row
with open(path, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(result)

NameError: name 'result' is not defined

We can also build sets of recipes that all differ in the same ingredient slot.
In other words, this finds all recipes that share the same ingredients A, B and C and each have a different ingredient in place of D.

In [2]:
import pandas as pd
from tqdm import tqdm

#Map each recipe id to its ingredient set once, so the loop below does a
#dictionary lookup instead of scanning the whole DataFrame for every similar id
ingredient_sets_by_id = {}
for known_id, ingredient_list in zip(df['id'], df['ingredients']):
    #Keep the first occurrence of an id, matching the previous `.values[0]` lookup
    if known_id not in ingredient_sets_by_id:
        ingredient_sets_by_id[known_id] = set(ingredient_list)

#Dictionary to store groups of similar recipes:
#frozenset of shared ingredients -> set of (differing ingredient, recipe id) tuples
recipe_groups = {}

#Set to keep track of processed recipes
processed_recipes = set()

for _, row in tqdm(df.iterrows(), total=df.shape[0]):

    recipe_id = row['id']
    ingredient_set = set(row['ingredients'])

    #Check for similar recipes based on the 'similar' column
    for similar_id in row['similar']:
        #Skip the iteration if the similar recipe has already been processed
        if similar_id in processed_recipes:
            continue

        #Raises KeyError if a 'similar' id does not exist in df
        similar_ingredient_set = ingredient_sets_by_id[similar_id]
        if len(ingredient_set) != len(similar_ingredient_set):
            continue

        #Equal sizes plus a symmetric difference of 2 means each recipe has
        #exactly one ingredient the other one lacks
        difference = ingredient_set ^ similar_ingredient_set
        if len(difference) != 2:
            continue

        shared_key = frozenset(ingredient_set & similar_ingredient_set)
        #Single-element unpacking: the one ingredient unique to each side
        (own_ingredient,) = difference & ingredient_set
        (other_ingredient,) = difference & similar_ingredient_set

        group = recipe_groups.setdefault(shared_key, set())
        group.add((own_ingredient, recipe_id))
        group.add((other_ingredient, similar_id))

    #Add the current recipe_id to the processed_recipes set
    processed_recipes.add(recipe_id)


100%|██████████| 144188/144188 [01:29<00:00, 1619.88it/s] 


This dataframe holds, in `Key`, the ingredients that all recipes of a group share, and in `Tuples` a list of tuples in which `[0]` is the ingredient that differs and `[1]` is the id of the recipe that ingredient comes from.

In [9]:
#One [shared-ingredient key, tuple set] record per recipe group
rows = [[shared, members] for shared, members in recipe_groups.items()]

df_group = pd.DataFrame(rows, columns=["Key", "Tuples"])
#Convert each group's set of tuples into a list
df_group["Tuples"] = df_group["Tuples"].apply(list)

In [19]:
df_group

Unnamed: 0,Key,Tuples
0,"(Salt, table, Spices, pepper, black, Potatoes,...","[(Milk, reduced fat, fluid, 2% milkfat, with a..."
1,"(Spices, pepper, white, Lemon juice, raw, Salt...","[(Beans, snap, green, raw, 454753), (Parsley, ..."
2,"(Salt, table, Lemon juice, raw, Egg, yolk, raw...","[(Spices, pepper, black, 156420), (Spices, pep..."
3,"(Salt, table, Wheat flour, whole-grain, Sugars...","[(Vegetable oil, palm kernel, 238576), (Oil, c..."
4,"(Wheat flour, whole-grain, Sugars, granulated,...","[(Spices, caraway seed, 19509), (Vanilla extra..."
...,...,...
12226,"(Nuts, walnuts, english, Dates, medjool, Spice...","[(Yogurt, vanilla, non-fat, 310162), (Yogurt, ..."
12227,"(Spices, oregano, dried, Basil, fresh, Onions,...","[(Margarine-like, butter-margarine blend, 80% ..."
12228,"(Sugars, granulated, Oil, corn, peanut, and ol...","[(Basil, fresh, 378410), (Spices, poultry seas..."
12229,"(Oil, coconut, Sugars, granulated, Vanilla ext...","[(Puddings, rice, dry mix, 95938), (Raisins, g..."


Remove differences that are herbs or spices, and drop groups left with fewer than two alternatives.

In [21]:
def is_in_herbs_and_spices(ingredient):
    """Tell whether `ingredient` is listed under the "Spices and Herbs" food group.

    Looks up the rows of the global `ingredients` table whose 'Long_Desc'
    equals `ingredient` and inspects the 'FdGrp_Desc' of the first match.
    Returns False when there is no matching row.
    """
    food_groups = ingredients.loc[ingredients['Long_Desc'] == ingredient, 'FdGrp_Desc']
    if food_groups.empty:
        return False
    return food_groups.iloc[0] == "Spices and Herbs"
#Drop every (ingredient, recipe id) tuple whose ingredient is a herb or spice
df_group['Tuples'] = df_group['Tuples'].apply(
    lambda tuples: [t for t in tuples if not is_in_herbs_and_spices(t[0])]
)
#Keep only groups that still offer more than one alternative. The explicit
#.copy() makes df_group an independent frame, so assigning to its columns later
#(e.g. when this cell is re-run) does not raise SettingWithCopyWarning.
df_group = df_group[df_group['Tuples'].apply(len) > 1].copy()

In [22]:
df_group

Unnamed: 0,Key,Tuples
0,"(Salt, table, Spices, pepper, black, Potatoes,...","[(Milk, reduced fat, fluid, 2% milkfat, with a..."
1,"(Spices, pepper, white, Lemon juice, raw, Salt...","[(Beans, snap, green, raw, 454753), (Parsley, ..."
3,"(Salt, table, Wheat flour, whole-grain, Sugars...","[(Vegetable oil, palm kernel, 238576), (Oil, c..."
4,"(Wheat flour, whole-grain, Sugars, granulated,...","[(Bread, cinnamon, 285997), (Strawberries, raw..."
5,"(Salt, table, Wheat flour, whole-grain, Sugars...","[(Milk, whole, 3.25% milkfat, with added vitam..."
...,...,...
12225,"(Beverages, water, tap, municipal, Wheat flour...","[(Butter, without salt, 503263), (Oil, coconut..."
12226,"(Nuts, walnuts, english, Dates, medjool, Spice...","[(Yogurt, vanilla, non-fat, 310162), (Yogurt, ..."
12227,"(Spices, oregano, dried, Basil, fresh, Onions,...","[(Margarine-like, butter-margarine blend, 80% ..."
12229,"(Oil, coconut, Sugars, granulated, Vanilla ext...","[(Puddings, rice, dry mix, 95938), (Raisins, g..."


The result is pickled here and converted into a more readable format in the `explore_sets` notebook.

In [23]:
#NOTE: the file is written in pickle format even though its name ends in .csv
pickle_path = r'C:\Users\01din\PycharmProjects\thesis\data\similar\similar_sets_pickle.csv'
df_group.to_pickle(pickle_path)

In [41]:
#Plain-text export of the same frame, without the index column
csv_path = r'C:\Users\01din\PycharmProjects\thesis\data\similar/similar_sets.csv'
df_group.to_csv(csv_path, index=False)