## Clean recipe data

In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from IPython.display import display, Image, HTML
import re
from fuzzywuzzy import fuzz, process
from collections import defaultdict
from fractions import Fraction

user_recipe_df = pd.read_csv("../raw/recipes.csv")

#displays all columns of df instead of truncating
pd.set_option('display.max_columns', None)

print("First few rows: \n")
display(user_recipe_df.head())

## remove missing values and duplicates 
#### makes shape of recipe df go from 1048543 to 44 (prob cus not all recipes have associated images and yada yada), going to omit removing nulls for now ####
user_recipe_df = user_recipe_df.drop_duplicates()

# print(user_recipe_df.shape)

## check for and remove anomolies only if using user_food_df
def anomaly_detection(df, columns):
    for column in columns:
        row = df[column]
        Q1 = row.quantile(0.25)
        Q3 = row.quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        anomalies = df[(row < lower_bound) | (row > upper_bound)]
        #print(f'\n\nNumber of anomalies in {column}: ', len(anomalies))

        df_anomaly_free = df[(row >= lower_bound) & (row <= upper_bound)]
        #print(f'\n\nNumber of anomaly-free in {column}: ', len(df), '\n\n')
    
    return df_anomaly_free

columns_to_check = ['RecipeId', 'Calories', 'FatContent', 'SaturatedFatContent', 'CholesterolContent', 
                    'SodiumContent', 'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent', 
                    'RecipeServings']


clean_recipe_df = anomaly_detection(user_recipe_df, columns_to_check)
#gets rid of setting with copy warning
clean_recipe_df = clean_recipe_df.copy()


  user_recipe_df = pd.read_csv("../raw/recipes.csv")


First few rows: 



Unnamed: 0,RecipeId,Barcode,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,RecipeCategory,Keywords,RecipeIngredientQuantities,RecipeIngredientParts,AggregatedRating,ReviewCount,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
0,38.0,*38*,Low-Fat Berry Blue Frozen Dessert,1533.0,Dancer,PT24H,PT45M,PT24H45M,1999-08-09T21:46:00Z,Make and share this Low-Fat Berry Blue Frozen ...,"c(""https://img.sndimg.com/food/image/upload/w_...",Frozen Desserts,"c(""Dessert"", ""Low Protein"", ""Low Cholesterol"",...","c(""4"", ""1/4"", ""1"", ""1"")","c(""blueberries"", ""granulated sugar"", ""vanilla ...",4.5,4.0,170.9,2.5,1.3,8.0,29.8,37.1,3.6,30.2,3.2,4.0,,"c(""Toss 2 cups berries with sugar."", ""Let stan..."
1,39.0,*39*,Biryani,1567.0,elly9812,PT25M,PT4H,PT4H25M,1999-08-29T13:12:00Z,Make and share this Biryani recipe from Food.com.,"c(""https://img.sndimg.com/food/image/upload/w_...",Chicken Breast,"c(""Chicken Thigh & Leg"", ""Chicken"", ""Poultry"",...","c(""1"", ""4"", ""2"", ""2"", ""8"", ""1/4"", ""8"", ""1/2"", ...","c(""saffron"", ""milk"", ""hot green chili peppers""...",3.0,1.0,1110.7,58.8,16.6,372.8,368.4,84.4,9.0,20.4,63.4,6.0,,"c(""Soak saffron in warm milk for 5 minutes and..."
2,40.0,*40*,Best Lemonade,1566.0,Stephen Little,PT5M,PT30M,PT35M,1999-09-05T19:52:00Z,This is from one of my first Good House Keepi...,"c(""https://img.sndimg.com/food/image/upload/w_...",Beverages,"c(""Low Protein"", ""Low Cholesterol"", ""Healthy"",...","c(""1 1/2"", ""1"", NA, ""1 1/2"", NA, ""3/4"")","c(""sugar"", ""lemons, rind of"", ""lemon, zest of""...",4.5,10.0,311.1,0.2,0.0,0.0,1.8,81.5,0.4,77.2,0.3,4.0,,"c(""Into a 1 quart Jar with tight fitting lid, ..."
3,41.0,*41*,Carina's Tofu-Vegetable Kebabs,1586.0,Cyclopz,PT20M,PT24H,PT24H20M,1999-09-03T14:54:00Z,This dish is best prepared a day in advance to...,"c(""https://img.sndimg.com/food/image/upload/w_...",Soy/Tofu,"c(""Beans"", ""Vegetable"", ""Low Cholesterol"", ""We...","c(""12"", ""1"", ""2"", ""1"", ""10"", ""1"", ""3"", ""2"", ""2...","c(""extra firm tofu"", ""eggplant"", ""zucchini"", ""...",4.5,2.0,536.1,24.0,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,2.0,4 kebabs,"c(""Drain the tofu, carefully squeezing out exc..."
4,42.0,*42*,Cabbage Soup,1538.0,Duckie067,PT30M,PT20M,PT50M,1999-09-19T06:19:00Z,Make and share this Cabbage Soup recipe from F...,"""https://img.sndimg.com/food/image/upload/w_55...",Vegetable,"c(""Low Protein"", ""Vegan"", ""Low Cholesterol"", ""...","c(""46"", ""4"", ""1"", ""2"", ""1"")","c(""plain tomato juice"", ""cabbage"", ""onion"", ""c...",4.5,11.0,103.6,0.4,0.1,0.0,959.3,25.1,4.8,17.7,4.3,4.0,,"c(""Mix everything together and bring to a boil..."


## Convert data types of columns used into lists. Import model and convert predicted output into list.

In [None]:

#convert RecipeIngredientParts to lists using regex
def parse_ingredients(val):
    if isinstance(val, str):
        #find all quoted strings within c() notation
        matches = re.findall(r'"(.*?)"', val)
        return matches
    return val

#convert ingredient to list
clean_recipe_df.loc[:, 'RecipeIngredientParts'] = clean_recipe_df['RecipeIngredientParts'].apply(parse_ingredients)
#continue to convert other c() vectors to lists
clean_recipe_df.loc[:, 'RecipeIngredientQuantities'] = clean_recipe_df['RecipeIngredientQuantities'].apply(parse_ingredients)
clean_recipe_df.loc[:, 'RecipeInstructions'] = clean_recipe_df['RecipeInstructions'].apply(parse_ingredients)
clean_recipe_df.loc[:, 'Images'] = clean_recipe_df['Images'].apply(parse_ingredients)


#####   Replace below with classified ingredients output from the model when a prediction is made on example images of ingredients we import from the web #####

# Hypothetical classified ingredients
## TODO: to be replaced with actual classified ingredients
classified_ingredients = ['extra firm tofu', 'eggplant', 'zucchini', 'mushrooms', 'soy sauce', 
                          'low sodium soy sauce', 'olive oil', 'maple syrup', 'honey', 
                          'red wine vinegar', 'lemon juice', 'garlic cloves', 'mustard powder', 
                          'black pepper']

# Synonym dictionary
##TODO: to be replaced with category/subcategroy/ingredient relations
##ALSO: consider keyword column from recipe dataset 
### ALSO: could be completely unecessary and removed from matching score calculation
synonym_dict = {
    'soy sauce': 'low sodium soy sauce',
    'low sodium soy sauce': 'soy sauce',
    'olive oil': 'extra virgin olive oil',
    'extra virgin olive oil': 'olive oil',
    'maple syrup': 'honey',
    'honey': 'maple syrup',
    'red wine vinegar': 'balsamic vinegar',
    'balsamic vinegar': 'red wine vinegar',
    'lemon juice': 'lime juice',
    'lime juice': 'lemon juice',
    'garlic cloves': 'garlic',
    'garlic': 'garlic cloves',
    'mustard powder': 'dry mustard',
    'dry mustard': 'mustard powder',
    'black pepper': 'pepper',
    'pepper': 'black pepper'
}

#display(user_recipe_df[['Name', 'RecipeIngredientParts', 'RecipeIngredientQuantities', 'RecipeInstructions', 'Images']].head())

## Function to match model ouptut with recipe data (string matching, regex)

In [None]:

# Matching ingredients, output match score and matched ingredients
def match_ingredients(recipe_ingredients, classified_ingredients, synonym_dict):
    match_score = 0
    matched_ingredients = set()
    
    # Exact string matching
    for ingredient in classified_ingredients:
        if ingredient in recipe_ingredients:
            match_score += 1
            matched_ingredients.add(ingredient)
    
    # Fuzzy matching
    for ingredient in classified_ingredients:
        matches = process.extract(ingredient, recipe_ingredients, limit=3)
        for match in matches:
            if match[1] > 80:  # Threshold 
                match_score += 1
                matched_ingredients.add(match[0])
    
    # Synonym handling
    # for ingredient in classified_ingredients:
    #     if ingredient in synonym_dict:
    #         synonym = synonym_dict[ingredient]
    #         if synonym in recipe_ingredients:
    #             match_score += 1
    #             matched_ingredients.add(synonym)

    # Set-based matching
    classified_set = set(classified_ingredients)
    recipe_set = set(recipe_ingredients)
    match_score += len(classified_set & recipe_set)
    
    return match_score, matched_ingredients

# Apply matching df
clean_recipe_df.loc[:, 'MatchScore'] = clean_recipe_df['RecipeIngredientParts'].apply(lambda x: match_ingredients(x, classified_ingredients, synonym_dict)[0])
clean_recipe_df.loc[:, 'MatchedIngredients'] = clean_recipe_df['RecipeIngredientParts'].apply(lambda x: match_ingredients(x, classified_ingredients, synonym_dict)[1])


# match scores
display(clean_recipe_df[['Name', 'RecipeIngredientParts', 'MatchScore', 'MatchedIngredients']])

Unnamed: 0,Name,RecipeIngredientParts,MatchScore,MatchedIngredients
0,Low-Fat Berry Blue Frozen Dessert,"[blueberries, granulated sugar, vanilla yogurt...",3,{lemon juice}
1,Biryani,"[saffron, milk, hot green chili peppers, onion...",3,"{garlic, fresh lemon juice, clove}"
2,Best Lemonade,"[sugar, lemons, rind of, lemon, zest of, fresh...",1,{fresh lemon juice}
3,Carina's Tofu-Vegetable Kebabs,"[extra firm tofu, eggplant, zucchini, mushroom...",44,"{low sodium soy sauce, maple syrup, eggplant, ..."
4,Cabbage Soup,"[plain tomato juice, cabbage, onion, carrots, ...",1,{plain tomato juice}
...,...,...,...,...
1219,Tiramisu II,"[eggs, water, mascarpone, sugar, salt, ladyfin...",0,{}
1224,Tropical Fruit with Lime,"[granulated sugar, water, lime rind, lime juic...",0,{}
1225,Tuna Tostadas,"[fat free tortillas, tuna, fat free sour cream...",0,{}
1226,Turkish Spinach and Lentil Soup,"[lentils, nonfat beef broth, salt, olive oil, ...",3,{olive oil}


## Function to display recipes (name, image, ingredients, instructions) with high string matching scores

In [None]:

## TODO: output recipes with high matching scores, output their photo, name, ingredients, and instructions

#display recipes with high matching scores
def display_high_matching_recipes(df, score_threshold=5):
    # Filter recipes with high matching scores
    high_matching_recipes = df[df['MatchScore'] >= score_threshold]
    
    for index, row in high_matching_recipes.iterrows():
        # recipe name
        display(HTML(f"<h2>{row['Name']}</h2>"))
        
        # recipe image
        if isinstance(row['Images'], list) and len(row['Images']) > 0:
            display(Image(url=row['Images'][0], width=400))
        
        # ingredients
        ## I just added 'cup' or 'cups' but i literally have no idea if thats valid or what measurements these quantities actually represent
        ## Its definitely not just cups across the board because that would not make sense for large amounts of certain ingredients
        ## idk remove it? the numbers also dont make senese otherwise so they can be removed as well
        display(HTML("<h3>Ingredients:</h3>"))
        ingredients_with_quantities = zip(row['RecipeIngredientParts'], row['RecipeIngredientQuantities'])
        display(HTML("<ul>" + "".join([f"<li>{ingredient} , {quantity} {'cups' if Fraction(quantity) > 1 else 'cup'}</li>" for ingredient, quantity in ingredients_with_quantities]) + "</ul>"))
        
        
        # instructions
        display(HTML("<h3>Instructions:</h3>"))
        instructions = row['RecipeInstructions']  
        display(HTML("<ol>" + "".join([f"<li>{instruction.strip()}</li>" for instruction in instructions if instruction.strip()]) + "</ol>"))

# display high matching recipes
display_high_matching_recipes(clean_recipe_df, 15)