In [None]:
import ast
import os
import re
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

### Import first dataset

In [6]:
# Load the first recipes dataset (parquet).
# The folder holding the data files can be overridden with the RECIPE_FINDER_DATA_DIR
# environment variable; when it is unset the original local path is used.
DATA_DIR = Path(os.environ.get('RECIPE_FINDER_DATA_DIR', '/Users/bianp/Desktop/3A DSSA/Recipe_Finder'))
df_recipes2 = pd.read_parquet(DATA_DIR / 'recipes.parquet')
df_recipes2.head()

Unnamed: 0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,...,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
0,38.0,Low-Fat Berry Blue Frozen Dessert,1533,Dancer,PT24H,PT45M,PT24H45M,1999-08-09 21:46:00+00:00,Make and share this Low-Fat Berry Blue Frozen ...,[https://img.sndimg.com/food/image/upload/w_55...,...,1.3,8.0,29.8,37.1,3.6,30.2,3.2,4.0,,"[Toss 2 cups berries with sugar., Let stand fo..."
1,39.0,Biryani,1567,elly9812,PT25M,PT4H,PT4H25M,1999-08-29 13:12:00+00:00,Make and share this Biryani recipe from Food.com.,[https://img.sndimg.com/food/image/upload/w_55...,...,16.6,372.8,368.4,84.4,9.0,20.4,63.4,6.0,,[Soak saffron in warm milk for 5 minutes and p...
2,40.0,Best Lemonade,1566,Stephen Little,PT5M,PT30M,PT35M,1999-09-05 19:52:00+00:00,This is from one of my first Good House Keepi...,[https://img.sndimg.com/food/image/upload/w_55...,...,0.0,0.0,1.8,81.5,0.4,77.2,0.3,4.0,,"[Into a 1 quart Jar with tight fitting lid, pu..."
3,41.0,Carina's Tofu-Vegetable Kebabs,1586,Cyclopz,PT20M,PT24H,PT24H20M,1999-09-03 14:54:00+00:00,This dish is best prepared a day in advance to...,[https://img.sndimg.com/food/image/upload/w_55...,...,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,2.0,4 kebabs,"[Drain the tofu, carefully squeezing out exces..."
4,42.0,Cabbage Soup,1538,Duckie067,PT30M,PT20M,PT50M,1999-09-19 06:19:00+00:00,Make and share this Cabbage Soup recipe from F...,[https://img.sndimg.com/food/image/upload/w_55...,...,0.1,0.0,959.3,25.1,4.8,17.7,4.3,4.0,,"[Mix everything together and bring to a boil.,..."


In [7]:
# Extract relevant columns
# Columns retained for the rest of the notebook, grouped by theme; the order is
# the order they will have in the resulting frame.
selected_columns = [
    'Name', 'AuthorName',
    'CookTime', 'PrepTime', 'TotalTime',
    'Description', 'Images',
    'RecipeCategory', 'Keywords',
    'RecipeIngredientQuantities', 'RecipeIngredientParts',
    'AggregatedRating',
    'Calories', 'FatContent', 'SugarContent', 'ProteinContent',
    'RecipeServings', 'RecipeInstructions',
]
df_recipes2 = df_recipes2.loc[:, selected_columns]

In [8]:
# Drop duplicate recipes
# Two rows are duplicates when both the recipe name and the author name match;
# the first occurrence is the one that is kept.
duplicate_keys = ['Name', 'AuthorName']
df_recipes2 = df_recipes2.drop_duplicates(subset=duplicate_keys, keep='first')

In [9]:
# Summary statistics of the numeric columns (count, mean, quartiles, extremes)
df_recipes2.describe()

Unnamed: 0,AggregatedRating,Calories,FatContent,SugarContent,ProteinContent,RecipeServings
count,268902.0,520468.0,520468.0,520468.0,520468.0,338261.0
mean,4.632061,484.473628,24.616125,21.87149,17.474475,8.607093
std,0.641841,1399.174903,111.674705,142.84146,40.186924,114.545381
min,1.0,0.0,0.0,0.0,0.0,1.0
25%,4.5,174.2,5.6,2.5,3.5,4.0
50%,5.0,317.1,13.8,6.4,9.1,6.0
75%,5.0,529.1,27.4,17.9,25.1,8.0
max,5.0,612854.6,64368.1,90682.3,18396.2,32767.0


In [10]:
# Filter out outliers
# Keep recipes with 0 < Calories <= MAX_CALORIES and RecipeServings <= MAX_SERVINGS.
# A comparison with NaN is False, so rows whose RecipeServings is missing are
# dropped by this filter as well.
MAX_CALORIES = 1500
MAX_SERVINGS = 72
is_plausible = (
    df_recipes2['Calories'].gt(0)
    & df_recipes2['Calories'].le(MAX_CALORIES)
    & df_recipes2['RecipeServings'].le(MAX_SERVINGS)
)
# .copy() makes the filtered frame independent of its parent, so the column
# assignments in the following cells do not trigger SettingWithCopyWarning.
df_recipes2 = df_recipes2[is_plausible].copy()

In [11]:
# Convert columns of arrays into lists
def _as_list_or_nan(value):
    """Return list(value) for a list/ndarray holding at least one non-None item, else NaN."""
    if not isinstance(value, (list, np.ndarray)):
        return np.nan
    # An empty container or one made only of None carries no information
    if all(item is None for item in value):
        return np.nan
    return list(value)


array_cols = ['Images', 'Keywords', 'RecipeInstructions']
for array_col in array_cols:
    df_recipes2[array_col] = df_recipes2[array_col].apply(_as_list_or_nan)

In [12]:
# Keep the first link only in the column 'Images'
def _first_link(links):
    """Return the first element of a non-empty list, NaN for anything else."""
    if isinstance(links, list) and len(links) > 0:
        return links[0]
    return np.nan


df_recipes2['Images'] = df_recipes2['Images'].apply(_first_link)

In [13]:
# Re-split the instructions so that every list item is one '.'-terminated sentence
def _split_into_sentences(steps):
    """Join the steps with spaces, split on '.', and return the non-empty pieces with '.' re-appended.

    Anything that is not a list yields NaN.
    """
    if not isinstance(steps, list):
        return np.nan
    sentences = []
    for piece in ' '.join(steps).split('.'):
        piece = piece.strip()
        if piece:
            sentences.append(piece + '.')
    return sentences


df_recipes2['RecipeInstructions'] = df_recipes2['RecipeInstructions'].apply(_split_into_sentences)

In [14]:
# Number of missing values per column
df_recipes2.isna().sum()

Name                               0
AuthorName                         0
CookTime                       50197
PrepTime                           0
TotalTime                          0
Description                        3
Images                        224357
RecipeCategory                   459
Keywords                       11310
RecipeIngredientQuantities         0
RecipeIngredientParts              0
AggregatedRating              162017
Calories                           0
FatContent                         0
SugarContent                       0
ProteinContent                     0
RecipeServings                     0
RecipeInstructions                 0
dtype: int64

In [15]:
# Fill NaN values in the column CookTime
# A missing cook time becomes 'PT0M' (zero minutes), written like the other durations
cook_time = df_recipes2['CookTime']
df_recipes2['CookTime'] = cook_time.fillna(value='PT0M')

In [16]:
# Drop every row that still has a missing value in any column, then renumber rows from 0
complete_rows = df_recipes2.dropna(axis=0, how='any')
df_recipes2 = complete_rows.reset_index(drop=True)

In [20]:
# Display the cleaned first dataset (first and last rows)
df_recipes2

Unnamed: 0,Name,AuthorName,CookTime,PrepTime,TotalTime,Description,Images,RecipeCategory,Keywords,RecipeIngredientQuantities,RecipeIngredientParts,AggregatedRating,Calories,FatContent,SugarContent,ProteinContent,RecipeServings,RecipeInstructions
0,Low-Fat Berry Blue Frozen Dessert,Dancer,PT24H,PT45M,PT24H45M,Make and share this Low-Fat Berry Blue Frozen ...,https://img.sndimg.com/food/image/upload/w_555...,Frozen Desserts,"[Dessert, Low Protein, Low Cholesterol, Health...","[4, 1⁄4, 1, 1]","[blueberries, granulated sugar, vanilla yogurt...",4.5,170.9,2.5,30.2,3.2,4.0,"[Toss 2 cups berries with sugar., Let stand fo..."
1,Biryani,elly9812,PT25M,PT4H,PT4H25M,Make and share this Biryani recipe from Food.com.,https://img.sndimg.com/food/image/upload/w_555...,Chicken Breast,"[Chicken Thigh & Leg, Chicken, Poultry, Meat, ...","[1, 4, 2, 2, 8, 1⁄4, 8, 1⁄2, 1, 1, 1⁄4, 1⁄4, 1...","[saffron, milk, hot green chili peppers, onion...",3.0,1110.7,58.8,20.4,63.4,6.0,[Soak saffron in warm milk for 5 minutes and p...
2,Best Lemonade,Stephen Little,PT5M,PT30M,PT35M,This is from one of my first Good House Keepi...,https://img.sndimg.com/food/image/upload/w_555...,Beverages,"[Low Protein, Low Cholesterol, Healthy, Summer...","[1 1⁄2, 1, None, 1 1⁄2, None, 3⁄4]","[sugar, lemons, rind of, lemon, zest of, fresh...",4.5,311.1,0.2,77.2,0.3,4.0,"[Into a 1 quart Jar with tight fitting lid, pu..."
3,Carina's Tofu-Vegetable Kebabs,Cyclopz,PT20M,PT24H,PT24H20M,This dish is best prepared a day in advance to...,https://img.sndimg.com/food/image/upload/w_555...,Soy/Tofu,"[Beans, Vegetable, Low Cholesterol, Weeknight,...","[12, 1, 2, 1, 10, 1, 3, 2, 2, 2, 1, 2, 1⁄2, 1⁄...","[extra firm tofu, eggplant, zucchini, mushroom...",4.5,536.1,24.0,32.1,29.3,2.0,"[Drain the tofu, carefully squeezing out exces..."
4,Cabbage Soup,Duckie067,PT30M,PT20M,PT50M,Make and share this Cabbage Soup recipe from F...,https://img.sndimg.com/food/image/upload/w_555...,Vegetable,"[Low Protein, Vegan, Low Cholesterol, Healthy,...","[46, 4, 1, 2, 1]","[plain tomato juice, cabbage, onion, carrots, ...",4.5,103.6,0.4,17.7,4.3,4.0,"[Mix everything together and bring to a boil.,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86990,Suffering Bastard Cocktail,Chef PotPie,PT0M,PT5M,PT5M,This rum and gin based cocktail was served as ...,https://img.sndimg.com/food/image/upload/w_555...,Beverages,[< 15 Mins],"[1, 1, 1⁄2, 2, None, None]","[Bourbon, fresh lime juice, ginger ale, mint s...",5.0,160.4,0.0,0.3,0.1,1.0,"[Add the bourbon, gin, lime juice and bitters ..."
86991,Cantaloupe Margarita,Chef PotPie,PT0M,PT10M,PT10M,I've been trying all kinds of margaritas this ...,https://img.sndimg.com/food/image/upload/w_555...,Beverages,"[< 15 Mins, Easy]","[4, 1⁄2, 1⁄4, 2, 1, 2, 2]","[cantaloupe, lime juice, sugar, coarse salt]",5.0,165.1,0.6,38.2,2.8,2.0,"[In a blender or food processor, blend the can..."
86992,Honey Paloma Cocktail,Chef PotPie,PT0M,PT20M,PT20M,Added this to the list of cocktails to try thi...,https://img.sndimg.com/food/image/upload/w_555...,Beverages,[< 30 Mins],"[2, 1, 2, 2, 2, 1, 1⁄4, 1⁄4]","[grapefruit juice, fresh lime juice, honey syr...",5.0,285.4,0.1,75.5,0.6,1.0,[For the honey syrup: In a small sauce pot com...
86993,Backyard Breakfast Egg Salad,duonyte,PT0M,PT20M,PT20M,I was gifted with eggs from the backyard coop ...,https://img.sndimg.com/food/image/upload/w_555...,Breakfast,"[Brunch, < 30 Mins, Easy, Inexpensive, From Sc...","[6, 2, 2, 2, 4, 3, 1, 1⁄4, 1⁄4, 4]","[eggs, mayonnaise, Dijon mustard, red onions, ...",5.0,179.7,10.9,1.0,13.3,4.0,"[1., Chop the eggs., 2., Combine the mayonnais..."


### Import second dataset with units of measurement

In [21]:
# Load the second recipes dataset (CSV), whose ingredient strings include units.
# The folder holding the data files can be overridden with the RECIPE_FINDER_DATA_DIR
# environment variable; when it is unset the original local path is used.
DATA_DIR = Path(os.environ.get('RECIPE_FINDER_DATA_DIR', '/Users/bianp/Desktop/3A DSSA/Recipe_Finder'))
df_2M = pd.read_csv(DATA_DIR / 'recipes_data.csv')
df_2M

Unnamed: 0,title,ingredients,directions,link,source,NER,site
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""bite size shredded rice biscuits"", ""vanilla""...",www.cookbooks.com
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""cream of mushroom soup"", ""beef"", ""sour cream...",www.cookbooks.com
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""pepper"", ""cream cheese"", ""gar...",www.cookbooks.com
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken gravy"", ""cream of mushroom soup"", ""c...",www.cookbooks.com
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""graham cracker crumbs"", ""powdered sugar"", ""p...",www.cookbooks.com
...,...,...,...,...,...,...,...
2231137,Sunny's Fake Crepes,"[""1/2 cup chocolate hazelnut spread (recommend...","[""Spread hazelnut spread on 1 side of each tor...",www.foodnetwork.com/recipes/sunny-anderson/sun...,Recipes1M,"[""chocolate hazelnut spread"", ""marshmallows"", ...",www.foodnetwork.com
2231138,Devil Eggs,"[""1 dozen eggs"", ""1 paprika"", ""1 salt and pepp...","[""Boil eggs on medium for 30mins."", ""Then cool...",cookpad.com/us/recipes/355411-devil-eggs,Recipes1M,"[""choice"", ""miracle whip"", ""eggs"", ""relish"", ""...",cookpad.com
2231139,Extremely Easy and Quick - Namul Daikon Salad,"[""150 grams Daikon radish"", ""1 tbsp Sesame oil...","[""Julienne the daikon and squeeze out the exce...",cookpad.com/us/recipes/153324-extremely-easy-a...,Recipes1M,"[""soy sauce"", ""radish"", ""white sesame seeds"", ...",cookpad.com
2231140,Pan-Roasted Pork Chops With Apple Fritters,"[""1 cup apple cider"", ""6 tablespoons sugar"", ""...","[""In a large bowl, mix the apple cider with 4 ...",cooking.nytimes.com/recipes/1015164,Recipes1M,"[""apple cider"", ""egg"", ""sugar"", ""freshly groun...",cooking.nytimes.com


In [22]:
# Extract relevant columns
# 'source' and 'site' are left out
selected_columns = 'title ingredients directions link NER'.split()
df_2M = df_2M.loc[:, selected_columns]

In [23]:
# Drop duplicate recipes
# Rows with the same title and the same directions are duplicates; the first one is kept.
# This runs while 'directions' is still the raw string read from the CSV.
duplicate_keys = ['title', 'directions']
df_2M = df_2M.drop_duplicates(subset=duplicate_keys, keep='first')

In [24]:
# Preview the second dataset after column selection and de-duplication
df_2M.head()

Unnamed: 0,title,ingredients,directions,link,NER
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,"[""bite size shredded rice biscuits"", ""vanilla""..."
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,"[""cream of mushroom soup"", ""beef"", ""sour cream..."
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,"[""frozen corn"", ""pepper"", ""cream cheese"", ""gar..."
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,"[""chicken gravy"", ""cream of mushroom soup"", ""c..."
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,"[""graham cracker crumbs"", ""powdered sugar"", ""p..."


In [25]:
# Convert strings to lists
def _parse_literal(cell):
    """Evaluate a string holding a Python literal (here: a list); non-strings become NaN."""
    if isinstance(cell, str):
        return ast.literal_eval(cell)
    return np.nan


str_list_cols = ['ingredients', 'directions', 'NER']
for list_col in str_list_cols:
    df_2M[list_col] = df_2M[list_col].apply(_parse_literal)

In [26]:
# Number of missing values per column after parsing
df_2M.isna().sum()

title          1
ingredients    0
directions     0
link           0
NER            0
dtype: int64

### Merge the two datasets 

In [27]:
# Left join on the recipe name: every df_recipes2 row is kept and is repeated once
# for each df_2M row that has the same title.
df_merged = df_recipes2.merge(df_2M, how='left', left_on='Name', right_on='title')

In [28]:
# Preview the merge: one recipe name can appear on several rows, one per matching title
df_merged.head()

Unnamed: 0,Name,AuthorName,CookTime,PrepTime,TotalTime,Description,Images,RecipeCategory,Keywords,RecipeIngredientQuantities,...,FatContent,SugarContent,ProteinContent,RecipeServings,RecipeInstructions,title,ingredients,directions,link,NER
0,Low-Fat Berry Blue Frozen Dessert,Dancer,PT24H,PT45M,PT24H45M,Make and share this Low-Fat Berry Blue Frozen ...,https://img.sndimg.com/food/image/upload/w_555...,Frozen Desserts,"[Dessert, Low Protein, Low Cholesterol, Health...","[4, 1⁄4, 1, 1]",...,2.5,30.2,3.2,4.0,"[Toss 2 cups berries with sugar., Let stand fo...",Low-Fat Berry Blue Frozen Dessert,"[4 cups blueberries, fresh or frozen, 1/4 cup ...","[Toss 2 cups berries with sugar., Let stand fo...",www.food.com/recipe/low-fat-berry-blue-frozen-...,"[blueberries, sugar, vanilla yogurt, lemon juice]"
1,Biryani,elly9812,PT25M,PT4H,PT4H25M,Make and share this Biryani recipe from Food.com.,https://img.sndimg.com/food/image/upload/w_555...,Chicken Breast,"[Chicken Thigh & Leg, Chicken, Poultry, Meat, ...","[1, 4, 2, 2, 8, 1⁄4, 8, 1⁄2, 1, 1, 1⁄4, 1⁄4, 1...",...,58.8,20.4,63.4,6.0,[Soak saffron in warm milk for 5 minutes and p...,Biryani,"[1/2 lb. meat or chicken, cut in pieces, 1/2 l...",[Marinate meat or chicken in thick yogurt for ...,www.cookbooks.com/Recipe-Details.aspx?id=548876,"[meat, ginger, cinnamon, cloves, onion, thick ..."
2,Biryani,elly9812,PT25M,PT4H,PT4H25M,Make and share this Biryani recipe from Food.com.,https://img.sndimg.com/food/image/upload/w_555...,Chicken Breast,"[Chicken Thigh & Leg, Chicken, Poultry, Meat, ...","[1, 4, 2, 2, 8, 1⁄4, 8, 1⁄2, 1, 1, 1⁄4, 1⁄4, 1...",...,58.8,20.4,63.4,6.0,[Soak saffron in warm milk for 5 minutes and p...,Biryani,"[1 tablespoon saffron, 4 teaspoons milk, warm,...",[Soak saffron in warm milk for 5 minutes and p...,www.food.com/recipe/biryani-39,"[tomatoes, basmati rice, coriander seed, eggs,..."
3,Biryani,elly9812,PT25M,PT4H,PT4H25M,Make and share this Biryani recipe from Food.com.,https://img.sndimg.com/food/image/upload/w_555...,Chicken Breast,"[Chicken Thigh & Leg, Chicken, Poultry, Meat, ...","[1, 4, 2, 2, 8, 1⁄4, 8, 1⁄2, 1, 1, 1⁄4, 1⁄4, 1...",...,58.8,20.4,63.4,6.0,[Soak saffron in warm milk for 5 minutes and p...,Biryani,"[2 cups basmati rice, 34 kg chicken piece, 12 ...","[Mix tomato puree, yogurt, ginger garlic paste...",www.food.com/recipe/biryani-484928,"[oil, basmati rice, cumin powder, tomato puree..."
4,Best Lemonade,Stephen Little,PT5M,PT30M,PT35M,This is from one of my first Good House Keepi...,https://img.sndimg.com/food/image/upload/w_555...,Beverages,"[Low Protein, Low Cholesterol, Healthy, Summer...","[1 1⁄2, 1, None, 1 1⁄2, None, 3⁄4]",...,0.2,77.2,0.3,4.0,"[Into a 1 quart Jar with tight fitting lid, pu...",Best Lemonade,"[1 1/2 c. sugar, 1 Tbsp. lemon peel, finely gr...","[In 1-quart jar with tight fitting lid, shake ...",www.cookbooks.com/Recipe-Details.aspx?id=293155,"[sugar, lemon juice, cold water, very hot wate..."


In [29]:
# Row count after the merge
df_merged.shape[0]

1334836

#### Keep only the merged rows whose first instruction is identical in both datasets

In [30]:
def _first_step(steps):
    """Return the first element of a non-empty list, otherwise None."""
    if isinstance(steps, list) and len(steps) > 0:
        return steps[0]
    return None


# Only the first instruction of each side is compared to decide whether a merged
# row pairs the same recipe from both datasets.
first_instruction = df_merged['RecipeInstructions'].apply(_first_step)
first_direction = df_merged['directions'].apply(_first_step)
filtered_df = df_merged[first_instruction == first_direction].reset_index(drop=True)

In [31]:
# Display the rows that survived the first-instruction match
filtered_df

Unnamed: 0,Name,AuthorName,CookTime,PrepTime,TotalTime,Description,Images,RecipeCategory,Keywords,RecipeIngredientQuantities,...,FatContent,SugarContent,ProteinContent,RecipeServings,RecipeInstructions,title,ingredients,directions,link,NER
0,Low-Fat Berry Blue Frozen Dessert,Dancer,PT24H,PT45M,PT24H45M,Make and share this Low-Fat Berry Blue Frozen ...,https://img.sndimg.com/food/image/upload/w_555...,Frozen Desserts,"[Dessert, Low Protein, Low Cholesterol, Health...","[4, 1⁄4, 1, 1]",...,2.5,30.2,3.2,4.0,"[Toss 2 cups berries with sugar., Let stand fo...",Low-Fat Berry Blue Frozen Dessert,"[4 cups blueberries, fresh or frozen, 1/4 cup ...","[Toss 2 cups berries with sugar., Let stand fo...",www.food.com/recipe/low-fat-berry-blue-frozen-...,"[blueberries, sugar, vanilla yogurt, lemon juice]"
1,Biryani,elly9812,PT25M,PT4H,PT4H25M,Make and share this Biryani recipe from Food.com.,https://img.sndimg.com/food/image/upload/w_555...,Chicken Breast,"[Chicken Thigh & Leg, Chicken, Poultry, Meat, ...","[1, 4, 2, 2, 8, 1⁄4, 8, 1⁄2, 1, 1, 1⁄4, 1⁄4, 1...",...,58.8,20.4,63.4,6.0,[Soak saffron in warm milk for 5 minutes and p...,Biryani,"[1 tablespoon saffron, 4 teaspoons milk, warm,...",[Soak saffron in warm milk for 5 minutes and p...,www.food.com/recipe/biryani-39,"[tomatoes, basmati rice, coriander seed, eggs,..."
2,Cabbage Soup,Duckie067,PT30M,PT20M,PT50M,Make and share this Cabbage Soup recipe from F...,https://img.sndimg.com/food/image/upload/w_555...,Vegetable,"[Low Protein, Vegan, Low Cholesterol, Healthy,...","[46, 4, 1, 2, 1]",...,0.4,17.7,4.3,4.0,"[Mix everything together and bring to a boil.,...",Cabbage Soup,"[46 ounces plain tomato juice, 4 cups cabbage,...","[Mix everything together and bring to a boil.,...",www.food.com/recipe/cabbage-soup-42,"[cabbage, onion, celery, carrots, tomato juice]"
3,Cabbage Soup,Duckie067,PT30M,PT20M,PT50M,Make and share this Cabbage Soup recipe from F...,https://img.sndimg.com/food/image/upload/w_555...,Vegetable,"[Low Protein, Vegan, Low Cholesterol, Healthy,...","[46, 4, 1, 2, 1]",...,0.4,17.7,4.3,4.0,"[Mix everything together and bring to a boil.,...",Cabbage Soup,"[1 head cabbage, chopped, 1 onion, diced, 4 ce...","[Mix everything together and bring to a boil.,...",www.food.com/recipe/cabbage-soup-204921,"[cabbage, tomato paste, pepper, onion, mushroo..."
4,Buttermilk Pie With Gingersnap Crumb Crust,tristitia,PT50M,PT30M,PT1H20M,Make and share this Buttermilk Pie With Ginger...,https://img.sndimg.com/food/image/upload/w_555...,Pie,"[Dessert, Healthy, Weeknight, Oven, < 4 Hours]","[3⁄4, 1, 1, 2, 3, 1⁄4, 1, 1⁄2, 1⁄2, 2]",...,7.1,24.7,4.2,8.0,"[Preheat oven to 350°F., Make pie crust, using...",Buttermilk Pie With Gingersnap Crumb Crust,"[3/4 cup sugar, 1 tablespoon margarine, 1 egg,...","[Preheat oven to 350°F., Make pie crust, using...",www.food.com/recipe/buttermilk-pie-with-ginger...,"[buttermilk, gingersnap crumbs, egg, sugar, gr..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47378,Korean Chicken Porridge (Dakjuk),PanNan,PT40M,PT1H,PT1H40M,Porridge is very popular as a breakfast food i...,https://img.sndimg.com/food/image/upload/w_555...,Breakfast,"[Korean, Asian, < 4 Hours]","[1, 6, 1 1⁄2, 1, 1, 3 -4, 2, 2, 1]",...,11.9,7.0,13.2,4.0,"[Soak the rice for an hour and drain., Finely ...",Korean Chicken Porridge (Dakjuk),[1 cup short-grain rice (or sweet rice glutino...,"[Soak the rice for an hour and drain., Finely ...",www.food.com/recipe/korean-chicken-porridge-da...,"[scallions, sesame seeds, short-grain rice, ce..."
47379,Grilled Cuban Sandwich,Bonnie G 2,PT10M,PT10M,PT20M,Once I made Cuban bread needed to find an auth...,https://img.sndimg.com/food/image/upload/w_555...,Ham,"[Pork, Meat, < 30 Mins]","[1, 1⁄4, 3, 1 1⁄2, 1 1⁄2, 1, 1]",...,32.8,1.3,53.2,8.0,"[Assemble the sandwich., Spread 2 tablespoons ...",Grilled Cuban Sandwich,"[1 loaf Cuban bread, sliced lengthwise, 1/4 cu...","[Assemble the sandwich., Spread 2 tablespoons ...",www.food.com/recipe/grilled-cuban-sandwich-538973,"[yellow mustard, swiss cheese, ham, dill pickl..."
47380,Boulets Liegeois (Belgian Meatballs),Chef PotPie,PT1H30M,PT10M,PT1H40M,Belgian meatballs swimming in an apple and oni...,https://img.sndimg.com/food/image/upload/w_555...,Belgian,"[European, < 4 Hours]","[1, 1, 4, 1, 1, 2, None, None, 2, 2, 3, 2, 1, ...",...,37.7,20.5,33.1,6.0,[Mix beef and pork together in a large bowl to...,Boulets Liegeois (Belgian Meatballs),"[For the meatballs, 1 lb ground beef, 1 lb gro...",[Mix beef and pork together in a large bowl to...,www.food.com/recipe/boulets-liegeois-belgian-m...,"[ground pork, red wine vinegar, beef stock, un..."
47381,Cuban Mojo Potatoes,PanNan,PT45M,PT15M,PT1H,Make and share this Cuban Mojo Potatoes recipe...,https://img.sndimg.com/food/image/upload/w_555...,Cuban,"[Caribbean, Low Protein, Low Cholesterol, Heal...","[1 1⁄2, 1, 1⁄2, 1⁄4, 1, 6, 3, 1, 1⁄2]",...,13.8,3.2,4.1,4.0,[Lay a sheet of aluminum foil on the counterto...,Cuban Mojo Potatoes,"[1 1/2 lbs yukon gold potatoes, large dice, 1 ...",[Lay a sheet of aluminum foil on the counterto...,www.food.com/recipe/cuban-mojo-potatoes-539090,"[gold potatoes, scallions, cilantro, red peppe..."


#### Drop duplicate columns

In [32]:
# First-dataset columns removed here; 'title', 'ingredients', 'NER' and 'directions'
# from the second dataset remain in the frame.
columns_to_drop = [
    'Name',
    'RecipeIngredientQuantities',
    'RecipeIngredientParts',
    'RecipeInstructions',
]
df = filtered_df.drop(labels=columns_to_drop, axis='columns')

In [33]:
# Display the combined frame used for the cleaning steps below
df

Unnamed: 0,AuthorName,CookTime,PrepTime,TotalTime,Description,Images,RecipeCategory,Keywords,AggregatedRating,Calories,FatContent,SugarContent,ProteinContent,RecipeServings,title,ingredients,directions,link,NER
0,Dancer,PT24H,PT45M,PT24H45M,Make and share this Low-Fat Berry Blue Frozen ...,https://img.sndimg.com/food/image/upload/w_555...,Frozen Desserts,"[Dessert, Low Protein, Low Cholesterol, Health...",4.5,170.9,2.5,30.2,3.2,4.0,Low-Fat Berry Blue Frozen Dessert,"[4 cups blueberries, fresh or frozen, 1/4 cup ...","[Toss 2 cups berries with sugar., Let stand fo...",www.food.com/recipe/low-fat-berry-blue-frozen-...,"[blueberries, sugar, vanilla yogurt, lemon juice]"
1,elly9812,PT25M,PT4H,PT4H25M,Make and share this Biryani recipe from Food.com.,https://img.sndimg.com/food/image/upload/w_555...,Chicken Breast,"[Chicken Thigh & Leg, Chicken, Poultry, Meat, ...",3.0,1110.7,58.8,20.4,63.4,6.0,Biryani,"[1 tablespoon saffron, 4 teaspoons milk, warm,...",[Soak saffron in warm milk for 5 minutes and p...,www.food.com/recipe/biryani-39,"[tomatoes, basmati rice, coriander seed, eggs,..."
2,Duckie067,PT30M,PT20M,PT50M,Make and share this Cabbage Soup recipe from F...,https://img.sndimg.com/food/image/upload/w_555...,Vegetable,"[Low Protein, Vegan, Low Cholesterol, Healthy,...",4.5,103.6,0.4,17.7,4.3,4.0,Cabbage Soup,"[46 ounces plain tomato juice, 4 cups cabbage,...","[Mix everything together and bring to a boil.,...",www.food.com/recipe/cabbage-soup-42,"[cabbage, onion, celery, carrots, tomato juice]"
3,Duckie067,PT30M,PT20M,PT50M,Make and share this Cabbage Soup recipe from F...,https://img.sndimg.com/food/image/upload/w_555...,Vegetable,"[Low Protein, Vegan, Low Cholesterol, Healthy,...",4.5,103.6,0.4,17.7,4.3,4.0,Cabbage Soup,"[1 head cabbage, chopped, 1 onion, diced, 4 ce...","[Mix everything together and bring to a boil.,...",www.food.com/recipe/cabbage-soup-204921,"[cabbage, tomato paste, pepper, onion, mushroo..."
4,tristitia,PT50M,PT30M,PT1H20M,Make and share this Buttermilk Pie With Ginger...,https://img.sndimg.com/food/image/upload/w_555...,Pie,"[Dessert, Healthy, Weeknight, Oven, < 4 Hours]",4.0,228.0,7.1,24.7,4.2,8.0,Buttermilk Pie With Gingersnap Crumb Crust,"[3/4 cup sugar, 1 tablespoon margarine, 1 egg,...","[Preheat oven to 350°F., Make pie crust, using...",www.food.com/recipe/buttermilk-pie-with-ginger...,"[buttermilk, gingersnap crumbs, egg, sugar, gr..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47378,PanNan,PT40M,PT1H,PT1H40M,Porridge is very popular as a breakfast food i...,https://img.sndimg.com/food/image/upload/w_555...,Breakfast,"[Korean, Asian, < 4 Hours]",5.0,386.2,11.9,7.0,13.2,4.0,Korean Chicken Porridge (Dakjuk),[1 cup short-grain rice (or sweet rice glutino...,"[Soak the rice for an hour and drain., Finely ...",www.food.com/recipe/korean-chicken-porridge-da...,"[scallions, sesame seeds, short-grain rice, ce..."
47379,Bonnie G 2,PT10M,PT10M,PT20M,Once I made Cuban bread needed to find an auth...,https://img.sndimg.com/food/image/upload/w_555...,Ham,"[Pork, Meat, < 30 Mins]",5.0,532.2,32.8,1.3,53.2,8.0,Grilled Cuban Sandwich,"[1 loaf Cuban bread, sliced lengthwise, 1/4 cu...","[Assemble the sandwich., Spread 2 tablespoons ...",www.food.com/recipe/grilled-cuban-sandwich-538973,"[yellow mustard, swiss cheese, ham, dill pickl..."
47380,Chef PotPie,PT1H30M,PT10M,PT1H40M,Belgian meatballs swimming in an apple and oni...,https://img.sndimg.com/food/image/upload/w_555...,Belgian,"[European, < 4 Hours]",5.0,627.9,37.7,20.5,33.1,6.0,Boulets Liegeois (Belgian Meatballs),"[For the meatballs, 1 lb ground beef, 1 lb gro...",[Mix beef and pork together in a large bowl to...,www.food.com/recipe/boulets-liegeois-belgian-m...,"[ground pork, red wine vinegar, beef stock, un..."
47381,PanNan,PT45M,PT15M,PT1H,Make and share this Cuban Mojo Potatoes recipe...,https://img.sndimg.com/food/image/upload/w_555...,Cuban,"[Caribbean, Low Protein, Low Cholesterol, Heal...",5.0,293.2,13.8,3.2,4.1,4.0,Cuban Mojo Potatoes,"[1 1/2 lbs yukon gold potatoes, large dice, 1 ...",[Lay a sheet of aluminum foil on the counterto...,www.food.com/recipe/cuban-mojo-potatoes-539090,"[gold potatoes, scallions, cilantro, red peppe..."


## Cleaning

### Columns CookTime, PrepTime and TotalTime

In [34]:
def duration_to_minutes(duration: str) -> float:
    """
    Function to convert ISO 8601 durations to total minutes

    Args:
        - duration (str): duration in ISO 8601 format (ex: 'PT1H30M', 'P1DT2H')

    Returns:
        Total number of minutes as an int (days, hours and minutes are summed;
        other components such as seconds are ignored), or NaN when ``duration``
        is missing.
    """
    if pd.isna(duration):
        return np.nan

    # In ISO 8601 the 'T' designator separates the date part from the time part:
    # 'M' means months before it and minutes after it. Hours and minutes are
    # therefore searched after the 'T' only. A string without 'T' is scanned as
    # a whole, which keeps the previous lenient result for inputs like '1H30M'.
    date_part, separator, time_part = duration.partition('T')
    if not separator:
        time_part = duration

    days = re.search(r'(\d+)D', date_part)
    hours = re.search(r'(\d+)H', time_part)
    minutes = re.search(r'(\d+)M', time_part)

    total_minutes = 0
    if days:
        total_minutes += int(days.group(1)) * 24 * 60
    if hours:
        total_minutes += int(hours.group(1)) * 60
    if minutes:
        total_minutes += int(minutes.group(1))

    return total_minutes

In [35]:
def duration_to_readable_format(duration: str) -> str:
    """
    Function to convert ISO 8601 durations to a more readable format

    Args:
        - duration (str): duration in ISO 8601 format (ex: 'PT1H30M')

    Returns:
        A string such as "1 h 30 min". A day component is folded into the hours
        ('P1DT2H30M' -> "26 h 30 min"). Components that are absent are omitted,
        so the result can be an empty string. NaN is returned when ``duration``
        is missing.

    Example output: "1 h 30 min"
    """
    if pd.isna(duration):
        return np.nan

    # 'T' separates the date part from the time part; 'M' after it means minutes,
    # before it months. A string without 'T' is scanned as a whole, which keeps
    # the previous lenient result for inputs like '1H30M'.
    date_part, separator, time_part = duration.partition('T')
    if not separator:
        time_part = duration

    days = re.search(r'(\d+)D', date_part)
    hours = re.search(r'(\d+)H', time_part)
    minutes = re.search(r'(\d+)M', time_part)

    result = []
    if days or hours:
        total_hours = 0
        if days:
            total_hours += int(days.group(1)) * 24
        if hours:
            total_hours += int(hours.group(1))
        result.append(f"{total_hours} h")
    if minutes:
        result.append(f"{int(minutes.group(1))} min")

    return ' '.join(result)

In [36]:
# Quick check of the readable format on a sample duration
duration_to_readable_format('PT1H30M')

'1 h 30 min'

In [37]:
# Add a numeric '<column>_min' companion (total minutes) for each ISO 8601 duration column
columns = ['CookTime', 'PrepTime', 'TotalTime']
minutes_by_column = {
    f'{duration_col}_min': df[duration_col].apply(duration_to_minutes)
    for duration_col in columns
}
for minutes_col, minutes in minutes_by_column.items():
    df[minutes_col] = minutes

### Columns RecipeCategory and Keywords

In [38]:
# Frequency of each original category, most common first
df['RecipeCategory'].value_counts()

RecipeCategory
Dessert            4782
Lunch/Snacks       3520
Beverages          3224
Vegetable          3123
One Dish Meal      2491
                   ... 
Moose                 1
Ecuadorean            1
Small Appliance       1
Nigerian              1
Belgian               1
Name: count, Length: 247, dtype: int64

In [39]:
# Number of distinct original categories
df['RecipeCategory'].nunique()

247

#### Group categories

Collapse the many `RecipeCategory` values into four broad recipe types:

- Dessert
- Main course
- Breakfast
- Beverages

In [40]:
# Count how often each keyword occurs across all recipes, most frequent first.
# `Counter` is imported in the notebook's import cell.
all_keywords = [keyword for keywords in df['Keywords'] for keyword in keywords]
keyword_counts = Counter(all_keywords)
keyword_counts_df = (
    pd.DataFrame(keyword_counts.items(), columns=['Keyword', 'Count'])
    .sort_values(by='Count', ascending=False)
)


In [41]:
# All distinct keywords, ordered from most to least frequent
keyword_counts_df['Keyword'].unique()

array(['Easy', '< 60 Mins', '< 30 Mins', '< 15 Mins', 'Meat', 'Healthy',
       '< 4 Hours', 'Low Cholesterol', 'Vegetable', 'Beginner Cook',
       'Oven', 'Low Protein', 'Fruit', 'Inexpensive', 'For Large Groups',
       'Kid Friendly', 'Poultry', 'European', 'Stove Top', 'Brunch',
       'Weeknight', 'Dessert', 'Chicken', 'High In...', 'Free Of...',
       'Asian', 'Summer', 'Sweet', 'Breads', 'Cookie & Brownie', 'Cheese',
       'Potluck', 'Vegan', 'Very Low Carbs', 'Savory', 'High Protein',
       'Small Appliance', 'Lunch/Snacks', 'Spicy', 'Winter', 'Berries',
       'Christmas', 'Mexican', 'Tropical Fruits', 'No Cook', 'Nuts',
       'Beans', 'Beverages', 'Canadian', 'Pork', 'Egg Free', 'Breakfast',
       'Grains', 'Apple', 'Citrus', 'Spring', 'Rice', 'Potato',
       'Refrigerator', 'Lactose Free', 'Thanksgiving', 'Toddler Friendly',
       'Kosher', 'Southwestern U.S.', 'Australian', 'Onions', 'Greens',
       'African', 'Strawberry', 'From Scratch', 'Chicken Breast',
       

In [42]:
# Lower-case trigger words for each broad recipe type. Each tuple is joined into
# one regex alternation; assign_category searches them in lower-cased text and
# tries the dict entries in the order written here.
_main_course_terms = (
    'lunch', 'meal', 'meat', 'chicken', 'beef', 'pork', 'steak', 'turkey', 'duck',
    'fish', 'salmon', 'lamb', 'crab', 'shrimp', 'lobster', 'tuna',
    'vegetable', 'potato', 'rice', 'noodle', 'pasta', 'penne', 'spaghetti',
    'macaroni', 'linguine', 'pizza', 'quiche', 'bean', 'lentil', 'onion',
    'soup', 'stew', 'dressing',
)
_breakfast_terms = ('breakfast',)
_dessert_terms = (
    'dessert', 'cake', 'cookie', 'brownie', 'muffin', 'biscuit', 'babka',
    'sweet', 'candy', 'sugar', 'banana',
)
_beverage_terms = ('beverage', 'cocktail', 'smoothie', 'lemonade', 'coffee')

trigger_patterns = {
    'Main Course': '|'.join(_main_course_terms),
    'Breakfast': '|'.join(_breakfast_terms),
    'Dessert': '|'.join(_dessert_terms),
    'Beverages': '|'.join(_beverage_terms),
}

In [43]:
def _first_matching_category(text, patterns):
    """Return the first category in ``patterns`` whose regex is found in ``text``, else None."""
    for cat, pattern in patterns.items():
        if re.search(pattern, text):
            return cat
    return None


def assign_category(row, patterns=None):
    """Assign a coarse recipe type to one recipe row.

    The row's fields are consulted in priority order: ``Keywords``, then
    ``title``, then ``RecipeCategory``. Each text is lower-cased and searched
    with every regex in ``patterns`` (in dict order); the first category that
    matches is returned. When nothing matches, the row's original
    ``RecipeCategory`` value is returned unchanged.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            'Keywords', 'title' and 'RecipeCategory'.
        patterns: Optional mapping of category name -> regex. Defaults to the
            module-level ``trigger_patterns``.

    Returns:
        The matched category name, or ``row['RecipeCategory']`` as a fallback.
    """
    if patterns is None:
        patterns = trigger_patterns

    # Keywords: only list / ndarray values are considered; missing entries are dropped
    keywords = row['Keywords']
    if isinstance(keywords, (list, np.ndarray)):
        keywords_list = [
            str(keyword)
            for keyword in keywords
            if keyword is not None and not pd.isna(keyword)
        ]
        if keywords_list:
            matched = _first_matching_category(' '.join(keywords_list).lower(), patterns)
            if matched is not None:
                return matched

    # Then the title, then the raw category; non-string values are skipped
    for field in ('title', 'RecipeCategory'):
        value = row[field]
        if isinstance(value, str):
            matched = _first_matching_category(value.lower(), patterns)
            if matched is not None:
                return matched

    # Nothing matched: keep the original category label
    return row['RecipeCategory']

In [44]:
# Apply assign_category to every row (axis=1) and store the result as RecipeType.
df['RecipeType'] = df.apply(assign_category, axis=1)

In [45]:
# Count the distinct RecipeType values (includes raw RecipeCategory labels kept by the fallback).
df['RecipeType'].nunique()

144

In [46]:
# Show the 50 most frequent RecipeType values
df['RecipeType'].value_counts().head(50)

RecipeType
Main Course         28122
Dessert              9856
Beverages            3399
Breakfast            1626
Breads                392
Yeast Breads          329
Sauces                327
Low Protein           301
< 15 Mins             232
Quick Breads          213
Cheese                190
< 30 Mins             151
European              135
Spreads               115
Low Cholesterol       114
< 60 Mins             104
Mexican               102
Fruit                 100
Very Low Carbs         98
Grains                 75
Asian                  70
Scones                 61
Tilapia                60
Apple                  55
Healthy                51
Pineapple              44
Chowders               41
Kid Friendly           40
High In...             33
< 4 Hours              31
Halibut                28
Greek                  27
Weeknight              27
Brunch                 24
Vegan                  24
Canadian               23
High Protein           23
Strawberry             22
A

In [47]:
# Inspect the first 50 recipes whose raw category is '< 30 Mins'
df.loc[
    df['RecipeCategory'] == '< 30 Mins',
    ['title', 'RecipeType', 'RecipeCategory', 'Keywords'],
].head(50)

Unnamed: 0,title,RecipeType,RecipeCategory,Keywords
18,Anzac Biscuits,Dessert,< 30 Mins,[Oven]
19,Anzac Biscuits,Dessert,< 30 Mins,[Oven]
91,Vegetarian Baked Stuffed Red Bell Peppers,< 30 Mins,< 30 Mins,[Oven]
158,Hot Pizza Dip,Main Course,< 30 Mins,[Oven]
409,Herbed Garlic Croutons,< 30 Mins,< 30 Mins,[Easy]
1243,The Real Deal Deviled Eggs,< 30 Mins,< 30 Mins,"[Refrigerator, Stove Top, Easy]"
1667,Bratwurst Skillet,< 30 Mins,< 30 Mins,[Stove Top]
2037,Creamy Dill Noodles,Main Course,< 30 Mins,[Stove Top]
2615,Pasta With Scallops,Main Course,< 30 Mins,"[Stove Top, Easy]"
3026,Orzo Salad,< 30 Mins,< 30 Mins,[Easy]


Create a new boolean column df['Beginner_Friendly'] that is True when "Easy" is in df['Keywords'] and False otherwise.

In [48]:
# Index labels of the recipes whose Keywords contain "Easy"
easy_recipe_indices = df.loc[df['Keywords'].apply(lambda kws: 'Easy' in kws)].index

# Beginner_Friendly is True exactly for those labels and False for every other row
df['Beginner_Friendly'] = df.index.isin(easy_recipe_indices)

In [49]:
# Tally True/False in Beginner_Friendly; dropna=False would also surface missing values.
df['Beginner_Friendly'].value_counts(dropna=False)

Beginner_Friendly
True     26742
False    20641
Name: count, dtype: int64

Create a new categorical column df['TotalTime_range'] that takes one of 3 values:
- under30m if the value in df['TotalTime'] indicates under 30 minutes of preparation time 
- between30and60m if the value in df['TotalTime'] indicates between 30 minutes and 1 hour of preparation time 
- above60m if the value in df['TotalTime'] indicates more than 1 hour of preparation time 

In [50]:
# Preview the raw CookTime strings (e.g. 'PT25M', 'PT1H30M') before bucketing them.
df["CookTime"]

0          PT24H
1          PT25M
2          PT30M
3          PT30M
4          PT50M
          ...   
47378      PT40M
47379      PT10M
47380    PT1H30M
47381      PT45M
47382       PT0M
Name: CookTime, Length: 47383, dtype: object

In [51]:
def categorize_time(time_str):
    """Bucket an ISO 8601 duration string by its total length.

    The duration is parsed into days/hours/minutes/seconds, so equivalent
    spellings land in the same bucket (e.g. 'PT60M' and 'PT1H').

    Args:
        time_str: Duration string such as 'PT25M', 'PT1H', 'PT1H30M' or
            'P1DT2H'.

    Returns:
        'under30m' for durations up to and including 30 minutes,
        'between30and60m' for durations above 30 and up to 60 minutes,
        'above60m' for anything longer. Values that cannot be parsed as a
        duration (None, NaN, malformed text) also fall back to 'above60m'.
    """
    if isinstance(time_str, str):
        match = re.fullmatch(
            r'P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?',
            time_str,
        )
        # Require at least one component so that bare 'P' / 'PT' is not read as zero
        if match and any(part is not None for part in match.groups()):
            days, hours, minutes, seconds = (int(part or 0) for part in match.groups())
            total_seconds = ((days * 24 + hours) * 60 + minutes) * 60 + seconds
            if total_seconds <= 30 * 60:
                return 'under30m'
            if total_seconds <= 60 * 60:
                return 'between30and60m'
            return 'above60m'

    # Unparseable input keeps the fallback bucket
    return 'above60m'

In [52]:
# Bucket each recipe's duration with categorize_time.
# NOTE(review): the column is named TotalTime_range but is derived from CookTime — confirm which duration is intended.
df['TotalTime_range'] = df['CookTime'].apply(categorize_time)

In [53]:
# Show each assigned bucket next to the raw CookTime string it was derived from.
df[['TotalTime_range','CookTime']] 

Unnamed: 0,TotalTime_range,CookTime
0,above60m,PT24H
1,under30m,PT25M
2,under30m,PT30M
3,under30m,PT30M
4,between30and60m,PT50M
...,...,...
47378,between30and60m,PT40M
47379,under30m,PT10M
47380,above60m,PT1H30M
47381,between30and60m,PT45M


Create a new boolean column df['Beginner_Friendly'] that is True when "Easy" is in df['Keywords'] and False otherwise.

In [54]:
# NOTE(review): this cell repeats the Beginner_Friendly computation of cell In [48].
# Index labels of the recipes whose Keywords contain "Easy"
easy_recipe_indices = df.loc[df['Keywords'].apply(lambda kws: 'Easy' in kws)].index

# Beginner_Friendly is True exactly for those labels and False for every other row
df['Beginner_Friendly'] = df.index.isin(easy_recipe_indices)

In [55]:
# Tally True/False in Beginner_Friendly; dropna=False would also surface missing values.
df['Beginner_Friendly'].value_counts(dropna=False)

Beginner_Friendly
True     26742
False    20641
Name: count, dtype: int64

Create a new boolean column df['Vegetarian_Friendly'] that is True when none of the recipe's ingredients in df['ingredients'] contains a non-vegetarian keyword, and False otherwise.

In [56]:
# Define the list of non-vegetarian keywords
non_veg_keywords = {
    'meat', 'chicken', 'beef', 'pork', 'fish', 'bacon', 'ham', 
    'sausage', 'lamb', 'duck', 'goose', 'lobster', 'shrimp', 'crab', 
    'squid', 'octopus', 'calamari', 'oyster', 'mussel', 'clam', 'snail', 
    'foie gras', 'prosciutto', 'salami', 'pepperoni', 'pancetta', 
    'chorizo', 'andouille', 'boudin noir', 'blood sausage', 'pate', 
    'terrine', 'confit', 'charcuterie', 'shellfish', 'crustacean', 'mollusk', 
    'veal', 'venison', 'game', 'poultry', 'seafood', 
    'turkey', 'bison', 'elk', 'reindeer', 'wild boar', 
    'jerky', 'biltong', 
    'tuna', 'salmon', 'cod', 'haddock', 'halibut', 
    'tilapia', 'shrimp', 'prawns', 'scallops', 'mussels', 
    'clams', 'oysters', 'scallops', 'anchovies'
}

def is_non_veg(ingredient_list):
    """
    Checks if any non-vegetarian keyword is present in the list of ingredients.

    Args:
        ingredient_list: A list of ingredients.

    Returns:
        True if any non-vegetarian keyword is found, False otherwise.
    """
    for ingredient in ingredient_list:
        if any(keyword in str(ingredient).lower() for keyword in non_veg_keywords):
            return True
    return False


# Store the ingredients as a column of plain Python objects
df['ingredients_list'] = df['ingredients'].tolist()

# A recipe is vegetarian-friendly when is_non_veg finds no keyword in its ingredients
df['Vegetarian_Friendly'] = ~df['ingredients_list'].map(is_non_veg)



In [57]:
# Tally True/False in Vegetarian_Friendly; dropna=False would also surface missing values.
df['Vegetarian_Friendly'].value_counts(dropna=False)

Vegetarian_Friendly
True     29253
False    18130
Name: count, dtype: int64

In [63]:
# First 100 rows of the columns needed to review the vegetarian flag by eye
selected_df = df[['title', 'Vegetarian_Friendly', 'ingredients']].head(100)

# Convert to CSV
#selected_df.to_csv('output.csv', index=False)

Create a new column to propose World Cuisine !