In [11]:
# Loading packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [12]:
# Loading data 
recipes_raw = pd.read_parquet("../data/recipes.parquet")
recipes_raw['RecipeId'] = recipes_raw['RecipeId'].astype(int)
recipes_raw.set_index('RecipeId', inplace=True)

reviews_raw = pd.read_parquet("../data/reviews.parquet")
reviews_raw.set_index('ReviewId', inplace=True)

keywords = pd.read_parquet("../data/clean_columns/keywords_clean.parquet")

In [13]:
# We retrieve the keywords for our recipes
recipes = pd.merge(recipes_raw, keywords[['KeywordsClean']], left_index=True, right_index=True, how='inner')
recipes[['Name', 'Description', 'RecipeCategory', 'KeywordsClean']].sample(5)

Unnamed: 0_level_0,Name,Description,RecipeCategory,KeywordsClean
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
432075,Pineapple Infused After Dinner Drink (Pineappl...,"Ran into a special on fresh pineapples, and my...",Beverages,"[freezer, dessert, < 15 mins, lemon, pineapple..."
482251,Garlic Scape Pesto,Simple garlic scape pesto recipe from http://w...,< 15 Mins,"[simple, easy]"
448016,Spicy Braised Sweet Potatoes,from NY Times' &quot;Recipes for Health&quot; ...,Yam/Sweet Potato,"[potato, vegetable, < 4 hours, healthy, low pr..."
41802,Crock Pot Beef in Sauce,Throw it out at a dinner party! I am sure your...,Sauces,"[meat, healthy, sauce, weeknight, low choleste..."
402442,"Mushroom, French Lentil and Chestnut Ragout","&quot;smoky dried chestnuts, roasted mushrooms...",< 4 Hours,"[oven, roast, lentil, lemon, vegan, onion, green]"


In [14]:
# This is just for analysis purposes

from collections import Counter
from itertools import chain

flattened_keywords = list(chain.from_iterable(recipes['KeywordsClean']))
keyword_counts = Counter(flattened_keywords)
sorted_keywords = sorted(keyword_counts.items(), key=lambda x: x[1], reverse=False)

### Creating meal type columns.

By lack of time I will identify the words in `sorted_keywords` that correspond to a meal type by hand. I will do this by analyzing batches of 50 words.

In [15]:
# The first list was manually retreived from the keywords
all_meal_types = ['weeknight', 'summer', 'dessert', 'winter', 'for large groups', 'breakfast', 'christmas', 'lunch/snacks', 
                  'kid friendly', "st. patrick's day", 'thanksgiving', 'toddler friendly', 'frozen dessert', 'chinese new year', 
                  'for large groups holiday/event', 'lime dessert', 'breakfast egg','breakfast potatoes', 'strawberries dessert', 
                  'memorial day', 'labor day', 'desserts easy', 'coconut dessert']

main_meal_types = {
    'weeknight': ['weeknight'],
    'summer': ['summer'],
    'winter': ['winter'],
    'spring': ['spring'],
    'large_groups': ['for large groups', 'for large groups holiday/event'],
    'holiday': ['christmas', "st. patrick's day", 'chinese new year', 'for large groups holiday/event', 'memorial day', 'labor day', 'birthday'],
    'kid_friendly': ['kid friendly', 'toddler friendly'],
    'dessert': ['frozen dessert', 'lime dessert', 'strawberries dessert', 'desserts easy', 'coconut dessert'],
    'breakfast': ['breakfast', 'breakfast egg', 'breakfast potatoes'],
    'lunch_or_snack': ['lunch/snacks', 'brunch'],
    'sauce': ['sauce'],
    'beverage': ['margarita', 'beverage', 'drink', 'beverage', 'smoothie', 'tea', 'coffee', 'cafe', 'cappuccino', 'shake']
}

Now, we retrieve the Meal Type keywords for each meal in the `MealType` attribute. 

In [16]:
def get_meal_types(arr):
    meal_types = set()
    for word in arr:
        for meal_type in main_meal_types:
            if word in main_meal_types[meal_type]:
                meal_types.add(meal_type)
    return list(meal_types)

recipes['MealType'] = recipes['KeywordsClean'].apply(get_meal_types)

We can create one column for each main meal type and set it to 1 or 0 dependeding on whether the corresponding meal type is in the `MealType` list of the recipe. __However__ for storage purposes we will avoid saving the 0/1 columns here.

In [17]:
for meal_type in main_meal_types:
    recipes[meal_type] = recipes['MealType'].apply(lambda x: 1 if meal_type in x else 0)

recipes[['Name','MealType'] + list(main_meal_types)].sample(5)

Unnamed: 0_level_0,Name,MealType,weeknight,summer,winter,spring,large_groups,holiday,kid_friendly,dessert,breakfast,lunch_or_snack,sauce,beverage
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
229054,Sausage Onion Cornbread,[lunch_or_snack],0,0,0,0,0,0,0,0,0,1,0,0
10173,Wild Cranberry Jelly,[],0,0,0,0,0,0,0,0,0,0,0,0
31599,Blueberry Glazed Pie,[summer],0,1,0,0,0,0,0,0,0,0,0,0
173613,Egyptian Sweet Couscous Dessert,[],0,0,0,0,0,0,0,0,0,0,0,0
43950,Summer Surprise Cake,"[sauce, large_groups, summer]",0,1,0,0,1,0,0,0,0,0,1,0


In [18]:
# Saving the meal types per recipe in a file
recipes[['MealType']].to_parquet('../data/clean_columns/mealtype_clean.parquet')

percent_w_mealtype = len(recipes[recipes['MealType'].apply(len) > 0]) / len(recipes)
print(f"{round(percent_w_mealtype, 3)*100}% of the dataset in this Notebook has at least one meal type entry.")

53.2% of the dataset in this Notebook has at least one meal type entry.


### Creating Main ingredients columns

In [19]:
# List of all ingredient-like keyword retrieved by hand

all_ingredients = {
    'chicken', 'meat', 'poultry', 'chicken thigh & leg', 'beans', 'vegetable', 'cabbage', 'cucumber', 'fruit', 'nuts',
    'pineapple', 'berries', 'corn', 'salad', 'sauce', 'cheese', 'rice', 'citrus', 'grapes', 'apple', 'egg', 'mango', 'chocolate'
    'lime', 'onions', 'potato', 'spaghetti', 'pork', 'tuna', 'pasta', 'short grain rice', 'coconut', 'beef liver', 'butter',
    'beef organ meats', 'white rice', 'chicken breast', 'chocolate', 'peppers', 'lobster', 'greens', 'plums', 'meatballs',
    'crab', 'avocado', 'pears', 'lemon', 'spinach', 'cauliflower', 'soy/tofu', 'ham', 'oranges', 'long grain rice', 'raspberries',
    'pumpkin', 'lentil', 'meatloaf', 'mushroom', 'veggie', 'lamb/sheep', 'cherries', 'crawfish', 'strawberry', 'oatmeal',
    'oysters', 'artichoke', 'whole chicken', 'ramen', 'melons', 'burger', 'grains', 'papaya', 'bass', 'penne', 'no shell fish', 'mussels',
    'squid', 'goose', 'brown rice', 'catfish', 'deer', 'tilapia', 'duck breasts', 'quail', 'medium grain rice', 'chicken livers',
    'kiwifruit', 'octopus', 'moose', 'roast beef', 'whole duck', 'whole turkey', 'eggs breakfast', 'pork loin', 'breakfast eggs', 'breakfast potatoes'
}

In [20]:
# Ingredient main groups and their corresponding keywords
main_ingredients = {
    'meat': ['meat', 'chicken', 'poultry', 'chicken thigh & leg', 'pork', 'beef liver', 'beef organ meat', 'meatball', 'ham', 'lamb/sheep', 'meatloaf', 
             'burger', 'beef','roast beef', 'pork loin', 'chicken breast', 'chicken liver', 'duck breast', 'whole chicken', 'whole duck', 'whole turkey', 
             'goose', 'deer', 'quail', 'moose', 'tuna', 'lobster', 'crab', 'squid', 'no shell fish', 'oyster', 'bass', 'mussel', 'catfish', 'tilapia', 
             'octopus', 'crawfish'],
    'poultry': ['poultry', 'chicken', 'poultry', 'chicken thigh & leg', 'chicken breast', 'chicken liver', 'duck breast', 'whole chicken', 'whole duck', 
                'whole turkey', 'goose', 'deer', 'quail'],
    'beef': ['beef', 'beef liver', 'beef organ meat', 'roast beef'],
    'pork':['pork', 'pork loin', 'ham', 'bacon'],
    'non_meat_protein': ['bean', 'lentil', 'soy/tofu'],
    'vegetables': ['vegetable', 'cabbage', 'cucumber', 'corn', 'onion', 'potato', 'pepper', 'green', 'spinach', 'cauliflower', 'mushroom', 'artichoke', 
                   'breakfast potatoes', 'salad', 'veggies'],
    'fruit': ['fruit', 'pineapple', 'berry', 'citrus', 'grape', 'apple', 'mango', 'lime', 'plum', 'avocado', 'pear', 'lemon', 'strawberry', 'pumpkin', 
              'melons', 'cherry', 'raspberry', 'kiwifruit', 'papaya', 'melon', 'orange', 'coconut'],
    'dairy': ['cheese', 'butter', 'cheesecake'],
    'chocolate': ['chocolate', 'chocolate chip cookie', 'cookie & brownie'],
    'eggs': ['egg', 'eggs breakfast', 'breakfast egg'],
    'grains': ['grain', 'rice', 'short grain rice', 'white rice', 'long grain rice', 'brown rice', 'medium grain rice', 'oatmeal', 'penne'],
    'seafood': ['tuna', 'lobster', 'crab', 'squid', 'no shell fish', 'oyster', 'bass', 'mussel', 'catfish', 'tilapia', 'octopus', 'crawfish'],
    'pasta': ['pasta', 'penne', 'pasta elbow', 'pasta shell']
}

# Creating a reverse dictionary to map keywords to their respective list of main groups
# This step optimizes (time-wise) what we did for meal type
reverse_ingredients_dict = {}
for group, keywords in main_ingredients.items():
    for keyword in keywords:
        if keyword not in reverse_ingredients_dict:
            reverse_ingredients_dict[keyword] = [group]
        else:
            reverse_ingredients_dict[keyword].append(group)

# Getting ingredient groups for a given list of keywords
def get_main_ingredients(keyword_list):
    groups = set()
    for keyword in keyword_list:
        if keyword in reverse_ingredients_dict:
            groups = groups | set(reverse_ingredients_dict[keyword])
    return list(groups)

recipes['MainIngredients'] = recipes['KeywordsClean'].apply(get_main_ingredients)

In [21]:
# Testing that everything works as it should
for main_ingredient in main_ingredients:
    recipes[main_ingredient] = recipes['MainIngredients'].apply(lambda x: 1 if main_ingredient in x else 0)

recipes[['Name', 'MainIngredients', 'Description'] + list(main_ingredients)].sample(5)

Unnamed: 0_level_0,Name,MainIngredients,Description,meat,poultry,beef,pork,non_meat_protein,vegetables,fruit,dairy,chocolate,eggs,grains,seafood,pasta
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
197793,Nestles Pumpkin Nog,[fruit],"A combination of pumpkin, evaporated milk, cin...",0,0,0,0,0,0,1,0,0,0,0,0,0
313778,Blueberry Muffins,"[dairy, fruit]",This recipe uses a bit of regular flour and a ...,0,0,0,0,0,0,1,1,0,0,0,0,0
325102,Wild Mushroom Terrine,"[vegetables, fruit]",with pickled onion and spinach n top of baby l...,0,0,0,0,0,1,1,0,0,0,0,0,0
431881,Lemon Balm on Ice,[fruit],Make and share this Lemon Balm on Ice recipe f...,0,0,0,0,0,0,1,0,0,0,0,0,0
76161,Triple Chocolate Custard Pie,[chocolate],Make and share this Triple Chocolate Custard P...,0,0,0,0,0,0,0,0,1,0,0,0,0


In [22]:
# Saving the main ingredients per recipe in a file
recipes[['MainIngredients']].to_parquet('../data/clean_columns/main_ingredients_clean.parquet')

percent_w_ingredients = len(recipes[recipes['MainIngredients'].apply(len) > 0]) / len(recipes)
print(f"{round(percent_w_ingredients, 3)*100}% of the dataset in this Notebook has at least one main ingredient entry.")

90.9% of the dataset in this Notebook has at least one main ingredient entry.
