In [39]:
# Loading packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [40]:
# Loading data 
recipes_raw = pd.read_parquet("../data/recipes.parquet")
reviews_raw = pd.read_parquet("../data/reviews.parquet")
keywords = pd.read_pickle("../data/clean_columns/keywords_clean.pk")

In [41]:
# We retrieve the keywords for our recipes
recipes = recipes_raw.merge(keywords, how='inner', on='RecipeId')
recipes[['Name', 'Description', 'RecipeCategory', 'KeywordsClean']].sample(5)

Unnamed: 0,Name,Description,RecipeCategory,KeywordsClean
277444,Sugarless Jam,"Good for diabetics, or WeightWatchers program,...",Fruit,"[beginner cook, for large groups, healthy, low..."
263858,Spoiled Chicken,The reason I have called it spoiled chicken is...,Chicken Breast,"[beginner cook, meat, < 4 hours, inexpensive, ..."
80786,Beef Soup With Noodles,This is a great soup with a hint of oriental c...,Onions,"[soup, sauce, grain, peppers, meat, pepper, st..."
73481,Roast Goose with Forcemeat and Spiced Cranberr...,I just ordered my goose (a 14lb one) for Chris...,Goose,"[spicy, cocktail, poultry, stock, meat, pepper..."
13737,Pistachio Coconut Cookies,Make and share this Pistachio Coconut Cookies ...,Drop Cookies,"[nuts, dessert, < 15 mins, coconut, fruit, bak..."


In [42]:
# This is just for analysis purposes

from collections import Counter
from itertools import chain

flattened_keywords = list(chain.from_iterable(recipes['KeywordsClean']))
keyword_counts = Counter(flattened_keywords)
sorted_keywords = sorted(keyword_counts.items(), key=lambda x: x[1], reverse=False)

### Creating meal type columns.

By lack of time I will identify the words in `sorted_keywords` that correspond to a meal type by hand. I will do this by analyzing batches of 50 words.

In [43]:
# The first list was manually retreived from the keywords
all_meal_types = ['weeknight', 'summer', 'dessert', 'winter', 'for large groups', 'breakfast', 'christmas', 'lunch/snacks', 
                  'kid friendly', "st. patrick's day", 'thanksgiving', 'toddler friendly', 'frozen dessert', 'chinese new year', 
                  'for large groups holiday/event', 'lime dessert', 'breakfast egg','breakfast potatoes', 'strawberries dessert', 
                  'memorial day', 'labor day', 'desserts easy', 'coconut dessert']

main_meal_types = {
    'weeknight': ['weeknight'],
    'summer': ['summer'],
    'winter': ['winter'],
    'spring': ['spring'],
    'large_groups': ['for large groups', 'for large groups holiday/event'],
    'holiday': ['christmas', "st. patrick's day", 'chinese new year', 'for large groups holiday/event', 'memorial day', 'labor day', 'birthday'],
    'kid_friendly': ['kid friendly', 'toddler friendly'],
    'dessert': ['frozen dessert', 'lime dessert', 'strawberries dessert', 'desserts easy', 'coconut dessert'],
    'breakfast': ['breakfast', 'breakfast egg', 'breakfast potatoes'],
    'lunch_or_snack': ['lunch/snacks', 'brunch'],
    'sauce': ['sauce'],
    'beverage': ['margarita', 'beverage', 'drink', 'beverage', 'smoothie', 'tea', 'coffee', 'cafe', 'cappuccino', 'shake']
}

Now, we retrieve the Meal Type keywords for each meal in the `MealType` attribute. 

In [44]:
def get_meal_types(arr):
    meal_types = set()
    for word in arr:
        for meal_type in main_meal_types:
            if word in main_meal_types[meal_type]:
                meal_types.add(meal_type)
    return list(meal_types)

recipes['MealType'] = recipes['KeywordsClean'].apply(get_meal_types)

We can create one column for each main meal type and set it to 1 or 0 dependeding on whether the corresponding meal type is in the `MealType` list of the recipe. __However__ for storage purposes we will avoid saving the 0/1 columns here.

In [45]:
for meal_type in main_meal_types:
    recipes[meal_type] = recipes['MealType'].apply(lambda x: 1 if meal_type in x else 0)

recipes[['RecipeId', 'MealType'] + list(main_meal_types)].sample(5)

Unnamed: 0,RecipeId,MealType,weeknight,summer,winter,spring,large_groups,holiday,kid_friendly,dessert,breakfast,lunch_or_snack,sauce,beverage
86816,92068.0,[beverage],0,0,0,0,0,0,0,0,0,0,0,1
260580,271205.0,[beverage],0,0,0,0,0,0,0,0,0,0,0,1
431626,447865.0,[weeknight],1,0,0,0,0,0,0,0,0,0,0,0
494661,513120.0,"[sauce, weeknight]",1,0,0,0,0,0,0,0,0,0,1,0
194764,203483.0,"[sauce, spring]",0,0,0,1,0,0,0,0,0,0,1,0


In [46]:
# Saving the meal types per recipe in a file
recipes[['RecipeId', 'MealType']].to_pickle('../data/clean_columns/mealtype_clean.pk')

percent_w_mealtype = len(recipes[recipes['MealType'].apply(len) > 0]) / len(recipes)
print(f"{round(percent_w_mealtype, 3)*100}% of the dataset in this Notebook has at least one meal type entry.")

53.2% of the dataset in this Notebook has at least one meal type entry.


### Creating Main ingredients columns

In [23]:
# List of all ingredient-like keyword retrieved by hand

all_ingredients = {
    'chicken', 'meat', 'poultry', 'chicken thigh & leg', 'beans', 'vegetable', 'cabbage', 'cucumber', 'fruit', 'nuts',
    'pineapple', 'berries', 'corn', 'salad', 'sauce', 'cheese', 'rice', 'citrus', 'grapes', 'apple', 'egg', 'mango', 'chocolate'
    'lime', 'onions', 'potato', 'spaghetti', 'pork', 'tuna', 'pasta', 'short grain rice', 'coconut', 'beef liver', 'butter',
    'beef organ meats', 'white rice', 'chicken breast', 'chocolate', 'peppers', 'lobster', 'greens', 'plums', 'meatballs',
    'crab', 'avocado', 'pears', 'lemon', 'spinach', 'cauliflower', 'soy/tofu', 'ham', 'oranges', 'long grain rice', 'raspberries',
    'pumpkin', 'lentil', 'meatloaf', 'mushroom', 'veggie', 'lamb/sheep', 'cherries', 'crawfish', 'strawberry', 'oatmeal',
    'oysters', 'artichoke', 'whole chicken', 'ramen', 'melons', 'burger', 'grains', 'papaya', 'bass', 'penne', 'no shell fish', 'mussels',
    'squid', 'goose', 'brown rice', 'catfish', 'deer', 'tilapia', 'duck breasts', 'quail', 'medium grain rice', 'chicken livers',
    'kiwifruit', 'octopus', 'moose', 'roast beef', 'whole duck', 'whole turkey', 'eggs breakfast', 'pork loin', 'breakfast eggs', 'breakfast potatoes'
}

In [47]:
# Ingredient main groups and their corresponding keywords
main_ingredients = {
    'meat': ['meat', 'chicken', 'poultry', 'chicken thigh & leg', 'pork', 'beef liver', 'beef organ meat', 'meatball', 'ham', 'lamb/sheep', 'meatloaf', 
             'burger', 'beef','roast beef', 'pork loin', 'chicken breast', 'chicken liver', 'duck breast', 'whole chicken', 'whole duck', 'whole turkey', 
             'goose', 'deer', 'quail', 'moose', 'tuna', 'lobster', 'crab', 'squid', 'no shell fish', 'oyster', 'bass', 'mussel', 'catfish', 'tilapia', 
             'octopus', 'crawfish'],
    'poultry': ['poultry', 'chicken', 'poultry', 'chicken thigh & leg', 'chicken breast', 'chicken liver', 'duck breast', 'whole chicken', 'whole duck', 
                'whole turkey', 'goose', 'deer', 'quail'],
    'beef': ['beef', 'beef liver', 'beef organ meat', 'roast beef'],
    'pork':['pork', 'pork loin', 'ham', 'bacon'],
    'non_meat_protein': ['bean', 'lentil', 'soy/tofu'],
    'vegetables': ['vegetable', 'cabbage', 'cucumber', 'corn', 'onion', 'potato', 'pepper', 'green', 'spinach', 'cauliflower', 'mushroom', 'artichoke', 
                   'breakfast potatoes', 'salad', 'veggies'],
    'fruit': ['fruit', 'pineapple', 'berry', 'citrus', 'grape', 'apple', 'mango', 'lime', 'plum', 'avocado', 'pear', 'lemon', 'strawberry', 'pumpkin', 
              'melons', 'cherry', 'raspberry', 'kiwifruit', 'papaya', 'melon', 'orange', 'coconut'],
    'dairy': ['cheese', 'butter', 'cheesecake'],
    'chocolate': ['chocolate', 'chocolate chip cookie', 'cookie & brownie'],
    'eggs': ['egg', 'eggs breakfast', 'breakfast egg'],
    'grains': ['grain', 'rice', 'short grain rice', 'white rice', 'long grain rice', 'brown rice', 'medium grain rice', 'oatmeal', 'penne'],
    'seafood': ['tuna', 'lobster', 'crab', 'squid', 'no shell fish', 'oyster', 'bass', 'mussel', 'catfish', 'tilapia', 'octopus', 'crawfish'],
    'pasta': ['pasta', 'penne', 'pasta elbow', 'pasta shell']
}

# Creating a reverse dictionary to map keywords to their respective list of main groups
# This step optimizes (time-wise) what we did for meal type
reverse_ingredients_dict = {}
for group, keywords in main_ingredients.items():
    for keyword in keywords:
        if keyword not in reverse_ingredients_dict:
            reverse_ingredients_dict[keyword] = [group]
        else:
            reverse_ingredients_dict[keyword].append(group)

# Getting ingredient groups for a given list of keywords
def get_main_ingredients(keyword_list):
    groups = set()
    for keyword in keyword_list:
        if keyword in reverse_ingredients_dict:
            groups = groups | set(reverse_ingredients_dict[keyword])
    return list(groups)

recipes['MainIngredients'] = recipes['KeywordsClean'].apply(get_main_ingredients)

In [48]:
# Testing that everything works as it should
for main_ingredient in main_ingredients:
    recipes[main_ingredient] = recipes['MainIngredients'].apply(lambda x: 1 if main_ingredient in x else 0)

recipes[['RecipeId', 'Name', 'MainIngredients', 'Description'] + list(main_ingredients)].sample(5)

Unnamed: 0,RecipeId,Name,MainIngredients,Description,meat,poultry,beef,pork,non_meat_protein,vegetables,fruit,dairy,chocolate,eggs,grains,seafood,pasta
4196,6996.0,Chinese-Style Spareribs,"[vegetables, dairy]",This is one of the Zaar recipes that I adopted...,0,0,0,0,0,1,0,1,0,0,0,0,0
438099,454580.0,Mom's Famous Raw Pesto,[pasta],My mom is vegan and makes this all the time. I...,0,0,0,0,0,0,0,0,0,0,0,0,1
225658,235284.0,Smoked Salmon Potatoes,"[vegetables, fruit]",Make and share this Smoked Salmon Potatoes rec...,0,0,0,0,0,1,1,0,0,0,0,0,0
193228,201905.0,Fried Whitebait,[],Whitebait are little fish which are dredged in...,0,0,0,0,0,0,0,0,0,0,0,0,0
400453,415348.0,Mrs. H's Lemon Bars,[fruit],This is a coveted recipe - seriously -- I thin...,0,0,0,0,0,0,1,0,0,0,0,0,0


In [49]:
# Saving the main ingredients per recipe in a file
recipes[['RecipeId', 'MainIngredients']].to_pickle('../data/clean_columns/main_ingredients_clean.pk')

percent_w_ingredients = len(recipes[recipes['MainIngredients'].apply(len) > 0]) / len(recipes)
print(f"{round(percent_w_ingredients, 3)*100}% of the dataset in this Notebook has at least one main ingredient entry.")

90.9% of the dataset in this Notebook has at least one main ingredient entry.
