## Cleaning and Resorting Categories 

<u>Input</u>: Raw .json from epicurious.com *data/recipe_urls.json*  <br>
<u>Output</u>: *data/recipe_urls_final.json* 

In [4]:
import json 

# Load the raw scraped recipes.
# JSON text is UTF-8 by specification, so the encoding is passed explicitly;
# without it open() falls back to the platform's locale encoding and
# non-ASCII values may be decoded differently on another machine.
with open("data/recipe_urls.json", "r", encoding="utf-8") as f:
    recipes = json.load(f)

len(recipes)

35770

### Data Exploration: Categories

In [51]:
# Peek at the raw category mapping of ten sample recipes (positions 10-19).
for recipe in recipes[10:20]:
    sample_categories = recipe["categories"]
    print(sample_categories)

{'special-consideration': ['Healthy', 'Vegetarian', 'Vegan', 'Peanut Free', 'Wheat/Gluten-Free'], 'occasion': ['Winter'], 'meal': ['Lunch'], 'type': ['Salad'], 'tag': ['Quick and Healthy'], 'source': ['HarperCollins'], 'ingredient': ['Arugula', 'Lettuce', 'Persimmon', 'Pistachio', 'Nut']}
{'special-consideration': ['Kid-Friendly', 'Wheat/Gluten-Free', 'Peanut Free', 'Dairy Free'], 'source': ['HarperCollins'], 'type': ['Candy'], 'tag': ['snack', 'Small Plates'], 'meal': ['Dessert'], 'ingredient': ['Chocolate', 'Almond', 'Coconut', 'Nut']}
{'occasion': ['Winter'], 'meal': ['Dinner'], 'source': ['HarperCollins'], 'ingredient': ['Potato', 'Apple', 'Ham', 'Dairy', 'Mustard', 'Bacon', 'Pork']}
{'source': ['HarperCollins'], 'occasion': ["New Year's Eve", 'Winter', 'Christmas', 'Christmas Eve'], 'type': ['Salad'], 'ingredient': ['Pear', 'Radish', 'Dill', 'Walnut', 'Meat']}
{'special-consideration': ['Vegetarian', 'Wheat/Gluten-Free', 'Kid-Friendly'], 'source': ['HarperCollins'], 'type': ['Lasa

In [52]:
# Distinct values seen per category across all recipes.
# The category names found during exploration are pre-seeded so that the
# report printed from this dict keeps a stable key order.
unique_cats = {"occasion": set(),
               "ingredient": set(),
               "special-consideration": set(),
               "cuisine": set(),
               "location": set(),
               "type": set(),
               "source": set(),
               "tag": set(),
               "tags": set(),
               "meal": set(),
               "equipment": set(),
               "technique": set(),
               "partner": set()}


for recipe in recipes:
    categories = recipe["categories"]

    # Recipes whose "categories" entry is not a mapping (e.g. None) carry
    # nothing to collect; skip them explicitly instead of relying on an
    # AttributeError from .items().
    if not hasattr(categories, "items"):
        continue

    for uc, vals in categories.items():
        # setdefault keeps the loop from failing with a KeyError on a category
        # name that is not pre-seeded above; set.update ignores duplicates, so
        # no membership test is needed.
        unique_cats.setdefault(uc, set()).update(vals)

In [53]:
# Report the number of distinct values per category, followed by the total.
all_cats = 0
for k, v in unique_cats.items():
    n_values = len(v)
    all_cats += n_values
    print('Length Category "{}": {}'.format(k, n_values))
print('-'*45 + '\nLength All Cats: {}'.format(all_cats))

Length Category "occasion": 53
Length Category "ingredient": 339
Length Category "special-consideration": 29
Length Category "cuisine": 61
Length Category "location": 141
Length Category "type": 71
Length Category "source": 10
Length Category "tag": 45
Length Category "tags": 6
Length Category "meal": 12
Length Category "equipment": 23
Length Category "technique": 22
Length Category "partner": 1
---------------------------------------------
Length All Cats: 813


In [54]:
# Display the distinct values collected for the "meal" category.
unique_cats["meal"]

{'Appetizer',
 'Breakfast',
 'Brunch',
 'Buffet',
 'Dessert',
 'Dinner',
 'Drink',
 "Hors D'Oeuvre",
 'Lunch',
 'One-Pot Meal',
 'Side',
 'leftovers'}

In [55]:
# Meal coverage: count the recipes whose "categories" entry cannot be
# subscripted (e.g. None) and the recipes whose mapping has no "meal" key.
count_none = 0
count_no_meal = 0

for recipe in recipes:
    try:
        recipe["categories"]["meal"]
    except TypeError:
        # "categories" is not subscriptable, e.g. None
        count_none += 1
    except KeyError:
        # the lookup found no "meal" (or no "categories") key
        count_no_meal += 1

(count_none, count_no_meal)

(713, 11003)

In [2]:
from collections import defaultdict as dd 

def get_category_count(category):
    """Tally how often each value of ``category`` occurs across ``recipes``.

    Reads the module-level ``recipes`` list. Recipes whose "categories"
    entry cannot be subscripted (e.g. None) or that lack the requested
    category are skipped.

    Args:
        category: name of the category to tally, e.g. "meal".

    Returns:
        A ``defaultdict(int)`` mapping each value to its number of occurrences.
    """
    tally = dd(int)

    for entry in recipes:
        try:
            for label in entry["categories"][category]:
                tally[label] += 1
        except (KeyError, TypeError):
            # nothing (more) to count for this recipe
            pass

    return tally

In [57]:
# Tally how often each "meal" value occurs across all recipes and display it.
meal_count = get_category_count("meal")
meal_count

defaultdict(int,
            {'Appetizer': 2890,
             'Breakfast': 1603,
             'Brunch': 1809,
             'Buffet': 229,
             'Dessert': 7396,
             'Dinner': 5117,
             'Drink': 2154,
             "Hors D'Oeuvre": 69,
             'Lunch': 2479,
             'One-Pot Meal': 47,
             'Side': 5776,
             'leftovers': 18})

#### Type

In [58]:
# Tally how often each "type" value occurs across all recipes and display it.
type_count = get_category_count("type")
type_count

defaultdict(int,
            {'Alcoholic': 1604,
             'Aperitif': 33,
             'Biscuit': 13,
             'Bread': 1123,
             'Brownie': 13,
             'Burrito': 2,
             'Cake': 1513,
             'Candy': 216,
             'Casserole/Gratin': 321,
             'Cheesecake': 10,
             'Chili': 15,
             'Chowder': 1,
             'Cobbler/Crumble': 14,
             'Cocktail': 864,
             'Condiment/Spread': 1303,
             'Cookies': 875,
             'Cranberry Sauce': 11,
             'Crêpe': 2,
             'Cupcake': 24,
             'Custard': 10,
             'Digestif': 15,
             'Dip': 44,
             'Edible Gift': 286,
             'Egg Nog': 15,
             'Flat Bread': 14,
             'Frittata': 7,
             'Fritter': 3,
             'Frozen Dessert': 720,
             'Guacamole': 1,
             'Hamburger': 12,
             'Hot Drink': 103,
             'Hummus': 8,
             'Ice Cream': 35,
  

#### Tags, Partner & Source

In [12]:
# Tally the "tags", "partner" and "source" categories and display the three
# tallies together as one tuple.
tags_count = get_category_count("tags")
partner_count = get_category_count("partner")
source_count = get_category_count("source")
(tags_count, partner_count, source_count)

(defaultdict(int,
             {'Anthony Bourdain': 2,
              'Dorie Greenspan': 1,
              'Emeril Lagasse': 1,
              'Holiday 2018': 68,
              'Nancy Silverton': 1,
              'Suzanne Goin': 1}),
 defaultdict(int, {'HarperCollins': 101}),
 defaultdict(int,
             {'Bon Appétit': 15366,
              'Bon App��tit': 7,
              'Cookie': 130,
              'Gourmet': 11146,
              'HarperCollins': 437,
              'House & Garden': 884,
              'Parade': 262,
              'Self': 626,
              'The Chew': 1,
              'Weelicious': 79}))

**Conclusions from exploration:** 
- **Occasion, Ingredient**: pretty straightforward (synonyms and plural forms are important!)
- **Cuisines**: also straightforward (synonyms, e.g. "Italy" instead of "Italian")
- **Type**: straightforward
- **Tags**: might be better adapted and merged with other categories on a one-by-one basis
- **Equipment**: can be included in the search (incl. synonyms), though the tagging might not be complete
- **Special-Consideration**: must be included in the search
- **Technique**: can be taken over as is (incl. synonyms)
- **Meal**: difficult — not all dishes (~ a third) have a categorization for "meal"; how should the non-tagged ones be treated?
    - first default: merge lunch and dinner to no-category (default search)
    - all other meals are searchable ('breakfast', 'brunch' etc. incl. synonyms)
    - merge "Hors d'Oeuvre" with "Appetizer"
- **Locations**: questionable whether meaningful for user search (skipped for the moment)
- **Partner & Source & Tags**: can be dropped for search (irrelevant)

### Category Cleaning & Preparation for Entities (Dialogflow)

#### Technique

In [59]:
# Tally how often each "technique" value occurs before any clean-up.
technique_count = get_category_count("technique")
technique_count

defaultdict(int,
            {'Advance Prep Required': 224,
             'Bake': 8305,
             'Boil': 703,
             'Braise': 476,
             'Brine': 53,
             'Broil': 778,
             'Chill': 1825,
             'Deep-Fry': 162,
             'Freeze/Chill': 673,
             'Fry': 819,
             'Grill/Barbecue': 1670,
             'Marinate': 823,
             'No-Cook': 1755,
             'Pan-Fry': 397,
             'Poach': 230,
             'Roast': 1994,
             'Sauté': 3128,
             'Saut��': 1,
             'Simmer': 1285,
             'Steam': 357,
             'Stew': 265,
             'Stir-Fry': 307})

In [62]:
# Normalise the "technique" category of every recipe.
#
# Each technique list is rebuilt in a single pass instead of being edited
# while it is iterated: removing from (or rebinding) a list inside its own
# for-loop skips elements and can raise, which a bare `except` then hides.
# Values are only added when missing, so re-running this cell does not
# create duplicates.

# technique value -> cleaned technique value
TECHNIQUE_RENAMES = {
    "Grill/Barbecue": "Grill",
    "Freeze/Chill": "Freeze",
}

# technique value -> (category it is moved to, value stored there)
TECHNIQUE_MOVES = {
    "Advance Prep Required": ("special-consideration", "Advance Prep Required"),
    "No-Cook": ("special-consideration", "No Cook"),
    "Stew": ("type", "Soup/Stew"),
}

for recipe in recipes:
    categories = recipe["categories"]

    # Skip recipes without a category mapping (e.g. None) or without techniques.
    if not isinstance(categories, dict):
        continue
    techniques = categories.get("technique")
    if not techniques:
        continue

    cleaned = []
    for technique in techniques:

        # move values that are not really techniques into their own category
        if technique in TECHNIQUE_MOVES:
            target, value = TECHNIQUE_MOVES[technique]
            bucket = categories.setdefault(target, [])
            if value not in bucket:
                bucket.append(value)
            continue

        # drop over-long "Saut..." variants; the plain 5-character "Sauté" is kept
        # NOTE(review): these look like mis-encoded copies of "Sauté" -
        # mapping them to "Sauté" instead of dropping them may be preferable.
        if technique.startswith("Saut") and len(technique) > 5:
            continue

        # rename slash categories, then replace hyphens by spaces
        technique = TECHNIQUE_RENAMES.get(technique, technique).replace("-", " ")
        if technique not in cleaned:
            cleaned.append(technique)

    # store back in dict
    categories["technique"] = cleaned

In [63]:
# Re-tally the three categories that the technique clean-up writes to.
get_category_count("technique"), get_category_count("special-consideration"), get_category_count("type")

(defaultdict(int,
             {'Bake': 8305,
              'Boil': 703,
              'Braise': 476,
              'Brine': 53,
              'Broil': 778,
              'Chill': 1825,
              'Deep Fry': 162,
              'Freeze': 673,
              'Fry': 819,
              'Grill': 1670,
              'Marinate': 823,
              'Pan Fry': 397,
              'Poach': 230,
              'Roast': 1994,
              'Sauté': 3128,
              'Simmer': 1285,
              'Steam': 357,
              'Stir Fry': 307}),
 defaultdict(int,
             {'Advance Prep Required': 224,
              'Dairy Free': 5915,
              'Diabetes-Friendly': 202,
              'Fat Free': 677,
              'Healthy': 3977,
              'High Fiber': 1766,
              'Kid-Friendly': 4045,
              'Kidney Friendly': 5241,
              'Kosher': 12512,
              'Kosher for Passover': 71,
              'Low Cal': 1856,
              'Low Carb': 572,
              'Low C

To be done: 
- Add explanations: https://whatscookingamerica.net/Glossary/A.html

Notes:
- "Fry" is always tagged when either "Deep Fry" or "Stir Fry" is tagged (no synonyms necessary)

#### Equipment 

In [25]:
# Tally how often each "equipment" value occurs across all recipes.
equip_count = get_category_count("equipment")
equip_count

defaultdict(int,
            {'Air Fryer': 3,
             'Blender': 997,
             'Bread Machine': 2,
             'Candy Thermometer': 113,
             'Cast Iron': 4,
             'Coffee Grinder': 27,
             'Double Boiler': 62,
             'Food Processor': 1584,
             'Grill': 919,
             'Ice Cream Machine': 292,
             'Instant Pot': 43,
             'Juicer': 21,
             'Mandoline': 16,
             'Microwave': 79,
             'Mixer': 1262,
             'Mortar and Pestle': 5,
             'Pasta Maker': 19,
             'Pressure Cooker': 40,
             'Ramekin': 90,
             'Sheet Pan': 12,
             'Slow Cooker': 64,
             'Smoker': 17,
             'Wok': 137})

In [64]:
# Recipes that list "Grill" under equipment are also tagged with the "Grill"
# technique. The equipment entry itself is left untouched.
for recipe in recipes:
    categories = recipe["categories"]

    # Skip recipes without a category mapping (e.g. None) explicitly rather
    # than swallowing every exception with a bare `except`.
    if not isinstance(categories, dict):
        continue

    if "Grill" in (categories.get("equipment") or []):
        techniques = categories.setdefault("technique", [])
        if "Grill" not in techniques:
            techniques.append("Grill")

In [65]:
# Re-tally techniques and show how many recipes now carry "Grill".
technique_count = get_category_count("technique")
technique_count["Grill"]

1790

#### Tag

Clean up tags and merge with searchable categories.

In [66]:
# Tally how often each "tag" value occurs across all recipes.
tag_count = get_category_count("tag")
tag_count

defaultdict(int,
            {'#CAKEWEEK': 6,
             '#WasteLess': 4,
             '#cook90': 13,
             '22-Minute Meals': 27,
             '3-Ingredient Recipes': 70,
             '30 Days of Groceries': 13,
             'Camping': 4,
             'Cheese Week': 37,
             'Cook Like a Diner': 5,
             'Cookbook Critic': 4,
             'Cooking After Dark': 3,
             'Dinner 1-2-3': 3,
             'Dinner Party Gameplan': 1,
             'Drinks': 44,
             'Epi + USHG': 3,
             'Epi Loves the Microwave': 1,
             'Epi Recipe Club': 1,
             'Epic Hacks': 1,
             'Flaming Hot Summer': 13,
             'Frankenrecipe': 14,
             'Freezer Food': 7,
             'Gluten-Free and Fresh': 9,
             'House Cocktail': 18,
             'Iced Coffee': 3,
             'Kitchen Intelligence': 1,
             'Kitchen Olympics': 1,
             'Kitchen Organization': 2,
             'Microwave': 3,
             '

In [69]:
# Merge selected tags into the searchable categories.
#
# Each tag maps to one or more (category, value) pairs that must be present
# on the recipe. The tag itself stays in the "tag" list. A value is only
# appended when it is missing, so re-running this cell does not add
# duplicates or inflate the counts below.
TAG_MAPPINGS = {
    # 22-minute meals, 3-ingredient recipes -> quick & easy
    # (a missing special-consideration list is created under that same key;
    # it must not be written to "technique")
    "22-Minute Meals": [("special-consideration", "Quick & Easy")],
    "3-Ingredient Recipes": [("special-consideration", "Quick & Easy")],
    # gluten-free & fresh -> special consideration
    "Gluten-Free and Fresh": [("special-consideration", "Wheat/Gluten-Free")],
    # quick & healthy -> special consideration
    "Quick and Healthy": [("special-consideration", "Healthy"),
                          ("special-consideration", "Quick")],
    # house cocktail, taco, sandwich theory -> type
    "House Cocktail": [("type", "Cocktail")],
    "Taco": [("type", "Taco")],
    "Sandwich Theory": [("type", "Sandwich")],
    # sourdough, pickles -> ingredient
    "Sourdough": [("ingredient", "Sourdough")],
    "Pickles": [("ingredient", "Pickles")],
    # small plates, snack, snack week & drinks -> meal
    "Small Plates": [("meal", "Small Plates")],
    "snack": [("meal", "Snack")],
    "snack week": [("meal", "Snack")],
    "Drinks": [("meal", "Drink")],
    # no meat, no problem -> vegetarian
    "No Meat, No Problem": [("special-consideration", "Vegetarian")],
}

for recipe in recipes:
    categories = recipe["categories"]

    # Skip recipes without a category mapping (e.g. None) explicitly rather
    # than swallowing every exception with a bare `except`.
    if not isinstance(categories, dict):
        continue

    for tag in categories.get("tag") or []:
        for target, value in TAG_MAPPINGS.get(tag, ()):
            bucket = categories.setdefault(target, [])
            if value not in bucket:
                bucket.append(value)

special_count = get_category_count("special-consideration")
type_count = get_category_count("type")
meal_count = get_category_count("meal")
special_count, type_count, meal_count

(defaultdict(int,
             {'Advance Prep Required': 224,
              'Dairy Free': 5915,
              'Diabetes-Friendly': 202,
              'Fat Free': 677,
              'Healthy': 4098,
              'High Fiber': 1766,
              'Kid-Friendly': 4045,
              'Kidney Friendly': 5241,
              'Kosher': 12512,
              'Kosher for Passover': 71,
              'Low Cal': 1856,
              'Low Carb': 572,
              'Low Cholesterol': 760,
              'Low Fat': 1526,
              'Low Sodium': 681,
              'Low Sugar': 26,
              'Low/No Sugar': 493,
              'No Cook': 1755,
              'No Sugar Added': 5808,
              'Organic': 12,
              'Paleo': 1405,
              'Peanut Free': 16346,
              'Pescatarian': 12278,
              'Quick': 615,
              'Quick & Easy': 9591,
              'Raw': 138,
              'Soy Free': 15937,
              'Sugar Conscious': 4863,
              'Tree Nut Free':

#### Meals

In [70]:
# Re-tally the "meal" category before cleaning it.
meal_count = get_category_count("meal")
meal_count

defaultdict(int,
            {'Appetizer': 2890,
             'Breakfast': 1603,
             'Brunch': 1809,
             'Buffet': 229,
             'Dessert': 7396,
             'Dinner': 5117,
             'Drink': 2168,
             "Hors D'Oeuvre": 69,
             'Lunch': 2479,
             'One-Pot Meal': 47,
             'Side': 5776,
             'Small Plates': 11937,
             'Snack': 471,
             'leftovers': 18})

In [71]:
# Clean the "meal" category.
#
# The meal list is rebuilt in one pass: calling remove() on a list while it
# is being iterated skips the element that follows the removed one, so some
# values would need a second run to be processed.

# meal value -> meal value it is merged into
MEAL_RENAMES = {
    "Hors D'Oeuvre": "Appetizer",   # map Hors d'Oeuvre to Appetizer
    "Brunch": "Breakfast",          # merge breakfast and brunch
}

for recipe in recipes:
    categories = recipe["categories"]

    # Skip recipes without a category mapping (e.g. None) or without meals.
    if not isinstance(categories, dict):
        continue
    meals = categories.get("meal")
    if not meals:
        continue

    cleaned = []
    for meal in meals:

        # map One-Pot Meal to the "Stew" technique and drop it as a meal
        # NOTE(review): the technique clean-up cell moves "Stew" out of
        # "technique" into type "Soup/Stew"; confirm that re-adding "Stew"
        # as a technique here is intended.
        if meal == "One-Pot Meal":
            techniques = categories.setdefault("technique", [])
            if "Stew" not in techniques:
                techniques.append("Stew")
            continue

        meal = MEAL_RENAMES.get(meal, meal)
        if meal not in cleaned:
            cleaned.append(meal)

    categories["meal"] = cleaned

meal_count = get_category_count("meal")
meal_count

defaultdict(int,
            {'Appetizer': 2906,
             'Breakfast': 2179,
             'Buffet': 229,
             'Dessert': 7396,
             'Dinner': 5117,
             'Drink': 2168,
             'Lunch': 2479,
             'Side': 5776,
             'Small Plates': 11937,
             'Snack': 471,
             'leftovers': 18})

#### Special Consideration

In [72]:
# Tally the "special-consideration" category before cleaning it.
special_count = get_category_count("special-consideration")
special_count

defaultdict(int,
            {'Advance Prep Required': 224,
             'Dairy Free': 5915,
             'Diabetes-Friendly': 202,
             'Fat Free': 677,
             'Healthy': 4098,
             'High Fiber': 1766,
             'Kid-Friendly': 4045,
             'Kidney Friendly': 5241,
             'Kosher': 12512,
             'Kosher for Passover': 71,
             'Low Cal': 1856,
             'Low Carb': 572,
             'Low Cholesterol': 760,
             'Low Fat': 1526,
             'Low Sodium': 681,
             'Low Sugar': 26,
             'Low/No Sugar': 493,
             'No Cook': 1755,
             'No Sugar Added': 5808,
             'Organic': 12,
             'Paleo': 1405,
             'Peanut Free': 16346,
             'Pescatarian': 12278,
             'Quick': 615,
             'Quick & Easy': 9591,
             'Raw': 138,
             'Soy Free': 15937,
             'Sugar Conscious': 4863,
             'Tree Nut Free': 13877,
             'Vegan': 

In [75]:
s = "special-consideration"

# Clean the special-consideration category.
#
# Each list is rebuilt in one pass (appending to / removing from a list while
# iterating over it skips elements), and every value is kept at most once so
# that a recipe is not counted twice for the same consideration.

# original value -> value(s) it is replaced by
SPECIAL_REPLACEMENTS = {
    "Quick & Easy": ["Quick", "Easy"],      # split quick & easy in quick | easy
    "Wheat/Gluten-Free": ["Gluten-Free"],   # rename wheat-free in gluten-free
    "Low/No Sugar": ["Low Sugar"],          # merge low/no sugar ...
    "Sugar Conscious": ["Low Sugar"],       # ... & sugar-conscious
}

for recipe in recipes:
    categories = recipe["categories"]

    # Skip recipes without a category mapping (e.g. None) or without specials.
    if not isinstance(categories, dict):
        continue
    specials = categories.get(s)
    if not specials:
        continue

    cleaned = []
    for special in specials:
        for value in SPECIAL_REPLACEMENTS.get(special, [special]):
            if value not in cleaned:
                cleaned.append(value)

    categories[s] = cleaned

special_count = get_category_count("special-consideration")
special_count

defaultdict(int,
            {'Advance Prep Required': 224,
             'Dairy Free': 5915,
             'Diabetes-Friendly': 202,
             'Easy': 9591,
             'Fat Free': 677,
             'Gluten-Free': 9480,
             'Healthy': 4098,
             'High Fiber': 1766,
             'Kid-Friendly': 4045,
             'Kidney Friendly': 5241,
             'Kosher': 12512,
             'Kosher for Passover': 71,
             'Low Cal': 1856,
             'Low Carb': 572,
             'Low Cholesterol': 760,
             'Low Fat': 1526,
             'Low Sodium': 681,
             'Low Sugar': 5253,
             'No Cook': 1755,
             'No Sugar Added': 5808,
             'Nut Free': 13877,
             'Organic': 12,
             'Paleo': 1405,
             'Peanut Free': 16346,
             'Pescatarian': 12278,
             'Quick': 10206,
             'Raw': 138,
             'Soy Free': 15937,
             'Vegan': 3448,
             'Vegetarian': 13448})

Here, synonyms and definition of alternative expressions will prove important!

#### Occasion

In [76]:
# Tally the "occasion" category before cleaning it.
occasion_count = get_category_count("occasion")
occasion_count

defaultdict(int,
            {'Anniversary': 250,
             'Back to School': 458,
             'Backyard BBQ': 1585,
             'Bastille Day': 54,
             'Birthday': 484,
             'Christmas': 2052,
             'Christmas Eve': 701,
             'Cinco de Mayo': 304,
             'Cocktail Party': 2250,
             'Diwali': 32,
             'Easter': 473,
             'Engagement Party': 410,
             'Entertaining': 45,
             'Fall': 5093,
             'Family Reunion': 649,
             "Father's Day": 572,
             'Fourth of July': 690,
             'Friendsgiving': 11,
             'Graduation': 175,
             'Halloween': 149,
             'Hanukkah': 202,
             'Kentucky Derby': 92,
             'Kwanzaa': 30,
             'Labor Day': 13,
             'Lunar New Year': 87,
             'Mardi Gras': 121,
             'Memorial Day': 1,
             "Mother's Day": 400,
             "New Year's Day": 176,
             "New Year's Eve"

In [78]:
o = "occasion"

# Clean the occasion category.
#
# Each list is rebuilt in one pass (removing from a list while iterating over
# it skips elements), and every value is kept at most once.

# occasion value -> occasion value it is merged into
OCCASION_RENAMES = {
    "Birthday": "Anniversary",       # merge anniversary & birthday
    "Christmas Eve": "Christmas",    # merge Christmas cats
    "New Year's Day": "New Year",    # merge New Years cats
    "New Year's Eve": "New Year",
}

for recipe in recipes:
    categories = recipe["categories"]

    # Skip recipes without a category mapping (e.g. None) or without occasions.
    if not isinstance(categories, dict):
        continue
    occasions = categories.get(o)
    if not occasions:
        continue

    cleaned = []
    for occasion in occasions:

        # Backyard BBQ additionally tags the "Grill" technique;
        # the occasion itself is kept
        if occasion == "Backyard BBQ":
            techniques = categories.setdefault("technique", [])
            if "Grill" not in techniques:
                techniques.append("Grill")

        occasion = OCCASION_RENAMES.get(occasion, occasion)

        # split / cats: keep the part after the first slash
        if "/" in occasion:
            occasion = occasion.split("/")[1]

        if occasion not in cleaned:
            cleaned.append(occasion)

    categories[o] = cleaned

occasion_count = get_category_count(o)
technique_count = get_category_count("technique")
occasion_count, technique_count

(defaultdict(int,
             {'Anniversary': 685,
              'Back to School': 458,
              'Backyard BBQ': 1585,
              'Bastille Day': 54,
              'Christmas': 2131,
              'Cinco de Mayo': 304,
              'Cocktail Party': 2250,
              'Diwali': 32,
              'Easter': 473,
              'Engagement Party': 410,
              'Entertaining': 45,
              'Fall': 5093,
              'Family Reunion': 649,
              "Father's Day": 572,
              'Fourth of July': 690,
              'Friendsgiving': 11,
              'Game Night': 337,
              'Graduation': 175,
              'Halloween': 149,
              'Hanukkah': 202,
              'Kentucky Derby': 92,
              'Kwanzaa': 30,
              'Labor Day': 13,
              'Lunar New Year': 87,
              'Mardi Gras': 121,
              'Memorial Day': 1,
              "Mother's Day": 400,
              'New Year': 545,
              'Oktoberfest': 58,
      

Also here, synonym matching will be crucial! e.g. Independence Day <-> 4th of July

#### Cuisines

Constrain to count > 5.

In [79]:
# Tally the "cuisine" category before cleaning it.
cuisine_count = get_category_count("cuisine")
cuisine_count

defaultdict(int,
            {'African': 106,
             'American': 5406,
             'Argentine': 31,
             'Ashkenazi': 13,
             'Asian': 1408,
             'Australian/New Zealand': 20,
             'Basque': 12,
             'Brazilian': 26,
             'British': 45,
             'Cajun/Creole': 201,
             'Californian': 33,
             'Canadian': 16,
             'Cantonese': 1,
             'Central American/Caribbean': 212,
             'Central Asian': 4,
             'Central/South American': 161,
             'Chinese': 303,
             'Cuban': 49,
             'Eastern European/Russian': 176,
             'English': 253,
             'Ethiopian': 5,
             'European': 159,
             'French': 1370,
             'French Provençal': 57,
             'German': 102,
             'Greek': 293,
             'Indian': 409,
             'Indonesian': 20,
             'Irish': 126,
             'Israeli': 34,
             'Italian': 2303,
    

In [81]:
c = "cuisine"

# Clean the cuisine category.
#
# Each list is rebuilt in one pass (appending to / removing from a list while
# iterating over it skips elements), and every value is kept at most once, so
# a recipe tagged with several Latin American variants gets a single
# "Latin American" entry.

# cuisines that are merged into "Latin American"
LATIN_AMERICAN_CUISINES = {'Central American/Caribbean', 'Latin American',
                           'Central/South American', 'Nuevo Latino'}

for recipe in recipes:
    categories = recipe["categories"]

    # Skip recipes without a category mapping (e.g. None) or without cuisines.
    if not isinstance(categories, dict):
        continue
    cuisines = categories.get(c)
    if not cuisines:
        continue

    cleaned = []
    for cuisine in cuisines:

        if cuisine in LATIN_AMERICAN_CUISINES:
            # merge Central and South American cats
            cuisine = "Latin American"
        elif "/" in cuisine:
            # split / cats: keep the part before the first slash
            cuisine = cuisine.split("/")[0]

        if cuisine not in cleaned:
            cleaned.append(cuisine)

    categories[c] = cleaned

cuisine_count = get_category_count("cuisine")
# only show cuisines that occur more than 5 times
{k:v for k,v in cuisine_count.items() if v > 5}

{'African': 106,
 'American': 5406,
 'Argentine': 31,
 'Ashkenazi': 13,
 'Asian': 1408,
 'Australian': 20,
 'Basque': 12,
 'Brazilian': 26,
 'British': 45,
 'Cajun': 201,
 'Californian': 33,
 'Canadian': 16,
 'Chinese': 303,
 'Cuban': 49,
 'Eastern European': 176,
 'English': 253,
 'European': 159,
 'French': 1370,
 'French Provençal': 57,
 'German': 102,
 'Greek': 293,
 'Indian': 409,
 'Indonesian': 20,
 'Irish': 126,
 'Israeli': 34,
 'Italian': 2303,
 'Italian American': 223,
 'Japanese': 228,
 'Jewish': 415,
 'Korean': 107,
 'Latin American': 673,
 'Mediterranean': 458,
 'Mexican': 943,
 'Middle Eastern': 405,
 'Midwestern': 23,
 'Moroccan': 197,
 'New England': 56,
 'Northern Italian': 28,
 'Pacific Northwest': 12,
 'Scandinavian': 137,
 'Sephardic': 17,
 'South American': 53,
 'South Asian': 36,
 'Southeast Asian': 142,
 'Southern': 363,
 'Southern Italian': 26,
 'Southwestern': 239,
 'Spanish': 376,
 'Sushi': 8,
 'Szechuan': 7,
 'Tex-Mex': 140,
 'Thai': 247,
 'Turkish': 41,
 'Vie

#### Ingredient 

Important: Are there any synonyms that match other categories? If yes, merge ingredients!

In [82]:
# Tally the ingredient labels before any cleaning, using the
# get_category_count helper defined earlier in the notebook, and display
# the full result (the captured output is a defaultdict of label -> count).
ingredient_count = get_category_count("ingredient")
ingredient_count

defaultdict(int,
            {'Almond': 978,
             'Amaretto': 53,
             'Anchovy': 78,
             'Anise': 163,
             'Apple': 1027,
             'Apple Juice': 16,
             'Apricot': 363,
             'Artichoke': 241,
             'Arugula': 414,
             'Asian Pear': 25,
             'Asparagus': 338,
             'Avocado': 430,
             'Bacon': 891,
             'Banana': 306,
             'Barley': 79,
             'Basil': 849,
             'Bass': 118,
             'Bean': 1093,
             'Beef': 1199,
             'Beef Rib': 100,
             'Beef Shank': 15,
             'Beef Tenderloin': 78,
             'Beer': 209,
             'Beet': 342,
             'Bell Pepper': 1064,
             'Berry': 675,
             'Bitters': 203,
             'Blackberry': 207,
             'Blue Cheese': 336,
             'Blueberry': 283,
             'Bok Choy': 76,
             'Bourbon': 201,
             'Bran': 12,
             'Brandy': 4

In [83]:
i = "ingredient"

for recipe in recipes:
    try:
        ingredients = recipe['categories'][i]
    except (KeyError, TypeError):
        # No categories at all, or no ingredient entry: nothing to clean.
        continue
    if not ingredients:
        continue

    # split / cats: keep only the part before the first slash; a label
    # without "/" comes back unchanged from split("/")[0].
    # dict.fromkeys drops labels that became identical after splitting while
    # preserving first-seen order, so a recipe never lists a label twice.
    recipe['categories'][i] = list(
        dict.fromkeys(item.split("/")[0] for item in ingredients)
    )

ingredient_count = get_category_count(i)
# Show only ingredients that occur more than 5 times.
{k:v for k,v in ingredient_count.items() if v > 5}

{'Almond': 978,
 'Amaretto': 53,
 'Anchovy': 78,
 'Anise': 163,
 'Apple': 1027,
 'Apple Juice': 16,
 'Apricot': 363,
 'Artichoke': 241,
 'Arugula': 414,
 'Asian Pear': 25,
 'Asparagus': 338,
 'Avocado': 430,
 'Bacon': 891,
 'Banana': 306,
 'Barley': 79,
 'Basil': 849,
 'Bass': 118,
 'Bean': 1093,
 'Beef': 1199,
 'Beef Rib': 100,
 'Beef Shank': 15,
 'Beef Tenderloin': 78,
 'Beer': 209,
 'Beet': 342,
 'Bell Pepper': 1064,
 'Berry': 675,
 'Bitters': 203,
 'Blackberry': 207,
 'Blue Cheese': 336,
 'Blueberry': 283,
 'Bok Choy': 76,
 'Bourbon': 201,
 'Bran': 12,
 'Brandy': 410,
 'Breadcrumbs': 102,
 'Brie': 38,
 'Brisket': 55,
 'Broccoli': 206,
 'Broccoli Rabe': 56,
 'Brown Rice': 26,
 'Brussels Sprout': 114,
 'Buffalo': 12,
 'Bulgur': 69,
 'Butter': 469,
 'Buttermilk': 156,
 'Butternut Squash': 240,
 'Butterscotch': 34,
 'Cabbage': 490,
 'Calvados': 46,
 'Campari': 39,
 'Cantaloupe': 70,
 'Capers': 291,
 'Caraway': 73,
 'Cardamom': 91,
 'Carrot': 1024,
 'Cashew': 104,
 'Cauliflower': 195,
 

#### Type 

For a smooth dialogflow, type needs to be cleaned up too: Some categories fall under meal (i.e.

In [5]:
# Display the tally of "type" labels before cleaning (helper defined earlier
# in the notebook; the captured output is a defaultdict of label -> count).
get_category_count("type")

defaultdict(int,
            {'Alcoholic': 1604,
             'Aperitif': 33,
             'Biscuit': 13,
             'Bread': 1123,
             'Brownie': 13,
             'Burrito': 2,
             'Cake': 1513,
             'Candy': 216,
             'Casserole/Gratin': 321,
             'Cheesecake': 10,
             'Chili': 15,
             'Chowder': 1,
             'Cobbler/Crumble': 14,
             'Cocktail': 865,
             'Condiment/Spread': 1303,
             'Cookies': 875,
             'Cranberry Sauce': 11,
             'Crêpe': 2,
             'Cupcake': 24,
             'Custard': 10,
             'Digestif': 15,
             'Dip': 44,
             'Edible Gift': 286,
             'Egg Nog': 15,
             'Flat Bread': 14,
             'Frittata': 7,
             'Fritter': 3,
             'Frozen Dessert': 720,
             'Guacamole': 1,
             'Hamburger': 12,
             'Hot Drink': 103,
             'Hummus': 8,
             'Ice Cream': 35,
  

In [6]:
# "type" labels that are moved to the "special-consideration" category.
_drink_flags = ("Alcoholic", "Non-Alcoholic")

for recipe in recipes:
    try:
        categories = recipe['categories']
        types = categories['type']
    except (KeyError, TypeError):
        # No categories at all, or no type entry: nothing to move.
        continue
    if not types:
        continue

    # Collect the flags first instead of removing them inside a loop over
    # `types`: removing from a list while iterating over it skips the next
    # element, so a recipe tagged with both flags kept the second one.
    flagged = [t for t in types if t in _drink_flags]
    if not flagged:
        continue

    # push alcoholic, non-alcoholic to special consideration
    # (the list is created when the recipe has none yet)
    considerations = categories.setdefault('special-consideration', [])
    for t in flagged:
        if t not in considerations:
            considerations.append(t)

    # Drop the moved flags from the type list, in place.
    types[:] = [t for t in types if t not in _drink_flags]

get_category_count('type'), get_category_count('special-consideration')

(defaultdict(int,
             {'Aperitif': 33,
              'Biscuit': 13,
              'Bread': 1123,
              'Brownie': 13,
              'Burrito': 2,
              'Cake': 1513,
              'Candy': 216,
              'Casserole/Gratin': 321,
              'Cheesecake': 10,
              'Chili': 15,
              'Chowder': 1,
              'Cobbler/Crumble': 14,
              'Cocktail': 865,
              'Condiment/Spread': 1303,
              'Cookies': 875,
              'Cranberry Sauce': 11,
              'Crêpe': 2,
              'Cupcake': 24,
              'Custard': 10,
              'Digestif': 15,
              'Dip': 44,
              'Edible Gift': 286,
              'Egg Nog': 15,
              'Flat Bread': 14,
              'Frittata': 7,
              'Fritter': 3,
              'Frozen Dessert': 720,
              'Guacamole': 1,
              'Hamburger': 12,
              'Hot Drink': 103,
              'Hummus': 8,
              'Ice Cream': 35,
 

#### Create Unique ID

In [None]:
# Number the recipes consecutively, starting at 1, in their current list order.
for position, recipe in enumerate(recipes):
    recipe["_recipe_id"] = position + 1

# `u_id` ends up holding the next unused ID (one past the last one assigned).
u_id = len(recipes) + 1

### Save edited file

In [7]:
# Write the cleaned recipes as a single JSON document.
# Mode 'w' (not 'a'): appending on a re-run would put a second JSON array
# after the first one and leave a file that json.load can no longer parse.
with open('data/recipe_urls_final_v2.json', 'w', encoding='utf-8') as f:
    json.dump(recipes, f)

### Archive

In [None]:
# Archived experiment: run spaCy named-entity recognition over category names.
# NOTE(review): no `import spacy` is visible in this cell — confirm it is
# imported elsewhere before re-running.
# Load the medium English spaCy model.
nlp = spacy.load('en_core_web_md')

# NOTE(review): at the top of the notebook `unique_cats` is a dict of sets,
# which cannot be sliced with [:10]; presumably it was rebound to a list
# before this cell was written — verify.
for cat in unique_cats[:10]: 
    
    doc = nlp(cat)

    # document level: (text, start offset, end offset, label) for each entity
    ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
    print(ents)

In [291]:
import requests 
import json 
 
# Look up synonyms for "Almond" via the Big Huge Thesaurus HTTP API.
# NOTE(review): `key` (the API key) is not defined in this cell — it must
# already exist in the kernel; confirm where it is set and keep it out of
# the notebook source.
res = requests.get('http://words.bighugelabs.com/api/2/{}/{}/json'.format(key,"Almond"),
                   timeout=10)  # without a timeout a stalled server hangs the kernel
# Fail with a clear HTTP error instead of a confusing JSON decode error
# when the service answers with a non-2xx status.
res.raise_for_status()
res.json()

{'noun': {'syn': ['sweet almond',
   'Prunus dulcis',
   'Prunus amygdalus',
   'Amygdalus communis',
   'almond tree',
   'drupe',
   'edible nut',
   'stone fruit']}}