## Data Preperation for Elasticsearch

### Edit #1: A little bit of perfectionism <br>

    sed -i 's/preparation-group/preparation_group/g' recipe_urls_final_v2.json 

    sed -i 's/ingredient-group/ingredient_group/g' recipe_urls_final_v2.json

In [1]:
import json 

with open("data/recipe_urls_final_v2.json", "r") as f:
    recipes = json.load(f)

len(recipes)

35770

In [5]:
recipes[40:42]

[{'_recipe_id': 41,
  'categories': {'difficulty': ['Normal'],
   'ingredient': ['Dill', 'Spice', 'Potato'],
   'meal': ['Side', 'Small Plates', 'Appetizer'],
   'occasion': ['Cocktail Party', 'New Year'],
   'source': ['Bon Appétit'],
   'special': ['Peanut Free',
    'Soy Free',
    'Dairy Free',
    'Gluten-Free',
    'Kid-Friendly'],
   'tag': ['Small Plates'],
   'tags': ['Holiday 2018'],
   'time': ['Standard']},
  'date': '2018-12-20T13:00:00.000Z',
  'desc': 'These homemade onion-flavored chips just might be more addictive than the store-bought versions.',
  'image_link': 'https://assets.epicurious.com/photos/5c1149646aaa854800070764/6:4/w_274%2Ch_169/onion-dip-potato-chips-recipe-BA-121218.jpg',
  'ingredients': [{'ingredient_group': 'Group 1',
    'ingredient_group_content': ['2 Tbsp. onion powder',
     '1 Tbsp. garlic powder',
     '3/4 tsp. crushed red pepper flakes',
     '1/2 tsp. kosher salt',
     '1 (8-oz.) bag kettle-cooked salted potato chips',
     '1/4 cup extra-v

In [2]:
from collections import defaultdict as dd 

def get_category_count(category):
    
    dict_count = dd(int)

    for recipe in recipes:

        try: 
            cats = recipe["categories"][category]
            for c in cats:
                dict_count[c] += 1

        except (TypeError, KeyError):
            continue

    return dict_count

In [4]:
get_category_count("ingredient")

defaultdict(int,
            {'Almond': 978,
             'Amaretto': 53,
             'Anchovy': 78,
             'Anise': 163,
             'Apple': 1027,
             'Apple Juice': 16,
             'Apricot': 363,
             'Artichoke': 241,
             'Arugula': 414,
             'Asian Pear': 25,
             'Asparagus': 338,
             'Avocado': 430,
             'Bacon': 891,
             'Banana': 306,
             'Barley': 79,
             'Basil': 849,
             'Bass': 118,
             'Bean': 1093,
             'Beef': 1199,
             'Beef Rib': 100,
             'Beef Shank': 15,
             'Beef Tenderloin': 78,
             'Beer': 209,
             'Beet': 342,
             'Bell Pepper': 1064,
             'Berry': 675,
             'Bitters': 203,
             'Blackberry': 207,
             'Blue Cheese': 336,
             'Blueberry': 283,
             'Bok Choy': 76,
             'Bourbon': 201,
             'Bran': 12,
             'Brandy': 4

In [30]:
for recipe in recipes: 
    
    try: 
        tech = recipe['categories']['technique']
        
        #if either lunch or dinner, make extra category "main"
        if "Quick & Easy" in tech:  
            recipe['categories']['technique'].remove("Quick & Easy")
                
    except: 
        continue

### Edit #2: Cleaning meals: 

In [None]:
for recipe in recipes: 
    
    try: 
        meals = recipe['categories']['meal']
        
        #if either lunch or dinner, make extra category "main"
        if any(meal in ["Lunch", "Dinner"] for meal in meals):  
            recipe['categories']['meal'].append("Main")
                
    except: 
        continue

### Edit #3: Create "Quickness" & "Easiness" Category

In [28]:
get_category_count("special-consideration")

defaultdict(int,
            {'Advance Prep Required': 224,
             'Alcoholic': 1604,
             'Dairy Free': 5915,
             'Diabetes-Friendly': 202,
             'Easy': 9591,
             'Fat Free': 677,
             'Gluten-Free': 9480,
             'Healthy': 4098,
             'High Fiber': 1766,
             'Kid-Friendly': 4045,
             'Kidney Friendly': 5241,
             'Kosher': 12512,
             'Kosher for Passover': 71,
             'Low Cal': 1856,
             'Low Carb': 572,
             'Low Cholesterol': 760,
             'Low Fat': 1526,
             'Low Sodium': 681,
             'Low Sugar': 5253,
             'No Cook': 1755,
             'No Sugar Added': 5808,
             'Non-Alcoholic': 453,
             'Nut Free': 13877,
             'Organic': 12,
             'Paleo': 1405,
             'Peanut Free': 16346,
             'Pescatarian': 12278,
             'Quick': 10001,
             'Raw': 138,
             'Soy Free': 15937,
  

In [29]:
for recipe in recipes: 
    
    try: 
        specials = recipe['categories']['special-consideration']
        
        #if either lunch or dinner, make extra category "main"
        if "Easy" in specials:  
            recipe['categories']['difficulty'] = ["Easy"]
            recipe['categories']['special-consideration'].remove("Easy")
        else: 
            recipe['categories']['difficulty'] = ["Normal"]
        
        if "Quick" in specials:  
            recipe['categories']['time'] = ["Quick"]
            recipe['categories']['special-consideration'].remove("Quick")
        else: 
            recipe['categories']['time'] = ["Standard"]
        if "Advance Prep Required" in specials:  
            recipe['categories']['time'] = ["Long"]
            recipe['categories']['special-consideration'].remove("Advance Prep Required")
    
    #no specials
    except (TypeError, KeyError): 
        
        try:  
            recipe['categories']['difficulty'] = ["Normal"]
            recipe['categories']['time'] = ["Standard"]
        
        #no categories
        except (TypeError, KeyError): 
            recipe['categories'] = {"difficulty": "Normal", "time": "Standard"}

In [32]:
for recipe in recipes: 
    
    #remove duplicates
    try: 
        recipe['categories']['special-consideration'].remove("Quick")
    except: pass

In [33]:
get_category_count("special-consideration")

defaultdict(int,
            {'Alcoholic': 1604,
             'Dairy Free': 5915,
             'Diabetes-Friendly': 202,
             'Fat Free': 677,
             'Gluten-Free': 9480,
             'Healthy': 4098,
             'High Fiber': 1766,
             'Kid-Friendly': 4045,
             'Kidney Friendly': 5241,
             'Kosher': 12512,
             'Kosher for Passover': 71,
             'Low Cal': 1856,
             'Low Carb': 572,
             'Low Cholesterol': 760,
             'Low Fat': 1526,
             'Low Sodium': 681,
             'Low Sugar': 5253,
             'No Cook': 1755,
             'No Sugar Added': 5808,
             'Non-Alcoholic': 453,
             'Nut Free': 13877,
             'Organic': 12,
             'Paleo': 1405,
             'Peanut Free': 16346,
             'Pescatarian': 12278,
             'Raw': 138,
             'Soy Free': 15937,
             'Vegan': 3448,
             'Vegetarian': 13448})

In [34]:
diff, time = (get_category_count("difficulty"), get_category_count("time"))
print(diff, time)
sum(diff.values()), sum(time.values())

defaultdict(<class 'int'>, {'Normal': 26179, 'Easy': 9591}) defaultdict(<class 'int'>, {'Standard': 25872, 'Quick': 9674, 'Long': 224})


(35770, 35770)

### Edit #4: Rename "special-consideration" to "special"

In [35]:
for recipe in recipes: 
    
    try: 
        recipe['categories']['special'] = recipe['categories']['special-consideration']
        del recipe['categories']['special-consideration']
        
    except: pass

get_category_count("special")

defaultdict(int,
            {'Alcoholic': 1604,
             'Dairy Free': 5915,
             'Diabetes-Friendly': 202,
             'Fat Free': 677,
             'Gluten-Free': 9480,
             'Healthy': 4098,
             'High Fiber': 1766,
             'Kid-Friendly': 4045,
             'Kidney Friendly': 5241,
             'Kosher': 12512,
             'Kosher for Passover': 71,
             'Low Cal': 1856,
             'Low Carb': 572,
             'Low Cholesterol': 760,
             'Low Fat': 1526,
             'Low Sodium': 681,
             'Low Sugar': 5253,
             'No Cook': 1755,
             'No Sugar Added': 5808,
             'Non-Alcoholic': 453,
             'Nut Free': 13877,
             'Organic': 12,
             'Paleo': 1405,
             'Peanut Free': 16346,
             'Pescatarian': 12278,
             'Raw': 138,
             'Soy Free': 15937,
             'Vegan': 3448,
             'Vegetarian': 13448})

In [56]:
a = "Sring"
c = None
b = ["Frin", "Fran"]
c1 = ["Frin", "Fran"]

In [57]:
", ".join([a,c]) + ", " +", ".join(b) +", " +", ".join(c1)

TypeError: sequence item 1: expected str instance, NoneType found

In [36]:
get_category_count("special-consideration")

defaultdict(int, {})

In [37]:
get_category_count("time")

defaultdict(int, {'Long': 224, 'Quick': 9674, 'Standard': 25872})

### Edit #5: Dedup tag values

In [38]:
for recipe in recipes: 
    
    for cat in ["meal", "special", "time", "difficulty", "ingredient", "cuisine", "occasion"]:
        try: 
            recipe['categories'][cat] = list(set(recipe['categories'][cat]))
        
        except: pass

### Edit #6: Consistent formatting on ingredients/preperation for Elasticsearch

In [39]:
for recipe in recipes: 
    
    try: 
        if isinstance(recipe["ingredients"][0], str): 
            recipe["ingredients"] = [{"ingredient_group_content": recipe["ingredients"], 
                                     "ingredient_group": "All ingredients"}]
        if recipe["ingredients"] is None: 
            recipe["ingredients"] = [{"ingredient_group_content": None, 
                                     "ingredient_group": None}]
    except (KeyError, IndexError):
        pass
    
    try: 
        if isinstance(recipe["preparation"][0], str): 
            recipe["preparation"] = [{"preparation_group_content": recipe["preparation"], 
                                     "preparation_group": "All preparation steps"}]
        if recipe["preparation"] is None: 
            recipe["preparation"] = [{"preparation_group_content": None, 
                                     "preparation_group": None}]
    
    except (KeyError, IndexError):
        pass

In [40]:
count = 0

for recipe in recipes: 
    
    try: 
        if isinstance(recipe["ingredients"][0], str): 
            count += 1
    
    except (KeyError, IndexError):
        
        try:
            if isinstance(recipe["preparation"][0], str):
                count += 1
        
        except (KeyError, IndexError): 
            continue

count

0

In [6]:
recipes[:3]

[{'_recipe_id': 1,
  'categories': {'difficulty': ['Normal'],
   'ingredient': ['Vinegar',
    'Egg',
    'Cornmeal',
    'Endive',
    'Orange',
    'Orange Juice',
    'Garlic',
    'Scallop',
    'Avocado',
    'Basil'],
   'meal': ['Dinner', 'Main'],
   'occasion': ['Winter'],
   'special': ['Dairy Free', 'Gluten-Free'],
   'time': ['Standard'],
   'type': ['Salad']},
  'date': '2019-02-11T17:28:13.812Z',
  'desc': 'Shallow-fried scallops get extra crispy on the outside and super tender inside when double-dredged in an Old Bay–seasoned cornmeal mixture. (Added bonus: they’re gluten-free.)',
  'image_link': 'https://assets.epicurious.com/photos/5c61a910d843834ac1a6c9c7/6:4/w_274%2Ch_169/Crunchy-Scallops-on-a-Winter-Salad-recipe-30012019.jpg',
  'ingredients': [{'ingredient_group': 'All ingredients',
    'ingredient_group_content': ['1/2 cup cornstarch',
     '1/2 cup fine-grind yellow cornmeal',
     '2 tsp. Old Bay seasoning',
     '1/2 tsp. baking powder',
     '3 tsp. kosher salt

### Edit #7: Number format in nutrients 

Dropped to 'inf' instead of None in 174 cases.

In [42]:
count = 0 

for recipe in recipes: 
    
    if recipe["nutrients"] is None: 
        continue
        
    for k,v in recipe["nutrients"].items():
        boolish = False
        if v is None: 
            continue
        if v > 300000: 
            boolish = True
    
    if boolish:
        recipe["nutrients"] = None
        count += 1
count

174

### Edit #8: Cuisine cleaning 

Merge logical categories together to improve tag search.

In [3]:
get_category_count("cuisine")

defaultdict(int,
            {'African': 106,
             'American': 5406,
             'Argentine': 31,
             'Ashkenazi': 13,
             'Asian': 1408,
             'Australian': 20,
             'Basque': 12,
             'Brazilian': 26,
             'British': 45,
             'Cajun': 201,
             'Californian': 33,
             'Canadian': 16,
             'Cantonese': 1,
             'Central': 4,
             'Central Asian': 4,
             'Chinese': 303,
             'Cuban': 49,
             'Eastern European': 176,
             'English': 253,
             'Ethiopian': 5,
             'European': 159,
             'French': 1370,
             'French Provençal': 57,
             'German': 102,
             'Greek': 293,
             'Indian': 409,
             'Indonesian': 20,
             'Irish': 126,
             'Israeli': 34,
             'Italian': 2303,
             'Italian American': 223,
             'Japanese': 228,
             'Jewish': 415,


In [22]:
#typing ... yey!
asian = ["Thai", "Indian", "Southeast Asian", "Chinese", "Korean", "Japanese", 
        "Indonesian", "Vietnamese", "Sushi", "Szechuan", "South Asian", "Cantonese", 
         "Central Asian"]
jewish = ["Ashkenazi", "Sephardic"]
american = ["Southern", "Italian American", "Tex-Mex", "New England", "Pacific Northwest",
           "Southwestern", "Californian", "Canadian", "Midwestern", "Central"]
european = ["Eastern European", "French", "German", "British", "English", "Scandinavian", 
           "Irish", "Scottish"]
european_med = ["Italian", "French Provençal", "Spanish", "Turkish", "Greek", 
                "Southern Italian", "Northern Italian", "Basque"]
african_med = ["Moroccan"]
african = ["Cajun", "Ethiopian"]
middle_eastern = ["Turkish"]
latin_american = ["Mexican", "Cuban", "Tex-Mex", "Brazilian", "Argentine"]
british = ["English", "Scottish"]

#sum up
cuisines_1 = (asian, jewish, american, european, african, middle_eastern, british, latin_american)
cuisines_1_names = ("Asian", "Jewish", "American", "European", "African", "Middle Eastern", 
                   "British", "Latin American")
cuisines_2 = (european_med, african_med)
cuisines_2_names = (["Mediterranean", "European"], ["Mediterranean", "African"])

In [None]:
#quick & dirty, leads to duplicate values
for recipe in recipes: 
    
    try:
        cuisine = recipe["categories"]["cuisine"]
        print(cuisine)
        
        #appending 
        for c,c_name in zip(cuisines_1,cuisines_1_names): 

            if any(a in cuisine for a in c): 
                recipe["categories"]["cuisine"].append(c_name)
        
        #extending 
        for c,c_names in zip(cuisines_2,cuisines_2_names): 

            if any(a in cuisine for a in c): 
                recipe["categories"]["cuisine"].extend(c_names)
        
    except KeyError: 
        continue

In [24]:
#dedup again 
c = 0
for recipe in recipes: 
    try: 
        recipe['categories']['cuisine'] = list(set(recipe['categories']['cuisine']))
        c += 1
    except: pass
c

16273

We've got a cuisine tag for 16273 recipes, not too bad.

### Export again

In [9]:
with open('data/recipe_urls_final_v3.json', 'a') as f:
    json.dump(recipes, f)