## Data Preperation for Elasticsearch

### Edit #1: A little bit of perfectionism <br>

    sed -i 's/preparation-group/preparation_group/g' recipe_urls_final_v2.json 

    sed -i 's/ingredient-group/ingredient_group/g' recipe_urls_final_v2.json

In [24]:
import json 

with open("data/recipe_urls_final_v2.json", "r") as f:
    recipes = json.load(f)

len(recipes)

35770

In [25]:
recipes[:5]

[{'_recipe_id': 1,
  'categories': {'difficulty': ['Normal'],
   'ingredient': ['Scallop',
    'Cornmeal',
    'Egg',
    'Orange',
    'Orange Juice',
    'Vinegar',
    'Garlic',
    'Endive',
    'Avocado',
    'Basil'],
   'meal': ['Dinner', 'Main'],
   'occasion': ['Winter'],
   'special-consideration': ['Dairy Free', 'Gluten-Free'],
   'time': ['Standard'],
   'type': ['Salad']},
  'date': '2019-02-11T17:28:13.812Z',
  'desc': 'Shallow-fried scallops get extra crispy on the outside and super tender inside when double-dredged in an Old Bay–seasoned cornmeal mixture. (Added bonus: they’re gluten-free.)',
  'image_link': 'https://assets.epicurious.com/photos/5c61a910d843834ac1a6c9c7/6:4/w_274%2Ch_169/Crunchy-Scallops-on-a-Winter-Salad-recipe-30012019.jpg',
  'ingredients': ['1/2 cup cornstarch',
   '1/2 cup fine-grind yellow cornmeal',
   '2 tsp. Old Bay seasoning',
   '1/2 tsp. baking powder',
   '3 tsp. kosher salt, divided, plus more',
   '1 tsp. freshly ground black pepper',
   

In [6]:
from collections import defaultdict as dd 

def get_category_count(category):
    
    dict_count = dd(int)

    for recipe in recipes:

        try: 
            cats = recipe["categories"][category]
            for c in cats:
                dict_count[c] += 1

        except (TypeError, KeyError):
            continue

    return dict_count

### Edit #2: Create "Main" Course Category in meals

In [26]:
for recipe in recipes: 
    
    try: 
        meals = recipe['categories']['meal']
        
        #if either lunch or dinner, make extra category "main"
        if any(meal in ["Lunch", "Dinner"] for meal in meals):  
            recipe['categories']['meal'].append("Main")
                
    except: 
        continue

In [27]:
get_category_count("meal")

defaultdict(int,
            {'Appetizer': 2906,
             'Breakfast': 2179,
             'Buffet': 229,
             'Dessert': 7396,
             'Dinner': 5117,
             'Drink': 2168,
             'Lunch': 2479,
             'Main': 12982,
             'Side': 5776,
             'Small Plates': 7958,
             'Snack': 314,
             'leftovers': 18})

### Edit #3: Create "Quickness" & "Easiness" Category

In [28]:
get_category_count("special-consideration")

defaultdict(int,
            {'Advance Prep Required': 224,
             'Alcoholic': 1604,
             'Dairy Free': 5915,
             'Diabetes-Friendly': 202,
             'Easy': 9591,
             'Fat Free': 677,
             'Gluten-Free': 9480,
             'Healthy': 4098,
             'High Fiber': 1766,
             'Kid-Friendly': 4045,
             'Kidney Friendly': 5241,
             'Kosher': 12512,
             'Kosher for Passover': 71,
             'Low Cal': 1856,
             'Low Carb': 572,
             'Low Cholesterol': 760,
             'Low Fat': 1526,
             'Low Sodium': 681,
             'Low Sugar': 5253,
             'No Cook': 1755,
             'No Sugar Added': 5808,
             'Non-Alcoholic': 453,
             'Nut Free': 13877,
             'Organic': 12,
             'Paleo': 1405,
             'Peanut Free': 16346,
             'Pescatarian': 12278,
             'Quick': 10001,
             'Raw': 138,
             'Soy Free': 15937,
  

In [29]:
for recipe in recipes: 
    
    try: 
        specials = recipe['categories']['special-consideration']
        
        #if either lunch or dinner, make extra category "main"
        if "Easy" in specials:  
            recipe['categories']['difficulty'] = ["Easy"]
            recipe['categories']['special-consideration'].remove("Easy")
        else: 
            recipe['categories']['difficulty'] = ["Normal"]
        
        if "Quick" in specials:  
            recipe['categories']['time'] = ["Quick"]
            recipe['categories']['special-consideration'].remove("Quick")
        else: 
            recipe['categories']['time'] = ["Standard"]
        if "Advance Prep Required" in specials:  
            recipe['categories']['time'] = ["Long"]
            recipe['categories']['special-consideration'].remove("Advance Prep Required")
    
    #no specials
    except (TypeError, KeyError): 
        
        try:  
            recipe['categories']['difficulty'] = ["Normal"]
            recipe['categories']['time'] = ["Standard"]
        
        #no categories
        except (TypeError, KeyError): 
            recipe['categories'] = {"difficulty": "Normal", "time": "Standard"}

In [32]:
for recipe in recipes: 
    
    #remove duplicates
    try: 
        recipe['categories']['special-consideration'].remove("Quick")
    except: pass

In [33]:
get_category_count("special-consideration")

defaultdict(int,
            {'Alcoholic': 1604,
             'Dairy Free': 5915,
             'Diabetes-Friendly': 202,
             'Fat Free': 677,
             'Gluten-Free': 9480,
             'Healthy': 4098,
             'High Fiber': 1766,
             'Kid-Friendly': 4045,
             'Kidney Friendly': 5241,
             'Kosher': 12512,
             'Kosher for Passover': 71,
             'Low Cal': 1856,
             'Low Carb': 572,
             'Low Cholesterol': 760,
             'Low Fat': 1526,
             'Low Sodium': 681,
             'Low Sugar': 5253,
             'No Cook': 1755,
             'No Sugar Added': 5808,
             'Non-Alcoholic': 453,
             'Nut Free': 13877,
             'Organic': 12,
             'Paleo': 1405,
             'Peanut Free': 16346,
             'Pescatarian': 12278,
             'Raw': 138,
             'Soy Free': 15937,
             'Vegan': 3448,
             'Vegetarian': 13448})

In [34]:
diff, time = (get_category_count("difficulty"), get_category_count("time"))
print(diff, time)
sum(diff.values()), sum(time.values())

defaultdict(<class 'int'>, {'Normal': 26179, 'Easy': 9591}) defaultdict(<class 'int'>, {'Standard': 25872, 'Quick': 9674, 'Long': 224})


(35770, 35770)

### Edit #4: Rename "special-consideration" to "special"

In [35]:
for recipe in recipes: 
    
    try: 
        recipe['categories']['special'] = recipe['categories']['special-consideration']
        del recipe['categories']['special-consideration']
        
    except: pass

get_category_count("special")

defaultdict(int,
            {'Alcoholic': 1604,
             'Dairy Free': 5915,
             'Diabetes-Friendly': 202,
             'Fat Free': 677,
             'Gluten-Free': 9480,
             'Healthy': 4098,
             'High Fiber': 1766,
             'Kid-Friendly': 4045,
             'Kidney Friendly': 5241,
             'Kosher': 12512,
             'Kosher for Passover': 71,
             'Low Cal': 1856,
             'Low Carb': 572,
             'Low Cholesterol': 760,
             'Low Fat': 1526,
             'Low Sodium': 681,
             'Low Sugar': 5253,
             'No Cook': 1755,
             'No Sugar Added': 5808,
             'Non-Alcoholic': 453,
             'Nut Free': 13877,
             'Organic': 12,
             'Paleo': 1405,
             'Peanut Free': 16346,
             'Pescatarian': 12278,
             'Raw': 138,
             'Soy Free': 15937,
             'Vegan': 3448,
             'Vegetarian': 13448})

In [36]:
get_category_count("special-consideration")

defaultdict(int, {})

In [37]:
get_category_count("time")

defaultdict(int, {'Long': 224, 'Quick': 9674, 'Standard': 25872})

### Edit #5: Dedup tag values

In [38]:
for recipe in recipes: 
    
    for cat in ["meal", "special", "time", "difficulty", "ingredient", "cuisine", "occasion"]:
        try: 
            recipe['categories'][cat] = list(set(recipe['categories'][cat]))
        
        except: pass

### Edit #6: Consistent formatting on ingredients/preperation for Elasticsearch

In [39]:
for recipe in recipes: 
    
    try: 
        if isinstance(recipe["ingredients"][0], str): 
            recipe["ingredients"] = [{"ingredient_group_content": recipe["ingredients"], 
                                     "ingredient_group": "All ingredients"}]
        if recipe["ingredients"] is None: 
            recipe["ingredients"] = [{"ingredient_group_content": None, 
                                     "ingredient_group": None}]
    except (KeyError, IndexError):
        pass
    
    try: 
        if isinstance(recipe["preparation"][0], str): 
            recipe["preparation"] = [{"preparation_group_content": recipe["preparation"], 
                                     "preparation_group": "All preparation steps"}]
        if recipe["preparation"] is None: 
            recipe["preparation"] = [{"preparation_group_content": None, 
                                     "preparation_group": None}]
    
    except (KeyError, IndexError):
        pass

In [40]:
count = 0

for recipe in recipes: 
    
    try: 
        if isinstance(recipe["ingredients"][0], str): 
            count += 1
    
    except (KeyError, IndexError):
        
        try:
            if isinstance(recipe["preparation"][0], str):
                count += 1
        
        except (KeyError, IndexError): 
            continue

count

0

In [41]:
recipes[:5]

[{'_recipe_id': 1,
  'categories': {'difficulty': ['Normal'],
   'ingredient': ['Vinegar',
    'Egg',
    'Cornmeal',
    'Endive',
    'Orange',
    'Orange Juice',
    'Garlic',
    'Scallop',
    'Avocado',
    'Basil'],
   'meal': ['Dinner', 'Main'],
   'occasion': ['Winter'],
   'special': ['Dairy Free', 'Gluten-Free'],
   'time': ['Standard'],
   'type': ['Salad']},
  'date': '2019-02-11T17:28:13.812Z',
  'desc': 'Shallow-fried scallops get extra crispy on the outside and super tender inside when double-dredged in an Old Bay–seasoned cornmeal mixture. (Added bonus: they’re gluten-free.)',
  'image_link': 'https://assets.epicurious.com/photos/5c61a910d843834ac1a6c9c7/6:4/w_274%2Ch_169/Crunchy-Scallops-on-a-Winter-Salad-recipe-30012019.jpg',
  'ingredients': [{'ingredient_group': 'All ingredients',
    'ingredient_group_content': ['1/2 cup cornstarch',
     '1/2 cup fine-grind yellow cornmeal',
     '2 tsp. Old Bay seasoning',
     '1/2 tsp. baking powder',
     '3 tsp. kosher salt

### Edit #7: Number format in nutrients dropped to 'inf' instead of None in 174 cases

In [42]:
count = 0 

for recipe in recipes: 
    
    if recipe["nutrients"] is None: 
        continue
        
    for k,v in recipe["nutrients"].items():
        boolish = False
        if v is None: 
            continue
        if v > 300000: 
            boolish = True
    
    if boolish:
        recipe["nutrients"] = None
        count += 1
count

174

### Export again

In [43]:
with open('data/recipe_urls_final_v3.json', 'a') as f:
    json.dump(recipes, f)