In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval

In [2]:
df = pd.read_csv("./data/recipedata-grouped.csv", index_col=0)

We are going to drop columns we are not interested in, and columns with too many missing values.

In [3]:
df = df.drop(['gaps',
              'lowFodmap', 
              'aggregateLikes', 
              'spoonacularScore',
              'weightWatcherSmartPoints',
              'creditsText', 
              'sourceName',
              'sourceUrl', 
              'image', 
              'imageType',
              'occasions',
              'author', 
              'nutrition', 
              'winePairing',
              'originalId', 
              'spoonacularSourceUrl', 
              'license', 
              'preparationMinutes', 
              'cookingMinutes', 
              'cuisines',
              # 'diets', 
              'dishTypes', 'analyzedInstructions'], axis=1)
# df.head()

In [4]:
# drop 
df = df.replace('[]', np.nan)
for column in df.columns:
    col = df[column]
    n_empty = sum(col.isna()==True)
    print(column, n_empty)

vegetarian 0
vegan 0
glutenFree 0
dairyFree 0
veryHealthy 0
cheap 0
veryPopular 0
sustainable 0
healthScore 0
pricePerServing 0
extendedIngredients 0
id 0
title 0
readyInMinutes 0
servings 0
summary 2
diets 289
instructions 83


We are not going to drop the summary and instructions columns because they are important for our NLP model. Instead, we are going to drop the data points (rows).

In [5]:
print("Before dropping rows: ", df.shape)
df = df.dropna()
print("After dropping rows:", df.shape)

Before dropping rows:  (1422, 18)
After dropping rows: (1060, 18)


In [17]:
df.head()

Unnamed: 0,vegetarian,vegan,glutenFree,dairyFree,veryHealthy,cheap,veryPopular,sustainable,healthScore,pricePerServing,extendedIngredients,id,title,readyInMinutes,servings,summary,instructions
0,False,False,True,False,False,False,False,False,24.0,274.6,"[{'id': 1032028, 'aisle': 'Ethnic Foods;Spices...",780001,Pesto Chicken Zoodles,45,3,Need a <b>gluten free and primal main course</...,"To get started, place 1/4 cup of olive oil int..."
1,True,True,True,True,True,False,False,False,100.0,126.27,"[{'id': 4053, 'aisle': 'Oil, Vinegar, Salad Dr...",663559,Tomato and lentil soup,45,4,Tomato and lentil soup takes about <b>about 45...,Saut onion and garlic in olive oil for 5 minut...
2,True,True,True,True,False,False,False,False,40.0,43.68,"[{'id': 1032035, 'aisle': 'Spices and Seasonin...",716195,Spicy Indian-Style Hummus,45,12,Spicy Indian-Style Hummus might be just the ho...,<p>Rinse the chickpeas and soak for 8 hours or...
3,True,False,True,True,False,False,False,False,4.0,86.32,"[{'id': 1123, 'aisle': 'Milk, Eggs, Other Dair...",662276,Sun Dried Tomato and Herb Baked Eggs,25,1,Sun Dried Tomato and Herb Baked Eggs might be ...,<ol><li>Preheat oven to 350 F.</li><li>Cover t...
4,True,False,False,False,False,False,False,False,3.0,59.75,"[{'id': 1001, 'aisle': 'Milk, Eggs, Other Dair...",633970,Banana & Oreo Muffin,45,4,Banana & Oreo Muffin is a <b>lacto ovo vegetar...,"Sift the flour, baking power and baking soda t..."


In [18]:
# Count the number of empty cells in each column
df.to_csv(f"data/cleaneddata.csv")

In [48]:
pd.DataFrame(map(literal_eval, df["extendedIngredients"]))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,"{'id': 1032028, 'aisle': 'Ethnic Foods;Spices ...","{'id': 5062, 'aisle': 'Meat', 'image': 'chicke...","{'id': 2044, 'aisle': 'Produce;Spices and Seas...","{'id': 11215, 'aisle': 'Produce', 'image': 'ga...","{'id': 1062047, 'aisle': 'Spices and Seasoning...","{'id': 4053, 'aisle': 'Oil, Vinegar, Salad Dre...","{'id': 1033, 'aisle': 'Cheese', 'image': 'parm...","{'id': 1102047, 'aisle': 'Spices and Seasoning...","{'id': 11477, 'aisle': 'Produce', 'image': 'zu...",,...,,,,,,,,,,
1,"{'id': 4053, 'aisle': 'Oil, Vinegar, Salad Dre...","{'id': 11282, 'aisle': 'Produce', 'image': 'br...","{'id': 11215, 'aisle': 'Produce', 'image': 'ga...","{'id': 11124, 'aisle': 'Produce', 'image': 'sl...","{'id': 10011693, 'aisle': 'Canned and Jarred',...","{'id': 2004, 'aisle': 'Produce;Spices and Seas...","{'id': 14412, 'aisle': 'Beverages', 'image': '...","{'id': 16069, 'aisle': 'Pasta and Rice;Canned ...","{'id': 2047, 'aisle': 'Spices and Seasonings',...","{'id': 11297, 'aisle': 'Produce;Spices and Sea...",...,,,,,,,,,,
2,"{'id': 1032035, 'aisle': 'Spices and Seasoning...","{'id': 2031, 'aisle': 'Spices and Seasonings',...","{'id': 16056, 'aisle': 'Pasta and Rice;Canned ...","{'id': 11156, 'aisle': 'Produce', 'image': 'fr...","{'id': 11216, 'aisle': 'Produce;Ethnic Foods;S...","{'id': 11297, 'aisle': 'Produce;Spices and Sea...","{'id': 11215, 'aisle': 'Produce', 'image': 'ga...","{'id': 1002013, 'aisle': 'Spices and Seasoning...","{'id': 1002014, 'aisle': 'Spices and Seasoning...","{'id': 1002030, 'aisle': 'Spices and Seasoning...",...,,,,,,,,,,
3,"{'id': 1123, 'aisle': 'Milk, Eggs, Other Dairy...","{'id': 2044, 'aisle': 'Produce;Spices and Seas...","{'id': 11297, 'aisle': 'Produce;Spices and Sea...","{'id': 4053, 'aisle': 'Oil, Vinegar, Salad Dre...","{'id': 11955, 'aisle': 'Canned and Jarred;Prod...",,,,,,...,,,,,,,,,,
4,"{'id': 1001, 'aisle': 'Milk, Eggs, Other Dairy...","{'id': 19335, 'aisle': 'Baking', 'image': 'sug...","{'id': 20081, 'aisle': 'Baking', 'image': 'flo...","{'id': 18371, 'aisle': 'Baking', 'image': 'whi...","{'id': 18372, 'aisle': 'Baking', 'image': 'whi...","{'id': 1123, 'aisle': 'Milk, Eggs, Other Dairy...","{'id': 1095, 'aisle': 'Baking', 'image': 'evap...","{'id': 9040, 'aisle': 'Produce', 'image': 'ban...","{'id': 10018166, 'aisle': 'Sweet Snacks', 'ima...","{'id': 1012050, 'aisle': 'Baking', 'image': 'v...",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1132,"{'id': 2064, 'aisle': 'Produce', 'image': 'min...","{'id': 18617, 'aisle': 'Sweet Snacks', 'image'...","{'id': 2021, 'aisle': 'Spices and Seasonings',...","{'id': 1065062, 'aisle': None, 'image': 'whole...","{'id': 1197, 'aisle': 'Milk, Eggs, Other Dairy...",,,,,,...,,,,,,,,,,
1133,"{'id': 6008, 'aisle': 'Canned and Jarred', 'im...","{'id': 23584, 'aisle': 'Meat', 'image': 'top-s...","{'id': 10011090, 'aisle': 'Produce', 'image': ...","{'id': 19334, 'aisle': 'Baking', 'image': 'lig...","{'id': 11124, 'aisle': 'Produce', 'image': 'sl...","{'id': 10011135, 'aisle': 'Produce', 'image': ...","{'id': 11420421, 'aisle': 'Pasta and Rice', 'i...","{'id': 20027, 'aisle': 'Baking', 'image': 'whi...","{'id': 11260, 'aisle': 'Produce', 'image': 'mu...","{'id': 1022020, 'aisle': 'Spices and Seasoning...",...,,,,,,,,,,
1134,"{'id': 1001, 'aisle': 'Milk, Eggs, Other Dairy...","{'id': 1017, 'aisle': 'Cheese', 'image': 'crea...","{'id': 1125, 'aisle': 'Milk, Eggs, Other Dairy...","{'id': 1125, 'aisle': 'Milk, Eggs, Other Dairy...","{'id': 1123, 'aisle': 'Milk, Eggs, Other Dairy...","{'id': 20081, 'aisle': 'Baking', 'image': 'flo...","{'id': 9156, 'aisle': 'Produce', 'image': 'zes...","{'id': 9216, 'aisle': 'Produce', 'image': 'ora...","{'id': 9302, 'aisle': 'Produce', 'image': 'ras...","{'id': 2047, 'aisle': 'Spices and Seasonings',...",...,,,,,,,,,,
1135,"{'id': 2048, 'aisle': 'Oil, Vinegar, Salad Dre...","{'id': 9037, 'aisle': 'Produce', 'image': 'avo...","{'id': 11080, 'aisle': 'Produce', 'image': 'be...","{'id': 11291, 'aisle': 'Produce', 'image': 'sp...","{'id': 11446, 'aisle': 'Ethnic Foods', 'image'...","{'id': 4053, 'aisle': 'Oil, Vinegar, Salad Dre...","{'id': 11429, 'aisle': 'Produce', 'image': 'ra...","{'id': 12516, 'aisle': 'Savory Snacks', 'image...","{'id': 4528, 'aisle': 'Oil, Vinegar, Salad Dre...",,...,,,,,,,,,,


In [52]:
literal_eval(df["extendedIngredients"].iloc[0])

[{'id': 1032028,
  'aisle': 'Ethnic Foods;Spices and Seasonings',
  'image': 'chili-powder.jpg',
  'consistency': 'solid',
  'name': 'cajun seasoning',
  'nameClean': 'cajun seasoning',
  'original': '1 tsp Cajun seasoning',
  'originalName': 'Cajun seasoning',
  'amount': 1.0,
  'unit': 'tsp',
  'meta': [],
  'measures': {'us': {'amount': 1.0,
    'unitShort': 'tsp',
    'unitLong': 'teaspoon'},
   'metric': {'amount': 1.0, 'unitShort': 'tsp', 'unitLong': 'teaspoon'}}},
 {'id': 5062,
  'aisle': 'Meat',
  'image': 'chicken-breasts.png',
  'consistency': 'solid',
  'name': 'chicken breasts',
  'nameClean': 'chicken breast',
  'original': '2 8oz chicken breasts seasoned, grilled, and sliced (batch cook chicken for the week to save time!)',
  'originalName': 'chicken breasts seasoned, grilled, and sliced (batch cook chicken for the week to save time!)',
  'amount': 16.0,
  'unit': 'oz',
  'meta': ['sliced', 'for the week to save time!)'],
  'measures': {'us': {'amount': 16.0, 'unitShort':