In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
import re

In [2]:
df = pd.read_csv("../data/recipedata-grouped.csv", index_col=0)

## Remove columns that we are not going to use

We are going to drop columns we are not interested in, and columns with too many missing values.

In [3]:
df = df.drop(['id', 
              'gaps',
              'lowFodmap', 
              'aggregateLikes', 
              'spoonacularScore',
              'weightWatcherSmartPoints',
              'creditsText', 
              'sourceName',
              'sourceUrl', 
              'image', 
              'imageType',
              'occasions',
              'author', 
              'nutrition', 
              'winePairing',
              'originalId', 
              'spoonacularSourceUrl', 
              'license', 
              'preparationMinutes', 
              'cookingMinutes', 
              'cuisines',
              # 'diets', 
              'dishTypes', 'analyzedInstructions'], axis=1)
# df.head()

## Drop data points with missing data (if needed)

In [4]:
# check how many missing values we have for each column
count_missing = df.replace('[]', np.nan)
for column in count_missing.columns:
    col = count_missing[column]
    n_empty = sum(col.isna()==True)
    print(column, n_empty)

vegetarian 0
vegan 0
glutenFree 0
dairyFree 0
veryHealthy 0
cheap 0
veryPopular 0
sustainable 0
healthScore 0
pricePerServing 0
extendedIngredients 0
title 0
readyInMinutes 0
servings 0
summary 5
diets 946
instructions 238


We only have missing values in the "summary", "diets" and "instructions" columns. I am going to keep all the data points, but you can drop some of them if you want. 

In [5]:
print("Before dropping rows: ", df.shape)
# df = df.dropna()
print("After dropping rows:", df.shape)

Before dropping rows:  (4438, 17)
After dropping rows: (4438, 17)


In [6]:
df.head()

Unnamed: 0,vegetarian,vegan,glutenFree,dairyFree,veryHealthy,cheap,veryPopular,sustainable,healthScore,pricePerServing,extendedIngredients,title,readyInMinutes,servings,summary,diets,instructions
0,True,False,False,False,False,False,False,False,3.0,75.55,"[{'id': 20081, 'aisle': 'Baking', 'image': 'fl...",Orange-Fig Teacake with Caramel Glaze,45,10,Orange-Fig Teacake with Caramel Glaze is a <b>...,['lacto ovo vegetarian'],"<ol><li>You will need a 9"" springform pan, or ..."
1,True,False,False,False,False,False,False,False,15.0,147.7,"[{'id': 18064, 'aisle': 'Bakery/Bread', 'image...",Poached Eggs On A Bed Of Fried Mushrooms and C...,45,2,Poached Eggs On A Bed Of Fried Mushrooms and C...,['lacto ovo vegetarian'],<ol><li>In a frying pan heat up oil then add m...
2,True,False,False,True,False,False,False,False,1.0,26.06,"[{'id': 20081, 'aisle': 'Baking', 'image': 'fl...",Pandan Chiffon Cake,45,9,"For <b>26 cents per serving</b>, this recipe <...","['dairy free', 'lacto ovo vegetarian']",<ol><li>Preheat the oven to 170C.</li><li>Blen...
3,False,False,True,True,False,False,False,False,17.0,242.23,"[{'id': 9003, 'aisle': 'Produce', 'image': 'ap...","Pork Chop with Honey, Mustard and Apples",45,4,"Pork Chop with Honey, Mustard and Apples might...","['gluten free', 'dairy free', 'paleolithic', '...",<ol><li>Pre-heat your oven to 200C / 400F.</li...
4,False,False,False,False,False,False,False,False,12.0,417.69,"[{'id': 98853, 'aisle': 'Pasta and Rice;Refrig...",Beet Gnocchi With Steak and Brown Butter Sauce,45,4,The recipe Beet Gnocchi With Steak and Brown B...,[],Cooking beets\nHeat oven to 400 degrees\nWash ...


## Convert the `extendedIngredients` column to a simple list of ingredient names

In [7]:
evaluated_ingredient = pd.DataFrame(map(literal_eval, df["extendedIngredients"]))
evaluated_diets = pd.DataFrame(map(literal_eval, df["diets"]))

In [8]:
row, ingredient_col = evaluated_ingredient.shape
diet_row, diet_col = evaluated_diets.shape

assert row==diet_row,'row count should be the same'

all_ingredients = []
all_ingredient_type = []
all_diets = []

for r in range(row):
    ingredient_names = []
    ingredient_types = []
    
    for c in range(ingredient_col):
        ingredient = evaluated_ingredient.iloc[r,c]
        if ingredient: 

            name = ingredient["name"]
            if name: ingredient_names.append(name)
            
            ingredient_type = ingredient["aisle"]
            if ingredient_type: 
                ingredient_type = re.split('; |, |;|,|\/|\n', ingredient_type)
                ingredient_type = " ".join(ingredient_type)
                ingredient_types.append(ingredient_type)
    
    ingredient_names = "; ".join(ingredient_names) if len(ingredient_names) > 0 else np.nan
    all_ingredients.append(ingredient_names)
    
    ingredient_types = " ".join(set(ingredient_types)) if len(ingredient_types) > 0 else np.nan
    all_ingredient_type.append(ingredient_types)
    
    
    diets = []
    for c in range(diet_col):
        diet = evaluated_diets.iloc[r,c]
        if diet:
            diets.append(diet)
    # diets += ["-".join(d.split()) for d in diets]
    diets = "; ".join(diets) if len(diets) > 0 else np.nan
    all_diets.append(diets)
    
    

In [9]:
df["ingredients"] = pd.Series(all_ingredients)
df["ingredient types"] = pd.Series(all_ingredient_type)
df["diets"] = pd.Series(all_diets)

If you want to keep the `extendIngredients` column, do not run the following cell.

In [10]:
df = df.drop(["extendedIngredients"], axis=1)

In [11]:
# rearrange the columns
columns = df.columns.tolist()
columns = ['title',
           'summary',
           'instructions',
           'ingredients',
           'ingredient types', 
           'diets',
           'vegetarian',
           'vegan',
           'glutenFree',
           'dairyFree',
           'veryHealthy',
           'cheap',
           'veryPopular',
           'sustainable',
           'healthScore',
           'pricePerServing',
           'readyInMinutes',
           'servings',]

In [12]:
df = df[columns]

In [17]:
df.head()

Unnamed: 0,title,summary,instructions,ingredients,ingredient types,diets,vegetarian,vegan,glutenFree,dairyFree,veryHealthy,cheap,veryPopular,sustainable,healthScore,pricePerServing,readyInMinutes,servings
0,Orange-Fig Teacake with Caramel Glaze,Orange-Fig Teacake with Caramel Glaze is a <b>...,"<ol><li>You will need a 9"" springform pan, or ...",AP flour; baking powder; cardamom; eggs; fresh...,Produce Spices and Seasonings Baking Milk Eggs...,lacto ovo vegetarian,True,False,False,False,False,False,False,False,3.0,75.55,45,10
1,Poached Eggs On A Bed Of Fried Mushrooms and C...,Poached Eggs On A Bed Of Fried Mushrooms and C...,<ol><li>In a frying pan heat up oil then add m...,bread; butter; eggs; eggs; mushrooms; oil; sal...,Produce Spices and Seasonings Bakery Bread Oil...,lacto ovo vegetarian,True,False,False,False,False,False,False,False,15.0,147.7,45,2
2,Pandan Chiffon Cake,"For <b>26 cents per serving</b>, this recipe <...",<ol><li>Preheat the oven to 170C.</li><li>Blen...,all purpose flour; bay leaves; coconut milk; c...,Spices and Seasonings Produce Spices and Seaso...,dairy free; lacto ovo vegetarian,True,False,False,True,False,False,False,False,1.0,26.06,45,9
3,"Pork Chop with Honey, Mustard and Apples","Pork Chop with Honey, Mustard and Apples might...",<ol><li>Pre-heat your oven to 200C / 400F.</li...,apples; dijon mustard; garlic cloves; honey; j...,Produce Spices and Seasonings Nut butters Jams...,gluten free; dairy free; paleolithic; primal,False,False,True,True,False,False,False,False,17.0,242.23,45,4
4,Beet Gnocchi With Steak and Brown Butter Sauce,The recipe Beet Gnocchi With Steak and Brown B...,Cooking beets\nHeat oven to 400 degrees\nWash ...,gnocchi; beets; olive oil; s&p; goat cheese; r...,Produce Spices and Seasonings Produce Spices a...,,False,False,False,False,False,False,False,False,12.0,417.69,45,4


## Remove the html tags, urls and line breaks from the text data

We will use the following four columns as text data:
1. title
2. summary
3. instructions
4. ingredients

In [18]:
for r in range(row):
    for c in range(4): # only the 4 first columns are text data that we are going to use
        text = df.iloc[r, c]
        if text is not np.nan and text is not None:
            # remove line breaks and html tags
            text = re.sub(r"<[^>]*>|[\n\r]+", " ", text).lower()
            # remove urls 
            text = re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)", " ", text)
            # remove extra spaces
            text = re.sub(r"\s\s+", " ", text)
            # remove special symbols
            text = re.sub(r'[\!"#$%&\*+,-./:<=>?@^_`()|~=]|\n', " ", text)
            if df.columns[c] not in ["ingredients", "diets"]:
                text = re.sub(r';', " ", text)
            df.iloc[r, c] = text

In [19]:
df.head()

Unnamed: 0,title,summary,instructions,ingredients,ingredient types,diets,vegetarian,vegan,glutenFree,dairyFree,veryHealthy,cheap,veryPopular,sustainable,healthScore,pricePerServing,readyInMinutes,servings
0,orange fig teacake with caramel glaze,orange fig teacake with caramel glaze is a veg...,you will need a 9 springform pan or a cake ...,ap flour; baking powder; cardamom; eggs; fresh...,Produce Spices and Seasonings Baking Milk Eggs...,lacto ovo vegetarian,True,False,False,False,False,False,False,False,3.0,75.55,45,10
1,poached eggs on a bed of fried mushrooms and c...,poached eggs on a bed of fried mushrooms and c...,in a frying pan heat up oil then add mushroom...,bread; butter; eggs; eggs; mushrooms; oil; sal...,Produce Spices and Seasonings Bakery Bread Oil...,lacto ovo vegetarian,True,False,False,False,False,False,False,False,15.0,147.7,45,2
2,pandan chiffon cake,for 26 cents per serving this recipe covers ...,preheat the oven to 170c blend the pandan le...,all purpose flour; bay leaves; coconut milk; c...,Spices and Seasonings Produce Spices and Seaso...,dairy free; lacto ovo vegetarian,True,False,False,True,False,False,False,False,1.0,26.06,45,9
3,pork chop with honey mustard and apples,pork chop with honey mustard and apples might...,pre heat your oven to 200c 400f line a roa...,apples; dijon mustard; garlic cloves; honey; j...,Produce Spices and Seasonings Nut butters Jams...,gluten free; dairy free; paleolithic; primal,False,False,True,True,False,False,False,False,17.0,242.23,45,4
4,beet gnocchi with steak and brown butter sauce,the recipe beet gnocchi with steak and brown b...,cooking beets heat oven to 400 degrees wash be...,gnocchi; beets; olive oil; s p; goat cheese; r...,Produce Spices and Seasonings Produce Spices a...,,False,False,False,False,False,False,False,False,12.0,417.69,45,4


## Export the resulting pd as csv

In [143]:
df.to_csv(f"../data/cleaneddata.csv")