In [1]:
import pandas as pd
import numpy as np
import pickle
import json

In [2]:
# Load the pickled helper artefacts used later in the notebook.
# NOTE(review): pickle.load can execute arbitrary code; only run this on
# pickle files produced by a trusted source.
with open("scraped_img_urls.pkl", "rb") as f:
    scraped_img_urls = pickle.load(f)

with open("cuisine_lst.pkl", "rb") as f:
    cuisine_lst = pickle.load(f)
# Lower-case every cuisine name, then map each name to its list position.
cuisine_lst = [name.lower() for name in cuisine_lst]
cuisine_map = dict(zip(cuisine_lst, range(len(cuisine_lst))))

with open("food_data/ingr_map.pkl", "rb") as f:
    ingr_map_df = pickle.load(f)

In [3]:
# Read the raw tables. The list-valued recipe columns are stored as Python
# literals in the CSV, so each one is parsed back into a list on load.
# NOTE(review): eval executes whatever text is in those cells; this is only
# acceptable for CSV files from a trusted source.
recipe_df = pd.read_csv(
    "recipe_df_178k_with_updated_users.csv",
    converters=dict.fromkeys(
        ("ingredients", "quantities", "measurement_units", "tags", "nutrition", "steps"),
        eval,
    ),
)
user_df = pd.read_csv("users.csv")
raw_interactions_df = pd.read_csv("interactions_with_updated_users.csv")
pp_recipe_df = pd.read_csv(
    "food_data/PP_recipes.csv",
    converters={"ingredient_ids": eval},
)

In [4]:
# Store the first element of each recipe's `nutrition` list as the `calories` column.
recipe_df["calories"] = recipe_df["nutrition"].str[0]

In [5]:
# Sanity check: list every column with its non-null count and dtype.
recipe_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178265 entries, 0 to 178264
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   name               178265 non-null  object 
 1   id                 178265 non-null  int64  
 2   minutes            178265 non-null  int64  
 3   contributor_id     178265 non-null  int64  
 4   submitted          178265 non-null  object 
 5   tags               178265 non-null  object 
 6   nutrition          178265 non-null  object 
 7   n_steps            178265 non-null  int64  
 8   steps              178265 non-null  object 
 9   description        174311 non-null  object 
 10  ingredients        178265 non-null  object 
 11  n_ingredients      178265 non-null  int64  
 12  food_recipe_url    178265 non-null  object 
 13  quantities         178265 non-null  object 
 14  measurement_units  178265 non-null  object 
 15  calories           178265 non-null  float64
dtypes:

In [6]:
# Show the list-valued fields of the first recipe, one output per field.
display(
    recipe_df.iloc[0]["ingredients"],
    recipe_df.iloc[0]["quantities"],
    recipe_df.iloc[0]["measurement_units"],
    recipe_df.iloc[0]["tags"],
)

['winter squash',
 'mexican seasoning',
 'mixed spice',
 'honey',
 'butter',
 'olive oil',
 'salt']

['1', '1 -2', '1 -2', '', '', '', '']

['lb', 'teaspoon', 'teaspoon', '', '', '', '']

['60-minutes-or-less',
 'time-to-make',
 'course',
 'main-ingredient',
 'cuisine',
 'preparation',
 'occasion',
 'north-american',
 'side-dishes',
 'vegetables',
 'mexican',
 'easy',
 'fall',
 'holiday-event',
 'vegetarian',
 'winter',
 'dietary',
 'christmas',
 'seasonal',
 'squash']

## Converting Strings to Numbers: Four Cases
1. Fractions ("1/3", "1/2" --> 0.333, 0.5)
2. Ranges ("1 -2" --> 1)
3. Normal integers ("1", "4" --> 1, 4)
4. Range with Fractions ("1⁄4 - 1⁄2" --> 1/4)

In [7]:
def convert_to_float(frac_str):
    """Convert a scraped quantity string into a float.

    Only the first whitespace-separated token is considered, so a range such
    as "1 -2" or "1⁄4 - 1⁄2" yields its lower bound (and a mixed number such
    as "1 1⁄2" yields its whole part only).

    Handled forms of that first token:
      * plain numbers ("1", "0.5")   -> float(token)
      * fractions written with either the Unicode fraction slash "⁄" or the
        ASCII slash "/" ("1⁄3", "1/2") -> numerator / denominator

    Parameters
    ----------
    frac_str : str
        Raw quantity text.

    Returns
    -------
    float
        The parsed value, or ``np.nan`` when the string is empty, contains
        only whitespace, or cannot be parsed (including a zero denominator).
    """
    tokens = frac_str.split()
    if not tokens:
        # Empty or whitespace-only quantity: nothing to parse.
        return np.nan

    first_token = tokens[0]
    try:
        return float(first_token)
    except ValueError:
        pass

    # Not a plain number: try to read it as "numerator<slash>denominator".
    for slash in ("⁄", "/"):
        if slash in first_token:
            parts = first_token.split(slash)
            try:
                return float(parts[0]) / float(parts[1])
            except (ValueError, ZeroDivisionError):
                return np.nan

    # Previously this path raised UnboundLocalError; treat it as missing.
    return np.nan

In [8]:
# Preview only (the result is not assigned): blank unit strings become NaN.
recipe_df["measurement_units"].map(
    lambda units: [np.nan if unit == "" else unit for unit in units]
)

0              [lb, teaspoon, teaspoon, nan, nan, nan, nan]
1         [(10 ounce) can prepared pizza crust (or use y...
2         [(4 ounce) packages, large, shallots, teaspoon...
3                          [cup, scoop, tablespoons, apple]
4         [teaspoons, cup, cup, clove, teaspoon, tablesp...
                                ...                        
178260    [garlic, teaspoons, lbs, small, heads, cups, c...
178261               [cups, medium, (15 ounce) can, ounces]
178262    [cup, tablespoons, tablespoon, tablespoon, tab...
178263    [cup, teaspoons, small, green, celery, (8 ounc...
178264    [hard, cup, tablespoon, teaspoons, teaspoons, ...
Name: measurement_units, Length: 178265, dtype: object

In [9]:
# Parse every quantity string into a float (NaN when blank).
recipe_df["quantities"] = recipe_df["quantities"].map(
    lambda raw_quantities: list(map(convert_to_float, raw_quantities))
)
# Replace blank measurement units with NaN, leaving all other units as-is.
recipe_df["measurement_units"] = recipe_df["measurement_units"].map(
    lambda units: [np.nan if unit == "" else unit for unit in units]
)

In [10]:
# Peek at the tag lists that the cuisine is derived from below.
recipe_df["tags"].head()

0    [60-minutes-or-less, time-to-make, course, mai...
1    [30-minutes-or-less, time-to-make, course, mai...
2    [60-minutes-or-less, time-to-make, course, mai...
3    [15-minutes-or-less, time-to-make, course, mai...
4    [15-minutes-or-less, time-to-make, course, mai...
Name: tags, dtype: object

In [11]:
def output_tag(tag_lst):
    """Pick the cuisine tag for one recipe.

    Scans ``tag_lst`` in order and collects every tag that contains one of
    the keys of the module-level ``cuisine_map`` as a substring. The first
    collected tag is returned (the tag itself, not the matching cuisine key).

    Returns "global" when no tag matches any known cuisine.
    """
    matching_tags = []
    for tag in tag_lst:
        for cuisine in cuisine_map:
            if cuisine in tag:
                matching_tags.append(tag)
    if matching_tags:
        return matching_tags[0]
    return "global"

In [12]:
# Derive one cuisine label per recipe from its tag list ("global" when none matches).
recipe_df["cuisine"] = recipe_df.tags.map(output_tag)

In [13]:
# Attach the scraped image URLs loaded from scraped_img_urls.pkl.
# NOTE(review): presumably scraped_img_urls is ordered row-for-row like
# recipe_df; nothing here verifies that alignment. Confirm before relying on it.
recipe_df["img_url"] = scraped_img_urls

In [14]:
# Synthetic `serves` value per recipe, drawn uniformly from {1, 2, 3, 4}.
# A fixed seed keeps the column (and the exported CSV) identical across runs.
recipe_df["serves"] = np.random.default_rng(42).integers(1, 5, size=recipe_df.shape[0])

In [15]:
# List the current column names before renaming them.
recipe_df.columns

Index(['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
       'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients', 'food_recipe_url', 'quantities', 'measurement_units',
       'calories', 'cuisine', 'img_url', 'serves'],
      dtype='object')

In [16]:
# Rename the raw column names to the names used in the exported table.
recipe_df = recipe_df.rename(
    columns={
        "id": "recipe_id",
        "minutes": "time_to_prepare",
        "submitted": "date_submitted",
        "steps": "recipe_text",
        "name": "recipe_name",
        "contributor_id": "creator_id",
    }
)

In [17]:
# Collapse each list of steps into one string: capitalise every step and
# join the steps with ". " (no trailing period is added).
recipe_df["recipe_text"] = recipe_df["recipe_text"].map(
    lambda steps: ". ".join(step.capitalize() for step in steps)
)

In [18]:
# Inspect the joined recipe text.
recipe_df["recipe_text"]

0         Make a choice and proceed with recipe. Dependi...
1         Preheat oven to 425 degrees f. Press dough int...
2         Place potatoes in a large pot of lightly salte...
3         Combine ingredients in blender. Cover and blen...
4         Toast the fennel seeds and lightly crush them....
                                ...                        
178260    Into each of six sterile pint mason jars , put...
178261    Place the lettuce on a platter or serving dish...
178262    To make the sauce , combine the mayonnaise , h...
178263    Bring 3 quarts salted to water to a boil. Add ...
178264    In a bowl , combine the mashed yolks and mayon...
Name: recipe_text, Length: 178265, dtype: object

In [19]:
# Lookup table of distinct cuisines, numbered in order of first appearance.
cuisine_df = pd.DataFrame({"cuisine_name": recipe_df["cuisine"].unique()})
cuisine_df.insert(0, "cuisine_id", np.arange(cuisine_df.shape[0]))

In [20]:
# Display the cuisine lookup table.
cuisine_df

Unnamed: 0,cuisine_id,cuisine_name
0,0,north-american
1,1,global
2,2,asian
3,3,german
4,4,italian
5,5,danish
6,6,french
7,7,greek
8,8,english
9,9,russian


In [21]:
# Attach `ingredient_ids` from the preprocessed recipes, matching on recipe id.
# The join is an inner join (the pandas default), so recipes without a match
# in pp_recipe_df are dropped; only the original columns plus
# `ingredient_ids` are kept.
recipe_df = recipe_df.merge(
    pp_recipe_df[["id", "ingredient_ids"]],
    how="inner",
    left_on="recipe_id",
    right_on="id",
).loc[:, recipe_df.columns.tolist() + ["ingredient_ids"]]
recipe_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 178265 entries, 0 to 178264
Data columns (total 20 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   recipe_name        178265 non-null  object 
 1   recipe_id          178265 non-null  int64  
 2   time_to_prepare    178265 non-null  int64  
 3   creator_id         178265 non-null  int64  
 4   date_submitted     178265 non-null  object 
 5   tags               178265 non-null  object 
 6   nutrition          178265 non-null  object 
 7   n_steps            178265 non-null  int64  
 8   recipe_text        178265 non-null  object 
 9   description        174311 non-null  object 
 10  ingredients        178265 non-null  object 
 11  n_ingredients      178265 non-null  int64  
 12  food_recipe_url    178265 non-null  object 
 13  quantities         178265 non-null  object 
 14  measurement_units  178265 non-null  object 
 15  calories           178265 non-null  float64
 16  cu

In [22]:
# Attach the numeric `cuisine_id` by joining each recipe's cuisine label
# against the lookup table (inner join, the pandas default); keep the
# existing columns plus `cuisine_id`.
recipe_df = recipe_df.merge(
    cuisine_df,
    how="inner",
    left_on="cuisine",
    right_on="cuisine_name",
).loc[:, recipe_df.columns.tolist() + ["cuisine_id"]]

recipe_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 178265 entries, 0 to 178264
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   recipe_name        178265 non-null  object 
 1   recipe_id          178265 non-null  int64  
 2   time_to_prepare    178265 non-null  int64  
 3   creator_id         178265 non-null  int64  
 4   date_submitted     178265 non-null  object 
 5   tags               178265 non-null  object 
 6   nutrition          178265 non-null  object 
 7   n_steps            178265 non-null  int64  
 8   recipe_text        178265 non-null  object 
 9   description        174311 non-null  object 
 10  ingredients        178265 non-null  object 
 11  n_ingredients      178265 non-null  int64  
 12  food_recipe_url    178265 non-null  object 
 13  quantities         178265 non-null  object 
 14  measurement_units  178265 non-null  object 
 15  calories           178265 non-null  float64
 16  cu

In [23]:
# Export the enriched recipe table without the index. List-valued columns are
# written as their string representation and must be parsed again on reload.
recipe_df.to_csv("recipe_178k_with_all_updated_columns.csv",index=False)

In [22]:
# Export the ingredient map (loaded from food_data/ingr_map.pkl) as CSV, without the index.
ingr_map_df.to_csv("ingr_map.csv", index=False)

In [223]:
# Inspect the columns of the preprocessed recipes table used for the ingredient_ids merge.
pp_recipe_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178265 entries, 0 to 178264
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   id                 178265 non-null  int64 
 1   i                  178265 non-null  int64 
 2   name_tokens        178265 non-null  object
 3   ingredient_tokens  178265 non-null  object
 4   steps_tokens       178265 non-null  object
 5   techniques         178265 non-null  object
 6   calorie_level      178265 non-null  int64 
 7   ingredient_ids     178265 non-null  object
dtypes: int64(3), object(5)
memory usage: 10.9+ MB


In [21]:
# Display the recipe table.
# NOTE(review): the saved output below lists 19 columns (no ingredient_ids or
# cuisine_id) and the execution count is out of sequence, so it looks like it
# was produced before the merge cells ran. Confirm with Restart & Run All.
recipe_df

Unnamed: 0,recipe_name,recipe_id,time_to_prepare,creator_id,date_submitted,tags,nutrition,n_steps,recipe_text,description,ingredients,n_ingredients,food_recipe_url,quantities,measurement_units,calories,cuisine,img_url,serves
0,arriba baked winter squash mexican style,137739,55,1,2005-09-16,"[60-minutes-or-less, time-to-make, course, mai...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,Make a choice and proceed with recipe. Dependi...,autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",7,https://www.food.com/recipe/arriba---baked-win...,"[1.0, 1.0, 1.0, nan, nan, nan, nan]","[lb, teaspoon, teaspoon, , , , ]",51.5,north-american,https://img.sndimg.com/food/image/upload/w_555...,3
1,a bit different breakfast pizza,31490,30,2,2002-06-17,"[30-minutes-or-less, time-to-make, course, mai...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,Preheat oven to 425 degrees f. Press dough int...,this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...",6,https://www.food.com/recipe/a-bit-different--b...,"[1.0, 1.0, 3.0, 0.5, nan, 2.0]",[(10 ounce) can prepared pizza crust (or use y...,173.4,north-american,https://img.sndimg.com/food/image/upload/w_555...,4
2,alouette potatoes,59389,45,3,2003-04-14,"[60-minutes-or-less, time-to-make, course, mai...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,Place potatoes in a large pot of lightly salte...,"this is a super easy, great tasting, make ahea...","[spreadable cheese with garlic and herbs, new ...",11,https://www.food.com/recipe/alouette--potatoes...,"[2.0, 12.0, 2.0, 2.0, 1.0, 5.0, 2.0, 0.125, 0....","[(4 ounce) packages, large, shallots, teaspoon...",368.1,global,https://img.sndimg.com/food/image/upload/w_555...,3
3,apple a day milk shake,5289,0,4,1999-12-06,"[15-minutes-or-less, time-to-make, course, mai...","[160.2, 10.0, 55.0, 3.0, 9.0, 20.0, 7.0]",4,Combine ingredients in blender. Cover and blen...,,"[milk, vanilla ice cream, frozen apple juice c...",4,https://www.food.com/recipe/apple-a-day--milk-...,"[1.0, 1.0, 2.0, 0.5]","[cup, scoop, tablespoons, apple]",160.2,north-american,https://geniuskitchen.sndimg.com/fdc-new/img/f...,2
4,aww marinated olives,25274,15,5,2002-04-14,"[15-minutes-or-less, time-to-make, course, mai...","[380.7, 53.0, 7.0, 24.0, 6.0, 24.0, 6.0]",4,Toast the fennel seeds and lightly crush them....,my italian mil was thoroughly impressed by my ...,"[fennel seeds, green olives, ripe olives, garl...",9,https://www.food.com/recipe/aww--marinated-oli...,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 4.0]","[teaspoons, cup, cup, clove, teaspoon, tablesp...",380.7,north-american,https://geniuskitchen.sndimg.com/fdc-new/img/f...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178260,zydeco green beans,185979,20,85,2006-09-13,"[30-minutes-or-less, time-to-make, course, mai...","[103.1, 2.0, 39.0, 97.0, 11.0, 0.0, 7.0]",6,"Into each of six sterile pint mason jars , put...",haricots verts are very slender french green b...,"[garlic cloves, yellow mustard seeds, french h...",8,https://www.food.com/recipe/zydeco-green-beans...,"[6.0, 6.0, 3.0, 6.0, 6.0, 3.0, 3.0, 2.0]","[garlic, teaspoons, lbs, small, heads, cups, c...",103.1,global,https://img.sndimg.com/food/image/upload/w_555...,1
178261,zydeco salad,367912,5,1542,2009-04-25,"[15-minutes-or-less, time-to-make, preparation...","[14.1, 0.0, 8.0, 0.0, 1.0, 0.0, 1.0]",4,Place the lettuce on a platter or serving dish...,"recipe courtesy of b&c seafood, vacherie, la a...","[iceberg lettuce, tomatoes, 3 bean mix, olive ...",4,https://www.food.com/recipe/zydeco-salad-367912,"[4.0, 2.0, 1.0, 8.0]","[cups, medium, (15 ounce) can, ounces]",14.1,global,https://img.sndimg.com/food/image/upload/w_555...,3
178262,zydeco sauce,357451,15,2898,2009-02-23,"[15-minutes-or-less, time-to-make, course, cui...","[239.9, 30.0, 19.0, 22.0, 1.0, 14.0, 5.0]",3,"To make the sauce , combine the mayonnaise , h...",great sauce for cheeseburgers or dipping fries...,"[mayonnaise, prepared horseradish, worcestersh...",6,https://www.food.com/recipe/zydeco-sauce-357451,"[1.0, 2.0, 1.0, 1.0, 1.0, 1.0]","[cup, tablespoons, tablespoon, tablespoon, tab...",239.9,north-american,https://geniuskitchen.sndimg.com/fdc-new/img/f...,2
178263,zydeco shrimp wrap,188810,57,492,2006-10-03,"[60-minutes-or-less, time-to-make, course, mai...","[482.6, 13.0, 25.0, 37.0, 33.0, 9.0, 27.0]",14,Bring 3 quarts salted to water to a boil. Add ...,a wrap inspired by great cajun flavors,"[white rice, vegetable oil, onion, green bell ...",11,https://www.food.com/recipe/zydeco-shrimp-wrap...,"[1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 2.0, 2.0, 20.0,...","[cup, teaspoons, small, green, celery, (8 ounc...",482.6,global,https://img.sndimg.com/food/image/upload/w_555...,1
