In [1]:
import pandas as pd
from tqdm.notebook import tqdm

## INTERACTIONS CSV

In [2]:
# Load the raw user/recipe interaction records from disk.
raw_interactions_path = "../Data/RAW_interactions.csv"
interactions_csv = pd.read_csv(raw_interactions_path)

In [3]:
# Filter out users with fewer than MIN_INTERACTIONS interactions.
MIN_INTERACTIONS = 10

# value_counts() already gives one count per user_id, so the threshold can be
# applied to the Series directly; no dict conversion or sorting is needed
# because isin() below does not depend on order.
counts = interactions_csv["user_id"].value_counts()
user_ids_with_10_or_more = counts.index[counts >= MIN_INTERACTIONS].tolist()

# Note: this rebinds interactions_csv to the filtered rows; the original row
# labels are kept, so the index is no longer contiguous after this cell.
interactions_csv = interactions_csv[interactions_csv["user_id"].isin(user_ids_with_10_or_more)]

* Use map to redefine the user and recipe ids in the interactions file

In [4]:
import pickle

In [5]:
# Load the previously pickled lookup tables (raw id / tag -> remapped index).
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# this on files produced by this project.
with open("objects/recipe_ids_map.dat", "rb") as recipe_map_file:
    recipe_ids_map = pickle.load(recipe_map_file)

with open("objects/user_ids_map.dat", "rb") as user_map_file:
    user_ids_map = pickle.load(user_map_file)

with open("objects/tags_map.dat", "rb") as tags_map_file:
    tags_map = pickle.load(tags_map_file)

In [6]:
# Preview the first rows of the filtered interactions before remapping ids.
interactions_csv.head()

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."


In [7]:
# Translate every raw user/recipe id in the interactions into its remapped id.
# A raw id that is missing from the corresponding map raises KeyError.
raw_user_ids = interactions_csv["user_id"].tolist()
user_id_list = [user_ids_map[raw_id] for raw_id in tqdm(raw_user_ids)]

raw_recipe_ids = interactions_csv["recipe_id"].tolist()
recipe_id_list = [recipe_ids_map[raw_id] for raw_id in tqdm(raw_recipe_ids)]

  0%|          | 0/803338 [00:00<?, ?it/s]

  0%|          | 0/803338 [00:00<?, ?it/s]

In [9]:
# Build the remapped interactions table.
# All three columns are passed as plain lists so they are combined purely by
# position. Passing the "rating" Series directly would make the new frame
# inherit the filtered (non-contiguous) index of interactions_csv while the
# two id lists are attached positionally, which is fragile and inconsistent
# with how the other columns are supplied.
new_interactions_csv = pd.DataFrame.from_dict({
    "user_id": user_id_list,
    "recipe_id": recipe_id_list,
    "rating": interactions_csv["rating"].tolist(),
})
new_interactions_csv.head()

Unnamed: 0,user_id,recipe_id,rating
0,617,22741,4
1,11478,22741,5
2,62,24831,4
3,2568,48726,5
4,1152,48726,5


In [10]:
# Persist the remapped interactions; the row index is not written.
interactions_out_path = "processed_dataframes/interactions.csv"
new_interactions_csv.to_csv(interactions_out_path, index=False)

## RECIPES CSV

In [8]:
# Load the raw recipe metadata from disk.
raw_recipes_path = "../Data/RAW_recipes.csv"
recipes_csv = pd.read_csv(raw_recipes_path)

* Use map to redefine the recipe_id

In [9]:
# Preview the first rows of the raw recipes table.
recipes_csv.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [10]:
# Translate each raw recipe id into its remapped id, in row order.
# A raw id that is missing from recipe_ids_map raises KeyError.
raw_recipe_ids_in_recipes = recipes_csv["id"].tolist()
mapped_recipe_ids = [recipe_ids_map[raw_id] for raw_id in tqdm(raw_recipe_ids_in_recipes)]

  0%|          | 0/231637 [00:00<?, ?it/s]

* Use mapped tags

In [45]:
# mapped_tags = []
# 
# for _, row in tqdm(recipes_csv.iterrows()):
#     tag_ids_list = []
#     for tag in eval(row["tags"]):
#         tag_ids_list.append(tags_map[tag])
#         
#     mapped_tags.append(tag_ids_list)

0it [00:00, ?it/s]

79681
[7, 90, 130, 140, 168, 202, 283, 294, 296, 322, 326, 346, 413, 419, 427, 453, 484, 514, 525, 528]


**Construct sparse matrix instead**

In [14]:
import scipy.sparse as sparse
import numpy as np

In [15]:
import ast

# Binary recipe x tag indicator matrix: entry (r, t) is 1.0 when the recipe
# with remapped id r carries the tag with remapped id t.
# LIL format is used because the matrix is filled one element at a time.
tags_matrix = sparse.lil_matrix((len(recipe_ids_map), len(tags_map)), dtype=np.float32)

# Only the "id" and "tags" columns are needed, so iterate over those two
# columns directly instead of materialising a Series per row with iterrows().
# total= lets tqdm show real progress rather than a bare iteration counter.
for raw_recipe_id, raw_tags in tqdm(
    zip(recipes_csv["id"], recipes_csv["tags"]),
    total=len(recipes_csv),
):
    row_idx = recipe_ids_map[raw_recipe_id]
    # The "tags" column holds the string repr of a Python list of strings.
    # ast.literal_eval parses literals only, so unlike eval() it cannot run
    # arbitrary code embedded in the CSV.
    for tag in ast.literal_eval(raw_tags):
        col_idx = tags_map[tag]
        tags_matrix[row_idx, col_idx] = 1.

0it [00:00, ?it/s]

In [16]:
# Convert to CSR before writing the matrix to disk.
tags_matrix_csr = tags_matrix.tocsr()
sparse.save_npz("objects/tags_matrix", tags_matrix_csr)

* save the duplicated recipe_id to call the correct tfidf row to get the embedded preparation

In [17]:
# preparation_ids = mapped_recipe_ids.copy()

* use minutes column
* Use full nutrition array
* use n_steps

In [15]:
# Full-text recipes table keyed by the mapped recipe id
# pd.DataFrame.from_dict({
#     "recipe_id": mapped_recipe_ids,
#     "name": recipes_csv["name"].tolist(),
#     "minutes": recipes_csv["minutes"].tolist(),
#     "tags": recipes_csv["tags"].tolist(),
#     "steps": recipes_csv["steps"].tolist(),
#     "ingredients": recipes_csv["ingredients"].tolist(),
# }).to_csv("processed_dataframes/recipes_with_text.csv", index=False)

In [18]:
# Assemble the processed recipes table: the remapped recipe id followed by
# columns copied unchanged from the raw recipes frame.
# "tags" and "preparation_id" are not included in this table.
# Original note on "nutrition": always 7 elements.
recipe_columns = {"recipe_id": mapped_recipe_ids}
for column_name in ("minutes", "nutrition", "n_steps", "n_ingredients"):
    recipe_columns[column_name] = recipes_csv[column_name].tolist()

new_recipes_csv = pd.DataFrame.from_dict(recipe_columns)

In [19]:
# Preview the processed recipes table before it is written to disk.
new_recipes_csv.head()

Unnamed: 0,recipe_id,minutes,nutrition,n_steps,n_ingredients
0,79681,55,"[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,7
1,17031,30,"[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,6
2,65277,130,"[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,13
3,33775,45,"[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,11
4,24632,190,"[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,8


In [20]:
# Persist the processed recipes table; the row index is not written.
recipes_out_path = "processed_dataframes/recipes.csv"
new_recipes_csv.to_csv(recipes_out_path, index=False)