In [1]:
import pandas as pd
from tqdm.notebook import tqdm

# Step 1

map recipe_id and user_ids to 0 index values (interactions file)

In [2]:
recipes_csv = pd.read_csv("../Data/RAW_recipes.csv")
# recipes_csv.head()

In [3]:
recipe_ids_set = set(recipes_csv["id"].tolist())

In [4]:
interactions_csv = pd.read_csv("../Data/RAW_interactions.csv")
# interactions_csv.head()

In [5]:
# FILTER OUT USERS WHITH LESS THAN 10 INTERACTIONS
counts = list(interactions_csv["user_id"].value_counts().to_dict().items())
counts = sorted(counts, key=lambda x: x[1])

user_ids_with_10_or_more = [user_id for user_id, n_interactions in counts if n_interactions >= 10]

interactions_csv = interactions_csv[interactions_csv["user_id"].isin(user_ids_with_10_or_more)]

In [7]:
user_ids_set = set(interactions_csv["user_id"].tolist())

In [8]:
recipe_ids_map = {recipe_id: i for i, recipe_id in enumerate(sorted(recipe_ids_set))}
user_ids_map = {user_id: i for i, user_id in enumerate(sorted(user_ids_set))}

In [9]:
import pickle

In [10]:
with open("objects/recipe_ids_map.dat", "wb") as f:
    pickle.dump(recipe_ids_map, f)
    
with open("objects/user_ids_map.dat", "wb") as f:
    pickle.dump(user_ids_map, f)

## Step 2

map tags to ids

In [11]:
# recipes_csv.head()

In [12]:
tags_set = set()

for _, row in tqdm(recipes_csv.iterrows()):
    for tag in eval(row["tags"]):
        tags_set.add(tag)

0it [00:00, ?it/s]

In [13]:
tags_map = {tag: i for i, tag in enumerate(tags_set)}

In [14]:
with open("objects/tags_map.dat", "wb") as f:
    pickle.dump(tags_map, f)

## Step 3

concatenate all steps and ingredients into one text and run tfidf for al recipes, convert to torch sparse matrix and save to file (preparation)

In [15]:
# recipes_csv.head()

In [16]:
steps_ingredients = [None for _ in range(len(recipe_ids_map))]

for _, row in tqdm(recipes_csv.iterrows()):
    text_data = " ".join(eval(row["steps"])) + " ".join(eval(row["ingredients"]))
    recipe_id = row["id"]
    
    steps_ingredients[recipe_ids_map[recipe_id]] = text_data

0it [00:00, ?it/s]

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
# Removes stop words, radicalizes and applies l2 norm
vectorizer = TfidfVectorizer(stop_words="english")
vectorized_preparations = vectorizer.fit_transform(steps_ingredients)
vectorized_preparations.shape

(231637, 120348)

In [19]:
import scipy.sparse

In [20]:
scipy.sparse.save_npz("objects/preparations_tfidf", vectorized_preparations)

To pytorch

In [21]:
coo_preparations = vectorized_preparations.tocoo()

In [22]:
import torch
import numpy as np

In [23]:
values = coo_preparations.data
indices = np.vstack((coo_preparations.row, coo_preparations.col))

preparations_tfidf = torch.sparse.FloatTensor(
    torch.LongTensor(indices), 
    torch.FloatTensor(values), 
    torch.Size(coo_preparations.shape)
)

In [24]:
torch.save(preparations_tfidf, "objects/preparations_tfidf.pt")