In [138]:
!pip install kagglehub --upgrade
!pip install -Iv tensorflow==2.13.0

Defaulting to user installation because normal site-packages is not writeable
Using pip 24.0 from /opt/conda/lib/python3.11/site-packages/pip (python 3.11)
Defaulting to user installation because normal site-packages is not writeable
Collecting tensorflow==2.13.0
  Obtaining dependency information for tensorflow==2.13.0 from https://files.pythonhosted.org/packages/ed/30/310fee0477ce46f722c561dd7e21eebca0d1d29bdb3cf4a2335b845fbba4/tensorflow-2.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading tensorflow-2.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting absl-py>=1.0.0 (from tensorflow==2.13.0)
  Obtaining dependency information for absl-py>=1.0.0 from https://files.pythonhosted.org/packages/a2/ad/e0d3c824784ff121c03cc031f944bc7e139a8f1870ffd2845cc2dd76f6c4/absl_py-2.1.0-py3-none-any.whl.metadata
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow==2.1

In [139]:
import os
import pandas as pd
import kagglehub
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import random
from collections import defaultdict

In [3]:
path = kagglehub.dataset_download("shuyangli94/food-com-recipes-and-user-interactions")
print("Path to dataset files:", path)

# List all files in the dataset directory
dataset_files = os.listdir(path)
print("Files in the dataset:", dataset_files)

Path to dataset files: /home/cepeters/.cache/kagglehub/datasets/shuyangli94/food-com-recipes-and-user-interactions/versions/2
Files in the dataset: ['interactions_validation.csv', 'PP_users.csv', 'RAW_recipes.csv', 'interactions_test.csv', 'interactions_train.csv', 'ingr_map.pkl', 'PP_recipes.csv', 'RAW_interactions.csv']


In [4]:
# Dictionary to store DataFrames for each useful file
dataframes = {}

# Load each useful CSV file into a DataFrame dictionary
for file_name in dataset_files:
    file_path = os.path.join(path, file_name)
    if file_name.endswith('.csv') and os.path.exists(file_path):
        print(f"Loading {file_name} into a DataFrame...")
        df_name = file_name.split('.')[0]  # Use filename without extension as key
        dataframes[df_name] = pd.read_csv(file_path)
        print(f"Loaded {file_name} with {dataframes[df_name].shape[0]} rows and {dataframes[df_name].shape[1]} columns.")

Loading interactions_validation.csv into a DataFrame...
Loaded interactions_validation.csv with 7023 rows and 6 columns.
Loading PP_users.csv into a DataFrame...
Loaded PP_users.csv with 25076 rows and 6 columns.
Loading RAW_recipes.csv into a DataFrame...
Loaded RAW_recipes.csv with 231637 rows and 12 columns.
Loading interactions_test.csv into a DataFrame...
Loaded interactions_test.csv with 12455 rows and 6 columns.
Loading interactions_train.csv into a DataFrame...
Loaded interactions_train.csv with 698901 rows and 6 columns.
Loading PP_recipes.csv into a DataFrame...
Loaded PP_recipes.csv with 178265 rows and 8 columns.
Loading RAW_interactions.csv into a DataFrame...
Loaded RAW_interactions.csv with 1132367 rows and 5 columns.


In [15]:
raw_interactions_df = dataframes.get('RAW_interactions')

In [44]:
# Split interactions into training and testing dataframes
shuffled_raw_interactions_df = raw_interactions_df.sample(frac=1).reset_index(drop=True)
train_raw_interactions_df = shuffled_raw_interactions_df[0:500000]
test_raw_interactions_df = shuffled_raw_interactions_df[500000:600000]

In [45]:
# Iterate through training set to find values
globalAvg = 0
ratingsPerUser = defaultdict(list)
ratingsPerRecipe = defaultdict(list)

for i in range(train_raw_interactions_df.shape[0]):
    row = train_raw_interactions_df.iloc[i]
    rating, user, recipe = row["rating"], row["user_id"], row["recipe_id"]
    globalAvg += rating
    ratingsPerUser[user].append((recipe, rating))
    ratingsPerRecipe[recipe].append((user, rating))

globalAvg /= train_raw_interactions_df.shape[0]
globalAvg

4.410302

In [46]:
# Baseline: Find MSE of predicting global average every time
MSE = 0
for i in range(test_raw_interactions_df.shape[0]):
    row = test_raw_interactions_df.iloc[i]
    error = (row["rating"] - globalAvg) ** 2
    MSE += error

MSE /= test_raw_interactions_df.shape[0]

print("Test MSE (Using Global Averages) = " + str(MSE))

Test MSE (Using Global Averages) = 1.5887777994048145


In [47]:
# Baseline: Find MSE of predicting user average, or global average if there is none
MSE = 0
for i in range(test_raw_interactions_df.shape[0]):
    row = test_raw_interactions_df.iloc[i]
    if row["user_id"] in ratingsPerUser:
        userRatings = [r[1] for r in ratingsPerUser[row["user_id"]]]
        avgUserRating = sum(userRatings)/len(userRatings)
        error = (row["rating"] - avgUserRating) ** 2
    else:
        error = (row["rating"] - globalAvg) ** 2
    MSE += error

MSE /= test_raw_interactions_df.shape[0]

print("Test MSE (Using User Averages) = " + str(MSE))

Test MSE (Using User Averages) = 1.5933343466580254


In [48]:
# Baseline: Find MSE of predicting recipe average, or global average if there is none
MSE = 0
for i in range(test_raw_interactions_df.shape[0]):
    row = test_raw_interactions_df.iloc[i]
    if row["recipe_id"] in ratingsPerRecipe:
        recipeRatings = [r[1] for r in ratingsPerRecipe[row["recipe_id"]]]
        avgRecipeRating = sum(recipeRatings)/len(recipeRatings)
        error = (row["rating"] - avgRecipeRating) ** 2
    else:
        error = (row["rating"] - globalAvg) ** 2
    MSE += error

MSE /= test_raw_interactions_df.shape[0]

print("Test MSE (Using Recipe Averages) = " + str(MSE))

Test MSE (Using Recipe Averages) = 1.8437603678229684


In [49]:
# Baseline: Find MSE of predicting user average or recipe average (whichever has more), or global average if there is neither
MSE = 0
for i in range(test_raw_interactions_df.shape[0]):
    row = test_raw_interactions_df.iloc[i]
    user, recipe = row["user_id"], row["recipe_id"]
    if user in ratingsPerUser and (recipe not in ratingsPerRecipe or len(ratingsPerRecipe[recipe]) <= len(ratingsPerUser[user])):
        userRatings = [r[1] for r in ratingsPerUser[user]]
        avgRating = sum(userRatings) / len(userRatings)
        error = (row["rating"] - avgRating) ** 2
    elif recipe in ratingsPerRecipe and (user not in ratingsPerUser or len(ratingsPerRecipe[recipe]) >= len(ratingsPerUser[user])):
        recipeRatings = [r[1] for r in ratingsPerRecipe[recipe]]
        avgRating = sum(recipeRatings) / len(recipeRatings)
        error = (row["rating"] - avgRating) ** 2
    else:
        error = (row["rating"] - globalAvg) ** 2
    MSE += error

MSE /= test_raw_interactions_df.shape[0]

print("Test MSE (Using User & Recipe Averages) = " + str(MSE))

Test MSE (Using User & Recipe Averages) = 1.5781550707033012


In [50]:
# Iteration function for our model...
def iterate(lamb):
    newAlpha = 0
    for i in range(train_raw_interactions_df.shape[0]):
        row = train_raw_interactions_df.iloc[i]
        rating = row["rating"]
        user = row["user_id"]
        recipe = row["recipe_id"]
        newAlpha += rating - (betaU[user] + betaR[recipe])
    alpha = newAlpha / train_raw_interactions_df.shape[0]
    for user in ratingsPerUser:
        newBetaU = 0
        for recipe,rating in ratingsPerUser[user]:
            newBetaU += rating - (alpha + betaR[recipe])
        betaU[user] = newBetaU / (lamb + len(ratingsPerUser[user]))
    for recipe in ratingsPerRecipe:
        newBetaR = 0
        for user,rating in ratingsPerRecipe[recipe]:
            newBetaR += rating - (alpha + betaU[user])
        betaR[recipe] = newBetaR / (lamb + len(ratingsPerRecipe[recipe]))
    mse = 0
    for i in range(train_raw_interactions_df.shape[0]):
        row = train_raw_interactions_df.iloc[i]
        rating = row["rating"]
        user = row["user_id"]
        recipe = row["recipe_id"]
        prediction = alpha + betaU[user] + betaR[recipe]
        mse += (rating - prediction)**2
    regularizer = 0
    for u in betaU:
        regularizer += betaU[u]**2
    for r in betaR:
        regularizer += betaR[r]**2
    mse /= train_raw_interactions_df.shape[0]
    return mse, (mse * train_raw_interactions_df.shape[0]) + lamb*regularizer

In [51]:
# Model generation...
# Values:
betaU = {}
betaR = {}
for u in ratingsPerUser:
    betaU[u] = 0

for r in ratingsPerRecipe:
    betaR[r] = 0

alpha = globalAvg

# Iterating:
iterations = 0
mse, objective, newMSE, newObjective = 0, 0, 0, 0
while iterations < 10 or objective - newObjective > 0.0001:
    mse, objective = newMSE, newObjective
    newMSE, newObjective = iterate(1)
    iterations += 1
    print("Objective after "
        + str(iterations) + (" iterations = " if iterations != 1 else " iteration = ") + str(newObjective))
    print("MSE after "
        + str(iterations) + (" iterations = " if iterations != 1 else " iteration = ") + str(newMSE))

Objective after 1 iteration = 426413.9127441286
MSE after 1 iteration = 0.5813430435560253
Objective after 2 iterations = 413436.41958465974
MSE after 2 iterations = 0.5813672803234096
Objective after 3 iterations = 411638.79800325056
MSE after 3 iterations = 0.582191948248583
Objective after 4 iterations = 410769.1507638578
MSE after 4 iterations = 0.5819238398581439
Objective after 5 iterations = 410070.884568783
MSE after 5 iterations = 0.5813452785917188
Objective after 6 iterations = 409459.0172681245
MSE after 6 iterations = 0.5807216913188809
Objective after 7 iterations = 408919.7374748698
MSE after 7 iterations = 0.580130986536033
Objective after 8 iterations = 408447.90265646734
MSE after 8 iterations = 0.579594780258637
Objective after 9 iterations = 408038.8991968699
MSE after 9 iterations = 0.5791172015994022
Objective after 10 iterations = 407687.58645509905
MSE after 10 iterations = 0.5786964237172423
Objective after 11 iterations = 407388.37232360046
MSE after 11 iterat

KeyboardInterrupt: 

In [52]:
# Testing our model
MSE = 0
for i in range(test_raw_interactions_df.shape[0]):
    row = test_raw_interactions_df.iloc[i]
    rating = row["rating"]
    user = row["user_id"]
    recipe = row["recipe_id"]
    bu = 0
    br = 0
    if user in betaU:
        bu = betaU[user]
    if recipe in betaR:
        br = betaR[recipe]
    prediction = alpha + bu + br
    MSE += (rating - prediction) ** 2

MSE /= test_raw_interactions_df.shape[0]
print("Test MSE = " + str(MSE))

Test MSE = 1.6571930897383143


In [60]:
userIDs = {}
recipeIDs = {}
interactions = []

for i in range(raw_interactions_df.shape[0]):
    row = raw_interactions_df.iloc[i]
    user = row['user_id']
    recipe = row['recipe_id']
    rating = row['rating']
    if not user in userIDs: userIDs[user] = len(userIDs)
    if not recipe in recipeIDs: recipeIDs[recipe] = len(recipeIDs)
    interactions.append((user,recipe,rating))

random.shuffle(interactions)
len(interactions)

1132367

In [61]:
nTrain = int(len(interactions) * 0.9)
nTest = len(interactions) - nTrain
interactionsTrain = interactions[:nTrain]
interactionsTest = interactions[nTrain:]

In [62]:
recipesPerUser = defaultdict(list)
usersPerRecipe = defaultdict(list)
for user,recipe,rating in interactionsTrain:
    recipesPerUser[user].append(recipe)
    usersPerRecipe[recipe].append(user)

In [315]:
mu = sum([rating for _,_,rating in interactionsTrain]) / len(interactionsTrain)
optimizer = tf.keras.optimizers.Adam(0.1)

In [316]:
class LatentFactorModel(tf.keras.Model):
    def __init__(self, mu, K, lamb):
        super(LatentFactorModel, self).__init__()
        # Initialize to average
        self.alpha = tf.cast(tf.Variable(mu), dtype=tf.float32)
        # Initialize to small random values
        self.betaU = tf.Variable(tf.random.normal([len(userIDs)],stddev=0.001))
        self.betaR = tf.Variable(tf.random.normal([len(recipeIDs)],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaR = tf.Variable(tf.random.normal([len(recipeIDs),K],stddev=0.001))
        self.lamb = lamb

    # Prediction for a single instance (useful for evaluation)
    def predict(self, u, r):
        prediction = self.alpha + self.betaU[u] + self.betaR[r] +\
            tf.tensordot(self.gammaU[u], self.gammaR[r], 1)
        return prediction

    # Regularizer
    def reg(self):
        return self.lamb * (tf.reduce_sum(self.betaU**2) +\
                            tf.reduce_sum(self.betaR**2) +\
                            tf.reduce_sum(self.gammaU**2) +\
                            tf.reduce_sum(self.gammaR**2))
    
    # Prediction for a sample of instances
    def predictSample(self, sampleU, sampleR):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        r = tf.convert_to_tensor(sampleR, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_r = tf.nn.embedding_lookup(self.betaR, r)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_r = tf.nn.embedding_lookup(self.gammaR, r)
        prediction = self.alpha + beta_u + beta_r + tf.reduce_sum(tf.multiply(gamma_u, gamma_r), 1)
        return prediction
    
    # Loss
    def call(self, sampleUser, sampleRecipe, sampleRating):
        prediction = self.predictSample(sampleUser, sampleRecipe)
        rating = tf.convert_to_tensor(sampleRating, dtype=tf.float32)
        return tf.nn.l2_loss(prediction - rating) / len(sampleRating)

In [317]:
modelLFM = LatentFactorModel(mu, 5, 0.00001)

In [318]:
def trainingStep(model, interactions):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        sampleUser, sampleRecipe, sampleRating = [], [], []
        for _ in range(Nsamples):
            user,recipe,rating = random.choice(interactions)
            sampleUser.append(userIDs[user])
            sampleRecipe.append(recipeIDs[recipe])
            sampleRating.append(rating)

        loss = model(sampleUser,sampleRecipe,sampleRating)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

In [319]:
for i in range(100):
    obj = trainingStep(modelLFM, interactionsTrain)
    print("iteration " + str(i+1) + ", objective = " + str(obj))

iteration 1, objective = 0.78203887


KeyboardInterrupt: 

In [322]:
# Testing our model
MSE = 0
for (i,(user,recipe,rating)) in enumerate(interactionsTest):
    if i % 10000 == 0: print(i, "/", len(interactionsTest))
    prediction = modelLFM.predict(userIDs[user], recipeIDs[recipe])
    MSE += (rating - prediction) ** 2

MSE /= len(interactionsTest)
print("Test MSE = " + str(MSE))

0 / 113237
10000 / 113237
20000 / 113237
30000 / 113237
40000 / 113237
50000 / 113237
60000 / 113237
70000 / 113237
80000 / 113237
90000 / 113237
100000 / 113237
110000 / 113237
Test MSE = tf.Tensor(1.5663366, shape=(), dtype=float32)


In [325]:
# Testing our model
MSE = 0

predictions = modelLFM.predictSample([interaction[0] for interaction in interactionsTest],[interaction[1] for interaction in interactionsTest])

for i in range(len(interactionsTest)):
    MSE += (interactionsTest[i][2] - predictions[i]) ** 2

MSE /= len(interactionsTest)
print("Test MSE = " + str(MSE))

Test MSE = tf.Tensor(1.593523, shape=(), dtype=float32)
