In [1]:
!pip install kagglehub --upgrade
!pip install -Iv tensorflow==2.13.0
!pip install pandas
!pip install seaborn

Defaulting to user installation because normal site-packages is not writeable
Using pip 24.0 from /opt/conda/lib/python3.11/site-packages/pip (python 3.11)
Defaulting to user installation because normal site-packages is not writeable
Collecting tensorflow==2.13.0
  Obtaining dependency information for tensorflow==2.13.0 from https://files.pythonhosted.org/packages/ed/30/310fee0477ce46f722c561dd7e21eebca0d1d29bdb3cf4a2335b845fbba4/tensorflow-2.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached tensorflow-2.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting absl-py>=1.0.0 (from tensorflow==2.13.0)
  Obtaining dependency information for absl-py>=1.0.0 from https://files.pythonhosted.org/packages/a2/ad/e0d3c824784ff121c03cc031f944bc7e139a8f1870ffd2845cc2dd76f6c4/absl_py-2.1.0-py3-none-any.whl.metadata
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow==2

In [2]:
import os
import pandas as pd
import kagglehub
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import random
from collections import defaultdict

# Fetch the Food.com recipes / user-interactions dataset through kagglehub and
# show where the files were placed locally.
path = kagglehub.dataset_download(
    "shuyangli94/food-com-recipes-and-user-interactions"
)
print("Path to dataset files:", path)

# Enumerate every entry found in the dataset directory.
dataset_files = os.listdir(path)
print("Files in the dataset:", dataset_files)

# Holds one DataFrame per CSV file, keyed by the file's base name; it starts
# out empty and is populated when the CSV files are read.
dataframes = dict()


2024-12-04 06:05:19.657547: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-04 06:05:19.833711: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-04 06:05:19.835616: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Path to dataset files: /home/jweston/.cache/kagglehub/datasets/shuyangli94/food-com-recipes-and-user-interactions/versions/2
Files in the dataset: ['RAW_recipes.csv', 'PP_users.csv', 'interactions_test.csv', 'interactions_train.csv', 'PP_recipes.csv', 'ingr_map.pkl', 'interactions_validation.csv', 'RAW_interactions.csv']


In [3]:
# Read every CSV file shipped with the dataset into `dataframes`, keyed by the
# file name without its extension (e.g. 'RAW_interactions').
for file_name in dataset_files:
    # Non-CSV entries (such as pickles) are skipped.
    if not file_name.endswith('.csv'):
        continue
    file_path = os.path.join(path, file_name)
    if not os.path.exists(file_path):
        continue
    print(f"Loading {file_name} into a DataFrame...")
    # splitext removes only the final extension, so a file name that contains
    # additional dots keeps its full stem instead of being cut at the first dot.
    df_name = os.path.splitext(file_name)[0]
    dataframes[df_name] = pd.read_csv(file_path)
    n_rows, n_cols = dataframes[df_name].shape
    print(f"Loaded {file_name} with {n_rows} rows and {n_cols} columns.")

# The cells below are built on the raw interactions table. Fail here with a
# clear message instead of leaving `raw_interactions_df` as None and crashing
# later with an unrelated AttributeError.
if 'RAW_interactions' not in dataframes:
    raise FileNotFoundError(f"RAW_interactions.csv was not found in {path}")
raw_interactions_df = dataframes['RAW_interactions']


Loading RAW_recipes.csv into a DataFrame...
Loaded RAW_recipes.csv with 231637 rows and 12 columns.
Loading PP_users.csv into a DataFrame...
Loaded PP_users.csv with 25076 rows and 6 columns.
Loading interactions_test.csv into a DataFrame...
Loaded interactions_test.csv with 12455 rows and 6 columns.
Loading interactions_train.csv into a DataFrame...
Loaded interactions_train.csv with 698901 rows and 6 columns.
Loading PP_recipes.csv into a DataFrame...
Loaded PP_recipes.csv with 178265 rows and 8 columns.
Loading interactions_validation.csv into a DataFrame...
Loaded interactions_validation.csv with 7023 rows and 6 columns.
Loading RAW_interactions.csv into a DataFrame...
Loaded RAW_interactions.csv with 1132367 rows and 5 columns.


In [4]:
# Assign dense integer indices to users and recipes, and collect every
# interaction as a (user_id, recipe_id, rating) tuple in a list.
userIDs = {}      # raw user_id   -> 0-based index, in order of first appearance
recipeIDs = {}    # raw recipe_id -> 0-based index, in order of first appearance
interactions = []

# Walk the three needed columns in lockstep. Indexing the frame with .iloc on
# every iteration builds a full row Series each time, which is very slow on a
# table with more than a million rows.
for user, recipe, rating in zip(
    raw_interactions_df['user_id'],
    raw_interactions_df['recipe_id'],
    raw_interactions_df['rating'],
):
    if user not in userIDs:
        userIDs[user] = len(userIDs)
    if recipe not in recipeIDs:
        recipeIDs[recipe] = len(recipeIDs)
    interactions.append((user, recipe, rating))

# Randomise the order so that slicing the list by position yields random
# subsets. NOTE: the shuffle is unseeded, so the order differs between runs.
random.shuffle(interactions)


In [5]:
# Split interactions into training (90%) and testing (10%) sets.
# `interactions` is already shuffled when it is built, so it is sliced as-is.
# Shuffling again here would make this cell non-idempotent: every re-run would
# produce a different split while statistics and models computed from the
# previous split are still in memory.
nTrain = int(len(interactions) * 0.9)
interactionsTrain = interactions[:nTrain]
interactionsTest = interactions[nTrain:]

# Split training data into training (80%) and validation (20%) sets
train_split = int(len(interactionsTrain) * 0.8)
interactionsTrainSplit = interactionsTrain[:train_split]
interactionsVal = interactionsTrain[train_split:]

# Baseline: predict the global average training rating for every test interaction.
globalAvg = sum(rating for _, _, rating in interactionsTrain) / len(interactionsTrain)
MSE = sum((rating - globalAvg) ** 2 for _, _, rating in interactionsTest) / len(interactionsTest)
print("Test MSE (Using Global Averages) = " + str(MSE))



Test MSE (Using Global Averages) = 1.6081370643884438


In [6]:
# User-average baseline: predict each user's mean training rating, falling
# back to the global average for users that have no training interactions.
recipesPerUser = defaultdict(list)
usersPerRecipe = defaultdict(list)
ratingsPerUser = defaultdict(list)
ratingsPerRecipe = defaultdict(list)
for user, recipe, rating in interactionsTrain:
    recipesPerUser[user].append(recipe)
    usersPerRecipe[recipe].append(user)
    ratingsPerUser[user].append(rating)
    ratingsPerRecipe[recipe].append(rating)

# Compute every user's mean once, instead of re-summing that user's whole
# rating list for each of their test interactions.
userAverages = {u: sum(rs) / len(rs) for u, rs in ratingsPerUser.items()}

MSE = 0
for user, recipe, rating in interactionsTest:
    # .get() on the plain dict does not insert unseen users.
    avgUserRating = userAverages.get(user, globalAvg)
    MSE += (rating - avgUserRating) ** 2
MSE /= len(interactionsTest)
print("Test MSE (Using User Averages) = " + str(MSE))


Test MSE (Using User Averages) = 1.5925926749585777


In [7]:
# Latent Factor Model (original code)
class LatentFactorModel(tf.keras.Model):
    """Latent factor rating predictor: global offset, biases and embeddings.

    The rating of user ``u`` for recipe ``r`` is predicted as
    ``alpha + betaU[u] + betaR[r] + sum(gammaU[u] * gammaR[r])``.

    The parameter tables are sized from the module-level ``userIDs`` and
    ``recipeIDs`` dictionaries, so both must be populated before a model is
    constructed.
    """

    def __init__(self, mu, K, lamb):
        """Create the model parameters.

        Args:
            mu: Initial value of the global offset ``alpha``.
            K: Number of latent dimensions per user and per recipe.
            lamb: Weight applied to the squared-norm penalty in ``reg``.
        """
        super().__init__()
        # Cast first, then wrap in a Variable. Casting an existing Variable
        # returns a plain Tensor, which is not tracked in trainable_variables
        # and so would never be updated by the optimizer.
        self.alpha = tf.Variable(tf.cast(mu, dtype=tf.float32))
        self.betaU = tf.Variable(tf.random.normal([len(userIDs)], stddev=0.001))
        self.betaR = tf.Variable(tf.random.normal([len(recipeIDs)], stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs), K], stddev=0.001))
        self.gammaR = tf.Variable(tf.random.normal([len(recipeIDs), K], stddev=0.001))
        self.lamb = lamb

    def predictSample(self, sampleU, sampleR):
        """Predict ratings for paired user / recipe indices.

        Args:
            sampleU: Integer user indices (values of ``userIDs``).
            sampleR: Integer recipe indices (values of ``recipeIDs``), paired
                element-wise with ``sampleU``.

        Returns:
            A tensor with one predicted rating per (user, recipe) pair.
        """
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        r = tf.convert_to_tensor(sampleR, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_r = tf.nn.embedding_lookup(self.betaR, r)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_r = tf.nn.embedding_lookup(self.gammaR, r)
        # Row-wise dot product of the user and recipe factor vectors.
        interaction = tf.reduce_sum(tf.multiply(gamma_u, gamma_r), 1)
        return self.alpha + beta_u + beta_r + interaction

    def reg(self):
        """Return ``lamb`` times the summed squares of all biases and factors."""
        return self.lamb * (
            tf.reduce_sum(self.betaU**2)
            + tf.reduce_sum(self.betaR**2)
            + tf.reduce_sum(self.gammaU**2)
            + tf.reduce_sum(self.gammaR**2)
        )

    def call(self, sampleUser, sampleRecipe, sampleRating):
        """Return the (unregularised) training loss for a batch.

        The value is ``tf.nn.l2_loss(prediction - rating) / len(sampleRating)``.
        Because ``tf.nn.l2_loss`` computes ``sum(t ** 2) / 2``, this is half of
        the batch's mean squared error, not the MSE itself.
        """
        prediction = self.predictSample(sampleUser, sampleRecipe)
        rating = tf.convert_to_tensor(sampleRating, dtype=tf.float32)
        return tf.nn.l2_loss(prediction - rating) / len(sampleRating)


In [8]:
# Training step (original code)
def trainingStep(model, interactions, optimizer, Nsamples=250000):
    """Run one optimisation step on a random batch of interactions.

    Args:
        model: Model that returns a loss tensor when called with
            (user indices, recipe indices, ratings) and that exposes
            ``reg()`` and ``trainable_variables``.
        interactions: Sequence of (user_id, recipe_id, rating) tuples to
            sample from. The ids are translated through the module-level
            ``userIDs`` / ``recipeIDs`` dictionaries.
        optimizer: Optimizer whose ``apply_gradients`` performs the update.
        Nsamples: Number of interactions drawn, with replacement, per step.

    Returns:
        The batch loss including the regularisation term, as produced by
        ``loss.numpy()``.
    """
    # Draw the whole batch in one call and before opening the tape: sampling
    # is plain Python work that does not need to be recorded for gradients.
    batch = random.choices(interactions, k=Nsamples)
    sampleUser = [userIDs[user] for user, _, _ in batch]
    sampleRecipe = [recipeIDs[recipe] for _, recipe, _ in batch]
    sampleRating = [rating for _, _, rating in batch]

    with tf.GradientTape() as tape:
        loss = model(sampleUser, sampleRecipe, sampleRating)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    # Variables that did not take part in the loss have a None gradient and
    # are left out of the update.
    optimizer.apply_gradients(
        (grad, var)
        for grad, var in zip(gradients, model.trainable_variables)
        if grad is not None
    )
    return loss.numpy()



In [9]:
# Carve a validation set out of the training interactions: the first 80% are
# kept for fitting and the remaining 20% are held out. This recomputes the same
# split that the baseline cell already performs, so re-running it is harmless.
train_split = int(0.8 * len(interactionsTrain))
interactionsTrainSplit, interactionsVal = (
    interactionsTrain[:train_split],
    interactionsTrain[train_split:],
)

In [10]:
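# Earlier draft of the hyperparameter grid search, kept commented out for reference.
# It calls trainingStep(model, interactionsTrain) without the optimizer argument, so it
# cannot run as written; the working grid_search is defined in the next cell.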
# def grid_search(interactionsTrain, interactionsVal, userIDs, recipeIDs, globalAvg, K_values, lamb_values, lr_values, num_iterations=50):
#     best_params = None
#     best_val_MSE = float('inf')
    
#     for K in K_values:
#         for lamb in lamb_values:
#             for lr in lr_values:
#                 print(f"Testing K={K}, λ={lamb}, learning_rate={lr}")
#                 optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
#                 model = LatentFactorModel(globalAvg, K, lamb)
#                 for i in range(num_iterations):
#                     loss = trainingStep(model, interactionsTrain)
#                     if i % 10 == 0:
#                         print(f"Iteration {i+1}, loss: {loss}")
                
#                 val_MSE = 0
#                 val_predictions = model.predictSample(
#                     [userIDs[interaction[0]] for interaction in interactionsVal],
#                     [recipeIDs[interaction[1]] for interaction in interactionsVal]
#                 )
#                 for i in range(len(interactionsVal)):
#                     val_MSE += (interactionsVal[i][2] - val_predictions[i]) ** 2
#                 val_MSE /= len(interactionsVal)
#                 print(f"Validation MSE for K={K}, λ={lamb}, lr={lr}: {val_MSE}")
#                 if val_MSE < best_val_MSE:
#                     best_val_MSE = val_MSE
#                     best_params = (K, lamb, lr)
    
#     return best_params, best_val_MSE

In [11]:
# train_split = int(len(interactionsTrain) * 0.8)
# interactionsTrainSplit = interactionsTrain[:train_split]
# interactionsVal = interactionsTrain[train_split:]

def grid_search(interactionsTrain, interactionsVal, userIDs, recipeIDs, globalAvg, K_values, lamb_values, lr_values, num_iterations=10):
    """Try every (K, lambda, learning rate) combination and keep the best one.

    For each combination a fresh ``LatentFactorModel`` is trained for
    ``num_iterations`` calls of ``trainingStep`` and then scored by mean
    squared error on ``interactionsVal``.

    Note: ``trainingStep`` and ``LatentFactorModel`` read the module-level
    ``userIDs`` / ``recipeIDs``; the parameters of the same name are only used
    here to index the validation interactions.

    Args:
        interactionsTrain: (user_id, recipe_id, rating) tuples used for training.
        interactionsVal: (user_id, recipe_id, rating) tuples used for scoring.
        userIDs: Mapping from raw user id to integer index.
        recipeIDs: Mapping from raw recipe id to integer index.
        globalAvg: Initial value for the model's global offset.
        K_values: Latent dimensions to try.
        lamb_values: Regularisation weights to try.
        lr_values: Adam learning rates to try.
        num_iterations: Number of training steps per combination.

    Returns:
        ``(best_params, best_val_MSE)`` where ``best_params`` is the
        ``(K, lamb, lr)`` tuple with the lowest validation MSE, or ``None`` if
        no combination scored below infinity.

    Raises:
        ValueError: If ``interactionsVal`` is empty.
    """
    if len(interactionsVal) == 0:
        raise ValueError("interactionsVal must contain at least one interaction")

    # The validation indices and targets do not depend on the hyperparameters,
    # so they are built once rather than once per combination.
    val_users = [userIDs[interaction[0]] for interaction in interactionsVal]
    val_recipes = [recipeIDs[interaction[1]] for interaction in interactionsVal]
    val_ratings = tf.convert_to_tensor(
        [interaction[2] for interaction in interactionsVal], dtype=tf.float32
    )

    best_params = None
    best_val_MSE = float('inf')

    for K in K_values:
        for lamb in lamb_values:
            for lr in lr_values:
                print(f"Testing K={K}, λ={lamb}, learning_rate={lr}")
                optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
                model = LatentFactorModel(globalAvg, K, lamb)
                for i in range(num_iterations):
                    loss = trainingStep(model, interactionsTrain, optimizer)
                    if i % 10 == 0:
                        print(f"Iteration {i+1}, loss: {loss}")

                # One vectorised reduction instead of indexing the prediction
                # tensor element by element in a Python loop.
                val_predictions = model.predictSample(val_users, val_recipes)
                val_MSE = tf.reduce_mean(tf.square(val_ratings - val_predictions))
                print(f"Validation MSE for K={K}, λ={lamb}, lr={lr}: {val_MSE}")
                if val_MSE < best_val_MSE:
                    best_val_MSE = val_MSE
                    best_params = (K, lamb, lr)

    return best_params, best_val_MSE


In [12]:
# Hyperparameter grid: one latent dimension, three regularisation weights and
# three learning rates.
K_values = [8]
lamb_values = [1e-4, 1e-3, 1e-2]
lr_values = [0.01, 0.05, 0.1]

# Train on the training split and score each combination on the validation split.
best_params, best_val_MSE = grid_search(
    interactionsTrainSplit,
    interactionsVal,
    userIDs,
    recipeIDs,
    globalAvg,
    K_values=K_values,
    lamb_values=lamb_values,
    lr_values=lr_values,
)

print(f"Best parameters: K={best_params[0]}, λ={best_params[1]}, learning_rate={best_params[2]}")
print(f"Best Validation MSE: {best_val_MSE}")


Testing K=8, λ=0.0001, learning_rate=0.01


2024-12-04 06:06:53.255731: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Iteration 1, loss: 0.7943146228790283
Validation MSE for K=8, λ=0.0001, lr=0.01: 1.564568042755127
Testing K=8, λ=0.0001, learning_rate=0.05
Iteration 1, loss: 0.8058547973632812
Validation MSE for K=8, λ=0.0001, lr=0.05: 1.5292800664901733
Testing K=8, λ=0.0001, learning_rate=0.1
Iteration 1, loss: 0.7970572710037231
Validation MSE for K=8, λ=0.0001, lr=0.1: 1.5359610319137573
Testing K=8, λ=0.001, learning_rate=0.01
Iteration 1, loss: 0.8078670501708984
Validation MSE for K=8, λ=0.001, lr=0.01: 1.581279993057251
Testing K=8, λ=0.001, learning_rate=0.05
Iteration 1, loss: 0.8016877174377441
Validation MSE for K=8, λ=0.001, lr=0.05: 1.5717825889587402
Testing K=8, λ=0.001, learning_rate=0.1
Iteration 1, loss: 0.8042346835136414
Validation MSE for K=8, λ=0.001, lr=0.1: 1.5722134113311768
Testing K=8, λ=0.01, learning_rate=0.01
Iteration 1, loss: 0.836323082447052
Validation MSE for K=8, λ=0.01, lr=0.01: 1.5968083143234253
Testing K=8, λ=0.01, learning_rate=0.05
Iteration 1, loss: 0.8406

In [14]:
# Fit a fresh model with the selected (K, lambda, learning rate) on the full
# training set, i.e. the training split plus the validation split.
final_model = LatentFactorModel(mu=globalAvg, K=best_params[0], lamb=best_params[1])
optimizer = tf.keras.optimizers.Adam(learning_rate=best_params[2])

for i in range(100):
    loss = trainingStep(final_model, interactionsTrain, optimizer)
    # Report the loss on every tenth step.
    if not i % 10:
        print(f"Iteration {i+1}, loss: {loss}")


Iteration 1, loss: 0.7909595370292664
Iteration 11, loss: 0.7872207164764404
Iteration 21, loss: 0.7802796363830566
Iteration 31, loss: 0.7818827033042908
Iteration 41, loss: 0.7761270403862
Iteration 51, loss: 0.7837302684783936
Iteration 61, loss: 0.7791503071784973
Iteration 71, loss: 0.7825021743774414
Iteration 81, loss: 0.7894480228424072
Iteration 91, loss: 0.7632641792297363


In [15]:
# Evaluate on test set
test_predictions = final_model.predictSample(
    [userIDs[interaction[0]] for interaction in interactionsTest],
    [recipeIDs[interaction[1]] for interaction in interactionsTest]
)

# Compute the mean squared error with a single vectorised reduction rather
# than indexing the prediction tensor once per test interaction in Python.
test_ratings = tf.convert_to_tensor(
    [interaction[2] for interaction in interactionsTest], dtype=tf.float32
)
test_MSE = tf.reduce_mean(tf.square(test_ratings - test_predictions))

print(f"Final Test MSE: {test_MSE}")

Final Test MSE: 1.538376808166504


In [16]:
# Show the hyperparameters chosen by the grid search as a (K, lambda, learning_rate) tuple.
print(best_params)

(8, 0.0001, 0.05)
