In [1]:
!pip install kagglehub --upgrade

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import os
import pandas as pd
import kagglehub
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

In [5]:
path = kagglehub.dataset_download("shuyangli94/food-com-recipes-and-user-interactions")
print("Path to dataset files:", path)

# List all files in the dataset directory
dataset_files = os.listdir(path)
print("Files in the dataset:", dataset_files)

Path to dataset files: /home/jweston/.cache/kagglehub/datasets/shuyangli94/food-com-recipes-and-user-interactions/versions/2
Files in the dataset: ['RAW_recipes.csv', 'PP_users.csv', 'interactions_test.csv', 'interactions_train.csv', 'PP_recipes.csv', 'ingr_map.pkl', 'interactions_validation.csv', 'RAW_interactions.csv']


In [6]:
# Dictionary to store DataFrames for each useful file
dataframes = {}

# Load each useful CSV file into a DataFrame dictionary
for file_name in dataset_files:
    file_path = os.path.join(path, file_name)
    if file_name.endswith('.csv') and os.path.exists(file_path):
        print(f"Loading {file_name} into a DataFrame...")
        df_name = file_name.split('.')[0]  # Use filename without extension as key
        dataframes[df_name] = pd.read_csv(file_path)
        print(f"Loaded {file_name} with {dataframes[df_name].shape[0]} rows and {dataframes[df_name].shape[1]} columns.")

Loading RAW_recipes.csv into a DataFrame...
Loaded RAW_recipes.csv with 231637 rows and 12 columns.
Loading PP_users.csv into a DataFrame...
Loaded PP_users.csv with 25076 rows and 6 columns.
Loading interactions_test.csv into a DataFrame...
Loaded interactions_test.csv with 12455 rows and 6 columns.
Loading interactions_train.csv into a DataFrame...
Loaded interactions_train.csv with 698901 rows and 6 columns.
Loading PP_recipes.csv into a DataFrame...
Loaded PP_recipes.csv with 178265 rows and 8 columns.
Loading interactions_validation.csv into a DataFrame...
Loaded interactions_validation.csv with 7023 rows and 6 columns.
Loading RAW_interactions.csv into a DataFrame...
Loaded RAW_interactions.csv with 1132367 rows and 5 columns.


In [7]:
raw_interactions_df = dataframes.get('RAW_interactions')

In [8]:
# Split interactions into training and testing dataframes
train_raw_interactions_df = raw_interactions_df[0:1000000]
test_raw_interactions_df = raw_interactions_df[1000000:]
test_raw_interactions_df.head()

Unnamed: 0,user_id,recipe_id,date,rating,review
1000000,774665,247898,2008-08-17,4,"This was good and my family enjoyed it, but it..."
1000001,428060,247898,2009-04-21,4,This was very tasty. We used turkey served ove...
1000002,911707,247898,2009-08-24,5,"Very good recipe. I would marinade it longer, ..."
1000003,1244621,247898,2009-08-31,4,This was pretty good. I will say however that...
1000004,939002,247898,2011-04-08,3,I tried this recipe tonight and it was just ok...


In [9]:
# Iterate through training set to find values
globalAvg = 0
ratingsPerUser = defaultdict(list)
ratingsPerRecipe = defaultdict(list)

for i in range(train_raw_interactions_df.shape[0]):
    row = train_raw_interactions_df.iloc[i]
    rating, user, recipe = row["rating"], row["user_id"], row["recipe_id"]
    globalAvg += rating
    ratingsPerUser[user].append((recipe, rating))
    ratingsPerRecipe[recipe].append((user, rating))

globalAvg /= train_raw_interactions_df.shape[0]
globalAvg

4.409849

In [10]:
# Baseline: Find MSE of predicting global average every time
MSE = 0
for i in range(test_raw_interactions_df.shape[0]):
    row = test_raw_interactions_df.iloc[i]
    error = (row["rating"] - globalAvg) ** 2
    MSE += error

MSE /= test_raw_interactions_df.shape[0]

print("Test MSE (Using Global Averages) = " + str(MSE))

Test MSE (Using Global Averages) = 1.5624910195183335


In [11]:
# Baseline: Find MSE of predicting user average, or global average if there is none
MSE = 0
for i in range(test_raw_interactions_df.shape[0]):
    row = test_raw_interactions_df.iloc[i]
    if row["user_id"] in ratingsPerUser:
        userRatings = [r[1] for r in ratingsPerUser[row["user_id"]]]
        avgUserRating = sum(userRatings)/len(userRatings)
        error = (row["rating"] - avgUserRating) ** 2
    else:
        error = (row["rating"] - globalAvg) ** 2
    MSE += error

MSE /= test_raw_interactions_df.shape[0]

print("Test MSE (Using User Averages) = " + str(MSE))

Test MSE (Using User Averages) = 1.5401254614048214


In [12]:
# Baseline: Find MSE of predicting recipe average, or global average if there is none
MSE = 0
for i in range(test_raw_interactions_df.shape[0]):
    row = test_raw_interactions_df.iloc[i]
    if row["recipe_id"] in ratingsPerRecipe:
        recipeRatings = [r[1] for r in ratingsPerRecipe[row["recipe_id"]]]
        avgRecipeRating = sum(recipeRatings)/len(recipeRatings)
        error = (row["rating"] - avgRecipeRating) ** 2
    else:
        error = (row["rating"] - globalAvg) ** 2
    MSE += error

MSE /= test_raw_interactions_df.shape[0]

print("Test MSE (Using Recipe Averages) = " + str(MSE))

Test MSE (Using Recipe Averages) = 1.5821126648278145


In [13]:
# Baseline: Find MSE of predicting user average or recipe average (whichever has more), or global average if there is neither
MSE = 0
for i in range(test_raw_interactions_df.shape[0]):
    row = test_raw_interactions_df.iloc[i]
    user, recipe = row["user_id"], row["recipe_id"]
    if user in ratingsPerUser and (recipe not in ratingsPerRecipe or len(ratingsPerRecipe[recipe]) <= len(ratingsPerUser[user])):
        userRatings = [r[1] for r in ratingsPerUser[user]]
        avgRating = sum(userRatings) / len(userRatings)
        error = (row["rating"] - avgRating) ** 2
    elif recipe in ratingsPerRecipe and (user not in ratingsPerUser or len(ratingsPerRecipe[recipe]) >= len(ratingsPerUser[user])):
        recipeRatings = [r[1] for r in ratingsPerRecipe[recipe]]
        avgRating = sum(recipeRatings) / len(recipeRatings)
        error = (row["rating"] - avgRating) ** 2
    else:
        error = (row["rating"] - globalAvg) ** 2
    MSE += error

MSE /= test_raw_interactions_df.shape[0]

print("Test MSE (Using User & Recipe Averages) = " + str(MSE))

Test MSE (Using User & Recipe Averages) = 1.5451860262455706


In [14]:
# Iteration function for our model...
def iterate(lamb):
    newAlpha = 0
    for i in range(train_raw_interactions_df.shape[0]):
        row = train_raw_interactions_df.iloc[i]
        rating = row["rating"]
        user = row["user_id"]
        recipe = row["recipe_id"]
        newAlpha += rating - (betaU[user] + betaR[recipe])
    alpha = newAlpha / train_raw_interactions_df.shape[0]
    for user in ratingsPerUser:
        newBetaU = 0
        for recipe,rating in ratingsPerUser[user]:
            newBetaU += rating - (alpha + betaR[recipe])
        betaU[user] = newBetaU / (lamb + len(ratingsPerUser[user]))
    for recipe in ratingsPerRecipe:
        newBetaR = 0
        for user,rating in ratingsPerRecipe[recipe]:
            newBetaR += rating - (alpha + betaU[user])
        betaR[recipe] = newBetaR / (lamb + len(ratingsPerRecipe[recipe]))
    mse = 0
    for i in range(train_raw_interactions_df.shape[0]):
        row = train_raw_interactions_df.iloc[i]
        rating = row["rating"]
        user = row["user_id"]
        recipe = row["recipe_id"]
        prediction = alpha + betaU[user] + betaR[recipe]
        mse += (rating - prediction)**2
    regularizer = 0
    for u in betaU:
        regularizer += betaU[u]**2
    for r in betaR:
        regularizer += betaR[r]**2
    mse /= train_raw_interactions_df.shape[0]
    return mse, mse + lamb*regularizer

In [15]:
# Model generation...
# Values:
betaU = {}
betaR = {}
for u in ratingsPerUser:
    betaU[u] = 0

for r in ratingsPerRecipe:
    betaR[r] = 0

alpha = globalAvg

# Iterating:
iterations = 0
mse, objective, newMSE, newObjective = 0, 0, 0, 0
while iterations < 3: # or objective - newObjective > 0.0001:
    mse, objective = newMSE, newObjective
    newMSE, newObjective = iterate(1)
    iterations += 1
    print("Objective after "
        + str(iterations) + (" iterations = " if iterations != 1 else " iteration = ") + str(newObjective))
    print("MSE after "
        + str(iterations) + (" iterations = " if iterations != 1 else " iteration = ") + str(newMSE))

Objective after 1 iteration = 231933.72652197632
MSE after 1 iteration = 0.6837470587392875
Objective after 2 iterations = 211783.20725310096
MSE after 2 iterations = 0.6822491745464135
Objective after 3 iterations = 208170.59559952267
MSE after 3 iterations = 0.682594319265065


In [16]:
# Testing our model
MSE = 0
for i in range(test_raw_interactions_df.shape[0]):
    row = test_raw_interactions_df.iloc[i]
    rating = row["rating"]
    user = row["user_id"]
    recipe = row["recipe_id"]
    bu = 0
    br = 0
    if user in betaU:
        bu = betaU[user]
    if recipe in betaR:
        br = betaR[recipe]
    prediction = alpha + bu + br
    MSE += (rating - prediction) ** 2

MSE /= test_raw_interactions_df.shape[0]
print("Test MSE = " + str(MSE))

Test MSE = 1.4816742447245397
