In [1]:
import dataclasses

import itertools
import os
import requests
import time
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.metrics import mean_squared_error

from IPython.display import display

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from tensorflow.keras.optimizers import Adam

train = pd.read_csv("data/train.csv")
movies_df = pd.read_csv("data/movies.csv")

data = train.merge(movies_df, on="movie_id", how="left")


def weighted_rmse(y_true, y_pred):
    total_ratings = tf.reduce_sum(tf.cast(~tf.math.is_nan(y_true), tf.float32), axis=0)  # count of ratings per movie
    weights = tf.where(total_ratings > 0, 1.0 / tf.sqrt(total_ratings), 0.0)
    y_true = tf.where(tf.math.is_nan(y_true), 0.0, y_true)
    weighted_error = weights * tf.square(y_pred - y_true)
    return tf.sqrt(tf.reduce_sum(weighted_error) / tf.reduce_sum(weights))



# Map user IDs and movie IDs to unique indices
user_ids = data['user_id'].unique()
movie_ids = data['movie_id'].unique()
user_to_index = {user: idx for idx, user in enumerate(user_ids)}
movie_to_index = {movie: idx for idx, movie in enumerate(movie_ids)}
data['user_id'] = data['user_id'].map(user_to_index)
data['movie_id'] = data['movie_id'].map(movie_to_index)

# Fill NaN ratings with placeholder (will not be used during training)
data['rating_filled'] = data['rating'].fillna(0.0)

# Define parameters
num_users = len(user_ids)
num_movies = len(movie_ids)
embedding_dim = 32  # Reduced size for faster training

# Split data into training and test sets
X = data[['user_id', 'movie_id']].values
y = data['rating'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the model
# User input and embedding
user_input = Input(shape=(1,), name="user_input")
user_embedding = Embedding(num_users, embedding_dim, name="user_embedding")(user_input)
user_vec = Flatten(name="user_flatten")(user_embedding)

# Movie input and embedding
movie_input = Input(shape=(1,), name="movie_input")
movie_embedding = Embedding(num_movies, embedding_dim, name="movie_embedding")(movie_input)
movie_vec = Flatten(name="movie_flatten")(movie_embedding)

# Concatenate and dense layers
concat = Concatenate()([user_vec, movie_vec])
dense = Dense(64, activation="relu")(concat)
dense = Dropout(0.3)(dense)  # Lower dropout for faster convergence
dense = Dense(32, activation="relu")(dense)
output = Dense(1, activation="linear", name="output")(dense)

# Compile the model
model = Model([user_input, movie_input], output)
model.compile(optimizer=Adam(learning_rate=0.001), loss=weighted_rmse, metrics=[weighted_rmse])

# Train the model
history = model.fit(
    [X_train[:, 0], X_train[:, 1]],
    np.nan_to_num(y_train, nan=0.0),  # Replace NaN ratings with 0 for training
    epochs=5,  # Reduced epochs for faster execution
    batch_size=128,  # Larger batch size for efficient GPU/CPU usage
    validation_split=0.2,
    verbose=1
)

# Save the model
model.save("movielens_recommender_model.h5")
print("Model saved as 'movielens_recommender_model.h5'")

# Evaluate the model
results = model.evaluate([X_test[:, 0], X_test[:, 1]], np.nan_to_num(y_test, nan=0.0), verbose=1)
print("Test WRMSE:", results[1])

# Load ratings_submission.csv and update predictions
def update_submission_file():
    submission_file = 'data/ratings_submission.csv'
    submission_data = pd.read_csv(submission_file)

    # Parse user_id and movie_id from the 'id' column
    submission_data[['user_id', 'movie_id']] = submission_data['id'].str.split('_', expand=True)
    submission_data['user_id'] = submission_data['user_id'].astype(int)
    submission_data['movie_id'] = submission_data['movie_id'].astype(int)

    # Map user and movie IDs to model indices
    submission_data['user_id'] = submission_data['user_id'].map(user_to_index)
    submission_data['movie_id'] = submission_data['movie_id'].map(movie_to_index)

    # Predict ratings
    predictions = model.predict([
        submission_data['user_id'].values,
        submission_data['movie_id'].values
    ]).flatten()

    # Update predictions in the submission file
    submission_data['prediction'] = predictions
    submission_data[['id', 'prediction']].to_csv('updated_ratings_submission.csv', index=False)
    print("Updated submission file 'updated_ratings_submission.csv' created.")

# Update the submission file
update_submission_file()

In [None]:
import tensorflow as tf
from tensorflow import keras
import os
import random

# Set random seeds for reproducibility
tf.random.set_seed(1)
np.random.seed(1)
random.seed(1)

In [None]:
train = pd.read_csv("data/train.csv")
movies_df = pd.read_csv("data/movies.csv")
tags = pd.read_csv("data/tags.csv")
submission = pd.read_csv("data/ratings_submission.csv")

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow import keras

In [None]:
# Merge datasets
merged_df = train.merge(movies_df, on="movie_id", how="left")

# Extract year from title and process genres
merged_df['year'] = merged_df['title'].str.extract(r'\((\d{4})\)', expand=False)
merged_df['year'] = pd.to_numeric(merged_df['year'], errors='coerce')
merged_df['title'] = merged_df['title'].str.replace(r' \(\d{4}\)$', '', regex=True)
genre_dummies = merged_df['genres'].str.get_dummies(sep='|').astype(int)
merged_df = pd.concat([merged_df, genre_dummies], axis=1)
merged_df.drop(columns=['genres'], inplace=True)
merged_df.fillna(0, inplace=True)



In [None]:
# Train-validation split
df_train, df_val = train_test_split(merged_df, test_size=0.05, random_state=1)

# Calculate movie weights
movie_counts = df_train['movie_id'].value_counts()
movie_weights_train = 1 / np.sqrt(movie_counts.reindex(df_train['movie_id']).fillna(1).values)
movie_weights_val = 1 / np.sqrt(movie_counts.reindex(df_val['movie_id']).fillna(1).values)

# Define custom W-RMSE loss function
def weighted_rmse_loss(movie_weights):
    def loss(y_true, y_pred):
        errors = movie_weights * keras.backend.square(y_true - y_pred)
        return keras.backend.sqrt(keras.backend.sum(errors) / keras.backend.sum(movie_weights))
    return loss

# Define model
hidden_units = (32, 4)
movie_embedding_size = 8
user_embedding_size = 8

user_id_input = keras.Input(shape=(1,), name='user_id')
movie_id_input = keras.Input(shape=(1,), name='movie_id')

user_embedded = keras.layers.Embedding(df_train['user_id'].max() + 1, user_embedding_size, input_length=1, name='user_embedding')(user_id_input)
movie_embedded = keras.layers.Embedding(df_train['movie_id'].max() + 1, movie_embedding_size, input_length=1, name='movie_embedding')(movie_id_input)

concatenated = keras.layers.Concatenate()([user_embedded, movie_embedded])
out = keras.layers.Flatten()(concatenated)

for n_hidden in hidden_units:
    out = keras.layers.Dense(n_hidden, activation='relu')(out)
out = keras.layers.Dense(1, activation='linear', name='prediction')(out)

model = keras.Model(inputs=[user_id_input, movie_id_input], outputs=out)
model.compile(optimizer=Adam(learning_rate=0.005), loss=weighted_rmse_loss(movie_weights_train))

# Train the model
history = model.fit(
    [df_train['user_id'], df_train['movie_id']],
    df_train['rating'],  # Target column is now 'rating'
    batch_size=5000,
    epochs=20,
    verbose=1,
    validation_data=(
        [df_val['user_id'], df_val['movie_id']], df_val['rating']  # Validation target is also 'rating'
    )
)

# Evaluate W-RMSE on validation set
y_pred = model.predict([df_val['user_id'], df_val['movie_id']], verbose=0).flatten()
y_true = df_val['rating'].values
wrmse = weighted_rmse_loss(movie_weights_val)(y_true, y_pred).numpy()
print(f"Validation W-RMSE: {wrmse:.6f}")

# Generate predictions for submission
submission[['user_id', 'movie_id']] = submission['id'].str.split('_', expand=True).astype(int)
user_ids = submission['user_id'].values
movie_ids = submission['movie_id'].values

predictions = np.zeros(len(submission))
BATCH_SIZE = 10000
for i in tqdm(range((len(submission) + BATCH_SIZE - 1) // BATCH_SIZE)):
    start_idx = i * BATCH_SIZE
    end_idx = min((i + 1) * BATCH_SIZE, len(submission))
    batch_users = user_ids[start_idx:end_idx].reshape(-1, 1)
    batch_movies = movie_ids[start_idx:end_idx].reshape(-1, 1)
    predictions[start_idx:end_idx] = model.predict([batch_users, batch_movies], verbose=0).flatten()

final_submission = pd.DataFrame({
    'id': submission['id'],
    'prediction': predictions
})
final_submission.to_csv('submission_predictions.csv', index=False)
print("Submission saved as 'submission_predictions.csv'")

In [None]:
# Display both DataFrames
display("train df:", train.head(5))
display("movies df:", movies_df.head(5))
display("tags df:", tags.head(5))
display("submission df:", submission.head(5))

In [None]:
print(f"number of rows in train df: {train.shape[0]:,}")
print(f"number of rows in movies df: {movies_df.shape[0]:,}")
print(f"number of rows in tags df: {tags.shape[0]:,}")
print(f"number of rows in submission df: {submission.shape[0]:,}")

In [None]:
unique_movie_ids = movies_df['movie_id'].nunique()
print(f"Number of unique values in the 'movie_id' column: {unique_movie_ids:,}")

unique_user_ids = train['user_id'].nunique()
print(f"Number of unique values in the 'user_id' column: {unique_user_ids:,}")

In [None]:
merged_df = train.merge(movies_df, on="movie_id", how="left")

# Calculate the mean-centered 'y' column
# The y column in the table is the result of subtracting the mean rating from each individual rating in the rating 
# column. This process is known as mean-centering. Here's how it's calculated:
mean_rating = merged_df['rating'].mean()
merged_df['y'] = merged_df['rating'] - mean_rating

merged_df.head(5)

In [None]:
# Extract year from title using raw string to fix the SyntaxWarning
merged_df['year'] = merged_df['title'].str.extract(r'\((\d{4})\)', expand=False)
merged_df['year'] = pd.to_numeric(merged_df['year'], errors='coerce')

# Remove the year from the title
merged_df['title'] = merged_df['title'].str.replace(r' \(\d{4}\)$', '', regex=True)

# Create dummy columns for genres, using integers (0/1)
genre_dummies = merged_df['genres'].str.get_dummies(sep='|').astype(int)
merged_df = pd.concat([merged_df, genre_dummies], axis=1)

# Drop the original genres column
merged_df.drop(columns=['genres'], inplace=True)

In [None]:
merged_df.fillna(0, inplace=True)

In [None]:
merged_df.dtypes

In [None]:
print(f"the shape of merged_df: {merged_df.shape}")

In [None]:
n_movies = len(merged_df.movie_id.unique())
n_users = len(merged_df.user_id.unique())
print(
    "{1:,} distinct users rated {0:,} different movies (total ratings = {2:,})".format(
        n_movies, n_users, len(merged_df),
    )
)

In [None]:
df = merged_df.copy()

In [None]:
# https://www.kaggle.com/code/colinmorris/embedding-layers

hidden_units = (32,4)
movie_embedding_size = 8
user_embedding_size = 8

# Each instance will consist of two inputs: a single user id, and a single movie id
user_id_input = keras.Input(shape=(1,), name='user_id')
movie_id_input = keras.Input(shape=(1,), name='movie_id')
user_embedded = keras.layers.Embedding(df['user_id'].max()+1, user_embedding_size, 
                                       input_length=1, name='user_embedding')(user_id_input)
movie_embedded = keras.layers.Embedding(df['movie_id'].max()+1, movie_embedding_size, 
                                        input_length=1, name='movie_embedding')(movie_id_input)
# Concatenate the embeddings (and remove the useless extra dimension)
concatenated = keras.layers.Concatenate()([user_embedded, movie_embedded])
out = keras.layers.Flatten()(concatenated)

# Add one or more hidden layers
for n_hidden in hidden_units:
    out = keras.layers.Dense(n_hidden, activation='relu')(out)

# A single output: our predicted rating
out = keras.layers.Dense(1, activation='linear', name='prediction')(out)

model = keras.Model(
    inputs = [user_id_input, movie_id_input],
    outputs = out,
)
model.summary(line_length=88)

In [None]:
from tensorflow.keras.optimizers import Adam

# Compiling the model
model.compile(
    optimizer=Adam(learning_rate=0.005),  # Updated optimizer
    loss='mean_squared_error',           # Updated loss function for clarity
    metrics=['mean_absolute_error']      # Updated metric
)

In [None]:
history = model.fit(
    [df['user_id'], df['movie_id']],
#    df['y'],
    df['rating'],
    batch_size=5000,
    epochs=20,
    verbose=0,
    validation_split=.05,
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

df_train, df_val = train_test_split(df, test_size=.05, random_state=1)

def get_metrics(y_true, y_pred):
    return metrics.mean_absolute_error(y_true, y_pred), metrics.mean_squared_error(y_true, y_pred)

mean_rating = df_train['rating'].mean()
print("Average rating in training set is {:.2f} stars".format(mean_rating))

y_true = df_val['rating'].values
always_mean = np.full(y_true.shape, mean_rating)

mae, mse = get_metrics(y_true, always_mean)
print("Always predicting global average rating results in Mean Absolute Error={:.2f}, Mean Squared Error={:.2f}".format(
    mae, mse))

movies = movies_df.copy().set_index('movie_id')
mean_per_movie = df_train.groupby('movie_id')['rating'].mean()
movies['mean_rating'] = mean_per_movie
ratings_per_movie = df_train.groupby('movie_id').size()
movies['n_ratings'] = ratings_per_movie
# There are a few movies in the validation set not present in the training set. We'll just use the global
# mean rating in their case.
y_movie_mean = df_val.join(mean_per_movie, on='movie_id', rsuffix='mean')['ratingmean'].fillna(mean_rating).values

mae, mse = get_metrics(y_true, y_movie_mean)
print("Predicting mean per movie results in Mean Absolute Error={:.2f}, Mean Squared Error={:.2f}".format(mae, mse))

In [None]:
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(history.epoch, history.history['val_mean_absolute_error'], label='Validation MAE')
ax.plot(history.epoch, history.history['mean_absolute_error'], label='Training MAE')
ax.set_xlabel('Epoch')
ax.set_ylabel('Mean Absolute Error')
ax.set_xlim(left=0, right=history.epoch[-1])
baseline_mae = 0.73
ax.axhline(baseline_mae, ls='--', label='Baseline', color='#002255', alpha=.5)
ax.grid()
fig.legend()

In [None]:
# Save training history for later comparison
hdf = pd.DataFrame(dict(
    epoch=history.epoch,
    val_mae=history.history['val_mean_absolute_error'],
    train_mae=history.history['mean_absolute_error'],
))
hdf.to_csv('history-1.csv')

In [None]:
ratings_per_user = df.groupby('user_id').size()
uid = ratings_per_user[ratings_per_user < 30].sample(1, random_state=1).index[0]
user_ratings = df[df['user_id']==uid]
print("User #{} has rated {} movies (avg. rating = {:.1f}):".format(
    uid, len(user_ratings), user_ratings['rating'].mean(),
))
cols = ['user_id', 'movie_id', 'rating', 'title', 'year']
user_ratings.sort_values(by='rating', ascending=False)[cols]

In [None]:
movies

In [None]:
candidate_movies = movies.loc[movies.index == 1188].copy()
uid = 0
uid

In [None]:
candidate_movies

In [None]:
# candidate_movies = movies[
#     movies.title.str.contains('Naked Gun')
#     | (movies.title == 'The Sisterhood of the Traveling Pants')
#     | (movies.title == 'Lilo & Stitch')
# ].copy()


# Convert inputs to numpy arrays with correct shape
user_ids = np.array([uid] * len(candidate_movies))
movie_ids = np.array(candidate_movies.index)

# Reshape to match the model's expected input shape (samples, 1)
user_ids = user_ids.reshape(-1, 1)
movie_ids = movie_ids.reshape(-1, 1)

# Make predictions
preds = model.predict([user_ids, movie_ids])

# Rest of the code remains the same
row = df.iloc[0]
y_delta = row.rating - row.y
candidate_movies['predicted_rating_delta'] = preds + y_delta
candidate_movies['predicted_rating'] = preds
candidate_movies['delta'] = candidate_movies['predicted_rating'] - candidate_movies['mean_rating']
candidate_movies.sort_values(by='delta', ascending=False)

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Read the submission template
submission_df = pd.read_csv('data/ratings_submission.csv')

# Extract user_ids and movie_ids from the id column
submission_df[['user_id', 'movie_id']] = submission_df['id'].str.split('_', expand=True).astype(int)

# Convert to numpy arrays for faster processing
user_ids = submission_df['user_id'].values
movie_ids = submission_df['movie_id'].values

# Batch size for processing
BATCH_SIZE = 10000
n_samples = len(submission_df)
n_batches = (n_samples + BATCH_SIZE - 1) // BATCH_SIZE

# Initialize array for predictions
predictions = np.zeros(n_samples)

# Process in batches
print("Generating predictions...")
for i in tqdm(range(n_batches)):
    start_idx = i * BATCH_SIZE
    end_idx = min((i + 1) * BATCH_SIZE, n_samples)
    
    # Get batch data
    batch_users = user_ids[start_idx:end_idx].reshape(-1, 1)
    batch_movies = movie_ids[start_idx:end_idx].reshape(-1, 1)
    
    # Make predictions for the batch
    batch_preds = model.predict(
        [batch_users, batch_movies],
        verbose=0,
        batch_size=BATCH_SIZE
    )
    
    # Store predictions
    predictions[start_idx:end_idx] = batch_preds.flatten()

# Adjust predictions with y_delta
# predictions += y_delta

# Create final submission dataframe
final_submission = pd.DataFrame({
    'id': submission_df['id'],
    'prediction': predictions
})

# Save the submission file
final_submission.to_csv('submission_predictions.csv', index=False)

# Display first few rows
print("\nFirst few predictions:")
print(final_submission.head())

# Display basic statistics
print("\nPrediction statistics:")
print(final_submission['prediction'].describe())

In [None]:
# Save the complete model (architecture + weights)
model.save('recommendation_model.h5')

# Save just the weights
model.save_weights('model.weights.h5')  # Note the .weights.h5 extension

# To load the model back
from tensorflow import keras

# Method 1: Load the complete model
loaded_model = keras.models.load_model('recommendation_model.h5')

# Method 2: If you need to recreate the model architecture and load weights separately
# First recreate the model architecture (you'll need the same architecture code as before)
loaded_model = keras.Model(
    inputs = [user_id_input, movie_id_input],
    outputs = out,
)
# Then load the weights
loaded_model.load_weights('model.weights.h5')  # Note the .weights.h5 extension

# Compile the loaded model
loaded_model.compile(
    optimizer=Adam(learning_rate=0.005),
    loss='mean_squared_error',
    metrics=['mean_absolute_error']
)

# Verify the loaded model works
# Make a test prediction
test_user = np.array([[1]])  # reshape to (1,1) for single prediction
test_movie = np.array([[1]])
prediction = loaded_model.predict([test_user, test_movie])
print(f"Test prediction: {prediction[0][0]}")

In [2]:
train_df = pd.read_csv("data/train.csv")
train_df.dropna(subset=['rating'], inplace=True)

In [4]:
ratings = train_df.pivot(
    index="user_id",
    columns="movie_id",
    values="rating",
).fillna(0)


print(f"the shape of ratings: {ratings.shape}")
ratings.head()

the shape of ratings: (100000, 2000)


movie_id,0,1,3,4,6,8,9,11,12,13,...,206851,207323,208693,209959,210855,217459,225183,254732,262997,270688
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,4.0,4.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
def train_test_split(
    ratings: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Splits rating matrix to train and test.

    Args:
        ratings: rating matrix.

    Returns:
        Train and test matrices.
    """
    test = ratings.copy()
    test.iloc[:, :] = 0
    train = ratings.copy()
    for user in range(ratings.shape[0]):
        non_zero_items = ratings.iloc[user, :].to_numpy().nonzero()[0]
        test_ratings = np.random.choice(
            non_zero_items, size=10, replace=False)
        train.iloc[user, test_ratings] = 0
        test.iloc[user, test_ratings] = ratings.iloc[user, test_ratings]

    # Test and training are truly disjoint.
    assert(np.all((train * test) == 0))
    return train, test

train, test = train_test_split(ratings)
train.shape, test.shape

((100000, 2000), (100000, 2000))

In [6]:
@dataclasses.dataclass(frozen=True)
class Metrics:
    # RMSE of training set.
    rmse_train: np.float32
    # RMSE of test set.
    rmse_test: np.float32
    # Weighted RMSE of test set.
    wrmse_test: np.float32
    # MRR of test set with cutoff 5.
    mrr5_test: np.float32
    # MRR of test set with cutoff 10.
    mrr10_test: np.float32
    # Mean_NDCG of test set with cutoff 5.
    mean_ndcg5_test: np.float32
    # Mean_NDCG of test set with cutoff 10.
    mean_ndcg10_test: np.float32
    # MAP of test set with cutoff 5.
    map5_test: np.float32
    # MAP of test set with cutoff 10.
    map10_test: np.float32


class ExplicitMF:

    def __init__(
            self,
            train: pd.DataFrame,
            n_factors: int = 10,
            method: str = "pop",
            reg: float = 1e-3,
            n_iterations: int = 20,
            learning_rate: float = 1e-3,
            eps: float = 1e-5,
    ):
        """Trains a matrix factorization model."""
        self.train = train
        self.n_factors = n_factors
        self.method = method
        self.reg = reg
        self.n_iterations = n_iterations
        self.learning_rate = learning_rate
        self.eps = eps
        self.n_users, self.n_items = train.shape
        self.model_params = {}
        self.progress = {}
        self.train_rmse = []
        self.test_rmse = []

    def fit_popularity(self):
        """Computes parameters in a popularity-based algorithm."""
        self.model_params["popularity"] = np.zeros(self.train.shape)
        popularity = self.model_params["popularity"]
        item_avg = np.ma.masked_equal(self.train, value=0).mean(axis=0)
        item_avg[np.isnan(item_avg)] = 0
        for i in range(self.n_items):
            popularity[:, i] = item_avg.data[i]
        self.model_params["popularity"] = popularity
        self.train_rmse.append(self.rmse(self.train))
        self.test_rmse.append(self.rmse(test))

    def fit_bias(self):
        """Computes parameters in a bias-based algorithm."""
        mu = np.ma.masked_equal(self.train, value=0).mean()
        self.model_params["bias"] = np.full(self.train.shape, fill_value=mu)
        bias_params = self.model_params["bias"]
        user_bias = np.ma.masked_equal(self.train - mu, value=-mu).mean(axis=1)
        user_bias[np.isnan(user_bias)] = 0
        item_bias = np.ma.masked_equal(self.train - mu, value=-mu).mean(axis=0)
        item_bias[np.isnan(item_bias)] = 0
        for i in range(self.n_users):
            for j in range(1, self.n_items):
                bias_params[i, j] = mu + user_bias.data[i] + item_bias.data[j]
        self.model_params['bias'] = bias_params
        self.train_rmse.append(self.rmse(self.train))
        self.test_rmse.append(self.rmse(test))


    def update_gd(self):
        """
        Computes gradient descent step using fully vectorized operations.

        Optimizations:
        - Completely vectorized implementation
        - Eliminates loops
        - Handles broadcasting correctly
        """
        # Create boolean mask for rated items
        rated_mask = self.train.to_numpy() > 0

        # Compute prediction errors for all user-item pairs
        predictions = np.dot(self.model_params["U"], self.model_params["V"].T)
        error_matrix = self.train.to_numpy() - predictions

        # Apply mask to focus only on rated items
        error_matrix *= rated_mask

        # Vectorized gradient update for user factors
        user_gradient = np.dot(error_matrix, self.model_params["V"]) - self.reg * self.model_params["U"]
        self.model_params["U"] += self.learning_rate * user_gradient

        # Vectorized gradient update for item factors
        item_gradient = np.dot(error_matrix.T, self.model_params["U"]) - self.reg * self.model_params["V"]
        self.model_params["V"] += self.learning_rate * item_gradient

    def update_als(self):
        """
        Computes alternating least squares step using vectorized operations.

        Optimizations:
        - Fully vectorized ALS update
        - Robust handling of rated items
        - Efficient linear algebra computations
        """
        # Prepare identity matrix for regularization
        I = np.eye(self.n_factors)

        # Update user latent vectors
        for u in range(self.n_users):
            # Find indices of items rated by this user
            rated_items = np.where(self.train.iloc[u].to_numpy() > 0)[0]

            if len(rated_items) > 0:
                # Select corresponding item factors and ratings
                V_u = self.model_params["V"][rated_items, :]
                ratings_u = self.train.iloc[u, rated_items].to_numpy()

                # Solve for user factors
                VtV = V_u.T @ V_u
                VtR = V_u.T @ ratings_u
                self.model_params["U"][u, :] = np.linalg.solve(VtV + self.reg * I, VtR)

        # Update item latent vectors
        for i in range(self.n_items):
            # Find indices of users who rated this item
            rated_users = np.where(self.train.iloc[:, i].to_numpy() > 0)[0]

            if len(rated_users) > 0:
                # Select corresponding user factors and ratings
                U_i = self.model_params["U"][rated_users, :]
                ratings_i = self.train.iloc[rated_users, i].to_numpy()

                # Solve for item factors
                UtU = U_i.T @ U_i
                UtR = U_i.T @ ratings_i
                self.model_params["V"][i, :] = np.linalg.solve(UtU + self.reg * I, UtR)


    def training(self, test: pd.DataFrame):
        """Main method for training all algorithms.

        Args:
            test: test rating matrix.
        """
        if self.method == "pop":
            self.fit_popularity()
            return

        if self.method == "bias":
            self.fit_bias()
            return

        # Initialize latent vectors.
        self.model_params["U"] = np.random.randn(
            self.n_users, self.n_factors) * 0.01
        self.model_params["V"] = np.random.randn(
            self.n_items, self.n_factors) * 0.01

        for i in range(self.n_iterations):
            if i % 10 == 0:
                print(f"\tcurrent iteration: {i}")
            if self.method == "als":
                self.update_als()
            elif self.method == "gd":
                self.update_gd()

            self.train_rmse.append(self.rmse(self.train))
            self.test_rmse.append(self.rmse(test))
            # Don't break in the first iteration.
            if not i:
                continue

            if self.eps > abs(self.train_rmse[i] - self.train_rmse[i-1]):
                break

    def recommend_unseen(self, user: int, n_items: int) -> list:
        """Recommends unseen items per user, oredered by predicted ratings desc.

        Args:
          user: user id.
          n_items: number of items to suggest.

        Returns:
          Top suggestions.
        """
        if self.method == "pop":
            predicted_ratings = self.model_params["popularity"][user]
        elif self.method == "bias":
            predicted_ratings = self.model_params["bias"][user]
        else:
            user_vec = self.model_params["U"][user, :]
            predicted_ratings = user_vec @ self.model_params["V"].T

        pred_sorted = np.argsort(predicted_ratings)[::-1]
        unseen_movie = np.where(self.train.iloc[user] == 0)
        final = pred_sorted[np.in1d(pred_sorted, unseen_movie)]
        return final[:n_items]

    def predict(self, user: int, item: int) -> float:
        """Predicts the rating of a specific item for a specific user.

        Args:
          user: user id.
          item: item id.

        Returns:
          Predicted rating.
        """
        if self.method == "pop":
            return self.model_params["popularity"][user, item]

        if self.method == "bias":
            return  self.model_params["bias"][user, item]

        user_vec = self.model_params["U"][user, :]
        item_vec = self.model_params["V"][item, :]
        return np.dot(user_vec, item_vec)

    def predict_all(self):
        """Predicts ratings for every user and item.

        Args: None

        Returns:
            All predicted ratings
        """
        predictions = np.zeros(shape=(self.n_users, self.n_items))
        for u in range(self.n_users):
            for i in range(self.n_items):
                predictions[u, i] = self.predict(u, i)
        return predictions

    def rmse(self, actual: pd.DataFrame) -> float:
        """Computes the total RMSE of a model, compared to the actual rating.

        Args:
          actual: actual rating matrix.

        Returns:
          Total RMSE.
        """
        preds = self.predict_all()
        preds = preds[actual > 0].flatten()
        actual = actual.to_numpy()
        actual_non_zeros = actual[actual > 0].flatten()
        rmse = np.sqrt(np.mean((actual_non_zeros - preds)**2))
        return rmse


    def mrr(self, test: pd.DataFrame, k: int = 5) -> float:
        """Computes the Mean Reciprocal Rank for all users.

        Note: relevancy is considered rating 3 or above.

        Args:
          test: rating matrix.
          k: cutoff value.

        Returns:
          MRR.
        """
        mrr = 0
        for user in range(self.n_users):
            list_pred = self.recommend_unseen(user, k)
            user_test = test.iloc[user]
            for j in range(k):
                if user_test.iloc[list_pred[j]] >= 3:
                    mrr += 1 / (j + 1)
                    break
        return mrr / self.n_users

    def dcg(self, user: int, test: pd.DataFrame, k: int = 5) -> float:
        """Computes the Discounted Cumulative Gain for a given user.

        Args:
            user: user id.
            test: rating matrix.
            k: cutoff value.

        Returns:
            DCG.
        """
        rank_pred = self.recommend_unseen(user, k)
        user_test = test.iloc[user]
        actual = user_test[user_test >= 0]

        dcg = 0
        for j in range(k):
            if rank_pred[j] in actual:
                rating = user_test.iloc[rank_pred[j]]
                m = rating / np.log2(j + 2)
                dcg += m
        return dcg

    def idcg(self, user: int, test: pd.DataFrame, k: int = 5) -> float:
        """Computes the Idealized Discounted Cumulative Gain for a given user.

        Args:
            user: user id.
            test: rating matrix.
            k: cutoff value.

        Returns:
            iDCG.
        """
        user_test = test.iloc[user]
        actual = user_test[user_test >= 0]
        true_sort = actual.sort_values(ascending=False)[:k]
        idcg = 0
        for i in range(len(true_sort)):
            m = true_sort.iloc[i] / np.log2(i + 2)
            idcg += m
        return idcg

    def mean_ndcg(self, test: pd.DataFrame, k: int = 5) -> float:
        """Computes Mean Normalized Discounted Cumulative Gain for all users.

        Args:
          test: rating matrix.
          k: cutoff value.

        Returns:
          Mean nDCG.
        """
        ndcg = 0
        for user in range(self.n_users):
            dcg = self.dcg(user, test, k)
            idcg = self.idcg(user, test, k)
            if idcg > 0:
                m = dcg / idcg
            else:
                m = 0
            ndcg += m
        return ndcg / self.n_users

    def map(self, test: pd.DataFrame, k: int = 5) -> float:
        """Computes the mean average precision for all users.

        Note: relevancy is considered rating 3 or above.

        Args:
          test: rating matrix.
          k: cutoff value.

        Returns:
          Overall MAP.
        """
        ap_list = []
        for user in range(self.n_users):
            list_pred = self.recommend_unseen(user, k)
            user_test = test.iloc[user]
            ap = []
            hits = 0
            for j in range(k):
                if user_test.iloc[list_pred[j]] >= 3:
                    hits += 1
                    ap.append(hits / (j + 1))
            if ap:
                ap_list.append(np.mean(ap))
            else:
                ap_list.append(0)
        map = np.mean(ap_list)
        return map

    def wrmse(self, test: pd.DataFrame) -> float:
        """Computes Weighted Root Mean Squared Error (W-RMSE).

        Args:
          test: test rating matrix.

        Returns:
          Weighted RMSE.
        """
        preds = self.predict_all()
        test_array = test.to_numpy()

        # Compute weights for each movie based on total ratings
        total_ratings = np.sum(test_array > 0, axis=0)  # Total ratings per movie
        weights = np.where(total_ratings > 0, 1 / np.sqrt(total_ratings), 0)

        # Apply weights to the squared error
        squared_error = (preds - test_array) ** 2
        weighted_squared_error = np.nansum(weights * squared_error, axis=1)

        # Calculate W-RMSE
        wrmse = np.sqrt(np.nansum(weighted_squared_error) / np.nansum(weights))
        return wrmse

    def get_all_metrics(self, test: pd.DataFrame) -> Metrics:
        """Gets all metrics of the trained model."""
        return Metrics(
            rmse_train=self.rmse(self.train),
            rmse_test=self.rmse(test),
            wrmse_test=self.wrmse(test),
            mrr5_test=self.mrr(test, 5),
            mrr10_test=self.mrr(test, 10),
            mean_ndcg5_test=self.mean_ndcg(test, 5),
            mean_ndcg10_test=self.mean_ndcg(test, 10),
            map5_test=self.map(test, 5),
            map10_test=self.map(test, 10),
        )

In [7]:
def plot_learning_curve(model: ExplicitMF):
    """Plots learning curve.

    Args:
        model: trained RS model.
    """
    plt.plot(model.train_rmse, label="Training", linewidth=5)
    plt.plot(model.test_rmse, label="Test", linewidth=5)
    plt.xlabel("iterations");
    plt.ylabel("RMSE");
    plt.legend(loc="best");

In [8]:
results = []

In [9]:
method = "pop"
pop = ExplicitMF(train, method=method)
start_time = time.time()
pop.training(test)
training_time = time.time() - start_time
print(f"--- {training_time} seconds ---")
metrics = pop.get_all_metrics(test)
results.append(
    [method] + list(dataclasses.astuple(metrics)) + [training_time]
)
print(results)

--- 206.68216729164124 seconds ---


  final = pred_sorted[np.in1d(pred_sorted, unseen_movie)]


[['pop', np.float64(3.5277894207860947), np.float64(3.61172836961149), np.float64(75.40387666481197), 0.12375633333334486, 0.14134305158731816, np.float64(0.022510236500179543), np.float64(0.024957082187930704), np.float64(0.12197576388888888), np.float64(0.1345392656084656), 206.68216729164124]]


In [10]:
method = "bias"
bias = ExplicitMF(train, method=method)
start_time = time.time()
bias.training(test)
training_time = time.time() - start_time
print(f"--- {training_time} seconds ---")
metrics = bias.get_all_metrics(test)
results.append(
    [method] + list(dataclasses.astuple(metrics)) + [training_time]
)
print(results)

--- 516.4402890205383 seconds ---


  final = pred_sorted[np.in1d(pred_sorted, unseen_movie)]


[['pop', np.float64(3.5277894207860947), np.float64(3.61172836961149), np.float64(75.40387666481197), 0.12375633333334486, 0.14134305158731816, np.float64(0.022510236500179543), np.float64(0.024957082187930704), np.float64(0.12197576388888888), np.float64(0.1345392656084656), 206.68216729164124], ['bias', np.float64(3.4120294473758435), np.float64(3.6148165693019023), np.float64(84.17444510203728), 0.12375633333334486, 0.14134305158731816, np.float64(0.022510236500179543), np.float64(0.024957082187930704), np.float64(0.12197576388888888), np.float64(0.1345392656084656), 516.4402890205383]]


In [11]:
method = "gd"
gd = ExplicitMF(train, method=method)
start_time = time.time()
gd.training(test)
training_time = time.time() - start_time
print(f"--- {training_time} seconds ---")
metrics = gd.get_all_metrics(test)
results.append(
    [method] + list(dataclasses.astuple(metrics)) + [training_time]
)
plot_learning_curve(gd)
print(results)

	current iteration: 0



KeyboardInterrupt



In [None]:
method = "als"
als = ExplicitMF(train, method=method)
start_time = time.time()
als.training(test)
training_time = time.time() - start_time
print(f"--- {training_time} seconds ---")
metrics = als.get_all_metrics(test)
results.append(
    [method] + list(dataclasses.astuple(metrics)) + [training_time]
)
plot_learning_curve(als)
print(results)

In [None]:
columns = [
    "RMSE training",
    "RMSE test",
    "MRR k=5 test",
    "MRR k=10 test",
    "Mean NDCG k=5 test",
    "Mean NDCG k=10 test",
    "MAP k=5 test",
    "MAP k=10 test",
    "WRMSE test"
    "Training time",
]


def results_to_dataframe(results: list) -> pd.DataFrame:
    """Prepares results for table.

    Args:
        results: results list.

    Returns:
        Results table.
    """

    results_pd = pd.DataFrame(results).set_index(0)
    results_pd = results_pd.set_index(results_pd.index.rename("Model / Metric"))
    results_pd.columns = columns
    return results_pd


lower_better = ["RMSE training", "RMSE test", "Training time"]
higher_better = list(set(columns) - set(lower_better))
results_pd = results_to_dataframe(results)
results_pd.style.background_gradient(
    "RdYlGn_r", axis=0, subset=lower_better,
).background_gradient("RdYlGn", axis=0, subset=higher_better)