In [18]:
import pandas as pd
import numpy as np
import tensorflow as tf
from rapidfuzz import process, fuzz
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dot, Flatten, Add, Lambda
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import train_test_split

In [9]:
# --- Data locations (relative paths, resolved against the working directory) ---

# Raw source tables read by the ratings/TMDB mapping step.
RATINGS_FILE_SRC = '../Data/ratings.csv'
LINKS_FILE = '../Data/links.csv'

# Movie metadata table; not referenced by the cells visible in this section.
MOVIES_FILE = '../Data/clean_parsed_tmdb500.csv'

# Derived table: written by the mapping step, then re-read as the training input.
RATINGS_FILE = '../Data/ml_ratings_with_tmdb_id.csv'

In [11]:
# Attach TMDB ids to the ratings table and persist the result for training.

# Links table: discard rows without a TMDB id, then store the id as an integer.
links_df = (
    pd.read_csv(LINKS_FILE)
    .dropna(subset=['tmdbId'])
    .assign(tmdbId=lambda frame: frame['tmdbId'].astype(int))
)
ratings_df = pd.read_csv(RATINGS_FILE_SRC)

# Keep only the columns needed on each side of the join.
links_cleaned = links_df[['movieId', 'tmdbId']]
ratings_cleaned = ratings_df[['userId', 'movieId', 'rating']]

# Inner join on movieId: ratings whose movie has no TMDB link are dropped.
# The TMDB id column is exposed under the name 'id' in the saved file.
merged_ratings_tmdb = (
    ratings_cleaned
    .merge(links_cleaned, on='movieId', how='inner')
    .rename(columns={'tmdbId': 'id'})
)
merged_ratings_tmdb.to_csv(RATINGS_FILE, index=False)

print("Saved:", RATINGS_FILE, "shape:", merged_ratings_tmdb.shape)

Saved: ../Data/ml_ratings_with_tmdb_id.csv shape: (100823, 4)


In [12]:
# Hyperparameters for the matrix-factorisation model.
SEED = 42
LATENT_DIM = 50      # width of each user / movie factor vector
REG_L2 = 0.005       # L2 coefficient handed to every embedding regularizer
EPOCHS = 20
BATCH_SIZE = 64
# NOTE(review): the recorded training log flattens out around val_RMSE ~0.964
# within a few epochs; the regularisation strength may be worth revisiting - confirm.

# Seed NumPy and TensorFlow so the split and weight initialisation are repeatable.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Re-read the persisted ratings table and keep only the columns used for training.
ratings_df = pd.read_csv(RATINGS_FILE)
ratings = ratings_df[['userId', 'movieId', 'rating']]

In [13]:
# Re-code raw user / movie ids as contiguous integer codes so they can be used
# as embedding indices (pandas assigns code -1 to missing values, if any).
user_categorical = ratings['userId'].astype('category')
movie_categorical = ratings['movieId'].astype('category')

user_ids = user_categorical.cat.codes
movie_ids = movie_categorical.cat.codes

# Reverse lookups: encoded code -> original id.
user_map = dict(enumerate(user_categorical.cat.categories))
movie_map = dict(enumerate(movie_categorical.cat.categories))

# Embedding table sizes and the constant baseline added to every prediction.
num_users = user_ids.nunique()
num_movies = movie_ids.nunique()
global_mean = ratings['rating'].mean()

# Features are the encoded (user, movie) pair; the target is the rating.
X = pd.DataFrame({'user_id': user_ids, 'movie_id': movie_ids})
y = ratings['rating'].to_numpy()

# Random 80/20 hold-out split, seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)


In [14]:
def build_svd_model_with_biases(num_users, num_movies, latent_dim, reg_l2, global_mean):
    """Build and compile a biased matrix-factorisation rating model.

    The prediction for one (user, movie) pair is
    ``dot(user_factors, movie_factors) + global_mean + user_bias + movie_bias``.

    Args:
        num_users: Number of rows in the user factor and user bias tables.
        num_movies: Number of rows in the movie factor and movie bias tables.
        latent_dim: Width of the user and movie factor vectors.
        reg_l2: L2 coefficient applied to all four embedding tables.
        global_mean: Constant added to the factor dot product.

    Returns:
        A compiled Keras ``Model`` that takes ``[user_input, movie_input]``
        and outputs the predicted rating. It is compiled with Adam
        (learning rate 0.001), MSE loss, and RMSE / MAE metrics.
    """

    def embed_flat(ids, table_rows, width, layer_name):
        # One L2-regularised embedding lookup, flattened to a 2-D tensor.
        return Flatten()(
            Embedding(
                table_rows,
                width,
                embeddings_regularizer=l2(reg_l2),
                name=layer_name,
            )(ids)
        )

    user_input = Input(shape=(1,), name='user_input')
    movie_input = Input(shape=(1,), name='movie_input')

    # Layers are created in a fixed order (user factors, movie factors,
    # user bias, movie bias) so auto-generated names and seeded
    # initialisation stay stable.
    user_vec = embed_flat(user_input, num_users, latent_dim, 'user_factors')
    movie_vec = embed_flat(movie_input, num_movies, latent_dim, 'movie_factors')
    user_bias = embed_flat(user_input, num_users, 1, 'user_bias')
    movie_bias = embed_flat(movie_input, num_movies, 1, 'movie_bias')

    # Interaction term, shifted by the global mean rating.
    dot = Dot(axes=1, name='latent_dot_product')([user_vec, movie_vec])
    mu = Lambda(lambda x: x + global_mean, name='global_mean_add')(dot)

    # Final prediction: shifted interaction plus both bias terms.
    out = Add(name='predicted_rating')([mu, user_bias, movie_bias])

    model = Model([user_input, movie_input], out)
    model.compile(
        optimizer=Adam(0.001),
        loss='mse',
        metrics=[tf.keras.metrics.RootMeanSquaredError(name='RMSE'), 'mae'],
    )
    return model

In [15]:
# Instantiate the model from the encoded-id counts and the configured
# hyperparameters, then print its layer summary.
model_svd = build_svd_model_with_biases(
    num_users=num_users,
    num_movies=num_movies,
    latent_dim=LATENT_DIM,
    reg_l2=REG_L2,
    global_mean=global_mean,
)
model_svd.summary()

In [16]:
print("\nTraining...")

# The model takes the encoded user and movie columns as two separate inputs.
train_inputs = [X_train['user_id'], X_train['movie_id']]
test_inputs = [X_test['user_id'], X_test['movie_id']]

# Note: the held-out split is used both as per-epoch validation data here and
# for the final "Test" metrics below.
history = model_svd.fit(
    train_inputs,
    y_train,
    validation_data=(test_inputs, y_test),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1,
)

# evaluate() returns the loss followed by the compiled metrics (RMSE, MAE).
metrics = model_svd.evaluate(test_inputs, y_test, verbose=0)
loss, rmse, mae = metrics[:3]
print(f"\nTest RMSE: {rmse:.4f}  MAE: {mae:.4f}  MSE: {loss:.4f}")


Training...
Epoch 1/20
[1m1261/1261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 13ms/step - RMSE: 1.0025 - loss: 1.0482 - mae: 0.7963 - val_RMSE: 0.9789 - val_loss: 0.9933 - val_mae: 0.7769
Epoch 2/20
[1m1261/1261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 13ms/step - RMSE: 0.9723 - loss: 0.9895 - mae: 0.7710 - val_RMSE: 0.9672 - val_loss: 0.9859 - val_mae: 0.7667
Epoch 3/20
[1m1261/1261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 13ms/step - RMSE: 0.9665 - loss: 0.9870 - mae: 0.7663 - val_RMSE: 0.9646 - val_loss: 0.9853 - val_mae: 0.7647
Epoch 4/20
[1m1261/1261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 13ms/step - RMSE: 0.9651 - loss: 0.9867 - mae: 0.7653 - val_RMSE: 0.9640 - val_loss: 0.9852 - val_mae: 0.7642
Epoch 5/20
[1m1261/1261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 13ms/step - RMSE: 0.9647 - loss: 0.9867 - mae: 0.7651 - val_RMSE: 0.9638 - val_loss: 0.9853 - val_mae: 0.7641
Epoch 6/20
[1m1261/1261[0m [32m━━━

In [None]:
# Demo: score a single (user, movie) pair identified by its original ids.
original_user_id = 1
original_movie_id = 302

# Rows of the ratings table that mention the requested user / movie.
user_rows = ratings['userId'] == original_user_id
movie_rows = ratings['movieId'] == original_movie_id

try:
    # Translate each original id to its encoded form via the first matching row.
    encoded_user_id = user_ids[user_rows].iloc[0]
    encoded_movie_id = movie_ids[movie_rows].iloc[0]
except IndexError:
    # At least one id has no rating row: fall back to the first held-out pair
    # and report that pair's original ids instead.
    encoded_user_id = X_test['user_id'].iloc[0]
    encoded_movie_id = X_test['movie_id'].iloc[0]
    original_user_id = user_map[encoded_user_id]
    original_movie_id = movie_map[encoded_movie_id]

# predict() returns one row per sample; take the single scalar prediction.
pair_inputs = [np.array([encoded_user_id]), np.array([encoded_movie_id])]
pred = model_svd.predict(pair_inputs)[0][0]
print(f"\nPredicted rating for User {original_user_id} and Movie {original_movie_id}: {pred:.4f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step

Predicted rating for User 1 and Movie 302: 3.7455
