In [1]:
import numpy as np
import pandas as pd
from surprise import Dataset, Reader, SVDpp
from surprise import accuracy
from surprise.model_selection import GridSearchCV
from surprise.model_selection import KFold
from train_valid_test_loader import load_train_valid_test_datasets

#======================================================
# Step 0: Load data
#======================================================
# The loader returns one tuple per split plus two counts; each split tuple is
# unpacked below into parallel user-id / item-id / rating sequences.
train_tuple, valid_tuple, test_tuple, n_users, n_items = load_train_valid_test_datasets()

user_id_train, item_id_train, rating_train = train_tuple
user_id_valid, item_id_valid, rating_valid = valid_tuple
# `test_tuple` is intentionally left packed: this notebook does not score on
# the test split. Unpack it the same way as the two tuples above if needed.

# Column layout shared by every ratings frame handed to Surprise
# (order matters: user, item, rating).
rating_columns = ['user_id', 'item_id', 'rating']

train_df = pd.DataFrame(
    dict(zip(rating_columns, (user_id_train, item_id_train, rating_train)))
)
valid_df = pd.DataFrame(
    dict(zip(rating_columns, (user_id_valid, item_id_valid, rating_valid)))
)

# Rating bounds passed to Surprise; (1, 5) is the MovieLens scale -- adjust
# here if the data uses a different one.
reader = Reader(rating_scale=(1, 5))

train_data = Dataset.load_from_df(train_df[rating_columns], reader)
valid_data = Dataset.load_from_df(valid_df[rating_columns], reader)

trainset = train_data.build_full_trainset()

# The validation ratings are turned into a Trainset and then back into a
# testset list, which is the form passed to `algo.test()` later on.
validset = valid_data.build_full_trainset()
validset_list = validset.build_testset()


In [None]:

#======================================================
# Step 1: Hyperparameter Tuning with SVD++
#======================================================
# Single seed for everything stochastic in the search. Without it the CV folds
# are reshuffled and the SVD++ factors re-initialised differently on every
# run, so the "best" parameters (and every number downstream) can change
# between Restart & Run All executions.
SEED = 42

param_grid = {
    'n_factors': [20, 50, 100],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.02, 0.1],
    'n_epochs': [20, 50],
    # One-element list: not searched over, just forwarded to SVDpp so each fit
    # is seeded. It also ends up in gs.best_params['mae'], which means the
    # later SVDpp(**best_params) refits are seeded as well.
    'random_state': [SEED],
}

# Explicit, seeded 3-fold splitter (same fold count as the previous `cv=3`).
cv_folds = KFold(n_splits=3, random_state=SEED, shuffle=True)

# Using GridSearchCV with SVD++; MAE is the only tracked measure and
# n_jobs=-1 uses all available cores.
gs = GridSearchCV(SVDpp, param_grid, measures=['mae'], cv=cv_folds, n_jobs=-1)
gs.fit(train_data)


In [None]:
import matplotlib.pyplot as plt

# Pair every parameter combination tried by the grid search with its mean
# cross-validated MAE.
results = [
    {
        'params': params,
        'mean_test_mae': mean_mae
    }
    for params, mean_mae in zip(gs.cv_results['params'], gs.cv_results['mean_test_mae'])
]

# The grid varies several hyperparameters at once, so each n_factors value
# appears in many combinations. Drawing one line through all of them (as this
# cell used to) zig-zags back and forth along the x-axis. Instead, reduce to
# the best (lowest) MAE achieved for each n_factors value.
all_n_factors = [res['params']['n_factors'] for res in results]
all_mae_scores = [res['mean_test_mae'] for res in results]

best_mae_by_n_factors = {}
for n_factor_value, mae in zip(all_n_factors, all_mae_scores):
    previous_best = best_mae_by_n_factors.get(n_factor_value, float('inf'))
    best_mae_by_n_factors[n_factor_value] = min(previous_best, mae)

# Sorted unique x values and their best score, ready for a monotone line plot.
n_factors = sorted(best_mae_by_n_factors)
mae_scores = [best_mae_by_n_factors[value] for value in n_factors]

plt.figure(figsize=(8, 6))
# Faint points: every combination, to show the spread caused by the other
# hyperparameters at each n_factors value.
plt.scatter(all_n_factors, all_mae_scores, alpha=0.35, label='All grid combinations')
plt.plot(n_factors, mae_scores, marker='o', label='Best validation MAE per n_factors')
plt.xlabel('Number of Latent Factors (n_factors)', fontsize=12)
plt.ylabel('Mean Absolute Error (MAE)', fontsize=12)
plt.title('Hyperparameter Selection: Validation MAE vs. n_factors', fontsize=14)
plt.xticks(n_factors)
plt.legend()
plt.grid(True)
plt.show()


In [None]:


# Pull the winning configuration (selected on MAE) out of the grid search
# once, then report it alongside its cross-validated score.
best_params = gs.best_params['mae']
print("Best MAE score with SVD++:", gs.best_score['mae'])
print("Best params with SVD++:", best_params)

# Refit a fresh SVD++ model with those parameters on the full training set.
algo = SVDpp(**best_params)
algo.fit(trainset)

# Score the refit model on the held-out validation ratings.
# `verbose=True` makes accuracy.mae print the score itself; the formatted
# line below reports it again with four decimals.
valid_preds = algo.test(validset_list)
valid_mae = accuracy.mae(valid_preds, verbose=True)
print(f"Validation MAE with chosen SVD++: {valid_mae:.4f}")

In [None]:

#======================================================
# Step 2: Combine train+valid for final model training
#======================================================
# Stack the two splits into one frame with a fresh 0..n-1 index so the final
# model is trained on every labelled rating available.
frames_to_merge = [train_df, valid_df]
combined_df = pd.concat(frames_to_merge, ignore_index=True)

# Same column order as the earlier Dataset.load_from_df calls.
combined_ratings = combined_df[['user_id', 'item_id', 'rating']]
combined_data = Dataset.load_from_df(combined_ratings, reader)
combined_trainset = combined_data.build_full_trainset()

# Final model: the tuned hyperparameters, fitted on train + validation.
final_algo = SVDpp(**best_params)
final_algo.fit(combined_trainset)


In [None]:

#======================================================
# Step 3: Predict on leaderboard dataset
#======================================================
leaderboard_df = pd.read_csv('./data_movie_lens_100k/ratings_masked_leaderboard_set.csv')
leaderboard_user_ids = leaderboard_df['user_id'].values
leaderboard_item_ids = leaderboard_df['item_id'].values

# Build (user, item, rating) triples for `final_algo.test`. The third slot is
# a 0.0 placeholder; only the estimates are read back below.
testset_leaderboard = []
for user, item in zip(leaderboard_user_ids, leaderboard_item_ids):
    testset_leaderboard.append((user, item, 0.0))

leaderboard_preds = final_algo.test(testset_leaderboard)

# Keep only the estimated rating from each prediction, in input order.
estimates = [prediction.est for prediction in leaderboard_preds]
predicted_ratings = np.array(estimates)

# Save the predictions, one per line with four decimals.
np.savetxt('predicted_ratings_leaderboard2.txt', predicted_ratings, fmt='%.4f')
print("Predictions saved to predicted_ratings_leaderboard2.txt")


In [None]:

#======================================================
# Step 4: Sanity-check the saved leaderboard predictions
#======================================================
# This cell was a verbatim copy of the Step 3 prediction cell: it recomputed
# the same predictions and rewrote the same file. It now reads the file back
# and checks it against the in-memory results from Step 3 instead.
# Requires `leaderboard_df` and `predicted_ratings` from the preceding cell.
saved_ratings = np.loadtxt('predicted_ratings_leaderboard2.txt', ndmin=1)

# One prediction per leaderboard row.
assert saved_ratings.shape == (len(leaderboard_df),), (
    f"expected {len(leaderboard_df)} predictions, found {saved_ratings.shape}"
)

# No NaN/inf slipped into the submission.
assert np.isfinite(saved_ratings).all(), "saved predictions contain NaN or inf"

# The file was written with '%.4f', so values may differ from memory by
# rounding only.
assert np.allclose(saved_ratings, predicted_ratings, atol=1e-4), (
    "saved predictions do not match the in-memory predictions"
)

print(f"Verified {saved_ratings.size} predictions in predicted_ratings_leaderboard2.txt")
