# Collaborative Filtering

### Load the data

In [37]:
import pandas

# Read the ratings file and keep only the user / item / rating columns,
# in exactly that order; every other column in the CSV is dropped.
ratings = pandas.read_csv("ratings.csv").loc[:, ["userId", "movieId", "rating"]]
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


### Create the dataset

In [38]:
from surprise import Dataset, Reader

# Derive the rating scale from the data instead of hard-coding (1, 5).
# The preview of `ratings` contains half-star values (e.g. 2.5); if the file
# also holds ratings below 1 (half-star scales commonly start at 0.5), a
# hard-coded lower bound of 1 would make the model clip its estimates to a
# range narrower than the real one.
rating_min = float(ratings["rating"].min())
rating_max = float(ratings["rating"].max())
reader = Reader(rating_scale=(rating_min, rating_max))

# load_from_df expects exactly three columns ordered (user, item, rating);
# selecting them explicitly keeps this cell correct even if `ratings`
# later gains extra columns or a different column order.
dataset = Dataset.load_from_df(ratings[["userId", "movieId", "rating"]], reader)
dataset

<surprise.dataset.DatasetAutoFolds at 0x1812780c980>

### Build the trainset

In [39]:
# Build a trainset from the whole dataset (no hold-out split here);
# evaluation is done separately by cross-validation in the Validation section.
trainset = dataset.build_full_trainset()

### Train the Model

In [40]:
from surprise import SVD

# SVD initialises its latent factors randomly, so without a seed every
# Restart & Run All yields slightly different predictions and CV scores.
# A fixed random_state makes the notebook's numbers reproducible.
svd = SVD(random_state=42)

In [41]:
# Fit the model on the full trainset. The call is the cell's last expression,
# so the returned estimator's repr is what the cell displays.
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1812702f290>

In [42]:
# Spot-check: estimated rating for user id 15 on item id 1956.
# The displayed Prediction has r_ui=None because no true rating is passed in.
svd.predict(15, 1956)

Prediction(uid=15, iid=1956, r_ui=None, est=3.8002760731359913, details={'was_impossible': False})

### Validation

In [None]:
from surprise import model_selection

# Make the split explicit and repeatable: 5 shuffled folds with a fixed seed.
# (5 is what cross_validate uses when `cv` is omitted, so the number of folds
# is unchanged; only the fold assignment becomes deterministic.)
folds = model_selection.KFold(n_splits=5, random_state=42, shuffle=True)

# NOTE: cross_validate refits `svd` on each fold's training split, so after
# this cell `svd` no longer holds the fit on the full trainset. Re-run
# `svd.fit(trainset)` before using it for further predictions.
cv_results = model_selection.cross_validate(
    svd, dataset, measures=["RMSE", "MAE"], cv=folds
)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=48f880f8-2f58-41e7-b0c8-ce2863059d0d' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>

# Extract the results

In [None]:
# Pull the per-fold score and timing arrays out of the cross-validation
# result in one step; each name receives the entry for the matching key.
test_rmse, test_mae, fit_times, test_times = (
    cv_results[key]
    for key in ('test_rmse', 'test_mae', 'fit_time', 'test_time')
)

In [None]:
import os

# Ensure the output folder for the figures exists; exist_ok=True makes the
# cell safe to re-run when the folder is already there.
os.makedirs(name='plots/collaborative', exist_ok=True)

# Create a figure with subplots

In [None]:
import matplotlib.pyplot as plt

# The whole figure is built, saved and shown in ONE cell on purpose.
# Jupyter's inline backend by default renders and closes every open figure
# when a cell finishes, so creating the figure in one cell and calling
# plt.subplot / plt.savefig in later cells draws each panel on a fresh
# throw-away figure and saves a blank image. Working on explicit `fig` / `ax`
# objects inside a single cell avoids that.


def _annotate_bars(ax, xs, heights):
    """Write each bar's height (3 decimals) just above the bar, centred on it."""
    for x, height in zip(xs, heights):
        ax.text(x, height + 0.01, f"{height:.3f}", ha='center', color='black')


def _plot_fold_bars(ax, scores, name, color):
    """Draw one bar per CV fold for `scores` on `ax`.

    The fold numbers are derived from len(scores) rather than hard-coded, so
    the plot stays correct if the number of folds changes. The y-axis is
    zoomed to 0.05 either side of the observed range so that small
    differences between folds remain visible.
    """
    folds = range(1, len(scores) + 1)
    ax.bar(folds, scores, color=color)
    ax.set_title(f'{name} Across Folds')
    ax.set_xlabel('Fold')
    ax.set_ylabel(name)
    ax.set_ylim(min(scores) - 0.05, max(scores) + 0.05)
    _annotate_bars(ax, folds, scores)


# Create a figure with subplots
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_rmse, ax_mae), (ax_time, ax_overall) = axes

# Plot RMSE across folds
_plot_fold_bars(ax_rmse, test_rmse, 'RMSE', 'skyblue')

# Plot MAE across folds
_plot_fold_bars(ax_mae, test_mae, 'MAE', 'lightgreen')

# Plot timing information
fold_ids = range(1, len(fit_times) + 1)
ax_time.plot(fold_ids, fit_times, 'o-', label='Fit Time')
ax_time.plot(fold_ids, test_times, 'o-', label='Test Time')
ax_time.set_title('Training and Testing Times')
ax_time.set_xlabel('Fold')
ax_time.set_ylabel('Time (seconds)')
ax_time.legend()

# Plot overall metrics (mean over folds); overall_rmse / overall_mae are
# kept as notebook-level names because the summary cell prints them.
overall_rmse = sum(test_rmse) / len(test_rmse)
overall_mae = sum(test_mae) / len(test_mae)
metrics = ['RMSE', 'MAE']
values = [overall_rmse, overall_mae]
ax_overall.bar(metrics, values, color=['skyblue', 'lightgreen'])
ax_overall.set_title('Overall Metrics')
ax_overall.set_ylabel('Score')
_annotate_bars(ax_overall, range(len(values)), values)

fig.tight_layout()

# Save the plots (before plt.show(), while the figure is still open)
fig.savefig('plots/collaborative/svd_performance.png', bbox_inches='tight')
plt.show()

# Print summary statistics

In [None]:
# Assemble the report lines first, then emit them one per line.
# Scores are shown to 4 decimals, timings (averaged over folds) to 2.
summary_lines = [
    "\nEvaluation Summary:",
    f"Average RMSE: {overall_rmse:.4f}",
    f"Average MAE: {overall_mae:.4f}",
    f"Average Fit Time: {sum(fit_times)/len(fit_times):.2f} sec",
    f"Average Test Time: {sum(test_times)/len(test_times):.2f} sec",
]
print(*summary_lines, sep="\n")