In [1]:
# --- Path setup: make the repository root importable from this notebook ---
import os
import sys

ROOT = os.path.join('..', '..')
sys.path.append(ROOT)

# --- Third-party libraries ---
import numpy as np
import pandas as pd
import altair as alt

# Lift Altair's default cap on the number of rows a chart may embed.
alt.data_transformers.disable_max_rows()

# --- Project modules (importable only once ROOT is on sys.path) ---
from scripts.data import ml_movies_df, ml_ratings_df, ml_users_df
from src.models.user_user import UserUserColaborativeFiltering

# Materialise each dataset as a pandas DataFrame, rebinding the imported names.
# NOTE(review): `.collect().to_pandas()` suggests the imported objects are lazy
# (polars-style) frames -- confirm in scripts.data.
ml_ratings_df = ml_ratings_df.collect().to_pandas()
ml_users_df = ml_users_df.collect().to_pandas()
ml_movies_df = ml_movies_df.collect().to_pandas()

# Train - Test Split

In [2]:
MAX_K = 30
T_SPLIT_POINT = 970_000_000

# Temporal hold-out: ratings stamped before T_SPLIT_POINT form the training set,
# ratings stamped at or after it form the test set.
rating_timestamps = ml_ratings_df['Timestamp']
ml_ratings_train_df = ml_ratings_df.loc[rating_timestamps < T_SPLIT_POINT]
ml_ratings_test_df = ml_ratings_df.loc[rating_timestamps >= T_SPLIT_POINT]

# Restrict the test set to users who have at least MAX_K ratings in it.
test_rating_counts = ml_ratings_test_df['UserID'].value_counts()
users_with_enough_ratings = test_rating_counts[test_rating_counts >= MAX_K]
has_enough_ratings = ml_ratings_test_df['UserID'].isin(users_with_enough_ratings.index)
ml_ratings_test_df = ml_ratings_test_df.loc[has_enough_ratings]

# Restrict the training set to the users that remain in the test set.
# NOTE(review): the reverse filter is not applied, so a test user with no rating
# before T_SPLIT_POINT stays in the test set without any training history --
# confirm this is intended.
users_from_test = ml_ratings_test_df['UserID'].unique()
is_test_user = ml_ratings_train_df['UserID'].isin(users_from_test)
ml_ratings_train_df = ml_ratings_train_df.loc[is_test_user]

# Report the size of each partition.
print('Train shape:', ml_ratings_train_df.shape)
print('Test shape:', ml_ratings_test_df.shape)

Train shape: (85086, 4)
Test shape: (534709, 4)


# User-User Collaborative Filtering

In [3]:
# Build the user-user recommender from the movie and user tables,
# then fit it on the training ratings only (the test ratings are held out).
recommender = UserUserColaborativeFiltering(ml_movies_df, ml_users_df)
recommender.fit(ml_ratings_train_df)

In [6]:
# Score the fitted recommender against the held-out test ratings.
# MAX_K is passed as the second argument (presumably the recommendation
# cut-off K -- confirm in UserUserColaborativeFiltering.evaluate).
# The result is indexed by 'precision@k', 'recall@k' and 'f1@k' further down.
metrics = recommender.evaluate(ml_ratings_test_df, MAX_K)

100%|██████████| 3003/3003 [02:03<00:00, 24.38it/s]


In [7]:
# Summarise the evaluation: mean of each metric first, then the maximum,
# with a blank line separating the two groups. Values are rounded to 4 decimals.
average_rows = (
    ("Average Precision@K:", 'precision@k'),
    ("Average Recall@K:", 'recall@k'),
    ("Average F1@K:", 'f1@k'),
)
best_rows = (
    ("Best Precision@K:", 'precision@k'),
    ("Best Recall@K:", 'recall@k'),
    ("Best F1@K:", 'f1@k'),
)

for caption, metric_key in average_rows:
    print(caption, round(np.mean(metrics[metric_key]), 4))
print()
for caption, metric_key in best_rows:
    print(caption, round(np.max(metrics[metric_key]), 4))

Average Precision@K: 0.4035
Average Recall@K: 0.096
Average F1@K: 0.1388

Best Precision@K: 1.0
Best Recall@K: 0.439
Best F1@K: 0.507


In [8]:
# NOTE(review): this cell prints exactly the same summary as the preceding
# cell (same metrics, same aggregations) -- it looks like a leftover
# copy-paste; consider deleting one of the two.
#
# Mean of each metric first, then the maximum, separated by a blank line.
# Values are rounded to 4 decimals.
average_rows = (
    ("Average Precision@K:", 'precision@k'),
    ("Average Recall@K:", 'recall@k'),
    ("Average F1@K:", 'f1@k'),
)
best_rows = (
    ("Best Precision@K:", 'precision@k'),
    ("Best Recall@K:", 'recall@k'),
    ("Best F1@K:", 'f1@k'),
)

for caption, metric_key in average_rows:
    print(caption, round(np.mean(metrics[metric_key]), 4))
print()
for caption, metric_key in best_rows:
    print(caption, round(np.max(metrics[metric_key]), 4))

Average Precision@K: 0.4035
Average Recall@K: 0.096
Average F1@K: 0.1388

Best Precision@K: 1.0
Best Recall@K: 0.439
Best F1@K: 0.507
