In [1]:
import os
ROOT = os.path.join('..', '..')
import sys
sys.path.append(ROOT)
#
import numpy as np
import polars as pl
import altair as alt
from tqdm import tqdm

from scripts.data import (
    ml_ratings_df, ml_movies_df, ml_users_df, ml_df, ml_genres,
    bc_ratings_df, bc_books_df, bc_users_df, bc_df,
)
from src.metrics import (
    ml_precision_at_k, ml_recall_at_k, ml_f1_at_k
)
from src.models.simple import ml_popularity_based_recommendation

alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

# Train - Test Split

In [2]:
idx = pl.Series(range(ml_ratings_df.select(pl.len()).collect().item()))
idx = idx.shuffle(seed=42)
train_size = int(len(idx) * 0.8)
train_idx = idx[:train_size]
test_idx = idx[train_size:]

ml_ratings_train_df = ml_ratings_df.collect()[train_idx, :].lazy()
ml_ratings_test_df = ml_ratings_df.collect()[test_idx, :].lazy()
ml_ratings_train_df.select(pl.len()).collect().item(), ml_ratings_test_df.select(pl.len()).collect().item()

(800167, 200042)

# User-User Colaborative Filtering

In [3]:
from sklearn.metrics.pairwise import cosine_similarity
user_item_df = ml_ratings_train_df.sort(by=['UserID']).collect().pivot(index='UserID', columns='MovieID', values='Rating').fill_null(0)
user_item_matrix = user_item_df.select(pl.exclude('UserID')).to_numpy()
user_similarity_matrix = cosine_similarity(user_item_matrix)

def get_top_k_similar_users(user_id, k=5):
    similar_user_indices = np.argsort(user_similarity_matrix[user_id])[::-1][:k]
    similarities = user_similarity_matrix[user_id][similar_user_indices]
    return pl.DataFrame({'UserID': similar_user_indices, 'Similarity': similarities}).lazy().with_columns(pl.col('UserID').cast(pl.Int32))
    
top5users = get_top_k_similar_users(5)
top5users.collect()

UserID,Similarity
i32,f64
5,1.0
1091,0.283548
5429,0.270789
91,0.269771
835,0.261522


In [4]:
def ml_user_user_colaborative_filtering(user_id, n_users, n_recommendations):
    similar_users = get_top_k_similar_users(user_id, n_users+1).collect()
    similar_users_ids = similar_users['UserID']
    similar_users_ratings = user_item_df.filter(pl.col('UserID').is_in(similar_users_ids)) \
        .join(similar_users, on='UserID') \
        .sort(by='Similarity', descending=True)
    similar_users_ratings_pd = similar_users_ratings.to_pandas()
    cols_with_zero = similar_users_ratings_pd[similar_users_ratings_pd["UserID"] == user_id] == 0
    cols_with_zero[["UserID", "Similarity"]] = True
    similar_users_ratings_pd = similar_users_ratings_pd.loc[:, cols_with_zero.iloc[0]]
    similar_users_ratings_pd = similar_users_ratings_pd[similar_users_ratings_pd["UserID"] != user_id]
    sum_of_similarities = similar_users_ratings_pd["Similarity"].sum()
    similar_users_ratings_pd["Similarity"] = similar_users_ratings_pd["Similarity"] / sum_of_similarities
    return pl.DataFrame(similar_users_ratings_pd).select([
        pl.exclude(["UserID", "Similarity"]) / 5 * pl.col('Similarity')
    ]).sum().melt().select(
        pl.col('variable').alias("MovieID").cast(pl.Int32),
        pl.col('value').alias("Rating").cast(pl.Float64)
    ).sort(by='Rating', descending=True).head(n_recommendations).lazy()

ml_test_user_id = 5
ml_recommendation = ml_user_user_colaborative_filtering(ml_test_user_id, 5, 100)
ml_recommendation = ml_recommendation.join(ml_movies_df, on='MovieID').sort(by='Rating', descending=True)
ml_recommendation.collect()

MovieID,Rating,Title,Genres,Year,IsChildren's,IsFantasy,IsAnimation,IsSci-Fi,IsAction,IsCrime,IsWar,IsMystery,IsHorror,IsAdventure,IsFilm-Noir,IsWestern,IsDrama,IsMusical,IsRomance,IsComedy,IsThriller,IsDocumentary
i32,f64,str,list[str],i32,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool
918,0.650782,"""Wizard of Oz""",,,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false
1021,0.648728,"""Cinderella""","[""Animation"", ""Children's"", ""Musical""]",1950,true,false,true,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false
2393,0.571278,"""Prince of Egypt""",,,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false
1030,0.564863,"""Bedknobs and Broomsticks""","[""Adventure"", ""Children's"", ""Musical""]",1971,true,false,false,false,false,false,false,false,false,true,false,false,false,true,false,false,false,false
1776,0.515197,"""Wedding Singer""",,,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
1967,0.201076,"""Breakfast Club""",,,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false
2682,0.201076,"""Austin Powers: The Spy Who Sha…","[""Comedy""]",1999,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false
2803,0.201076,"""Christmas Story""",,,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false
3113,0.201076,"""Toy Story 2""","[""Animation"", ""Children's"", ""Comedy""]",1999,true,false,true,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false


In [5]:
print(f"Precision@5 : {ml_precision_at_k(5, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"Precision@10: {ml_precision_at_k(10, ml_recommendation, ml_ratings_test_df,  ml_test_user_id)}")
print(f"Precision@15: {ml_precision_at_k(15, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")

Precision@5 : 0.0
Precision@10: 0.0
Precision@15: 0.0


In [6]:
print(f"Recall@5: {ml_recall_at_k(5, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"Recall@10: {ml_recall_at_k(10, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"Recall@15: {ml_recall_at_k(15, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"Recall@100: {ml_recall_at_k(100, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")

Recall@5: 0.0
Recall@10: 0.0
Recall@15: 0.0
Recall@100: 0.13333333333333333


In [7]:
print(f"F1@5: {ml_f1_at_k(5,   ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"F1@10: {ml_f1_at_k(10, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"F1@15: {ml_f1_at_k(15, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"F1@100: {ml_f1_at_k(100, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")

F1@5: 0.0
F1@10: 0.0
F1@15: 0.0
F1@100: 0.03478260869542533
