In [1]:
import os
# Relative path pointing two directory levels above the current working directory.
ROOT = os.path.join('..', '..')
import sys
# Extend the import path with ROOT before the project-local `scripts` and `src`
# imports below; this line must stay above them.
sys.path.append(ROOT)
# Third-party libraries.
import numpy as np
import polars as pl
import altair as alt
from tqdm import tqdm

# Project-local data frames (`ml_*` and `bc_*` prefixes), metrics and baseline model.
# NOTE(review): several of these names (ml_df, ml_genres, bc_*, tqdm,
# ml_popularity_based_recommendation) are not used in the cells visible here.
from scripts.data import (
    ml_ratings_df, ml_movies_df, ml_users_df, ml_df, ml_genres,
    bc_ratings_df, bc_books_df, bc_users_df, bc_df,
)
from src.metrics import (
    ml_precision_at_k, ml_recall_at_k, ml_f1_at_k
)
from src.models.simple import ml_popularity_based_recommendation

# Lift Altair's limit on the number of rows a chart dataset may contain.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

# Train - Test Split

In [2]:
# Materialise the ratings exactly once. Both splits are then sliced from the
# same in-memory frame, so train and test are guaranteed to be disjoint even
# if the lazy query would not return rows in a stable order between collects.
ml_ratings_all_df = ml_ratings_df.collect()

# Shuffle the row positions reproducibly and cut them 80 % / 20 %.
idx = pl.Series(range(ml_ratings_all_df.height))
idx = idx.shuffle(seed=42)
train_size = int(len(idx) * 0.8)
train_idx = idx[:train_size]
test_idx = idx[train_size:]

# Downstream cells expect LazyFrames, so wrap the eager slices again.
ml_ratings_train_df = ml_ratings_all_df[train_idx, :].lazy()
ml_ratings_test_df = ml_ratings_all_df[test_idx, :].lazy()
ml_ratings_train_df.select(pl.len()).collect().item(), ml_ratings_test_df.select(pl.len()).collect().item()

(800167, 200042)

# User-User Collaborative Filtering

In [3]:
from sklearn.metrics.pairwise import cosine_similarity
# Build the dense user x movie rating table from the training ratings.
# Rows follow ascending UserID (the frame is sorted before pivoting) and
# every (user, movie) pair without a rating is filled with 0.
train_ratings_by_user = ml_ratings_train_df.sort(by=['UserID']).collect()
user_item_df = train_ratings_by_user.pivot(
    index='UserID',
    columns='MovieID',
    values='Rating',
).fill_null(0)

# Pure rating matrix (UserID column removed); row i belongs to user_item_df row i.
user_item_matrix = user_item_df.drop('UserID').to_numpy()

# Pairwise cosine similarity between all user rating vectors.
user_similarity_matrix = cosine_similarity(user_item_matrix)

def get_top_k_similar_users(user_id, k=5):
    """Return the ``k`` users most similar to ``user_id`` by cosine similarity.

    The similarity matrix is indexed by row position of ``user_item_df``,
    not by UserID. The lookup therefore translates ``user_id`` to its row
    and translates the selected rows back to their UserIDs, so the result
    stays correct even when IDs are not contiguous, not 0-based, or when a
    user has no rating in the training split.

    Parameters:
        user_id: UserID to look up; must be present in ``user_item_df``.
        k: number of users to return. The queried user is normally part of
            the result because its similarity to itself is the maximum.

    Returns:
        LazyFrame with columns ``UserID`` (Int32) and ``Similarity``,
        ordered from most to least similar.

    Raises:
        KeyError: if ``user_id`` has no row in ``user_item_df``.
    """
    user_ids = user_item_df['UserID'].to_numpy()
    matching_rows = np.flatnonzero(user_ids == user_id)
    if matching_rows.size == 0:
        raise KeyError(f"UserID {user_id!r} has no row in the user-item matrix")

    similarity_row = user_similarity_matrix[matching_rows[0]]
    # argsort is ascending; reverse it and keep the k largest similarities.
    top_rows = np.argsort(similarity_row)[::-1][:k]
    return pl.DataFrame({
        'UserID': user_ids[top_rows],
        'Similarity': similarity_row[top_rows],
    }).lazy().with_columns(pl.col('UserID').cast(pl.Int32))
    
# Sanity check: the five nearest neighbours of user 5.
top5users = get_top_k_similar_users(user_id=5, k=5)
top5users.collect()

UserID,Similarity
i32,f64
5,1.0
1091,0.283548
5429,0.270789
91,0.269771
835,0.261522


In [19]:
def ml_user_user_colaborative_filtering(user_id, n_users, n_recommendations):
    """Recommend movies to ``user_id`` from the ratings of its nearest neighbours.

    Each movie the target user has a 0 for in ``user_item_df`` (0 is the
    fill value for "no training rating") is scored as the sum, over the
    neighbours, of ``rating / 5 * weight``, where the weights are the
    neighbours' similarities normalised to sum to 1. Neighbours that did not
    rate a movie contribute 0 to its score.

    Parameters:
        user_id: UserID to recommend for.
        n_users: number of neighbours to use (the target user is excluded).
        n_recommendations: maximum number of movies to return.

    Returns:
        LazyFrame with columns ``MovieID`` (Int32) and ``Rating`` (Float64,
        the aggregated score), sorted by ``Rating`` descending and limited
        to ``n_recommendations`` rows.
    """
    # Ask for one extra neighbour: the target user comes back as well and is
    # removed again further down.
    similar_users = get_top_k_similar_users(user_id, n_users + 1).collect()
    neighbourhood = (
        user_item_df
        .filter(pl.col('UserID').is_in(similar_users['UserID']))
        .join(similar_users, on='UserID')
        .sort(by='Similarity', descending=True)
        .to_pandas()
    )

    is_target = neighbourhood["UserID"] == user_id
    # Keep only the movie columns where the target user's entry is 0, plus
    # the two bookkeeping columns needed for the weighting below.
    keep_column = neighbourhood[is_target] == 0
    keep_column[["UserID", "Similarity"]] = True
    # Explicit copy: the weights are overwritten below, and assigning into a
    # filtered slice is what triggers pandas' SettingWithCopyWarning.
    neighbours = neighbourhood.loc[~is_target, keep_column.iloc[0]].copy()

    total_similarity = neighbours["Similarity"].sum()
    # Guard against 0 / 0: if every neighbour has zero similarity, leave the
    # weights at 0 so all scores come out as 0 instead of NaN.
    if total_similarity > 0:
        neighbours["Similarity"] = neighbours["Similarity"] / total_similarity

    # Presumably the upper end of the rating scale; it maps ratings to [0, 1]
    # -- TODO confirm against the dataset.
    max_rating = 5
    return pl.DataFrame(neighbours).select([
        pl.exclude(["UserID", "Similarity"]) / max_rating * pl.col('Similarity')
    ]).sum().melt().select(
        # After melt, 'variable' holds the pivoted column names (MovieIDs).
        pl.col('variable').alias("MovieID").cast(pl.Int32),
        pl.col('value').alias("Rating").cast(pl.Float64)
    ).sort(by='Rating', descending=True).head(n_recommendations).lazy()


In [18]:
# Scan every user and remember the one whose recommendations reach the
# highest F1@5 on the test ratings, printing each new record as it is found.
# NOTE(review): the user picked here is selected on the test split, so the
# metrics reported for it further down are a best case, not an average.
best_user_id = 0
best_user_score = 0
all_user_ids = ml_users_df.select('UserID').collect().to_numpy().flatten()
for user_id in all_user_ids:
    ml_recommendation = ml_user_user_colaborative_filtering(
        user_id, n_users=5, n_recommendations=100
    )
    score = ml_f1_at_k(5, ml_recommendation, ml_ratings_test_df, user_id)
    if not (score > best_user_score):
        continue
    best_user_id = user_id
    best_user_score = score
    print(f"New best user: {best_user_id} with score: {best_user_score}")

New best user: 0 with score: 0.1333333333328889
New best user: 2 with score: 0.15384615384568048
New best user: 11 with score: 0.24999999999953126
New best user: 12 with score: 0.27586206896523185
New best user: 38 with score: 0.3999999999995556
New best user: 69 with score: 0.42105263157855943
New best user: 105 with score: 0.5333333333328889
New best user: 491 with score: 0.615384615384142
New best user: 824 with score: 0.6666666666661727
New best user: 2203 with score: 0.7272727272722312


In [20]:
# User with the highest F1@5 found by the search over all users.
ml_test_user_id = 2203

# Top-100 recommendations from 5 neighbours, enriched with movie metadata
# and ordered by predicted score.
ml_recommendation = (
    ml_user_user_colaborative_filtering(ml_test_user_id, n_users=5, n_recommendations=100)
    .join(ml_movies_df, on='MovieID')
    .sort(by='Rating', descending=True)
)
ml_recommendation.collect()

MovieID,Rating,Title,Genres,Year,IsChildren's,IsFantasy,IsAnimation,IsSci-Fi,IsAction,IsCrime,IsWar,IsMystery,IsHorror,IsAdventure,IsFilm-Noir,IsWestern,IsDrama,IsMusical,IsRomance,IsComedy,IsThriller,IsDocumentary
i32,f64,str,list[str],i32,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool
2286,0.646202,"""Them!""","[""Sci-Fi"", ""Thriller"", ""War""]",1954,false,false,false,true,false,false,true,false,false,false,false,false,false,false,false,false,true,false
1923,0.643134,"""Plan 9 from Outer Space""","[""Horror"", ""Sci-Fi""]",1958,false,false,false,true,false,false,false,false,true,false,false,false,false,false,false,false,false,false
967,0.573312,"""Night of the Living Dead""","[""Horror"", ""Sci-Fi""]",1968,false,false,false,true,false,false,false,false,true,false,false,false,false,false,false,false,false,false
1018,0.542865,"""20""",,,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false
2660,0.500794,"""It Came from Outer Space""","[""Sci-Fi""]",1953,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
1355,0.14096,"""Star Trek: First Contact""","[""Action"", ""Adventure"", ""Sci-Fi""]",1996,false,false,false,true,true,false,false,false,false,true,false,false,false,false,false,false,false,false
2011,0.14096,"""Back to the Future Part III""","[""Comedy"", ""Sci-Fi"", ""Western""]",1990,false,false,false,true,false,false,false,false,false,false,false,true,false,false,false,true,false,false
2205,0.14096,"""Suspicion""","[""Mystery"", ""Thriller""]",1941,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,true,false
3073,0.14096,"""Jeremiah Johnson""","[""Western""]",1972,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false


In [21]:
def _precision_at(k):
    """Precision@k of ``ml_recommendation`` for the selected test user."""
    return ml_precision_at_k(k, ml_recommendation, ml_ratings_test_df, ml_test_user_id)

print(f"Precision@5 : {_precision_at(5)}")
print(f"Precision@10: {_precision_at(10)}")
print(f"Precision@15: {_precision_at(15)}")

Precision@5 : 0.8
Precision@10: 0.4
Precision@15: 0.26666666666666666


In [22]:
def _recall_at(k):
    """Recall@k of ``ml_recommendation`` for the selected test user."""
    return ml_recall_at_k(k, ml_recommendation, ml_ratings_test_df, ml_test_user_id)

print(f"Recall@5: {_recall_at(5)}")
print(f"Recall@10: {_recall_at(10)}")
print(f"Recall@15: {_recall_at(15)}")
print(f"Recall@100: {_recall_at(100)}")

Recall@5: 0.6666666666666666
Recall@10: 0.6666666666666666
Recall@15: 0.6666666666666666
Recall@100: 0.8333333333333334


In [23]:
def _f1_at(k):
    """F1@k of ``ml_recommendation`` for the selected test user."""
    return ml_f1_at_k(k, ml_recommendation, ml_ratings_test_df, ml_test_user_id)

print(f"F1@5: {_f1_at(5)}")
print(f"F1@10: {_f1_at(10)}")
print(f"F1@15: {_f1_at(15)}")
print(f"F1@100: {_f1_at(100)}")

F1@5: 0.7272727272722312
F1@10: 0.4999999999995312
F1@15: 0.38095238095197276
F1@100: 0.09433962264140264
