In [1]:
import os
ROOT = os.path.join('..', '..')
import sys
sys.path.append(ROOT)
#
import numpy as np
import polars as pl
import altair as alt
from tqdm import tqdm

from scripts.data import (
    ml_ratings_df, ml_movies_df, ml_users_df, ml_df, ml_genres,
    bc_ratings_df, bc_books_df, bc_users_df, bc_df,
)

alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

# Train - Test Split

In [2]:
ml_ratings_df = ml_ratings_df.sort("Timestamp")
ml_ratings_df_len = ml_df.select(pl.len()).collect().item()
train_size = int(ml_ratings_df_len * 0.8)
ml_ratings_train_df = ml_df.head(train_size)
ml_ratings_test_df = ml_df.tail(ml_ratings_df_len - train_size)

In [3]:
ml_ratings_test_df.collect()

UserID,MovieID,Rating,Timestamp,Title,Genres,Year,IsCrime,IsMystery,IsComedy,IsWestern,IsRomance,IsWar,IsAnimation,IsSci-Fi,IsFantasy,IsChildren's,IsDocumentary,IsThriller,IsHorror,IsMusical,IsAction,IsDrama,IsAdventure,IsFilm-Noir,Gender,Age,Occupation,Zip-code
i32,i32,i32,i32,str,list[str],i32,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,str,i32,str,str
4795,3262,3,969726048,"""Twin Peaks: Fire Walk with Me""","[""Drama"", ""Mystery""]",1992,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,"""M""",45,"""self-employed""","""10962"""
4795,2624,4,969659424,"""After Life""","[""Drama""]",1998,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,"""M""",45,"""self-employed""","""10962"""
4795,2628,3,969381578,"""Star Wars: Episode I - The Pha…","[""Action"", ""Adventure"", … ""Sci-Fi""]",1999,false,false,false,false,false,false,false,true,true,false,false,false,false,false,true,false,true,false,"""M""",45,"""self-employed""","""10962"""
4795,3609,4,962985072,"""Regret to Inform""","[""Documentary""]",1998,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,"""M""",45,"""self-employed""","""10962"""
4795,3623,3,980095267,"""Mission: Impossible 2""","[""Action"", ""Thriller""]",2000,false,false,false,false,false,false,false,false,false,false,false,true,false,false,true,false,false,false,"""M""",45,"""self-employed""","""10962"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
6040,1091,1,956716541,"""Weekend at Bernie's""","[""Comedy""]",1989,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,"""M""",25,"""doctor/health care""","""11106"""
6040,1094,5,956704887,"""Crying Game""",,,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,"""M""",25,"""doctor/health care""","""11106"""
6040,562,5,956704746,"""Welcome to the Dollhouse""","[""Comedy"", ""Drama""]",1995,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,"""M""",25,"""doctor/health care""","""11106"""
6040,1096,4,956715648,"""Sophie's Choice""","[""Drama""]",1982,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,"""M""",25,"""doctor/health care""","""11106"""


# Baseline Popularity-Based Recommender

In [4]:
def ml_simple_recommendation(n_recommendations=5):
    """
    Recommends most popular movies. Thats it
    """
    return ml_ratings_train_df.join(ml_movies_df, on="MovieID") \
        .group_by("MovieID") \
        .agg(
            pl.sum("Rating").alias("score")
        ).join(ml_movies_df, on="MovieID") \
        .sort("score", descending=True) \
        .head(n_recommendations)

ml_simple_recommendation().collect()

MovieID,score,Title,Genres,Year,IsCrime,IsMystery,IsComedy,IsWestern,IsRomance,IsWar,IsAnimation,IsSci-Fi,IsFantasy,IsChildren's,IsDocumentary,IsThriller,IsHorror,IsMusical,IsAction,IsDrama,IsAdventure,IsFilm-Noir
i32,i32,str,list[str],i32,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool
2858,12003,"""American Beauty""","[""Comedy"", ""Drama""]",1999,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
260,10464,"""Star Wars: Episode IV - A New …","[""Action"", ""Adventure"", … ""Sci-Fi""]",1977,False,False,False,False,False,False,False,True,True,False,False,False,False,False,True,False,True,False
1196,10026,"""Star Wars: Episode V - The Emp…","[""Action"", ""Adventure"", … ""War""]",1980,False,False,False,False,False,True,False,True,False,False,False,False,False,False,True,True,True,False
2028,9302,"""Saving Private Ryan""","[""Action"", ""Drama"", ""War""]",1998,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,True,False,False
1210,9109,"""Star Wars: Episode VI - Return…","[""Action"", ""Adventure"", … ""War""]",1983,False,False,False,False,True,True,False,True,False,False,False,False,False,False,True,False,True,False


# Evaluation metrics

## Precision@K

In [5]:
ml_test_user_id = 6040
ml_recommendation = ml_simple_recommendation(1000)

In [6]:
ml_ratings_test_df.filter(pl.col("UserID") == ml_test_user_id).select(pl.len()).collect().item()

341

In [7]:
ml_movies_df.select(pl.len()).collect().item()  

3883

In [8]:
def ml_precision_at_k(k, rec_df, ml_ratings_test_df, user_id):
    ml_user_ratings_test_df = ml_ratings_test_df.filter(pl.col("UserID") == user_id)
    topk = rec_df.head(k).with_columns([
        pl.col("MovieID").is_in(ml_user_ratings_test_df.select("MovieID").collect()).alias("is_relevant")
    ])
    top_k_len = topk.select(pl.len()).collect().item()
    ml_precision_at_k = topk.select(pl.col("is_relevant").sum()).collect().item() / top_k_len
    return ml_precision_at_k

print(f"Precision@5 : {ml_precision_at_k(5, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"Precision@10: {ml_precision_at_k(10, ml_recommendation, ml_ratings_test_df,  ml_test_user_id)}")
print(f"Precision@15: {ml_precision_at_k(15, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")

Precision@5 : 1.0
Precision@10: 0.9
Precision@15: 0.8666666666666667


## Recall@K

In [9]:
def ml_recall_at_k(k, rec_df, ml_ratings_test_df, user_id):
    ml_user_ratings_test_df = ml_ratings_test_df.filter(pl.col("UserID") == user_id)
    topk = rec_df.head(k).with_columns([
        pl.col("MovieID").is_in(ml_user_ratings_test_df.select("MovieID").collect()).alias("is_relevant")
    ])
    ml_user_ratings_test_df_len = ml_user_ratings_test_df.select(pl.len()).collect().item()
    if ml_user_ratings_test_df_len == 0: # Not sure about this
        return 0
    recall_at_k = topk.select(pl.col("is_relevant").sum()).collect().item() / ml_user_ratings_test_df_len
    return recall_at_k

print(f"Recall@5: {ml_recall_at_k(5, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"Recall@10: {ml_recall_at_k(10, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"Recall@15: {ml_recall_at_k(15, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"Recall@100: {ml_recall_at_k(100, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")

Recall@5: 0.01466275659824047
Recall@10: 0.026392961876832845
Recall@15: 0.03812316715542522
Recall@100: 0.18181818181818182


In [10]:
k_ = np.arange(1, 1001, 50)
res_ = []
for k in k_:
    res_.append(ml_recall_at_k(k, ml_recommendation, ml_ratings_test_df, ml_test_user_id))

In [11]:
alt.Chart(pl.DataFrame({
    "k": k_,
    "recall": res_
})).mark_line().encode(
    x="k:Q",
    y="recall:Q"
)

## F1@K

In [12]:
def f1_at_k(k, rec_df, ml_ratings_test_df, user_id):
    precision = ml_precision_at_k(k, rec_df, ml_ratings_test_df, user_id)
    recall = ml_recall_at_k(k, rec_df, ml_ratings_test_df, user_id)
    return 2 * (precision * recall) / (precision + recall + 1e-12)

print(f"F1@5: {f1_at_k(5,   ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"F1@10: {f1_at_k(10, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"F1@15: {f1_at_k(15, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"F1@100: {f1_at_k(100, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")

F1@5: 0.028901734104017758
F1@10: 0.05128205128199593
F1@15: 0.07303370786508782
F1@100: 0.2811791383216448


# Content Based Model

In [13]:
ml_test_user_id = 5245

In [14]:
def create_user_profile(user_id):
    user_ratings = ml_ratings_train_df.filter(pl.col("UserID") == user_id)
    user_movies = user_ratings.join(ml_movies_df, on="MovieID")
    user_profile = user_movies.select([
        pl.sum("Is" + genre[0]).alias(genre[0])
        for genre in ml_genres.rows() if genre[0] is not None
    ])
    return user_profile

create_user_profile(ml_test_user_id).collect()

Crime,Mystery,Comedy,Western,Romance,War,Animation,Sci-Fi,Fantasy,Children's,Documentary,Thriller,Horror,Musical,Action,Drama,Adventure,Film-Noir
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [15]:
def ml_content_based_recommendation(user_id, n_recommendations=5):
    """
    Calculates scores as the cosine similarity between the user profile and the movie profiles. 
    NOTE: I haven't found a way to use polars' native functions to calculate cosine similarity, so this stuff is highty inefficient
    """
    user_profile = create_user_profile(user_id)
    movie_profiles = ml_movies_df.select([
        pl.col("Is" + genre[0]).cast(pl.UInt32) 
        for genre in ml_genres.rows() if genre[0] is not None
    ])
    u = user_profile.collect().to_numpy()[0]
    ms = movie_profiles.collect().to_numpy()
    s = ms.dot(u) / (np.linalg.norm(u) * np.linalg.norm(ms, axis=1)+1e-12)
    scores = ml_movies_df.with_columns(pl.Series(name="score", values=s)).sort("score", descending=True)
    return scores.head(n_recommendations)

ml_recommendation = ml_content_based_recommendation(ml_test_user_id, 20)

In [16]:
print(f"Precision@5 : {ml_precision_at_k(5, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"Precision@10: {ml_precision_at_k(10, ml_recommendation, ml_ratings_test_df,  ml_test_user_id)}")
print(f"Precision@15: {ml_precision_at_k(15, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")

Precision@5 : 0.4
Precision@10: 0.2
Precision@15: 0.13333333333333333


In [17]:
print(f"Recall@5: {ml_recall_at_k(5, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"Recall@10: {ml_recall_at_k(10, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"Recall@15: {ml_recall_at_k(15, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")

Recall@5: 0.06666666666666667
Recall@10: 0.06666666666666667
Recall@15: 0.06666666666666667


In [18]:
print(f"F1@5: {f1_at_k(5,   ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"F1@10: {f1_at_k(10, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"F1@15: {f1_at_k(15, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")

F1@5: 0.1142857142854694
F1@10: 0.09999999999962501
F1@15: 0.08888888888844444


# Item-Item Colaborative Filtering

In [19]:
def ml_item_item_colaborative_filtering_recommendation(user_id, n_recommendations=5):
    """
    
    """
    user_movies = ml_ratings_train_df.filter(pl.col("UserID") == user_id).join(ml_movies_df, on="MovieID")
    user_movies_representation = user_movies.select([
        pl.col("Is" + genre[0]).alias(genre[0])
        for genre in ml_genres.rows() if genre[0] is not None
    ]).collect().to_numpy()
    all_movies_representation = ml_movies_df.select([
        pl.col("Is" + genre[0]).alias(genre[0])
        for genre in ml_genres.rows() if genre[0] is not None
    ]).collect().to_numpy()
    s = all_movies_representation.dot(user_movies_representation.T)
    print(user_movies_representation.shape)
    print(all_movies_representation.shape)
    print(s.shape)
    # scores = ml_movies_df.with_columns(pl.Series(name="score", values=s.flatten()))
    # return scores.head(n_recommendations)

ml_item_item_colaborative_filtering_recommendation(2, 20)

(129, 18)
(3883, 18)
(3883, 129)


In [20]:
print(f"Precision@5 : {ml_precision_at_k(5, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"Precision@10: {ml_precision_at_k(10, ml_recommendation, ml_ratings_test_df,  ml_test_user_id)}")
print(f"Precision@15: {ml_precision_at_k(15, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")

Precision@5 : 0.4
Precision@10: 0.2
Precision@15: 0.13333333333333333


# User-User Colaborative Filtering

In [21]:
def ml_user_user_colaborative_filtering_recommendation(user_id, n_recommendations=5):
    # Get user
    user = ml_users_df.filter(pl.col("UserID") == user_id)
    # Get similar users
    print(user.collect())

ml_user_user_colaborative_filtering_recommendation(2, 20)

shape: (1, 5)
┌────────┬────────┬─────┬───────────────┬──────────┐
│ UserID ┆ Gender ┆ Age ┆ Occupation    ┆ Zip-code │
│ ---    ┆ ---    ┆ --- ┆ ---           ┆ ---      │
│ i32    ┆ str    ┆ i32 ┆ str           ┆ str      │
╞════════╪════════╪═════╪═══════════════╪══════════╡
│ 2      ┆ M      ┆ 56  ┆ self-employed ┆ 70072    │
└────────┴────────┴─────┴───────────────┴──────────┘


In [22]:
ml_users_df.collect()

UserID,Gender,Age,Occupation,Zip-code
i32,str,i32,str,str
1,"""F""",1,"""K-12 student""","""48067"""
2,"""M""",56,"""self-employed""","""70072"""
3,"""M""",25,"""scientist""","""55117"""
4,"""M""",45,"""executive/managerial""","""02460"""
5,"""M""",25,"""writer""","""55455"""
…,…,…,…,…
6036,"""F""",25,"""scientist""","""32603"""
6037,"""F""",45,"""academic/educator""","""76006"""
6038,"""F""",56,"""academic/educator""","""14706"""
6039,"""F""",45,"""other""","""01060"""


In [23]:
ml_users_df.collect()

UserID,Gender,Age,Occupation,Zip-code
i32,str,i32,str,str
1,"""F""",1,"""K-12 student""","""48067"""
2,"""M""",56,"""self-employed""","""70072"""
3,"""M""",25,"""scientist""","""55117"""
4,"""M""",45,"""executive/managerial""","""02460"""
5,"""M""",25,"""writer""","""55455"""
…,…,…,…,…
6036,"""F""",25,"""scientist""","""32603"""
6037,"""F""",45,"""academic/educator""","""76006"""
6038,"""F""",56,"""academic/educator""","""14706"""
6039,"""F""",45,"""other""","""01060"""


In [24]:
# d(item, item)
# user: item1, item2, item3

In [25]:
ml_movies_df.collect()

MovieID,Title,Genres,Year,IsCrime,IsMystery,IsComedy,IsWestern,IsRomance,IsWar,IsAnimation,IsSci-Fi,IsFantasy,IsChildren's,IsDocumentary,IsThriller,IsHorror,IsMusical,IsAction,IsDrama,IsAdventure,IsFilm-Noir
i32,str,list[str],i32,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool
1,"""Toy Story""","[""Animation"", ""Children's"", ""Comedy""]",1995,false,false,true,false,false,false,true,false,false,true,false,false,false,false,false,false,false,false
2,"""Jumanji""","[""Adventure"", ""Children's"", ""Fantasy""]",1995,false,false,false,false,false,false,false,false,true,true,false,false,false,false,false,false,true,false
3,"""Grumpier Old Men""","[""Comedy"", ""Romance""]",1995,false,false,true,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false
4,"""Waiting to Exhale""","[""Comedy"", ""Drama""]",1995,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false
5,"""Father of the Bride Part II""","[""Comedy""]",1995,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
3948,"""Meet the Parents""","[""Comedy""]",2000,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false
3949,"""Requiem for a Dream""","[""Drama""]",2000,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false
3950,"""Tigerland""","[""Drama""]",2000,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false
3951,"""Two Family House""","[""Drama""]",2000,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false
