In [1]:
import os
ROOT = os.path.join('..', '..')
import sys
sys.path.append(ROOT)
#
from src.constants import (
    ML_MOVIES_PATH, ML_RATINGS_PATH, ML_USERS_PATH,
    BC_BOOKS_PATH, BC_RATINGS_PATH, BC_USERS_PATH
)
import numpy as np
import polars as pl
import altair as alt
from tqdm import tqdm
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

# Data

## MovieLens

### Users

In [2]:
# 1) Read lines and split them by "::"
# 2) Select the columns and cast them to the correct types
ml_ocupations = [
    "other",
	"academic/educator",
	"artist",
	"clerical/admin",
	"college/grad student",
	"customer service",
	"doctor/health care",
	"executive/managerial",
	"farmer",
	"homemaker",
	"K-12 student",
	"lawyer",
	"programmer",
	"retired",
	"sales/marketing",
	"scientist",
	"self-employed",
	"technician/engineer",
	"tradesman/craftsman",
	"unemployed",
	"writer",
]
ml_occupation_map = {i: occupation for i, occupation in enumerate(ml_ocupations)}
ml_users_df = pl.scan_csv(os.path.join(ROOT, ML_USERS_PATH), has_header=False, truncate_ragged_lines=True, encoding="utf8-lossy").select([
    pl.col("column_1").str.split("::")
]).select([
    pl.col("column_1").list.get(0).alias("UserID").cast(pl.Int32),
    pl.col("column_1").list.get(1).alias("Gender"),
    pl.col("column_1").list.get(2).alias("Age").cast(pl.Int32),
    pl.col("column_1").list.get(3).alias("Occupation").cast(pl.Int32).map_dict(ml_occupation_map),
    pl.col("column_1").list.get(4).alias("Zip-code"),
])

ml_users_df.head().collect()

  pl.col("column_1").list.get(3).alias("Occupation").cast(pl.Int32).map_dict(ml_occupation_map),


UserID,Gender,Age,Occupation,Zip-code
i32,str,i32,str,str
1,"""F""",1,"""K-12 student""","""48067"""
2,"""M""",56,"""self-employed""","""70072"""
3,"""M""",25,"""scientist""","""55117"""
4,"""M""",45,"""executive/managerial""","""02460"""
5,"""M""",25,"""writer""","""55455"""


### Movies


In [3]:
# 1) Read lines and split them by "::"
# 2) Select the columns and cast them to the correct types
# 3) Extract the year from the title column
# 4) If year is not null, remove it from the title
#   If the title is "Toy Story (1995)", the title should be "Toy Story" and the year should be 1995. 
#   If the title does not have a year, the year should be null.
ml_movies_df = pl.scan_csv(os.path.join(ROOT, ML_MOVIES_PATH), has_header=False, truncate_ragged_lines=True, encoding="utf8-lossy").select([
    pl.col("column_1").str.split("::")
]).select([
    pl.col("column_1").list.get(0).alias("MovieID").cast(pl.Int32),
    pl.col("column_1").list.get(1).alias("Title"),
    pl.col("column_1").list.get(2).str.split("|").alias("Genres")
]).with_columns([
    pl.col("Title").str.extract(r"\((\d{4})\)$").alias("Year").cast(pl.Int32),
]).with_columns([
    # If Year is not null, remove it from the title
    pl.when(pl.col("Year").is_null()).then(pl.col("Title")).otherwise(
        pl.col("Title").str.slice(0, pl.col("Title").str.find(" \((\d{4})\)$"))
    ).alias("Title")
])
# 4) Get possible genres
ml_genres = ml_movies_df.select(pl.col("Genres").explode().unique()).collect()
# 5) Make dummy variable for each genre
ml_movies_df = ml_movies_df.with_columns([
    pl.col("Genres").list.contains(genre[0]).alias(f"Is{genre[0]}")
    for genre in ml_genres.rows() if genre[0] is not None
])

In [4]:
ml_movies_df.collect()

MovieID,Title,Genres,Year,IsAdventure,IsAction,IsMystery,IsDrama,IsWar,IsDocumentary,IsAnimation,IsSci-Fi,IsChildren's,IsWestern,IsFantasy,IsHorror,IsMusical,IsFilm-Noir,IsThriller,IsCrime,IsComedy,IsRomance
i32,str,list[str],i32,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool
1,"""Toy Story""","[""Animation"", ""Children's"", ""Comedy""]",1995,false,false,false,false,false,false,true,false,true,false,false,false,false,false,false,false,true,false
2,"""Jumanji""","[""Adventure"", ""Children's"", ""Fantasy""]",1995,true,false,false,false,false,false,false,false,true,false,true,false,false,false,false,false,false,false
3,"""Grumpier Old Men""","[""Comedy"", ""Romance""]",1995,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,true
4,"""Waiting to Exhale""","[""Comedy"", ""Drama""]",1995,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,true,false
5,"""Father of the Bride Part II""","[""Comedy""]",1995,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
3948,"""Meet the Parents""","[""Comedy""]",2000,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false
3949,"""Requiem for a Dream""","[""Drama""]",2000,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false
3950,"""Tigerland""","[""Drama""]",2000,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false
3951,"""Two Family House""","[""Drama""]",2000,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false


In [5]:
ml_movies_df.null_count().collect()

MovieID,Title,Genres,Year,IsAdventure,IsAction,IsMystery,IsDrama,IsWar,IsDocumentary,IsAnimation,IsSci-Fi,IsChildren's,IsWestern,IsFantasy,IsHorror,IsMusical,IsFilm-Noir,IsThriller,IsCrime,IsComedy,IsRomance
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,993,993,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Ratings

In [6]:
# 1) Read lines and split them by "::"
# 2) Select columns and cast them to the correct type
ml_ratings_df = pl.scan_csv(os.path.join(ROOT, ML_RATINGS_PATH), has_header=False).select([
    pl.col("column_1").str.split("::")
]).select([
    pl.col("column_1").list.get(0).alias("UserID").cast(pl.Int32),
    pl.col("column_1").list.get(1).alias("MovieID").cast(pl.Int32),
    pl.col("column_1").list.get(2).alias("Rating").cast(pl.Int32),
    pl.col("column_1").list.get(3).alias("Timestamp").cast(pl.Int32)
])

ml_ratings_df.collect()

UserID,MovieID,Rating,Timestamp
i32,i32,i32,i32
1,1193,5,978300760
1,661,3,978302109
1,914,3,978301968
1,3408,4,978300275
1,2355,5,978824291
…,…,…,…
6040,1091,1,956716541
6040,1094,5,956704887
6040,562,5,956704746
6040,1096,4,956715648


In [7]:
ml_ratings_df.null_count().collect()

UserID,MovieID,Rating,Timestamp
u32,u32,u32,u32
0,0,0,0


### Join

In [8]:
# Join dataframes
ml_df = ml_ratings_df.join(ml_movies_df, on="MovieID").join(ml_users_df, on="UserID")
ml_df.collect()

UserID,MovieID,Rating,Timestamp,Title,Genres,Year,IsAdventure,IsAction,IsMystery,IsDrama,IsWar,IsDocumentary,IsAnimation,IsSci-Fi,IsChildren's,IsWestern,IsFantasy,IsHorror,IsMusical,IsFilm-Noir,IsThriller,IsCrime,IsComedy,IsRomance,Gender,Age,Occupation,Zip-code
i32,i32,i32,i32,str,list[str],i32,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,str,i32,str,str
1,1193,5,978300760,"""One Flew Over the Cuckoo's Nes…","[""Drama""]",1975,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,"""F""",1,"""K-12 student""","""48067"""
1,661,3,978302109,"""James and the Giant Peach""","[""Animation"", ""Children's"", ""Musical""]",1996,false,false,false,false,false,false,true,false,true,false,false,false,true,false,false,false,false,false,"""F""",1,"""K-12 student""","""48067"""
1,914,3,978301968,"""My Fair Lady""","[""Musical"", ""Romance""]",1964,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,true,"""F""",1,"""K-12 student""","""48067"""
1,3408,4,978300275,"""Erin Brockovich""","[""Drama""]",2000,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,"""F""",1,"""K-12 student""","""48067"""
1,2355,5,978824291,"""Bug's Life""",,,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,"""F""",1,"""K-12 student""","""48067"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
6040,1091,1,956716541,"""Weekend at Bernie's""","[""Comedy""]",1989,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,"""M""",25,"""doctor/health care""","""11106"""
6040,1094,5,956704887,"""Crying Game""",,,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,"""M""",25,"""doctor/health care""","""11106"""
6040,562,5,956704746,"""Welcome to the Dollhouse""","[""Comedy"", ""Drama""]",1995,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,true,false,"""M""",25,"""doctor/health care""","""11106"""
6040,1096,4,956715648,"""Sophie's Choice""","[""Drama""]",1982,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,"""M""",25,"""doctor/health care""","""11106"""


In [9]:
ml_df.null_count().collect()

UserID,MovieID,Rating,Timestamp,Title,Genres,Year,IsAdventure,IsAction,IsMystery,IsDrama,IsWar,IsDocumentary,IsAnimation,IsSci-Fi,IsChildren's,IsWestern,IsFantasy,IsHorror,IsMusical,IsFilm-Noir,IsThriller,IsCrime,IsComedy,IsRomance,Gender,Age,Occupation,Zip-code
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,243234,243234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Book-Crossing

In [10]:
bc_books_df = pl.scan_csv(os.path.join(ROOT, BC_BOOKS_PATH), separator=";")
bc_books_df.collect()

ISBN,Title,Author,Year,Publisher
str,str,str,i64,str
"""0195153448""","""Classical Mythology""","""Mark P. O. Morford""",2002,"""Oxford University Press"""
"""0002005018""","""Clara Callan""","""Richard Bruce Wright""",2001,"""HarperFlamingo Canada"""
"""0060973129""","""Decision in Normandy""","""Carlo D'Este""",1991,"""HarperPerennial"""
"""0374157065""","""Flu: The Story of the Great In…","""Gina Bari Kolata""",1999,"""Farrar Straus Giroux"""
"""0393045218""","""The Mummies of Urumchi""","""E. J. W. Barber""",1999,"""W. W. Norton & Company"""
…,…,…,…,…
"""0440400988""","""There's a Bat in Bunk Five""","""Paula Danziger""",1988,"""Random House Childrens Pub (Mm…"
"""0525447644""","""From One to One Hundred""","""Teri Sloat""",1991,"""Dutton Books"""
"""006008667X""","""Lily Dale : The True Story of …","""Christine Wicker""",2004,"""HarperSanFrancisco"""
"""0192126040""","""Republic (World's Classics)""","""Plato""",1996,"""Oxford University Press"""


In [11]:
bc_books_df.null_count().collect()

ISBN,Title,Author,Year,Publisher
u32,u32,u32,u32,u32
0,0,1,0,2


In [12]:
bc_users_df = pl.scan_csv(os.path.join(ROOT, BC_USERS_PATH), separator=";").filter(
    pl.col("Age").is_between(5, 100)
)
bc_users_df.collect()

User-ID,Age
i64,i64
2,18
4,17
6,61
10,26
11,14
…,…
278849,23
278851,33
278852,32
278853,17


In [13]:
bc_users_df.null_count().collect()

User-ID,Age
u32,u32
0,0


In [14]:
bc_ratings_df = pl.scan_csv(os.path.join(ROOT, BC_RATINGS_PATH), separator=";").filter(
    pl.col("User-ID").is_in(bc_users_df.select("User-ID").collect()),
)
bc_ratings_df.collect()

User-ID,ISBN,Rating
i64,str,i64
276727,"""0446520802""",0
276729,"""052165615X""",3
276729,"""0521795028""",6
276733,"""2080674722""",0
276737,"""0600570967""",6
…,…,…
276697,"""8445072897""",0
276706,"""0679447156""",0
276709,"""0515107662""",10
276721,"""0590442449""",10


In [15]:
bc_ratings_df.null_count().collect()

User-ID,ISBN,Rating
u32,u32,u32
0,0,0


# Train - Test Split

In [16]:
ml_ratings_df = ml_ratings_df.sort("Timestamp")
ml_ratings_df_len = ml_df.select(pl.len()).collect().item()
train_size = int(ml_ratings_df_len * 0.8)
ml_ratings_train_df = ml_df.head(train_size)
ml_ratings_test_df = ml_df.tail(ml_ratings_df_len - train_size)

In [17]:
ml_ratings_test_df.collect()

UserID,MovieID,Rating,Timestamp,Title,Genres,Year,IsAdventure,IsAction,IsMystery,IsDrama,IsWar,IsDocumentary,IsAnimation,IsSci-Fi,IsChildren's,IsWestern,IsFantasy,IsHorror,IsMusical,IsFilm-Noir,IsThriller,IsCrime,IsComedy,IsRomance,Gender,Age,Occupation,Zip-code
i32,i32,i32,i32,str,list[str],i32,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,str,i32,str,str
4795,3262,3,969726048,"""Twin Peaks: Fire Walk with Me""","[""Drama"", ""Mystery""]",1992,false,false,true,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,"""M""",45,"""self-employed""","""10962"""
4795,2624,4,969659424,"""After Life""","[""Drama""]",1998,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,"""M""",45,"""self-employed""","""10962"""
4795,2628,3,969381578,"""Star Wars: Episode I - The Pha…","[""Action"", ""Adventure"", … ""Sci-Fi""]",1999,true,true,false,false,false,false,false,true,false,false,true,false,false,false,false,false,false,false,"""M""",45,"""self-employed""","""10962"""
4795,3609,4,962985072,"""Regret to Inform""","[""Documentary""]",1998,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,"""M""",45,"""self-employed""","""10962"""
4795,3623,3,980095267,"""Mission: Impossible 2""","[""Action"", ""Thriller""]",2000,false,true,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,"""M""",45,"""self-employed""","""10962"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
6040,1091,1,956716541,"""Weekend at Bernie's""","[""Comedy""]",1989,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,"""M""",25,"""doctor/health care""","""11106"""
6040,1094,5,956704887,"""Crying Game""",,,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,"""M""",25,"""doctor/health care""","""11106"""
6040,562,5,956704746,"""Welcome to the Dollhouse""","[""Comedy"", ""Drama""]",1995,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,true,false,"""M""",25,"""doctor/health care""","""11106"""
6040,1096,4,956715648,"""Sophie's Choice""","[""Drama""]",1982,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,"""M""",25,"""doctor/health care""","""11106"""


# Baseline Popularity-Based Recommender

In [18]:
def ml_simple_recommendation(n_recommendations=5):
    """
    Recommends most popular movies. Thats it
    """
    return ml_ratings_train_df.join(ml_movies_df, on="MovieID") \
        .group_by("MovieID") \
        .agg(
            pl.sum("Rating").alias("score")
        ).join(ml_movies_df, on="MovieID") \
        .sort("score", descending=True) \
        .head(n_recommendations)

ml_simple_recommendation().collect()

MovieID,score,Title,Genres,Year,IsAdventure,IsAction,IsMystery,IsDrama,IsWar,IsDocumentary,IsAnimation,IsSci-Fi,IsChildren's,IsWestern,IsFantasy,IsHorror,IsMusical,IsFilm-Noir,IsThriller,IsCrime,IsComedy,IsRomance
i32,i32,str,list[str],i32,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool
2858,12003,"""American Beauty""","[""Comedy"", ""Drama""]",1999,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False
260,10464,"""Star Wars: Episode IV - A New …","[""Action"", ""Adventure"", … ""Sci-Fi""]",1977,True,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False
1196,10026,"""Star Wars: Episode V - The Emp…","[""Action"", ""Adventure"", … ""War""]",1980,True,True,False,True,True,False,False,True,False,False,False,False,False,False,False,False,False,False
2028,9302,"""Saving Private Ryan""","[""Action"", ""Drama"", ""War""]",1998,False,True,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False
1210,9109,"""Star Wars: Episode VI - Return…","[""Action"", ""Adventure"", … ""War""]",1983,True,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,True


# Evaluation metrics

## Precision@K

In [19]:
ml_test_user_id = 6040
ml_recommendation = ml_simple_recommendation(20)

In [20]:
def ml_precision_at_k(k, rec_df, ml_ratings_test_df, user_id):
    ml_user_ratings_test_df = ml_ratings_test_df.filter(pl.col("UserID") == user_id)
    topk = rec_df.head(k).with_columns([
        pl.col("MovieID").is_in(ml_user_ratings_test_df.select("MovieID").collect()).alias("is_relevant")
    ])
    top_k_len = topk.select(pl.len()).collect().item()
    ml_precision_at_k = topk.select(pl.col("is_relevant").sum()).collect().item() / top_k_len
    return ml_precision_at_k

print(f"Precision@5 : {ml_precision_at_k(5, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"Precision@10: {ml_precision_at_k(10, ml_recommendation, ml_ratings_test_df,  ml_test_user_id)}")
print(f"Precision@15: {ml_precision_at_k(15, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")

Precision@5 : 1.0
Precision@10: 0.9
Precision@15: 0.8666666666666667


## Recall@K

In [21]:
def ml_recall_at_k(k, rec_df, ml_ratings_test_df, user_id):
    ml_user_ratings_test_df = ml_ratings_test_df.filter(pl.col("UserID") == user_id)
    topk = rec_df.head(k).with_columns([
        pl.col("MovieID").is_in(ml_user_ratings_test_df.select("MovieID").collect()).alias("is_relevant")
    ])
    ml_user_ratings_test_df_len = ml_user_ratings_test_df.select(pl.len()).collect().item()
    if ml_user_ratings_test_df_len == 0: # Not sure about this
        return 0
    recall_at_k = topk.select(pl.col("is_relevant").sum()).collect().item() / ml_user_ratings_test_df_len
    return recall_at_k

print(f"Recall@5: {ml_recall_at_k(5, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"Recall@10: {ml_recall_at_k(10, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"Recall@15: {ml_recall_at_k(15, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")

Recall@5: 0.01466275659824047
Recall@10: 0.026392961876832845
Recall@15: 0.03812316715542522


## F1@K

In [22]:
def f1_at_k(k, rec_df, ml_ratings_test_df, user_id):
    precision = ml_precision_at_k(k, rec_df, ml_ratings_test_df, user_id)
    recall = ml_recall_at_k(k, rec_df, ml_ratings_test_df, user_id)
    return 2 * (precision * recall) / (precision + recall + 1e-12)

print(f"F1@5: {f1_at_k(5,   ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"F1@10: {f1_at_k(10, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"F1@15: {f1_at_k(15, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")

F1@5: 0.028901734104017758
F1@10: 0.05128205128199593
F1@15: 0.07303370786508782


# Content Based Model

In [23]:
ml_test_user_id = 5245

In [24]:
def create_user_profile(user_id):
    user_ratings = ml_ratings_train_df.filter(pl.col("UserID") == user_id)
    user_movies = user_ratings.join(ml_movies_df, on="MovieID")
    user_profile = user_movies.select([
        pl.sum("Is" + genre[0]).alias(genre[0])
        for genre in ml_genres.rows() if genre[0] is not None
    ])
    return user_profile

create_user_profile(ml_test_user_id).collect()

Adventure,Action,Mystery,Drama,War,Documentary,Animation,Sci-Fi,Children's,Western,Fantasy,Horror,Musical,Film-Noir,Thriller,Crime,Comedy,Romance
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [25]:
def ml_content_based_recommendation(user_id, n_recommendations=5):
    """
    Calculates scores as the cosine similarity between the user profile and the movie profiles. 
    NOTE: I haven't found a way to use polars' native functions to calculate cosine similarity, so this stuff is highty inefficient
    """
    user_profile = create_user_profile(user_id)
    movie_profiles = ml_movies_df.select([
        pl.col("Is" + genre[0]).cast(pl.UInt32) 
        for genre in ml_genres.rows() if genre[0] is not None
    ])
    u = user_profile.collect().to_numpy()[0]
    ms = movie_profiles.collect().to_numpy()
    s = ms.dot(u) / (np.linalg.norm(u) * np.linalg.norm(ms, axis=1)+1e-12)
    scores = ml_movies_df.with_columns(pl.Series(name="score", values=s)).sort("score", descending=True)
    return scores.head(n_recommendations)

ml_recommendation = ml_content_based_recommendation(ml_test_user_id, 20)

In [26]:
print(f"Precision@5 : {ml_precision_at_k(5, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"Precision@10: {ml_precision_at_k(10, ml_recommendation, ml_ratings_test_df,  ml_test_user_id)}")
print(f"Precision@15: {ml_precision_at_k(15, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")

Precision@5 : 0.4
Precision@10: 0.2
Precision@15: 0.13333333333333333


In [27]:
print(f"Recall@5: {ml_recall_at_k(5, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"Recall@10: {ml_recall_at_k(10, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"Recall@15: {ml_recall_at_k(15, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")

Recall@5: 0.06666666666666667
Recall@10: 0.06666666666666667
Recall@15: 0.06666666666666667


In [28]:
print(f"F1@5: {f1_at_k(5,   ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"F1@10: {f1_at_k(10, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"F1@15: {f1_at_k(15, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")

F1@5: 0.1142857142854694
F1@10: 0.09999999999962501
F1@15: 0.08888888888844444


# Item-Item Colaborative Filtering

In [29]:
def ml_item_item_colaborative_filtering_recommendation(user_id, n_recommendations=5):
    """
    
    """
    user_movies = ml_ratings_train_df.filter(pl.col("UserID") == user_id).join(ml_movies_df, on="MovieID")
    user_movies_representation = user_movies.select([
        pl.col("Is" + genre[0]).alias(genre[0])
        for genre in ml_genres.rows() if genre[0] is not None
    ]).collect().to_numpy()
    all_movies_representation = ml_movies_df.select([
        pl.col("Is" + genre[0]).alias(genre[0])
        for genre in ml_genres.rows() if genre[0] is not None
    ]).collect().to_numpy()
    s = all_movies_representation.dot(user_movies_representation.T)
    print(user_movies_representation.shape)
    print(all_movies_representation.shape)
    print(s.shape)
    # scores = ml_movies_df.with_columns(pl.Series(name="score", values=s.flatten()))
    # return scores.head(n_recommendations)

ml_item_item_colaborative_filtering_recommendation(2, 20)

(129, 18)
(3883, 18)
(3883, 129)


In [30]:
print(f"Precision@5 : {ml_precision_at_k(5, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")
print(f"Precision@10: {ml_precision_at_k(10, ml_recommendation, ml_ratings_test_df,  ml_test_user_id)}")
print(f"Precision@15: {ml_precision_at_k(15, ml_recommendation, ml_ratings_test_df, ml_test_user_id)}")

Precision@5 : 0.4
Precision@10: 0.2
Precision@15: 0.13333333333333333


# User-User Colaborative Filtering

In [31]:
def ml_user_user_colaborative_filtering_recommendation(user_id, n_recommendations=5):
    # Get user
    user = ml_users_df.filter(pl.col("UserID") == user_id)
    # Get similar users
    print(user.collect())

ml_user_user_colaborative_filtering_recommendation(2, 20)

shape: (1, 5)
┌────────┬────────┬─────┬───────────────┬──────────┐
│ UserID ┆ Gender ┆ Age ┆ Occupation    ┆ Zip-code │
│ ---    ┆ ---    ┆ --- ┆ ---           ┆ ---      │
│ i32    ┆ str    ┆ i32 ┆ str           ┆ str      │
╞════════╪════════╪═════╪═══════════════╪══════════╡
│ 2      ┆ M      ┆ 56  ┆ self-employed ┆ 70072    │
└────────┴────────┴─────┴───────────────┴──────────┘
