In [1]:
import polars as pl
from scripts.data import ml_df, ml_movies_df
from src.metrics import ml_precision_at_k, ml_recall_at_k, ml_f1_at_k

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


  from pandas.core import (


# Train - Test Split

In [2]:
idx = pl.Series(range(ml_df.select(pl.len()).collect().item()))
idx = idx.shuffle(seed=42)
train_size = int(len(idx) * 0.8)
train_idx = idx[:train_size]
test_idx = idx[train_size:]

train_df = ml_df.collect()[train_idx, :].to_pandas()
test_df = ml_df.collect()[test_idx, :].lazy()
ml_movies_df = ml_movies_df.collect().to_pandas()

# Item-Item Collaborative Filtering

We recommend user an item that is simular to previous one. We describe item using scores of other users.

In [3]:
def item_item_recommender(pandas_df, movie_id, n=10):
    # Pivot the DataFrame to create a user-movie matrix with entries as ratings
    user_movie_matrix = pandas_df.pivot_table(index='UserID', columns='MovieID', values='Rating').fillna(0)

    # Calculate the mean only for non-zero ratings (excluding zeros)
    mask = user_movie_matrix != 0
    means = user_movie_matrix.where(mask).mean(axis=1).fillna(0)  # Fill NaN means with 0 if a user has no ratings

    # Normalize the matrix by subtracting the mean ratings only for non-zero entries
    normalized_matrix = user_movie_matrix.where(mask).sub(means, axis=0).fillna(0)

    # Transpose the matrix to get a movie-user matrix for similarity calculation
    movie_user_matrix = normalized_matrix.T

    # Compute cosine similarity between movies
    similarity_matrix = cosine_similarity(movie_user_matrix)

    # Create a DataFrame for the similarity matrix with movie IDs as both row and column indices
    similarity_df = pd.DataFrame(similarity_matrix, index=movie_user_matrix.index, columns=movie_user_matrix.index)

    # Get the movies sorted by highest similarity to the given movie_id, excluding itself
    sorted_similar_movies = similarity_df[movie_id].sort_values(ascending=False)
    most_similar_movies = sorted_similar_movies.iloc[1:n+1].index.tolist()

    return most_similar_movies

In [73]:
train_df[train_df.UserID == 52]

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres,Year,IsDrama,IsThriller,IsAction,...,IsAnimation,IsWestern,IsRomance,IsCrime,IsSci-Fi,IsChildren's,Gender,Age,Occupation,Zip-code
26,52,1279,5,977982675,Raise the Red Lantern,[Drama],1991.0,True,False,False,...,False,False,False,False,False,False,M,25,other,96931
535,52,1046,4,977974838,Long Kiss Goodnight,,,False,False,False,...,False,False,False,False,False,False,M,25,other,96931
812,52,3256,3,977975267,Bodyguard,,,False,False,False,...,False,False,False,False,False,False,M,25,other,96931
3237,52,2003,2,977980579,Gremlins 2: The New Batch,"[Comedy, Horror]",1990.0,False,False,False,...,False,False,False,False,False,False,M,25,other,96931
3329,52,1128,5,977973989,Escape from New York,"[Action, Adventure, Sci-Fi, Thriller]",1981.0,False,True,True,...,False,False,False,False,True,False,M,25,other,96931
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
796376,52,1679,5,977984657,Sliding Doors,"[Drama, Romance]",1998.0,True,False,False,...,False,False,True,False,False,False,M,25,other,96931
797728,52,3265,5,977984851,Man Bites Dog (C'est arriv� pr�s de chez vous)...,"[Action, Comedy, Crime, Drama]",1992.0,True,False,True,...,False,False,False,True,False,False,M,25,other,96931
798638,52,2327,3,977980687,Vampires,[Horror],1998.0,False,False,False,...,False,False,False,False,False,False,M,25,other,96931
799604,52,2996,5,977948890,Being John Malkovich,[Comedy],1999.0,False,False,False,...,False,False,False,False,False,False,M,25,other,96931


In [74]:
(test_df.collect().to_pandas().UserID == 52).sum()

129

In [75]:
(train_df.UserID == 52).sum()

555

In [80]:
some_movie_id = 1128
some_user_id = 52
k = 15

recommended_movies = item_item_recommender(train_df, some_movie_id, k)
ml_recommendation = pl.from_pandas(ml_movies_df.loc[ml_movies_df.MovieID.isin(recommended_movies)]).lazy()

print(f"Precision@5 : {ml_precision_at_k(5, ml_recommendation, test_df, some_user_id)}")
print(f"Precision@10: {ml_precision_at_k(10, ml_recommendation, test_df,  some_user_id)}")
print(f"Precision@15: {ml_precision_at_k(15, ml_recommendation, test_df, some_user_id)}")
print("-" * 100)
print(f"Recall@5: {ml_recall_at_k(5, ml_recommendation, test_df, some_user_id)}")
print(f"Recall@10: {ml_recall_at_k(10, ml_recommendation, test_df, some_user_id)}")
print(f"Recall@15: {ml_recall_at_k(15, ml_recommendation, test_df, some_user_id)}")
print("-" * 100)
print(f"F1@5: {ml_f1_at_k(5,   ml_recommendation, test_df, some_user_id)}")
print(f"F1@10: {ml_f1_at_k(10, ml_recommendation, test_df, some_user_id)}")
print(f"F1@15: {ml_f1_at_k(15, ml_recommendation, test_df, some_user_id)}")

Precision@5 : 0.4
Precision@10: 0.2
Precision@15: 0.13333333333333333
----------------------------------------------------------------------------------------------------
Recall@5: 0.015503875968992248
Recall@10: 0.015503875968992248
Recall@15: 0.015503875968992248
----------------------------------------------------------------------------------------------------
F1@5: 0.02985074626858487
F1@10: 0.02877697841713265
F1@15: 0.027777777777591144


### Content based 

In [78]:
def content_based_recommender(pandas_df, movie_id, n=10):
    # Selecting the features to be used for similarity calculation
    feature_columns = ['Year', 'IsRomance', 'IsWar', 'IsComedy', 'IsFantasy', 'IsCrime', 'IsAction',
                       'IsFilm-Noir', 'IsChildren\'s', 'IsThriller', 'IsMystery', 'IsDrama',
                       'IsSci-Fi', 'IsAnimation', 'IsMusical', 'IsWestern', 'IsDocumentary',
                       'IsAdventure', 'IsHorror']

    # Creating a matrix of features for each movie
    features_df = pandas_df.set_index('MovieID')[feature_columns]

    # Fill NaN values in 'Year' with the average year, and other NaNs with 0
    average_year = features_df['Year'].mean(skipna=True)
    features_df['Year'].fillna(average_year, inplace=True)
    features_df.fillna(0, inplace=True)

    # Normalize the 'Year' to reduce the scale disparity among features
    features_df['Year'] = (features_df['Year'] - features_df['Year'].min()) / (features_df['Year'].max() - features_df['Year'].min())

    # Compute the cosine similarity matrix using the feature matrix
    similarity_matrix = cosine_similarity(features_df)
    similarity_df = pd.DataFrame(similarity_matrix, index=features_df.index, columns=features_df.index)

    # Get the movies sorted by highest similarity to the given movie_id, excluding itself
    sorted_similar_movies = similarity_df[movie_id].sort_values(ascending=False)
    most_similar_movies = sorted_similar_movies.iloc[1:n+1].index.tolist()

    return most_similar_movies

In [79]:
some_movie_id = 520
some_user_id = 52

recommended_movies = content_based_recommender(ml_movies_df, some_movie_id, 15)
ml_recommendation = pl.from_pandas(ml_movies_df.loc[ml_movies_df.MovieID.isin(recommended_movies)]).lazy()

print(f"Precision@5 : {ml_precision_at_k(4, ml_recommendation, test_df, some_user_id)}")
print(f"Precision@10: {ml_precision_at_k(10, ml_recommendation, test_df,  some_user_id)}")
print(f"Precision@15: {ml_precision_at_k(15, ml_recommendation, test_df, some_user_id)}")
print("-" * 100)
print(f"Recall@5: {ml_recall_at_k(5, ml_recommendation, test_df, some_user_id)}")
print(f"Recall@10: {ml_recall_at_k(10, ml_recommendation, test_df, some_user_id)}")
print(f"Recall@15: {ml_recall_at_k(15, ml_recommendation, test_df, some_user_id)}")
print("-" * 100)
print(f"F1@5: {ml_f1_at_k(5,   ml_recommendation, test_df, some_user_id)}")
print(f"F1@10: {ml_f1_at_k(10, ml_recommendation, test_df, some_user_id)}")
print(f"F1@15: {ml_f1_at_k(15, ml_recommendation, test_df, some_user_id)}")

Precision@5 : 0.25
Precision@10: 0.2
Precision@15: 0.13333333333333333
----------------------------------------------------------------------------------------------------
Recall@5: 0.007751937984496124
Recall@10: 0.015503875968992248
Recall@15: 0.015503875968992248
----------------------------------------------------------------------------------------------------
F1@5: 0.014925373134256514
F1@10: 0.02877697841713265
F1@15: 0.027777777777591144


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  features_df['Year'].fillna(average_year, inplace=True)
