In [1]:
import polars as pl
from scripts.data import ml_df
from src.metrics import ml_precision_at_k

# Train - Test Split

In [2]:
# Chronological split: order the ratings by "Timestamp" so that the training
# set is the earliest 80% of rows and the test set is the remaining, most
# recent rows.
sorted_df = ml_df.sort("Timestamp")

# NOTE(review): `.collect()` suggests ml_df is a polars LazyFrame, yet
# `.to_pandas()` is called below without a prior collect — confirm the actual
# type of ml_df exported by scripts.data.
df_size = ml_df.collect().height
train_size = int(df_size * 0.8)

# head/tail partition the sorted rows without overlap:
# train_size rows + (df_size - train_size) rows == df_size rows.
train_df = sorted_df.head(train_size).to_pandas()
test_df = sorted_df.tail(df_size - train_size).to_pandas()

# Item-Item Collaborative Filtering

We recommend to the user items that are similar to an item they have previously rated. Each item is described by the ratings it received from other users.

In [15]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def item_item_recommender(pandas_df, movie_id, n=10):
    # Pivot the DataFrame to create a user-movie matrix with entries as ratings
    user_movie_matrix = pandas_df.pivot_table(index='UserID', columns='MovieID', values='Rating').fillna(0)

    # Calculate the mean only for non-zero ratings (excluding zeros)
    mask = user_movie_matrix != 0
    means = user_movie_matrix.where(mask).mean(axis=1).fillna(0)  # Fill NaN means with 0 if a user has no ratings

    # Normalize the matrix by subtracting the mean ratings only for non-zero entries
    normalized_matrix = user_movie_matrix.where(mask).sub(means, axis=0).fillna(0)

    # Transpose the matrix to get a movie-user matrix for similarity calculation
    movie_user_matrix = normalized_matrix.T

    # Compute cosine similarity between movies
    similarity_matrix = cosine_similarity(movie_user_matrix)

    # Create a DataFrame for the similarity matrix with movie IDs as both row and column indices
    similarity_df = pd.DataFrame(similarity_matrix, index=movie_user_matrix.index, columns=movie_user_matrix.index)

    # Get the movies sorted by highest similarity to the given movie_id, excluding itself
    sorted_similar_movies = similarity_df[movie_id].sort_values(ascending=False)
    most_similar_movies = sorted_similar_movies.iloc[1:n+1].index.tolist()

    return most_similar_movies


# Example query: ask for the 5 movies most similar to MovieID 3, using only
# the ratings in the training split.
some_movie_id = 3
recommended_movies = item_item_recommender(train_df, some_movie_id, 5)

In [24]:
def display_movie_details(pandas_df, recommended_movie_ids):
    """Look up descriptive columns for a set of recommended movies.

    Args:
        pandas_df: DataFrame holding a 'MovieID' column plus the title, genre,
            year and per-genre flag columns listed below.
        recommended_movie_ids: Iterable of MovieIDs to look up.

    Returns:
        DataFrame with one row per matching movie (the first row found for
        each MovieID, in the order rows appear in ``pandas_df``) restricted to
        the descriptive columns.
    """
    genre_flag_columns = [
        'IsRomance', 'IsWar', 'IsComedy', 'IsFantasy', 'IsCrime', 'IsAction',
        'IsFilm-Noir', 'IsChildren\'s', 'IsThriller', 'IsMystery', 'IsDrama',
        'IsSci-Fi', 'IsAnimation', 'IsMusical', 'IsWestern', 'IsDocumentary',
        'IsAdventure', 'IsHorror',
    ]
    wanted_columns = ['Title', 'Genres', 'Year'] + genre_flag_columns

    is_recommended = pandas_df['MovieID'].isin(recommended_movie_ids)

    # The frame has one row per rating, so keep a single row per movie before
    # narrowing down to the descriptive columns.
    return (
        pandas_df.loc[is_recommended]
        .drop_duplicates(subset='MovieID')
        .loc[:, wanted_columns]
    )

In [25]:
# Show title, genre and year details for the recommended movies, looked up in
# the training split.
display_movie_details(train_df, recommended_movies)

Unnamed: 0,Title,Genres,Year,IsRomance,IsWar,IsComedy,IsFantasy,IsCrime,IsAction,IsFilm-Noir,...,IsThriller,IsMystery,IsDrama,IsSci-Fi,IsAnimation,IsMusical,IsWestern,IsDocumentary,IsAdventure,IsHorror
785,Bodyguard,,,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
869,Grumpy Old Men,[Comedy],1993.0,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1894,Mirror Has Two Faces,,,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1941,Friday the 13th Part 3: 3D,[Horror],1982.0,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2289,Days of Thunder,"[Action, Romance]",1990.0,True,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
