In [1]:
import os
ROOT = os.path.join('..', '..')
import sys
sys.path.append(ROOT)
# 
from tqdm import tqdm
import numpy as np
import pandas as pd
import polars as pl
import altair as alt
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
class PageRankRecommender():
    def __init__(self, ml_movies_df, ml_users_df):
        self.ml_movies_df = ml_movies_df
        self.ml_users_df = ml_users_df
        self.A_norm = None
        if os.path.exists("A_norm.npy"):
            self.A_norm = np.load("A_norm.npy")
    
    def fit(self, ml_ratings_train_df):
        if self.A_norm is None:
            print("Initializing adjacency matrix...")
            self._init_adjacency_matrix(ml_ratings_train_df)
            np.save("A_norm.npy", self.A_norm)
        self.pagerank_scores = self._pagerank(self.A_norm)
        self.ml_ratings_train_df = ml_ratings_train_df
    
    def predict(self, user_id, n_recommendations):
        movies_ids = self.ml_movies_df['MovieID'].values
        user_rated_movies_idx = self.ml_ratings_train_df[self.ml_ratings_train_df["UserID"] == user_id]["MovieID"].values
        scores = pd.DataFrame(self.pagerank_scores, index=movies_ids, columns=["Score"])\
            .sort_values("Score", ascending=False)\
            .drop(user_rated_movies_idx, errors='ignore').head(n_recommendations)
        return scores
    
    def evaluate(ml_ratings_test_df, k):
        pass
    
    def _init_adjacency_matrix(self, ml_ratings_train_df):
        movies_ids = self.ml_movies_df['MovieID'].values
        A_df = pd.DataFrame(0, index=movies_ids, columns=movies_ids)
        user_groups = ml_ratings_train_df.sort_values("Rating", ascending=False).groupby("UserID")
        for _, user_group in tqdm(user_groups):
            ratings = user_group["Rating"].values
            movie_ids = user_group["MovieID"].values
            diff_matrix = -np.subtract.outer(ratings, ratings)
            diff_matrix = np.maximum(diff_matrix, 0)
            A_df.loc[movie_ids, movie_ids] += diff_matrix
        A = A_df.values
        row_sums = A.sum(axis=1, keepdims=True)
        row_sums[row_sums == 0] = 1  # Handle rows with all zeros
        A_norm = A / row_sums
        self.A_norm = A_norm
        
    def _pagerank(self, A, d=0.85, eps=1e-8, max_iter=100):
        N = A.shape[0]
        p = np.ones(N) / N
        for _ in tqdm(range(max_iter)):
            p_new = (1 - d) / N + d * A.T.dot(p)
            if np.linalg.norm(p_new - p) < eps:
                break
            p = p_new
        return p

In [3]:
from scripts.data import (
    ml_ratings_df, ml_movies_df, ml_users_df, ml_df, ml_genres,
    bc_ratings_df, bc_books_df, bc_users_df, bc_df,
)
from src.metrics import (
    ml_precision_at_k, ml_recall_at_k, ml_f1_at_k,
    ml_precision_at_k_pl, ml_recall_at_k_pl, ml_f1_at_k_pl
)

In [4]:
idx = pl.Series(range(ml_ratings_df.select(pl.len()).collect().item()))
idx = idx.shuffle(seed=42)
train_size = int(len(idx) * 0.8)
train_idx = idx[:train_size]
test_idx = idx[train_size:]

ml_ratings_train_df = ml_ratings_df.collect()[train_idx, :]
ml_ratings_test_df = ml_ratings_df.collect()[test_idx, :]
ml_ratings_train_df.select(pl.len()).item(), ml_ratings_test_df.select(pl.len()).item()

(800167, 200042)

In [5]:
ml_movies_df = ml_movies_df.collect().to_pandas()
ml_ratings_train_df = ml_ratings_train_df.to_pandas()
ml_ratings_test_df = ml_ratings_test_df.to_pandas()
ml_users_df = ml_users_df.collect().to_pandas()

In [6]:
recommender = PageRankRecommender(ml_movies_df, ml_users_df)
recommender.fit(ml_ratings_train_df)

 22%|██▏       | 22/100 [00:00<00:01, 77.60it/s]


In [7]:
recommendations = recommender.predict(1, 100)
recommendations = recommendations.reset_index()
recommendations.columns = ["MovieID", "Score"]
recommendations

Unnamed: 0,MovieID,Score
0,259,0.003388
1,857,0.003024
2,607,0.002841
3,526,0.002793
4,295,0.002678
...,...,...
95,1673,0.001145
96,2691,0.001141
97,1672,0.001136
98,2699,0.001135


In [14]:
best_metric = -1
best_user_id = 0
for user_id in ml_ratings_test_df["UserID"].unique():
    metric = ml_precision_at_k(5, recommendations, ml_ratings_test_df, user_id, True)
    if metric > best_metric:
        best_metric = metric
        best_user_id = user_id
        print(f"User {user_id}: {metric}")

User 2104: 0.4
User 5746: 0.6
User 1723: 0.8


In [9]:
recommender.A_norm

array([[0.00000000e+00, 2.91283629e-04, 2.44678248e-04, ...,
        3.49540354e-05, 4.66053806e-05, 2.79632284e-04],
       [2.69146852e-03, 0.00000000e+00, 3.26479910e-04, ...,
        9.55550955e-05, 4.77775477e-05, 4.22035005e-04],
       [2.26741084e-03, 5.84807321e-04, 0.00000000e+00, ...,
        9.23379981e-05, 4.10391103e-05, 3.48832437e-04],
       ...,
       [1.32749237e-03, 0.00000000e+00, 1.32749237e-04, ...,
        0.00000000e+00, 7.96495420e-04, 1.19474313e-03],
       [9.31098696e-04, 0.00000000e+00, 0.00000000e+00, ...,
        2.32774674e-04, 0.00000000e+00, 1.39664804e-03],
       [1.78828959e-03, 1.73060283e-04, 1.73060283e-04, ...,
        2.01903663e-04, 2.88433804e-04, 0.00000000e+00]])