In [1]:
%load_ext autoreload
%autoreload 2

import IPython
from pathlib import Path
import os
locals = IPython.extract_module_locals() # type: ignore
notebook_name = "/".join(locals[1]["__vsc_ipynb_file__"].split("/"))
os.chdir(Path(notebook_name).parent.parent.parent)

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
from pycleora import SparseMatrix
from tqdm import tqdm

from recsys.evaluation.metrics import map_k, precision_k, recall_k
from recsys.evaluation.evaluation import recommendation_relevance

In [3]:
base_path = Path(".data/movielens/base")

movies = pd.read_csv(base_path / "movies.csv")
links = pd.read_csv(base_path / "links.csv")
tags = pd.read_csv(base_path / "tags.csv")

intermediate_path = Path(".data/movielens/intermediate/1")

ratings = pd.read_parquet(intermediate_path / "ratings.parquet")
ratings_train = pd.read_parquet(intermediate_path / "ratings_train.parquet")
ratings_validation = pd.read_parquet(intermediate_path / "ratings_validation.parquet")
user_id_map = pd.read_parquet(intermediate_path / "user_id_map.parquet")
movie_id_map = pd.read_parquet(intermediate_path / "movie_id_map.parquet")

In [4]:
n_users = user_id_map["userId"].nunique()
n_items = movie_id_map['movieId'].nunique()

print(n_users, n_items, ratings.shape[0])

162414 47396 15630129


In [5]:
customer_products = ratings_train.astype({"item_id": str}).groupby('session_id')['item_id'].apply(list).values
cleora_input = map(lambda x: ' '.join(x), customer_products)
graph = SparseMatrix.from_iterator(cleora_input, columns='complex::reflexive::product')
embeddings = graph.initialize_deterministically(1024)
n_walks = 2

for i in range(n_walks):
    embeddings = graph.left_markov_propagate(embeddings)
    embeddings /= np.linalg.norm(embeddings, ord=2, axis=-1, keepdims=True)

embeddings_sorted = embeddings[np.argsort(np.array(graph.entity_ids).astype(int))]

In [6]:
def build_session_embedding(item_ids):
    return embeddings_sorted[item_ids].mean(axis=0)

user_embedding_series = ratings_train.groupby('session_id')['item_id'].apply(build_session_embedding)
user_embeddings = np.vstack(user_embedding_series.values)

In [7]:
user_embeddings_tensor = torch.from_numpy(user_embeddings)
embeddings_tensor = torch.from_numpy(embeddings_sorted)

In [8]:
N_candid = 300

recommendations = []
for batch in tqdm(torch.split(user_embeddings_tensor, 1000)):
    similarity = torch.matmul(batch, embeddings_tensor.T)
    recommendations.append(similarity.topk(N_candid, dim=1).indices)
recommendations = torch.vstack(recommendations)

100%|██████████| 162/162 [00:17<00:00,  9.23it/s]


In [9]:
most_popular_items = ratings_train[["session_id", 'item_id']].drop_duplicates()["item_id"].value_counts()
recommendations_cleora = pd.DataFrame({"session_id": user_embedding_series.index, "candidates": recommendations.numpy().tolist()})
missing_indices = np.setdiff1d(np.arange(n_users), user_embedding_series.index)
most_popular_list = most_popular_items.index[:N_candid].tolist()
missing_recommendations_df = pd.DataFrame({
    'session_id': missing_indices,
    'candidates': [most_popular_list] * len(missing_indices)
})
recommendations_full = pd.concat([recommendations_cleora, missing_recommendations_df], axis=0).sort_values("session_id")

In [10]:
ground_truth = torch.from_numpy(ratings_validation[["session_id", "item_id"]].values).T

In [12]:
def recall_k_rel(rel, rel_sum, rel_mask) -> torch.Tensor:
    return torch.mean(torch.sum(rel[rel_mask], dim=1) / rel_sum[rel_mask])


def precision_k_rel(rel, rel_sum, rel_mask) -> torch.Tensor:
    return torch.mean(torch.mean(rel[rel_mask], dim=1))

In [15]:
recommendations_tensor = torch.from_numpy(np.array(recommendations_full['candidates'].tolist()))
users_idx = torch.from_numpy(recommendations_full['session_id'].values)


for k in [5, 10, 50, 100, 200, 300]:
    recommendations_k = recommendations_tensor[:, :k]
    rel_output = recommendation_relevance(recommendations_k, ground_truth, users_idx=users_idx, n_users=n_users, n_items=n_items)

    prec = precision_k_rel(*rel_output)
    rec = recall_k_rel(*rel_output)

    print(f"MAP@{k}: {0:.6f} | Precision@{k}: {prec:.6f} | Recall@{k}: {rec:.6f}")

MAP@5: 0.000000 | Precision@5: 0.089534 | Recall@5: 0.009315
MAP@10: 0.000000 | Precision@10: 0.080695 | Recall@10: 0.016743
MAP@50: 0.000000 | Precision@50: 0.054934 | Recall@50: 0.051867
MAP@100: 0.000000 | Precision@100: 0.042858 | Recall@100: 0.079674
MAP@200: 0.000000 | Precision@200: 0.032318 | Recall@200: 0.116187
MAP@300: 0.000000 | Precision@300: 0.026903 | Recall@300: 0.142276


In [16]:
movies2 = movies.merge(movie_id_map, on="movieId").drop(["movieId", "title"], axis=1)
movies_genres_exploded = movies2["genres"].str.split("|").explode()
genres_map = {genre:i for i, genre in enumerate(movies_genres_exploded.unique())}
genres_one_hot = pd.get_dummies(movies_genres_exploded).astype(np.int8).groupby(level=0).sum()
item_features = genres_one_hot.reset_index().rename(columns={'index': 'item_id'})

In [17]:
item_features

Unnamed: 0,item_id,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,3,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,4,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47391,47391,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
47392,47392,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
47393,47393,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47394,47394,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [18]:
user_features = pd.DataFrame({"session_id": range(n_users)})

mean_ratings = ratings_train.groupby("session_id")["rating"].mean().reset_index(name="mean_rating")
average_mean_rating = mean_ratings["mean_rating"].mean()
user_features = user_features.merge(mean_ratings, on="session_id", how="left").fillna(average_mean_rating)

def get_categories_agg(x):
    return genres_one_hot.iloc[x["item_id"]].sum(axis=0)

user_categories_agg = ratings_train.groupby("session_id").apply(get_categories_agg).reset_index()
average_user_categories_agg = user_categories_agg.drop("session_id", axis=1).mean(axis=0)

categories_cols = user_categories_agg.drop("session_id", axis=1).columns

user_features = user_features.merge(user_categories_agg, on="session_id", how="left")
user_features = user_features.fillna(average_user_categories_agg)

  user_categories_agg = ratings_train.groupby("session_id").apply(get_categories_agg).reset_index()


In [19]:
user_features

Unnamed: 0,session_id,mean_rating,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,4.416667,0.000000,3.000000,5.000000,5.000000,6.00000,8.00000,3.000000,2.000000,...,1.000000,0.000000,0.000000,5.00000,0.000000,4.000000,3.000000,6.00000,1.000000,0.000000
1,1,4.373333,0.000000,12.000000,8.000000,2.000000,8.00000,32.00000,7.000000,2.000000,...,1.000000,4.000000,1.000000,2.00000,4.000000,18.000000,6.000000,20.00000,1.000000,0.000000
2,2,4.379310,0.000000,16.000000,15.000000,5.000000,14.00000,46.00000,13.000000,4.000000,...,1.000000,4.000000,1.000000,2.00000,5.000000,27.000000,9.000000,22.00000,5.000000,0.000000
3,3,4.250000,0.000000,7.000000,3.000000,0.000000,2.00000,15.00000,4.000000,1.000000,...,0.000000,1.000000,0.000000,1.00000,3.000000,11.000000,3.000000,9.00000,1.000000,0.000000
4,4,4.500000,0.000000,6.000000,3.000000,1.000000,2.00000,16.00000,5.000000,3.000000,...,0.000000,2.000000,0.000000,1.00000,1.000000,9.000000,2.000000,10.00000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162409,162409,4.284805,0.282191,12.037779,9.929843,2.824066,5.44789,32.20178,8.900736,4.558981,...,1.316569,8.073843,0.337255,3.88347,5.085948,17.452297,7.443396,15.15876,4.249665,1.510183
162410,162410,4.284805,0.282191,12.037779,9.929843,2.824066,5.44789,32.20178,8.900736,4.558981,...,1.316569,8.073843,0.337255,3.88347,5.085948,17.452297,7.443396,15.15876,4.249665,1.510183
162411,162411,4.284805,0.282191,12.037779,9.929843,2.824066,5.44789,32.20178,8.900736,4.558981,...,1.316569,8.073843,0.337255,3.88347,5.085948,17.452297,7.443396,15.15876,4.249665,1.510183
162412,162412,4.284805,0.282191,12.037779,9.929843,2.824066,5.44789,32.20178,8.900736,4.558981,...,1.316569,8.073843,0.337255,3.88347,5.085948,17.452297,7.443396,15.15876,4.249665,1.510183


In [20]:
rel, rel_sum, rel_mask = recommendation_relevance(recommendations_tensor, ground_truth, users_idx=users_idx, n_users=n_users, n_items=n_items)

In [21]:
true_indices = rel_mask.nonzero(as_tuple=True)[0]
true_indices_2d = true_indices.repeat(N_candid, 1).T

In [22]:
triples = torch.stack([true_indices_2d, recommendations_tensor[rel_mask], rel[rel_mask]], dim=-1)
triples_list = [tuple(triple) for triple in triples.reshape(-1, 3).tolist()]

In [23]:
df = pd.DataFrame(triples_list, columns=["session_id", "item_id", "label"]).astype(int)

In [24]:
df_features = df.merge(user_features).merge(item_features, on="item_id")

In [25]:
from sklearn.model_selection import train_test_split

# Split the users into training and testing sets
train_users, test_users = train_test_split(df_features['session_id'].unique(), test_size=0.25, random_state=42)

print(f"Number of training users: {len(train_users)}")
print(f"Number of testing users: {len(test_users)}")

Number of training users: 1705
Number of testing users: 569


In [26]:
train_df = df_features[df_features["session_id"].isin(train_users)]
test_df = df_features[df_features["session_id"].isin(test_users)]

In [27]:
train_df

Unnamed: 0,session_id,item_id,label,mean_rating,(no genres listed)_x,Action_x,Adventure_x,Animation_x,Children_x,Comedy_x,...,Film-Noir_y,Horror_y,IMAX_y,Musical_y,Mystery_y,Romance_y,Sci-Fi_y,Thriller_y,War_y,Western_y
0,30269,7788,0,3.944060,25.000000,251.000000,158.000000,55.000000,84.00000,641.00000,...,0,0,0,0,0,1,0,0,0,0
1,30269,1814,0,3.944060,25.000000,251.000000,158.000000,55.000000,84.00000,641.00000,...,0,0,0,1,0,1,0,0,0,0
2,30269,7934,0,3.944060,25.000000,251.000000,158.000000,55.000000,84.00000,641.00000,...,0,0,0,0,0,0,0,0,1,0
3,30269,9233,0,3.944060,25.000000,251.000000,158.000000,55.000000,84.00000,641.00000,...,0,0,0,0,0,0,0,0,0,0
4,30269,5929,0,3.944060,25.000000,251.000000,158.000000,55.000000,84.00000,641.00000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
682195,162413,23172,1,4.284805,0.282191,12.037779,9.929843,2.824066,5.44789,32.20178,...,0,1,0,0,0,0,0,0,0,0
682196,162413,1120,0,4.284805,0.282191,12.037779,9.929843,2.824066,5.44789,32.20178,...,0,0,0,0,0,0,0,0,0,0
682197,162413,13558,0,4.284805,0.282191,12.037779,9.929843,2.824066,5.44789,32.20178,...,0,1,0,0,0,0,1,1,0,0
682198,162413,19263,0,4.284805,0.282191,12.037779,9.929843,2.824066,5.44789,32.20178,...,0,1,0,0,0,0,0,1,0,0


In [28]:
test_df

Unnamed: 0,session_id,item_id,label,mean_rating,(no genres listed)_x,Action_x,Adventure_x,Animation_x,Children_x,Comedy_x,...,Film-Noir_y,Horror_y,IMAX_y,Musical_y,Mystery_y,Romance_y,Sci-Fi_y,Thriller_y,War_y,Western_y
2400,33497,1101,0,4.162011,1.000000,31.000000,33.000000,11.000000,21.00000,137.00000,...,0,0,0,0,1,0,0,0,0,0
2401,33497,6577,0,4.162011,1.000000,31.000000,33.000000,11.000000,21.00000,137.00000,...,0,0,0,1,0,1,0,0,0,0
2402,33497,7035,0,4.162011,1.000000,31.000000,33.000000,11.000000,21.00000,137.00000,...,0,0,0,0,0,0,0,0,0,0
2403,33497,23095,0,4.162011,1.000000,31.000000,33.000000,11.000000,21.00000,137.00000,...,0,0,0,0,0,0,0,1,0,0
2404,33497,8122,0,4.162011,1.000000,31.000000,33.000000,11.000000,21.00000,137.00000,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
679795,162405,23172,0,4.284805,0.282191,12.037779,9.929843,2.824066,5.44789,32.20178,...,0,1,0,0,0,0,0,0,0,0
679796,162405,1120,0,4.284805,0.282191,12.037779,9.929843,2.824066,5.44789,32.20178,...,0,0,0,0,0,0,0,0,0,0
679797,162405,13558,0,4.284805,0.282191,12.037779,9.929843,2.824066,5.44789,32.20178,...,0,1,0,0,0,0,1,1,0,0
679798,162405,19263,0,4.284805,0.282191,12.037779,9.929843,2.824066,5.44789,32.20178,...,0,1,0,0,0,0,0,1,0,0


In [29]:
import lightgbm as lgb
from sklearn.metrics import log_loss, accuracy_score

feature_cols = [col for col in train_df.columns if col not in ['label']]

X_train = train_df[feature_cols]
y_train = train_df["label"]

X_test = test_df[feature_cols]
y_test = test_df["label"]

# Train using a binary objective – during inference we'll use predict_proba
model = lgb.LGBMClassifier(objective='binary', random_state=42)
model.fit(X_train, y_train)

# Use predict_proba to obtain real-valued scores
y_pred_scores = model.predict_proba(X_test)[:, 1]
loss = log_loss(y_test, y_pred_scores)
# Convert scores to binary labels at threshold 0.5 for accuracy calculation
accuracy = accuracy_score(y_test, (y_pred_scores > 0.5).astype(int))
print(f"Test Log Loss: {loss:.6f}")
print(f"Test Accuracy: {accuracy:.6f}")

def reranker(model, X_test, df):
    """
    Predict scores for each candidate using the trained model.
    Groups items by session_id and sorts item_ids in descending order.
    Returns a dataframe with session_id and the ranked candidates as a list.
    """
    # Get the real valued scores
    scores = model.predict_proba(X_test)[:, 1]
    df = df.copy()
    df['score'] = scores
    # Group by session_id and sort items in descending order based on their score
    reranked = df.groupby('session_id').apply(lambda x: x.sort_values(by='score', ascending=False)['item_id'].tolist()).reset_index(name='candidates')
    return reranked

# Example usage:
reranked_df = reranker(model, X_test, test_df)
print(reranked_df.head())

[LightGBM] [Info] Number of positive: 14082, number of negative: 497418
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015026 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3416
[LightGBM] [Info] Number of data points in the train set: 511500, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.027531 -> initscore=-3.564533
[LightGBM] [Info] Start training from score -3.564533
Test Log Loss: 0.168724
Test Accuracy: 0.970410
   session_id                                         candidates
0       33497  [20371, 8690, 1047, 1051, 970, 851, 187, 566, ...
1       36315  [22355, 22359, 23496, 7864, 7931, 7972, 7781, ...
2       37164  [15814, 15797, 1039, 1046, 1044, 4202, 3034, 9...
3       37495  [1039, 1044, 4202, 3034, 958, 970, 974, 1972, ...
4       38144  [108, 970, 851, 15797, 1039, 1048, 1051, 1047,...


  reranked = df.groupby('session_id').apply(lambda x: x.sort_values(by='score', ascending=False)['item_id'].tolist()).reset_index(name='candidates')


In [30]:
reranked_df

Unnamed: 0,session_id,candidates
0,33497,"[20371, 8690, 1047, 1051, 970, 851, 187, 566, ..."
1,36315,"[22355, 22359, 23496, 7864, 7931, 7972, 7781, ..."
2,37164,"[15814, 15797, 1039, 1046, 1044, 4202, 3034, 9..."
3,37495,"[1039, 1044, 4202, 3034, 958, 970, 974, 1972, ..."
4,38144,"[108, 970, 851, 15797, 1039, 1048, 1051, 1047,..."
...,...,...
564,162384,"[13219, 9834, 185, 5613, 2363, 6804, 19425, 27..."
565,162385,"[13219, 9834, 185, 5613, 2363, 6804, 19425, 27..."
566,162391,"[13219, 9834, 185, 2363, 5613, 19425, 6804, 10..."
567,162403,"[13219, 9834, 5613, 185, 6804, 19425, 166, 981..."


In [31]:
recommendations_full_reranked = recommendations_full
recommendations_full_reranked = recommendations_full_reranked[~recommendations_full_reranked["session_id"].isin(test_users)]
recommendations_full_reranked = pd.concat([recommendations_full_reranked,reranked_df]).sort_values(by="session_id")

In [32]:
ratings_validation2 = ratings_validation[ratings_validation["session_id"].isin(test_users)]
ground_truth = torch.from_numpy(ratings_validation2[["session_id", "item_id"]].values).T

In [34]:
recommendations_tensor = torch.from_numpy(np.array(recommendations_full['candidates'].tolist()))
users_idx = torch.from_numpy(recommendations_full['session_id'].values)

for k in [5, 10, 50, 100, 200, 300]:
    recommendations_k = recommendations_tensor[:, :k]
    rel_output = recommendation_relevance(recommendations_k, ground_truth, users_idx=users_idx, n_users=n_users, n_items=n_items)

    prec = precision_k_rel(*rel_output)
    rec = recall_k_rel(*rel_output)

    print(f"MAP@{k}: {0:.6f} | Precision@{k}: {prec:.6f} | Recall@{k}: {rec:.6f}")

MAP@5: 0.000000 | Precision@5: 0.083304 | Recall@5: 0.008211
MAP@10: 0.000000 | Precision@10: 0.076098 | Recall@10: 0.015451
MAP@50: 0.000000 | Precision@50: 0.051810 | Recall@50: 0.050152
MAP@100: 0.000000 | Precision@100: 0.040650 | Recall@100: 0.082717
MAP@200: 0.000000 | Precision@200: 0.030351 | Recall@200: 0.110737
MAP@300: 0.000000 | Precision@300: 0.025021 | Recall@300: 0.131867


In [35]:
recommendations_tensor = torch.from_numpy(np.array(recommendations_full_reranked['candidates'].tolist()))
users_idx = torch.from_numpy(recommendations_full_reranked['session_id'].values)

for k in [5, 10, 50, 100, 200, 300]:
    recommendations_k = recommendations_tensor[:, :k]
    rel_output = recommendation_relevance(recommendations_k, ground_truth, users_idx=users_idx, n_users=n_users, n_items=n_items)

    prec = precision_k_rel(*rel_output)
    rec = recall_k_rel(*rel_output)

    print(f"MAP@{k}: {0:.6f} | Precision@{k}: {prec:.6f} | Recall@{k}: {rec:.6f}")

MAP@5: 0.000000 | Precision@5: 0.086116 | Recall@5: 0.009465
MAP@10: 0.000000 | Precision@10: 0.083128 | Recall@10: 0.017697
MAP@50: 0.000000 | Precision@50: 0.054060 | Recall@50: 0.055579
MAP@100: 0.000000 | Precision@100: 0.042935 | Recall@100: 0.084544
MAP@200: 0.000000 | Precision@200: 0.032452 | Recall@200: 0.116838
MAP@300: 0.000000 | Precision@300: 0.025021 | Recall@300: 0.131867
