In [1]:
%load_ext autoreload
%autoreload 2

import IPython
from pathlib import Path
import os
# Locate this notebook on disk and switch the working directory to the folder
# three levels above it (presumably the repository root), so the relative
# ".data/..." and ".runs/..." paths used below resolve from a fixed location.
# `extract_module_locals()` returns a (module, namespace) pair; the namespace of
# a VS Code notebook kernel carries the notebook path under `__vsc_ipynb_file__`,
# so this lookup only works when the notebook is run from VS Code.
# The result is kept under a private name so the builtin `locals` is not shadowed.
_module_locals = IPython.extract_module_locals()  # type: ignore
notebook_name = _module_locals[1]["__vsc_ipynb_file__"]
os.chdir(Path(notebook_name).parent.parent.parent)

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
from tqdm import tqdm
from sklearn.model_selection import train_test_split

from recsys.evaluation.metrics import map_k, precision_k, recall_k
from recsys.evaluation.evaluation import recommendation_relevance
from recsys.data.utils import filter_set
from recsys.utils import create_log_dir, load_model, save_model, set_seed

# Single seed for the notebook: handed to the project's `set_seed` helper here
# and reused as `random_state` for the LightGBM re-ranker further down.
SEED = 0
set_seed(SEED)

2025-02-17 20:25:46,544 - recsys.utils - INFO - Setting seed to 0


In [3]:
# Raw MovieLens tables. Only `movies` is referenced again in this notebook;
# `links` and `tags` are loaded but not used by any later cell.
base_path = Path(".data/movielens/base")

movies = pd.read_csv(base_path / "movies.csv")
links = pd.read_csv(base_path / "links.csv")
tags = pd.read_csv(base_path / "tags.csv")

# Pre-processed artefacts written by an earlier pipeline stage (not shown here):
# the full ratings table, its train/validation split, and the id mapping tables.
intermediate_path = Path(".data/movielens/intermediate/1")

ratings = pd.read_parquet(intermediate_path / "ratings.parquet")
ratings_train = pd.read_parquet(intermediate_path / "ratings_train.parquet")
ratings_validation = pd.read_parquet(intermediate_path / "ratings_validation.parquet")
user_id_map = pd.read_parquet(intermediate_path / "user_id_map.parquet")
movie_id_map = pd.read_parquet(intermediate_path / "movie_id_map.parquet")

In [4]:
# Entity counts taken from the id-mapping tables; they size the embedding tables
# of the MF model and are passed to the evaluation helpers below.
n_users = user_id_map["userId"].nunique()
n_items = movie_id_map['movieId'].nunique()

# Users, items, total number of ratings.
print(n_users, n_items, ratings.shape[0])

162414 47396 15630129


In [5]:
from dataclasses import dataclass, asdict
from functools import cached_property
from typing import Any
from sklearn.metrics import roc_auc_score
from types import SimpleNamespace

import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, IterableDataset, DataLoader

def collate_fn(batch):
    """Flatten a list of per-sample dicts into a single training batch.

    Every sample carries ``u_id``, ``i_id`` and ``target`` tensors; for each of
    those keys the tensors of all samples are concatenated along the first
    dimension, in sample order.
    """
    return {
        key: torch.cat([sample[key] for sample in batch])
        for key in ("u_id", "i_id", "target")
    }


def eval_collate_fn(batch):
    """Concatenate the ``u_id`` tensors of all samples into one evaluation batch."""
    user_chunks = tuple(sample["u_id"] for sample in batch)
    return {"u_id": torch.cat(user_chunks)}


def approx_neg_sampl(n_items: int, neg_sampl: int) -> torch.Tensor:
    """Draw ``neg_sampl`` item ids uniformly at random from ``[0, n_items)``.

    The sampling is approximate: ids are drawn with replacement and nothing
    here excludes items the user has actually interacted with.

    Returns:
        An ``int32`` tensor of shape ``(neg_sampl,)``.
    """
    shape = (neg_sampl,)
    return torch.randint(0, n_items, shape, dtype=torch.int32)


def batch_dict_to_device(batch: dict[str, Any], device: torch.device) -> dict[str, Any]:
    """Return a new dict with every value of ``batch`` moved to ``device`` via ``.to``."""
    moved: dict[str, Any] = {}
    for key, value in batch.items():
        moved[key] = value.to(device)
    return moved


class DotProd(nn.Module):
    """Dot product of paired vectors: multiply element-wise, then sum over dim 1."""

    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        return (x * y).sum(dim=1)


class MFDataset(Dataset):
    """Map-style dataset of (user, item) interactions with random negatives.

    Indexing returns one observed interaction expanded into ``neg_sampl + 1``
    rows: the observed pair with target 1.0 followed by ``neg_sampl`` randomly
    drawn items for the same user with target 0.0.

    Args:
        relations: interaction frame; column 0 is read as the user id and
            column 1 as the item id.
        users: user ids; only the number of distinct values is used.
        items: item ids; only the number of distinct values is used.
        namings: accepted for interface compatibility; not read by this class.
        neg_sampl: number of negatives drawn per observed interaction.
    """

    def __init__(self, relations: pd.DataFrame, users: pd.DataFrame, items: pd.DataFrame, namings: dict[str, str], neg_sampl: int = 5):
        interactions = torch.from_numpy(relations.values)
        self._df = interactions.to(torch.int32)

        distinct_users = torch.from_numpy(users.unique())
        self._users = distinct_users.to(torch.float32)

        distinct_items = torch.from_numpy(items.unique())
        self._items = distinct_items.to(torch.float32)

        self._neg_sampl = neg_sampl

    @property
    def _n_users(self) -> int:
        """Number of distinct user ids seen at construction."""
        return len(self._users)

    @property
    def _n_items(self) -> int:
        """Number of distinct item ids seen at construction."""
        return len(self._items)

    @cached_property
    def users_set(self) -> torch.Tensor:
        """``0 .. _n_users - 1`` as an ``int32`` tensor (computed once)."""
        return torch.arange(0, self._n_users, dtype=torch.int32)

    @cached_property
    def items_set(self) -> torch.Tensor:
        """``0 .. _n_items - 1`` as an ``int32`` tensor (computed once)."""
        return torch.arange(0, self._n_items, dtype=torch.int32)

    @cached_property
    def ground_truth(self) -> torch.Tensor:
        """Interactions transposed, i.e. one row per column of ``relations``."""
        return self._df.T

    def __len__(self):
        return len(self._df)

    def __getitem__(self, idx: int) -> dict[str, Any]:
        row = self._df[idx]
        n_rows = self._neg_sampl + 1

        # Same user id on every row of the expanded sample.
        u_id = row[0].unsqueeze(0).repeat(n_rows)

        # Observed item first, sampled negatives after it.
        negatives = approx_neg_sampl(self._n_items, self._neg_sampl)
        i_id = torch.cat([row[1].unsqueeze(0), negatives])

        # 1.0 for the observed pair, 0.0 for every negative.
        target = torch.zeros(n_rows, dtype=torch.float)
        target[0] = 1.0

        return {"u_id": u_id, "i_id": i_id, "target": target}


class MFEvalDataset(IterableDataset):
    """Iterable dataset that yields chunks of user indices for batched recommendation.

    It wraps an ``MFDataset`` and re-exposes its ``users_set``, ``items_set``
    and ``ground_truth``; iteration walks ``users_set`` in consecutive chunks
    of at most ``user_batch_size`` ids.

    Args:
        base_dataset: dataset providing ``users_set``, ``items_set`` and
            ``ground_truth``.
        user_batch_size: maximum number of user ids per yielded chunk; must be
            positive.

    Raises:
        ValueError: if ``user_batch_size`` is not positive.
    """

    def __init__(self, base_dataset: "MFDataset", user_batch_size: int):
        super().__init__()
        if user_batch_size <= 0:
            raise ValueError(f"user_batch_size must be positive, got {user_batch_size}")
        self._base_dataset = base_dataset
        self._user_batch_size = user_batch_size

    @property
    def users_set(self) -> torch.Tensor:
        return self._base_dataset.users_set

    @property
    def items_set(self) -> torch.Tensor:
        return self._base_dataset.items_set

    @property
    def ground_truth(self) -> torch.Tensor:
        return self._base_dataset.ground_truth

    def __len__(self):
        """Number of chunks produced by ``__iter__``.

        Uses ceiling division: the previous ``n // size + 1`` reported one
        chunk too many whenever the user count was an exact multiple of the
        chunk size.
        """
        n_users = len(self.users_set)
        n_chunks = -(-n_users // self._user_batch_size)
        # Never report fewer than one: `Tensor.split` still returns a single
        # (empty) chunk for an empty tensor, which keeps the old value for n == 0.
        return max(1, n_chunks)

    def __iter__(self):
        for batch in self.users_set.split(self._user_batch_size):
            yield {"u_id": batch}


@dataclass
class MFModelConfig:
    """Hyper-parameters consumed by ``MF.__init__``."""

    # Number of rows in the user embedding table.
    n_users: int
    # Number of rows in the item embedding table.
    n_items: int
    # Width of both the user and the item embedding vectors.
    emb_size: int
    # Dropout probability applied to the looked-up user and item embeddings in `MF.forward`.
    dropout: float = 0.0


@dataclass
class TrainConfig:
    """Training and evaluation settings read by ``MFTrainer``."""

    # Not read by any code in this notebook.
    valid_size: float
    # Batch size (in dataset samples) of the train and validation loaders.
    batch_size: int
    # Print the running batch loss every this many training batches.
    train_print_every: int
    # `batch_size` of the evaluation DataLoader (number of user chunks merged per batch).
    eval_batch_size: int
    # Number of user ids per chunk yielded by `MFEvalDataset`.
    eval_user_batch_size: int
    # Negatives drawn per observed interaction in `MFDataset`.
    neg_sampl: int
    # Learning rate of the Adam optimizer.
    lr: float
    # Number of passes over the training loader in `MFTrainer.fit`.
    epochs: int


class MF(nn.Module):
    """Matrix-factorisation scorer built from a user and an item embedding table.

    ``forward`` returns one raw dot-product score per (user, item) pair in the
    batch; ``recommend`` scores the given users against every item and applies
    a sigmoid.
    """

    def __init__(self, config: MFModelConfig):
        super().__init__()
        emb_size = config.emb_size
        self.user_factors = nn.Embedding(config.n_users, emb_size)
        self.item_factors = nn.Embedding(config.n_items, emb_size)

        self.dropout = nn.Dropout(config.dropout)

        self.dot = DotProd()

    def forward(self, x):
        """Score the pairs ``(x["u_id"][j], x["i_id"][j])``; no sigmoid is applied."""
        users = self.user_factors(x["u_id"])
        items = self.item_factors(x["i_id"])
        return self.dot(self.dropout(users), self.dropout(items))

    @torch.no_grad()
    def recommend(self, x: dict[str, torch.Tensor]):
        """Return sigmoid scores of the users in ``x["u_id"]`` against all items.

        The embedding weights are read directly, so dropout is never applied here.
        """
        user_emb = self.user_factors.weight[x["u_id"]]
        scores = torch.matmul(user_emb, self.item_factors.weight.T)
        return scores.sigmoid()

In [6]:
# NOTE: `X_train` is rebound to the re-ranker feature matrix in a later cell;
# it is only an alias of the training ratings at this point.
X_train = ratings_train

# Split the *users* of the validation period 70/30: the first group is used to
# fit the re-ranker, the second to evaluate it. The notebook-wide SEED replaces
# the previously hard-coded `random_state=0` so every random split and model in
# the notebook is driven by the same constant.
train_users, test_users = train_test_split(ratings_validation['session_id'].unique(), test_size=0.3, random_state=SEED)

X_valid_train = ratings_validation[ratings_validation["session_id"].isin(train_users)]
X_valid_valid = ratings_validation[ratings_validation["session_id"].isin(test_users)]

print(f"Number of validation-training users: {len(train_users)}")
print(f"Number of validation-validation users: {len(test_users)}")

Number of validation-training users: 2456
Number of validation-validation users: 1053


In [7]:
class MFTrainer:
    """Build and train an ``MF`` model on (user, item) interactions.

    Construction creates, in this order: the datasets, their data loaders, the
    model, an Adam optimizer, a ``BCEWithLogitsLoss`` criterion and a ``StepLR``
    scheduler.

    Args:
        model_config: configuration passed to ``MF(...)``.
        train_config: object exposing the ``TrainConfig`` fields read here
            (``batch_size``, ``eval_batch_size``, ``eval_user_batch_size``,
            ``neg_sampl``, ``lr``, ``epochs``, ``train_print_every``).
        dataset: object exposing ``data`` (a mapping with ``"relations"`` as a
            ``(train, validation)`` pair plus ``"users"`` and ``"items"``) and
            ``namings``.
        device: device the model and every batch are moved to.
    """

    def __init__(self, model_config, train_config, dataset, device: "torch.device"):
        self.model_config = model_config
        self.train_config = train_config
        self.dataset = dataset
        self.device = device
        self.datasets = self._init_datasets()
        self.loaders = self._init_loaders()
        self.model = self._init_model()
        self.optimizer = self._init_optimizer()
        self.criterion = self._init_criterion()
        self.scheduler = self._init_scheduler()

    @property
    def _model_config(self) -> type:
        """Configuration class that matches the model built by this trainer."""
        return MFModelConfig

    def _init_model(self) -> nn.Module:
        """Instantiate ``MF`` from ``model_config`` and move it to ``device``."""
        return MF(self.model_config).to(self.device)

    def _init_optimizer(self) -> torch.optim.Optimizer:
        """Adam over all model parameters with the configured learning rate."""
        return torch.optim.Adam(self.model.parameters(), lr=self.train_config.lr)

    def _init_criterion(self) -> nn.Module:
        """Binary cross-entropy on raw scores (the model output has no sigmoid)."""
        return nn.BCEWithLogitsLoss()

    def _init_scheduler(self) -> Any:
        """Halve the learning rate every two scheduler steps (stepped once per epoch in ``fit``)."""
        return torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=2, gamma=0.5)

    def _init_datasets(self) -> dict[str, Dataset]:
        """Create the train / validation datasets and the user-chunk evaluation dataset."""
        X_train, X_valid = self.dataset.data["relations"]

        def build(relations) -> MFDataset:
            # Train and validation datasets differ only in their interactions.
            return MFDataset(
                relations=relations,
                users=self.dataset.data["users"],
                items=self.dataset.data["items"],
                namings=self.dataset.namings,
                neg_sampl=self.train_config.neg_sampl,
            )

        train_dataset = build(X_train)
        val_dataset = build(X_valid)
        eval_dataset = MFEvalDataset(
            base_dataset=val_dataset,
            user_batch_size=self.train_config.eval_user_batch_size,
        )

        return {"train": train_dataset, "val": val_dataset, "eval": eval_dataset}

    def _init_loaders(self) -> dict[str, DataLoader]:
        """Wrap the datasets in loaders; only the training loader shuffles."""
        train_loader = DataLoader(self.datasets["train"], batch_size=self.train_config.batch_size, shuffle=True, collate_fn=collate_fn)
        val_loader = DataLoader(self.datasets["val"], batch_size=self.train_config.batch_size, shuffle=False, collate_fn=collate_fn)
        eval_loader = DataLoader(
            self.datasets["eval"], batch_size=self.train_config.eval_batch_size, shuffle=False, collate_fn=eval_collate_fn, drop_last=False
        )

        return {"train": train_loader, "val": val_loader, "eval": eval_loader}

    # Called with parentheses, matching `MF.recommend`; the bare `@torch.no_grad`
    # form is only accepted by recent PyTorch releases.
    @torch.no_grad()
    def recommend_udf(self, batch: dict[str, torch.Tensor], model: MF, n_items: None | int) -> torch.Tensor:
        """Put ``model`` in eval mode and return ``model.recommend(batch)``.

        ``n_items`` is accepted for interface compatibility and is not used.
        """
        model.eval()
        return model.recommend(batch)

    def train(self, print_every: None | int = None) -> tuple[float, float]:
        """Run one training epoch.

        Args:
            print_every: print the batch loss every this many batches; ``None``
                (or 0) disables the per-batch printing.

        Returns:
            ``(mean batch loss, ROC AUC over all training predictions of the epoch)``.
        """
        self.model.train()
        loader = self.loaders["train"]
        n_batches = len(loader)
        train_loss = 0.0
        preds, ground_truths = [], []

        for batch_idx, batch in enumerate(loader):
            data = batch_dict_to_device(batch, self.device)

            self.optimizer.zero_grad()
            output = self.model(data)
            loss = self.criterion(output, data["target"])
            loss.backward()
            self.optimizer.step()

            loss_item = loss.item()

            if print_every and batch_idx % print_every == 0:
                percentage = 100.0 * batch_idx / n_batches
                print(f"Train (Batch): [{batch_idx}/{n_batches} ({percentage:.0f}%)] | Loss: {loss_item:.4f}")

            # Detach before storing: keeping the graph-attached output would hold
            # every batch's autograd graph in memory until the end of the epoch.
            preds.append(output.detach())
            ground_truths.append(data["target"])
            train_loss += loss_item

        train_loss /= n_batches

        pred = torch.cat(preds, dim=0).sigmoid().cpu().numpy()
        ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
        train_roc_auc = float(roc_auc_score(ground_truth, pred))

        print(f"\nTrain: Loss: {train_loss:.4f} | ROC AUC: {train_roc_auc:.4f}")

        return train_loss, train_roc_auc

    def test(self) -> tuple[float, float]:
        """Evaluate on the validation loader without gradient tracking.

        Returns:
            ``(mean batch loss, ROC AUC over all validation predictions)``.
        """
        self.model.eval()
        loader = self.loaders["val"]
        test_loss = 0.0
        preds, ground_truths = [], []

        with torch.no_grad():
            for batch in loader:
                data = batch_dict_to_device(batch, self.device)

                output = self.model(data)
                loss = self.criterion(output, data["target"])

                preds.append(output)
                ground_truths.append(data["target"])
                test_loss += loss.item()

        pred = torch.cat(preds, dim=0).sigmoid().cpu().numpy()
        ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
        test_roc_auc = float(roc_auc_score(ground_truth, pred))
        test_loss /= len(loader)

        print(f"Test: Loss: {test_loss:.4f} | ROC AUC: {test_roc_auc:.4f}")

        return test_loss, test_roc_auc

    def fit(self):
        """Train for ``train_config.epochs`` epochs, validating after each one.

        The learning-rate scheduler is advanced once per epoch, after the
        optimizer updates of that epoch. Previously the scheduler was created
        but never stepped, so the learning rate stayed constant.

        Returns:
            Dict of per-epoch lists: ``train_loss``, ``train_roc_auc``,
            ``test_loss`` and ``test_roc_auc``.
        """
        history = {"train_loss": [], "train_roc_auc": [], "test_loss": [], "test_roc_auc": []}
        for _ in tqdm(range(1, self.train_config.epochs + 1)):
            train_loss, train_roc_auc = self.train(print_every=self.train_config.train_print_every)
            test_loss, test_roc_auc = self.test()
            self.scheduler.step()

            history["train_loss"].append(train_loss)
            history["train_roc_auc"].append(train_roc_auc)
            history["test_loss"].append(test_loss)
            history["test_roc_auc"].append(test_roc_auc)

        return history

# Lightweight stand-in for a dataset object: MFTrainer only reads `.data` and
# `.namings` from it.
_relation_cols = ["session_id", "item_id"]
ds = SimpleNamespace(
    n_users=n_users,
    n_items=n_items,
    data={
        "relations": (ratings_train[_relation_cols], ratings_validation[_relation_cols]),
        "users": ratings["session_id"],
        "items": ratings["item_id"],
    },
    namings={"user": "session_id", "item": "item_id"},
)

# NOTE: `epochs=7` here, while the training output recorded below shows a
# 3-epoch progress bar, so that output comes from an earlier configuration.
_mf_model_config = MFModelConfig(n_users=n_users, n_items=n_items, emb_size=8)
_mf_train_config = TrainConfig(
    valid_size=0.2,
    batch_size=4096,
    train_print_every=1000,
    neg_sampl=1,
    lr=1e-3,
    epochs=7,
    eval_user_batch_size=1000,
    eval_batch_size=1,
)

trainer = MFTrainer(_mf_model_config, _mf_train_config, dataset=ds, device=torch.device("cpu"))

In [10]:
# Train for the configured number of epochs; the returned history dict becomes
# the cell output. The recorded run took roughly 4 minutes per epoch on CPU and
# was interrupted by hand during the third epoch.
trainer.fit()

  0%|          | 0/3 [00:00<?, ?it/s]


Train: Loss: 0.4139 | ROC AUC: 0.9071


 33%|███▎      | 1/3 [04:19<08:38, 259.06s/it]

Test: Loss: 0.9164 | ROC AUC: 0.5931

Train: Loss: 0.2604 | ROC AUC: 0.9607


 67%|██████▋   | 2/3 [08:37<04:18, 258.79s/it]

Test: Loss: 0.9484 | ROC AUC: 0.6092


 67%|██████▋   | 2/3 [09:52<04:56, 296.32s/it]


KeyboardInterrupt: 

In [11]:
# Persist the (partially) trained weights in a fresh run directory created by
# the project helper.
log_dir = create_log_dir(trainer.model)
save_model(trainer.model, log_dir / "weights.pth")

2025-02-16 13:40:50,489 - recsys.utils - INFO - Saving model <class '__main__.MF'> to .runs/MF/2025-02-16_13-40-50/weights.pth


In [10]:
# Reload weights from one specific, hard-coded earlier run, so the cells below
# can be executed without re-training.
# NOTE(review): `trainer.optimizer` / `trainer.scheduler` were built from the
# parameters of the model created in `MFTrainer.__init__` and are not rebuilt
# here; that is fine for inference, but they presumably do not track the
# reloaded model, so re-create them before training again (confirm against
# `load_model`).
trainer.model = load_model(MF, Path(".runs/MF/2025-02-16_13-40-50/weights.pth"), {"config": trainer.model_config})

2025-02-17 20:27:49,880 - recsys.utils - INFO - Loading model <class '__main__.MF'> from .runs/MF/2025-02-16_13-40-50/weights.pth


In [11]:
def recall_k_rel(rel, rel_sum, rel_mask) -> torch.Tensor:
    """Mean recall over the rows selected by ``rel_mask``.

    For each selected row, the entries of ``rel`` are summed along dim 1 and
    divided by the matching entry of ``rel_sum``; the mean of those ratios is
    returned.

    Assumes ``rel`` is a 2-D per-user relevance matrix over the top-k
    recommendations and ``rel_sum`` holds each user's total number of relevant
    items, as produced by ``recommendation_relevance`` (TODO confirm).
    """
    hits_per_user = rel[rel_mask].sum(dim=1)
    return (hits_per_user / rel_sum[rel_mask]).mean()


def precision_k_rel(rel, rel_sum, rel_mask) -> torch.Tensor:
    """Mean precision over the rows selected by ``rel_mask``.

    Averages ``rel`` along dim 1 for each selected row, then averages those row
    means. ``rel_sum`` is unused; it is accepted so the function can be called
    with the same argument tuple as ``recall_k_rel``.
    """
    per_user_precision = rel[rel_mask].mean(dim=1)
    return per_user_precision.mean()


In [12]:
# Inspect the complete validation-period ratings.
ratings_validation

Unnamed: 0,rating,timestamp,year_month,session_id,item_id
15513469,4.0,2019-09-28 07:22:42,2019-09,30269,15726
15519958,4.0,2019-09-30 03:37:54,2019-09,30269,22740
15522291,3.5,2019-10-01 05:01:17,2019-10,30269,21011
15534185,3.5,2019-10-07 04:50:03,2019-10,30269,46792
15540795,3.5,2019-10-10 04:28:11,2019-10,30269,41545
...,...,...,...,...,...
15630123,4.5,2019-11-21 09:10:06,2019-11,162413,9818
15630124,4.5,2019-11-21 09:10:45,2019-11,162413,19425
15630125,4.0,2019-11-21 09:11:19,2019-11,162413,31049
15630126,4.0,2019-11-21 09:12:13,2019-11,162413,45653


In [13]:
# Validation ratings of the users assigned to the re-ranker training group.
X_valid_train

Unnamed: 0,rating,timestamp,year_month,session_id,item_id
15513469,4.0,2019-09-28 07:22:42,2019-09,30269,15726
15519958,4.0,2019-09-30 03:37:54,2019-09,30269,22740
15522291,3.5,2019-10-01 05:01:17,2019-10,30269,21011
15534185,3.5,2019-10-07 04:50:03,2019-10,30269,46792
15540795,3.5,2019-10-10 04:28:11,2019-10,30269,41545
...,...,...,...,...,...
15630042,3.5,2019-11-21 05:29:25,2019-11,162412,9396
15630043,3.5,2019-11-21 05:30:37,2019-11,162412,20743
15630045,3.5,2019-11-21 05:33:35,2019-11,162412,38426
15630046,3.5,2019-11-21 05:34:22,2019-11,162412,4976


In [14]:
# Validation ratings of the users held out for evaluating the re-ranker.
X_valid_valid

Unnamed: 0,rating,timestamp,year_month,session_id,item_id
15498286,4.0,2019-09-22 18:58:00,2019-09,30362,38346
15498517,5.0,2019-09-22 20:01:58,2019-09,30362,35041
15517387,4.5,2019-09-29 15:08:52,2019-09,30362,3896
15563066,4.5,2019-10-20 15:42:19,2019-10,30362,41030
15520324,3.5,2019-09-30 08:47:38,2019-09,30441,42737
...,...,...,...,...,...
15630123,4.5,2019-11-21 09:10:06,2019-11,162413,9818
15630124,4.5,2019-11-21 09:10:45,2019-11,162413,19425
15630125,4.0,2019-11-21 09:11:19,2019-11,162413,31049
15630126,4.0,2019-11-21 09:12:13,2019-11,162413,45653


In [15]:
from recsys.evaluation.prediction import recommend_k
from functools import partial

# Training interactions as a 2-row tensor (one row per column of the training
# relations); handed to `recommend_k`, presumably so already-seen items can be
# excluded from the candidates (confirm against `recommend_k`).
past_interactions = trainer.loaders["train"].dataset.ground_truth
# Number of candidates retrieved per user by the MF model (stage 1).
N_candid = 300

# `recommend_udf` ignores `n_items`, so None is bound for it here.
recommendations = recommend_k(
    partial(trainer.recommend_udf, model=trainer.model, n_items=None),
    trainer.loaders["eval"],
    N_candid,
    device=torch.device("cpu"),
    past_interactions=past_interactions,
    n_users=n_users,
    n_items=n_items,
)

100%|██████████| 163/163 [00:07<00:00, 20.80it/s]


In [16]:
# One row of N_candid candidate item ids per user.
recommendations.shape

torch.Size([162414, 300])

In [17]:
# Filter each validation frame against the training ratings with the project
# helper `filter_set`. Judging by the outputs below, this drops rows and users
# (3509 validation users before, 2514 after); the exact rule lives in
# `recsys.data.utils`.
_filtered_valid_full = filter_set(ratings_validation, ratings_train, user_col="session_id", item_col="item_id")
_filtered_valid_train = filter_set(X_valid_train, ratings_train, user_col="session_id", item_col="item_id")
_filtered_valid_valid = filter_set(X_valid_valid, ratings_train, user_col="session_id", item_col="item_id")

In [18]:
# Filtered validation ratings, all validation users.
_filtered_valid_full

Unnamed: 0,rating,timestamp,year_month,session_id,item_id
15513469,4.0,2019-09-28 07:22:42,2019-09,30269,15726
15519958,4.0,2019-09-30 03:37:54,2019-09,30269,22740
15522291,3.5,2019-10-01 05:01:17,2019-10,30269,21011
15540795,3.5,2019-10-10 04:28:11,2019-10,30269,41545
15542745,4.0,2019-10-11 05:09:54,2019-10,30269,38630
...,...,...,...,...,...
15588119,4.0,2019-11-02 03:59:45,2019-11,161494,1314
15588120,4.5,2019-11-02 04:00:03,2019-11,161494,1062
15588122,4.5,2019-11-02 04:00:08,2019-11,161494,1122
15588124,4.0,2019-11-02 04:00:44,2019-11,161494,466


In [19]:
# Number of users left after filtering (all validation users).
_filtered_valid_full.session_id.nunique()

2514

In [20]:
# Filtered validation ratings, re-ranker training users.
_filtered_valid_train

Unnamed: 0,rating,timestamp,year_month,session_id,item_id
15513469,4.0,2019-09-28 07:22:42,2019-09,30269,15726
15519958,4.0,2019-09-30 03:37:54,2019-09,30269,22740
15522291,3.5,2019-10-01 05:01:17,2019-10,30269,21011
15540795,3.5,2019-10-10 04:28:11,2019-10,30269,41545
15542745,4.0,2019-10-11 05:09:54,2019-10,30269,38630
...,...,...,...,...,...
15588119,4.0,2019-11-02 03:59:45,2019-11,161494,1314
15588120,4.5,2019-11-02 04:00:03,2019-11,161494,1062
15588122,4.5,2019-11-02 04:00:08,2019-11,161494,1122
15588124,4.0,2019-11-02 04:00:44,2019-11,161494,466


In [21]:
# Number of re-ranker training users left after filtering.
_filtered_valid_train.session_id.nunique()

1759

In [22]:
# Filtered validation ratings, re-ranker evaluation users.
_filtered_valid_valid

Unnamed: 0,rating,timestamp,year_month,session_id,item_id
15498286,4.0,2019-09-22 18:58:00,2019-09,30362,38346
15498517,5.0,2019-09-22 20:01:58,2019-09,30362,35041
15517387,4.5,2019-09-29 15:08:52,2019-09,30362,3896
15563066,4.5,2019-10-20 15:42:19,2019-10,30362,41030
15520324,3.5,2019-09-30 08:47:38,2019-09,30441,42737
...,...,...,...,...,...
15556645,4.5,2019-10-16 20:55:30,2019-10,161478,2363
15499248,4.5,2019-09-23 00:22:06,2019-09,161489,6686
15499262,4.0,2019-09-23 00:27:17,2019-09,161489,167
15499263,4.0,2019-09-23 00:28:26,2019-09,161489,1102


In [23]:
# Number of re-ranker evaluation users left after filtering.
_filtered_valid_valid.session_id.nunique()

755

In [24]:
# Baseline: ranking quality of the raw MF candidates on all filtered validation users.
# Ground truth is a 2-row tensor (session ids, item ids); `users_idx_valid_full`
# lists the evaluated users and is also used to pick their rows of `recommendations`.
ground_truth_valid_full = torch.from_numpy(_filtered_valid_full[['session_id', 'item_id']].values).T.to(torch.int32)
users_idx_valid_full = torch.from_numpy(_filtered_valid_full['session_id'].unique()).to(torch.int32)

for k in [5, 10, 50, 100, 200, 300]:
    recommendations_k = recommendations[users_idx_valid_full, :k]
    rel_output = recommendation_relevance(recommendations_k, ground_truth_valid_full, users_idx=users_idx_valid_full, n_users=n_users, n_items=n_items)

    # Named `map_at_k` rather than `map` so the builtin `map` stays usable in later cells.
    map_at_k = map_k(recommendations_k, ground_truth_valid_full, k, users_idx_valid_full, n_users, n_items)
    prec = precision_k_rel(*rel_output)
    rec = recall_k_rel(*rel_output)

    print(f"MAP@{k}: {map_at_k:.6f} | Precision@{k}: {prec:.6f} | Recall@{k}: {rec:.6f}")

MAP@5: 0.034342 | Precision@5: 0.030072 | Recall@5: 0.012192
MAP@10: 0.030905 | Precision@10: 0.025935 | Recall@10: 0.020335
MAP@50: 0.023086 | Precision@50: 0.018234 | Recall@50: 0.067081
MAP@100: 0.019830 | Precision@100: 0.015203 | Recall@100: 0.104349
MAP@200: 0.016507 | Precision@200: 0.011716 | Recall@200: 0.153125
MAP@300: 0.014583 | Precision@300: 0.009886 | Recall@300: 0.187361


In [25]:
# Baseline: ranking quality of the raw MF candidates on the held-out
# (re-ranker evaluation) users only; this is the reference for the re-ranked
# metrics computed at the end of the notebook.
ground_truth_valid_valid = torch.from_numpy(_filtered_valid_valid[['session_id', 'item_id']].values).T.to(torch.int32)
users_idx_valid_valid = torch.from_numpy(_filtered_valid_valid['session_id'].unique()).to(torch.int32)

for k in [5, 10, 50, 100, 200, 300]:
    recommendations_k = recommendations[users_idx_valid_valid, :k]
    rel_output = recommendation_relevance(recommendations_k, ground_truth_valid_valid, users_idx=users_idx_valid_valid, n_users=n_users, n_items=n_items)

    # Named `map_at_k` rather than `map` so the builtin `map` stays usable in later cells.
    map_at_k = map_k(recommendations_k, ground_truth_valid_valid, k, users_idx_valid_valid, n_users, n_items)
    prec = precision_k_rel(*rel_output)
    rec = recall_k_rel(*rel_output)

    print(f"MAP@{k}: {map_at_k:.6f} | Precision@{k}: {prec:.6f} | Recall@{k}: {rec:.6f}")

MAP@5: 0.034274 | Precision@5: 0.030199 | Recall@5: 0.012502
MAP@10: 0.030265 | Precision@10: 0.025033 | Recall@10: 0.020436
MAP@50: 0.022435 | Precision@50: 0.017907 | Recall@50: 0.065057
MAP@100: 0.019333 | Precision@100: 0.015099 | Recall@100: 0.102046
MAP@200: 0.016192 | Precision@200: 0.011510 | Recall@200: 0.152373
MAP@300: 0.014326 | Precision@300: 0.009700 | Recall@300: 0.189482


In [26]:
# Item features: multi-hot genre indicators, one row per movie.
# Attach the internal id mapping to the movie table and drop the raw id and title.
movies2 = movies.merge(movie_id_map, on="movieId").drop(["movieId", "title"], axis=1)
# "A|B|C" -> one row per genre; the exploded series keeps the row label of its movie.
movies_genres_exploded = movies2["genres"].str.split("|").explode()
# genre -> integer code; not used by any later cell.
genres_map = {genre:i for i, genre in enumerate(movies_genres_exploded.unique())}
# One indicator column per genre, summed back to one row per movie (row label).
genres_one_hot = pd.get_dummies(movies_genres_exploded).astype(np.int8).groupby(level=0).sum()
# NOTE(review): the row label of the merged frame (its position, 0..n-1) is
# relabelled as `item_id` here, and `genres_one_hot` is later indexed
# positionally with `item_id`. This is only right if row i of the merge result
# is the movie with item_id i — confirm against how `movie_id_map` is ordered.
item_features = genres_one_hot.reset_index().rename(columns={'index': 'item_id'})

In [27]:
# Inspect the per-item genre indicator table.
item_features

Unnamed: 0,item_id,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,3,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,4,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47391,47391,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
47392,47392,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
47393,47393,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47394,47394,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [None]:
# User features, all derived from the training ratings only:
#   * mean rating, scaled to [0, 1]
#   * share of each genre among the user's rated items
#   * number of rated items
# Users without training ratings get the population average (0 for the count).
user_features = pd.DataFrame({"session_id": range(n_users)})

# rating
def minmax_scale(x, min_, max_):
    """Linearly rescale ``x`` so that ``min_`` maps to 0 and ``max_`` maps to 1."""
    return (x - min_) / (max_ - min_)

mean_ratings = minmax_scale(ratings_train.groupby("session_id")["rating"].mean(), 0, 5).reset_index(name="mean_rating")
average_mean_rating = mean_ratings["mean_rating"].mean()
user_features = user_features.merge(mean_ratings, on="session_id", how="left").fillna(average_mean_rating)

# category
def get_categories_agg(x):
    """Genre distribution of one user's rated items (the shares sum to 1).

    ``genres_one_hot`` is indexed positionally with the user's ``item_id``
    values, i.e. this assumes row i of ``genres_one_hot`` belongs to item i.
    """
    agg = genres_one_hot.iloc[x["item_id"]].sum(axis=0)
    return agg / agg.values.sum()

# Only `item_id` is handed to the aggregation: selecting it explicitly keeps the
# grouping column out of the frames passed to `apply`, which newer pandas
# versions warn about, and does not change the result.
user_categories_agg = ratings_train.groupby("session_id")[["item_id"]].apply(get_categories_agg).reset_index()
average_user_categories_agg = user_categories_agg.drop("session_id", axis=1).mean(axis=0)

# Genre column names; reused later to build the user x item genre cross feature.
categories_cols = user_categories_agg.drop("session_id", axis=1).columns

user_features = user_features.merge(user_categories_agg, on="session_id", how="left").fillna(average_user_categories_agg)

# n items
user_n_items = ratings_train.groupby("session_id").size().reset_index(name="n_items")
user_features = user_features.merge(user_n_items, on="session_id", how="left").fillna(0)


  user_categories_agg = ratings_train.groupby("session_id").apply(get_categories_agg).reset_index()


In [49]:
# Inspect the per-user feature table (one row per session_id in 0..n_users-1).
user_features

Unnamed: 0,session_id,mean_rating,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,n_items
0,0,0.883333,0.000000,0.048387,0.080645,0.080645,0.096774,0.129032,0.048387,0.032258,...,0.000000,0.000000,0.080645,0.000000,0.064516,0.048387,0.096774,0.016129,0.000000,24.0
1,1,0.874667,0.000000,0.073171,0.048780,0.012195,0.048780,0.195122,0.042683,0.012195,...,0.024390,0.006098,0.012195,0.024390,0.109756,0.036585,0.121951,0.006098,0.000000,75.0
2,2,0.875862,0.000000,0.063745,0.059761,0.019920,0.055777,0.183267,0.051793,0.015936,...,0.015936,0.003984,0.007968,0.019920,0.107570,0.035857,0.087649,0.019920,0.000000,116.0
3,3,0.850000,0.000000,0.083333,0.035714,0.000000,0.023810,0.178571,0.047619,0.011905,...,0.011905,0.000000,0.011905,0.035714,0.130952,0.035714,0.107143,0.011905,0.000000,40.0
4,4,0.900000,0.000000,0.071429,0.035714,0.011905,0.023810,0.190476,0.059524,0.035714,...,0.023810,0.000000,0.011905,0.011905,0.107143,0.023810,0.119048,0.000000,0.000000,40.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162409,162409,0.856976,0.000841,0.061305,0.053226,0.014152,0.030705,0.170023,0.041362,0.020966,...,0.038396,0.001535,0.020651,0.025649,0.092870,0.039975,0.077798,0.019673,0.006239,0.0
162410,162410,0.856976,0.000841,0.061305,0.053226,0.014152,0.030705,0.170023,0.041362,0.020966,...,0.038396,0.001535,0.020651,0.025649,0.092870,0.039975,0.077798,0.019673,0.006239,0.0
162411,162411,0.856976,0.000841,0.061305,0.053226,0.014152,0.030705,0.170023,0.041362,0.020966,...,0.038396,0.001535,0.020651,0.025649,0.092870,0.039975,0.077798,0.019673,0.006239,0.0
162412,162412,0.856976,0.000841,0.061305,0.053226,0.014152,0.030705,0.170023,0.041362,0.020966,...,0.038396,0.001535,0.020651,0.025649,0.092870,0.039975,0.077798,0.019673,0.006239,0.0


In [50]:
# Relevance of all N_candid candidates of every filtered validation user; `rel`
# becomes the binary label of the re-ranker training data built below.
# (Presumably rel[u, j] marks whether candidate j of user u is in the ground
# truth and rel_mask selects the users to keep — confirm against
# `recommendation_relevance`.)
rel, rel_sum, rel_mask = recommendation_relevance(recommendations[users_idx_valid_full], ground_truth_valid_full, users_idx=users_idx_valid_full, n_users=n_users, n_items=n_items)

In [51]:
# Broadcast each evaluated user id across its N_candid candidates:
# repeat -> (N_candid, n_eval_users), transpose -> (n_eval_users, N_candid),
# so row u holds the id of user u repeated N_candid times.
true_indices = users_idx_valid_full
true_indices_2d = true_indices.repeat(N_candid, 1).T

In [52]:
# Build (session_id, item_id, label) triples, one per (user, candidate) pair.
# The user-id grid is filtered with the same `rel_mask` as the candidates and
# the labels: previously only those two were masked, so `torch.stack` would fail
# with a shape mismatch as soon as the mask dropped a user. The result is
# unchanged when the mask keeps every user.
triples = torch.stack(
    [
        true_indices_2d[rel_mask],
        recommendations[users_idx_valid_full][rel_mask],
        rel[rel_mask],
    ],
    dim=-1,
)
triples_list = [tuple(triple) for triple in triples.reshape(-1, 3).tolist()]

In [53]:
# Long-format candidate table: one row per (user, candidate item) with its
# relevance label, all columns cast to int.
df = pd.DataFrame(triples_list, columns=["session_id", "item_id", "label"]).astype(int)

In [75]:
# Attach user features, then item features, to every (user, candidate) row.
# The user join key is spelled out instead of being inferred from the columns
# the two frames happen to share, so adding a column to either frame cannot
# silently change the join. The genre columns exist on both sides, so after the
# second merge the user shares carry the suffix `_x` and the item indicators
# the suffix `_y`.
df_features = df.merge(user_features, on="session_id").merge(item_features, on="item_id")

In [127]:
# cross features
a = df_features[[f"{i}_x" for i in categories_cols]].values
b = df_features[[f"{i}_y" for i in categories_cols]].values
df_features["item_categories_dot"] = (a * b).sum(axis=1)

In [129]:
# Split candidate rows by user group (same user split as for the validation
# ratings), so no user contributes rows to both sides.
train_df = df_features[df_features["session_id"].isin(train_users)]
test_df = df_features[df_features["session_id"].isin(test_users)]

In [131]:
# Every column except the label and the two id columns is a model feature.
feature_cols = [col for col in train_df.columns if col not in ['label', "session_id", "item_id"]]

# NOTE: this rebinds `X_train`, which an earlier cell set to the training
# ratings frame; from here on it is the re-ranker feature matrix.
X_train = train_df[feature_cols]
y_train = train_df["label"]

X_test = test_df[feature_cols]
y_test = test_df["label"]

In [132]:
import lightgbm as lgb
from sklearn.metrics import log_loss, accuracy_score, precision_score, recall_score

# Stage 2: a gradient-boosted binary classifier that scores each (user, candidate)
# row; the scores are used further down to re-order the MF candidates.
model = lgb.LGBMClassifier(objective='binary', random_state=SEED)
model.fit(X_train, y_train)


# Report classification metrics on both splits. Positives are only about 1% of
# the rows (see the LightGBM log below), so accuracy is dominated by the
# negative class; precision and recall at the 0.5 threshold are more telling.
# The loop variable is `split_name` rather than `type`, so the builtin `type`
# is not shadowed for the rest of the notebook.
for X, y, split_name in zip([X_train, X_test], [y_train, y_test], ["Train", "Test"]):
    y_pred_scores = model.predict_proba(X)[:, 1] # type: ignore
    loss = log_loss(y, y_pred_scores)
    y_pred = (y_pred_scores >= 0.5).astype(int)
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    print(f"{split_name}:  Log Loss: {loss:.6f} | Accuracy: {accuracy:.6f} | Precision: {precision:.6f} | Recall: {recall:.6f}")

[LightGBM] [Info] Number of positive: 5259, number of negative: 522441
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017330 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5905
[LightGBM] [Info] Number of data points in the train set: 527700, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009966 -> initscore=-4.598571
[LightGBM] [Info] Start training from score -4.598571
Train:  Log Loss: 0.039185 | Accuracy: 0.990119 | Precision: 0.643312 | Recall: 0.019205
Test:  Log Loss: 0.055474 | Accuracy: 0.989430 | Precision: 0.037559 | Recall: 0.003641


In [84]:
def reranker(model, X_test, df):
    """Re-order each user's candidates by the model's positive-class score.

    Args:
        model: fitted classifier exposing ``predict_proba``; column 1 of its
            output is used as the score.
        X_test: feature matrix whose rows line up positionally with ``df``.
        df: candidate table with at least ``session_id`` and ``item_id``
            columns; it is not modified.

    Returns:
        DataFrame with one row per ``session_id`` (ascending) and a
        ``candidates`` column holding that user's item ids as a list, highest
        score first.
    """
    scored = df.copy()
    scored["score"] = model.predict_proba(X_test)[:, 1]
    # One global stable sort followed by a plain list aggregation replaces the
    # per-group `groupby.apply(sort_values)`: groupby keeps the row order inside
    # each group, so every list comes out in descending score order. This avoids
    # the pandas warning about `apply` operating on the grouping column and
    # makes the order among equal scores deterministic.
    ranked = scored.sort_values("score", ascending=False, kind="stable")
    reranked = ranked.groupby("session_id")["item_id"].agg(list).reset_index(name="candidates")
    return reranked

# Re-ranked candidate lists as 2-D tensors, one row per user in ascending
# session_id order (the order produced by the groupby inside `reranker`).
# `torch.tensor` needs rectangular input, so this relies on every user having
# the same number of candidate rows.
reranked_recommendations_valid_train = torch.tensor(reranker(model, X_train, train_df)["candidates"].values.tolist())
reranked_recommendations_valid_valid = torch.tensor(reranker(model, X_test, test_df)["candidates"].values.tolist())

  reranked = df.groupby('session_id').apply(lambda x: x.sort_values(by='score', ascending=False)['item_id'].tolist()).reset_index(name='candidates')
  reranked = df.groupby('session_id').apply(lambda x: x.sort_values(by='score', ascending=False)['item_id'].tolist()).reset_index(name='candidates')


In [85]:
# Ranking quality of the re-ranked candidates on the users the re-ranker was
# fitted on (an optimistic, in-sample figure).
ground_truth_valid_train = torch.from_numpy(_filtered_valid_train[['session_id', 'item_id']].values).T.to(torch.int32)
# The rows of the re-ranked tensor follow ascending session_id, because
# `groupby("session_id")` sorts its keys. `unique()` returns ids in order of
# appearance instead, so they are sorted here to guarantee that the i-th user id
# belongs to the i-th row of recommendations. This is a no-op when the
# validation frame is already ordered by session_id.
users_id_valid_train = torch.from_numpy(_filtered_valid_train['session_id'].unique()).to(torch.int32).sort().values

for k in [5, 10, 50, 100, 200, 300]:
    recommendations_k = reranked_recommendations_valid_train[:, :k]
    rel_output = recommendation_relevance(recommendations_k, ground_truth_valid_train, users_idx=users_id_valid_train, n_users=n_users, n_items=n_items)

    # Named `map_at_k` rather than `map` so the builtin `map` is not shadowed.
    map_at_k = map_k(recommendations_k, ground_truth_valid_train, k, users_id_valid_train, n_users, n_items)
    prec = precision_k_rel(*rel_output)
    rec = recall_k_rel(*rel_output)

    print(f"MAP@{k}: {map_at_k:.6f} | Precision@{k}: {prec:.6f} | Recall@{k}: {rec:.6f}")

MAP@5: 0.098924 | Precision@5: 0.075156 | Recall@5: 0.033835
MAP@10: 0.079625 | Precision@10: 0.053155 | Recall@10: 0.045401
MAP@50: 0.041754 | Precision@50: 0.022672 | Recall@50: 0.079867
MAP@100: 0.030260 | Precision@100: 0.016435 | Recall@100: 0.108698
MAP@200: 0.022374 | Precision@200: 0.012840 | Recall@200: 0.169353
MAP@300: 0.018717 | Precision@300: 0.009966 | Recall@300: 0.186450


In [86]:
# Ranking quality of the re-ranked candidates on the held-out users; compare
# with the raw MF metrics for the same users computed earlier in the notebook.
ground_truth_valid_valid = torch.from_numpy(_filtered_valid_valid[['session_id', 'item_id']].values).T.to(torch.int32)
# The rows of the re-ranked tensor follow ascending session_id, because
# `groupby("session_id")` sorts its keys. `unique()` returns ids in order of
# appearance instead, so they are sorted here to guarantee that the i-th user id
# belongs to the i-th row of recommendations. This is a no-op when the
# validation frame is already ordered by session_id.
users_idx_valid_valid = torch.from_numpy(_filtered_valid_valid['session_id'].unique()).to(torch.int32).sort().values

for k in [5, 10, 50, 100, 200, 300]:
    recommendations_k = reranked_recommendations_valid_valid[:, :k]
    rel_output = recommendation_relevance(recommendations_k, ground_truth_valid_valid, users_idx=users_idx_valid_valid, n_users=n_users, n_items=n_items)

    # Named `map_at_k` rather than `map` so the builtin `map` is not shadowed.
    map_at_k = map_k(recommendations_k, ground_truth_valid_valid, k, users_idx_valid_valid, n_users, n_items)
    prec = precision_k_rel(*rel_output)
    rec = recall_k_rel(*rel_output)

    print(f"MAP@{k}: {map_at_k:.6f} | Precision@{k}: {prec:.6f} | Recall@{k}: {rec:.6f}")

MAP@5: 0.021660 | Precision@5: 0.019338 | Recall@5: 0.006549
MAP@10: 0.019836 | Precision@10: 0.017086 | Recall@10: 0.010573
MAP@50: 0.015635 | Precision@50: 0.012954 | Recall@50: 0.041066
MAP@100: 0.013637 | Precision@100: 0.010914 | Recall@100: 0.065907
MAP@200: 0.012318 | Precision@200: 0.010775 | Recall@200: 0.143658
MAP@300: 0.011624 | Precision@300: 0.009700 | Recall@300: 0.189482
