In [11]:
%load_ext autoreload
%autoreload 2

import IPython
from pathlib import Path
import os
# Look up this notebook's path in the kernel namespace and switch the working
# directory to the folder above the notebook's own folder, so the relative
# `.data/` and `.models/` paths used further down resolve.
# NOTE(review): `__vsc_ipynb_file__` appears to be injected by the VS Code Jupyter
# extension; under another front end this lookup will raise KeyError.
# The namespace is kept under a private name so the builtin `locals` is not shadowed.
_module_locals = IPython.extract_module_locals()[1]  # type: ignore
notebook_name = _module_locals["__vsc_ipynb_file__"]
os.chdir(Path(notebook_name).parent.parent)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
from typing import Callable, Any
from pathlib import Path

import numpy as np
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from retail_recommender_system.utils import load_model

# One seed for every RNG this notebook draws from: NumPy, the stdlib `random`
# module, and PyTorch (weight init, shuffling, negative sampling).
seed = 0
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x1327dea90>

In [13]:
# Load the intermediate parquet tables. The paths are relative, so they resolve
# against the working directory chosen in the first cell.
# `relations` provides the `customer_id_map` / `article_id_map` columns that the
# dataset, the embedding sizes and the evaluation cells below rely on.
relations = pl.read_parquet(".data/intermediate/relations.parquet")
users = pl.read_parquet(".data/intermediate/users.parquet")
items = pl.read_parquet(".data/intermediate/articles.parquet")

In [14]:
# Hold out 20% of the interactions for validation. Only the frame itself is split:
# no placeholder label array is allocated, and IPython's `_` (last cell output) is
# no longer overwritten by throwaway return values. The shuffle depends only on
# the row count and `random_state`, so the same rows are selected as before.
X_train, X_valid = train_test_split(relations, test_size=0.2, random_state=seed)

In [15]:
class MFDataset(Dataset):
    """Implicit-feedback dataset: one observed pair plus random negatives per sample.

    Every sample contains ``neg_sampl + 1`` aligned entries. The first is the
    observed (user, item) interaction with target 1.0; the rest pair the same
    user with items drawn uniformly from ``[0, n_items)`` and target 0.0.
    Negatives are not checked against the user's real interactions, so a
    sampled "negative" can occasionally be an item the user did interact with.

    Args:
        df: Interaction frame exposing ``customer_id_map`` and
            ``article_id_map`` columns of integer ids.
        n_items: Size of the item id space that negatives are drawn from.
        neg_sampl: Number of negatives generated for each positive.

    Raises:
        ValueError: If ``n_items`` is not positive or ``neg_sampl`` is negative.
    """

    def __init__(self, df: "pl.DataFrame", n_items: int, neg_sampl: int = 5):
        if n_items <= 0:
            raise ValueError(f"n_items must be positive, got {n_items}")
        if neg_sampl < 0:
            raise ValueError(f"neg_sampl must be non-negative, got {neg_sampl}")

        self._df = df
        self._n_items = n_items
        self._neg_sampl = neg_sampl

        # Materialise both id columns once. Slicing a one-row frame and building
        # tensors from it inside __getitem__ costs far more than the lookup itself
        # and is paid for every single sample of every epoch.
        self._users = torch.tensor(df.get_column("customer_id_map").to_numpy(), dtype=torch.int32)
        self._items = torch.tensor(df.get_column("article_id_map").to_numpy(), dtype=torch.int32)
        # The target layout is the same for every sample: one positive, then negatives.
        self._target = torch.tensor([1.0] + [0.0] * neg_sampl, dtype=torch.float)

    def __len__(self):
        """Number of observed interactions (one sample per interaction)."""
        return self._users.shape[0]

    def __getitem__(self, idx: int) -> dict[str, Any]:
        """Return ``u_id``, ``i_id`` and ``target`` tensors of length ``neg_sampl + 1``."""
        user = self._users[idx].reshape(1)
        item = self._items[idx].reshape(1)

        return {
            "u_id": user.repeat(self._neg_sampl + 1),
            "i_id": torch.cat([item, self._approx_neg_sampl()]),
            # Cloned so that callers mutating a sample cannot corrupt the template.
            "target": self._target.clone(),
        }

    def _approx_neg_sampl(self):
        """Draw ``neg_sampl`` item ids uniformly at random, without excluding positives."""
        return torch.randint(low=0, high=self._n_items, size=(self._neg_sampl,), dtype=torch.int32)
    
def _collate_fn(batch):
    u_id = torch.cat([x["u_id"] for x in batch])
    i_id = torch.cat([x["i_id"] for x in batch])
    target = torch.cat([x["target"] for x in batch])
    return {
        "u_id": u_id,
        "i_id": i_id,
        "target": target
    }

In [16]:
class MF(nn.Module):
    """Matrix factorisation scorer.

    The score of a (user, item) pair is the dot product of the two learned
    embedding vectors. ``forward`` returns the raw scores, one per pair.
    """

    def __init__(self, n_users, n_items, emb_size):
        super().__init__()
        # Users are registered before items, which fixes the state_dict key order.
        self.user_factors = nn.Embedding(num_embeddings=n_users, embedding_dim=emb_size)
        self.item_factors = nn.Embedding(num_embeddings=n_items, embedding_dim=emb_size)

    def forward(self, x):
        """Score each aligned pair of ``x["u_id"]`` and ``x["i_id"]``."""
        user_vecs = self.user_factors(x["u_id"])
        item_vecs = self.item_factors(x["i_id"])
        pairwise = user_vecs * item_vecs
        return pairwise.sum(dim=1)

In [17]:
# Hyper-parameters
batch_size = 4096
neg_sampl = 3
embedding_size = 16
dropout_rate = 0.3  # not consumed by the MF model defined above
lr = 1e-4
device = torch.device("cpu")

# Ids are used directly as embedding row indices, so each table needs max id + 1 rows.
n_users = relations.get_column("customer_id_map").max() + 1
n_items = relations.get_column("article_id_map").max() + 1

# Datasets and loaders. Only the training loader shuffles.
train_dataset = MFDataset(X_train, n_items=n_items, neg_sampl=neg_sampl)
val_dataset = MFDataset(X_valid, n_items=n_items, neg_sampl=neg_sampl)

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=_collate_fn,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=_collate_fn,
)

# Model, loss on raw scores (logits), and optimiser.
model = MF(n_users, n_items, emb_size=embedding_size).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

In [18]:
def _batch_dict_to_device(batch: dict[str, Any], device: torch.device) -> dict[str, Any]:
    return {k: v.to(device) for k, v in batch.items()}

def train(
    model: nn.Module, 
    loss_fn: Callable,
    optimizer: torch.optim.Optimizer,
    train_loader: DataLoader,
    device: torch.device, 
    epoch: int, 
    print_every: None | int = None
) -> float:
    """Run one optimisation pass over ``train_loader`` and return the mean batch loss.

    Args:
        model: Module called with the whole batch dict.
        loss_fn: Callable taking ``(output, target)`` and returning a scalar tensor.
        optimizer: Optimiser stepped once per batch.
        train_loader: Yields dicts of tensors that include a ``"target"`` entry.
        device: Device every batch tensor is moved to before the forward pass.
        epoch: Epoch number, used only to label the progress lines.
        print_every: Log every this many batches; ``None`` or ``0`` disables logging.

    Returns:
        The per-batch losses averaged over the batches seen in this pass.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    n_batches = 0

    for batch_idx, batch in enumerate(train_loader):
        data = _batch_dict_to_device(batch, device)

        optimizer.zero_grad()
        output = model(data)
        loss = loss_fn(output, data["target"])
        loss.backward()
        optimizer.step()

        # .item() already detaches and copies the scalar to the host.
        loss_item = loss.item()
        total_loss += loss_item
        n_batches += 1

        # The truthiness check treats 0 like None, so it cannot cause a modulo-by-zero.
        if print_every and batch_idx % print_every == 0:
            print(
                "Train (Epoch {}): [{}/{} ({:.0f}%)]\tTrain Loss: {:.4f}".format(
                    epoch, batch_idx, len(train_loader), 100.0 * batch_idx / len(train_loader), loss_item
                ) # type: ignore
            )

    if n_batches == 0:
        raise ValueError("train_loader yielded no batches; cannot compute a mean training loss")

    return total_loss / n_batches


def test(
    model: nn.Module, 
    loss_fn: Callable,
    device: torch.device,
    test_loader: DataLoader, 
    print_every: None | int = None
) -> float:
    """Evaluate ``model`` on ``test_loader`` without gradients and return the mean batch loss.

    Args:
        model: Module called with the whole batch dict; switched to eval mode.
        loss_fn: Callable taking ``(output, target)`` and returning a scalar tensor.
        device: Device every batch tensor is moved to before the forward pass.
        test_loader: Yields dicts of tensors that include a ``"target"`` entry.
        print_every: Acts as an on/off flag here: any value other than ``None``
            prints the final mean loss once.

    Returns:
        The per-batch losses averaged over the batches seen.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    model.eval()
    total_loss = 0.0
    n_batches = 0

    with torch.no_grad():
        for batch in test_loader:
            data = _batch_dict_to_device(batch, device)

            output = model(data)
            loss = loss_fn(output, data["target"])

            # .item() already detaches and copies the scalar to the host.
            total_loss += loss.item()
            n_batches += 1

    if n_batches == 0:
        raise ValueError("test_loader yielded no batches; cannot compute a mean test loss")

    test_loss = total_loss / n_batches

    if print_every is not None:
        print(
            "\nTest: Test loss: {:.4f}".format(test_loss) # type: ignore
        )

    return test_loss

In [19]:
# Per-epoch metrics, filled in by the loop below and read by the plotting cell.
history = {
    'train_loss': [],
    'val_loss': []
}

EPOCHS = 15

# torch.save does not create missing directories, so make sure the target exists.
model_dir = Path(".models")
model_dir.mkdir(parents=True, exist_ok=True)

for epoch in tqdm(range(1, EPOCHS + 1)):
    train_loss = train(model, criterion, optimizer, train_loader, device, epoch, print_every=100)
    val_loss = test(model, criterion, device, val_loader, print_every=1)

    # Record after every epoch so an interrupted run still keeps its partial curves.
    history['train_loss'].append(train_loss)
    history['val_loss'].append(val_loss)

    torch.save(model.state_dict(), model_dir / f"mf_{epoch}.pth")

  0%|          | 0/15 [00:00<?, ?it/s]



  7%|▋         | 1/15 [45:43<10:40:11, 2743.71s/it]


Test: Test loss: 1.6580


 13%|█▎        | 2/15 [2:24:52<16:42:59, 4629.23s/it]


Test: Test loss: 1.5888


 20%|██        | 3/15 [2:40:58<9:51:14, 2956.18s/it] 


Test: Test loss: 1.5216


 20%|██        | 3/15 [3:21:21<13:25:27, 4027.26s/it]


KeyboardInterrupt: 

In [None]:
# Draw one panel per metric that `history` actually contains. `history` is created
# with only the loss keys, so the RMSE panel is added only when both RMSE series
# exist instead of failing with a KeyError.
panels = [("Loss", "train_loss", "val_loss")]
if "train_rmse" in history and "val_rmse" in history:
    panels.append(("RMSE", "train_rmse", "val_rmse"))

# squeeze=False keeps `ax` two-dimensional even when there is a single panel.
fig, ax = plt.subplots(1, len(panels), figsize=(6 * len(panels), 5), squeeze=False)

for axis, (metric, train_key, val_key) in zip(ax[0], panels):
    axis.plot(history[train_key], label=f"Train {metric}")
    axis.plot(history[val_key], label=f"Validation {metric}")
    axis.set_xlabel('Epochs')
    axis.set_ylabel(metric)
    axis.set_title(f"Training and Validation {metric} Over Time")
    # A single plt.legend() would only annotate the last axes, so call it per panel.
    axis.legend()

plt.show()

In [None]:
from functools import partial
from torch.utils.data import DataLoader, TensorDataset
from retail_recommender_system.evaluation.prediction import recommend_k
from retail_recommender_system.evaluation.metrics import precision_k, recall_k
from retail_recommender_system.evaluation.evaluation import EvalDataset

def collate_fn(batch):
    """Concatenate 2-column id tensors and split them into user and item ids.

    Column 0 of the concatenated rows becomes ``u_id`` and column 1 becomes ``i_id``.
    """
    pairs = torch.cat(batch)
    user_ids, item_ids = pairs[:, 0], pairs[:, 1]
    return {"u_id": user_ids, "i_id": item_ids}

# Called form of the decorator: the bare `@torch.no_grad` spelling is only accepted
# by PyTorch releases that support it, while `@torch.no_grad()` works on all of them.
@torch.no_grad()
def recommend_udf(model: nn.Module, batch: dict[str, torch.Tensor], K=5) -> torch.Tensor:
    """Score a batch of (user, item) pairs and lay the scores out one row per user.

    Args:
        model: Scoring module; switched to eval mode as a side effect.
        batch: Dict of id tensors passed straight to ``model``.
        K: Accepted for interface compatibility; not used inside this function.

    Returns:
        The model output reshaped to ``(-1, n_items)``, where ``n_items`` is read
        from the notebook's globals.
    """
    model.eval()
    scores = model(batch)
    # NOTE(review): assumes the batch lists all n_items items for each user, in
    # user-major order — confirm against EvalDataset.
    return scores.view(-1, n_items)

K = 5
n_users_eval = 20 * 1024

# batch_size=1 hands collate_fn a one-element list, so each loader step carries
# exactly one dataset element.
# NOTE(review): presumably each EvalDataset element holds the (user, item) pairs
# for a block of 1024 users — confirm against EvalDataset.
eval_dataset = EvalDataset(n_users_eval, n_items, user_batch_size=1024)
loader = DataLoader(
    eval_dataset,
    batch_size=1,
    shuffle=False,
    collate_fn=collate_fn,
)

# Bind the model and K up front so recommend_k only has to supply the batch.
score_fn = partial(recommend_udf, model=model, K=K)
recommendations = recommend_k(score_fn, loader, K, past_interactions=None)

 95%|█████████▌| 20/21 [00:56<00:02,  2.83s/it]


In [35]:
# Validation interactions of the users with id below `n_users_eval`, as a
# (2, n_pairs) int32 tensor: row 0 holds user ids, row 1 holds item ids.
valid_pairs = (
    X_valid
    .filter(pl.col("customer_id_map") < n_users_eval)
    .select("customer_id_map", "article_id_map")
    .to_numpy()
)
ground_truth = torch.from_numpy(valid_pairs).to(dtype=torch.int32).T

In [37]:
# Precision@K of the top-K recommendations against the held-out validation pairs
# in `ground_truth` (users with id below `n_users_eval` only).
# NOTE(review): the exact averaging is defined in retail_recommender_system.evaluation.metrics — confirm there.
precision_k(recommendations, ground_truth, k=K, n_items=n_items)

tensor(9.1271e-05)

In [38]:
# Recall@K of the top-K recommendations against the held-out validation pairs
# in `ground_truth` (users with id below `n_users_eval` only).
# NOTE(review): the exact averaging is defined in retail_recommender_system.evaluation.metrics — confirm there.
recall_k(recommendations, ground_truth, k=K, n_items=n_items)

tensor(4.2958e-05)