In [1]:
%load_ext autoreload
%autoreload 2

import IPython
from pathlib import Path
import os
# Make the working directory the notebook's grandparent directory so the relative
# ".data/..." paths used by later cells resolve.
# `__vsc_ipynb_file__` is read from the caller's namespace (element 1 of the tuple
# returned by extract_module_locals); a KeyError here means the kernel did not set it.
# The namespace is bound to `module_locals` rather than `locals` so the builtin
# `locals()` stays callable for the rest of the session.
module_locals = IPython.extract_module_locals()[1]  # type: ignore
notebook_name = module_locals["__vsc_ipynb_file__"]
os.chdir(Path(notebook_name).parent.parent)

In [46]:
from pathlib import Path

import numpy as np
import polars as pl
import torch
from torch.utils.data import DataLoader, TensorDataset

from retail_recommender_system.evaluation.prediction import recommend_k 
from retail_recommender_system.evaluation.metrics import precision_k, recall_k

In [5]:
# All paths are relative to the current working directory, which the first cell
# changes to the notebook's grandparent directory.
# NOTE(review): sample_submission is not referenced by any other cell visible here.
sample_submission = pl.read_csv(".data/base/sample_submission.csv")
# Lookup tables joined below on "customer_id" / "article_id" to obtain the
# "customer_id_map" / "article_id_map" columns.
customer_id_map = pl.read_parquet(".data/intermediate/customer_id_map.parquet")
article_id_map = pl.read_parquet(".data/intermediate/article_id_map.parquet")
# Interaction table; later cells read its "t_dat", "customer_id" and "article_id" columns.
relations = pl.read_parquet(".data/intermediate/relations.parquet")

In [6]:
# df_valid_ground_truth = relations \
#     .filter(
#         pl.col("t_dat").dt.year() == 2020, 
#         pl.col("t_dat").dt.month() == 9, 
#         pl.col("t_dat").dt.day() >= 15
#     ) \
#     .select("customer_id", "article_id") \
#     .group_by("customer_id") \
#     .agg(pl.col("article_id").cast(pl.String).alias("_prediction")) \
#     .with_columns(prediction=pl.col("_prediction").list.join(" ")) \
#     .select("customer_id", "prediction")
# Validation ground truth: interactions dated 2020-09-15 or later within September 2020,
# re-expressed through the two lookup tables so that only the "customer_id_map" and
# "article_id_map" columns remain (one row per interaction; duplicates are kept).
df_valid_ground_truth = (
    relations
    .filter(
        pl.col("t_dat").dt.year() == 2020,
        pl.col("t_dat").dt.month() == 9,
        pl.col("t_dat").dt.day() >= 15,
    )
    # Left joins keep every filtered interaction even if a lookup has no match.
    .join(customer_id_map, on="customer_id", how="left")
    .join(article_id_map, on="article_id", how="left")
    .select("customer_id_map", "article_id_map")
)

In [None]:
K = 12

# df_valid_ground_truth only carries "customer_id_map" / "article_id_map" and has one
# row per interaction, so take each validation customer exactly once; otherwise
# repeat buyers would receive a multiple of K recommendations after the group_by.
valid_customers = df_valid_ground_truth.select("customer_id_map").unique(maintain_order=True)

# Random baseline: K article indices per customer, drawn from [0, len(article_id_map)).
# NOTE(review): unseeded, so the baseline differs between runs; assumes article_id_map
# values are the contiguous range 0..len-1 -- confirm.
prediction_matrix = np.random.randint(low=0, high=len(article_id_map), size=(len(valid_customers), K))

matrix_df = valid_customers.with_columns(
    recommendations=pl.Series(prediction_matrix.astype(np.uint32))
)
matrix_df = (
    matrix_df
    # One row per (customer, recommended article index).
    .explode("recommendations")
    # Translate article indices back to the original article ids.
    .join(article_id_map, left_on="recommendations", right_on="article_id_map")
    .group_by("customer_id_map")
    .agg(pl.col("article_id").cast(pl.String).alias("mapped_recommendations"))
    # Space-separated article ids per customer.
    .with_columns(prediction=pl.col("mapped_recommendations").list.join(" "))
    # Translate customer indices back to the original customer ids instead of
    # merely relabelling the index column as "customer_id".
    .join(customer_id_map, on="customer_id_map", how="left")
    .select("customer_id", "prediction")
)

In [None]:
matrix_df

In [7]:
df_valid_ground_truth

customer_id_map,article_id_map
u32,u32
2,78503
90,97666
90,97667
173,80056
173,80056
…,…
1371691,104961
1371691,100629
1371721,104053
1371747,88521


In [8]:
# Number of distinct mapped ids in each lookup table; used below as the size of the
# user range and as the exclusive upper bound for random item ids.
n_cust = customer_id_map.get_column("customer_id_map").n_unique()
n_art = article_id_map.get_column("article_id_map").n_unique()

In [33]:
def collate_fn(batch):
    """Collate dataset samples into the dict format consumed by the recommend UDFs.

    Args:
        batch: sequence of samples; only the first element of each sample is used.

    Returns:
        ``{"u_id": tensor}`` holding the first element of every sample, in order.
    """
    user_ids = [sample[0] for sample in batch]
    return {"u_id": torch.tensor(user_ids)}

def recommend_udf_incrementing(batch: dict[str, torch.Tensor], K: int = 7) -> torch.Tensor:
    """Deterministic dummy recommender: every user gets the items ``K, K-1, ..., 1``.

    Args:
        batch: dict with a ``"u_id"`` tensor; only the size of its first dimension is used.
        K: number of items per user. Defaults to 7, the previously hard-coded value.

    Returns:
        Integer tensor of shape ``(len(batch["u_id"]), K)`` whose rows are all
        ``[K, K-1, ..., 1]``.
    """
    n_users = batch["u_id"].shape[0]
    # Repeating along a new leading dimension yields (n_users, K) directly and also
    # works for an empty batch, where reshape(-1, K) on zero elements would raise.
    return torch.arange(K, 0, -1).repeat(n_users, 1)

def recommend_udf_random(batch: dict[str, torch.Tensor], K=5) -> torch.Tensor:
    """Random baseline recommender.

    Args:
        batch: dict with a ``"u_id"`` tensor; only the size of its first dimension is used.
        K: number of items drawn per user.

    Returns:
        int32 tensor of shape ``(len(batch["u_id"]), K)`` with values drawn uniformly
        from ``[0, n_art)``, where ``n_art`` is the notebook-level item count.
    """
    batch_size = batch['u_id'].shape[0]
    random_items = torch.randint(0, n_art, (batch_size, K))
    return random_items.to(torch.int32)

# NOTE(review): this rebinds the notebook-level K (an earlier cell sets K = 12);
# cells executed after this one see K == 5.
K = 5
# One sample per user index 0..n_cust-1; collate_fn packs each batch into {"u_id": tensor}.
loader = DataLoader(
    TensorDataset(torch.arange(0, n_cust)), 
    batch_size=4096*8, 
    collate_fn=collate_fn
)
# Validation pairs as an int32 tensor of shape (2, N): after the transpose, row 0 is
# customer_id_map and row 1 is article_id_map (the column order of df_valid_ground_truth).
# NOTE(review): despite its name this holds the validation ground truth and is what the
# metric cells below score against -- consider renaming.
past_interactions = torch.from_numpy(df_valid_ground_truth.to_numpy()).to(torch.int32).T
# past_interactions=None is passed explicitly; presumably this disables any filtering of
# already-seen items inside recommend_k -- confirm against its implementation.
recommendations = recommend_k(recommend_udf_random, loader, K, past_interactions=None)

In [47]:
precision_k(recommendations, past_interactions, k=K, n_items=n_art)

tensor(7.9490e-06)

In [49]:
recall_k(recommendations, past_interactions, k=K, n_items=n_art)

tensor(8.8322e-06)

In [54]:
from torch.utils.data import IterableDataset

class EvalDataset(IterableDataset):
    """Iterable dataset yielding every (user, item) index pair, one user batch at a time.

    Each yielded element is an integer tensor of shape
    ``(users_in_batch * n_items, 2)``: column 0 is the user index, column 1 the item
    index. Users are visited in ascending order in chunks of ``user_batch_size``; the
    last chunk may be smaller.

    Args:
        n_users: total number of users (indices ``0..n_users-1``).
        n_items: total number of items (indices ``0..n_items-1``).
        user_batch_size: number of users covered by each yielded tensor.
    """

    def __init__(self, n_users, n_items, user_batch_size):
        super().__init__()
        self.n_users = n_users
        self.n_items = n_items

        self.user_batch_size = user_batch_size

    def get_batch_data(self, batch):
        """Build the (user, item) pairs for the user chunk starting at index ``batch``."""
        # Clamp the upper bound so the final chunk does not run past the last user.
        u_min, u_max = batch, min(batch + self.user_batch_size, self.n_users)
        # Each user index repeated n_items times: u0, u0, ..., u1, u1, ...
        u_id = torch.repeat_interleave(torch.arange(u_min, u_max), self.n_items)
        # The full item range tiled once per user: 0..n_items-1, 0..n_items-1, ...
        i_id = torch.arange(self.n_items).repeat(u_max - u_min)

        return torch.column_stack((u_id, i_id))

    def __len__(self):
        """Number of batches ``__iter__`` yields, i.e. ceil(n_users / user_batch_size)."""
        # Ceiling division: a plain `// ... + 1` over-counts by one whenever n_users
        # is an exact multiple of user_batch_size.
        return (self.n_users + self.user_batch_size - 1) // self.user_batch_size

    def __iter__(self):
        for batch in range(0, self.n_users, self.user_batch_size):
            yield self.get_batch_data(batch)

In [60]:
it = iter(EvalDataset(5, 3, 1))

In [61]:
it.__next__()

tensor([[0, 0],
        [0, 1],
        [0, 2]])

In [62]:
it.__next__()

tensor([[1, 0],
        [1, 1],
        [1, 2]])

In [63]:
it.__next__()

tensor([[2, 0],
        [2, 1],
        [2, 2]])

In [64]:
it.__next__()

tensor([[3, 0],
        [3, 1],
        [3, 2]])

In [65]:
it.__next__()

tensor([[4, 0],
        [4, 1],
        [4, 2]])

In [66]:
it.__next__()

StopIteration: 

In [50]:
torch.repeat_interleave(torch.arange(0, 10), 5)

tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4,
        4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9,
        9, 9])

In [53]:
torch.arange(10).repeat(5)

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3,
        4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7,
        8, 9])