In [1]:
%load_ext autoreload
%autoreload 2

import IPython
from pathlib import Path
import os
# Resolve this notebook's file path and change the working directory to the
# folder three levels above it, so that the relative ".data/..." paths used
# by later cells resolve from there.
# NOTE(review): "__vsc_ipynb_file__" is presumably injected by the VS Code
# Jupyter extension; under another front end this lookup would raise KeyError.
# The namespace is bound to `caller_locals` (not `locals`) so the builtin
# locals() is not shadowed for the rest of the kernel session.
caller_locals = IPython.extract_module_locals()[1]  # type: ignore
notebook_name = caller_locals["__vsc_ipynb_file__"]
os.chdir(Path(notebook_name).parent.parent.parent)

In [20]:
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
import torch

In [3]:
# Input directories, given relative to the current working directory.
base_path = Path(".data/hm/base")
intermediate_path = Path(".data/hm/intermediate/cov1")

# CSV inputs read from the base directory.
sample_submission = pd.read_csv(base_path.joinpath("sample_submission.csv"))
relations = pd.read_csv(base_path.joinpath("transactions_train.csv"))

# Parquet inputs read from the intermediate directory: the train/validation
# relation tables and the two id-map tables.
article_id_map = pd.read_parquet(intermediate_path.joinpath("article_id_map.parquet"))
customer_id_map = pd.read_parquet(intermediate_path.joinpath("customer_id_map.parquet"))
relations_validation = pd.read_parquet(intermediate_path.joinpath("relations_validation.parquet"))
relations_train = pd.read_parquet(intermediate_path.joinpath("relations_train.parquet"))

In [4]:
# Number of distinct customers in the sample submission and of distinct
# articles in the transactions CSV; both are reused by later cells as the
# user/item dimensions.
n_users = sample_submission["customer_id"].nunique()
n_items = relations["article_id"].nunique()

# Print both counts together with the number of transaction rows.
print(n_users, n_items, len(relations))

1371980 104547 31788324


In [5]:
# Display the sample-submission frame (columns: customer_id, prediction).
sample_submission

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0706016001 0706016002 0372860001 0610776002 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0706016001 0706016002 0372860001 0610776002 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0706016001 0706016002 0372860001 0610776002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0706016001 0706016002 0372860001 0610776002 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0706016001 0706016002 0372860001 0610776002 07...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0706016001 0706016002 0372860001 0610776002 07...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0706016001 0706016002 0372860001 0610776002 07...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0706016001 0706016002 0372860001 0610776002 07...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0706016001 0706016002 0372860001 0610776002 07...


In [6]:
# Display the validation relations; its session_id and item_id columns are
# what the evaluation cells further down use as ground truth.
relations_validation

Unnamed: 0,t_dat,price,sales_channel_id,session_id,item_id
31691839,2020-09-20,0.013542,1,80,2145
31755458,2020-09-22,0.042356,2,86,85132
31723328,2020-09-21,0.033881,2,107,60282
31723329,2020-09-21,0.042356,2,107,102327
31723330,2020-09-21,0.050831,2,107,80800
...,...,...,...,...,...
31575037,2020-09-16,0.005068,2,1371879,93696
31575038,2020-09-16,0.016932,2,1371879,92067
31575039,2020-09-16,0.042356,1,1371937,79455
31575040,2020-09-16,0.016932,1,1371937,68989


In [15]:
N_candid = 50  # number of candidate items generated per user
RANDOM_SEED = 42  # fixed seed so the random baseline is identical on every run

# Random baseline: for each of the n_users rows draw N_candid item indices
# uniformly from [0, n_items). Draws are independent, so a row may contain
# the same item more than once.
# A locally seeded Generator is used instead of the global np.random state so
# that Restart & Run All reproduces the same candidates and metrics.
rng = np.random.default_rng(RANDOM_SEED)
random_candidates = rng.integers(low=0, high=n_items, size=(n_users, N_candid))

# One row per user index; `candidates` holds that user's list of item indices.
random_recommendations_df = pd.DataFrame({
    'session_id': range(n_users),
    'candidates': random_candidates.tolist()
})

In [18]:
# Count, for every item, the number of distinct sessions it appears in:
# duplicate (session_id, item_id) pairs are collapsed before counting.
# value_counts() returns the items ordered from most to least frequent.
unique_pairs = relations_train.loc[:, ["session_id", "item_id"]].drop_duplicates()
most_popular_items = unique_pairs["item_id"].value_counts()

# The same top-N_candid item list is recommended to every user index.
top_items = most_popular_items.index[:N_candid].tolist()
most_popular_recommendations_df = pd.DataFrame({
    'session_id': range(n_users),
    'candidates': [top_items] * n_users,
})

In [19]:
from retail_recommender_system.evaluation.metrics import map_k, precision_k, recall_k

In [21]:
# Validation interactions as a 2 x N tensor: row 0 holds the session_id
# values and row 1 the item_id values.
validation_pairs = relations_validation[["session_id", "item_id"]].to_numpy()
ground_truth = torch.from_numpy(validation_pairs).T

In [23]:
# Evaluate the random baseline against the validation ground truth.
# The per-user candidate lists are stacked into one 2-D tensor (one row per
# user) with a matching 1-D tensor of user indices.
recommendations_tensor = torch.from_numpy(np.array(random_recommendations_df['candidates'].tolist()))
users_idx = torch.from_numpy(random_recommendations_df['session_id'].values)

# Keyword arguments shared by all three metric calls.
metric_kwargs = dict(users_idx=users_idx, n_users=n_users, n_items=n_items)

for k in [12, 30, 50]:
    # The result is named `map_score`, not `map`, so the builtin map() is not
    # rebound to a float for every cell executed afterwards.
    map_score = map_k(recommendations_tensor, ground_truth, k=k, **metric_kwargs)
    prec = precision_k(recommendations_tensor, ground_truth, k=k, **metric_kwargs)
    rec = recall_k(recommendations_tensor, ground_truth, k=k, **metric_kwargs)

    print(f"MAP@{k}: {map_score:.6f} | Precision@{k}: {prec:.6f} | Recall@{k}: {rec:.6f}")

MAP@12: 0.000038 | Precision@12: 0.000033 | Recall@12: 0.000122
MAP@30: 0.000035 | Precision@30: 0.000036 | Recall@30: 0.000327
MAP@50: 0.000035 | Precision@50: 0.000037 | Recall@50: 0.000579


In [24]:
# Evaluate the most-popular baseline against the validation ground truth.
# The per-user candidate lists are stacked into one 2-D tensor (one row per
# user) with a matching 1-D tensor of user indices.
recommendations_tensor = torch.from_numpy(np.array(most_popular_recommendations_df['candidates'].tolist()))
users_idx = torch.from_numpy(most_popular_recommendations_df['session_id'].values)

# Keyword arguments shared by all three metric calls.
metric_kwargs = dict(users_idx=users_idx, n_users=n_users, n_items=n_items)

for k in [12, 30, 50]:
    # The result is named `map_score`, not `map`, so the builtin map() is not
    # rebound to a float for every cell executed afterwards.
    map_score = map_k(recommendations_tensor, ground_truth, k=k, **metric_kwargs)
    prec = precision_k(recommendations_tensor, ground_truth, k=k, **metric_kwargs)
    rec = recall_k(recommendations_tensor, ground_truth, k=k, **metric_kwargs)

    print(f"MAP@{k}: {map_score:.6f} | Precision@{k}: {prec:.6f} | Recall@{k}: {rec:.6f}")

MAP@12: 0.003145 | Precision@12: 0.002774 | Recall@12: 0.009219
MAP@30: 0.002683 | Precision@30: 0.002114 | Recall@30: 0.018270
MAP@50: 0.002352 | Precision@50: 0.001664 | Recall@50: 0.024320
