In [1]:
%load_ext autoreload
%autoreload 2

import IPython
from pathlib import Path
import os
locals = IPython.extract_module_locals() # type: ignore
notebook_name = "/".join(locals[1]["__vsc_ipynb_file__"].split("/"))
os.chdir(Path(notebook_name).parent.parent.parent)

In [2]:
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
base_path = Path(".data/hm/base")
relations = pd.read_csv(base_path / "transactions_train.csv")
sample_submission = pd.read_csv(base_path / "sample_submission.csv")

In [4]:
n_users = sample_submission.customer_id.nunique()
n_items = relations.article_id.nunique()

print(n_users, n_items, relations.shape[0])

1371980 104547 31788324


In [5]:
customer_id_map = pd.DataFrame(
    {
        "customer_id": sample_submission.customer_id.unique(), 
        "session_id": range(n_users)
    }
)
article_id_map = pd.DataFrame(
    {
        "article_id": relations.article_id.unique(), 
        "item_id": range(n_items)
    }
)

In [6]:
relations_orig = relations.copy()

In [7]:
relations = relations.merge(customer_id_map, on="customer_id").merge(article_id_map, on="article_id")

In [8]:
relations = relations.drop(columns=["customer_id", "article_id", "price", "sales_channel_id"])

In [9]:
relations['t_dat'] = pd.to_datetime(relations['t_dat'])

In [10]:
relations = relations.sort_values(by=["session_id", "t_dat"], ascending=[True, True])

In [11]:
relations['t_dat'].min(), relations['t_dat'].max()

(Timestamp('2018-09-20 00:00:00'), Timestamp('2020-09-22 00:00:00'))

In [12]:
validation_split_date = relations['t_dat'].max() - pd.Timedelta(days=7)
relations_train = relations[relations['t_dat'] <= validation_split_date]
relations_validation = relations[relations['t_dat'] > validation_split_date]

In [13]:
relations_validation

Unnamed: 0,t_dat,session_id,item_id
31691839,2020-09-20,80,2145
31755458,2020-09-22,86,85132
31723328,2020-09-21,107,60282
31723329,2020-09-21,107,102327
31723330,2020-09-21,107,80800
...,...,...,...
31575037,2020-09-16,1371879,93696
31575038,2020-09-16,1371879,92067
31575039,2020-09-16,1371937,79455
31575040,2020-09-16,1371937,68989


In [None]:
relations_train[relations_train.session_id.isin(np.random.randint(0, n_users, 10))].groupby("session_id").apply(display)

In [None]:
relations_train['session_id'].value_counts().head(200)

In [14]:
relations_train['n'] = relations_train.sort_values(by=["session_id", "t_dat"], ascending=[True, False]).groupby('session_id').cumcount()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relations_train['n'] = relations_train.sort_values(by=["session_id", "t_dat"], ascending=[True, False]).groupby('session_id').cumcount()


In [15]:
#df = relations_train[relations_train.session_id.isin([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])]
#df = relations_train[relations_train['t_dat'] > "2020-06-01"]
df = relations_train

In [16]:
df

Unnamed: 0,t_dat,session_id,item_id,n
4212358,2018-12-27,0,10895,18
4212359,2018-12-27,0,12746,19
4212360,2018-12-27,0,5938,20
9663224,2019-05-02,0,50328,17
10754876,2019-05-25,0,865,15
...,...,...,...,...
24375394,2020-04-09,1371978,84419,2
24375395,2020-04-09,1371978,82129,3
25077914,2020-04-25,1371978,84419,1
27806865,2020-06-22,1371978,93746,0


In [17]:
df_m = df.merge(df, on=['session_id', "t_dat"])[['t_dat', 'session_id', "item_id_x", "item_id_y"]]

In [18]:
df_m

Unnamed: 0,t_dat,session_id,item_id_x,item_id_y
0,2018-12-27,0,10895,10895
1,2018-12-27,0,10895,12746
2,2018-12-27,0,10895,5938
3,2018-12-27,0,12746,10895
4,2018-12-27,0,12746,12746
...,...,...,...,...
228758996,2020-04-09,1371978,82129,84419
228758997,2020-04-09,1371978,82129,82129
228758998,2020-04-25,1371978,84419,84419
228758999,2020-06-22,1371978,93746,93746


In [19]:
df_m = df_m.drop_duplicates()

In [20]:
df_m = df_m[df_m.item_id_x != df_m.item_id_y]

In [21]:
df_m['w'] = 1

In [22]:
df_m

Unnamed: 0,t_dat,session_id,item_id_x,item_id_y,w
1,2018-12-27,0,10895,12746,1
2,2018-12-27,0,10895,5938,1
3,2018-12-27,0,12746,10895,1
5,2018-12-27,0,12746,5938,1
6,2018-12-27,0,5938,10895,1
...,...,...,...,...,...
228758990,2020-04-05,1371978,85587,78163,1
228758991,2020-04-05,1371978,78163,82625,1
228758992,2020-04-05,1371978,78163,85587,1
228758995,2020-04-09,1371978,84419,82129,1


In [23]:
covisited = df_m.groupby(["item_id_x", "item_id_y"])['w'].sum().reset_index().sort_values(["item_id_x", "w"], ascending=[True, False])

In [24]:
covisited = covisited.rename(columns={"item_id_x": "item_id", "item_id_y": "candidate"})

In [25]:
covisited

Unnamed: 0,item_id,candidate,w
564,0,4907,80
607,0,5631,79
485,0,3998,35
614,0,5728,21
563,0,4906,19
...,...,...,...
84226907,103876,103074,1
84226908,103876,103229,1
84226909,103876,103649,1
84226910,103877,96209,1


In [26]:
covisited['rank'] = covisited.groupby("item_id")['candidate'].cumcount()

In [27]:
covisited

Unnamed: 0,item_id,candidate,w,rank
564,0,4907,80,0
607,0,5631,79,1
485,0,3998,35,2
614,0,5728,21,3
563,0,4906,19,4
...,...,...,...,...
84226907,103876,103074,1,8
84226908,103876,103229,1,9
84226909,103876,103649,1,10
84226910,103877,96209,1,0


In [28]:
covisited_candidates = covisited[covisited['rank'] < 10].drop(columns=["rank"])
covisited_candidates = covisited_candidates.astype({"item_id": np.int32, "candidate": np.int32})#.drop(columns=["w"])

In [29]:
covisited_candidates

Unnamed: 0,item_id,candidate,w
564,0,4907,80
607,0,5631,79
485,0,3998,35
614,0,5728,21
563,0,4906,19
...,...,...,...
84226906,103876,100709,1
84226907,103876,103074,1
84226908,103876,103229,1
84226910,103877,96209,1


In [30]:
covisited_candidates.groupby("item_id")['candidate'].apply(list).reset_index()

Unnamed: 0,item_id,candidate
0,0,"[4907, 5631, 3998, 5728, 4906, 4048, 26562, 38..."
1,1,"[703, 659, 10315, 5279, 918, 2459, 4465, 3528,..."
2,2,"[6, 1656, 3, 438, 531, 550, 837, 1080, 1177, 1..."
3,3,"[5, 4, 20, 732, 713, 1008, 139, 730, 22, 1081]"
4,4,"[5, 3, 20, 732, 1081, 1221, 789, 384, 67, 1008]"
...,...,...
103413,103873,[103872]
103414,103874,"[78377, 81022, 97964, 98805]"
103415,103875,[65077]
103416,103876,"[81564, 98570, 98616, 98784, 98942, 99920, 100..."


In [31]:
relations_train

Unnamed: 0,t_dat,session_id,item_id,n
4212358,2018-12-27,0,10895,18
4212359,2018-12-27,0,12746,19
4212360,2018-12-27,0,5938,20
9663224,2019-05-02,0,50328,17
10754876,2019-05-25,0,865,15
...,...,...,...,...
24375394,2020-04-09,1371978,84419,2
24375395,2020-04-09,1371978,82129,3
25077914,2020-04-25,1371978,84419,1
27806865,2020-06-22,1371978,93746,0


In [32]:
most_popular_items = relations_train[["session_id", 'item_id']].drop_duplicates()["item_id"].value_counts()

In [33]:
# relations_train2 = relations_train[relations_train.session_id.isin(relations_train.session_id.unique()[:1000000])]
# relations_train2 = relations_train2.astype({"session_id": np.int32, "item_id": np.int32}).drop(columns=["t_dat", "n"])

relations_train = relations_train.astype({"session_id": np.int32, "item_id": np.int32}).drop(columns=["t_dat", "n"])

In [None]:
m = relations_train[relations_train.session_id < 10].merge(covisited_candidates, on="item_id", how='left')

In [None]:
(
    m
    .dropna(subset=["candidate"])
    .drop(columns=["item_id"])
    .groupby(["session_id", "candidate"])["w"]
    .sum()
    .reset_index(name="count")   
)

In [34]:
N_candid = 50

recommendations = (
    relations_train
    .merge(covisited_candidates, on="item_id", how='left')
    .dropna(subset=["candidate"])
    .drop(columns=["item_id"])
    .groupby(["session_id", "candidate"])["w"]
    .sum()
    .reset_index(name="count")
    .sort_values(["session_id", "count"], ascending=[True, False])
    .groupby("session_id").head(N_candid)
    .groupby("session_id")["candidate"].apply(list)
)

def add_most_popular_items(reclist):
    return reclist + most_popular_items.index[:(N_candid - len(reclist))].tolist()

missing_recommendations_index = recommendations[recommendations.apply(len) < N_candid].index
recommendations[missing_recommendations_index] = recommendations[missing_recommendations_index].apply(add_most_popular_items)

recommendations_df = recommendations.reset_index()
recommendations_df.columns = ['session_id', 'candidates']
recommendations_df

Unnamed: 0,session_id,candidates
0,0,"[2323.0, 8656.0, 55.0, 62756.0, 61890.0, 1106...."
1,1,"[2880.0, 39611.0, 39489.0, 88001.0, 87972.0, 3..."
2,2,"[39611.0, 39489.0, 2880.0, 3136.0, 2882.0, 395..."
3,3,"[42711.0, 59981.0, 59984.0, 59261.0, 5320.0, 3..."
4,4,"[47790.0, 47899.0, 47791.0, 48691.0, 95100.0, ..."
...,...,...
1356665,1371975,"[52356.0, 52355.0, 49943.0, 44363.0, 44364.0, ..."
1356666,1371976,"[49927.0, 50066.0, 88001.0, 38400.0, 49739.0, ..."
1356667,1371977,"[38400.0, 50066.0, 40366.0, 35570.0, 37219.0, ..."
1356668,1371978,"[57995.0, 80542.0, 1408.0, 83960.0, 77216.0, 8..."


In [36]:
missing_indices = np.setdiff1d(np.arange(n_users), recommendations.index)
most_popular_list = most_popular_items.index[:N_candid].tolist()
missing_recommendations_df = pd.DataFrame({
    'session_id': missing_indices,
    'candidates': [most_popular_list] * len(missing_indices)
})

In [37]:
recommendations_full = pd.concat([recommendations_df, missing_recommendations_df]).sort_values(by="session_id")

In [39]:
from retail_recommender_system.evaluation.metrics import map_k, precision_k, recall_k

import torch

In [40]:
ground_truth = torch.from_numpy(relations_validation[["session_id", "item_id"]].values).T

In [41]:
recommendations_tensor = torch.from_numpy(np.array(recommendations_full['candidates'].tolist()))

In [42]:
users_idx = torch.from_numpy(recommendations_full['session_id'].values)

In [None]:
map = map_k(recommendations_tensor, ground_truth, k=12, users_idx=users_idx, n_users=n_users, n_items=n_items)
prec = precision_k(recommendations_tensor, ground_truth, k=12, users_idx=users_idx, n_users=n_users, n_items=n_items)
rec = recall_k(recommendations_tensor, ground_truth, k=12, users_idx=users_idx, n_users=n_users, n_items=n_items)

print(f"MAP@12: {map:.6f} | Precision@12: {prec:.6f} | Recall@12: {rec:.6f}")

MAP@12: 0.005052 | Precision@12: 0.003972 | Recall@12: 0.015037


In [44]:
map = map_k(recommendations_tensor, ground_truth, k=30, users_idx=users_idx, n_users=n_users, n_items=n_items)
prec = precision_k(recommendations_tensor, ground_truth, k=30, users_idx=users_idx, n_users=n_users, n_items=n_items)
rec = recall_k(recommendations_tensor, ground_truth, k=30, users_idx=users_idx, n_users=n_users, n_items=n_items)

print(f"MAP@30: {map:.6f} | Precision@30: {prec:.6f} | Recall@30: {rec:.6f}")

MAP@30: 0.003985 | Precision@30: 0.002836 | Recall@30: 0.026820


In [45]:
map = map_k(recommendations_tensor, ground_truth, k=50, users_idx=users_idx, n_users=n_users, n_items=n_items)
prec = precision_k(recommendations_tensor, ground_truth, k=50, users_idx=users_idx, n_users=n_users, n_items=n_items)
rec = recall_k(recommendations_tensor, ground_truth, k=50, users_idx=users_idx, n_users=n_users, n_items=n_items)

print(f"MAP@50: {map:.6f} | Precision@50: {prec:.6f} | Recall@50: {rec:.6f}")

MAP@50: 0.003417 | Precision@50: 0.002333 | Recall@50: 0.036442


In [None]:
recommendations_submission = recommendations_full.merge(customer_id_map, on="session_id").drop("session_id", axis=1)
item_to_article_map = dict(zip(article_id_map['item_id'], article_id_map['article_id']))
recommendations_submission["prediction"] = recommendations_submission['candidates'].apply(lambda x: " ".join([str(item_to_article_map[item]) for item in x]))
recommendations_submission = recommendations_submission.drop("candidates", axis=1)

In [None]:
recommendations_submission = recommendations_submission[['customer_id', 'prediction']]
recommendations_submission.to_csv(base_path / "recommendations_submission.csv", index=False)

In [None]:
import gzip
import shutil

with open(base_path / "recommendations_submission.csv", 'rb') as f_in:
    with gzip.open(base_path / "recommendations_submission.csv.gz", 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [None]:
!kaggle competitions submit -c h-and-m-personalized-fashion-recommendations -f {base_path / "recommendations_submission.csv.gz"} -m "Message"

In [None]:
relations_validation.groupby("session_id")["item_id"].apply(list)