In [1]:
%load_ext autoreload
%autoreload 2

import IPython
from pathlib import Path
import os
# Find this notebook's path and switch the working directory to the
# grandparent of the folder that contains it, so the relative paths used in
# later cells are resolved from there.
# NOTE(review): "__vsc_ipynb_file__" looks like a variable injected by the
# VS Code notebook front end - confirm before running under another client.
# The tuple is bound to `module_locals` (not `locals`) so the builtin
# `locals()` stays callable in every later cell.
module_locals = IPython.extract_module_locals() # type: ignore
notebook_name = module_locals[1]["__vsc_ipynb_file__"]
os.chdir(Path(notebook_name).parent.parent.parent)

In [2]:
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt

In [9]:
# Folder holding the raw input CSVs (relative to the current working directory).
base_path = Path(".data/hm/base")

# Resolve both file locations first, then read each one into a DataFrame.
transactions_csv = base_path.joinpath("transactions_train.csv")
submission_csv = base_path.joinpath("sample_submission.csv")
relations = pd.read_csv(transactions_csv)
sample_submission = pd.read_csv(submission_csv)

In [10]:
# Distinct customers come from the submission template, distinct articles
# from the transaction log.
n_users = sample_submission["customer_id"].nunique()
n_items = relations["article_id"].nunique()

# Report: distinct customers, distinct articles, number of transaction rows.
print(n_users, n_items, len(relations))

1371980 104547 31788324


In [11]:
# Lookup tables assigning every raw id a dense, zero-based integer id.
#
# Each range is sized from the unique-value array it is paired with instead
# of a separately computed count: `nunique()` ignores missing values by
# default whereas `unique()` keeps them, so the two can disagree and make the
# DataFrame constructor fail on mismatched column lengths. Computing the
# unique values once also avoids a second pass over the column.
unique_customer_ids = sample_submission["customer_id"].unique()
customer_id_map = pd.DataFrame(
    {
        "customer_id": unique_customer_ids,
        "session_id": range(len(unique_customer_ids)),
    }
)

unique_article_ids = relations["article_id"].unique()
article_id_map = pd.DataFrame(
    {
        "article_id": unique_article_ids,
        "item_id": range(len(unique_article_ids)),
    }
)

In [12]:
# Inspect the loaded transactions; the bare expression renders the frame as the cell output.
relations

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2
...,...,...,...,...,...
31788319,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,929511001,0.059305,2
31788320,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,891322004,0.042356,2
31788321,2020-09-22,fff380805474b287b05cb2a7507b9a013482f7dd0bce0e...,918325001,0.043203,1
31788322,2020-09-22,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,833459002,0.006763,1


In [13]:
# Replace the raw customer/article ids with their integer counterparts from
# the two lookup tables, parse the transaction date, and order the rows by
# session and then by date. Written as a single chain so no intermediate
# frame stays bound to a name.
relations = (
    pd.merge(relations, customer_id_map, on="customer_id")
    .merge(article_id_map, on="article_id")
    .drop(columns=["customer_id", "article_id"])
    .assign(t_dat=lambda frame: pd.to_datetime(frame["t_dat"]))
    .sort_values(by=["session_id", "t_dat"], ascending=True)
)

In [14]:
# Temporal hold-out: rows dated within the 7 days before the latest
# transaction date form the validation set; everything on or before the
# cut-off is training data.
purchase_dates = relations["t_dat"]
validation_split_date = purchase_dates.max() - pd.Timedelta(days=7)
relations_train = relations.loc[purchase_dates.le(validation_split_date)]
relations_validation = relations.loc[purchase_dates.gt(validation_split_date)]

In [15]:
# Inspect the training split (rows dated on or before validation_split_date).
relations_train

Unnamed: 0,t_dat,price,sales_channel_id,session_id,item_id
4212358,2018-12-27,0.044051,1,0,10895
4212359,2018-12-27,0.035576,1,0,12746
4212360,2018-12-27,0.030492,1,0,5938
9663224,2019-05-02,0.010153,2,0,50328
10754876,2019-05-25,0.050831,2,0,865
...,...,...,...,...,...
24375394,2020-04-09,0.043203,2,1371978,84419
24375395,2020-04-09,0.013542,2,1371978,82129
25077914,2020-04-25,0.050831,2,1371978,84419
27806865,2020-06-22,0.016932,1,1371978,93746


In [16]:
# Inspect the validation split (rows dated after validation_split_date).
relations_validation

Unnamed: 0,t_dat,price,sales_channel_id,session_id,item_id
31691839,2020-09-20,0.013542,1,80,2145
31755458,2020-09-22,0.042356,2,86,85132
31723328,2020-09-21,0.033881,2,107,60282
31723329,2020-09-21,0.042356,2,107,102327
31723330,2020-09-21,0.050831,2,107,80800
...,...,...,...,...,...
31575037,2020-09-16,0.005068,2,1371879,93696
31575038,2020-09-16,0.016932,2,1371879,92067
31575039,2020-09-16,0.042356,1,1371937,79455
31575040,2020-09-16,0.016932,1,1371937,68989


In [19]:
intermediate_path = Path(".data/hm/intermediate/cov1")

relations_train.to_parquet(intermediate_path / "relations_train.parquet")
relations_validation.to_parquet(intermediate_path / "relations_validation.parquet")
customer_id_map.to_parquet(intermediate_path / "customer_id_map.parquet")
article_id_map.to_parquet(intermediate_path / "article_id_map.parquet")