In [1]:
%load_ext autoreload
%autoreload 2

import IPython
from pathlib import Path
import os
# Look up the notebook's file path in the caller namespace returned by IPython
# and make the parent of the notebook's directory the working directory, so the
# relative ".data/..." paths used in the cells below resolve from there.
# NOTE(review): "__vsc_ipynb_file__" is presumably injected only by the VS Code
# Jupyter extension — confirm before running this notebook in another frontend.
# The namespace is bound to its own name instead of `locals` so the builtin
# `locals()` is not shadowed for the rest of the kernel session.
caller_namespace = IPython.extract_module_locals()[1]  # type: ignore
notebook_name = caller_namespace["__vsc_ipynb_file__"]
os.chdir(Path(notebook_name).parent.parent)

In [2]:
from pathlib import Path

import numpy as np
import polars as pl
import pandas as pd

In [3]:
# Load the three raw tables from the .data/base folder.
# Only the transactions file is read with date parsing switched on.
articles = pl.read_csv(source=".data/base/articles.csv")
users = pl.read_csv(source=".data/base/customers.csv")
relations = pl.read_csv(
    source=".data/base/transactions_train.csv",
    try_parse_dates=True,
)

In [4]:
# Create a mapping from the original id to a new id.
# Each id column is cast to Categorical and its physical (integer) representation
# is stored next to the original id as "<name>_map".
users = users.with_columns(
    customer_id_map=pl.col("customer_id").cast(pl.Categorical).to_physical()
)
# article_id is cast to String first, then to Categorical.
articles = articles.with_columns(
    article_id_map=pl.col("article_id").cast(pl.String).cast(pl.Categorical).to_physical()
)

# Validate BEFORE anything is written to disk: the number of distinct codes must
# equal the largest code + 1, i.e. the codes have no gaps.
# NOTE(review): this presumes every Categorical cast numbers its own values
# starting at 0; if the installed polars shares categorical codes between
# columns the articles check will fail — confirm against the polars version in use.
assert users.select("customer_id_map").n_unique() == (users.get_column("customer_id_map").max() + 1), \
    "customer_id_map codes are not a gap-free range"  # type: ignore
assert articles.select("article_id_map").n_unique() == (articles.get_column("article_id_map").max() + 1), \
    "article_id_map codes are not a gap-free range"  # type: ignore

# parents=True so the cell also works when ".data" itself has to be created.
Path(".data/intermediate").mkdir(parents=True, exist_ok=True)
users_id_map = users.select("customer_id", "customer_id_map").unique()
articles_id_map = articles.select("article_id", "article_id_map").unique()
# File names are "<column>_id_map.parquet"; kept unchanged because other code may read them.
for c, id_map in zip(["customer_id", "article_id"], [users_id_map, articles_id_map]):
    id_map.write_parquet(f".data/intermediate/{c}_id_map.parquet")

In [5]:
# Add path column to articles.
# Every .jpg below the image folder contributes one (article id, file path) pair;
# the file stem is parsed as the integer article id.
article_path_tuple_list = [(int(i.stem), str(i)) for i in Path(".data/base/images").rglob("*.jpg")]
# The explicit schema keeps both columns typed even when no image is found,
# so the join below does not fail on an untyped (all-null) key column.
articles_path_map = pl.DataFrame(
    {
        "article_id": [article_id for article_id, _ in article_path_tuple_list],
        "path": [image_path for _, image_path in article_path_tuple_list],
    },
    schema={"article_id": pl.Int64, "path": pl.String},
)
# Drop a "path" column left over from a previous run of this cell before joining;
# otherwise re-running the cell would add a second "path_right" column.
articles = (
    articles
    .select([name for name in articles.columns if name != "path"])
    .join(articles_path_map, on="article_id", how="left")
)

In [6]:
# Add mapping columns to relations.
# Rows are sorted by "t_dat", then the customer and article id maps are attached
# with left joins so every relation row is kept.
# Map columns left over from a previous run of this cell are dropped first;
# otherwise re-running the cell would add "*_map_right" duplicates.
relations = (
    relations
    .select([
        name for name in relations.columns
        if name not in ("customer_id_map", "article_id_map")
    ])
    .sort("t_dat")
    .join(users_id_map, on="customer_id", how="left")
    .join(articles_id_map, on="article_id", how="left")
)

In [7]:
# Persist the three prepared frames as parquet files under .data/intermediate
relations.write_parquet(file=".data/intermediate/relations.parquet")
articles.write_parquet(file=".data/intermediate/articles.parquet")
users.write_parquet(file=".data/intermediate/users.parquet")

In [None]:
# Display the articles frame as the cell output for a visual check
articles

In [None]:
# Optional: Check image paths of the articles
def to_pandas_hyperlink(df: pl.DataFrame, path_col: str) -> pd.DataFrame:
    def fun(path): 
        f_url = os.path.basename(path)
        path_corrected = f"../{path}"
        return '<a href="{}">{}</a>'.format(path_corrected, f_url) 
    df_pd = df.to_pandas()
    return df_pd.style.format({path_col: fun}) # type: ignore

# Spot-check 100 randomly drawn articles with their image paths rendered as links
to_pandas_hyperlink(df=articles.sample(n=100), path_col="path")