In [1]:
%load_ext autoreload
%autoreload 2

import IPython
from pathlib import Path
import os
# Change the working directory to the parent of the folder containing this
# notebook, so the relative ".data/..." paths used in this notebook resolve.
# The result of extract_module_locals() is bound to `module_locals` rather than
# `locals`, so the built-in locals() stays usable for the rest of the session.
# NOTE(review): "__vsc_ipynb_file__" is read from the kernel namespace; presumably
# it is only defined when the notebook runs inside VS Code -- confirm before
# running elsewhere (a KeyError is raised if it is missing).
module_locals = IPython.extract_module_locals()  # type: ignore
notebook_name = module_locals[1]["__vsc_ipynb_file__"]
os.chdir(Path(notebook_name).parent.parent)

In [2]:
from pathlib import Path

import numpy as np
import polars as pl
import pandas as pd

In [3]:
# Load the three raw tables from .data/base.
# Only the transactions file is read with automatic date parsing enabled.
articles = pl.read_csv(".data/base/articles.csv")
users = pl.read_csv(".data/base/customers.csv")
relations = pl.read_csv(
    ".data/base/transactions_train.csv",
    try_parse_dates=True,
)

In [4]:
# Create a mapping from the original id to a new integer id.
# Each id column is cast to Categorical and the category's physical (integer)
# code becomes the new id. article_id goes through String first because it is
# an integer column (i64 in the displayed articles frame).
users = users.with_columns(
    customer_id_map=pl.col("customer_id").cast(pl.Categorical).to_physical()
)
articles = articles.with_columns(
    article_id_map=pl.col("article_id").cast(pl.String).cast(pl.Categorical).to_physical()
)

users_id_map = users.select("customer_id", "customer_id_map").unique()
articles_id_map = articles.select("article_id", "article_id_map").unique()

# Validate BEFORE writing anything: n distinct non-negative codes whose maximum
# is n - 1 must be exactly 0..n-1, i.e. the new ids are dense.
# NOTE(review): whether Categorical physical codes are dense per column may
# depend on the polars version / string-cache settings -- these asserts are the
# guard; confirm against the installed polars if they fail.
assert users.select("customer_id_map").n_unique() == (users.get_column("customer_id_map").max() + 1), \
    "customer_id_map is not a dense 0..n-1 range"  # type: ignore
assert articles.select("article_id_map").n_unique() == (articles.get_column("article_id_map").max() + 1), \
    "article_id_map is not a dense 0..n-1 range"  # type: ignore

# Persist the maps only once they are known to be valid.
# File names are "<id column>_id_map.parquet", i.e. customer_id_id_map.parquet
# and article_id_id_map.parquet.
Path(".data/intermediate").mkdir(parents=True, exist_ok=True)
for id_col, id_map in zip(["customer_id", "article_id"], [users_id_map, articles_id_map]):
    id_map.write_parquet(f".data/intermediate/{id_col}_id_map.parquet")

In [5]:
# Add path column to articles.
# Every *.jpg under .data/base/images is collected recursively; the integer
# value of the file stem is used as the article_id join key and the file's
# path string becomes the "path" value.
article_path_tuple_list = [(int(i.stem), str(i)) for i in Path(".data/base/images").rglob("*.jpg")]
articles_path_map = pl.DataFrame(
    {
        "article_id": [article_id for article_id, _ in article_path_tuple_list],
        "path": [path for _, path in article_path_tuple_list],
    },
    # Explicit dtypes: with no images found, the columns would otherwise be
    # inferred from empty lists and the join key would not match article_id.
    schema={"article_id": pl.Int64, "path": pl.String},
)
# Left join keeps every article; articles without an image get a null path.
articles = articles.join(articles_path_map, on="article_id", how="left")

In [6]:
# Order the transactions by t_dat, then left-join the customer and article
# id maps so each row also carries customer_id_map and article_id_map.
relations = (
    relations
    .sort("t_dat")
    .join(users_id_map, on="customer_id", how="left")
    .join(articles_id_map, on="article_id", how="left")
)

In [7]:
# Persist the three processed frames as parquet files under .data/intermediate.
for frame, destination in (
    (users, ".data/intermediate/users.parquet"),
    (articles, ".data/intermediate/articles.parquet"),
    (relations, ".data/intermediate/relations.parquet"),
):
    frame.write_parquet(destination)

In [8]:
# Display the articles frame (now including the article_id_map and path columns).
articles

article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,article_id_map,path
i64,i64,str,i64,str,str,i64,str,i64,str,i64,str,i64,str,i64,str,str,str,i64,str,i64,str,i64,str,str,u32,str
108775015,108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",9,"""Black""",4,"""Dark""",5,"""Black""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""","""Jersey top with narrow shoulde…",0,""".data/base/images/010/01087750…"
108775044,108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",10,"""White""",3,"""Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""","""Jersey top with narrow shoulde…",1,""".data/base/images/010/01087750…"
108775051,108775,"""Strap top (1)""",253,"""Vest top""","""Garment Upper body""",1010017,"""Stripe""",11,"""Off White""",1,"""Dusty Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""","""Jersey top with narrow shoulde…",2,""".data/base/images/010/01087750…"
110065001,110065,"""OP T-shirt (Idro)""",306,"""Bra""","""Underwear""",1010016,"""Solid""",9,"""Black""",4,"""Dark""",5,"""Black""",1339,"""Clean Lingerie""","""B""","""Lingeries/Tights""",1,"""Ladieswear""",61,"""Womens Lingerie""",1017,"""Under-, Nightwear""","""Microfibre T-shirt bra with un…",3,""".data/base/images/011/01100650…"
110065002,110065,"""OP T-shirt (Idro)""",306,"""Bra""","""Underwear""",1010016,"""Solid""",10,"""White""",3,"""Light""",9,"""White""",1339,"""Clean Lingerie""","""B""","""Lingeries/Tights""",1,"""Ladieswear""",61,"""Womens Lingerie""",1017,"""Under-, Nightwear""","""Microfibre T-shirt bra with un…",4,""".data/base/images/011/01100650…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
953450001,953450,"""5pk regular Placement1""",302,"""Socks""","""Socks & Tights""",1010014,"""Placement print""",9,"""Black""",4,"""Dark""",5,"""Black""",7188,"""Socks Bin""","""F""","""Menswear""",3,"""Menswear""",26,"""Men Underwear""",1021,"""Socks and Tights""","""Socks in a fine-knit cotton bl…",105537,""".data/base/images/095/09534500…"
953763001,953763,"""SPORT Malaga tank""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",9,"""Black""",4,"""Dark""",5,"""Black""",1919,"""Jersey""","""A""","""Ladieswear""",1,"""Ladieswear""",2,"""H&M+""",1005,"""Jersey Fancy""","""Loose-fitting sports vest top …",105538,""".data/base/images/095/09537630…"
956217002,956217,"""Cartwheel dress""",265,"""Dress""","""Garment Full body""",1010016,"""Solid""",9,"""Black""",4,"""Dark""",5,"""Black""",1641,"""Jersey""","""A""","""Ladieswear""",1,"""Ladieswear""",18,"""Womens Trend""",1005,"""Jersey Fancy""","""Short, A-line dress in jersey …",105539,""".data/base/images/095/09562170…"
957375001,957375,"""CLAIRE HAIR CLAW""",72,"""Hair clip""","""Accessories""",1010016,"""Solid""",9,"""Black""",4,"""Dark""",5,"""Black""",3946,"""Small Accessories""","""D""","""Divided""",2,"""Divided""",52,"""Divided Accessories""",1019,"""Accessories""","""Large plastic hair claw.""",105540,""".data/base/images/095/09573750…"


In [9]:
# Optional: Check image paths of the articles
def to_pandas_hyperlink(df: pl.DataFrame, path_col: str) -> pd.DataFrame:
    def fun(path): 
        f_url = os.path.basename(path)
        path_corrected = f"../{path}"
        return '<a href="{}">{}</a>'.format(path_corrected, f_url) 
    df_pd = df.to_pandas()
    return df_pd.style.format({path_col: fun}) # type: ignore

# Show 100 sampled articles with their "path" column rendered as links.
# No seed is passed to sample(), so the selected rows can differ between runs.
to_pandas_hyperlink(articles.sample(100), "path")

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,article_id_map,path
0,695573003,695573,FREDRIK SHORTS TP,274,Shorts,Garment Lower body,1010016,Solid,73,Dark Blue,4,Dark,2,Blue,7648,Kids Boy Jersey Fancy,H,Children Sizes 92-140,4,Baby/Children,46,Kids Boy,1005,Jersey Fancy,"Shorts in soft, printed cotton jersey with an elasticated drawstring waist.",49889,0695573003.jpg
1,870579002,870579,Ariana hood,252,Sweater,Garment Upper body,1010008,Front print,9,Black,4,Dark,5,Black,8716,Young Girl Jersey Fancy,I,Children Sizes 134-170,4,Baby/Children,77,Young Girl,1005,Jersey Fancy,"Long-sleeved hoodie in soft, printed sweatshirt fabric with a double-layered hood, kangaroo pocket and ribbing at the cuffs and hem. Soft brushed inside.",96538,0870579002.jpg
2,909091001,909091,Winona Swimsuit,298,Bikini top,Swimwear,1010005,Colour blocking,9,Black,4,Dark,5,Black,1641,Jersey,A,Ladieswear,1,Ladieswear,18,Womens Trend,1005,Jersey Fancy,Fully lined one-shoulder swimsuit with a cut-out section at the front and side.,103072,0909091001.jpg
3,701853010,701853,Satin kimono (W),305,Robe,Underwear,1010016,Solid,71,Light Blue,1,Dusty Light,2,Blue,3709,Nightwear,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1017,"Under-, Nightwear","Satin dressing gown with concealed ties at the waist and a detachable tie belt. Short, wide sleeves with contrasting colour trims.",52674,0701853010.jpg
4,567533001,567533,Spencer padded bra 2pk,306,Bra,Underwear,1010001,All over pattern,31,Light Orange,1,Dusty Light,3,Orange,3705,Mama Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Soft jersey nursing bras with opening, lined cups with space for nursing pads, elastication under the bust and a hook-and-eye fastening at the back.",15565,0567533001.jpg
5,641433001,641433,Merry x-mas,252,Sweater,Garment Upper body,1010010,Melange,7,Grey,1,Dusty Light,12,Grey,8758,Young Boy Knitwear,I,Children Sizes 134-170,4,Baby/Children,47,Young Boy,1003,Knitwear,"Jumper in a fine-knit cotton blend with a print motif on the front and ribbing around the neckline, cuffs and hem.",34347,0641433001.jpg
6,705330006,705330,Borat Sweatpants,272,Trousers,Garment Lower body,1010016,Solid,53,Dark Pink,4,Dark,18,Red,1643,Basic 1,D,Divided,2,Divided,51,Divided Basics,1002,Jersey Basic,"Trousers in sweatshirt fabric with an elasticated waist, side pockets and ribbed hems.",53648,0705330006.jpg
7,715346008,715346,SULLIVAN CREWNECK,252,Sweater,Garment Upper body,1010008,Front print,73,Dark Blue,4,Dark,2,Blue,7648,Kids Boy Jersey Fancy,H,Children Sizes 92-140,4,Baby/Children,46,Kids Boy,1005,Jersey Fancy,"Top in cotton piqué with long raglan sleeves and ribbing around the neckline, cuffs and hem.",56974,0715346008.jpg
8,443696031,443696,BO basic OH BB,308,Hoodie,Garment Upper body,1010016,Solid,19,Greenish Khaki,4,Dark,20,Khaki green,8768,Young Boy Jersey Basic,I,Children Sizes 134-170,4,Baby/Children,72,Boys Underwear & Basics,1002,Jersey Basic,"Long-sleeved top in sweatshirt fabric made from an organic cotton blend with a lined hood, kangaroo pocket and ribbing at the cuffs and hem. Soft brushed inside.",2975,0443696031.jpg
9,694298045,694298,ALGOT body LS 3p,256,Bodysuit,Garment Upper body,1010001,All over pattern,51,Light Pink,1,Dusty Light,4,Pink,6564,Newborn,G,Baby Sizes 50-98,4,Baby/Children,44,Baby Essentials & Complements,1005,Jersey Fancy,"Long-sleeved bodysuits in soft, organic cotton jersey with a wrapover front and press-studs at the side and crotch.",49286,0694298045.jpg
