In [1]:
# How many rows are there in tx?

import pandas as pd
tx = pd.read_csv("../data/external/transactions_train.csv", dtype='string')

print(len(tx))

259661


In [2]:
articles = pd.read_csv("../data/processed/articles_clean.csv", dtype="string", usecols=["sku"])
tx["sku"] = tx["sku"].str.strip()
articles["sku"] = articles["sku"].str.strip()
tx_skus = pd.Index(tx["sku"].dropna().unique())
art_skus = pd.Index(articles["sku"].dropna().unique())
missing = tx_skus.difference(art_skus)
print(f"Unique SKUs in tx: {len(tx_skus):,} | in articles: {len(art_skus):,}")
print(f"NOT FOUND in articles (exact): {len(missing):,}")

Unique SKUs in tx: 17,500 | in articles: 107,749
NOT FOUND in articles (exact): 0


In [3]:
display(tx.sort_values("orderId", key=lambda x: x.astype(int)).head())

Unnamed: 0,orderId,shopUserId,created,currencyId,orderLineId,sku,quantity,price,name
259660,158790,78135,2024-05-22 12:38:29,50,454306,291294,1,11.3,Vasaravarpaan oikaisija 4/pak
259659,158791,78136,2024-05-22 12:44:01,40,454308,542092,1,89.0,Tvättlappar enfärgade av frotté 5-pack
259657,158800,78145,2024-05-22 12:54:51,40,454358,250209-0039,1,259.0,Hälsotoffel
259658,158800,78145,2024-05-22 12:54:51,40,454359,261518,1,49.0,Bh-förlängare 3-pack
259655,158805,78150,2024-05-22 12:59:31,134,454377,542252,1,179.0,Köksset 4-delar återvunnet material


In [4]:
# How many null values are there in tx?

# Use the already loaded 'tx' DataFrame from previous cells
print(tx.isnull().sum())

orderId        0
shopUserId     0
created        0
currencyId     0
orderLineId    1
sku            1
quantity       1
price          1
name           1
dtype: int64


In [5]:
# Remove all rows with null values
tx = tx.dropna()

 compute order-level user history (past order count, recency in seconds/log, and average past basket value) and merge it back to each line, so every row has the same past-user context without leaking the current order. We then add leave-one-line-out basket features (sum/qty/lines/avg unit price excluding the current line) and calendar features (hour, weekday/weekend, month, quarter) from order_created. 

In [6]:
import numpy as np
import pandas as pd

# --- Clean basic line features ---
tx['created']  = pd.to_datetime(tx['created'], errors='coerce')
tx['price']    = pd.to_numeric(tx['price'], errors='coerce')
tx['quantity'] = pd.to_numeric(tx['quantity'], errors='coerce').clip(lower=1)
tx['line_value'] = tx['price'] * tx['quantity']
tx['unit_price'] = tx['price']

# --- Order-level user history ---
orders = (tx.sort_values(['shopUserId','created'])
            .groupby(['shopUserId','orderId'], as_index=False)
            .agg(order_created=('created','min'),
                 basket_value=('line_value','sum')))

orders['user_past_orders']  = orders.groupby('shopUserId').cumcount().astype('Int32')
orders['user_recency_sec']  = (orders.groupby('shopUserId')['order_created']
                                     .diff().dt.total_seconds().fillna(-1))

# Optional: smoother scale for some models
orders['user_recency_log'] = np.log1p(np.clip(orders['user_recency_sec'].where(orders['user_recency_sec']>=0, 0),
                                              0, 182*24*3600))

# Left-exclusive avg basket value
g   = orders.groupby('shopUserId', sort=False)
den = orders['user_past_orders'].replace(0, np.nan)
orders['user_avg_basket_value'] = ((g['basket_value'].cumsum() - orders['basket_value']) / den).fillna(0.0)

# Merge back 
tx = tx.merge(orders[['shopUserId','orderId','order_created','basket_value',
                      'user_past_orders','user_recency_sec','user_recency_log',
                      'user_avg_basket_value']],
              on=['shopUserId','orderId'], how='left')

# --- Basket context (leave-one-line-out) ---
grp = tx.groupby('orderId', dropna=False)
tx['basket_total_value_excl']    = grp['line_value'].transform('sum') - tx['line_value']
tx['basket_total_qty_excl']      = grp['quantity'].transform('sum')   - tx['quantity']
tx['basket_num_lines_excl']      = grp['orderLineId'].transform('count') - 1
tx['basket_avg_unit_price_excl'] = (
    (grp['unit_price'].transform('sum') - tx['unit_price']) /
    tx['basket_num_lines_excl'].clip(lower=1)
)

# --- Time features from order_created ---
tx['year']      = tx['order_created'].dt.year.astype('Int16')
tx['week']      = tx['order_created'].dt.isocalendar().week.astype('Int16')
tx['hour']      = tx['order_created'].dt.hour.astype('Int16')
tx['dayofweek'] = tx['order_created'].dt.dayofweek.astype('Int8')
tx['is_weekend']= tx['dayofweek'].isin([5,6]).astype('Int8')
tx['month']     = tx['order_created'].dt.month.astype('Int8')
tx['quarter']   = tx['order_created'].dt.quarter.astype('Int8')


Goal: Create one row per user–item interaction (the “positive” examples), with the user’s context at that time.

In [7]:
import pandas as pd
import numpy as np

# --- minimal guards so .dt works ---
tx['created'] = pd.to_datetime(tx['created'], errors='coerce')   # ensure created is datetime

# Ensure order_created/year/week exist
if 'order_created' not in tx.columns:
    oc = (tx.groupby(['shopUserId','orderId'], as_index=False)['created']
            .min()
            .rename(columns={'created':'order_created'}))
    tx = tx.merge(oc, on=['shopUserId','orderId'], how='left')

tx['order_created'] = pd.to_datetime(tx['order_created'], errors='coerce')  # <-- make datetime

if 'year' not in tx.columns or 'week' not in tx.columns:
    tx['year'] = tx['order_created'].dt.year
    tx['week'] = tx['order_created'].dt.isocalendar().week.astype(int)  # week number only

# POSITIVES: one row per (user, order, sku) with compact context
pos = (tx.groupby(['shopUserId','orderId','sku','year','week','order_created',
                   'hour','dayofweek','is_weekend','month','quarter',
                   'user_past_orders','user_recency_log','user_avg_basket_value'],
                  as_index=False)
         .agg(quantity=('quantity','sum'),
              price=('price','mean'),
              line_value=('line_value','sum'),
              unit_price=('unit_price','mean'),
              basket_total_value_excl=('basket_total_value_excl','mean'),
              basket_total_qty_excl=('basket_total_qty_excl','mean'),
              basket_num_lines_excl=('basket_num_lines_excl','mean'),
              basket_avg_unit_price_excl=('basket_avg_unit_price_excl','mean')))
pos['label'] = 1

# Weekly context per user (latest order in that week)
pos['user_week_key'] = (pos['shopUserId'].astype(str) + '|' +
                        pos['year'].astype(str) + '|' +
                        pos['week'].astype(str))

uw_context = (pos.sort_values(['shopUserId','year','week','order_created'])
                .groupby(['shopUserId','year','week','user_week_key'], as_index=False)
                .tail(1)[['shopUserId','year','week','user_week_key','order_created',
                          'hour','dayofweek','is_weekend','month','quarter',
                          'user_past_orders','user_recency_log','user_avg_basket_value']])


In [8]:
import pandas as pd, numpy as np

# Popular SKUs pool (simple candidate generator)
TOP_K = 50  # try 50/100 if you want
top_skus = (tx.groupby('sku')['orderId'].nunique()
              .sort_values(ascending=False)
              .head(TOP_K).index)

# Cross-join (user-week) × (top_skus), drop positives, then sample
user_weeks = pos[['shopUserId','year','week','user_week_key']].drop_duplicates()
neg_pool = (user_weeks.assign(key=1)
            .merge(pd.DataFrame({'sku': top_skus, 'key':1}), on='key', how='left')
            .drop('key', axis=1))
neg_pool = neg_pool.merge(pos[['user_week_key','sku']].drop_duplicates().assign(pos=1),
                          on=['user_week_key','sku'], how='left')
neg_pool = neg_pool[neg_pool['pos'].isna()].drop(columns='pos')

# k negatives per positive (per user-week)
k = 5
n_pos = pos.groupby('user_week_key').size().rename('n_pos')
neg_pool = neg_pool.merge(n_pos, on='user_week_key', how='left')
rng = np.random.default_rng(42)
neg_pool = (neg_pool.assign(rand=rng.random(len(neg_pool)))
                   .sort_values(['user_week_key','rand'])
                   .groupby('user_week_key', group_keys=False)
                   .apply(lambda g: g.head(int(g['n_pos'].iloc[0] * k)))
                   .drop(columns=['rand','n_pos']))

# Attach weekly context; fill item/basket with neutral values
negs = (neg_pool.merge(uw_context, on=['shopUserId','year','week','user_week_key'], how='left')
               .assign(orderId=np.nan, quantity=0, price=0.0, line_value=0.0, unit_price=0.0,
                       basket_total_value_excl=0.0, basket_total_qty_excl=0.0,
                       basket_num_lines_excl=0.0, basket_avg_unit_price_excl=0.0, label=0))

# Final UI training frame (keep keys for joins; drop before model.fit if desired)
cols = ['shopUserId','sku','orderId','order_created','hour','dayofweek','is_weekend','month','quarter',
        'user_past_orders','user_recency_log','user_avg_basket_value',
        'quantity','price','line_value','unit_price',
        'basket_total_value_excl','basket_total_qty_excl','basket_num_lines_excl','basket_avg_unit_price_excl',
        'year','week','user_week_key','label']
ui_train = (pd.concat([pos[cols], negs[cols]], ignore_index=True)
              .sample(frac=1.0, random_state=42)
              .reset_index(drop=True))

print(ui_train.shape, ui_train['label'].value_counts())

  neg_pool = (neg_pool.assign(rand=rng.random(len(neg_pool)))
  ui_train = (pd.concat([pos[cols], negs[cols]], ignore_index=True)


(1541914, 24) label
0    1282340
1     259574
Name: count, dtype: int64


In [9]:

from IPython.display import display, HTML
display(HTML('<div style="overflow-x:auto">' + ui_train.head(3).to_html() + '</div>'))


Unnamed: 0,shopUserId,sku,orderId,order_created,hour,dayofweek,is_weekend,month,quarter,user_past_orders,user_recency_log,user_avg_basket_value,quantity,price,line_value,unit_price,basket_total_value_excl,basket_total_qty_excl,basket_num_lines_excl,basket_avg_unit_price_excl,year,week,user_week_key,label
0,493841,264242,,2024-11-21 08:59:23,8,3,0,11,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024,47,493841|2024|47,0
1,329945,261933-4244,,2024-08-06 05:13:02,5,1,0,8,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024,32,329945|2024|32,0
2,583806,260224,,2025-02-03 13:11:58,13,0,0,2,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025,6,583806|2025|6,0


In [10]:
ui_train.drop(columns=['orderId', "order_created"], inplace=True)

In [11]:
import pandas as pd

# Load & normalize keys
articles = pd.read_csv("../data/processed/articles_clean.csv", dtype=str)
articles['sku'] = articles['sku'].str.strip()

# Normalize categoryId (take primary before any comma)
articles['categoryId_primary'] = articles['categoryId'].str.split(',').str[0].str.strip()

# Select columns and dedupe to 1 row/sku
use_cols = ['sku','brandId','categoryId_primary','colorId','audienceId','name','status']
articles = articles[use_cols].drop_duplicates('sku', keep='last')

# Merge into ui_train
ui_train['sku'] = ui_train['sku'].astype(str).str.strip()
ui_train = ui_train.merge(articles, on='sku', how='left', validate='m:1')

# LightGBM: mark categoricals
for c in ['brandId','categoryId_primary','colorId','audienceId']:
    ui_train[c] = ui_train[c].astype('category')

# Keep `status` only for filtering active items at inference
active_skus = set(ui_train.loc[ui_train['status'].str.lower()=='active','sku'])


In [12]:

from IPython.display import display, HTML
display(HTML('<div style="overflow-x:auto">' + ui_train.head(3).to_html() + '</div>'))


Unnamed: 0,shopUserId,sku,hour,dayofweek,is_weekend,month,quarter,user_past_orders,user_recency_log,user_avg_basket_value,quantity,price,line_value,unit_price,basket_total_value_excl,basket_total_qty_excl,basket_num_lines_excl,basket_avg_unit_price_excl,year,week,user_week_key,label,brandId,categoryId_primary,colorId,audienceId,name,status
0,493841,264242,8,3,0,11,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024,47,493841|2024|47,0,82,27,105,6,Bh-förlängare 2-p,active
1,329945,261933-4244,5,1,0,8,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024,32,329945|2024|32,0,109,19,18,6,Stretchtrosa 3-pack,inactive
2,583806,260224,13,0,0,2,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025,6,583806|2025|6,0,109,95,97,6,Medicinska sockor 2-pack,active


In [13]:

# Load + normalize keys
cust = pd.read_csv("../data/processed/customers_clean.csv", dtype="string")
cust = cust.rename(columns={"Gender":"gender","Age":"age"})
cust["shopUserId"] = cust["shopUserId"].str.strip()
ui_train["shopUserId"] = ui_train["shopUserId"].astype(str).str.strip()

# Keep only one row per customer (avoid row explosion)
cust = cust[["shopUserId","invoiceCountryId","gender","age"]].drop_duplicates("shopUserId", keep="last")

# Merge (m:1 expected)
ui_train = ui_train.merge(cust, on="shopUserId", how="left", validate="m:1")

# LightGBM-friendly dtypes / light cleaning
ui_train["invoiceCountryId"] = ui_train["invoiceCountryId"].astype("category")
ui_train["gender"] = (ui_train["gender"].str.strip().str.lower()
                      .map({"female":"female","male":"male"}).fillna("unknown").astype("category"))
ui_train["age"] = (pd.to_numeric(ui_train["age"], errors="coerce")
                   .clip(lower=0, upper=120).astype("Int16"))

# age buckets for trees/DNN
ui_train["age_bucket"] = pd.cut(ui_train["age"], bins=[0,18,25,35,45,55,65,75,120],
                                labels=["<18","18-24","25-34","35-44","45-54","55-64","65-74","75+"]
                               ).astype("category")


In [15]:

from IPython.display import display, HTML
display(HTML('<div style="overflow-x:auto">' + ui_train.head(30).to_html() + '</div>'))


Unnamed: 0,shopUserId,sku,hour,dayofweek,is_weekend,month,quarter,user_past_orders,user_recency_log,user_avg_basket_value,quantity,price,line_value,unit_price,basket_total_value_excl,basket_total_qty_excl,basket_num_lines_excl,basket_avg_unit_price_excl,year,week,user_week_key,label,brandId,categoryId_primary,colorId,audienceId,name,status,invoiceCountryId,gender,age,age_bucket
0,493841,264242,8,3,0,11,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024,47,493841|2024|47,0,82,27,105,6,Bh-förlängare 2-p,active,160.0,unknown,,
1,329945,261933-4244,5,1,0,8,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024,32,329945|2024|32,0,109,19,18,6,Stretchtrosa 3-pack,inactive,72.0,female,70.0,65-74
2,583806,260224,13,0,0,2,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025,6,583806|2025|6,0,109,95,97,6,Medicinska sockor 2-pack,active,160.0,unknown,,
3,96782,261638-4044,11,1,0,5,2,2,14.833446,72.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025,22,96782|2025|22,0,251,639,35,6,"Ankelsocka VID""""",active,72.0,female,69.0,65-74
4,673257,240186-0042,12,0,0,3,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025,11,673257|2025|11,0,126,2110,35,6,Fritidsbyxa,active,,unknown,,
5,396024,261635-4044,10,1,0,9,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024,39,396024|2024|39,0,251,639,101,6,"Ankelsocka VID""""",active,160.0,unknown,,
6,678433,290290,11,4,0,3,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025,11,678433|2025|11,0,139,1418,__UNK_COLORID__,99,TurnKey skruvkorksöppnare,active,58.0,unknown,,
7,350037,260646-4648,6,2,0,8,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024,34,350037|2024|34,0,126,19,18,6,Trosa 3-pack,active,,unknown,,
8,702371,200400,10,3,0,4,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025,14,702371|2025|14,0,126,200,35,99,Svart väska m axelrem,active,72.0,female,70.0,65-74
9,487960,210338-4042,10,0,0,11,4,0,0.0,0.0,1.0,229.0,229.0,229.0,1097.0,3.0,3.0,365.666667,2024,47,487960|2024|47,1,126,48,671,6,T-shirt 2-pack,active,,unknown,,
