In [1]:
import os, gc
import numpy as np
import pandas as pd

# Run pandas with copy-on-write semantics for the whole session.
pd.set_option('mode.copy_on_write', True)

# Target week for this run, published through the process environment.
# NOTE(review): this assignment always replaces any value that was already set
# in the environment, so the assert below cannot fire as written.
os.environ['HM_TARGET_WEEK_END'] = '2020-08-19'

TARGET_WEEK_END = os.getenv("HM_TARGET_WEEK_END")
assert TARGET_WEEK_END is not None, "Set HM_TARGET_WEEK_END='YYYY-MM-DD'"
TARGET_WEEK_END = pd.to_datetime(TARGET_WEEK_END)

# Compact YYYYMMDD tag used to name the per-week input/output files.
WEEK_TAG = TARGET_WEEK_END.strftime("%Y%m%d")

TX_PATH = '../data/input_data/transactions_train.csv'
CAND_PATH = '../data/outputs/candidates_week=' + WEEK_TAG + '.parquet'


In [2]:
# Cell 2 – read this week's candidate pairs and pin compact dtypes
candidates = pd.read_parquet(CAND_PATH)

# Cast one column at a time so only a single column is duplicated at once.
for col, target_dtype in (
    ('customer_id', 'int64'),
    ('article_id', 'int32'),
    ('value', 'float32'),
):
    candidates[col] = candidates[col].astype(target_dtype)

# window_type is optional in the candidate file.
if 'window_type' in candidates.columns:
    candidates['window_type'] = candidates['window_type'].astype('category')

print(f"Candidates for week {WEEK_TAG} shape: {candidates.shape}")

# 27 sec

Candidates for week 20200819 shape: (262938890, 5)


In [None]:
# Cell 3 – build history up to cut_ts for this week
tx = pd.read_csv(
    TX_PATH,
    usecols=['t_dat','customer_id','article_id'],
    dtype={'t_dat':'string','customer_id':'string','article_id':'int32'},
)

# Turn the hex customer id into a signed 64-bit key: the last 16 hex digits are
# read as an unsigned 64-bit value whose bits are then reinterpreted as int64.
# The conversion is done once per distinct id and broadcast back through the
# factorize codes, instead of calling a Python lambda for every transaction row.
cust_codes, cust_uniques = pd.factorize(tx['customer_id'])
if (cust_codes < 0).any():
    # factorize marks missing values with -1; a missing id cannot be converted.
    raise ValueError("transactions contain rows with a missing customer_id")
cust_keys = np.array(
    [int(hex_id[-16:], 16) for hex_id in cust_uniques], dtype=np.uint64
).astype(np.int64)
tx['customer_id'] = cust_keys[cust_codes]
del cust_codes, cust_uniques, cust_keys

tx['t_dat'] = pd.to_datetime(tx['t_dat'])

# Features are computed as of one week before the target week end.
last_ts = TARGET_WEEK_END
cut_ts  = last_ts - pd.Timedelta(days=7)

# Full history up to cut_ts
tx_full = tx[tx['t_dat'] <= cut_ts][['customer_id','article_id','t_dat']].copy()

# Trailing 56-day slice (history_start excluded, cut_ts included).
history_start = cut_ts - pd.Timedelta(days=56)
tx_hist = tx[(tx['t_dat'] > history_start) & (tx['t_dat'] <= cut_ts)][['customer_id','article_id','t_dat']].copy()

# The raw frame is no longer needed once both slices exist.
del tx
gc.collect()

# Lower bounds (exclusive) of the 1-week and 4-week windows ending at cut_ts.
week_start  = cut_ts - pd.Timedelta(days=7)
month_start = cut_ts - pd.Timedelta(days=28)

# Purchases per customer inside each window.
cust_1w = (
    tx_hist.loc[tx_hist['t_dat'] > week_start]
    .groupby('customer_id')['article_id']
    .size()
    .reset_index(name='cust_purchases_1w')
)

cust_4w = (
    tx_hist.loc[tx_hist['t_dat'] > month_start]
    .groupby('customer_id')['article_id']
    .size()
    .reset_index(name='cust_purchases_4w')
)

# The windowed slice has served its purpose; release it.
del tx_hist
gc.collect()

# 2 min 23 sec

0

In [4]:
# Cell 4 – per-customer totals over the full history (everything up to cut_ts)
cust_agg = tx_full.groupby('customer_id').agg(
    # number of transaction rows for the customer
    customer_total_purchases=pd.NamedAgg(column='article_id', aggfunc='size'),
    # number of distinct articles the customer bought
    customer_unique_articles=pd.NamedAgg(column='article_id', aggfunc='nunique'),
).reset_index()

In [5]:
# Cell 5 – per-article totals over the full history (everything up to cut_ts)
art_agg = tx_full.groupby('article_id').agg(
    # number of transaction rows for the article
    article_total_purchases=pd.NamedAgg(column='customer_id', aggfunc='size'),
    # number of distinct customers who bought the article
    article_unique_customers=pd.NamedAgg(column='customer_id', aggfunc='nunique'),
).reset_index()

# Article metadata: read only the columns that are used as features.
art_meta_cols = [
    'article_id',
    'product_code',
    'department_no',
    'product_type_no',
    'product_group_name',
    'index_code',
    'section_no',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'index_group_no',
    'garment_group_no',
]
art_meta = pd.read_csv(
    '../data/input_data/articles.csv',
    usecols=art_meta_cols,
    dtype=dict.fromkeys(['article_id','product_code','department_no'], 'int32'),
)

# Replace the two text columns by their integer category codes.
art_meta = art_meta.assign(**{
    col: art_meta[col].astype('category').cat.codes.astype('int16')
    for col in ('product_group_name','index_code')
})

# Shrink the remaining integer columns to the smallest integer dtype that fits.
art_meta = art_meta.assign(**{
    col: pd.to_numeric(art_meta[col], downcast='integer')
    for col in (
        'product_type_no','section_no','graphical_appearance_no','colour_group_code',
        'perceived_colour_value_id','perceived_colour_master_id','index_group_no',
        'garment_group_no','department_no',
    )
})


In [6]:
# Cell 6 – last purchase per (customer, article) using all history
# Sort the full history by date once and reuse it for both "latest row"
# look-ups below; sorting is the expensive step on a frame of this size.
tx_by_date = tx_full.sort_values('t_dat')

# Most recent purchase date for every (customer, article) pair.
last_purchase = (
    tx_by_date.drop_duplicates(['customer_id','article_id'], keep='last')
              [['customer_id','article_id','t_dat']]
              .rename(columns={'t_dat':'last_purchase_date'})
)

# Most recent transaction row per customer: which article and when.
cust_last_purchase = (
    tx_by_date.drop_duplicates(['customer_id'], keep='last')
              [['customer_id','article_id','t_dat']]
              .rename(columns={
                  'article_id':'last_bought_article_id',
                  't_dat':'customer_last_purchase_date'
              })
)

del tx_by_date

# Second pass over the transactions file, this time including the price column.
tx_price = pd.read_csv(
    TX_PATH,
    usecols=['t_dat','customer_id','article_id','price'],
    dtype={'t_dat':'string','customer_id':'string','article_id':'int32','price':'float32'}
)

# Hex customer id -> signed 64-bit key (last 16 hex digits read as uint64, bits
# reinterpreted as int64). Each distinct id is converted once and broadcast
# through the factorize codes rather than running a Python lambda per row.
price_codes, price_uniques = pd.factorize(tx_price['customer_id'])
if (price_codes < 0).any():
    # factorize marks missing values with -1; a missing id cannot be converted.
    raise ValueError("transactions contain rows with a missing customer_id")
price_keys = np.array(
    [int(hex_id[-16:], 16) for hex_id in price_uniques], dtype=np.uint64
).astype(np.int64)
tx_price['customer_id'] = price_keys[price_codes]
del price_codes, price_uniques, price_keys

tx_price['t_dat'] = pd.to_datetime(tx_price['t_dat'])

# Only transactions up to cut_ts may contribute to the features.
# Boolean-mask selection already returns a new frame, so no extra copy is made.
tx_price_hist = tx_price[tx_price['t_dat'] <= cut_ts]

# Mean paid price per article and per customer.
art_price_agg = (tx_price_hist.groupby('article_id')['price']
                 .mean().reset_index(name='article_mean_price'))

cust_price_agg = (tx_price_hist.groupby('customer_id')['price']
                  .mean().reset_index(name='customer_mean_price'))

del tx_price, tx_price_hist
gc.collect()

# Customer metadata, keyed by the same int64 customer id as the transactions.
cust_meta = pd.read_csv(
    '../data/input_data/customers.csv',
    dtype={'customer_id':'string'},
    usecols=['customer_id','age','postal_code','club_member_status','fashion_news_frequency'],
)


def _hex16_to_int64(hex_tail):
    """Read a hex string as an unsigned 64-bit value and reinterpret it as int64."""
    return np.int64(np.uint64(int(hex_tail, 16)))


# Only the last 16 hex digits of the id are used for the key.
cust_meta['customer_id'] = cust_meta['customer_id'].str[-16:].apply(_hex16_to_int64)

# Missing ages are stored as the sentinel -1.
cust_meta['age'] = cust_meta['age'].fillna(-1).astype('float32')

# Remaining descriptive columns become pandas categoricals.
cust_meta = cust_meta.astype({
    col: 'category'
    for col in ('postal_code','club_member_status','fashion_news_frequency')
})

# Mean buyer age per article over the full history.
# cust_meta['age'] stores missing ages as -1; those rows must not enter the
# average, otherwise every article's mean age is pulled towards -1. Customers
# with an unknown age are therefore left out of the lookup, so the left merge
# gives them NaN, which groupby().mean() skips.
known_age = cust_meta.loc[cust_meta['age'] >= 0, ['customer_id','age']]
tx_age = tx_full[['customer_id','article_id']].merge(known_age, on='customer_id', how='left')

# Articles bought only by customers of unknown age keep a row with a NaN mean.
art_age_agg = tx_age.groupby('article_id')['age'].mean().reset_index(name='article_mean_age')

del tx_age, known_age
gc.collect()

# 2 min 39 sec

20

In [7]:
# Cell 7 – join every customer-, article- and pair-level table onto the candidates
# Tables are merged in a fixed order so the resulting column order is stable.
# Each right-hand table is expected to hold one row per join key;
# validate='many_to_one' makes pandas raise MergeError instead of silently
# multiplying candidate rows if that expectation is ever broken.
features = candidates
for right, keys in (
    (cust_agg, 'customer_id'),
    (cust_1w, 'customer_id'),
    (cust_4w, 'customer_id'),
    (art_agg, 'article_id'),
    (last_purchase, ['customer_id','article_id']),
    (cust_meta, 'customer_id'),
    (art_meta, 'article_id'),
    (cust_last_purchase, 'customer_id'),
    (art_price_agg, 'article_id'),
    (cust_price_agg, 'customer_id'),
    (art_age_agg, 'article_id'),
):
    features = features.merge(right, on=keys, how='left', validate='many_to_one')
# Drop the loop variables so they do not keep the last table alive.
del right, keys

# Flags: does the candidate article share its index group / product code with
# the article on the customer's most recent transaction row?
if 'last_bought_article_id' in features.columns:
    art_lookup = art_meta.set_index('article_id')
    last_bought = features['last_bought_article_id']

    # Comparisons against a missing value are False, so customers without any
    # history (or articles without metadata) get 0.
    features['index_group_match'] = (
        features['index_group_no'] == last_bought.map(art_lookup['index_group_no'])
    ).astype('int8')
    features['product_code_match'] = (
        features['product_code'] == last_bought.map(art_lookup['product_code'])
    ).astype('int8')

    features = features.drop(columns=['last_bought_article_id'])
    del art_lookup, last_bought
else:
    # Same int8 dtype as the branch above.
    features['index_group_match'] = np.int8(0)
    features['product_code_match'] = np.int8(0)

# Cleanup large objects
del cust_agg, cust_1w, cust_4w, art_agg, last_purchase
del cust_meta, art_meta, cust_last_purchase, art_price_agg, cust_price_agg, art_age_agg
gc.collect()

# 9 min 53 sec

0

In [None]:
# Sanity check: the merges must neither add nor lose rows or customers.
print(f"candidates – rows: {len(candidates)}")
print(f"candidates – unique customers: {candidates['customer_id'].nunique()}")

print(f"features – rows: {len(features)}")
print(f"features – unique customers: {features['customer_id'].nunique()}")

candidates – rows: 258924439
candidates – unique customers: 1371980
features – rows: 258924439
features – unique customers: 1371980


In [8]:
# Cell 8 – recency and encoding
# The candidate frame is no longer needed once `features` exists.
del candidates
gc.collect()

# Days between cut_ts and the last time this customer bought this article
# (NaN when the pair has no purchase in the history).
features['days_since_last_purchase'] = (
    (cut_ts - features['last_purchase_date']).dt.days.astype('float32')
)

# Days between cut_ts and the customer's most recent purchase of any article.
features['customer_days_since_last_purchase'] = (
    (cut_ts - features['customer_last_purchase_date']).dt.days.astype('float32')
)

# Distance between the article's mean price and the customer's mean paid price.
features['price_sensitivity'] = (features['article_mean_price'] - features['customer_mean_price']).abs()

# Distance between the customer's age and the article's mean buyer age.
# Negative ages are the "unknown" sentinel (-1), not real ages; they are masked
# so the distance is missing for those customers instead of being measured
# from -1.
known_age = features['age'].where(features['age'] >= 0)
features['age_sensitivity'] = (known_age - features['article_mean_age']).abs()
del known_age

# The raw dates are only needed to derive the recency columns above.
features = features.drop(columns=['last_purchase_date','customer_last_purchase_date'])

# Numeric feature columns whose missing values are replaced by 0 and which are
# stored as float32.
# NOTE(review): 0 is also a legitimate value for the two recency columns (a
# purchase dated exactly cut_ts), so "never purchased" and "purchased on
# cut_ts" become indistinguishable after this step — confirm this is intended.
to_fill = [
    'customer_total_purchases','customer_unique_articles',
    'article_total_purchases','article_unique_customers',
    'cust_purchases_1w','cust_purchases_4w',
    'days_since_last_purchase','customer_days_since_last_purchase',
    'article_mean_price','customer_mean_price','price_sensitivity',
    'article_mean_age','age_sensitivity'
]
# Processed one column per iteration, so each step touches a single column.
for col in to_fill:
    features[col] = features[col].fillna(0).astype('float32')

# Replace categorical columns by their integer category codes
# (missing values are encoded as -1 by pandas).
for col, code_dtype in (
    ('club_member_status', 'int16'),
    ('fashion_news_frequency', 'int16'),
    ('postal_code', 'int32'),
):
    if col in features.columns:
        features[col] = features[col].astype('category').cat.codes.astype(code_dtype)

# window_type is swapped for an integer window_type_code column.
if 'window_type' in features.columns:
    features['window_type_code'] = (
        features.pop('window_type').astype('category').cat.codes.astype('int16')
    )

gc.collect()


# ~ 40gb of ram usage, 18s

36

In [9]:
# Cell 9 – record the column layout of the feature table next to the outputs
import json

# Key columns first, then every other column in its current order.
# Columns added to `features` after this cell runs are not part of the list.
feature_cols_order = [c for c in features.columns if c not in ['customer_id','article_id']]
meta_payload = {
    "feature_cols": ['customer_id','article_id'] + feature_cols_order,
    # Timestamp.utcnow() is deprecated since pandas 2.2; now("UTC") returns the
    # same timezone-aware UTC timestamp.
    "timestamp": pd.Timestamp.now("UTC").isoformat(),
}

with open("../data/outputs/feature_cols.json","w") as f:
    json.dump(meta_payload, f)

In [10]:
# Cell 10 – tag every row with the target week and write the feature table
out_path = '../data/outputs/features_week=' + WEEK_TAG + '.parquet'

# The week tag is stored as an int32 YYYYMMDD value.
features['week_end'] = np.int32(int(WEEK_TAG))

features.to_parquet(out_path, index=False)
print(f"Saved: {out_path}")

# Release the feature table now that it is on disk.
del features
gc.collect()

# ~ 31gb of ram usage, 3 min 1s

Saved: ../data/outputs/features_week=20200819.parquet


0