In [1]:
# Cell 1 – config & imports
import gc, json, numpy as np, pandas as pd, os, hashlib

# Input/output locations, relative to the notebook's working directory.
FEATURE_PATH = "../data/outputs/features.parquet"  # engineered candidate features (read in cell 2)
TX_PATH = "../data/input_data/transactions_train.csv"  # raw transactions used to build labels
FEATURE_COLS_JSON = "../data/outputs/feature_cols.json"  # optional file holding a "feature_cols" list

# Purchases within this many days before the latest transaction date are the positives.
LABEL_LOOKBACK_DAYS = 7
# Upper bound on the number of negative training rows kept after downsampling.
NEG_SAMPLE_CLF = 1_000_000
# Seed for the negative sampling (passed as random_state) and for NumPy's global RNG.
RANDOM_SEED = 42
# Only written to dataset_meta.json as "rank_at" in this notebook; presumably the
# cutoff k used by a downstream ranking evaluation -- TODO confirm.
RANK_TOP_K_EVAL = 12

# Seed NumPy's legacy global RNG for any code that draws from it implicitly.
np.random.seed(RANDOM_SEED)

def hex16_to_int(s):
    """Map a hexadecimal id string to a signed 64-bit integer.

    Only the last 16 characters of ``s`` are used. They are parsed as a
    base-16 number, held as an unsigned 64-bit value and then reinterpreted
    as ``int64``, so values with the top bit set come out negative.
    """
    tail = s[-16:]
    unsigned = np.uint64(int(tail, 16))
    # Scalar cast uint64 -> int64 wraps around instead of raising.
    return unsigned.astype(np.int64)

# Deterministic split function (modulus)
def is_valid_customer(cid: int) -> bool:
    """Return True when ``cid`` falls into the validation fold.

    Membership is decided by arithmetic on the id alone (ids divisible by 5),
    so the assignment is the same on every run.
    """
    remainder = cid % 5
    return remainder == 0

# Marker output so it is visible in the notebook that the config cell has run.
print("Config loaded.")

Config loaded.


In [2]:
# Cell 2 – load base features (already engineered) – memory safe
# Load feature column order first (it already includes customer_id, article_id)
# Resolve the ordered list of feature columns: use the JSON file when it
# exists, otherwise derive the list from the parquet file itself.
if not os.path.exists(FEATURE_COLS_JSON):
    # Fallback: the whole parquet file is read once just to learn its column
    # names; the frame is released again straight away.
    all_features = pd.read_parquet(FEATURE_PATH)
    id_cols = ['customer_id', 'article_id']
    other_cols = [name for name in all_features.columns if name not in id_cols]
    feature_cols = id_cols + other_cols
    del all_features
    gc.collect()
else:
    with open(FEATURE_COLS_JSON) as cols_file:
        cols_payload = json.load(cols_file)
    feature_cols = cols_payload["feature_cols"]

# Restrict the load to the columns used for modeling, so the parquet read
# below requests only those columns instead of the whole file.
# Identifier columns: the dtype casts, the label merge and the customer split
# further down all reference these by name.
core_cols = ['customer_id','article_id']
# Predictor columns to keep (adjust as needed). A name listed here is only
# loaded if it also appears in feature_cols.
predictor_subset = [
    'value','window_type_code',
    'customer_total_purchases','customer_unique_articles',
    'article_total_purchases','article_unique_customers',
    'cust_purchases_1w','cust_purchases_4w',
    'days_since_last_purchase','customer_days_since_last_purchase',
    'age','club_member_status','fashion_news_frequency','postal_code',
    'product_type_no','product_group_name','index_code','section_no',
    'graphical_appearance_no','colour_group_code','perceived_colour_value_id',
    'perceived_colour_master_id','index_group_no','garment_group_no',
    'article_mean_price','customer_mean_price', 
    'price_sensitivity', 'index_group_match',
    'age_sensitivity', 'article_mean_age','department_no', 'product_code_match'
]


# Column selection: keep the order given by feature_cols, restricted to the ids
# and the chosen predictors. A set gives constant-time membership tests instead
# of rebuilding and scanning a concatenated list for every column.
wanted_cols = set(core_cols) | set(predictor_subset)
needed_cols = [c for c in feature_cols if c in wanted_cols]

# The id columns are required below (casts, merge, split). If feature_cols does
# not list them, request them explicitly rather than failing later on a KeyError.
needed_cols = [c for c in core_cols if c not in needed_cols] + needed_cols

# Report predictors that were requested but are absent from feature_cols;
# otherwise they would be dropped without any trace.
missing_predictors = [c for c in predictor_subset if c not in feature_cols]
if missing_predictors:
    print("WARNING: predictors not found in feature_cols:", missing_predictors)

# Read only required columns
features = pd.read_parquet(FEATURE_PATH, columns=needed_cols)

print("Features shape:", features.shape)
print("Loaded columns:", len(needed_cols))

# Fix the id dtypes so they match the label frame built from the transactions
# (int64 customer ids, int32 article ids) and the merge keys line up.
features['customer_id'] = features['customer_id'].astype('int64')
features['article_id']  = features['article_id'].astype('int32')

print("Number of feature_cols (including ids):", len(feature_cols))

# ~ 30gb of ram usage, 11s

Features shape: (262416062, 34)
Loaded columns: 34
Number of feature_cols (including ids): 35


In [3]:
# Cell 3 – build labels (last LABEL_LOOKBACK_DAYS) – optimized
# Only the label window is needed from the transaction log, so rows are
# filtered by date first and the customer-id conversion (a per-row Python call)
# runs on that slice instead of on every transaction.
tx = pd.read_csv(
    TX_PATH,
    usecols=['t_dat','customer_id','article_id'],
    dtype={'t_dat':'string','customer_id':'string','article_id':'int32'},
    parse_dates=['t_dat']  # parse directly
)

# Label window is (cut_ts, last_ts]: the LABEL_LOOKBACK_DAYS days ending on the
# most recent transaction date.
last_ts = tx['t_dat'].max()
cut_ts  = last_ts - pd.Timedelta(days=LABEL_LOOKBACK_DAYS)

# last_ts is the column maximum, so the lower bound alone selects the window.
in_window = tx['t_dat'] > cut_ts
label_tx = tx.loc[in_window, ['customer_id','article_id']]
del tx, in_window
gc.collect()

# Convert the hex customer ids to int64 (hex16_to_int itself keeps only the
# last 16 characters), then collapse repeated purchases of the same article by
# the same customer into one positive pair.
label_tx = label_tx.assign(
    customer_id=label_tx['customer_id'].apply(hex16_to_int).astype('int64')
).drop_duplicates()

labels = label_tx.assign(label=1)
del label_tx
gc.collect()
print("Positive label rows:", len(labels))

# ~ 24gb of ram usage, 3 min 47s

Positive label rows: 213728


In [4]:
# Cell 4 – merge labels onto the candidate features and measure label coverage
# A duplicated label key would multiply candidate rows in the left join below,
# so check the (small) label frame up front.
assert not labels.duplicated(['customer_id','article_id']).any(), \
    "labels must contain one row per (customer_id, article_id)"

# Left join keeps every candidate row; candidates without a label are negatives.
data = features.merge(labels, on=['customer_id','article_id'], how='left')
data['label'] = data['label'].fillna(0).astype('int8')

# Coverage/recall: share of positive (customer, article) pairs that appear among
# the candidates. Distinct pairs are counted rather than summing the label
# column, so a pair present in several candidate rows is counted once and the
# ratio cannot exceed 1.
total_pos_labels = int(len(labels))
pos_covered = int(
    len(data.loc[data['label'] == 1, ['customer_id','article_id']].drop_duplicates())
)
recall = (pos_covered / total_pos_labels) if total_pos_labels else 0.0

# The inputs are no longer needed once merged; release them before the split.
del features, labels
gc.collect()
print(
    "Merged dataset rows:", len(data),
    "Pos covered:", pos_covered,
    "Total pos labels:", total_pos_labels,
    f"Recall: {recall:.4f}"
)

# ~ 50gb of ram usage, 2 min 46s

Merged dataset rows: 262416062 Pos covered: 47462 Total pos labels: 213728 Recall: 0.2221


In [None]:
# Cell 5 – deterministic split (vectorized, no apply)
cust_ids = data['customer_id'].unique()
print("Unique customers:", len(cust_ids))

# Reuse the split rule from the config cell so it is defined in one place. The
# helper only does arithmetic on its argument, so passing the whole column
# evaluates it for every row at once and yields a boolean mask.
valid_mask = is_valid_customer(data['customer_id'])
train_mask = ~valid_mask

# Boolean-mask selection; nothing below mutates these frames in place.
train_full = data.loc[train_mask]
valid_full = data.loc[valid_mask]

# Every row must land on exactly one side of the split.
assert len(train_full) + len(valid_full) == len(data)

print("Train rows (ranking):", len(train_full), "Valid rows (ranking):", len(valid_full))

# ~ 31gb of ram usage, 21s

Unique customers: 1371980
Train rows (ranking): 210021247 Valid rows (ranking): 52394815


In [None]:
# Cell 6 – ranking groups with NEGATIVE DOWNSAMPLING (Winning Solution Approach)

# Builds the ranking train/valid files: downsample train negatives, sort both
# frames by customer, derive per-customer group sizes, and write everything out.
print(f"Train rows before downsampling: {len(train_full)}")

# 1. Split Positive and Negative
# Both selections are materialised as separate frames.
pos_train = train_full[train_full['label'] == 1]
neg_train = train_full[train_full['label'] == 0]

# 2. Downsample Negatives
# The winning solution suggests ~1M negatives. NEG_SAMPLE_CLF is set to 1M in config.
# All positives are kept; negatives are capped at NEG_SAMPLE_CLF rows.
# NOTE(review): the sample is drawn across all negative rows at once, not per
# customer, so a customer may end up with only positives or only negatives in
# the training data -- confirm this is intended for a ranking objective.
if len(neg_train) > NEG_SAMPLE_CLF:
    print(f"Downsampling negatives from {len(neg_train)} to {NEG_SAMPLE_CLF}...")
    neg_train = neg_train.sample(n=NEG_SAMPLE_CLF, random_state=RANDOM_SEED)

# 3. Recombine
# train_full is rebound here to the downsampled frame with a fresh 0..n-1 index.
train_full = pd.concat([pos_train, neg_train], ignore_index=True)
print(f"Train rows after downsampling: {len(train_full)}")

# 4. Sort by customer_id (REQUIRED for LightGBM Ranker groups)
# LightGBM requires all rows for a specific query (customer) to be contiguous.
# The validation frame is only sorted; it is not downsampled.
train_full = train_full.sort_values('customer_id')
valid_full = valid_full.sort_values('customer_id')

# 5. Calculate Group Sizes
# One count per customer, ordered by ascending customer_id, which is the same
# order in which the customers appear in the frames sorted above.
train_group_sizes = train_full['customer_id'].value_counts(sort=False).sort_index().to_numpy()
valid_group_sizes = valid_full['customer_id'].value_counts(sort=False).sort_index().to_numpy()

# 6. Save
# Group arrays and frames are written separately; a consumer has to load the
# matching pair for the group sizes to line up with the rows.
np.save("../data/outputs/groups_train.npy", train_group_sizes)
np.save("../data/outputs/groups_valid.npy", valid_group_sizes)

train_full.to_parquet("../data/outputs/train_rank.parquet", index=False)
valid_full.to_parquet("../data/outputs/valid_rank.parquet", index=False)
print("Saved ranking data + group arrays.")

# Clean up temp vars
# As the last expression of the cell, gc.collect() also shows its return value
# as the cell output.
del pos_train, neg_train
gc.collect()

# ~ 45gb of ram usage, 6 min 12s

Train rows before downsampling: 210021247
Downsampling negatives from 209983277 to 1000000...
Train rows after downsampling: 1037970
Saved ranking data + group arrays.


0

In [None]:
# Cell 7 – write dataset metadata (label window, sampling config, columns, row counts)
# Predictor columns present in the merged frame, kept in feature_cols order.
non_predictor_cols = ['customer_id', 'article_id', 'label']
model_features = [
    col for col in feature_cols
    if col in data.columns and col not in non_predictor_cols
]

with open("../data/outputs/dataset_meta.json", "w") as meta_file:
    dataset_meta = {
        "last_ts": str(last_ts),
        "cut_ts": str(cut_ts),
        "label_lookback_days": LABEL_LOOKBACK_DAYS,
        "neg_sample_clf": NEG_SAMPLE_CLF,
        "feature_cols_full": feature_cols,
        "model_features": model_features,
        "rank_train_rows": len(train_full),
        "rank_valid_rows": len(valid_full),
        "rank_at": RANK_TOP_K_EVAL,
    }
    json.dump(dataset_meta, meta_file)

print("Meta saved.")

Meta saved.


In [None]:
# Cell 8 – cleanup
# Drop the references to the three large frames, then let the collector run.
del data
del train_full
del valid_full
gc.collect()
print("Cleanup done.")

Cleanup done.
