In [1]:
# Cell 1 – imports
import gc
import json
import numpy as np
import pandas as pd
from catboost import CatBoostRanker, Pool

In [2]:
# Cell 2 – load prepared ranking data
META_PATH = '../data/outputs/dataset_meta.json'
TRAIN_RANK_PATH = '../data/outputs/train_rank.parquet'
VALID_RANK_PATH = '../data/outputs/valid_rank.parquet'

with open(META_PATH) as f:
    meta = json.load(f)

feature_cols = meta['model_features']  # same intersected list

train_df = pd.read_parquet(TRAIN_RANK_PATH)
valid_df = pd.read_parquet(VALID_RANK_PATH)

# Enforce intersection (safety) and drop non-feature cols.
# The check must cover BOTH frames: valid_df is sliced with the same column
# list below, so a feature present only in train would raise a KeyError there.
non_feature_cols = {'customer_id', 'label', 'group_idx'}
available = set(train_df.columns) & set(valid_df.columns)
feature_cols = [c for c in feature_cols
                if c in available and c not in non_feature_cols]

# OPTIONAL but recommended: drop unused columns early to save RAM
keep_cols = feature_cols + ['customer_id', 'label']
train_df = train_df[keep_cols]
valid_df = valid_df[keep_cols]

print("Train rows:", len(train_df), "Valid rows:", len(valid_df))
print("Features used:", len(feature_cols))

# Identify categorical features (keep similar to LGBM coded columns).
# cat_features holds positional indices into feature_cols.
potential_cat = {
    'window_type_code','club_member_status','fashion_news_frequency','postal_code',
    'product_type_no','product_group_name','index_code','section_no',
    'graphical_appearance_no','colour_group_code','perceived_colour_value_id',
    'perceived_colour_master_id','index_group_no','garment_group_no'
}
cat_features = [i for i, c in enumerate(feature_cols) if c in potential_cat]

print("Categorical feature count:", len(cat_features))

# ~ 24gb of ram usage, 21s

Train rows: 128475160 Valid rows: 32048094
Features used: 26
Categorical feature count: 14


In [3]:
# Cell 3 – user activity stats (train + valid union)
# One row per customer (first occurrence wins), taken from train then valid.
_activity_cols = ['customer_id', 'cust_purchases_4w', 'customer_total_purchases']
_per_customer = pd.concat(
    [train_df[_activity_cols], valid_df[_activity_cols]],
    ignore_index=True,
).drop_duplicates('customer_id')

_has_history = _per_customer['customer_total_purchases'] > 0
_no_history = _per_customer['customer_total_purchases'] == 0
_has_recent = _per_customer['cust_purchases_4w'] > 0
_no_recent = _per_customer['cust_purchases_4w'] == 0

print("Users with any history:", _has_history.sum())
print("Users with 0 history but some recent:",
      (_no_history & _has_recent).sum())
print("Users with history but no recent activity:",
      (_has_history & _no_recent).sum())

# Drop every temporary so the union frame does not linger in kernel memory
del _activity_cols, _per_customer, _has_history, _no_history, _has_recent, _no_recent
gc.collect()

# ~ 16gb of ram usage, 4s

Users with any history: 1356709
Users with 0 history but some recent: 0
Users with history but no recent activity: 1121432


0

In [None]:
# Cell 4 – memory-lean capped ranking frame for GPU (≤1023 per group), vectorized
MAX_GROUP = 1023  # GPU hard limit
rng = np.random.default_rng(42)

def build_keep_mask(df, max_group):
    """Return a boolean row mask that caps every customer group at ``max_group`` rows.

    Rows with ``label == 1`` (positives) are preferred: all of them are kept
    whenever they fit, and the remaining budget of each group is filled with a
    random sample of the other rows (negatives). If a group has more positives
    than ``max_group``, a random ``max_group`` of them are kept and none of its
    negatives, so the cap is always honoured.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``customer_id`` (group key) and ``label``
        (castable to int8). Only these two columns are copied.
    max_group : int
        Maximum number of rows to keep per ``customer_id``.

    Returns
    -------
    numpy.ndarray
        Boolean array of length ``len(df)``; ``True`` marks a row to keep,
        addressed by position in ``df``.

    Notes
    -----
    Randomness is drawn from the module-level ``rng``, so consecutive calls
    consume the same generator and produce different draws.
    """
    # Work on a tiny meta frame only
    meta = df[['customer_id', 'label']].copy()
    meta['_row'] = np.arange(len(meta), dtype=np.int64)
    meta['label'] = meta['label'].astype('int8')

    # Per-group negative budget = whatever the positives leave free
    pos_mask = meta['label'] == 1
    pos_cnt = meta.loc[pos_mask].groupby('customer_id')['label'].size()
    neg_budget_per_group = (max_group - pos_cnt).clip(lower=0)
    neg_budget_per_group = neg_budget_per_group.astype('int32')

    # For negatives, assign random order within each group and take up to budget
    neg_meta = meta.loc[~pos_mask, ['customer_id', '_row']].copy()
    neg_meta['_r'] = rng.integers(0, np.iinfo(np.uint32).max, size=len(neg_meta), dtype=np.uint32)
    neg_meta = neg_meta.sort_values(['customer_id', '_r'], kind='mergesort')
    neg_meta['_rk'] = neg_meta.groupby('customer_id').cumcount().astype('int32')
    # Groups absent from the budget table have no positives -> full budget
    neg_meta['_budget'] = neg_meta['customer_id'].map(neg_budget_per_group).fillna(max_group).astype('int32')
    keep_neg_rows = neg_meta.loc[neg_meta['_rk'] < neg_meta['_budget'], '_row'].to_numpy()

    # Positives: keep all of them unless some group alone exceeds the cap.
    # The extra random draw happens only in that overflow case (and after the
    # negative draw), so results are unchanged when every group already fits.
    pos_meta = meta.loc[pos_mask, ['customer_id', '_row']]
    if len(pos_cnt) > 0 and pos_cnt.max() > max_group:
        pos_meta = pos_meta.copy()
        pos_meta['_r'] = rng.integers(0, np.iinfo(np.uint32).max, size=len(pos_meta), dtype=np.uint32)
        pos_meta = pos_meta.sort_values(['customer_id', '_r'], kind='mergesort')
        pos_rank = pos_meta.groupby('customer_id').cumcount().to_numpy()
        pos_meta = pos_meta[pos_rank < max_group]
    keep_pos_rows = pos_meta['_row'].to_numpy()

    keep_rows = np.concatenate([keep_pos_rows, keep_neg_rows])

    mask = np.zeros(len(df), dtype=bool)
    mask[keep_rows] = True
    return mask

# Build masks without duplicating feature columns
train_keep = build_keep_mask(train_df, MAX_GROUP)
valid_keep = build_keep_mask(valid_df, MAX_GROUP)

# Filter once; then sort so every customer's rows are contiguous
train_df = train_df.loc[train_keep].reset_index(drop=True)
valid_df = valid_df.loc[valid_keep].reset_index(drop=True)

train_df = train_df.sort_values('customer_id', kind='mergesort')
valid_df = valid_df.sort_values('customer_id', kind='mergesort')

# Dense group ids (0..n_customers-1 in order of first appearance), built
# independently for train and valid. pd.factorize yields the same ids as an
# enumerate-over-unique dict lookup, without materialising a Python dict.
train_group_id, train_cust_ids = pd.factorize(train_df['customer_id'])
train_group_id = train_group_id.astype('int32')

valid_group_id, valid_cust_ids = pd.factorize(valid_df['customer_id'])
valid_group_id = valid_group_id.astype('int32')

# Sanity: max group size must respect the same cap used to build the masks
max_train_group = int(np.bincount(train_group_id).max())
max_valid_group = int(np.bincount(valid_group_id).max())
print("Max train group size (by customer_id):", max_train_group)
print("Max valid group size (by customer_id):", max_valid_group)
assert max_train_group <= MAX_GROUP, f"GPU requires ≤{MAX_GROUP} per group"
assert max_valid_group <= MAX_GROUP, f"GPU requires ≤{MAX_GROUP} per group"

# Downcast numerics BEFORE Pool creation to avoid big temporary buffers
for c in feature_cols:
    if pd.api.types.is_float_dtype(train_df[c]):
        train_df[c] = train_df[c].astype('float32')
        valid_df[c] = valid_df[c].astype('float32')

# Keep only genuinely low-card categorical features to avoid heavy CTR tables
low_card_keep = {'window_type_code','club_member_status','fashion_news_frequency'}
cat_features_restricted = [i for i, c in enumerate(feature_cols) if c in low_card_keep]

# === NEW: save pre-built ranking data for reuse ===
# Group-id arrays are in the same (sorted) row order as the parquet frames below.
np.save("../data/outputs/cb_train_group_id.npy", train_group_id)
np.save("../data/outputs/cb_valid_group_id.npy", valid_group_id)

# Save features + labels as parquet (already filtered & downcasted)
train_df.to_parquet("../data/outputs/cb_train_frame.parquet", index=False)
valid_df.to_parquet("../data/outputs/cb_valid_frame.parquet", index=False)

# Save catboost-specific meta (feature_cols and restricted cat features)
cb_meta = {
    "feature_cols": feature_cols,
    "cat_features_restricted": cat_features_restricted,
}
with open("../data/outputs/cb_ranker_meta.json", "w") as f:
    json.dump(cb_meta, f)

print("Saved CatBoost ranker inputs.")

# Free the in-memory frames; everything needed later was written to disk above
del train_df, valid_df, train_keep, valid_keep, train_group_id, valid_group_id
del train_cust_ids, valid_cust_ids
gc.collect()

# ~ 34gb of ram usage, 6 min 6s

Max train group size (by customer_id): 616
Max valid group size (by customer_id): 593
Saved CatBoost ranker inputs.


32

In [None]:
### ONLY CELLS 5 AND 6 NEED TO BE RUN TO TRAIN THE MODEL WHEN THERE ARE NO CHANGES TO CANDIDATES OR FEATURES

# Cell 5
# This cell is meant to work on a fresh kernel, so it imports everything that
# cells 5 and 6 use (including gc and CatBoostRanker).
import gc
import json
import numpy as np
import pandas as pd
from catboost import CatBoostRanker, Pool

with open("../data/outputs/cb_ranker_meta.json") as f:
    cb_meta = json.load(f)

feature_cols = cb_meta["feature_cols"]
cat_features_restricted = cb_meta["cat_features_restricted"]

train_df = pd.read_parquet("../data/outputs/cb_train_frame.parquet")
valid_df = pd.read_parquet("../data/outputs/cb_valid_frame.parquet")

# Group ids were saved in the same row order as the parquet frames
train_group_id = np.load("../data/outputs/cb_train_group_id.npy")
valid_group_id = np.load("../data/outputs/cb_valid_group_id.npy")

pool_train = Pool(
    data=train_df[feature_cols],
    label=train_df["label"].astype("int8").to_numpy(),
    group_id=train_group_id,
    cat_features=cat_features_restricted,
)
pool_valid = Pool(
    data=valid_df[feature_cols],
    label=valid_df["label"].astype("int8").to_numpy(),
    group_id=valid_group_id,
    cat_features=cat_features_restricted,
)

print("Loaded prebuilt pools.")
print("Pool train rows:", pool_train.shape[0])
print("Pool valid rows:", pool_valid.shape[0])

# The frames are no longer referenced by this notebook once the Pools exist
del train_df, valid_df
gc.collect()

# ~ xgb of ram usage, 2 min 15s

Loaded prebuilt pools.
Pool train rows: 128475160
Pool valid rows: 32048094


1062

In [11]:
# Cell 6 – GPU CatBoost ranker

# All hyper-parameters collected in one place, grouped by concern
ranker_params = {
    # objective / evaluation
    'loss_function': 'YetiRank',
    'eval_metric': 'NDCG:top=12',
    # boosting schedule
    'iterations': 500,
    'learning_rate': 0.05,
    'depth': 6,
    'border_count': 64,
    'early_stopping_rounds': 50,
    'random_seed': 42,
    # categorical handling
    'one_hot_max_size': 10,
    'max_ctr_complexity': 1,
    # hardware
    'task_type': 'GPU',
    'devices': '0',
    'gpu_ram_part': 0.7,
    # logging
    'verbose': 10,
    'metric_period': 10,
}

model = CatBoostRanker(**ranker_params)
model.fit(pool_train, eval_set=pool_valid)
# ~ 30gb of ram usage, 5 min 22s

Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=12;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.9861038	best: 0.9861038 (0)	total: 2.1s	remaining: 17m 28s
10:	test: 0.9868903	best: 0.9868903 (10)	total: 15.7s	remaining: 11m 37s
20:	test: 0.9869727	best: 0.9869727 (20)	total: 29.4s	remaining: 11m 11s
30:	test: 0.9870390	best: 0.9870390 (30)	total: 43.1s	remaining: 10m 52s
40:	test: 0.9870637	best: 0.9870649 (35)	total: 56.8s	remaining: 10m 35s
50:	test: 0.9870768	best: 0.9870801 (48)	total: 1m 10s	remaining: 10m 19s
60:	test: 0.9871203	best: 0.9871203 (60)	total: 1m 23s	remaining: 10m 3s
70:	test: 0.9871385	best: 0.9871400 (66)	total: 1m 37s	remaining: 9m 49s
80:	test: 0.9871366	best: 0.9871413 (76)	total: 1m 51s	remaining: 9m 34s
90:	test: 0.9871598	best: 0.9871641 (88)	total: 2m 4s	remaining: 9m 20s
100:	test: 0.9871641	best: 0.9871655 (94)	total: 2m 18s	remaining: 9m 6s
110:	test: 0.9871655	best: 0.9871702 (104)	total: 2m 31s	remaining: 8m 52s
120:	test: 0.9871631	best: 0.9871702 (104)	total: 2m 45s	remaining: 8m 38s
130:	test: 0.9871683	best: 0.9871710 (129)	total: 

<catboost.core.CatBoostRanker at 0x7b7737500910>

In [12]:
# Training is done: drop both Pools, then force a collection pass
del pool_train
del pool_valid
gc.collect()

1154

In [13]:
# Cell 7 – scoring (single Pool, no group_id; avoid batch Pool overhead)
FEATURE_PATH = '../data/outputs/features.parquet'
data_full = pd.read_parquet(
    FEATURE_PATH,
    columns=feature_cols + ['customer_id', 'article_id'],
)

# Downcast scoring frame to float32 (safety): find the float columns first,
# then convert them one at a time
_float_feature_cols = [
    name for name in feature_cols
    if pd.api.types.is_float_dtype(data_full[name])
]
for name in _float_feature_cols:
    data_full[name] = data_full[name].astype('float32')
del _float_feature_cols

infer_pool = Pool(data=data_full[feature_cols], cat_features=cat_features_restricted)

# One prediction per candidate row, stored as float32
scores = model.predict(infer_pool).astype(np.float32)

# Keep only the identifiers plus the score for the ranking step
all_data_rank_cb = data_full.loc[:, ['customer_id', 'article_id']].copy()
all_data_rank_cb['score'] = scores

del infer_pool
del data_full
del scores
gc.collect()

print("CatBoost scored rows:", len(all_data_rank_cb))
print("CatBoost unique customers:", all_data_rank_cb['customer_id'].nunique())

# ~ 39gb of ram usage, 2 min 22s

CatBoost scored rows: 160523254
CatBoost unique customers: 1371980


In [14]:
# Cell 8 – build top-12 predictions per customer
# Article ids are rendered as strings with a leading '0' prepended
all_data_rank_cb['article_id_str'] = '0' + all_data_rank_cb['article_id'].astype(str)

BATCH_CUSTOMERS = 500_000
unique_cust = all_data_rank_cb['customer_id'].unique()
pred_parts = []

# Process customers in batches so the sort/groupby never touches the full frame
for start in range(0, len(unique_cust), BATCH_CUSTOMERS):
    end = min(start + BATCH_CUSTOMERS, len(unique_cust))
    cust_chunk = unique_cust[start:end]

    in_batch = all_data_rank_cb['customer_id'].isin(cust_chunk)
    chunk = (
        all_data_rank_cb[in_batch]
        .sort_values(['customer_id', 'score'], ascending=[True, False])
        .groupby('customer_id', group_keys=False)
        .head(12)
    )

    # Collapse each customer's (already score-ordered) articles into one string
    part = (
        chunk.groupby('customer_id')['article_id_str']
             .agg(' '.join)
             .rename('prediction')
             .rename_axis('customer_id_int')
             .reset_index()
    )
    pred_parts.append(part)

    del in_batch, chunk, part
    gc.collect()

pred_df_rank_cb_int = pd.concat(pred_parts, ignore_index=True)
del pred_parts
gc.collect()

print("pred_df_rank_cb_int rows:", len(pred_df_rank_cb_int))
print("pred_df_rank_cb_int customers:", pred_df_rank_cb_int['customer_id_int'].nunique())

# ~ 26gb of ram usage, 3 min 22s

pred_df_rank_cb_int rows: 1371980
pred_df_rank_cb_int customers: 1371980


In [15]:
# Cell 9
sub = pd.read_csv(
    '../data/input_data/sample_submission.csv',
    dtype={'customer_id': 'string'},
    usecols=['customer_id'],
)

# Integer key: the last 16 hex characters of customer_id, parsed as an
# unsigned 64-bit value and then converted to int64 (so large values show up
# negative). Presumably this mirrors how customer_id was encoded upstream —
# TODO confirm against the feature pipeline.
_hex_tail = sub['customer_id'].str[-16:]
sub['customer_id_int'] = _hex_tail.apply(lambda h: np.int64(np.uint64(int(h, 16))))
del _hex_tail

print("sample_submission customers:", sub['customer_id_int'].nunique())
print("pred_df_rank_cb_int customers:", pred_df_rank_cb_int['customer_id_int'].nunique())

# Left join keeps every submission row, with or without a model prediction
sub = pd.merge(sub, pred_df_rank_cb_int, on='customer_id_int', how='left')
print("rows in sub after merge:", len(sub))
print("rows with model preds (raw):", sub['prediction'].notna().sum())
print("rows needing fallback (raw):", sub['prediction'].isna().sum())

# Customers without a prediction receive the general fallback string; its
# individual articles are also kept as a list for padding short predictions
gp = pd.read_json('../data/outputs/general_pred_str.json', typ='series')
general_pred_str = gp['general_pred_str']
fallback_items = general_pred_str.split()
sub['prediction'] = sub['prediction'].fillna(general_pred_str)

def pad_to_12(pred, fallback=None, k=12):
    """Truncate or pad a space-separated prediction string to ``k`` items.

    Parameters
    ----------
    pred : str
        Space-separated article ids, best first.
    fallback : iterable of str, optional
        Articles used for padding, in priority order. Defaults to the
        module-level ``fallback_items`` list.
    k : int, optional
        Target number of items (default 12).

    Returns
    -------
    str
        The first ``k`` items of ``pred`` if it already has at least ``k``;
        otherwise ``pred`` followed by fallback articles not already present,
        until ``k`` items are reached or the fallback list is exhausted (in
        which case the result is shorter than ``k``).
    """
    if fallback is None:
        fallback = fallback_items
    items = pred.split()
    if len(items) >= k:
        return ' '.join(items[:k])
    seen = set(items)
    for art in fallback:
        if art not in seen:
            items.append(art)
            seen.add(art)
        if len(items) == k:
            break
    return ' '.join(items)

# Normalise every prediction to the required length, then write the two
# submission columns to CSV without the index
sub['prediction'] = sub['prediction'].map(pad_to_12)

sub.to_csv(
    '../data/submission/catboost_ranker_submission.csv',
    columns=['customer_id', 'prediction'],
    index=False,
)
sub.head()

# ~ 20gb of ram usage, 17s

sample_submission customers: 1371980
pred_df_rank_cb_int customers: 1371980
rows in sub after merge: 1371980
rows with model preds (raw): 1371980
rows needing fallback (raw): 0


Unnamed: 0,customer_id,customer_id_int,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,6883939031699146327,0568601043 0568601006 0751471001 0751471043 09...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,-7200416642310594310,0918292001 0866731001 0751471001 0706016001 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,-6846340800584936,0794321007 0916468003 0918292001 0852643003 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,-94071612138601410,0751471001 0751471043 0915529003 0863595006 05...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,-283965518499174310,0730683050 0818320001 0896152002 0927530004 07...


In [16]:
# Cell 10 – feature importance (CatBoost)
from catboost import EFstrType

# Use PredictionValuesChange, which does NOT require passing data
importances = model.get_feature_importance(type=EFstrType.PredictionValuesChange)

# zip() below would silently drop entries on a length mismatch, so check first
assert len(importances) == len(feature_cols), \
    "feature_cols does not match the number of features in the fitted model"

# Pad to the longest feature name (at least 25) so the value column stays aligned
name_width = max(25, max((len(name) for name in feature_cols), default=0))
for name, imp in sorted(zip(feature_cols, importances), key=lambda x: -x[1]):
    print(f"{name:{name_width}s} {imp:.4f}")

# NOTE: all_data_rank_cb, pred_df_rank_cb_int and sub are not deleted by the
# cells shown in this notebook, so they are still alive here; `del` them
# before this collect if their memory needs to be reclaimed.
gc.collect()

value                     21.5910
article_mean_price        15.0095
days_since_last_purchase  10.6474
window_type_code          10.5916
garment_group_no          8.0362
section_no                7.7722
article_total_purchases   7.5244
product_group_name        5.6467
product_type_no           4.0439
cust_purchases_1w         3.2199
colour_group_code         1.8614
article_unique_customers  1.3066
customer_mean_price       0.5308
perceived_colour_master_id 0.4590
graphical_appearance_no   0.3980
perceived_colour_value_id 0.3520
index_code                0.2951
cust_purchases_4w         0.2696
customer_days_since_last_purchase 0.2052
age                       0.1148
customer_unique_articles  0.0979
index_group_no            0.0240
customer_total_purchases  0.0028
club_member_status        0.0000
fashion_news_frequency    0.0000
postal_code               0.0000


23