In [1]:
# Cell 1 – imports
import gc
import numpy as np
import pandas as pd
import json
import os

import xgboost as xgb

In [2]:
# Cell 2 – load prepared ranking data (same structure as LGBM)
META_PATH = '../data/outputs/dataset_meta.json'
TRAIN_RANK_PATH = '../data/outputs/train_rank.parquet'
VALID_RANK_PATH = '../data/outputs/valid_rank.parquet'
GROUP_TRAIN_PATH = '../data/outputs/groups_train.npy'
GROUP_VALID_PATH = '../data/outputs/groups_valid.npy'

with open(META_PATH) as f:
    meta = json.load(f)

# Use intersected feature list written by prepare_model_dataset
feature_cols = meta['model_features']

train_df = pd.read_parquet(TRAIN_RANK_PATH)
valid_df = pd.read_parquet(VALID_RANK_PATH)

# Safety: enforce intersection (avoids KeyError if meta stale)
available = set(train_df.columns)
feature_cols = [c for c in feature_cols if c in available]

train_group = np.load(GROUP_TRAIN_PATH)
valid_group = np.load(GROUP_VALID_PATH)

print("Train rows:", len(train_df), "Valid rows:", len(valid_df))
print("Features used:", len(feature_cols))
print("Groups train:", len(train_group), "valid:", len(valid_group))

# Downcast to float32 where appropriate for XGBoost
for c in feature_cols:
    if pd.api.types.is_float_dtype(train_df[c]):
        train_df[c] = train_df[c].astype('float32')
        valid_df[c] = valid_df[c].astype('float32')

train_df['label'] = train_df['label'].astype('float32')
valid_df['label'] = valid_df['label'].astype('float32')

gc.collect()

# ~ 20gb of ram usage, 17s

Train rows: 160523254 Valid rows: 160523254
Features used: 26
Groups train: 1371980 valid: 1371980


22

In [3]:
# Cell 3 – user activity stats (same as other rankers)
tmp = pd.concat(
    [
        train_df[['customer_id', 'cust_purchases_4w', 'customer_total_purchases']],
        valid_df[['customer_id', 'cust_purchases_4w', 'customer_total_purchases']],
    ],
    ignore_index=True,
).drop_duplicates('customer_id')

print("Users with any history:", (tmp['customer_total_purchases'] > 0).sum())
print(
    "Users with 0 history but some recent:",
    ((tmp['customer_total_purchases'] == 0) & (tmp['cust_purchases_4w'] > 0)).sum(),
)
print(
    "Users with history but no recent activity:",
    ((tmp['customer_total_purchases'] > 0) & (tmp['cust_purchases_4w'] == 0)).sum(),
)
del tmp
gc.collect()

# ~ 20gb of ram usage, 5s

Users with any history: 1356709
Users with 0 history but some recent: 0
Users with history but no recent activity: 1121432


0

In [None]:
# Cell 4 – train XGBoost ranker (rank:ndcg)
# Prepare DMatrix objects with group arrays
dtrain = xgb.DMatrix(
    data=train_df[feature_cols],
    label=train_df['label'],
)
dvalid = xgb.DMatrix(
    data=valid_df[feature_cols],
    label=valid_df['label'],
)

# XGBoost expects group sizes, not group ids
dtrain.set_group(train_group)
dvalid.set_group(valid_group)

# Basic ranking hyperparameters – comparable to LGBM / CatBoost
params = {
    'objective': 'rank:ndcg',
    'eval_metric': 'ndcg@12',
    'eta': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'tree_method': 'hist',   # change to 'gpu_hist' if GPU available
    'random_state': 42,
}

num_boost_round = 20
early_stopping_rounds = 5

evals = [(dtrain, 'train'), (dvalid, 'valid')]
evals_result = {}

bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_boost_round,
    evals=evals,
    early_stopping_rounds=early_stopping_rounds,
    evals_result=evals_result,
    verbose_eval=1,
)

print("Best iteration:", bst.best_iteration)
print("Best valid ndcg@12:", evals_result['valid']['ndcg@12'][bst.best_iteration])

del dtrain, dvalid, train_df, valid_df, train_group, valid_group
gc.collect()

# ~ 20gb of ram usage, 17s

In [6]:
# Cell 5 – scoring on full feature set (same as LGBM flow)
FEATURE_PATH = '../data/outputs/features.parquet'

# Load only necessary columns: features + ids
data = pd.read_parquet(FEATURE_PATH, columns=feature_cols + ['customer_id', 'article_id'])

# Ensure float32 for XGBoost
for c in feature_cols:
    if pd.api.types.is_float_dtype(data[c]):
        data[c] = data[c].astype('float32')

BATCH_SIZE = 5_000_000
n_rows = len(data)
scores = np.empty(n_rows, dtype=np.float32)

for start in range(0, n_rows, BATCH_SIZE):
    end = min(start + BATCH_SIZE, n_rows)
    X_batch = data.iloc[start:end][feature_cols]  # keep as DataFrame with column names
    dmatrix_batch = xgb.DMatrix(X_batch)
    scores[start:end] = bst.predict(
        dmatrix_batch,
        iteration_range=(0, bst.best_iteration + 1),
    ).astype(np.float32)
    del X_batch, dmatrix_batch
    gc.collect()

all_data_rank_xgb = data[['customer_id', 'article_id']].copy()
all_data_rank_xgb['score'] = scores

del scores, data
gc.collect()

print("all_data_rank_xgb rows:", len(all_data_rank_xgb))
print("all_data_rank_xgb customers:", all_data_rank_xgb['customer_id'].nunique())

all_data_rank_xgb rows: 160523254
all_data_rank_xgb customers: 1371980


In [7]:
# Cell 6 – build predictions per integer customer_id (top-12)
all_data_rank_xgb['article_id_str'] = '0' + all_data_rank_xgb['article_id'].astype(str)

BATCH_CUSTOMERS = 500_000
unique_cust = all_data_rank_xgb['customer_id'].unique()
pred_parts = []

for start in range(0, len(unique_cust), BATCH_CUSTOMERS):
    end = min(start + BATCH_CUSTOMERS, len(unique_cust))
    cust_chunk = unique_cust[start:end]
    chunk = all_data_rank_xgb[all_data_rank_xgb['customer_id'].isin(cust_chunk)].copy()
    chunk = chunk.sort_values(['customer_id', 'score'], ascending=[True, False])
    chunk = chunk.groupby('customer_id', group_keys=False).head(12)
    part = (
        chunk.groupby('customer_id')['article_id_str']
             .apply(lambda x: ' '.join(x))
             .reset_index()
             .rename(columns={'customer_id': 'customer_id_int',
                              'article_id_str': 'prediction'})
    )
    pred_parts.append(part)
    del chunk, part
    gc.collect()

pred_df_rank_xgb_int = pd.concat(pred_parts, ignore_index=True)
del pred_parts
gc.collect()

print("pred_df_rank_xgb_int rows:", len(pred_df_rank_xgb_int))
print("pred_df_rank_xgb_int customers:", pred_df_rank_xgb_int['customer_id_int'].nunique())

pred_df_rank_xgb_int['n_items'] = pred_df_rank_xgb_int['prediction'].str.split().str.len()
print("Mean items per predicted customer:", pred_df_rank_xgb_int['n_items'].mean())
print("Min items per predicted customer:", pred_df_rank_xgb_int['n_items'].min())
print("Max items per predicted customer:", pred_df_rank_xgb_int['n_items'].max())
print("Total customer–item pairs:", pred_df_rank_xgb_int['n_items'].sum())

del all_data_rank_xgb
gc.collect()

pred_df_rank_xgb_int rows: 1371980
pred_df_rank_xgb_int customers: 1371980
Mean items per predicted customer: 12.0
Min items per predicted customer: 12
Max items per predicted customer: 12
Total customer–item pairs: 16463760


15

In [8]:
# Cell 7 – attach hex ids and merge into sample_submission
sub = pd.read_csv(
    '../data/input_data/sample_submission.csv',
    usecols=['customer_id'],
    dtype={'customer_id': 'string'}
)
sub['customer_id_int'] = sub['customer_id'].str[-16:].apply(
    lambda h: np.int64(np.uint64(int(h, 16)))
)

print("sample_submission customers:", sub['customer_id_int'].nunique())
print("pred_df_rank_xgb_int customers:", pred_df_rank_xgb_int['customer_id_int'].nunique())

sub = sub.merge(
    pred_df_rank_xgb_int,
    how='left',
    on='customer_id_int'
)

print("rows in sub after merge:", len(sub))
print("rows with model preds (raw):", sub['prediction'].notna().sum())
print("rows needing fallback (raw):", sub['prediction'].isna().sum())

gp = pd.read_json('../data/outputs/general_pred_str.json', typ='series')
general_pred_str = gp['general_pred_str']

sub['prediction'] = sub['prediction'].fillna(general_pred_str)
fallback_items = general_pred_str.split()

def pad_to_12(pred):
    items = pred.split()
    if len(items) >= 12:
        return ' '.join(items[:12])
    seen = set(items)
    for art in fallback_items:
        if art not in seen:
            items.append(art)
            seen.add(art)
        if len(items) == 12:
            break
    return ' '.join(items)

sub['prediction'] = sub['prediction'].apply(pad_to_12)

sub[['customer_id', 'prediction']].to_csv(
    '../data/submission/xgboost_ranker_submission.csv',
    index=False
)

sub.head()

sample_submission customers: 1371980
pred_df_rank_xgb_int customers: 1371980
rows in sub after merge: 1371980
rows with model preds (raw): 1371980
rows needing fallback (raw): 0


Unnamed: 0,customer_id,customer_id_int,prediction,n_items
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,6883939031699146327,0568601043 0568601006 0751471043 0915529003 07...,12
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,-7200416642310594310,0909370001 0866731001 0918292001 0827968001 07...,12
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,-6846340800584936,0794321007 0915526001 0916468003 0852643003 09...,12
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,-94071612138601410,0751471043 0915529003 0863595006 0762846006 09...,12
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,-283965518499174310,0896152002 0730683050 0818320001 0791587015 09...,12


In [9]:
# Cell 8 – feature importance (XGBoost gain)
importances = bst.get_score(importance_type='gain')  # dict: feature index -> importance

# Map back to feature names (XGBoost uses f0, f1, ...)
feat_name_map = {f"f{i}": name for i, name in enumerate(feature_cols)}
items = []
for k, v in importances.items():
    name = feat_name_map.get(k, k)
    items.append((name, v))

for name, imp in sorted(items, key=lambda x: -x[1]):
    print(f"{name:25s} {imp:.4f}")

gc.collect()

window_type_code          862.9497
value                     581.2436
cust_purchases_1w         180.4240
days_since_last_purchase  117.9940
customer_days_since_last_purchase 83.1339
article_total_purchases   70.8977
product_group_name        62.7714
garment_group_no          55.7787
index_code                51.6154
article_mean_price        49.7249
section_no                48.4736
customer_mean_price       41.9713
graphical_appearance_no   37.9613
product_type_no           30.8679
article_unique_customers  29.5927
customer_unique_articles  26.7493
age                       23.1414
cust_purchases_4w         22.5028
customer_total_purchases  22.1936
perceived_colour_master_id 21.6410
colour_group_code         20.8729
fashion_news_frequency    18.1958
postal_code               11.1414
perceived_colour_value_id 4.9041


32