In [1]:
# Cell 1 – imports & config
import gc
import json
import numpy as np
import pandas as pd
import xgboost as xgb

# Every artefact read by this notebook's first cells sits in one outputs directory.
OUTPUT_DIR = '../data/outputs'

META_PATH = f'{OUTPUT_DIR}/dataset_meta.json'          # JSON holding the 'model_features' list
TRAIN_RANK_PATH = f'{OUTPUT_DIR}/train_rank.parquet'   # training rows
VALID_RANK_PATH = f'{OUTPUT_DIR}/valid_rank.parquet'   # validation rows
GROUP_TRAIN_PATH = f'{OUTPUT_DIR}/groups_train.npy'    # ranking group sizes for the training rows
GROUP_VALID_PATH = f'{OUTPUT_DIR}/groups_valid.npy'    # ranking group sizes for the validation rows


In [None]:
# Cell 2 – load train/valid ranking data
# The metadata JSON lists the model input columns under 'model_features'.
with open(META_PATH) as meta_file:
    meta = json.load(meta_file)

feature_cols = meta['model_features']

train_df = pd.read_parquet(TRAIN_RANK_PATH)
valid_df = pd.read_parquet(VALID_RANK_PATH)

# REMOVE FEATURES
# Uncomment an entry to drop that column from both frames and from feature_cols.
remove_features = [
    # 'value',
    # 'age_bucket',
    # 'customer_total_purchases',
    # 'customer_unique_articles',
    # 'cust_purchases_1w',
    # 'cust_purchases_4w',
    # 'article_total_purchases',
    # 'article_unique_customers',
    #'club_member_status',
    #'fashion_news_frequency',
    #'age',
    #'postal_code',
    #'product_code',
    #'product_type_no',
    #'product_group_name',
    #'graphical_appearance_no',
    #'colour_group_code',
    #'perceived_colour_value_id',
    #'perceived_colour_master_id',
    #'department_no',
    #'index_code',
    #'index_group_no',
    #'section_no',
    #'garment_group_no',
    # 'article_mean_price',
    # 'customer_mean_price',
    # 'article_mean_age',
    # 'index_group_match',
    # 'product_code_match',
    # 'days_since_last_purchase',
    # 'customer_days_since_last_purchase',
    # 'price_sensitivity',
    # 'age_sensitivity',
    # 'window_type_code',
]

# Only names that really exist in the training frame are dropped (and reported).
train_columns = set(train_df.columns)
drop_cols = [name for name in remove_features if name in train_columns]
if len(drop_cols) > 0:
    print("Dropping from TRAIN/VALID:", drop_cols)
    train_df = train_df.drop(columns=drop_cols)
    # errors='ignore' tolerates a validation frame that lacks some of these columns.
    valid_df = valid_df.drop(columns=drop_cols, errors='ignore')

# Update feature_cols
excluded = set(remove_features)
feature_cols = [name for name in feature_cols if name not in excluded]

# Ranking group sizes (handed to DMatrix.set_group in the training cell).
train_group = np.load(GROUP_TRAIN_PATH)
valid_group = np.load(GROUP_VALID_PATH)

# Float32 conversion
# The training frame's dtype decides which columns are downcast; the same columns are
# downcast in the validation frame. Casting stays per-column to avoid copying whole frames.
float_features = [
    name for name in feature_cols if pd.api.types.is_float_dtype(train_df[name])
]
for name in float_features:
    train_df[name] = train_df[name].astype('float32')
    valid_df[name] = valid_df[name].astype('float32')

for frame in (train_df, valid_df):
    frame['label'] = frame['label'].astype('float32')

print("Train rows:", len(train_df))
print("Valid rows:", len(valid_df))
print("Features:", len(feature_cols))
print("Groups train:", len(train_group), "valid:", len(valid_group))

gc.collect()

#40s


Train rows: 8180082
Valid rows: 253714158
Features: 34
Groups train: 294983 valid: 1371980


22

In [None]:
# %%
# Cell 3 – filter validation groups with ≥1 positive

print("\nFiltering validation set to groups with at least 1 positive...")

# Use original valid_group from disk
orig_valid_group = np.load(GROUP_VALID_PATH)

# The group sizes must tile valid_df exactly. Checking this up front gives a clear
# message when the cell is re-run on an already-filtered valid_df, instead of an
# index/length error from the array operations below.
n_grouped_rows = int(orig_valid_group.sum())
if n_grouped_rows != len(valid_df):
    raise ValueError(
        f"Group sizes cover {n_grouped_rows:,} rows but valid_df has {len(valid_df):,} rows. "
        "valid_df may already have been filtered; reload it (Cell 2) before running this cell."
    )

# Start row of every group; int64 so the offsets are valid reduceat indices
# whatever integer dtype the group file was saved with.
group_sizes = orig_valid_group.astype(np.int64)
group_bounds = np.cumsum(group_sizes) - group_sizes

# np.add.reduceat returns a single element (not 0) for an empty segment, so labels are
# summed over the non-empty groups only; empty groups stay False.
nonempty = group_sizes > 0
group_has_positive = np.zeros(len(group_sizes), dtype=bool)
if nonempty.any():
    group_has_positive[nonempty] = (
        np.add.reduceat(valid_df['label'].values, group_bounds[nonempty]) > 0
    )

valid_group = orig_valid_group[group_has_positive]
# Expand the per-group decision to one boolean per row.
row_mask = np.repeat(group_has_positive, orig_valid_group)

# Apply mask to validation dataframe
valid_df = valid_df[row_mask].reset_index(drop=True)

print("Filtered valid groups:", len(valid_group))
print("Filtered valid rows:", len(valid_df))

gc.collect()


Filtering validation set to groups with at least 1 positive...
Filtered valid groups: 27802
Filtered valid rows: 8372080


11

In [None]:
# %%
# Cell 4 – training model and saving

MODEL_PATH = "../data/outputs/xgb_ranker.model"

# One DMatrix per split, each with its ranking group sizes attached.
dtrain = xgb.DMatrix(train_df[feature_cols], label=train_df['label'])
dtrain.set_group(train_group)

dvalid = xgb.DMatrix(valid_df[feature_cols], label=valid_df['label'])
dvalid.set_group(valid_group)

params = {
    'objective': 'rank:ndcg',
    'eval_metric': 'ndcg@12',
    'max_depth': 10,
    'eta': 0.09809700484952676,
    'subsample': 0.9716829987966161,
    'colsample_bytree': 0.6101366365030814,
    'min_child_weight': 111.48302154942498,
    'lambda': 37.3293640594945,
    'alpha': 6.805043204868502,
    'tree_method': 'hist',
    'random_state': 42,
}


# evals_result is filled by xgb.train with the per-round metric history of each eval set.
# The eval list is passed inline so no extra name keeps the DMatrix objects alive
# after the `del` at the end of this cell.
evals_result = {}
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=3000,
    evals=[(dtrain, 'train'), (dvalid, 'valid')],
    early_stopping_rounds=200,
    evals_result=evals_result,
    verbose_eval=25,
)

best_round = bst.best_iteration
print("Best iteration:", best_round)
print("Best valid ndcg@12:", evals_result['valid']['ndcg@12'][best_round])

# Always save the trained model
bst.save_model(MODEL_PATH)
print("Model saved to:", MODEL_PATH)

# Save training log to TXT
log_path = "../data/outputs/xgb_training_log.txt"

train_log = evals_result.get('train', {}).get('ndcg@12', [])
valid_log = evals_result.get('valid', {}).get('ndcg@12', [])

# zip() stops at the shorter history, so only rounds present in both logs are written.
with open(log_path, "w") as log_file:
    log_file.write("iter,train_ndcg12,valid_ndcg12\n")
    log_file.writelines(
        f"{round_idx},{train_score},{valid_score}\n"
        for round_idx, (train_score, valid_score) in enumerate(zip(train_log, valid_log))
    )

print("Saved training log to:", log_path)

# Free memory
del dtrain, dvalid, bst
gc.collect()

# 10 min

[0]	train-ndcg@12:0.78130	valid-ndcg@12:0.12988
[25]	train-ndcg@12:0.80572	valid-ndcg@12:0.16362
[50]	train-ndcg@12:0.80988	valid-ndcg@12:0.16568
[75]	train-ndcg@12:0.81250	valid-ndcg@12:0.16701
[100]	train-ndcg@12:0.81523	valid-ndcg@12:0.16829
[125]	train-ndcg@12:0.81761	valid-ndcg@12:0.16896
[150]	train-ndcg@12:0.81946	valid-ndcg@12:0.16941
[175]	train-ndcg@12:0.82119	valid-ndcg@12:0.16997
[200]	train-ndcg@12:0.82284	valid-ndcg@12:0.17027
[225]	train-ndcg@12:0.82442	valid-ndcg@12:0.17102
[250]	train-ndcg@12:0.82610	valid-ndcg@12:0.17120
[275]	train-ndcg@12:0.82750	valid-ndcg@12:0.17120
[300]	train-ndcg@12:0.82882	valid-ndcg@12:0.17137
[325]	train-ndcg@12:0.83003	valid-ndcg@12:0.17151
[350]	train-ndcg@12:0.83139	valid-ndcg@12:0.17131
[375]	train-ndcg@12:0.83248	valid-ndcg@12:0.17138
[400]	train-ndcg@12:0.83340	valid-ndcg@12:0.17154
[425]	train-ndcg@12:0.83448	valid-ndcg@12:0.17156
[450]	train-ndcg@12:0.83554	valid-ndcg@12:0.17142
[475]	train-ndcg@12:0.83656	valid-ndcg@12:0.17148
[500]

  bst.save_model(MODEL_PATH)


28

In [None]:
# %%
# Cell 5 - load model and run inference

MODEL_PATH = "../data/outputs/xgb_ranker.model"

print("Loading stored model:", MODEL_PATH)
bst = xgb.Booster()
bst.load_model(MODEL_PATH)
print("Model loaded, best iteration:", bst.best_iteration)

FEATURES_PATH = "../data/outputs/features_week=20200922.parquet"

# Read only the two id columns plus the model inputs.
id_cols = ['customer_id', 'article_id']
data = pd.read_parquet(FEATURES_PATH, columns=id_cols + feature_cols)

data['customer_id'] = data['customer_id'].astype('int64')
data['article_id'] = data['article_id'].astype('int32')

# Downcast float feature columns one at a time.
float_features = [name for name in feature_cols if pd.api.types.is_float_dtype(data[name])]
for name in float_features:
    data[name] = data[name].astype('float32')

BATCH = 2_000_000
n_rows = len(data)
scores = np.empty(n_rows, dtype=np.float32)

print(f"Running inference on {n_rows:,} rows...")

# Score in fixed-size row batches; each batch's DMatrix is released before the next is built.
for lo in range(0, n_rows, BATCH):
    hi = min(lo + BATCH, n_rows)
    # Slice rows first, then select columns, so only the batch is materialised.
    dmatrix_batch = xgb.DMatrix(data.iloc[lo:hi][feature_cols])
    # iteration_range is half-open, hence best_iteration + 1.
    scores[lo:hi] = bst.predict(dmatrix_batch, iteration_range=(0, bst.best_iteration + 1))
    del dmatrix_batch
    gc.collect()
    print(f"Predicted rows {lo:,} to {hi:,} / {n_rows:,}")

data['score'] = scores
del scores
gc.collect()

print("Final scored rows:", len(data))

# 50 min

Loading stored model: ../data/outputs/optuna_xgb/best_xgb_ranker.model


  bst.load_model(MODEL_PATH)


Model loaded, best iteration: 389
Running inference on 250,982,495 rows...
Predicted rows 0 to 2,000,000 / 250,982,495
Predicted rows 2,000,000 to 4,000,000 / 250,982,495
Predicted rows 4,000,000 to 6,000,000 / 250,982,495
Predicted rows 6,000,000 to 8,000,000 / 250,982,495
Predicted rows 8,000,000 to 10,000,000 / 250,982,495
Predicted rows 10,000,000 to 12,000,000 / 250,982,495
Predicted rows 12,000,000 to 14,000,000 / 250,982,495
Predicted rows 14,000,000 to 16,000,000 / 250,982,495
Predicted rows 16,000,000 to 18,000,000 / 250,982,495
Predicted rows 18,000,000 to 20,000,000 / 250,982,495
Predicted rows 20,000,000 to 22,000,000 / 250,982,495
Predicted rows 22,000,000 to 24,000,000 / 250,982,495
Predicted rows 24,000,000 to 26,000,000 / 250,982,495
Predicted rows 26,000,000 to 28,000,000 / 250,982,495
Predicted rows 28,000,000 to 30,000,000 / 250,982,495
Predicted rows 30,000,000 to 32,000,000 / 250,982,495
Predicted rows 32,000,000 to 34,000,000 / 250,982,495
Predicted rows 34,000,00

In [5]:
# Cell 6 – top12 per customer
data['customer_id'] = data['customer_id'].astype('int64')

# Highest score first within each customer, so head(12) keeps the 12 best-scored rows.
data = data.sort_values(['customer_id','score'], ascending=[True,False])

# groupby().head() hands back a slice of `data`; assigning a new column onto that slice
# is what triggered SettingWithCopyWarning. Take an explicit copy, restricted to the two
# columns this cell actually uses, so the assignment below targets an independent frame.
top12 = (
    data.groupby('customer_id', group_keys=False)
        .head(12)[['customer_id', 'article_id']]
        .copy()
)

# Article ids are rendered as zero-padded 10-character strings.
top12['article_id_str'] = top12['article_id'].astype(str).str.zfill(10)

# One space-separated prediction string per customer, in score order.
pred_df = (
    top12.groupby('customer_id')['article_id_str']
         .agg(' '.join)
         .reset_index()
         .rename(columns={'customer_id':'customer_id_int',
                          'article_id_str':'prediction'})
)

pred_df['customer_id_int'] = pred_df['customer_id_int'].astype('int64')

del top12, data
gc.collect()

print("Predictions:", len(pred_df))

# 8 min 20s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top12['article_id_str'] = top12['article_id'].astype(str).str.zfill(10)


Predictions: 1371980


In [None]:
# Cell 7 – merge with sample + fallback

# Must match preprocessing pipeline
def hex16_to_int(s):
    """Convert the last 16 hex characters of ``s`` to a signed 64-bit integer.

    The trailing 16 hex digits are parsed as an unsigned 64-bit value and then
    cast to ``np.int64``, so values of 2**63 and above come out negative.

    Parameters
    ----------
    s : str
        Hexadecimal string; only its final 16 characters are used.

    Returns
    -------
    np.int64
    """
    low_hex = s[-16:]
    unsigned = np.uint64(int(low_hex, 16))
    return np.int64(unsigned)

# Key the sample submission by the same integer customer id used in pred_df.
sample = pd.read_csv('../data/input_data/sample_submission.csv')
sample['customer_id_int'] = sample['customer_id'].apply(hex16_to_int)
# Drop the placeholder column so the merged 'prediction' comes from pred_df only.
sample = sample.drop(columns=['prediction'], errors='ignore')
pred_df['customer_id_int'] = pred_df['customer_id_int'].astype('int64')
# Left merge keeps every sample customer; customers without a model prediction get NaN.
sub = sample.merge(pred_df, how='left', on='customer_id_int')

print("Predictions matched:", sub['prediction'].notna().sum())
print("Rows after merge:", len(sub))

del sample
gc.collect()

# Read the fallback prediction inside a context manager so the file handle is closed.
with open('../data/outputs/general_pred_str.json') as gp_file:
    gp = json.load(gp_file)
fallback_str = gp['general_pred_str']
fallback_items = fallback_str.split()

# Customers with no model prediction receive the whole fallback string.
sub['prediction'] = sub['prediction'].fillna(fallback_str)


Predictions matched: 1371980
Rows after merge: 1371980


In [7]:
# Cell 8 – pad/trim to 12
def pad_to_12(pred, fallback=None, k=12):
    """Trim or pad a space-separated prediction string to ``k`` items.

    If ``pred`` already holds ``k`` or more items, the first ``k`` are kept.
    Otherwise items from ``fallback`` that are not already present are appended,
    in order, until ``k`` items are reached or ``fallback`` is exhausted.

    Parameters
    ----------
    pred : str
        Space-separated items.
    fallback : iterable of str, optional
        Items used for padding. Defaults to the notebook-level ``fallback_items``.
    k : int, default 12
        Target number of items.

    Returns
    -------
    str
        Space-separated items; fewer than ``k`` only when ``fallback`` runs out.
    """
    if fallback is None:
        # Backward-compatible default: the list built in the merge/fallback cell.
        fallback = fallback_items
    items = pred.split()
    if len(items) >= k:
        return ' '.join(items[:k])
    used = set(items)
    for art in fallback:
        if art not in used:
            items.append(art)
            used.add(art)
        if len(items) == k:
            break
    return ' '.join(items)

# Run every prediction string through pad_to_12.
sub['prediction'] = sub['prediction'].map(pad_to_12)

out_path = '../data/submission/xgboost_ranker_submission_optuna.csv'
# Only the original customer id and the prediction string are written out.
submission_cols = ['customer_id', 'prediction']
sub[submission_cols].to_csv(out_path, index=False)
print("Saved:", out_path)
sub.head()


Saved: ../data/submission/xgboost_ranker_submission_optuna.csv


Unnamed: 0,customer_id,customer_id_int,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,6883939031699146327,0568601043 0568601044 0568601006 0568601007 09...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,-7200416642310594310,0673677002 0918522001 0918292001 0924243001 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,-6846340800584936,0794321007 0794321011 0794321008 0851400001 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,-94071612138601410,0794321011 0805000001 0730683050 0804992017 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,-283965518499174310,0896152002 0730683062 0730683050 0818320001 07...


In [8]:
# Cell 9 – feature importance
imp = bst.get_score(importance_type='gain')

# Map positional 'f<i>' keys to column names; any other key passes through unchanged.
name_map = {}
for i, name in enumerate(feature_cols):
    name_map[f"f{i}"] = name

items = []
for key, gain in imp.items():
    items.append((name_map.get(key, key), gain))

# Print features from highest to lowest gain (ties keep their original order).
for feature_name, gain in sorted(items, key=lambda pair: pair[1], reverse=True):
    print(feature_name, gain)

gc.collect()


product_code_match 169.62710571289062
value 56.29108428955078
window_type_code 52.8043098449707
days_since_last_purchase 19.17896842956543
garment_group_no 12.759961128234863
product_group_name 11.037084579467773
cust_purchases_1w 9.809845924377441
section_no 9.74382209777832
index_group_match 9.56946849822998
article_total_purchases 8.334626197814941
age_bucket 7.718550205230713
product_type_no 7.698000431060791
article_mean_price 7.491127967834473
department_no 7.487173557281494
colour_group_code 6.769662380218506
index_code 6.520567893981934
product_code 6.480300426483154
article_unique_customers 6.46755313873291
index_group_no 6.360974311828613
graphical_appearance_no 6.075438499450684
customer_days_since_last_purchase 4.966021537780762
age 4.516480445861816
customer_mean_price 4.458639621734619
article_mean_age 4.436407089233398
perceived_colour_value_id 4.343845844268799
customer_unique_articles 4.277185916900635
cust_purchases_4w 3.9445064067840576
perceived_colour_master_id 3.7

19