In [None]:
# Cell 1 – imports
import gc
import json
import numpy as np
import pandas as pd
from catboost import CatBoostRanker, Pool

# Central path configuration for the notebook.
# Some later cells re-declare a few of these so they can be run on their own;
# the values must stay identical to the ones below.

# Files read by this notebook
META_PATH = '../data/outputs/dataset_meta.json'
TRAIN_RANK_PATH = '../data/outputs/train_rank.parquet'
VALID_RANK_PATH = '../data/outputs/valid_rank.parquet'
GROUP_TRAIN_PATH = '../data/outputs/groups_train.npy'
GROUP_VALID_PATH = '../data/outputs/groups_valid.npy'
FEATURES_PATH = '../data/outputs/features_week=20200922.parquet'
SAMPLE_PATH = '../data/input_data/sample_submission.csv'
GENERAL_PATH = '../data/outputs/general_pred_str.json'

# Files written by this notebook (the model file is also read back for inference)
MODEL_PATH = '../data/outputs/catboost_ranker.model'
LOG_PATH = '../data/outputs/catboost_training_log.txt'
SUB_PATH = '../data/submission/catboost_ranker_submission.csv'

In [None]:
# Cell 2 – load train/valid ranking data
# META_PATH, TRAIN_RANK_PATH, VALID_RANK_PATH and GROUP_VALID_PATH are taken
# from the config cell (Cell 1) instead of being copy-pasted here again.
# GROUP_TRAIN_PATH is declared locally because the group-size load further
# down in this cell needs it.
GROUP_TRAIN_PATH = '../data/outputs/groups_train.npy'

# The metadata JSON carries the list of columns used as model features
with open(META_PATH) as f:
    meta = json.load(f)

feature_cols = meta['model_features']

# Load the train / validation ranking tables
train_df = pd.read_parquet(TRAIN_RANK_PATH)
valid_df = pd.read_parquet(VALID_RANK_PATH)

# Feature ablation list.
# Uncomment an entry to exclude that column from both frames and from
# `feature_cols`. With every entry commented out the list is empty and
# nothing is dropped.
remove_features = [
    # 'value',
    # 'age_bucket',
    # 'customer_total_purchases',
    # 'customer_unique_articles',
    # 'cust_purchases_1w',
    # 'cust_purchases_4w',
    # 'article_total_purchases',
    # 'article_unique_customers',
    # 'club_member_status',
    # 'fashion_news_frequency',
    # 'age',
    # 'postal_code',
    # 'product_code',
    # 'product_type_no',
    # 'product_group_name',
    # 'graphical_appearance_no',
    # 'colour_group_code',
    # 'perceived_colour_value_id',
    # 'perceived_colour_master_id',
    # 'department_no',
    # 'index_code',
    # 'index_group_no',
    # 'section_no',
    # 'garment_group_no',
    # 'article_mean_price',
    # 'customer_mean_price',
    # 'article_mean_age',
    # 'index_group_match',
    # 'product_code_match',
    # 'days_since_last_purchase',
    # 'customer_days_since_last_purchase',
    # 'price_sensitivity',
    # 'age_sensitivity',
    # 'window_type_code',
]

# Only columns that actually exist in the training frame are dropped
drop_cols = [col for col in remove_features if col in train_df.columns]
if len(drop_cols) > 0:
    print("Dropping:", drop_cols)
    train_df = train_df.drop(columns=drop_cols)
    # errors='ignore' tolerates columns that are absent from the validation frame
    valid_df = valid_df.drop(columns=drop_cols, errors='ignore')

# Keep the model feature list in step with the ablation list
feature_cols = [col for col in feature_cols if col not in remove_features]

# Group sizes (number of consecutive rows per ranking group) for each split
train_group = np.load(GROUP_TRAIN_PATH)
valid_group = np.load(GROUP_VALID_PATH)

# float32 for GPU efficiency.
# The dtype test looks at the training frame only; the matching validation
# column is downcast whenever the training column is a float.
for col in feature_cols:
    if not pd.api.types.is_float_dtype(train_df[col]):
        continue
    train_df[col] = train_df[col].astype('float32')
    valid_df[col] = valid_df[col].astype('float32')

# Labels are stored as float32 in both frames
for frame in (train_df, valid_df):
    frame['label'] = frame['label'].astype('float32')

print("Train rows:", len(train_df))
print("Valid rows:", len(valid_df))
print("Features:", len(feature_cols))
print("Groups train:", len(train_group), "valid:", len(valid_group))
gc.collect()

Train rows: 8180082
Valid rows: 253714158
Features: 34
Groups train: 294983 valid: 1371980


0

In [None]:
# Cell 3 – filter validation groups with ≥1 positive

print("\nFiltering validation set to groups with at least 1 positive...")

# The mask is built from the unfiltered, on-disk group sizes.
orig_valid_group = np.load(GROUP_VALID_PATH)

# This cell overwrites `valid_df`, so it is not re-runnable: on a second run the
# on-disk group sizes no longer describe the (already filtered) frame. Fail with
# a clear message instead of an obscure indexing error further down.
assert int(orig_valid_group.sum()) == len(valid_df), (
    f"Group sizes cover {int(orig_valid_group.sum())} rows but valid_df has "
    f"{len(valid_df)} rows. valid_df was probably filtered already; "
    "re-run the loading cell (Cell 2) before running this cell again."
)

# First row index of every group: cumulative size minus the group's own size
group_bounds = np.cumsum(orig_valid_group) - orig_valid_group
# reduceat sums the labels between consecutive group starts
group_has_positive = np.add.reduceat(valid_df['label'].to_numpy(), group_bounds) > 0

# Keep the sizes of the surviving groups and expand the group mask to rows
valid_group = orig_valid_group[group_has_positive]
row_mask = np.repeat(group_has_positive, orig_valid_group)
valid_df = valid_df[row_mask].reset_index(drop=True)

print("Filtered valid groups:", len(valid_group))
print("Filtered valid rows:", len(valid_df))
gc.collect()


Filtering validation set to groups with at least 1 positive...
Filtered valid groups: 27802
Filtered valid rows: 8372080


11

In [None]:
# Cell 4 – Build CatBoost Pools

from catboost import Pool

# Convert group-sizes to per-row group_id (CatBoost requirement)
def expand_group_ids(group_sizes):
    """Turn a sequence of group sizes into one int32 group id per row.

    Group ``i`` (0-based position in ``group_sizes``) contributes
    ``group_sizes[i]`` consecutive copies of ``i`` to the result.
    """
    n_groups = len(group_sizes)
    group_index = np.arange(n_groups, dtype=np.int32)
    return np.repeat(group_index, group_sizes)

train_group_id = expand_group_ids(train_group)
valid_group_id = expand_group_ids(valid_group)

print("Expanded train_group_id:", len(train_group_id))
print("Expanded valid_group_id:", len(valid_group_id))
print("Train_df rows:", len(train_df))
print("Valid_df rows:", len(valid_df))

# Every row must have exactly one group id
assert len(train_group_id) == len(train_df)
assert len(valid_group_id) == len(valid_df)

# Treat all features as numeric
cat_features = []


def _make_pool(frame, group_ids):
    """Build a CatBoost Pool from one ranking frame and its per-row group ids."""
    return Pool(
        data=frame[feature_cols],
        label=frame["label"],
        group_id=group_ids,
        cat_features=cat_features,
    )


pool_train = _make_pool(train_df, train_group_id)
pool_valid = _make_pool(valid_df, valid_group_id)

print("Pools built:")
print("Train rows:", pool_train.shape[0])
print("Valid rows:", pool_valid.shape[0])

gc.collect()


Expanded train_group_id: 8180082
Expanded valid_group_id: 8372080
Train_df rows: 8180082
Valid_df rows: 8372080
Pools built:
Train rows: 8180082
Valid rows: 8372080


0

In [None]:
# Cell 5 – GPU CatBoostRanker with eval logging

MODEL_PATH = "../data/outputs/catboost_ranker.model"
LOG_PATH = "../data/outputs/catboost_training_log.txt"

# All constructor arguments in one place; values are passed through unchanged.
ranker_params = dict(
    # objective and evaluation metric
    loss_function='YetiRank',
    eval_metric='NDCG:top=12',
    # boosting schedule
    iterations=10000,
    learning_rate=0.13294997299885136,
    early_stopping_rounds=400,
    # tree shape and regularisation
    depth=10,
    l2_leaf_reg=1.0589957706654705,
    min_data_in_leaf=21,
    random_strength=0.6391996053429478,
    bagging_temperature=0.1992110736721816,
    border_count=172,
    # execution
    task_type='GPU',
    devices='0',
    random_seed=42,
    verbose=50,
)
model = CatBoostRanker(**ranker_params)

# Train with the filtered validation pool as the evaluation set
model.fit(pool_train, eval_set=pool_valid)
best_iter = model.get_best_iteration()
print("Best iteration:", best_iter)

# ntree_end is exclusive, hence +1 to keep the best iteration's tree
model.shrink(ntree_end=best_iter + 1)
model.save_model(MODEL_PATH)


# Per-iteration metric history, consumed by the logging code below
evals = model.get_evals_result()

# Find the key containing NDCG in validation set
def find_ndcg_key(eval_dict):
    """Return the first key of ``eval_dict`` whose name contains "NDCG".

    The match is case-insensitive and follows the dict's iteration order.
    If no key matches, a warning is printed and the first key is returned.

    Raises:
        KeyError: if ``eval_dict`` has no keys at all.
    """
    not_found = object()
    match = next(
        (name for name in eval_dict if "NDCG" in name.upper()),
        not_found,
    )
    if match is not not_found:
        return match

    # If not found, return first available key or raise error
    available = list(eval_dict.keys())
    if not available:
        raise KeyError(f"No metrics found in eval_dict. Available keys: {available}")
    print(f"Warning: NDCG not found. Using first available metric: {available[0]}")
    return available[0]

valid_metric_key = find_ndcg_key(evals['validation'])
print(f"Using validation metric: {valid_metric_key}")

# Only the validation curve is logged (the original note here says the
# 'learn' entry is empty).
valid_log = evals['validation'][valid_metric_key]

# CSV: header row, then one "<position>,<metric value>" row per recorded value
log_lines = ["iter,valid_ndcg12\n"]
log_lines.extend(f"{i},{val}\n" for i, val in enumerate(valid_log))
with open(LOG_PATH, "w") as f:
    f.writelines(log_lines)

print("Training log saved successfully:", LOG_PATH)

gc.collect()

Default metric period is 5 because PFound, NDCG is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=12;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.1132206	best: 0.1132206 (0)	total: 452ms	remaining: 1h 15m 20s
50:	test: 0.1508516	best: 0.1511016 (49)	total: 9.07s	remaining: 29m 30s
100:	test: 0.1564645	best: 0.1565230 (99)	total: 17.9s	remaining: 29m 12s
150:	test: 0.1597213	best: 0.1597213 (150)	total: 26.8s	remaining: 29m 8s
200:	test: 0.1614503	best: 0.1614503 (200)	total: 35.7s	remaining: 29m
250:	test: 0.1629685	best: 0.1629685 (250)	total: 44.6s	remaining: 28m 50s
300:	test: 0.1639570	best: 0.1639570 (300)	total: 53.5s	remaining: 28m 45s
350:	test: 0.1647285	best: 0.1647633 (349)	total: 1m 2s	remaining: 28m 32s
400:	test: 0.1652624	best: 0.1654592 (395)	total: 1m 11s	remaining: 28m 21s
450:	test: 0.1656133	best: 0.1656133 (450)	total: 1m 19s	remaining: 28m 7s
500:	test: 0.1659550	best: 0.1660813 (493)	total: 1m 28s	remaining: 27m 54s
550:	test: 0.1662942	best: 0.1663778 (547)	total: 1m 37s	remaining: 27m 44s
600:	test: 0.1665069	best: 0.1666384 (593)	total: 1m 45s	remaining: 27m 35s
650:	test: 0.1667524	best: 0.1

0

In [None]:
# Cell 6 – cleanup after training

# Drop the notebook's references to both training Pools, then run a collection
del pool_train
del pool_valid
gc.collect()


0

In [None]:
# Cell 7 – inference on submission week
import gc
import json
import numpy as np
import pandas as pd
from catboost import CatBoostRanker, Pool

# Paths are declared again here (together with the imports above) so that this
# cell can be run without the earlier cells.
MODEL_PATH = "../data/outputs/catboost_ranker.model"
META_PATH = '../data/outputs/dataset_meta.json'
FEATURES_PATH = "../data/outputs/features_week=20200922.parquet"

print("Loading stored model:", MODEL_PATH)
model = CatBoostRanker()
model.load_model(MODEL_PATH)
print("Model loaded successfully")

# Load feature_cols from metadata
with open(META_PATH) as f:
    meta = json.load(f)

feature_cols = meta['model_features']

# Apply same feature removal as training.
# NOTE: this list is a manual copy of the one in the training cell and must be
# kept in sync with it; the check at the end of this block catches drift.
remove_features = [
    # 'value',
    # 'age_bucket',
    # 'customer_total_purchases',
    # 'customer_unique_articles',
    # 'cust_purchases_1w',
    # 'cust_purchases_4w',
    # 'article_total_purchases',
    # 'article_unique_customers',
    # 'club_member_status',
    # 'fashion_news_frequency',
    # 'age',
    # 'postal_code',
    # 'product_code',
    # 'product_type_no',
    # 'product_group_name',
    # 'graphical_appearance_no',
    # 'colour_group_code',
    # 'perceived_colour_value_id',
    # 'perceived_colour_master_id',
    # 'department_no',
    # 'index_code',
    # 'index_group_no',
    # 'section_no',
    # 'garment_group_no',
    # 'article_mean_price',
    # 'customer_mean_price',
    # 'article_mean_age',
    # 'index_group_match',
    # 'product_code_match',
    # 'days_since_last_purchase',
    # 'customer_days_since_last_purchase',
    # 'price_sensitivity',
    # 'age_sensitivity',
    # 'window_type_code',
]

feature_cols = [c for c in feature_cols if c not in remove_features]
cat_features = []

# Guard against train/inference drift: the columns fed to predict() must be the
# ones the stored model was fitted on, in the same order. The check is skipped
# if the model exposes no feature names.
trained_features = list(model.feature_names_ or [])
if trained_features and trained_features != feature_cols:
    missing = [c for c in trained_features if c not in feature_cols]
    unexpected = [c for c in feature_cols if c not in trained_features]
    raise ValueError(
        "Inference feature list does not match the loaded model: "
        f"missing={missing}, unexpected={unexpected} "
        "(if both are empty, the column order differs). "
        "Make remove_features identical to the list used for training."
    )

print(f"Using {len(feature_cols)} features for inference")
print("Loading submission week features...")
# Read only the id columns plus the model features
id_cols = ["customer_id", "article_id"]
data = pd.read_parquet(FEATURES_PATH, columns=id_cols + feature_cols)

data["customer_id"] = data["customer_id"].astype("int64")
data["article_id"] = data["article_id"].astype("int32")

# Downcast float features to float32, as done for the training frames
for col in feature_cols:
    if not pd.api.types.is_float_dtype(data[col]):
        continue
    data[col] = data[col].astype("float32")

BATCH = 2_000_000
n_rows = len(data)
# Pre-allocated output buffer, filled slice by slice
scores = np.empty(n_rows, dtype=np.float32)

print(f"Running inference on {n_rows:,} rows...")

# Score BATCH rows at a time so only one batch-sized Pool exists at once
start = 0
while start < n_rows:
    end = min(start + BATCH, n_rows)
    batch_pool = Pool(
        data=data.iloc[start:end][feature_cols],
        cat_features=cat_features,
    )
    scores[start:end] = model.predict(batch_pool).astype(np.float32)
    del batch_pool
    gc.collect()
    print(f"Predicted rows {start:,} to {end:,} / {n_rows:,}")
    start = end

data["score"] = scores
del scores
gc.collect()

print("Final scored rows:", len(data))

Loading stored model: ../data/outputs/catboost_ranker.model
Model loaded successfully
Using 34 features for inference
Loading submission week features...
Running inference on 250,982,495 rows...
Predicted rows 0 to 2,000,000 / 250,982,495
Predicted rows 2,000,000 to 4,000,000 / 250,982,495
Predicted rows 4,000,000 to 6,000,000 / 250,982,495
Predicted rows 6,000,000 to 8,000,000 / 250,982,495
Predicted rows 8,000,000 to 10,000,000 / 250,982,495
Predicted rows 10,000,000 to 12,000,000 / 250,982,495
Predicted rows 12,000,000 to 14,000,000 / 250,982,495
Predicted rows 14,000,000 to 16,000,000 / 250,982,495
Predicted rows 16,000,000 to 18,000,000 / 250,982,495
Predicted rows 18,000,000 to 20,000,000 / 250,982,495
Predicted rows 20,000,000 to 22,000,000 / 250,982,495
Predicted rows 22,000,000 to 24,000,000 / 250,982,495
Predicted rows 24,000,000 to 26,000,000 / 250,982,495
Predicted rows 26,000,000 to 28,000,000 / 250,982,495
Predicted rows 28,000,000 to 30,000,000 / 250,982,495
Predicted ro

In [None]:
# Cell 8 – top-12 per customer

# Ranking needs only the two ids and the score, so sort that narrow frame
# instead of the full feature table. The resulting order is unchanged because
# the sort keys are the same.
ranked = data[["customer_id", "article_id", "score"]].sort_values(
    ["customer_id", "score"], ascending=[True, False]
)

# First 12 rows per customer = the 12 highest scores after the sort above.
# .copy() detaches the result from `ranked`, so adding a column below no longer
# triggers pandas' SettingWithCopyWarning.
top12 = ranked.groupby("customer_id", group_keys=False).head(12).copy()
del ranked

# Article ids are rendered as 10-character, zero-padded strings
top12["article_id_str"] = top12["article_id"].astype(str).str.zfill(10)

# One space-separated prediction string per customer, in score order
pred_df = (
    top12.groupby("customer_id")["article_id_str"]
         .agg(" ".join)
         .reset_index()
         .rename(columns={"customer_id": "customer_id_int",
                          "article_id_str": "prediction"})
)

pred_df["customer_id_int"] = pred_df["customer_id_int"].astype("int64")

print("Predictions built:", len(pred_df))

# The scored candidate table is no longer needed after this point
del top12, data
gc.collect()

# 10 min

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top12["article_id_str"] = top12["article_id"].astype(str).str.zfill(10)


Predictions built: 1371980


0

In [None]:
# Cell 9 – merge with sample + fallback
SAMPLE_PATH = '../data/input_data/sample_submission.csv'
GENERAL_PATH = '../data/outputs/general_pred_str.json'


def hex16_to_int(s):
    """Map the last 16 hex characters of ``s`` to a signed 64-bit integer.

    The characters are parsed as an unsigned 64-bit value and then
    reinterpreted as int64, so values of 2**63 and above come out negative.
    Returns a ``numpy.int64``.
    """
    low64 = int(s[-16:], 16)
    return np.uint64(low64).astype(np.int64)

# Build the integer join key for every customer in the sample submission
sample = pd.read_csv(SAMPLE_PATH)
sample["customer_id_int"] = sample["customer_id"].apply(hex16_to_int)
# The sample's own prediction column (if present) is replaced by the merge below
sample = sample.drop(columns=["prediction"], errors="ignore")

pred_df["customer_id_int"] = pred_df["customer_id_int"].astype("int64")

print("Sample customers:", sample["customer_id_int"].nunique())
print("Predicted customers:", pred_df["customer_id_int"].nunique())

# Left join keeps every sample customer; customers without a prediction get NaN
sub = sample.merge(pred_df, how="left", on="customer_id_int")
print("Predictions matched:", sub["prediction"].notna().sum())

del sample, pred_df
gc.collect()

# Read the fallback prediction string inside a context manager so the file
# handle is closed deterministically.
with open(GENERAL_PATH) as general_file:
    gp = json.load(general_file)
fallback_str = gp["general_pred_str"]
# Token list form of the fallback, used later for padding short predictions
fallback_items = fallback_str.split()

# Customers with no model prediction receive the full fallback string
sub["prediction"] = sub["prediction"].fillna(fallback_str)

Sample customers: 1371980
Predicted customers: 1371980
Predictions matched: 1371980


In [None]:
# Cell 10 – pad to 12 and save
SUB_PATH = '../data/submission/catboost_ranker_submission.csv'


def pad_to_12(pred):
    items = pred.split()
    if len(items) >= 12:
        return " ".join(items[:12])
    seen = set(items)
    for art in fallback_items:
        if art not in seen:
            items.append(art)
            seen.add(art)
        if len(items) == 12:
            break
    return " ".join(items)

# Bring every prediction to the padded form produced by pad_to_12
sub["prediction"] = sub["prediction"].map(pad_to_12)

# Only the original customer id and the prediction string are written out
submission = sub[["customer_id", "prediction"]]
submission.to_csv(SUB_PATH, index=False)
print("Submission saved to:", SUB_PATH)

sub.head()

Submission saved to: ../data/submission/catboost_ranker_submission.csv


Unnamed: 0,customer_id,customer_id_int,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,6883939031699146327,0568601006 0568601043 0568601044 0568601007 08...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,-7200416642310594310,0673677002 0448509014 0918522001 0918525001 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,-6846340800584936,0794321007 0794321011 0794321008 0851400020 08...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,-94071612138601410,0794321011 0804992017 0805000001 0730683050 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,-283965518499174310,0896152002 0730683050 0730683062 0791587001 08...
