In [1]:
# %%
# Cell 1 – imports
import gc
import json
import numpy as np
import pandas as pd
from catboost import CatBoostRanker, Pool

# ---------------------------------------------------------------------------
# Central path configuration.
# Every file the notebook reads or writes is declared here so a reader can
# retarget the whole pipeline by editing DATA_DIR only.
# ---------------------------------------------------------------------------
DATA_DIR = '../data'
OUTPUT_DIR = f'{DATA_DIR}/outputs'

# Inputs produced by the dataset-building step
META_PATH = f'{OUTPUT_DIR}/dataset_meta.json'
TRAIN_RANK_PATH = f'{OUTPUT_DIR}/train_rank.parquet'
VALID_RANK_PATH = f'{OUTPUT_DIR}/valid_rank.parquet'
GROUP_TRAIN_PATH = f'{OUTPUT_DIR}/groups_train.npy'
GROUP_VALID_PATH = f'{OUTPUT_DIR}/groups_valid.npy'
FEATURES_PATH = f'{OUTPUT_DIR}/features_week=20200922.parquet'
SAMPLE_PATH = f'{DATA_DIR}/input_data/sample_submission.csv'
GENERAL_PATH = f'{OUTPUT_DIR}/general_pred_str.json'

# Artifacts written by this notebook
MODEL_PATH = f'{OUTPUT_DIR}/catboost_ranker.model'
LOG_PATH = f'{OUTPUT_DIR}/catboost_training_log.txt'
SUB_PATH = f'{DATA_DIR}/submission/catboost_ranker_submission.csv'


In [2]:
# %%
# Cell 2 – load train/valid ranking data
#
# META_PATH, TRAIN_RANK_PATH, VALID_RANK_PATH and GROUP_VALID_PATH come from
# the configuration in Cell 1; only the train group-size path is declared here.
GROUP_TRAIN_PATH = '../data/outputs/groups_train.npy'

with open(META_PATH) as f:
    meta = json.load(f)

# same feature list used in LGBM/XGB
feature_cols = meta['model_features']

# load
train_df = pd.read_parquet(TRAIN_RANK_PATH)
valid_df = pd.read_parquet(VALID_RANK_PATH)

# REMOVE FEATURES – same list as XGB/LGBM
remove_features = [
    # same list you used in the others
]

# drop removed features
drop_cols = [c for c in remove_features if c in train_df.columns]
if drop_cols:
    print("Dropping:", drop_cols)
    train_df = train_df.drop(columns=drop_cols)
    valid_df = valid_df.drop(columns=drop_cols, errors='ignore')

# update feature_cols (set lookup keeps this linear in the number of features)
removed = set(remove_features)
feature_cols = [c for c in feature_cols if c not in removed]

# Fail early, with a readable message, if the metadata lists a feature that
# one of the frames does not contain.
missing_train = [c for c in feature_cols if c not in train_df.columns]
missing_valid = [c for c in feature_cols if c not in valid_df.columns]
assert not missing_train, f"Features missing from train_df: {missing_train}"
assert not missing_valid, f"Features missing from valid_df: {missing_valid}"

train_group = np.load(GROUP_TRAIN_PATH)
valid_group = np.load(GROUP_VALID_PATH)

# Group sizes describe consecutive row blocks of the frames; later cells index
# rows by these sizes, so a mismatch would silently mis-assign rows to groups.
assert int(train_group.sum()) == len(train_df), (
    f"train groups cover {int(train_group.sum())} rows but train_df has {len(train_df)}"
)
assert int(valid_group.sum()) == len(valid_df), (
    f"valid groups cover {int(valid_group.sum())} rows but valid_df has {len(valid_df)}"
)

# float32 for GPU CatBoost.
# Cast column by column (not a whole-frame astype) so no full copy of the
# large validation frame is materialised.
float_cols = [c for c in feature_cols if pd.api.types.is_float_dtype(train_df[c])]
for c in float_cols:
    train_df[c] = train_df[c].astype('float32')
    valid_df[c] = valid_df[c].astype('float32')

train_df['label'] = train_df['label'].astype('float32')
valid_df['label'] = valid_df['label'].astype('float32')

print("Train rows:", len(train_df))
print("Valid rows:", len(valid_df))
print("Features:", len(feature_cols))
print("Groups train:", len(train_group), "valid:", len(valid_group))
gc.collect()


Train rows: 8180082
Valid rows: 253714158
Features: 34
Groups train: 294983 valid: 1371980


0

In [3]:
# %%
# Cell 3 – filter validation groups with ≥1 positive

print("\nFiltering validation set to groups with at least 1 positive...")

orig_valid_group = np.load(GROUP_VALID_PATH)

# The sizes on disk describe the UNFILTERED validation frame. If this cell is
# re-run after valid_df was already filtered (or the files are out of sync),
# stop with a clear message instead of mis-slicing rows.
if int(orig_valid_group.sum()) != len(valid_df):
    raise RuntimeError(
        f"Group sizes cover {int(orig_valid_group.sum())} rows but valid_df has "
        f"{len(valid_df)} rows. Re-run Cell 2 to reload the unfiltered validation set."
    )

# np.add.reduceat does not return 0 for an empty segment (it returns the
# element at the segment start), so zero-sized groups must be rejected.
if (orig_valid_group <= 0).any():
    raise ValueError("groups_valid contains non-positive group sizes")

# Start offset of each group within valid_df: cumulative size minus own size.
group_bounds = np.cumsum(orig_valid_group) - orig_valid_group
# Per-group label sum; > 0 means the group has at least one positive row.
group_has_positive = np.add.reduceat(valid_df['label'].values, group_bounds) > 0

valid_group = orig_valid_group[group_has_positive]
# Expand the per-group decision to a per-row boolean mask.
row_mask = np.repeat(group_has_positive, orig_valid_group)
valid_df = valid_df[row_mask].reset_index(drop=True)

print("Filtered valid groups:", len(valid_group))
print("Filtered valid rows:", len(valid_df))
gc.collect()



Filtering validation set to groups with at least 1 positive...
Filtered valid groups: 27802
Filtered valid rows: 8372080


11

In [4]:
# %%
# Cell 4 – Build CatBoost Pools (aligned with LGBM/XGB)

from catboost import Pool

# --- Group sizes -> one group id per row (the form passed to Pool(group_id=...)) ---
def expand_group_ids(group_sizes):
    """Expand consecutive group sizes into a per-row group-id vector.

    Parameters
    ----------
    group_sizes : sequence of int
        Number of rows in each group, in row order.

    Returns
    -------
    numpy.ndarray
        int32 array of length ``sum(group_sizes)`` in which the i-th group's
        rows all carry the id ``i`` (ids start at 0).
    """
    group_index = np.arange(len(group_sizes), dtype=np.int32)
    return group_index.repeat(group_sizes)

# Per-row group ids for both splits.
train_group_id, valid_group_id = (
    expand_group_ids(sizes) for sizes in (train_group, valid_group)
)

# Report id-vector lengths next to frame lengths so a mismatch is visible
# in the cell output before the assertions fire.
for caption, n_rows in (
    ("Expanded train_group_id:", len(train_group_id)),
    ("Expanded valid_group_id:", len(valid_group_id)),
    ("Train_df rows:", len(train_df)),
    ("Valid_df rows:", len(valid_df)),
):
    print(caption, n_rows)

# Every row must have exactly one group id.
assert len(train_group_id) == len(train_df)
assert len(valid_group_id) == len(valid_df)

# Treat all features as numeric (no categorical columns are declared).
cat_features = []

pool_train = Pool(
    train_df[feature_cols],
    label=train_df["label"],
    group_id=train_group_id,
    cat_features=cat_features,
)
pool_valid = Pool(
    valid_df[feature_cols],
    label=valid_df["label"],
    group_id=valid_group_id,
    cat_features=cat_features,
)

print("Pools built:")
print("Train rows:", pool_train.shape[0])
print("Valid rows:", pool_valid.shape[0])

gc.collect()


Expanded train_group_id: 8180082
Expanded valid_group_id: 8372080
Train_df rows: 8180082
Valid_df rows: 8372080
Pools built:
Train rows: 8180082
Valid rows: 8372080


0

In [None]:
# %%
# Cell 5 – GPU CatBoostRanker with eval logging

MODEL_PATH = "../data/outputs/catboost_ranker.model"
LOG_PATH = "../data/outputs/catboost_training_log.txt"

# Hyperparameters collected in one mapping so the run configuration can be
# read (or printed) at a glance.
# NOTE(review): early_stopping_rounds equals iterations, so early stopping
# cannot trigger within this run. The recorded output shows the best iteration
# at 97 of 100 with the metric still rising — consider raising `iterations`.
ranker_params = dict(
    # objective / metric
    loss_function='YetiRank',
    eval_metric='NDCG:top=12',
    # tree shape and regularisation
    learning_rate=0.08,
    depth=10,
    l2_leaf_reg=15,
    min_data_in_leaf=256,
    border_count=128,
    # training length
    iterations=100,
    early_stopping_rounds=100,
    # hardware / reproducibility
    task_type='GPU',
    devices='0',
    random_seed=42,
    # console logging cadence
    verbose=10,
    metric_period=10,
)

model = CatBoostRanker(**ranker_params)
model.fit(pool_train, eval_set=pool_valid)

print("Best iteration:", model.get_best_iteration())

# Persist the fitted model so inference can reload it from disk.
model.save_model(MODEL_PATH)
print("Model saved:", MODEL_PATH)

# ---- LOGGING ----
# Metric history recorded during fit; consumed by the log writer below.
evals = model.get_evals_result()

# Locate the NDCG entry among the recorded validation metrics
def find_ndcg_key(eval_dict):
    """Return the metric name in ``eval_dict`` that refers to NDCG.

    The match is a case-insensitive substring test on each key, in the
    mapping's iteration order; the first hit wins.

    Parameters
    ----------
    eval_dict : dict
        Mapping of metric name -> metric history.

    Returns
    -------
    str
        The first key containing "NDCG". If no key matches, a warning is
        printed and the first key of the mapping is returned instead.

    Raises
    ------
    KeyError
        If ``eval_dict`` has no keys at all.
    """
    ndcg_key = next((name for name in eval_dict if "NDCG" in name.upper()), None)
    if ndcg_key is not None:
        return ndcg_key

    # No NDCG metric recorded: fall back to whatever metric comes first.
    available = list(eval_dict.keys())
    if not available:
        raise KeyError(f"No metrics found in eval_dict. Available keys: {available}")
    print(f"Warning: NDCG not found. Using first available metric: {available[0]}")
    return available[0]

valid_metric_key = find_ndcg_key(evals['validation'])
print(f"Using validation metric: {valid_metric_key}")

# Only the validation history is written; the learn-side history is empty
# for this run.
valid_log = evals['validation'][valid_metric_key]

# CSV with one row per recorded metric value.
# NOTE(review): `iter` is the position in the recorded history — confirm it
# matches the boosting iteration when metric_period > 1.
with open(LOG_PATH, "w") as f:
    f.write("iter,valid_ndcg12\n")
    f.writelines(f"{i},{val}\n" for i, val in enumerate(valid_log))

print("Training log saved successfully:", LOG_PATH)

gc.collect()


Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=12;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.1065279	best: 0.1065279 (0)	total: 329ms	remaining: 32.5s
10:	test: 0.1156121	best: 0.1156121 (10)	total: 1.85s	remaining: 15s
20:	test: 0.1331676	best: 0.1331676 (20)	total: 3.4s	remaining: 12.8s
30:	test: 0.1356985	best: 0.1356985 (30)	total: 4.93s	remaining: 11s
40:	test: 0.1405683	best: 0.1405718 (39)	total: 6.46s	remaining: 9.3s
50:	test: 0.1426873	best: 0.1426873 (50)	total: 8.07s	remaining: 7.76s
60:	test: 0.1439404	best: 0.1440354 (59)	total: 9.64s	remaining: 6.17s
70:	test: 0.1446463	best: 0.1447158 (66)	total: 11.2s	remaining: 4.58s
80:	test: 0.1466995	best: 0.1466995 (80)	total: 12.8s	remaining: 3s
90:	test: 0.1472478	best: 0.1472478 (90)	total: 14.4s	remaining: 1.42s
99:	test: 0.1472252	best: 0.1474337 (97)	total: 15.8s	remaining: 0us
bestTest = 0.1474337369
bestIteration = 97
Shrink model to first 98 iterations.
Best iteration: 97
Model saved: ../data/outputs/catboost_ranker.model
Available eval keys:
Learn keys: []
Validation keys: ['PFound', 'NDCG:top=12;type=

0

In [None]:
# %%
# Cell 6 – cleanup after training
# Drop the notebook's references to both training pools before the
# submission-week features are loaded, then run a garbage-collection pass.
del pool_train
del pool_valid
gc.collect()


736

In [None]:
# %%
# Cell 7 – inference on submission week

print("\nLoading model for inference...")
# Reload from disk so inference uses exactly the artifact that was saved.
model = CatBoostRanker()
model.load_model(MODEL_PATH)

print("Loading submission week features...")
# Read only the identifier columns plus the model features.
id_cols = ["customer_id", "article_id"]
data = pd.read_parquet(FEATURES_PATH, columns=id_cols + feature_cols)

data["customer_id"] = data["customer_id"].astype("int64")
data["article_id"] = data["article_id"].astype("int32")

# Same float32 down-cast as applied to the training frames, done one column
# at a time.
float_cols = [c for c in feature_cols if pd.api.types.is_float_dtype(data[c])]
for c in float_cols:
    data[c] = data[c].astype("float32")

infer_pool = Pool(data[feature_cols], cat_features=cat_features)

print("Running inference...")
data["score"] = model.predict(infer_pool).astype(np.float32)

# The pool is no longer needed once scores are attached to the frame.
del infer_pool
gc.collect()
print("Scored data rows:", len(data))



Loading model for inference...
Loading submission week features...
Running inference...
Scored data rows: 250982495


In [None]:
# %%
# Cell 8 – top-12 per customer

# Order rows by customer, best score first, so head(12) keeps each customer's
# 12 highest-scored articles.
data = data.sort_values(["customer_id", "score"], ascending=[True, False])

# Build the string column with .assign so it is created on a new frame.
# Assigning onto the result of groupby().head() directly raises pandas'
# SettingWithCopyWarning.
top12 = (
    data.groupby("customer_id", group_keys=False)
        .head(12)
        .assign(
            # Article ids are written as 10-character, zero-padded strings.
            article_id_str=lambda df: df["article_id"].astype(str).str.zfill(10)
        )
)

# One row per customer: space-separated article ids in score order.
pred_df = (
    top12.groupby("customer_id")["article_id_str"]
         .agg(" ".join)
         .reset_index()
         .rename(columns={"customer_id": "customer_id_int",
                          "article_id_str": "prediction"})
)

pred_df["customer_id_int"] = pred_df["customer_id_int"].astype("int64")

print("Predictions built:", len(pred_df))

# The scored candidate frame is large; release it once predictions exist.
del top12, data
gc.collect()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top12["article_id_str"] = top12["article_id"].astype(str).str.zfill(10)


Predictions built: 1371980


0

In [None]:
# %%
# Cell 9 – merge with sample + fallback

def hex16_to_int(s):
    """Map a hex string to a signed 64-bit integer using its last 16 hex digits.

    The trailing (up to) 16 characters of ``s`` are parsed as an unsigned
    hexadecimal number and reinterpreted as a signed 64-bit value, so inputs
    whose top bit is set come out negative.

    Parameters
    ----------
    s : str
        Hexadecimal string; only ``s[-16:]`` is used.

    Returns
    -------
    numpy.int64
    """
    low64 = int(s[-16:], 16)
    # uint64 -> int64 cast wraps values >= 2**63 into the negative range.
    return np.uint64(low64).astype(np.int64)

# The sample submission supplies the full customer list and the original
# hexadecimal customer ids required in the output file.
sample = pd.read_csv(SAMPLE_PATH)
sample["customer_id_int"] = sample["customer_id"].apply(hex16_to_int)
# Discard the placeholder predictions shipped with the sample file.
sample = sample.drop(columns=["prediction"], errors="ignore")

pred_df["customer_id_int"] = pred_df["customer_id_int"].astype("int64")

print("Sample customers:", sample["customer_id_int"].nunique())
print("Predicted customers:", pred_df["customer_id_int"].nunique())

# Left join keeps every sample customer. validate="many_to_one" makes pandas
# raise if pred_df has duplicate keys, which would otherwise silently
# duplicate submission rows.
sub = sample.merge(
    pred_df, how="left", on="customer_id_int", validate="many_to_one"
)
print("Predictions matched:", sub["prediction"].notna().sum())

del sample, pred_df
gc.collect()

# Read the general (fallback) prediction; the context manager closes the
# file handle deterministically.
with open(GENERAL_PATH) as f:
    gp = json.load(f)
fallback_str = gp["general_pred_str"]
fallback_items = fallback_str.split()

# Customers without any model prediction receive the general prediction.
sub["prediction"] = sub["prediction"].fillna(fallback_str)


Sample customers: 1371980
Predicted customers: 1371980
Predictions matched: 1371980


In [None]:
# %%
# Cell 10 – pad to 12 and save

def pad_to_12(pred, fallback=None, k=12):
    """Trim or pad a space-separated prediction string to ``k`` unique articles.

    Parameters
    ----------
    pred : str or None/NaN
        Space-separated article ids, best first. A non-string value (e.g. a
        missing prediction) is treated as an empty prediction.
    fallback : iterable of str, optional
        Article ids used to fill the remaining slots, in priority order.
        Defaults to the notebook-level ``fallback_items`` list, so existing
        single-argument calls such as ``Series.apply(pad_to_12)`` keep working.
    k : int, default 12
        Number of articles to return.

    Returns
    -------
    str
        At most ``k`` space-separated article ids with no repeats: the ids
        from ``pred`` in their original order, followed by fallback ids not
        already present. Fewer than ``k`` ids are returned only when
        ``fallback`` runs out.
    """
    if fallback is None:
        # Resolved at call time so the default follows the current global.
        fallback = fallback_items

    raw_items = pred.split() if isinstance(pred, str) else []
    # Order-preserving de-duplication: a repeated article would waste a slot.
    items = list(dict.fromkeys(raw_items))
    if len(items) >= k:
        return " ".join(items[:k])

    seen = set(items)
    for art in fallback:
        if len(items) >= k:
            break
        if art not in seen:
            items.append(art)
            seen.add(art)
    return " ".join(items)

# Normalise every prediction to 12 articles.
sub["prediction"] = sub["prediction"].map(pad_to_12)

# Only the original customer id and the prediction string are written out.
sub.loc[:, ["customer_id", "prediction"]].to_csv(SUB_PATH, index=False)
print("Submission saved to:", SUB_PATH)

# Preview the first rows as the cell result.
sub.head()


Submission saved to: ../data/submission/catboost_ranker_submission.csv


Unnamed: 0,customer_id,customer_id_int,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,6883939031699146327,0568601006 0568601043 0568601044 0568601007 09...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,-7200416642310594310,0673677002 0918522001 0924243001 0918292001 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,-6846340800584936,0794321007 0794321011 0794321008 0918522001 08...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,-94071612138601410,0804992017 0794321011 0805000001 0754238024 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,-283965518499174310,0896152002 0791587015 0730683050 0730683062 08...
