In [None]:
# %%
# Cell 1
import numpy as np
import pandas as pd
import gc
import json
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRanker, Pool

# --- Input / output locations (all relative to the notebook's working directory) ---
# Parquet file of candidate rows; opened with pyarrow's ParquetFile for prediction.
FEATURES_PATH = "../data/outputs/features_week=20200922.parquet"
# Saved ranker models, loaded with CatBoostRanker.load_model, lgb.Booster and xgb.Booster.
CAT_MODEL_PATH = "../data/outputs/catboost_ranker.model"
LGB_MODEL_PATH = "../data/outputs/lgbm_ranker.model"
XGB_MODEL_PATH = "../data/outputs/xgb_ranker.model"
# Sample submission CSV; supplies the customer_id rows the submission is built on.
SAMPLE_PATH = "../data/input_data/sample_submission.csv"
# JSON file whose "general_pred_str" entry is the fallback prediction string.
GENERAL_PATH = "../data/outputs/general_pred_str.json"
# Destination CSV for the final ensemble submission.
OUT_PATH = "../data/submission/ensemble_submission.csv"

  from optuna import progress_bar as pbar_module
  import rmm
  from rmm import mr


In [None]:
# Cell 2

import pyarrow.parquet as pq
from tqdm import tqdm

print("Preparing parquet reader...")

# The dataset metadata lists, under "model_features", the columns fed to the models.
with open("../data/outputs/dataset_meta.json") as meta_file:
    meta = json.load(meta_file)
feature_cols = meta["model_features"]

# Open the features file lazily; only the footer metadata is needed for the row count.
pf = pq.ParquetFile(FEATURES_PATH)
num_rows = pf.metadata.num_rows

print(f"Total rows: {num_rows}")
print(f"Num feature columns: {len(feature_cols)}")

# --- Restore the three saved rankers from disk ---
print("Loading CatBoost...")
cat_model = CatBoostRanker()
cat_model.load_model(CAT_MODEL_PATH)

print("Loading LightGBM...")
lgb_model = lgb.Booster(model_file=LGB_MODEL_PATH)

print("Loading XGBoost...")
xgb_model = xgb.Booster()
xgb_model.load_model(XGB_MODEL_PATH)


Preparing parquet reader...
Total rows: 250982495
Num feature columns: 34
Loading CatBoost...
Loading LightGBM...
Loading XGBoost...


  xgb_model.load_model(XGB_MODEL_PATH)


In [None]:
# Cell 3

print("Running batch predictions...")

# Author's recorded runtime note for this cell: 152 min.
BATCH = 20_000_000  # minimum rows buffered before a prediction pass; depends on memory capacity

# Disk-backed float32 score buffers, one slot per row of the features file.
# mode="w+" creates (or overwrites) the backing files in the working directory.
scores_cat = np.memmap("tmp_cat.bin", dtype="float32", mode="w+", shape=(num_rows,))
scores_lgb = np.memmap("tmp_lgb.bin", dtype="float32", mode="w+", shape=(num_rows,))
scores_xgb = np.memmap("tmp_xgb.bin", dtype="float32", mode="w+", shape=(num_rows,))

row_ptr = 0      # next unwritten position in the score buffers
buffer = []      # float32 feature frames waiting to be scored
buffer_rows = 0  # total rows currently held in `buffer`
last_rg = pf.num_row_groups - 1  # hoisted: the final row group always triggers a pass

for rg in tqdm(range(pf.num_row_groups), desc="Row-groups"):

    # read row group
    rg_df = pf.read_row_group(rg).to_pandas()

    # Enforce features: keep only the model columns (in model order), create any
    # column missing from the file as 0.0, and downcast immediately so the buffer
    # holds nothing but float32 feature data.
    buffer.append(
        rg_df.reindex(columns=feature_cols, fill_value=0.0).astype("float32")
    )
    buffer_rows += len(rg_df)

    # Keep accumulating until the batch is large enough (or the file is exhausted).
    if buffer_rows < BATCH and rg != last_rg:
        continue

    X = pd.concat(buffer, ignore_index=True)
    n = len(X)

    # CatBoost
    scores_cat[row_ptr:row_ptr+n] = cat_model.predict(Pool(X)).astype("float32")

    # LightGBM
    scores_lgb[row_ptr:row_ptr+n] = lgb_model.predict(
        X, num_iteration=lgb_model.best_iteration
    ).astype("float32")

    # XGBoost
    dmat = xgb.DMatrix(X)
    scores_xgb[row_ptr:row_ptr+n] = xgb_model.predict(
        dmat,
        iteration_range=(0, xgb_model.best_iteration + 1)
    ).astype("float32")

    row_ptr += n
    buffer = []
    buffer_rows = 0

    # Release the batch before the next one is assembled.
    del X, dmat
    gc.collect()

# Push pending writes to the backing files so that re-opening tmp_*.bin
# (as the submission cell does) sees the complete scores.
for _scores in (scores_cat, scores_lgb, scores_xgb):
    _scores.flush()

# Every row must have received a score; a mismatch means unscored (zero) slots.
assert row_ptr == num_rows, f"scored {row_ptr} rows, expected {num_rows}"

Running batch predictions...


Row-groups: 100%|██████████| 240/240 [1:42:29<00:00, 25.62s/it]  


In [None]:
# Relative importance of each model. The values do not have to sum to 1.0
# (validation scores can be used directly) because they are normalized below.
s_xgb = 0.20
s_lgb = 0.30
s_cat = 0.50

# Rescale so the three weights add up to 1.0.
total = s_xgb + s_lgb + s_cat
w_xgb, w_lgb, w_cat = (raw / total for raw in (s_xgb, s_lgb, s_cat))

print(f"Weights: {w_cat} {w_lgb} {w_xgb}")

# Weighted blend, accumulated in the same order as cat + lgb + xgb.
ensemble_scores = w_cat * scores_cat
ensemble_scores += w_lgb * scores_lgb
ensemble_scores += w_xgb * scores_xgb

Weights: 0.5 0.3 0.2


In [None]:
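## Optional: when re-running with new weights, uncomment the next line so the
## following cell rebuilds pred_df instead of reusing the stale one.
# del pred_df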

In [None]:
# Cell 4

import json
import numpy as np

# Rebuild pred_df if missing.
# NOTE: the guard reuses any pred_df already in the kernel, so after changing the
# ensemble weights pred_df must be deleted first or the old predictions are kept.
if "pred_df" not in locals():
    import pyarrow.parquet as pq

    pf = pq.ParquetFile(FEATURES_PATH)
    num_rows = pf.metadata.num_rows
    # Only the identifier columns are needed here; scores come from the memmaps.
    base_df = pf.read(columns=["customer_id", "article_id"]).to_pandas()

    # Re-open the per-model score files read-only.
    scores_cat = np.memmap("tmp_cat.bin", dtype="float32", mode="r", shape=(num_rows,))
    scores_lgb = np.memmap("tmp_lgb.bin", dtype="float32", mode="r", shape=(num_rows,))
    scores_xgb = np.memmap("tmp_xgb.bin", dtype="float32", mode="r", shape=(num_rows,))

    ensemble_scores = (w_cat * scores_cat) + (w_lgb * scores_lgb) + (w_xgb * scores_xgb)

    base_df["score"] = ensemble_scores
    # Best-scored candidates first within each customer.
    df_sorted = base_df.sort_values(["customer_id", "score"], ascending=[True, False])
    # .assign builds a new frame, so no column is written into the groupby/head
    # slice (the in-place assignment raised SettingWithCopyWarning).
    top12 = (
        df_sorted.groupby("customer_id", group_keys=False)
        .head(12)
        .assign(article_id_str=lambda d: d["article_id"].astype(str).str.zfill(10))
    )

    # One row per customer: space-separated, zero-padded article ids in rank order.
    pred_df = (
        top12.groupby("customer_id")["article_id_str"]
        .agg(" ".join)
        .reset_index()
        .rename(columns={"customer_id": "customer_id_int", "article_id_str": "prediction"})
    )

def hex16_to_int64(s):
    """Map the last 16 hex digits of ``s`` to a signed 64-bit integer.

    The trailing 16 characters are parsed as an unsigned 64-bit value and then
    reinterpreted as int64, so values of 2**63 and above wrap to negatives.

    Args:
        s: Hexadecimal string; only ``s[-16:]`` is used.

    Returns:
        numpy.int64 holding the wrapped value.
    """
    tail = s[-16:]
    as_unsigned = np.uint64(int(tail, 16))
    return as_unsigned.astype(np.int64)

# -------------------------------
# Load fallback data
# -------------------------------
# The context manager closes the file handle once the JSON has been parsed.
with open(GENERAL_PATH) as gp_file:
    gp = json.load(gp_file)
fallback_str = gp["general_pred_str"]
fallback_items = fallback_str.split()

# Accept a pred_df that still carries the pre-rename column names.
if "article_id_str" in pred_df.columns and "prediction" not in pred_df.columns:
    pred_df = pred_df.rename(columns={"article_id_str": "prediction"})

if "customer_id" in pred_df.columns and "customer_id_int" not in pred_df.columns:
    pred_df = pred_df.rename(columns={"customer_id": "customer_id_int"})

# The join key must be int64 on both sides of the merge.
pred_df["customer_id_int"] = pred_df["customer_id_int"].astype("int64")

print("pred_df ready, rows:", len(pred_df))

# -------------------------------
# Load sample and convert IDs correctly
# -------------------------------
sample = pd.read_csv(SAMPLE_PATH)

# Derive the integer join key from the customer_id string.
sample["customer_id_int"] = sample["customer_id"].apply(hex16_to_int64)
# Drop the placeholder predictions so the merged column is the only "prediction".
sample = sample.drop(columns=["prediction"], errors="ignore")

print("Sample rows:", len(sample))

# -------------------------------
# Merge
# -------------------------------
# validate="many_to_one" makes pandas raise if pred_df holds duplicate keys,
# which would otherwise silently duplicate rows in the submission.
sub = sample.merge(
    pred_df, how="left", on="customer_id_int", validate="many_to_one"
)

print("Matched predictions:", sub["prediction"].notna().sum())
print("Expected:", len(sample))

# -------------------------------
# Apply fallback + pad
# -------------------------------
# Customers without any model prediction receive the general fallback string.
sub["prediction"] = sub["prediction"].fillna(fallback_str)

def pad12(pred, fallback=None, k=12):
    """Truncate or pad a space-separated prediction string to ``k`` items.

    Items already present are kept in order. If fewer than ``k`` remain, items
    from ``fallback`` are appended in order, skipping anything already in the
    list (including fallback items added earlier in the same call, so a
    fallback list with repeats cannot introduce duplicates).

    Args:
        pred: Space-separated article ids.
        fallback: Iterable of article ids used for padding. Defaults to the
            module-level ``fallback_items``.
        k: Target number of items (default 12).

    Returns:
        Space-separated string of at most ``k`` items; shorter only when the
        fallback runs out of unused items.
    """
    if fallback is None:
        fallback = fallback_items

    items = pred.split()
    if len(items) >= k:
        return " ".join(items[:k])

    used = set(items)
    for art in fallback:
        if art in used:
            continue
        used.add(art)  # remember it so a repeated fallback entry is not added twice
        items.append(art)
        if len(items) == k:
            break
    return " ".join(items)

# Bring every prediction to its final length using pad12.
sub["prediction"] = sub["prediction"].map(pad12)

# Write only the two submission columns, without the index.
sub.to_csv(OUT_PATH, columns=["customer_id", "prediction"], index=False)
print("Saved ensemble submission:", OUT_PATH)
sub.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top12["article_id_str"] = top12["article_id"].astype(str).str.zfill(10)


pred_df ready, rows: 1371980
Sample rows: 1371980
Matched predictions: 1371980
Expected: 1371980
Saved ensemble submission: ../data/submission/ensemble_submission.csv


Unnamed: 0,customer_id,customer_id_int,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,6883939031699146327,0568601043 0568601044 0568601006 0568601007 08...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,-7200416642310594310,0673677002 0918522001 0448509014 0706016001 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,-6846340800584936,0794321007 0794321011 0794321008 0851400020 08...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,-94071612138601410,0794321011 0730683050 0804992017 0805000001 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,-283965518499174310,0896152002 0730683050 0730683062 0896152001 07...
