In [None]:
# ===============================================================
# XGBoost Modeling
# Outputs (under ./models & ./results & ./submissions)
# ===============================================================

import os
import gc
import json
import pickle
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import RidgeCV

from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args

import xgboost as xgb

# ---- Reproducibility ----
SEED = 42
np.random.seed(SEED)  # seeds NumPy's legacy global RNG

# ---- Directory layout (all relative to the working directory) ----
DATA_DIR = Path("./data")
EMBEDDINGS_DIR = Path("./embeddings")
MODELS_DIR = Path("./models")
SUBMISSIONS_DIR = Path("./submissions")
RESULTS_DIR = Path("./results")

# ---- Tabular inputs ----
TRAIN_PATH = DATA_DIR.joinpath("train_preprocessed.csv")
TEST_PATH  = DATA_DIR.joinpath("test_preprocessed.csv")

# ---- Text artifacts, stored per embedding model ----
EMBEDDINGS_MODEL = "all-mpnet-base-v2"
TEXT_PCA_TRAIN = Path(EMBEDDINGS_DIR, EMBEDDINGS_MODEL, "pca", "text_pca_train.parquet")
TEXT_PCA_TEST  = Path(EMBEDDINGS_DIR, EMBEDDINGS_MODEL, "pca", "text_pca_test.parquet")
TEXT_SEV_OOF   = Path(EMBEDDINGS_DIR, EMBEDDINGS_MODEL, "severity", "text_severity_oof.parquet")
TEXT_SEV_TEST  = Path(EMBEDDINGS_DIR, EMBEDDINGS_MODEL, "severity", "text_severity_test.parquet")

# ---- Feature-group switches ----
INCLUDE_IICC = True
INCLUDE_TEXT = True

# ---- Column names ----
TARGET_COL = "UltimateIncurredClaimCost"
ID_COL     = "ClaimNumber"
STRAT_COL  = "accident_year"

# ---- Search budget and number of trials kept for stacking ----
N_INITIAL_POINTS = 10
N_CALLS          = 30
N_BEST_MODELS    = 4

# ---- Hold-out fraction ----
VALID_SIZE = 0.20

# ---- Run naming: the suffix encodes the active feature groups and the embedding model ----
TEST_SUFFIX = "_".join([
    "IICC" if INCLUDE_IICC else "noIICC",
    "TEXT" if INCLUDE_TEXT else "noTEXT",
    EMBEDDINGS_MODEL,
])
RUN_SUFFIX  = "xgb_gp_stack_" + TEST_SUFFIX

DIR_MODEL = MODELS_DIR / RUN_SUFFIX

# Create every output directory up front; existing directories are left alone.
for p in (DIR_MODEL, SUBMISSIONS_DIR, RESULTS_DIR):
    p.mkdir(parents=True, exist_ok=True)

def rmse_orig(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    No transformation is applied here; the inputs are scored as given.
    """
    score = root_mean_squared_error(y_true, y_pred)
    return score

def to_log(y):
    """Map values to log space: floor at 0, then apply ``log1p``."""
    non_negative = np.maximum(y, 0.0)
    return np.log1p(non_negative)

def from_log(p):
    """Invert ``log1p`` via ``expm1`` and floor the result at 0."""
    restored = np.expm1(p)
    return np.maximum(restored, 0.0)

def safe_replace_inf(df_like):
    """Replace +inf / -inf with NaN in place and return the same object."""
    infinities = [np.inf, -np.inf]
    df_like.replace(infinities, np.nan, inplace=True)
    return df_like


In [28]:
# ===============================================================
# 01) Data Loading & Text Merge
# ===============================================================
def load_data(train_path: Path, test_path: Path):
    """Read the train and test CSV files and replace infinite values with NaN.

    Returns:
        ``(train_frame, test_frame)`` in that order.
    """
    # Both files are read first, then cleaned, in train -> test order.
    frames = [pd.read_csv(csv_path) for csv_path in (train_path, test_path)]
    train_frame, test_frame = (safe_replace_inf(frame) for frame in frames)
    return train_frame, test_frame

def maybe_merge_text(df: pd.DataFrame, dft: pd.DataFrame, include_text: bool):
    """Optionally left-join the text-derived features onto the train and test frames.

    Args:
        df: Training frame; must contain ``ID_COL`` when ``include_text`` is true.
        dft: Test frame; must contain ``ID_COL`` when ``include_text`` is true.
        include_text: When false, both frames are returned untouched and no
            file is read.

    Returns:
        ``(df_merged, dft_merged)``. With ``include_text`` true these are new
        frames with the PCA and severity columns appended; the severity column
        is named ``text_sev`` in both.

    Raises:
        pandas.errors.MergeError: If any text artifact contains a repeated
            ``ID_COL`` value. Without this check a left join would silently
            duplicate rows and misalign everything built from the frames later.
    """
    if not include_text:
        return df, dft

    pca_tr = pd.read_parquet(TEXT_PCA_TRAIN)
    pca_te = pd.read_parquet(TEXT_PCA_TEST)
    # Train and test severity files use different column names; unify them.
    sev_tr = pd.read_parquet(TEXT_SEV_OOF).rename(columns={"text_sev_oof": "text_sev"})
    sev_te = pd.read_parquet(TEXT_SEV_TEST).rename(columns={"text_sev_pred": "text_sev"})

    # many_to_one: each key may appear at most once on the right-hand side, so
    # the left join cannot increase the row count.
    join_kwargs = dict(on=ID_COL, how="left", validate="many_to_one")

    df_merged  = df.merge(pca_tr, **join_kwargs).merge(sev_tr, **join_kwargs)
    dft_merged = dft.merge(pca_te, **join_kwargs).merge(sev_te, **join_kwargs)
    return df_merged, dft_merged


In [29]:
# ===============================================================
# 02) Feature Selection
# ===============================================================
def select_features(df: pd.DataFrame, include_iicc: bool, include_text: bool):
    """Build the model feature lists from the columns actually present in ``df``.

    Args:
        df: Frame whose columns decide which candidate features are kept.
        include_iicc: Whether to add the initial-incurred-cost columns.
        include_text: Whether to add ``text_pca_*`` columns and the text
            length / severity columns.

    Returns:
        ``(feat_cols, numeric_cols, categorical_cols)``. ``feat_cols`` is
        ordered base -> flags -> IICC -> text -> categorical; ``numeric_cols``
        is ``feat_cols`` minus the categorical columns.
    """
    def present(names):
        # Keep only the names that exist as columns, preserving their order.
        return [name for name in names if name in df.columns]

    base_feats = [
        "WeeklyWages", "hourly_wage", "HoursWorkedPerWeek", "DaysWorkedPerWeek",
        "Age", "DependentChildren", "DependentsOther",
        "accident_year", "accident_month", "accident_dow", "accident_hour",
        "report_delay_days", "is_weekend",
    ]
    cat_feats = present(["Gender", "MaritalStatus", "PartTimeFullTime"])
    flag_feats = present([
        "iicc_is_one_flag", "iicc_small_flag",
        "inconsistent_wages_flag", "invalid_exposure_flag",
    ])

    iicc_feats = present(["InitialIncurredClaimsCost", "logIICC"]) if include_iicc else []

    if include_text:
        # PCA components come first (in frame order), then the scalar text columns.
        text_feats = [col for col in df.columns if col.startswith("text_pca_")]
        text_feats.extend(present(["desc_char_len", "desc_token_len", "text_sev"]))
    else:
        text_feats = []

    feat_cols = present(base_feats + flag_feats + iicc_feats + text_feats + cat_feats)
    numeric_cols = [col for col in feat_cols if col not in cat_feats]
    categorical_cols = list(cat_feats)
    return feat_cols, numeric_cols, categorical_cols


In [30]:
# ===============================================================
# 03) Split & Preprocessor
# ===============================================================

def split_data(df: pd.DataFrame, strat_col: str, valid_size: float, seed: int):
    """Split ``df`` into train and validation frames, stratified on ``strat_col``.

    Args:
        df: Frame to split.
        strat_col: Column used for stratification; it is cast to ``int`` first.
        valid_size: Passed to ``train_test_split`` as ``test_size``.
        seed: Passed to ``train_test_split`` as ``random_state``.

    Returns:
        ``(train_df, valid_df)``. Both have a fresh 0..n-1 index, so the
        original row labels of ``df`` are not preserved.

    Raises:
        AssertionError: If ``strat_col`` is not a column of ``df``.
    """
    assert strat_col in df.columns, f"Missing strat column {strat_col}"

    positions = np.arange(len(df))
    strata = df[strat_col].astype(int)
    train_rows, valid_rows = train_test_split(
        positions,
        test_size=valid_size,
        random_state=seed,
        stratify=strata,
    )

    train_part = df.iloc[train_rows].reset_index(drop=True)
    valid_part = df.iloc[valid_rows].reset_index(drop=True)
    return train_part, valid_part

def build_preprocessor(categorical_cols, numeric_cols):
    """Create an unfitted ``ColumnTransformer`` for the model inputs.

    Categorical columns are one-hot encoded (unknown categories are ignored at
    transform time), numeric columns pass through unchanged, and every other
    column is dropped.
    """
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    steps = [
        ("cat", encoder, categorical_cols),
        ("num", "passthrough", numeric_cols),
    ]
    return ColumnTransformer(
        transformers=steps,
        remainder="drop",
        sparse_threshold=0.0,
    )

def build_design_matrices(preprocessor, feat_cols, train_df, valid_df, test_df):
    """Fit ``preprocessor`` on the training features and transform all three frames.

    The preprocessor is fitted on ``train_df[feat_cols]`` only; the validation
    and test frames are transformed with that fitted state, in that order.

    Returns:
        ``(X_train, X_valid, X_test)``.
    """
    X_train = preprocessor.fit_transform(train_df[feat_cols])
    X_valid, X_test = (
        preprocessor.transform(frame[feat_cols]) for frame in (valid_df, test_df)
    )
    return X_train, X_valid, X_test


In [31]:
# ===============================================================
# 04) Bayesian Optimization on Validation
# ===============================================================
def bayes_opt_xgb(
    X_train, y_train_log, X_valid, y_valid, n_calls: int, n_init: int, seed: int
):
    """Tune XGBoost hyper-parameters with ``gp_minimize`` against a validation set.

    Each trial fits an ``XGBRegressor`` on the log-scale training target,
    back-transforms its validation predictions with ``from_log`` and is scored
    by RMSE against ``y_valid`` on the original scale.

    Args:
        X_train, y_train_log: Training matrix and log-scale training target.
        X_valid, y_valid: Validation matrix and original-scale validation target.
        n_calls: Total number of objective evaluations.
        n_init: Number of initial points evaluated before the surrogate is used.
        seed: Random state for both the optimizer and every model.

    Returns:
        ``(res, trial_params, trial_valid_preds, trial_scores)`` where the three
        lists hold one entry per trial, in evaluation order.
    """
    # Search space; dimension names double as XGBRegressor keyword arguments.
    space  = [
        Integer(3, 12,     name="max_depth"),
        Real(0.01, 0.3,    name="learning_rate"),
        Integer(400, 3000, name="n_estimators"),
        Real(0.5, 1.0,     name="subsample"),
        Real(0.5, 1.0,     name="colsample_bytree"),
        Real(1e-8, 50.0,   name="min_child_weight"),
        Real(0.0, 10.0,    name="reg_alpha"),
        Real(0.0, 20.0,    name="reg_lambda"),
        Real(0.0, 10.0,    name="gamma"),
    ]

    # Per-trial history, filled by the objective as a side effect.
    trial_params, trial_valid_preds, trial_scores = [], [], []
    y_valid_log = to_log(y_valid)

    # Settings shared by every trial.
    fixed_kwargs = dict(
        objective="reg:squarederror",
        eval_metric="rmse",
        random_state=seed,
        n_jobs=os.cpu_count(),
        tree_method="hist",
    )

    @use_named_args(space)
    def objective(**params):
        booster = xgb.XGBRegressor(**fixed_kwargs, **params)
        booster.fit(
            X_train,
            y_train_log,
            eval_set=[(X_valid, y_valid_log)],
            verbose=False,
        )
        valid_pred = from_log(booster.predict(X_valid))
        valid_rmse = rmse_orig(y_valid, valid_pred)

        trial_params.append(params)
        trial_valid_preds.append(valid_pred)
        trial_scores.append(valid_rmse)
        return valid_rmse

    res = gp_minimize(
        func=objective,
        dimensions=space,
        n_calls=n_calls,
        n_initial_points=n_init,
        random_state=seed,
        acq_func="EI",
        noise="gaussian",
    )
    return res, trial_params, trial_valid_preds, trial_scores


In [None]:
# ===============================================================
# 05) Stacker fit (RidgeCV) on Validation set
# ===============================================================
def fit_stacker_ridgecv(P_valid, y_valid):
    """Fit a ``RidgeCV`` stacker on base-model predictions.

    The penalty is chosen by 5-fold CV over 13 log-spaced values from 1e-3 to
    1e3, scored by negative MSE.

    Returns:
        ``(ridge, valid_blend)`` where ``valid_blend`` is the fitted stacker's
        prediction on the same ``P_valid`` it was trained on.
    """
    alpha_grid = np.logspace(-3, 3, 13)
    ridge = RidgeCV(alphas=alpha_grid, cv=5, scoring="neg_mean_squared_error")
    ridge.fit(P_valid, y_valid)
    return ridge, ridge.predict(P_valid)


In [33]:
# ===============================================================
# 06) Full-data refit of top models + test prediction
# ===============================================================
def refit_and_predict_full(
    df_full, dft, feat_cols, cat_feats, params_list, seed
):
    """Refit every selected configuration on all labelled data and predict the test set.

    Args:
        df_full: Labelled frame containing ``feat_cols`` and ``TARGET_COL``.
        dft: Test frame containing ``feat_cols``.
        feat_cols: Feature columns fed to the preprocessor.
        cat_feats: Categorical feature names; those present in ``df_full`` are
            one-hot encoded, every other entry of ``feat_cols`` passes through.
        params_list: One dict of ``XGBRegressor`` keyword arguments per model.
        seed: ``random_state`` for every model.

    Returns:
        ``(P_test, bundle)``. ``P_test`` is a float64 array with one column of
        original-scale test predictions per entry of ``params_list``.
        ``bundle`` holds the fitted preprocessor, the feature lists, the
        parameter dicts, the seed and, under ``"models"``, the fitted
        regressors in ``params_list`` order, so inference does not need a refit.
    """
    # Full-data preprocessor (fitted here, independently of any train/valid split)
    cat_cols = [c for c in cat_feats if c in df_full.columns]
    num_cols = [c for c in feat_cols if c not in cat_feats]
    preprocessor_full = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
            ("num", "passthrough", num_cols),
        ],
        remainder="drop",
        sparse_threshold=0.0
    )
    X_full  = preprocessor_full.fit_transform(df_full[feat_cols])
    X_testF = preprocessor_full.transform(dft[feat_cols])

    # Models are trained on the log-scale target and back-transformed at predict time.
    y_full   = df_full[TARGET_COL].values
    y_full_l = to_log(y_full)

    # Fit each top model, keeping the fitted estimator alongside its test predictions
    fitted_models = []
    test_pred_matrix = []
    for params in params_list:
        model = xgb.XGBRegressor(
            objective="reg:squarederror",
            eval_metric="rmse",
            random_state=seed,
            n_jobs=os.cpu_count(),
            tree_method="hist",
            **params
        )
        model.fit(X_full, y_full_l, verbose=False)
        fitted_models.append(model)
        test_pred_matrix.append(from_log(model.predict(X_testF)))

    P_test = np.column_stack(test_pred_matrix).astype(np.float64)

    # Everything needed to reproduce test-time inference. "models" is an
    # additional key; the pre-existing keys are unchanged.
    bundle = {
        "preprocessor_full": preprocessor_full,
        "feat_cols": feat_cols,
        "cat_feats": cat_feats,
        "top_params": params_list,
        "seed": seed,
        "models": fitted_models,
    }
    return P_test, bundle


In [None]:
# ===============================================================
# 07) Orchestration
# ===============================================================

# 1) Load & merge text
df, dft = load_data(TRAIN_PATH, TEST_PATH)
df, dft = maybe_merge_text(df, dft, INCLUDE_TEXT)

# 2) Select features
feat_cols, numeric_cols, categorical_cols = select_features(df, INCLUDE_IICC, INCLUDE_TEXT)

print("Total features:", len(feat_cols))
print("Sample:", feat_cols[:15], "...")

# 3) Split
# split_data() resets the index of both partitions, so train_df.index and
# valid_df.index are both 0..n-1 and cannot identify the original rows. The
# original row position is therefore carried through the split in a temporary
# column and removed again right after, leaving train_df / valid_df with the
# same columns as df.
ORIG_ROW_COL = "__orig_row__"
train_df, valid_df = split_data(
    df.assign(**{ORIG_ROW_COL: np.arange(len(df))}), STRAT_COL, VALID_SIZE, SEED
)
train_rows = train_df.pop(ORIG_ROW_COL).to_numpy()
valid_rows = valid_df.pop(ORIG_ROW_COL).to_numpy()
y_train = train_df[TARGET_COL].values
y_valid = valid_df[TARGET_COL].values

# 4) Preprocessor & matrices (preprocessor is fitted on the TRAIN split only)
pre = build_preprocessor(categorical_cols, numeric_cols)
X_train, X_valid, X_test = build_design_matrices(pre, feat_cols, train_df, valid_df, dft)
y_train_log = to_log(y_train)
y_valid_log = to_log(y_valid)

print("Train/Valid sizes:", len(train_df), len(valid_df))
print("X shapes:", X_train.shape, X_valid.shape, X_test.shape)

# 5) gp_minimize on VALID
res, trial_params, trial_valid_preds, trial_scores = bayes_opt_xgb(
    X_train, y_train_log, X_valid, y_valid, n_calls=N_CALLS, n_init=N_INITIAL_POINTS, seed=SEED
)
print("Best validation RMSE:", res.fun)
print("Best params:", res.x)

# 6) Select best K trials and stack
order = np.argsort(np.array(trial_scores))  # ascending: lowest RMSE first
top_k = order[:N_BEST_MODELS]
print("Top trials (score):")
for i, idx in enumerate(top_k, 1):
    print(f"{i}: {trial_scores[idx]:.4f} | params: {trial_params[idx]}")

# One column of validation predictions per selected trial.
P_valid = np.column_stack([trial_valid_preds[idx] for idx in top_k]).astype(np.float64)
ridge, valid_blend = fit_stacker_ridgecv(P_valid, y_valid)
print("Ridge weights:", ridge.coef_, "| intercept:", ridge.intercept_)
# NOTE: the stacker was fitted on these same validation predictions, so the
# RMSE printed below is in-sample for the stacker.
print("Stacked VALID RMSE:", rmse_orig(y_valid, valid_blend))

# TRAIN preds matrix for stacker
# The trial models were not kept by bayes_opt_xgb, so each selected
# configuration is refitted on the TRAIN split to get its TRAIN predictions.
P_train_list = []
for idx in top_k:
    params = trial_params[idx]
    m = xgb.XGBRegressor(
        objective="reg:squarederror",
        eval_metric="rmse",
        random_state=SEED,
        n_jobs=os.cpu_count(),
        tree_method="hist",
        **params
    )
    # Fit on TRAIN split (same as gp trials) and predict TRAIN
    m.fit(X_train, y_train_log, verbose=False)
    p_tr = from_log(m.predict(X_train))
    P_train_list.append(p_tr)

P_train = np.column_stack(P_train_list).astype(np.float64)
train_blend = ridge.predict(P_train)
train_blend = np.maximum(train_blend, 0.0)  # the linear blend can go negative

# Build full training predictions (train + valid), with trained_on flag.
# "idx" is the row position in df, so sorting on it restores the original order.
pred_train_df = pd.DataFrame({
    "idx": train_rows,
    "trained_on": 1,
    "y_true": y_train,
    "y_pred": train_blend
})
pred_valid_df = pd.DataFrame({
    "idx": valid_rows,
    "trained_on": 0,
    "y_true": y_valid,
    "y_pred": np.maximum(valid_blend, 0.0)
})
pred_full_df = pd.concat([pred_train_df, pred_valid_df], axis=0).sort_values("idx").reset_index(drop=True)

# Save full training predictions
full_train_preds_path = RESULTS_DIR / f"train_preds_{RUN_SUFFIX}.csv"
pred_full_df.to_csv(full_train_preds_path, index=False)
print("Saved full training predictions:", full_train_preds_path)

# 7) Refit top models on FULL and predict TEST
top_params_list = [trial_params[idx] for idx in top_k]
P_test, model_bundle = refit_and_predict_full(df, dft, feat_cols, categorical_cols, top_params_list, SEED)

# Blend TEST with ridge stacker (columns of P_test follow the top_k order used to fit it)
test_blend = ridge.predict(P_test)
test_blend = np.maximum(test_blend, 0.0)

# 8) Save submission
sub = pd.DataFrame({ID_COL: dft[ID_COL], TARGET_COL: test_blend.astype(float)})
sub_path = SUBMISSIONS_DIR / f"submission_{RUN_SUFFIX}.csv"
sub.to_csv(sub_path, index=False)
print("Saved submission:", sub_path)

# 9) Save model bundle (whatever refit_and_predict_full returned + ridge summary)
alphas_grid = getattr(ridge, "alphas", None)   # array or None
alpha_best  = getattr(ridge, "alpha_", None)   # scalar or None
model_artifact = {
    "bundle": model_bundle,
    "stacker": {
        "type": "RidgeCV",
        "coef_": ridge.coef_.tolist(),
        "intercept_": float(ridge.intercept_),
        "alphas": (alphas_grid.tolist() if alphas_grid is not None else None),
        "alpha_chosen": (float(alpha_best) if alpha_best is not None else None),
    },
}
pkl_path = DIR_MODEL / "model.pkl"
with open(pkl_path, "wb") as f:
    pickle.dump(model_artifact, f)
print("Saved model:", pkl_path)

# 10) Save config/metadata JSON
meta = {
    "seed": SEED,
    "valid_size": VALID_SIZE,
    "include_iicc": INCLUDE_IICC,
    "include_text": INCLUDE_TEXT,
    "n_calls": N_CALLS,
    "n_initial_points": N_INITIAL_POINTS,
    "n_best_models": N_BEST_MODELS,
    "best_score": float(res.fun),
    # res.x is positional; the names below must stay in the same order as the
    # search space defined in bayes_opt_xgb. NumPy scalars are converted to
    # plain Python numbers so json can serialise them.
    "best_params": dict(zip(
        ["max_depth","learning_rate","n_estimators","subsample","colsample_bytree",
         "min_child_weight","reg_alpha","reg_lambda","gamma"],
        [int(y) if isinstance(y, (int, np.integer)) else float(y) for y in res.x]
    )),
    "ridge_coef": ridge.coef_.tolist(),
    "ridge_intercept": float(ridge.intercept_),
    "features_used_count": len(feat_cols),
    "run_suffix": RUN_SUFFIX
}
json_path = DIR_MODEL / "config.json"
with open(json_path, "w") as f:
    json.dump(meta, f, indent=2)
print("Saved config:", json_path)

# Final cleanup
gc.collect()

Total features: 153
Sample: ['WeeklyWages', 'hourly_wage', 'HoursWorkedPerWeek', 'DaysWorkedPerWeek', 'Age', 'DependentChildren', 'DependentsOther', 'accident_year', 'accident_month', 'accident_dow', 'accident_hour', 'report_delay_days', 'is_weekend', 'iicc_is_one_flag', 'iicc_small_flag'] ...
Train/Valid sizes: 43200 10800
X shapes: (43200, 158) (10800, 158) (36000, 158)
Best validation RMSE: 22732.767361550657
Best params: [np.int64(4), 0.1529638068018135, np.int64(503), 0.8436401141630159, 0.7629446361427012, 1e-08, 2.5725735617941172, 10.727608169367786, 0.0]
Top trials (score):
1: 22732.7674 | params: {'max_depth': np.int64(4), 'learning_rate': 0.1529638068018135, 'n_estimators': np.int64(503), 'subsample': 0.8436401141630159, 'colsample_bytree': 0.7629446361427012, 'min_child_weight': 1e-08, 'reg_alpha': 2.5725735617941172, 'reg_lambda': 10.727608169367786, 'gamma': 0.0}
2: 22740.2462 | params: {'max_depth': np.int64(3), 'learning_rate': 0.07695920943042321, 'n_estimators': np.in

310