# 04 — Custom models (WITHOUT LAMA)

Goal: beat the best LAMA baseline, i.e. achieve a lower cross-validated log loss.

We implement **multiple pipelines**:
1) TF‑IDF (text) + OHE (cats) + numeric → **LogReg**
2) Target Encoding (cats) + numeric → **LightGBM**
3) CatBoost on tabular → **CatBoostClassifier**

Plus: **Optuna tuning** for LightGBM.

In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss

from src.config import Paths, TARGET_COL, ID_COL, SEED
from src.models.cv import stratified_cv_predict_proba
from src.models.custom import (
    make_tfidf_linear_pipeline,
    fit_predict_proba_sklearn,
    make_lgbm_target_enc_pipeline,
    fit_predict_proba_lgbm_target_enc,
    make_catboost_model,
    fit_predict_proba_catboost,
)
from src.utils.seed import set_global_seed

# Seed through the project helper before any data or model work.
set_global_seed(SEED)
paths = Paths()

# Both inputs live in the processed-data directory.
processed_dir = paths.data_processed
df = pd.read_parquet(processed_dir / "model_table.parquet")
spec_text = (processed_dir / "feature_spec.json").read_text(encoding="utf-8")
spec = json.loads(spec_text)

# Rows with a target value are training data; rows without one form the test set.
has_target = df[TARGET_COL].notna()
train_df = df[has_target].copy()
test_df = df[~has_target].copy()

# Class names, in the column order used for the submission file.
labels = ["high", "medium", "low"]

## Pipeline 1: TF‑IDF + Logistic Regression (strong baseline)

In [None]:
# Feature matrix: numeric, then categorical, then text columns.
tfidf_cols = [*spec["numeric_cols"], *spec["categorical_cols"], *spec["text_cols"]]
X = train_df[tfidf_cols].copy()
y = train_df[TARGET_COL].copy()

tfidf_pipe = make_tfidf_linear_pipeline(
    numeric_cols=spec["numeric_cols"],
    categorical_cols=spec["categorical_cols"],
    text_cols=spec["text_cols"],
    max_features=70000,
)


def fit_pred_fn(X_tr, y_tr, X_va):
    """Fold callback: run ``fit_predict_proba_sklearn`` with the shared ``tfidf_pipe``.

    The same pipeline object is passed for every fold.
    """
    fold_output = fit_predict_proba_sklearn(tfidf_pipe, X_tr, y_tr, X_va)
    return fold_output


res = stratified_cv_predict_proba(
    fit_pred_fn,
    X,
    y,
    labels=labels,
    n_splits=5,
    seed=SEED,
)
print("Mean CV logloss:", res.mean_score)

## Pipeline 2: Target Encoding + LightGBM

In [None]:
# Tabular features only: numeric columns followed by categorical columns.
tabular_cols = [*spec["numeric_cols"], *spec["categorical_cols"]]
X_tab = train_df[tabular_cols].copy()
y_tab = train_df[TARGET_COL].copy()

# The helper returns an (encoder, model) pair; n_estimators is fixed at 2500 for CV.
lgbm_pair = make_lgbm_target_enc_pipeline(
    numeric_cols=spec["numeric_cols"],
    categorical_cols=spec["categorical_cols"],
    params={"n_estimators": 2500},
)
enc, lgbm = lgbm_pair


def fit_pred_fn2(X_tr, y_tr, X_va):
    """Fold callback: run ``fit_predict_proba_lgbm_target_enc`` with the shared ``enc``/``lgbm``."""
    fold_output = fit_predict_proba_lgbm_target_enc(enc, lgbm, X_tr, y_tr, X_va)
    return fold_output


res2 = stratified_cv_predict_proba(
    fit_pred_fn2,
    X_tab,
    y_tab,
    labels=labels,
    n_splits=5,
    seed=SEED,
)
print("Mean CV logloss:", res2.mean_score)

## Pipeline 3: CatBoost (tabular)

In [None]:
# Same tabular feature set as the LightGBM pipeline, copied for CatBoost.
catboost_cols = [*spec["numeric_cols"], *spec["categorical_cols"]]
X_cb = train_df[catboost_cols].copy()
y_cb = train_df[TARGET_COL].copy()


def _make_cv_catboost():
    """Build a CatBoost model with the CV settings (2000 iterations)."""
    return make_catboost_model(cat_cols=spec["categorical_cols"], params={"iterations": 2000})


# NOTE(review): cb_model is not referenced by any cell visible here — confirm
# whether it is still needed.
cb_model = _make_cv_catboost()


def fit_pred_fn3(X_tr, y_tr, X_va):
    """Fold callback: build a new model for each call and run ``fit_predict_proba_catboost``."""
    fold_model = _make_cv_catboost()
    return fit_predict_proba_catboost(
        fold_model,
        X_tr,
        y_tr,
        X_va,
        cat_cols=spec["categorical_cols"],
    )


res3 = stratified_cv_predict_proba(
    fit_pred_fn3,
    X_cb,
    y_cb,
    labels=labels,
    n_splits=5,
    seed=SEED,
)
print("Mean CV logloss:", res3.mean_score)

## Compare pipelines

In [None]:
import pandas as pd

# Mean CV log loss per pipeline; the names double as keys for the final cell.
cv_scores = {
    "TF‑IDF + LogisticRegression": res.mean_score,
    "TargetEnc + LightGBM": res2.mean_score,
    "CatBoost (tabular)": res3.mean_score,
}

# One row per pipeline, sorted so the lowest (best) log loss comes first.
comp = pd.DataFrame(
    {"pipeline": list(cv_scores), "logloss": list(cv_scores.values())}
).sort_values("logloss")

display(comp)

logloss_by_pipeline = comp.set_index("pipeline")["logloss"]
ax = logloss_by_pipeline.plot.barh(figsize=(8, 3))
ax.set_title("Custom pipelines (lower is better)")
ax.set_xlabel("CV logloss")
plt.show()

## Optuna tuning example (LightGBM)

In [None]:
import optuna
from sklearn.model_selection import StratifiedKFold

def objective(trial: optuna.Trial) -> float:
    """Return the mean 3-fold CV log loss for one sampled hyperparameter set.

    Hyperparameters are drawn from ``trial`` (``n_estimators`` is fixed at 3000),
    passed to ``make_lgbm_target_enc_pipeline``, and scored on stratified folds
    of the notebook-level ``X_tab`` / ``y_tab``.
    """
    # NOTE(review): LightGBM only applies `subsample` when bagging is enabled
    # (`subsample_freq` > 0) — confirm make_lgbm_target_enc_pipeline sets it,
    # otherwise this dimension of the search has no effect.
    search_point = dict(
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.08, log=True),
        num_leaves=trial.suggest_int("num_leaves", 32, 256),
        min_child_samples=trial.suggest_int("min_child_samples", 10, 200),
        subsample=trial.suggest_float("subsample", 0.6, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.6, 1.0),
        reg_lambda=trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
        n_estimators=3000,
    )

    encoder, booster = make_lgbm_target_enc_pipeline(
        numeric_cols=spec["numeric_cols"],
        categorical_cols=spec["categorical_cols"],
        params=search_point,
    )

    # Three folds (instead of five) keep each trial cheaper.
    splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)
    fold_losses = []
    for train_rows, valid_rows in splitter.split(X_tab, y_tab):
        X_fit, X_hold = X_tab.iloc[train_rows], X_tab.iloc[valid_rows]
        y_fit, y_hold = y_tab.iloc[train_rows], y_tab.iloc[valid_rows]
        hold_proba = fit_predict_proba_lgbm_target_enc(encoder, booster, X_fit, y_fit, X_hold)
        fold_losses.append(log_loss(y_hold, hold_proba, labels=labels))
    return float(np.mean(fold_losses))

# Seed the sampler explicitly: an unseeded TPESampler (Optuna's default) draws
# different trial parameters on every run, so the tuning result would not be
# reproducible even though the rest of the notebook is seeded with SEED.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=25)

print("Best params:", study.best_params)
print("Best value :", study.best_value)

## Train the best custom pipeline on the full training set and create a submission

In [None]:
import warnings


def _proba_in_label_order(proba, classes, wanted):
    """Reorder probability columns from the estimator's class order to ``wanted``.

    ``predict_proba`` emits one column per class in the fitted estimator's
    ``classes_`` order (sorted for string targets), which is generally NOT the
    order of ``wanted``. If the class names do not match ``wanted`` exactly,
    the array is returned unchanged (positional assignment) with a warning.
    """
    proba = np.asarray(proba)
    classes = [] if classes is None else list(classes)
    if len(classes) != len(wanted) or set(classes) != set(wanted):
        warnings.warn(
            f"Estimator classes {classes!r} do not match labels {list(wanted)!r}; "
            "assigning probability columns positionally.",
            stacklevel=2,
        )
        return proba
    column_of = {cls: idx for idx, cls in enumerate(classes)}
    return proba[:, [column_of[name] for name in wanted]]


# Choose best pipeline from comp table (comp is sorted ascending by CV log loss).
best = comp.iloc[0]["pipeline"]
print("Best by CV:", best)

if best == "TF‑IDF + LogisticRegression":
    pipe = make_tfidf_linear_pipeline(spec["numeric_cols"], spec["categorical_cols"], spec["text_cols"], max_features=70000)
    X_full = train_df[spec["numeric_cols"] + spec["categorical_cols"] + spec["text_cols"]]
    y_full = train_df[TARGET_COL]
    pipe.fit(X_full, y_full)
    X_test = test_df[spec["numeric_cols"] + spec["categorical_cols"] + spec["text_cols"]]
    pred = pipe.predict_proba(X_test)
    pred_classes = getattr(pipe, "classes_", None)

elif best == "TargetEnc + LightGBM":
    # Use the tuned parameters when the Optuna cell has been run; otherwise fall
    # back to defaults. An undefined `study` raises NameError and a study with no
    # completed trials raises ValueError, neither of which getattr() would catch.
    try:
        best_params = dict(study.best_params)
    except (NameError, ValueError):
        best_params = {}
    enc, model = make_lgbm_target_enc_pipeline(spec["numeric_cols"], spec["categorical_cols"], params={**best_params, "n_estimators": 4000})
    X_full = train_df[spec["numeric_cols"] + spec["categorical_cols"]]
    y_full = train_df[TARGET_COL]
    enc.fit(X_full.fillna(0), y_full)
    X_full_enc = enc.transform(X_full.fillna(0))
    model.fit(X_full_enc, y_full)
    X_test = test_df[spec["numeric_cols"] + spec["categorical_cols"]]
    pred = model.predict_proba(enc.transform(X_test.fillna(0)))
    pred_classes = getattr(model, "classes_", None)

else:
    model = make_catboost_model(cat_cols=spec["categorical_cols"], params={"iterations": 2500})
    X_full = train_df[spec["numeric_cols"] + spec["categorical_cols"]]
    y_full = train_df[TARGET_COL]
    # CatBoost is given the positions of the categorical columns.
    cat_idx = [X_full.columns.get_loc(c) for c in spec["categorical_cols"]]
    model.fit(X_full, y_full, cat_features=cat_idx)
    X_test = test_df[spec["numeric_cols"] + spec["categorical_cols"]]
    pred = model.predict_proba(X_test)
    pred_classes = getattr(model, "classes_", None)

sub = pd.DataFrame({ID_COL: test_df[ID_COL].values})
# Map each probability column to its class by name, not by position, so the
# "medium" and "low" columns are not swapped in the submission.
sub[labels] = _proba_in_label_order(pred, pred_classes, labels)

out_path = paths.submissions / "submission_custom_best.csv"
out_path.parent.mkdir(parents=True, exist_ok=True)
sub.to_csv(out_path, index=False)
print("Saved:", out_path)
sub.head()