<a href="https://colab.research.google.com/github/nullvoid-ky/introduction-to-machine-learning-and-deep-learning/blob/main/Lightgbm%26Xgboost_find_best_threshold.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
# ===== Setup & Installs (Kaggle usually has most of these; safe to re-run) =====
!pip -q install kagglehub shap lightgbm xgboost

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from typing import List

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.inspection import permutation_importance

import shap
import warnings
warnings.filterwarnings('ignore')


In [14]:
# ถ้าในสภาพแวดล้อมคุณยังไม่มี ให้รันก่อน (Kaggle มักมีอยู่แล้ว)
!pip install lightgbm xgboost -q


In [15]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("utkarshx27/american-companies-bankruptcy-prediction-dataset")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'american-companies-bankruptcy-prediction-dataset' dataset.
Path to dataset files: /kaggle/input/american-companies-bankruptcy-prediction-dataset


In [20]:
from kagglehub import KaggleDatasetAdapter, load_dataset
import pandas as pd

# Load the dataset
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# Set the CSV file path **inside** the dataset (adjust if needed)
# Explore the dataset directory printed below to confirm the file name.
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
file_path = "/kaggle/input/american-companies-bankruptcy-prediction-dataset/american_bankruptcy.csv"

df = pd.read_csv(file_path)

print("Loaded shape:", df.shape)
print("Columns:\n", list(df.columns))
df.head()

Loaded shape: (78682, 21)
Columns:
 ['company_name', 'status_label', 'year', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18']


Unnamed: 0,company_name,status_label,year,X1,X2,X3,X4,X5,X6,X7,...,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18
0,C_1,alive,1999,511.267,833.107,18.373,89.031,336.018,35.163,128.348,...,1024.333,740.998,180.447,70.658,191.226,163.816,201.026,1024.333,401.483,935.302
1,C_1,alive,2000,485.856,713.811,18.577,64.367,320.59,18.531,115.187,...,874.255,701.854,179.987,45.79,160.444,125.392,204.065,874.255,361.642,809.888
2,C_1,alive,2001,436.656,526.477,22.496,27.207,286.588,-58.939,77.528,...,638.721,710.199,217.699,4.711,112.244,150.464,139.603,638.721,399.964,611.514
3,C_1,alive,2002,396.412,496.747,27.172,30.745,259.954,-12.41,66.322,...,606.337,686.621,164.658,3.573,109.59,203.575,124.106,606.337,391.633,575.592
4,C_1,alive,2003,432.204,523.302,26.68,47.491,247.245,3.504,104.661,...,651.958,709.292,248.666,20.811,128.656,131.261,131.884,651.958,407.608,604.467


In [21]:
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, roc_auc_score, average_precision_score

def find_best_threshold(y_true, y_proba, metric="f1"):
    y_true = np.asarray(y_true).astype(int)
    thresholds = np.linspace(0.01, 0.99, 99)
    best_t, best_val = 0.5, -1.0
    for t in thresholds:
        y_hat = (y_proba >= t).astype(int)
        p, r, f1, _ = precision_recall_fscore_support(y_true, y_hat, average="binary", zero_division=0)
        val = f1 if metric == "f1" else r
        if val > best_val:
            best_val, best_t = val, t
    return float(best_t), float(best_val)

def evaluate_at_threshold(y_true, y_proba, threshold):
    y_pred = (y_proba >= threshold).astype(int)
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    acc = (tp+tn)/cm.sum()
    prec = tp/(tp+fp) if (tp+fp) else 0.0
    rec  = tp/(tp+fn) if (tp+fn) else 0.0
    f1   = 2*prec*rec/(prec+rec) if (prec+rec) else 0.0
    return {"threshold": float(threshold), "cm": cm.tolist(), "accuracy": acc, "precision": prec, "recall": rec, "f1": f1}


In [24]:
# ===== 0) เตรียม X, y (map target) =====
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

FEATURES = ["X8","X17","X3","X11","X10","X1","X6"]
TARGET   = "status_label"

# ตรวจคอลัมน์
missing = [c for c in FEATURES+[TARGET] if c not in df.columns]
if missing:
    raise ValueError(f"❌ Missing columns: {missing}")

# y: alive->0, failed->1
y = df[TARGET]
if y.dtype == object:
    y = y.astype(str).str.strip().str.lower().map({"alive":0, "failed":1}).astype(int)
else:
    y = pd.Series(y).astype(int)

X = df[FEATURES].copy()

# ===== 1) split: train/valid/test =====
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_trn, X_val, y_trn, y_val = train_test_split(
    X_tr, y_tr, test_size=0.2, random_state=42, stratify=y_tr
)

# ===== 2) scale_pos_weight จาก train แท้ ๆ =====
pos = int((y_trn == 1).sum())
neg = int((y_trn == 0).sum())
assert pos > 0, "No positive samples in training set."
scale_pos_weight = neg / pos

print("Shapes:",
      "\n  X_trn:", X_trn.shape, " X_val:", X_val.shape, " X_te:", X_te.shape,
      "\nClass ratio (train):", dict(pd.Series(y_trn).value_counts(normalize=True).round(3)))
print(f"scale_pos_weight = {scale_pos_weight:.2f}")


Shapes: 
  X_trn: (50356, 7)  X_val: (12589, 7)  X_te: (15737, 7) 
Class ratio (train): {0: np.float64(0.934), 1: np.float64(0.066)}
scale_pos_weight = 14.07


In [26]:
# ตัวอย่าง time-based split (ปรับช่วงปีตามจริง)
train_mask = df["year"] <= 2011
val_mask   = (df["year"] >= 2012) & (df["year"] <= 2014)
test_mask  = df["year"] >= 2015

X_trn, y_trn = X[train_mask], y[train_mask]
X_val, y_val = X[val_mask],   y[val_mask]
X_te,  y_te  = X[test_mask],  y[test_mask]

pos = int((y_trn == 1).sum()); neg = int((y_trn == 0).sum())
scale_pos_weight = neg / pos
print("time-split OK | spw=", round(scale_pos_weight,2))


time-split OK | spw= 11.59


In [27]:
import xgboost as xgb

def fit_xgb_compat(X_trn, y_trn, X_val, y_val, *,
                   scale_pos_weight,
                   learning_rate=0.03, max_depth=7, min_child_weight=1,
                   subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
                   n_estimators=10000, early_stopping_rounds=300, random_state=42):
    params = dict(
        objective="binary:logistic",
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_lambda=reg_lambda,
        random_state=random_state,
        tree_method="hist",
        scale_pos_weight=scale_pos_weight,
        eval_metric="aucpr"
    )
    # try callbacks
    try:
        model = xgb.XGBClassifier(**params)
        model.fit(X_trn, y_trn, eval_set=[(X_val, y_val)], verbose=100,
                  callbacks=[xgb.callback.EarlyStopping(rounds=early_stopping_rounds, save_best=True)])
        return model
    except TypeError:
        pass
    # try early_stopping_rounds=
    try:
        model = xgb.XGBClassifier(**params)
        model.fit(X_trn, y_trn, eval_set=[(X_val, y_val)], verbose=100,
                  early_stopping_rounds=early_stopping_rounds)
        return model
    except TypeError:
        pass
    # fallback xgb.train
    dtrn = xgb.DMatrix(X_trn, label=y_trn)
    dval = xgb.DMatrix(X_val, label=y_val)
    train_params = dict(
        objective="binary:logistic",
        eta=learning_rate,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        lambda_=reg_lambda,
        tree_method="hist",
        scale_pos_weight=scale_pos_weight,
        eval_metric="aucpr",
        seed=random_state,
    )
    booster = xgb.train(train_params, dtrn, num_boost_round=n_estimators,
                        evals=[(dtrn,"train"),(dval,"valid")],
                        early_stopping_rounds=early_stopping_rounds, verbose_eval=100)
    class BoosterWrapper:
        def __init__(self, booster): self.booster = booster
        def predict_proba(self, X):
            dm = xgb.DMatrix(X)
            best_it = getattr(self.booster, "best_iteration", None)
            p = self.booster.predict(dm, iteration_range=(0, best_it+1)) if best_it is not None else self.booster.predict(dm)
            import numpy as np
            return np.vstack([1-p, p]).T
    return BoosterWrapper(booster)


In [28]:
import lightgbm as lgb

def fit_lgbm(X_trn, y_trn, X_val, y_val, *,
             scale_pos_weight,
             num_leaves=63, min_child_samples=100,
             learning_rate=0.03, subsample=0.8, colsample_bytree=0.8,
             reg_lambda=1.0, n_estimators=10000, early_stopping_rounds=300, random_state=42):
    lgbm = lgb.LGBMClassifier(
        objective="binary",
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        num_leaves=num_leaves,
        min_child_samples=min_child_samples,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_lambda=reg_lambda,
        random_state=random_state,
        scale_pos_weight=scale_pos_weight,
        first_metric_only=True,
        force_col_wise=True
    )
    lgbm.fit(
        X_trn, y_trn,
        eval_set=[(X_val, y_val)],
        eval_metric="average_precision",
        callbacks=[lgb.early_stopping(early_stopping_rounds), lgb.log_evaluation(100)]
    )
    return lgbm


In [29]:
from itertools import product

# กริดเล็กๆ พอให้วิ่งเร็ว
spw_mults_lgb = [0.5, 1.0, 2.0, 4.0]
leaves_grid   = [31, 63, 127]
mcs_grid      = [50, 200]          # min_child_samples
lr_grid       = [0.02, 0.05]
col_grid      = [0.7, 0.9]
sub_grid      = [0.7, 0.9]

spw_mults_xgb = [0.5, 1.0, 2.0, 4.0]
md_grid       = [4, 7]             # max_depth
mcw_grid      = [1, 5]             # min_child_weight
lr_x_grid     = [0.02, 0.05]
col_x_grid    = [0.7, 0.9]
sub_x_grid    = [0.7, 0.9]

results = []

# ---- LightGBM sweep ----
for spwm, nl, mcs, lr, col, sub in product(spw_mults_lgb, leaves_grid, mcs_grid, lr_grid, col_grid, sub_grid):
    try:
        model = fit_lgbm(
            X_trn, y_trn, X_val, y_val,
            scale_pos_weight=spw_base*spwm,
            num_leaves=nl, min_child_samples=mcs,
            learning_rate=lr, colsample_bytree=col, subsample=sub,
            early_stopping_rounds=200
        )
        proba_val = model.predict_proba(X_val)[:,1]
        t_val, f1_val = find_best_threshold(y_val, proba_val, metric="f1")
        auprc_val = average_precision_score(y_val, proba_val)
        results.append({
            "model":"LGBM","params":{"spw_mult":spwm,"num_leaves":nl,"min_child_samples":mcs,"lr":lr,"col":col,"sub":sub},
            "t_val":t_val, "f1_val":f1_val, "auprc_val":auprc_val, "estimator":model
        })
    except Exception as e:
        print("LGBM fail:", e)

# ---- XGBoost sweep ----
for spwm, md, mcw, lr, col, sub in product(spw_mults_xgb, md_grid, mcw_grid, lr_x_grid, col_x_grid, sub_x_grid):
    try:
        model = fit_xgb_compat(
            X_trn, y_trn, X_val, y_val,
            scale_pos_weight=spw_base*spwm,
            learning_rate=lr, max_depth=md, min_child_weight=mcw,
            colsample_bytree=col, subsample=sub,
            early_stopping_rounds=200
        )
        proba_val = model.predict_proba(X_val)[:,1]
        t_val, f1_val = find_best_threshold(y_val, proba_val, metric="f1")
        auprc_val = average_precision_score(y_val, proba_val)
        results.append({
            "model":"XGB","params":{"spw_mult":spwm,"max_depth":md,"min_child_weight":mcw,"lr":lr,"col":col,"sub":sub},
            "t_val":t_val, "f1_val":f1_val, "auprc_val":auprc_val, "estimator":model
        })
    except Exception as e:
        print("XGB fail:", e)

# ตารางสรุป (เรียงตาม F1 บน validation)
df_res = pd.DataFrame(results).sort_values(["f1_val","auprc_val"], ascending=False)
print(df_res[["model","params","t_val","f1_val","auprc_val"]].head(10))
best = df_res.iloc[0]
best_model = best["estimator"]
best_t = float(best["t_val"])
print("\nBest candidate:", best["model"], best["params"], "t*=", best_t, "F1_val=", round(best["f1_val"],4))


[LightGBM] [Info] Number of positive: 4442, number of negative: 51485
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 55927, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079425 -> initscore=-2.450186
[LightGBM] [Info] Start training from score -2.450186
Training until validation scores don't improve for 200 rounds
[100]	valid_0's average_precision: 0.256831	valid_0's binary_logloss: 0.367382
[200]	valid_0's average_precision: 0.255301	valid_0's binary_logloss: 0.394631
Early stopping, best iteration is:
[1]	valid_0's average_precision: 0.107647	valid_0's binary_logloss: 0.198711
[LightGBM] [Info] Number of positive: 4442, number of negative: 51485
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 55927, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079425 -> initscore=-2.450186
[LightGBM] [Info] Start training from score -2.450186
Training

In [30]:
proba_te = best_model.predict_proba(X_te)[:,1]
print("Test ROC AUC:", roc_auc_score(y_te, proba_te).round(4),
      "| AUPRC:", average_precision_score(y_te, proba_te).round(4))
print("Test @best_t:", evaluate_at_threshold(y_te, proba_te, best_t))


Test ROC AUC: 0.8297 | AUPRC: 0.2294
Test @best_t: {'threshold': 0.81, 'cm': [[10852, 1143], [131, 156]], 'accuracy': np.float64(0.8962709656407751), 'precision': np.float64(0.12009237875288684), 'recall': np.float64(0.5435540069686411), 'f1': np.float64(0.19672131147540983)}


In [31]:
# เลือก LGBM และ XGB ที่ดีที่สุดจาก df_res อย่างละหนึ่ง
best_lgb = next(r for r in results if r["model"]=="LGBM" and r["f1_val"]==df_res[df_res.model=="LGBM"]["f1_val"].max())
best_xgb = next(r for r in results if r["model"]=="XGB"  and r["f1_val"]==df_res[df_res.model=="XGB"]["f1_val"].max())

p_val_blend = 0.5*best_lgb["estimator"].predict_proba(X_val)[:,1] + 0.5*best_xgb["estimator"].predict_proba(X_val)[:,1]
t_blend, f1_blend = find_best_threshold(y_val, p_val_blend, metric="f1")
print("Blend val: F1=", round(f1_blend,4), "t*=", t_blend)

p_te_blend = 0.5*best_lgb["estimator"].predict_proba(X_te)[:,1] + 0.5*best_xgb["estimator"].predict_proba(X_te)[:,1]
print("Blend test:", evaluate_at_threshold(y_te, p_te_blend, t_blend))


Blend val: F1= 0.2999 t*= 0.46
Blend test: {'threshold': 0.46, 'cm': [[10717, 1278], [123, 164]], 'accuracy': np.float64(0.8859306301905228), 'precision': np.float64(0.11373092926490985), 'recall': np.float64(0.5714285714285714), 'f1': np.float64(0.189705031810295)}
