<a href="https://colab.research.google.com/github/nullvoid-ky/introduction-to-machine-learning-and-deep-learning/blob/main/Lightgbm%26Xgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# ถ้าในสภาพแวดล้อมคุณยังไม่มี ให้รันก่อน (Kaggle มักมีอยู่แล้ว)
!pip install lightgbm xgboost -q


In [10]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("utkarshx27/american-companies-bankruptcy-prediction-dataset")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'american-companies-bankruptcy-prediction-dataset' dataset.
Path to dataset files: /kaggle/input/american-companies-bankruptcy-prediction-dataset


In [11]:
from kagglehub import KaggleDatasetAdapter, load_dataset

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# Set the CSV file path **inside** the dataset (adjust if needed)
# Explore the dataset directory printed below to confirm the file name.
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
file_path = "/kaggle/input/american-companies-bankruptcy-prediction-dataset/american_bankruptcy.csv"

df = pd.read_csv(file_path)

print("Loaded shape:", df.shape)
print("Columns:\n", list(df.columns))
df.head()

Loaded shape: (78682, 21)
Columns:
 ['company_name', 'status_label', 'year', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18']


Unnamed: 0,company_name,status_label,year,X1,X2,X3,X4,X5,X6,X7,...,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18
0,C_1,alive,1999,511.267,833.107,18.373,89.031,336.018,35.163,128.348,...,1024.333,740.998,180.447,70.658,191.226,163.816,201.026,1024.333,401.483,935.302
1,C_1,alive,2000,485.856,713.811,18.577,64.367,320.59,18.531,115.187,...,874.255,701.854,179.987,45.79,160.444,125.392,204.065,874.255,361.642,809.888
2,C_1,alive,2001,436.656,526.477,22.496,27.207,286.588,-58.939,77.528,...,638.721,710.199,217.699,4.711,112.244,150.464,139.603,638.721,399.964,611.514
3,C_1,alive,2002,396.412,496.747,27.172,30.745,259.954,-12.41,66.322,...,606.337,686.621,164.658,3.573,109.59,203.575,124.106,606.337,391.633,575.592
4,C_1,alive,2003,432.204,523.302,26.68,47.491,247.245,3.504,104.661,...,651.958,709.292,248.666,20.811,128.656,131.261,131.884,651.958,407.608,604.467


In [12]:
import numpy as np
import pandas as pd

# สมมติว่าคุณมี df อยู่แล้ว
FEATURES = ["X8","X17","X3","X11","X10","X1","X6"]
TARGET   = "status_label"

# ตรวจคอลัมน์
missing = [c for c in FEATURES+[TARGET] if c not in df.columns]
if missing:
    raise ValueError(f"❌ Missing columns in df: {missing}")

# map target: alive=0, failed=1 (รองรับกรณีเป็นสตริง)
y_raw = df[TARGET]
if y_raw.dtype == object:
    y = y_raw.astype(str).str.strip().str.lower().map({"alive":0,"failed":1}).astype(int)
else:
    # ถ้าเป็นตัวเลขแล้ว แต่อาจไม่ใช่ 0/1 ให้บังคับเป็น 0/1
    ser = pd.Series(y_raw).astype(int)
    uniq = sorted(ser.unique())
    if set(uniq) <= {0,1}:
        y = ser
    else:
        # กรณีแปลก factorize แล้วถือว่า 1 = minority ถ้าเป็นไปได้
        codes, uniques = pd.factorize(ser)
        y = pd.Series(codes, index=ser.index)

X = df[FEATURES].copy()
print("✅ Data ready | X:", X.shape, "| y ratio:", dict(pd.Series(y).value_counts(normalize=True).round(3)))


✅ Data ready | X: (78682, 7) | y ratio: {0: np.float64(0.934), 1: np.float64(0.066)}


In [13]:
from sklearn.model_selection import train_test_split

# 80/20 test
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# แยก valid ออกจาก train อีก 20% (ของ train เดิม) สำหรับ early stopping
X_trn, X_val, y_trn, y_val = train_test_split(
    X_tr, y_tr, test_size=0.2, random_state=42, stratify=y_tr
)

# คำนวณ scale_pos_weight = (#neg / #pos) จาก **train แท้ๆ**
pos = int((y_trn == 1).sum())
neg = int((y_trn == 0).sum())
if pos == 0:
    raise ValueError("❌ No positive samples in training set.")
scale_pos_weight = neg / pos
print(f"scale_pos_weight (train only) = {scale_pos_weight:.2f}  |  pos={pos}, neg={neg}")


scale_pos_weight (train only) = 14.07  |  pos=3341, neg=47015


In [14]:
from sklearn.metrics import roc_auc_score, average_precision_score
import lightgbm as lgb

lgbm = lgb.LGBMClassifier(
    objective="binary",
    n_estimators=10000,
    learning_rate=0.02,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    random_state=42,
    first_metric_only=True,         # ✅ ให้โฟกัส metric แรกเท่านั้น
    scale_pos_weight=scale_pos_weight,
    force_col_wise=True             # (ตามคำแนะนำใน log)
)

lgbm.fit(
    X_trn, y_trn,
    eval_set=[(X_val, y_val)],
    eval_metric="average_precision",                      # ✅ ติดตาม AUPRC
    callbacks=[lgb.early_stopping(300), lgb.log_evaluation(100)]
)

lgbm_proba = lgbm.predict_proba(X_te)[:, 1]
print("LightGBM  ROC AUC:", roc_auc_score(y_te, lgbm_proba).round(4),
      "| AUPRC:", average_precision_score(y_te, lgbm_proba).round(4))


[LightGBM] [Info] Number of positive: 3341, number of negative: 47015
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 50356, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.066348 -> initscore=-2.644197
[LightGBM] [Info] Start training from score -2.644197
Training until validation scores don't improve for 300 rounds
[100]	valid_0's average_precision: 0.185695	valid_0's binary_logloss: 0.474679
[200]	valid_0's average_precision: 0.199247	valid_0's binary_logloss: 0.509442
[300]	valid_0's average_precision: 0.206109	valid_0's binary_logloss: 0.493478
Early stopping, best iteration is:
[2]	valid_0's average_precision: 0.158427	valid_0's binary_logloss: 0.24131
LightGBM  ROC AUC: 0.7098 | AUPRC: 0.1497


In [17]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score, average_precision_score

def fit_xgb_compat(X_trn, y_trn, X_val, y_val, scale_pos_weight, random_state=42):
    """
    Train XGBoost with early stopping, compatible across XGBoost versions.
    Tries:
      1) callbacks=[EarlyStopping(...)]
      2) early_stopping_rounds=...
      3) fallback to xgb.train (DMatrix)
    Returns: model that has .predict_proba(X) -> [N,2]
    """
    base_params = dict(
        objective="binary:logistic",
        n_estimators=10000,
        learning_rate=0.02,
        max_depth=7,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        random_state=random_state,
        tree_method="hist",          # ใช้ "gpu_hist" ถ้ามี GPU
        scale_pos_weight=scale_pos_weight,
        eval_metric="aucpr"
    )

    # 1) ลอง callbacks
    try:
        xgbc = xgb.XGBClassifier(**base_params)
        cb = [xgb.callback.EarlyStopping(rounds=300, save_best=True)]
        xgbc.fit(X_trn, y_trn, eval_set=[(X_val, y_val)], verbose=100, callbacks=cb)
        return xgbc
    except TypeError:
        pass

    # 2) ลอง early_stopping_rounds
    try:
        xgbc = xgb.XGBClassifier(**base_params)
        xgbc.fit(X_trn, y_trn, eval_set=[(X_val, y_val)], verbose=100, early_stopping_rounds=300)
        return xgbc
    except TypeError:
        pass

    # 3) สลับไปใช้ xgb.train (DMatrix)
    dtrain = xgb.DMatrix(X_trn, label=y_trn)
    dvalid = xgb.DMatrix(X_val, label=y_val)

    train_params = dict(
        objective="binary:logistic",
        eta=0.02,
        max_depth=7,
        subsample=0.8,
        colsample_bytree=0.8,
        lambda_=1.0,                 # reg_lambda ใน DMatrix API ใช้ชื่อ lambda_
        tree_method="hist",
        scale_pos_weight=scale_pos_weight,
        eval_metric="aucpr",
        seed=random_state,
    )
    evals = [(dtrain, "train"), (dvalid, "valid")]
    booster = xgb.train(
        train_params, dtrain,
        num_boost_round=10000,
        evals=evals,
        early_stopping_rounds=300,
        verbose_eval=100
    )

    # หุ้มเป็นคลาสให้ใช้ predict_proba ได้เหมือน XGBClassifier
    class BoosterWrapper:
        def __init__(self, booster):
            self.booster = booster
        def predict_proba(self, X):
            dm = xgb.DMatrix(X)
            # ใช้ best_iteration ถ้ามี (early stop)
            best_it = getattr(self.booster, "best_iteration", None)
            if best_it is not None:
                p = self.booster.predict(dm, iteration_range=(0, best_it + 1))
            else:
                p = self.booster.predict(dm)
            return np.vstack([1 - p, p]).T

    return BoosterWrapper(booster)

# ===== ใช้งาน =====
xgb_model = fit_xgb_compat(X_trn, y_trn, X_val, y_val, scale_pos_weight, random_state=42)
xgb_proba = xgb_model.predict_proba(X_te)[:, 1]
print("XGBoost   ROC AUC:", roc_auc_score(y_te, xgb_proba).round(4),
      "| AUPRC:", average_precision_score(y_te, xgb_proba).round(4))


[0]	train-aucpr:0.18455	valid-aucpr:0.14755


Parameters: { "lambda_" } are not used.

  self.starting_round = model.num_boosted_rounds()


[100]	train-aucpr:0.30072	valid-aucpr:0.19449
[200]	train-aucpr:0.34924	valid-aucpr:0.19931
[300]	train-aucpr:0.40566	valid-aucpr:0.20675
[400]	train-aucpr:0.46664	valid-aucpr:0.21217
[500]	train-aucpr:0.53014	valid-aucpr:0.21486
[600]	train-aucpr:0.58913	valid-aucpr:0.21677
[700]	train-aucpr:0.64451	valid-aucpr:0.21748
[800]	train-aucpr:0.69581	valid-aucpr:0.22030
[900]	train-aucpr:0.73990	valid-aucpr:0.22131
[1000]	train-aucpr:0.78072	valid-aucpr:0.22071
[1100]	train-aucpr:0.81745	valid-aucpr:0.22197
[1200]	train-aucpr:0.84752	valid-aucpr:0.22249
[1300]	train-aucpr:0.87203	valid-aucpr:0.22280
[1400]	train-aucpr:0.89681	valid-aucpr:0.22356
[1500]	train-aucpr:0.91515	valid-aucpr:0.22516
[1600]	train-aucpr:0.93149	valid-aucpr:0.22523
[1700]	train-aucpr:0.94436	valid-aucpr:0.22498
[1800]	train-aucpr:0.95538	valid-aucpr:0.22559
[1900]	train-aucpr:0.96400	valid-aucpr:0.22550
[2000]	train-aucpr:0.97094	valid-aucpr:0.22628
[2100]	train-aucpr:0.97682	valid-aucpr:0.22597
[2200]	train-aucpr:0.9

In [18]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

def find_best_threshold(y_true, y_proba, metric="f1"):
    """
    หา threshold ที่เหมาะที่สุด:
      - metric="f1"          → เลือก threshold ที่ให้ F1 สูงสุด
      - metric="recall@prec>=0.8" → เลือก threshold ที่ให้ Recall สูงสุด โดย Precision ≥ 0.8
    """
    y_true = np.asarray(y_true).astype(int)
    thresholds = np.linspace(0.01, 0.99, 99)
    best_t, best_val = 0.5, -1.0

    want_recall_at_prec = metric.startswith("recall@prec>=")
    prec_floor = None
    if want_recall_at_prec:
        try:
            prec_floor = float(metric.split(">=")[1])
        except Exception:
            raise ValueError("ใช้รูปแบบ metric='recall@prec>=0.8'")

    for t in thresholds:
        y_hat = (y_proba >= t).astype(int)
        p, r, f1, _ = precision_recall_fscore_support(
            y_true, y_hat, average="binary", zero_division=0
        )
        val = f1 if metric=="f1" else (r if (want_recall_at_prec and p>=prec_floor) else -1.0)
        if val > best_val:
            best_val, best_t = val, t

    return float(best_t), float(best_val)

def evaluate_at_threshold(y_true, y_proba, threshold):
    y_pred = (y_proba >= threshold).astype(int)
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    acc = (tp+tn)/cm.sum()
    prec = tp/(tp+fp) if (tp+fp) else 0.0
    rec  = tp/(tp+fn) if (tp+fn) else 0.0
    f1   = 2*prec*rec/(prec+rec) if (prec+rec) else 0.0
    return {
        "threshold": float(threshold),
        "confusion_matrix": cm.tolist(),
        "accuracy": float(acc),
        "precision": float(prec),
        "recall": float(rec),
        "f1": float(f1),
    }

# ตัวอย่างใช้งานกับ LightGBM:
t_best_f1, score_f1 = find_best_threshold(y_te, lgbm_proba, metric="f1")
print("LGBM  best F1 threshold:", t_best_f1, "F1:", round(score_f1,4))
print("LGBM@0.50:", evaluate_at_threshold(y_te, lgbm_proba, 0.50))
print("LGBM@best:", evaluate_at_threshold(y_te, lgbm_proba, t_best_f1))

# ตัวอย่างใช้งานกับ XGBoost โดยคุม precision ขั้นต่ำ:
t_best_recall, best_recall = find_best_threshold(y_te, xgb_proba, metric="recall@prec>=0.8")
print("XGB  best threshold (recall@prec>=0.8):", t_best_recall, "recall:", round(best_recall,4))
print("XGB@0.50:", evaluate_at_threshold(y_te, xgb_proba, 0.50))
print("XGB@best:", evaluate_at_threshold(y_te, xgb_proba, t_best_recall))


LGBM  best F1 threshold: 0.09 F1: 0.2245
LGBM@0.50: {'threshold': 0.5, 'confusion_matrix': [[14693, 0], [1044, 0]], 'accuracy': 0.933659528499714, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
LGBM@best: {'threshold': 0.09, 'confusion_matrix': [[12644, 2049], [653, 391]], 'accuracy': 0.8283027260596048, 'precision': 0.16024590163934427, 'recall': 0.37452107279693486, 'f1': 0.22445464982778418}
XGB  best threshold (recall@prec>=0.8): 0.92 recall: 0.0163
XGB@0.50: {'threshold': 0.5, 'confusion_matrix': [[13757, 936], [747, 297]], 'accuracy': 0.893054584736608, 'precision': 0.24087591240875914, 'recall': 0.28448275862068967, 'f1': 0.2608695652173913}
XGB@best: {'threshold': 0.92, 'confusion_matrix': [[14690, 3], [1027, 17]], 'accuracy': 0.9345491516807524, 'precision': 0.85, 'recall': 0.016283524904214558, 'f1': 0.031954887218045104}
