<a href="https://colab.research.google.com/github/nullvoid-ky/introduction-to-machine-learning-and-deep-learning/blob/main/DATACLEAN%26AUG_Lightgbm%26Xgboost_find_best_threshold.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ===== Setup & Installs (Kaggle usually has most of these; safe to re-run) =====
!pip -q install kagglehub shap lightgbm xgboost

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from typing import List

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.inspection import permutation_importance

import shap
import warnings
warnings.filterwarnings('ignore')


In [2]:
# ถ้าในสภาพแวดล้อมคุณยังไม่มี ให้รันก่อน (Kaggle มักมีอยู่แล้ว)
!pip install lightgbm xgboost -q


In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("utkarshx27/american-companies-bankruptcy-prediction-dataset")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'american-companies-bankruptcy-prediction-dataset' dataset.
Path to dataset files: /kaggle/input/american-companies-bankruptcy-prediction-dataset


In [4]:
from kagglehub import KaggleDatasetAdapter, load_dataset
import pandas as pd

# Load the dataset
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# Set the CSV file path **inside** the dataset (adjust if needed)
# Explore the dataset directory printed below to confirm the file name.
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
file_path = "/kaggle/input/american-companies-bankruptcy-prediction-dataset/american_bankruptcy.csv"

df = pd.read_csv(file_path)

print("Loaded shape:", df.shape)
print("Columns:\n", list(df.columns))
df.head()

Loaded shape: (78682, 21)
Columns:
 ['company_name', 'status_label', 'year', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18']


Unnamed: 0,company_name,status_label,year,X1,X2,X3,X4,X5,X6,X7,...,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18
0,C_1,alive,1999,511.267,833.107,18.373,89.031,336.018,35.163,128.348,...,1024.333,740.998,180.447,70.658,191.226,163.816,201.026,1024.333,401.483,935.302
1,C_1,alive,2000,485.856,713.811,18.577,64.367,320.59,18.531,115.187,...,874.255,701.854,179.987,45.79,160.444,125.392,204.065,874.255,361.642,809.888
2,C_1,alive,2001,436.656,526.477,22.496,27.207,286.588,-58.939,77.528,...,638.721,710.199,217.699,4.711,112.244,150.464,139.603,638.721,399.964,611.514
3,C_1,alive,2002,396.412,496.747,27.172,30.745,259.954,-12.41,66.322,...,606.337,686.621,164.658,3.573,109.59,203.575,124.106,606.337,391.633,575.592
4,C_1,alive,2003,432.204,523.302,26.68,47.491,247.245,3.504,104.661,...,651.958,709.292,248.666,20.811,128.656,131.261,131.884,651.958,407.608,604.467


In [5]:

FEATURES = ["X8","X17","X3","X11","X10","X1","X6"]
TARGET   = "status_label"
COMPANY  = "company_name"   # ถ้าไม่มีคอลัมน์นี้ โค้ดจะ fallback อัตโนมัติ

# ---------- helpers ----------
def _check_cols(df, features, target, company_col=None):
    req = set(features+[target])
    miss = list(req - set(df.columns))
    if miss:
        raise ValueError(f"Missing columns: {miss}")
    if company_col is not None and company_col not in df.columns:
        company_col = None
    return company_col

def _clip_like(df_aug, df_ref, features):
    # คลิปค่าให้อยู่ในช่วง min/max ของทั้ง dataset เพื่อไม่ให้ synthetic หลุดโลก
    mins = df_ref[features].min()
    maxs = df_ref[features].max()
    df_aug[features] = df_aug[features].clip(lower=mins, upper=maxs, axis=1)
    return df_aug

def augment_failed(df, features=FEATURES, target=TARGET, company_col=COMPANY,
                   n_new=0, random_state=42):
    """สร้างแถว failed (target=1) เพิ่ม n_new แถว ด้วย 3 วิธี: uniform jitter, gaussian noise, mixup."""
    if n_new <= 0:
        return pd.DataFrame(columns=df.columns)

    rng = np.random.default_rng(random_state)
    df1 = df[df[target]==1].copy()
    if df1.empty:
        raise ValueError("No failed rows to augment.")

    # เตรียมสัดส่วน 3 วิธี แบ่งพอๆ กัน
    n1 = int(np.floor(n_new/3))
    n2 = int(np.floor(n_new/3))
    n3 = n_new - n1 - n2

    # 1) Uniform jitter ±0.5%
    take1 = df1.sample(n=max(n1,0), replace=True, random_state=random_state).copy()
    if not take1.empty:
        jitter = rng.uniform(-0.005, 0.005, size=(len(take1), len(features)))
        take1.loc[:, features] = take1[features].values * (1.0 + jitter)
        take1["aug_method"] = "uniform_0p5pct"

    # 2) Gaussian noise (σ = 0.5% ของ std ต่อคอลัมน์คลาส failed)
    take2 = df1.sample(n=max(n2,0), replace=True, random_state=random_state+1).copy()
    if not take2.empty:
        stds = df1[features].std().replace(0, 1e-12)
        noise = rng.normal(loc=0.0, scale=(0.005*stds.values), size=(len(take2), len(features)))
        take2.loc[:, features] = take2[features].values + noise
        take2["aug_method"] = "gaussian_0p5pct_std"

    # 3) Mixup (λ ~ Beta(0.4, 0.4))
    take3 = pd.DataFrame(columns=df.columns)
    if n3 > 0:
        a = df1.sample(n=n3, replace=True, random_state=random_state+2)
        b = df1.sample(n=n3, replace=True, random_state=random_state+3)
        lam = rng.beta(0.4, 0.4, size=n3).reshape(-1,1)
        mix_vals = lam * a[features].values + (1-lam) * b[features].values
        take3 = a.copy()
        take3.loc[:, features] = mix_vals
        take3["aug_method"] = "mixup_beta_0.4"

    df_aug = pd.concat([x for x in [take1, take2, take3] if not x.empty], ignore_index=True)
    df_aug[target] = 1

    # ทำให้ค่ามีเหตุผล (clip ตาม min/max ทั้ง df)
    df_aug = _clip_like(df_aug, df, features)

    # ถ้าอยากตั้งชื่อบริษัทให้รู้ว่าเป็น aug (เฉพาะมี company_name)
    if company_col is not None:
        df_aug[company_col] = df_aug[company_col].astype(str)  # คงบริษัทเดิมไว้ (สมจริงกว่า)
        # หรือจะเติม suffix:
        # df_aug[company_col] = df_aug[company_col].astype(str) + "_aug"

    return df_aug

def downsample_alive_diverse(df, keep_n, features=FEATURES, target=TARGET, company_col=COMPANY, random_state=42):
    """คัด 'alive'=0 ให้เหลือ keep_n โดยกระจาย 'บริษัท' ให้หลากหลายที่สุดเท่าที่ทำได้"""
    df0 = df[df[target]==0].copy()
    if company_col is None:
        # ไม่มี company_name → สุ่มตรงๆ
        return df0.sample(n=min(keep_n, len(df0)), replace=False, random_state=rs)

    rs = np.random.RandomState(random_state)

    # 1) เลือกอย่างน้อย 1 แถวต่่อบริษัท (ครอบคลุมบริษัทให้เยอะสุด)
    pick1 = df0.groupby(company_col, group_keys=False).apply(lambda g: g.sample(1, random_state=rs)).reset_index(drop=True)
    if len(pick1) >= keep_n:
        # บริษัทเยอะกว่าโควต้า → สุ่มเลือกบริษัทบางส่วน
        # เลือกบริษัทสุ่ม keep_n แห่ง แล้วหยิบ 1 แถวจากแต่ละบริษัท
        chosen_companies = rs.choice(pick1[company_col].unique(), size=keep_n, replace=False)
        kept = pick1[pick1[company_col].isin(chosen_companies)].copy()
        return kept.reset_index(drop=True)

    # 2) ถ้ายังไม่พอ ต้องเติมจากแถวนอกเหนือจากที่เลือกไปแล้ว
    rem_need = keep_n - len(pick1)
    remaining = df0.drop(index=pick1.index)
    if rem_need > len(remaining):
        rem_need = len(remaining)
    pick2 = remaining.sample(n=rem_need, replace=False, random_state=rs)
    kept = pd.concat([pick1, pick2], ignore_index=True)
    return kept.sample(frac=1.0, random_state=rs).reset_index(drop=True)

def balance_40_50(df, features=FEATURES, target=TARGET, company_col=COMPANY,
                  target_ratio=0.45, min_ratio=0.40, random_state=42):
    """
    ทำให้สัดส่วน failed อยู่ในช่วง ~40–50%.
    กลยุทธ์:
      1) คำนวณเพดานจำนวน alive ที่ 'ควรเก็บ' เพื่อให้ >= min_ratio โดยไม่ต้อง augment: N_new <= P*(1-min)/min
      2) downsample alive โดยกระจายบริษัท
      3) augment failed เพิ่มให้ถึง target_ratio ด้วย 3 วิธี noise
    """
    company_col = _check_cols(df, features, target, company_col)

    df = df.copy()
    df[target] = df[target].astype(int)

    P = int((df[target]==1).sum())
    N = int((df[target]==0).sum())

    if P == 0:
        raise ValueError("No failed rows in the dataset.")
    if N == 0:
        return df  # already all failed

    # 1) หาโควต้าจำนวน alive สูงสุดที่ยังทำให้ ratio >= min_ratio ถ้าไม่ augment
    #    เงื่อนไข: P / (P + N_keep) >= min_ratio  ->  N_keep <= P*(1-min)/min
    N_keep_cap = int(np.floor(P*(1 - min_ratio) / min_ratio))
    N_keep_cap = max(1, min(N_keep_cap, N))  # ป้องกันขอบ
    # เลือกจำนวน alive ที่เราจะ "เก็บ" จริง ๆ (อาจเท่ากับ cap)
    N_keep = N_keep_cap

    # 2) คัด alive ให้เหลือ N_keep โดยกระจายบริษัท
    kept_alive = downsample_alive_diverse(df, keep_n=N_keep, features=features, target=target,
                                          company_col=company_col, random_state=random_state)

    # 3) จำนวน failed ที่ "ต้องมี" เพื่อได้ target_ratio:
    #    r = (P + A) / (P + A + N_keep) = target_ratio  ->  A = r*N_keep/(1-r) - P
    A_need = int(np.ceil(target_ratio * N_keep / (1 - target_ratio) - P))
    A_need = max(0, A_need)

    df_failed = df[df[target]==1]
    df_aug = augment_failed(
        pd.concat([df_failed, kept_alive], ignore_index=True),  # ให้ clip ช่วงตามทั้งชุดใหม่
        features=features, target=target, company_col=company_col,
        n_new=A_need, random_state=random_state
    )

    # รวมชุดใหม่: failed เดิม + failed augment + alive ที่คัดไว้
    new_df = pd.concat([df_failed, df_aug, kept_alive], ignore_index=True)
    new_df = new_df.sample(frac=1.0, random_state=random_state).reset_index(drop=True)

    # รายงานผล
    P2 = int((new_df[target]==1).sum()); N2 = int((new_df[target]==0).sum())
    ratio = P2 / (P2 + N2)
    print(f"Before: failed={P}, alive={N}, ratio={P/(P+N):.3f}")
    print(f"Keep alive = {N_keep}  |  Aug failed = {A_need}")
    print(f"After:  failed={P2}, alive={N2}, ratio={ratio:.3f}  (~{ratio*100:.1f}%)")
    if company_col is not None:
        uniq_alive = new_df.loc[new_df[target]==0, company_col].nunique()
        print(f"Unique companies among KEPT alive: {uniq_alive}")
    return new_df

In [6]:

# 0) ตรวจว่าคอลัมน์ครบไหม
missing = [c for c in FEATURES+[TARGET] if c not in df.columns]
if missing:
    raise ValueError(f"❌ Missing columns: {missing}")

# 1) ฟังก์ชัน normalize label ให้เป็น 0/1 แบบทนทาน
def normalize_status(x):
    if pd.isna(x):
        return np.nan
    t = str(x).strip().lower()
    # ตัวเลขที่มาเป็นสตริง หรือ float 0.0/1.0
    if t in {"0","1"}:
        return int(t)
    try:
        # กรณีเป็น 0.0/1.0 จริง ๆ
        f = float(t)
        if f in (0.0, 1.0):
            return int(f)
    except:
        pass
    # แม็พคำยอดฮิต
    direct = {
        "alive": 0, "non-bankrupt": 0, "nonbankrupt": 0, "healthy": 0, "normal": 0,
        "failed": 1, "fail": 1, "bankrupt": 1, "bankruptcy": 1, "went_bankrupt": 1,
        "yes": 1, "y": 1, "true": 1,
        "no": 0, "n": 0, "false": 0
    }
    if t in direct:
        return direct[t]
    # สุดท้าย ถ้าระบุไม่ถูก ให้คืน NaN เพื่อตรวจสอบ
    return np.nan

y_norm = df[TARGET].apply(normalize_status)

# 2) เช็คค่าที่แปลงไม่ได้ (จะเป็น NaN)
bad_mask = y_norm.isna()
if bad_mask.any():
    print("⚠️ พบ label ที่ไม่รู้จัก (ตัวอย่าง top 20):")
    print(df.loc[bad_mask, TARGET].value_counts().head(20))
    # ทางเลือก: ตัดแถวที่ label ไม่ชัดเจนทิ้งไปก่อน
    df = df.loc[~bad_mask].copy()
    y_norm = y_norm.loc[~bad_mask]

# 3) เขียนกลับเป็นตัวเลข 0/1
df[TARGET] = y_norm.astype(int)


balanced = balance_40_50(
    df=df,
    features=["X8","X17","X3","X11","X10","X1","X6"],
    target="status_label",
    company_col="company_name",   # ถ้าไม่มีคอลัมน์นี้ ปล่อยไว้หรือส่ง None
    target_ratio=0.5,            # ปรับเป็น 0.50 หากอยาก 50/50
    min_ratio=0.40,
    random_state=42
)

Before: failed=5220, alive=73462, ratio=0.066
Keep alive = 7830  |  Aug failed = 2610
After:  failed=7830, alive=7830, ratio=0.500  (~50.0%)
Unique companies among KEPT alive: 7830


In [7]:
df = balanced.copy()
df.head()

Unnamed: 0,company_name,status_label,year,X1,X2,X3,X4,X5,X6,X7,...,X10,X11,X12,X13,X14,X15,X16,X17,X18,aug_method
0,C_3082,1,1999,6.54,0.566,0.016,-7.445,0.118,-6.923,0.077,...,12.812,0.267,-7.461,1.223,0.74,-6.647,1.789,1.007,9.234,
1,C_281,1,2000,262.463,522.847,9.769,26.184,163.206,-1.675,95.752,...,418.851,161.135,16.415,221.662,107.143,105.775,744.509,288.639,718.325,
2,C_1883,1,2005,349.538,1141.105,27.755,67.836,303.8,26.094,31.803,...,498.141,6.815,40.081,448.237,135.518,207.643,1589.342,158.546,1521.506,
3,C_3343,1,2001,36.96,51.597,4.069,6.175,4.547,-22.205,13.139,...,76.927,0.0,2.106,30.041,34.435,2.398,81.638,40.435,75.463,
4,C_3311,0,1999,29.066,72.827,14.347,14.543,3.672,-2.846,20.629,...,111.408,45.502,0.196,47.454,26.163,8.736,120.281,73.546,105.738,


In [8]:
# ==============================
# Load your DataFrame (df)
# ==============================
try:
    df  # noqa: F821
    print("✅ Found existing `df`.")
except NameError:
    import pandas as pd
    print("ℹ️ No existing `df` found. Creating a tiny placeholder. Replace with your CSV load.")
    df = pd.DataFrame({
        "X8":[0.1,0.2,0.3,0.4],
        "X17":[1,2,3,4],
        "X3":[5,6,7,8],
        "X11":[0,1,0,1],
        "X15":[10,11,12,13],
        "X1":[2,3,4,5],
        "X6":[9,8,7,6],
        "status_label":["alive","failed","alive","failed"],
    })
print("df shape:", df.shape)


✅ Found existing `df`.
df shape: (15660, 22)


In [9]:
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, roc_auc_score, average_precision_score

def find_best_threshold(y_true, y_proba, metric="f1"):
    y_true = np.asarray(y_true).astype(int)
    thresholds = np.linspace(0.01, 0.99, 99)
    best_t, best_val = 0.5, -1.0
    for t in thresholds:
        y_hat = (y_proba >= t).astype(int)
        p, r, f1, _ = precision_recall_fscore_support(y_true, y_hat, average="binary", zero_division=0)
        val = f1 if metric == "f1" else r
        if val > best_val:
            best_val, best_t = val, t
    return float(best_t), float(best_val)

def evaluate_at_threshold(y_true, y_proba, threshold):
    y_pred = (y_proba >= threshold).astype(int)
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    acc = (tp+tn)/cm.sum()
    prec = tp/(tp+fp) if (tp+fp) else 0.0
    rec  = tp/(tp+fn) if (tp+fn) else 0.0
    f1   = 2*prec*rec/(prec+rec) if (prec+rec) else 0.0
    return {"threshold": float(threshold), "cm": cm.tolist(), "accuracy": acc, "precision": prec, "recall": rec, "f1": f1}


In [10]:
# ===== 0) เตรียม X, y (map target) =====
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

FEATURES = ["X8","X17","X3","X11","X10","X1","X6"]
TARGET   = "status_label"

# ตรวจคอลัมน์
missing = [c for c in FEATURES+[TARGET] if c not in df.columns]
if missing:
    raise ValueError(f"❌ Missing columns: {missing}")

# y: alive->0, failed->1
y = df[TARGET]
if y.dtype == object:
    y = y.astype(str).str.strip().str.lower().map({"alive":0, "failed":1}).astype(int)
else:
    y = pd.Series(y).astype(int)

X = df[FEATURES].copy()

# ===== 1) split: train/valid/test =====
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_trn, X_val, y_trn, y_val = train_test_split(
    X_tr, y_tr, test_size=0.2, random_state=42, stratify=y_tr
)

# ===== 2) scale_pos_weight จาก train แท้ ๆ =====
pos = int((y_trn == 1).sum())
neg = int((y_trn == 0).sum())
assert pos > 0, "No positive samples in training set."
scale_pos_weight = neg / pos

print("Shapes:",
      "\n  X_trn:", X_trn.shape, " X_val:", X_val.shape, " X_te:", X_te.shape,
      "\nClass ratio (train):", dict(pd.Series(y_trn).value_counts(normalize=True).round(3)))
print(f"scale_pos_weight = {scale_pos_weight:.2f}")


Shapes: 
  X_trn: (10022, 7)  X_val: (2506, 7)  X_te: (3132, 7) 
Class ratio (train): {1: np.float64(0.5), 0: np.float64(0.5)}
scale_pos_weight = 1.00


In [11]:
# ตัวอย่าง time-based split (ปรับช่วงปีตามจริง)
train_mask = df["year"] <= 2011
val_mask   = (df["year"] >= 2012) & (df["year"] <= 2014)
test_mask  = df["year"] >= 2015

X_trn, y_trn = X[train_mask], y[train_mask]
X_val, y_val = X[val_mask],   y[val_mask]
X_te,  y_te  = X[test_mask],  y[test_mask]

pos = int((y_trn == 1).sum()); neg = int((y_trn == 0).sum())
scale_pos_weight = neg / pos
print("time-split OK | spw=", round(scale_pos_weight,2))


time-split OK | spw= 0.83


In [12]:
import xgboost as xgb

def fit_xgb_compat(X_trn, y_trn, X_val, y_val, *,
                   scale_pos_weight,
                   learning_rate=0.03, max_depth=7, min_child_weight=1,
                   subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
                   n_estimators=10000, early_stopping_rounds=300, random_state=42):
    params = dict(
        objective="binary:logistic",
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_lambda=reg_lambda,
        random_state=random_state,
        tree_method="hist",
        scale_pos_weight=scale_pos_weight,
        eval_metric="aucpr"
    )
    # try callbacks
    try:
        model = xgb.XGBClassifier(**params)
        model.fit(X_trn, y_trn, eval_set=[(X_val, y_val)], verbose=100,
                  callbacks=[xgb.callback.EarlyStopping(rounds=early_stopping_rounds, save_best=True)])
        return model
    except TypeError:
        pass
    # try early_stopping_rounds=
    try:
        model = xgb.XGBClassifier(**params)
        model.fit(X_trn, y_trn, eval_set=[(X_val, y_val)], verbose=100,
                  early_stopping_rounds=early_stopping_rounds)
        return model
    except TypeError:
        pass
    # fallback xgb.train
    dtrn = xgb.DMatrix(X_trn, label=y_trn)
    dval = xgb.DMatrix(X_val, label=y_val)
    train_params = dict(
        objective="binary:logistic",
        eta=learning_rate,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        lambda_=reg_lambda,
        tree_method="hist",
        scale_pos_weight=scale_pos_weight,
        eval_metric="aucpr",
        seed=random_state,
    )
    booster = xgb.train(train_params, dtrn, num_boost_round=n_estimators,
                        evals=[(dtrn,"train"),(dval,"valid")],
                        early_stopping_rounds=early_stopping_rounds, verbose_eval=100)
    class BoosterWrapper:
        def __init__(self, booster): self.booster = booster
        def predict_proba(self, X):
            dm = xgb.DMatrix(X)
            best_it = getattr(self.booster, "best_iteration", None)
            p = self.booster.predict(dm, iteration_range=(0, best_it+1)) if best_it is not None else self.booster.predict(dm)
            import numpy as np
            return np.vstack([1-p, p]).T
    return BoosterWrapper(booster)


In [13]:
import lightgbm as lgb

def fit_lgbm(X_trn, y_trn, X_val, y_val, *,
             scale_pos_weight,
             num_leaves=63, min_child_samples=100,
             learning_rate=0.03, subsample=0.8, colsample_bytree=0.8,
             reg_lambda=1.0, n_estimators=10000, early_stopping_rounds=300, random_state=42):
    lgbm = lgb.LGBMClassifier(
        objective="binary",
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        num_leaves=num_leaves,
        min_child_samples=min_child_samples,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_lambda=reg_lambda,
        random_state=random_state,
        scale_pos_weight=scale_pos_weight,
        first_metric_only=True,
        force_col_wise=True
    )
    lgbm.fit(
        X_trn, y_trn,
        eval_set=[(X_val, y_val)],
        eval_metric="average_precision",
        callbacks=[lgb.early_stopping(early_stopping_rounds), lgb.log_evaluation(100)]
    )
    return lgbm


In [16]:
# ==== PRELUDE: make sure we have X_trn, y_trn, X_val, y_val + spw_base ====
import pandas as pd
from sklearn.model_selection import train_test_split

FEATURES = ["X8","X17","X3","X11","X10","X1","X6"]
TARGET   = "status_label"

# 1) ensure we have a train/val split
if 'y_trn' not in globals() or 'y_val' not in globals():
    base_df = train_df if 'train_df' in globals() else df  # prefer your pre-made train_df
    X_full = base_df[FEATURES].copy()
    y_full = pd.Series(base_df[TARGET]).astype(int)
    X_trn, X_val, y_trn, y_val = train_test_split(
        X_full, y_full, test_size=0.2, random_state=42, stratify=y_full
    )

# 2) compute spw_base from y_trn (neg/pos)
y_trn_s = pd.Series(y_trn).astype(int)
pos = int((y_trn_s == 1).sum())
neg = int((y_trn_s == 0).sum())
if pos == 0:
    raise ValueError("No positive samples in y_trn. Check your split / labeling.")
spw_base = neg / pos
print(f"spw_base = {spw_base:.2f}  (neg={neg}, pos={pos})")


spw_base = 0.83  (neg=5559, pos=6660)


In [17]:
from itertools import product

# กริดเล็กๆ พอให้วิ่งเร็ว
spw_mults_lgb = [0.5, 1.0, 2.0, 4.0]
leaves_grid   = [31, 63, 127]
mcs_grid      = [50, 200]          # min_child_samples
lr_grid       = [0.02, 0.05]
col_grid      = [0.7, 0.9]
sub_grid      = [0.7, 0.9]

spw_mults_xgb = [0.5, 1.0, 2.0, 4.0]
md_grid       = [4, 7]             # max_depth
mcw_grid      = [1, 5]             # min_child_weight
lr_x_grid     = [0.02, 0.05]
col_x_grid    = [0.7, 0.9]
sub_x_grid    = [0.7, 0.9]

results = []

# ---- LightGBM sweep ----
for spwm, nl, mcs, lr, col, sub in product(spw_mults_lgb, leaves_grid, mcs_grid, lr_grid, col_grid, sub_grid):
    try:
        model = fit_lgbm(
            X_trn, y_trn, X_val, y_val,
            scale_pos_weight=spw_base*spwm,
            num_leaves=nl, min_child_samples=mcs,
            learning_rate=lr, colsample_bytree=col, subsample=sub,
            early_stopping_rounds=200
        )
        proba_val = model.predict_proba(X_val)[:,1]
        t_val, f1_val = find_best_threshold(y_val, proba_val, metric="f1")
        auprc_val = average_precision_score(y_val, proba_val)
        results.append({
            "model":"LGBM","params":{"spw_mult":spwm,"num_leaves":nl,"min_child_samples":mcs,"lr":lr,"col":col,"sub":sub},
            "t_val":t_val, "f1_val":f1_val, "auprc_val":auprc_val, "estimator":model
        })
    except Exception as e:
        print("LGBM fail:", e)

# ---- XGBoost sweep ----
for spwm, md, mcw, lr, col, sub in product(spw_mults_xgb, md_grid, mcw_grid, lr_x_grid, col_x_grid, sub_x_grid):
    try:
        model = fit_xgb_compat(
            X_trn, y_trn, X_val, y_val,
            scale_pos_weight=spw_base*spwm,
            learning_rate=lr, max_depth=md, min_child_weight=mcw,
            colsample_bytree=col, subsample=sub,
            early_stopping_rounds=200
        )
        proba_val = model.predict_proba(X_val)[:,1]
        t_val, f1_val = find_best_threshold(y_val, proba_val, metric="f1")
        auprc_val = average_precision_score(y_val, proba_val)
        results.append({
            "model":"XGB","params":{"spw_mult":spwm,"max_depth":md,"min_child_weight":mcw,"lr":lr,"col":col,"sub":sub},
            "t_val":t_val, "f1_val":f1_val, "auprc_val":auprc_val, "estimator":model
        })
    except Exception as e:
        print("XGB fail:", e)

# ตารางสรุป (เรียงตาม F1 บน validation)
df_res = pd.DataFrame(results).sort_values(["f1_val","auprc_val"], ascending=False)
print(df_res[["model","params","t_val","f1_val","auprc_val"]].head(10))
best = df_res.iloc[0]
best_model = best["estimator"]
best_t = float(best["t_val"])
print("\nBest candidate:", best["model"], best["params"], "t*=", best_t, "F1_val=", round(best["f1_val"],4))


[1;30;43mเอาต์พุตของการสตรีมมีการตัดเหลือเพียง 5000 บรรทัดสุดท้าย[0m
[200]	valid_0's average_precision: 0.737659	valid_0's binary_logloss: 0.658145
Early stopping, best iteration is:
[67]	valid_0's average_precision: 0.745156	valid_0's binary_logloss: 0.682647
[LightGBM] [Info] Number of positive: 6660, number of negative: 5559
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 12219, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.545053 -> initscore=0.180701
[LightGBM] [Info] Start training from score 0.180701
Training until validation scores don't improve for 200 rounds
[100]	valid_0's average_precision: 0.736452	valid_0's binary_logloss: 0.673232
[200]	valid_0's average_precision: 0.737346	valid_0's binary_logloss: 0.658287
Early stopping, best iteration is:
[34]	valid_0's average_precision: 0.741173	valid_0's binary_logloss: 0.68577
[LightGBM] [Info] Number of positive: 6660, number of negative: 5559
[LightGBM

In [None]:
proba_te = best_model.predict_proba(X_te)[:,1]
print("Test ROC AUC:", roc_auc_score(y_te, proba_te).round(4),
      "| AUPRC:", average_precision_score(y_te, proba_te).round(4))
print("Test @best_t:", evaluate_at_threshold(y_te, proba_te, best_t))


Test ROC AUC: 0.8297 | AUPRC: 0.2294
Test @best_t: {'threshold': 0.81, 'cm': [[10852, 1143], [131, 156]], 'accuracy': np.float64(0.8962709656407751), 'precision': np.float64(0.12009237875288684), 'recall': np.float64(0.5435540069686411), 'f1': np.float64(0.19672131147540983)}


In [None]:
# เลือก LGBM และ XGB ที่ดีที่สุดจาก df_res อย่างละหนึ่ง
best_lgb = next(r for r in results if r["model"]=="LGBM" and r["f1_val"]==df_res[df_res.model=="LGBM"]["f1_val"].max())
best_xgb = next(r for r in results if r["model"]=="XGB"  and r["f1_val"]==df_res[df_res.model=="XGB"]["f1_val"].max())

p_val_blend = 0.5*best_lgb["estimator"].predict_proba(X_val)[:,1] + 0.5*best_xgb["estimator"].predict_proba(X_val)[:,1]
t_blend, f1_blend = find_best_threshold(y_val, p_val_blend, metric="f1")
print("Blend val: F1=", round(f1_blend,4), "t*=", t_blend)

p_te_blend = 0.5*best_lgb["estimator"].predict_proba(X_te)[:,1] + 0.5*best_xgb["estimator"].predict_proba(X_te)[:,1]
print("Blend test:", evaluate_at_threshold(y_te, p_te_blend, t_blend))


Blend val: F1= 0.2999 t*= 0.46
Blend test: {'threshold': 0.46, 'cm': [[10717, 1278], [123, 164]], 'accuracy': np.float64(0.8859306301905228), 'precision': np.float64(0.11373092926490985), 'recall': np.float64(0.5714285714285714), 'f1': np.float64(0.189705031810295)}
