<a href="https://colab.research.google.com/github/nullvoid-ky/introduction-to-machine-learning-and-deep-learning/blob/main/13_SMOTE_RF_with_tqdm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bankruptcy Prediction – Run models separately (per-cell)

**แนวคิด:** รันแต่ละโมเดลแยก cell เพื่อช่วยดีบัก/ควบคุมหน่วยความจำได้ง่ายขึ้น และใช้ **train/test split ร่วมกัน**

ลำดับรันแนะนำ:
1) Load `df` (หรือข้ามถ้าคุณมี `df` อยู่แล้ว)
2) Feature selection (X,y) + map target (alive→0, failed→1)
3) Import & class definitions
4) สร้าง Controller
5) **Split & Share** (เตรียม `shared_split`)
6) รันแต่ละโมเดล (LogReg / DT / RF / NB / SVM / Perceptron / MLP / PCA+RF / PCA+SVM / KMeans / Agglo)
7) Benchmark table, ROC curves, Loss curve


In [2]:
# ===== Setup & Installs (Kaggle usually has most of these; safe to re-run) =====
!pip -q install kagglehub shap lightgbm xgboost

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from typing import List
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.datasets import make_classification
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')


In [5]:
# The setup cell only pip-installs kagglehub; it never imports it. Import it
# here so this cell does not raise NameError on a fresh kernel.
import kagglehub

# Download the latest version of the dataset and keep the directory that
# kagglehub reports for it.
path = kagglehub.dataset_download("utkarshx27/american-companies-bankruptcy-prediction-dataset")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'american-companies-bankruptcy-prediction-dataset' dataset.
Path to dataset files: /kaggle/input/american-companies-bankruptcy-prediction-dataset


In [6]:
from kagglehub import KaggleDatasetAdapter, load_dataset

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# Locate the CSV file **inside** the dataset directory (adjust CSV_NAME if the
# file is renamed). If the file is not found, the error below lists what the
# directory actually contains.
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
import os

CSV_NAME = "american_bankruptcy.csv"

# Prefer the directory returned by the download cell (`path`) so the notebook
# is not tied to one machine layout; when that cell was skipped, fall back to
# the Colab/Kaggle mount point this notebook originally hard-coded.
dataset_dir = globals().get(
    "path", "/kaggle/input/american-companies-bankruptcy-prediction-dataset"
)
file_path = os.path.join(dataset_dir, CSV_NAME)

if not os.path.isfile(file_path):
    if os.path.isdir(dataset_dir):
        available = sorted(os.listdir(dataset_dir))
    else:
        available = "directory missing"
    raise FileNotFoundError(
        f"{file_path!r} not found; contents of {dataset_dir!r}: {available}"
    )

df = pd.read_csv(file_path)

print("Loaded shape:", df.shape)
print("Columns:\n", list(df.columns))
df.head()

Loaded shape: (78682, 21)
Columns:
 ['company_name', 'status_label', 'year', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18']


Unnamed: 0,company_name,status_label,year,X1,X2,X3,X4,X5,X6,X7,...,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18
0,C_1,alive,1999,511.267,833.107,18.373,89.031,336.018,35.163,128.348,...,1024.333,740.998,180.447,70.658,191.226,163.816,201.026,1024.333,401.483,935.302
1,C_1,alive,2000,485.856,713.811,18.577,64.367,320.59,18.531,115.187,...,874.255,701.854,179.987,45.79,160.444,125.392,204.065,874.255,361.642,809.888
2,C_1,alive,2001,436.656,526.477,22.496,27.207,286.588,-58.939,77.528,...,638.721,710.199,217.699,4.711,112.244,150.464,139.603,638.721,399.964,611.514
3,C_1,alive,2002,396.412,496.747,27.172,30.745,259.954,-12.41,66.322,...,606.337,686.621,164.658,3.573,109.59,203.575,124.106,606.337,391.633,575.592
4,C_1,alive,2003,432.204,523.302,26.68,47.491,247.245,3.504,104.661,...,651.958,709.292,248.666,20.811,128.656,131.261,131.884,651.958,407.608,604.467


In [7]:

# Earlier, smaller feature subset (kept for reference only):
# FEATURES_OLD = ["X8","X17","X3","X11","X10","X1","X6"]

# Model inputs: columns X1..X18 with X10 left out, followed by the `year` column.
FEATURES = [f"X{idx}" for idx in range(1, 19) if idx != 10] + ["year"]

# Name of the label column.
TARGET = "status_label"

# Name of the company identifier column.
# NOTE(review): the original comment said the code falls back automatically
# when this column is absent; no such fallback is visible here, so confirm.
COMPANY = "company_name"


In [8]:

# 0) Make sure every feature column and the target column exist in `df`
#    before any further processing.
missing = [
    col
    for col in (*FEATURES, TARGET)
    if col not in df.columns
]
if len(missing) > 0:
    raise ValueError(f"❌ Missing columns: {missing}")

# 1) Robustly normalise a raw status label to 0/1.

# Known text labels (already stripped and lower-cased) mapped to their class id.
# Built once here instead of on every call of normalize_status.
_STATUS_LABEL_TO_CLASS = {
    "alive": 0, "non-bankrupt": 0, "nonbankrupt": 0, "healthy": 0, "normal": 0,
    "failed": 1, "fail": 1, "bankrupt": 1, "bankruptcy": 1, "went_bankrupt": 1,
    "yes": 1, "y": 1, "true": 1,
    "no": 0, "n": 0, "false": 0
}


def normalize_status(x):
    """Map one raw status value to 0 or 1.

    The value is converted to text, stripped and lower-cased, then resolved in
    this order:

    1. the strings "0" / "1";
    2. any text that parses as the float 0.0 or 1.0 (e.g. "1.0");
    3. a known keyword such as "alive" (0) or "failed" (1).

    Parameters
    ----------
    x : scalar
        Raw label value (string, number, bool, or a missing value).

    Returns
    -------
    int or float
        0 or 1 when the label is recognised; ``np.nan`` when ``x`` is missing
        or the label is unknown, so the caller can inspect those rows.
    """
    if pd.isna(x):
        return np.nan

    t = str(x).strip().lower()

    # Integers that arrive as text.
    if t in {"0", "1"}:
        return int(t)

    # Numeric text such as "0.0" / "1.0". Only ValueError (non-numeric text) is
    # expected from float(); anything else should surface, not be swallowed.
    try:
        f = float(t)
    except ValueError:
        pass
    else:
        if f in (0.0, 1.0):
            return int(f)

    # Common keywords; unknown labels become NaN for later inspection.
    return _STATUS_LABEL_TO_CLASS.get(t, np.nan)

# Convert every raw label with normalize_status; unrecognised labels become NaN.
y_norm = df[TARGET].apply(normalize_status)

# 2) Check for labels that could not be converted (they show up as NaN).
bad_mask = y_norm.isna()
if bad_mask.any():
    print("⚠️ พบ label ที่ไม่รู้จัก (ตัวอย่าง top 20):")
    print(df.loc[bad_mask, TARGET].value_counts().head(20))
    # Option taken here: drop the rows whose label is unclear.
    # Note that this rebinds `df` to a filtered copy.
    df = df.loc[~bad_mask].copy()
    y_norm = y_norm.loc[~bad_mask]

# 3) Write the labels back as integers 0/1 (overwrites the original text column).
#    Re-running this cell still works because normalize_status also accepts 0/1.
df[TARGET] = y_norm.astype(int)

In [9]:
# ==============================
# Load your DataFrame (df)
# ==============================
# Reuse an already-loaded `df`; otherwise build a four-row placeholder frame.
try:
    df  # noqa: F821  (bare lookup: raises NameError when `df` is undefined)
except NameError:
    import pandas as pd
    print("ℹ️ No existing `df` found. Creating a tiny placeholder. Replace with your CSV load.")
    df = pd.DataFrame(dict(
        X8=[0.1, 0.2, 0.3, 0.4],
        X17=[1, 2, 3, 4],
        X3=[5, 6, 7, 8],
        X11=[0, 1, 0, 1],
        X15=[10, 11, 12, 13],
        X1=[2, 3, 4, 5],
        X6=[9, 8, 7, 6],
        status_label=["alive", "failed", "alive", "failed"],
    ))
else:
    print("✅ Found existing `df`.")

print("df shape:", df.shape)


✅ Found existing `df`.
df shape: (78682, 21)


In [10]:
# ==============================
# Feature selection (X, y) + map target
# ==============================
import numpy as np
import pandas as pd

# FEATURES is reused from the configuration cell as-is (the former
# `FEATURES = FEATURES` line was a no-op and has been dropped).
TARGET   = "status_label"

# Fail early with a clear message if `df` lacks any required column.
missing = [c for c in FEATURES + [TARGET] if c not in df.columns]
if missing:
    raise ValueError(f"❌ Missing columns in df: {missing}")

# Make sure the target column is integer type
df[TARGET] = df[TARGET].astype(int)

# Independent copies, so later edits to X / y do not write through to `df`.
X = df[FEATURES].copy()
y = df[TARGET].copy()

print("✅ X,y ready.")
# `.to_dict()` yields plain Python ints, so the summary prints as
# {0: 73462, ...} instead of {0: np.int64(73462), ...}.
print("X shape:", X.shape, "| y counts:", y.value_counts().to_dict())

✅ X,y ready.
X shape: (78682, 18) | y counts: {0: np.int64(73462), 1: np.int64(5220)}


In [11]:
# ==============================
# Imports
# ==============================
!pip -q install imbalanced-learn
import warnings
warnings.filterwarnings("ignore")


import matplotlib.pyplot as plt
from abc import ABC, abstractmethod
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA

from sklearn.metrics import (
    accuracy_score, confusion_matrix, roc_auc_score, f1_score,
    precision_score, recall_score, roc_curve, auc
)

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Select matplotlib's "ggplot" style sheet for figures drawn after this point.
plt.style.use("ggplot")
# Shared seed constant.
# NOTE(review): the cross-validation cells below pass a literal 42 rather than
# this name; confirm whether they are meant to share it.
RANDOM_STATE = 42


In [16]:
# !pip install imbalanced-learn tqdm
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

# =============================
# PREPARE DATA
# =============================
# Assume df already loaded
X = df[FEATURES].copy()
y = df['status_label'].copy()

# -----------------------------
# Pipeline: scale -> SMOTE oversampling -> random forest
# -----------------------------
rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42,
)
pipe = ImbPipeline(steps=[
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42, k_neighbors=4, sampling_strategy='auto')),
    ('clf', rf_clf),
])

# -----------------------------
# Stratified cross-validation with a progress bar
# -----------------------------
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# The bar length is taken from the splitter so the two cannot drift apart.
fold_iter = tqdm(skf.split(X, y), total=skf.get_n_splits(), desc="Training SMOTE-RF folds")

f1_scores = []
for fold, (train_idx, test_idx) in enumerate(fold_iter):
    # Row-position based split for this fold.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # Refit the whole pipeline on the training rows, then score the held-out rows.
    y_pred = pipe.fit(X_train, y_train).predict(X_test)

    f1 = f1_score(y_test, y_pred, average='macro')
    f1_scores.append(f1)

    print(f"\n===== Fold {fold+1} =====")
    print(classification_report(y_test, y_pred, digits=4))
    print(f"F1 (macro): {f1:.4f}\n")

# -----------------------------
# Summary over all folds
# -----------------------------
print("================ FINAL CV SUMMARY ================")
print(f"F1 Scores (macro): {np.round(f1_scores, 4)}")
print(f"Mean F1: {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")


Training SMOTE-RF folds:  20%|██        | 1/5 [00:50<03:23, 50.92s/it]


===== Fold 1 =====
              precision    recall  f1-score   support

           0     0.9540    0.9305    0.9421     14693
           1     0.2738    0.3688    0.3143      1044

    accuracy                         0.8932     15737
   macro avg     0.6139    0.6496    0.6282     15737
weighted avg     0.9089    0.8932    0.9005     15737

F1 (macro): 0.6282



Training SMOTE-RF folds:  40%|████      | 2/5 [01:40<02:30, 50.24s/it]


===== Fold 2 =====
              precision    recall  f1-score   support

           0     0.9531    0.9321    0.9425     14693
           1     0.2707    0.3544    0.3069      1044

    accuracy                         0.8938     15737
   macro avg     0.6119    0.6433    0.6247     15737
weighted avg     0.9078    0.8938    0.9003     15737

F1 (macro): 0.6247



Training SMOTE-RF folds:  60%|██████    | 3/5 [02:28<01:38, 49.35s/it]


===== Fold 3 =====
              precision    recall  f1-score   support

           0     0.9538    0.9334    0.9435     14692
           1     0.2798    0.3640    0.3164      1044

    accuracy                         0.8957     15736
   macro avg     0.6168    0.6487    0.6300     15736
weighted avg     0.9091    0.8957    0.9019     15736

F1 (macro): 0.6300



Training SMOTE-RF folds:  80%|████████  | 4/5 [03:18<00:49, 49.45s/it]


===== Fold 4 =====
              precision    recall  f1-score   support

           0     0.9521    0.9347    0.9433     14692
           1     0.2691    0.3381    0.2997      1044

    accuracy                         0.8951     15736
   macro avg     0.6106    0.6364    0.6215     15736
weighted avg     0.9068    0.8951    0.9006     15736

F1 (macro): 0.6215



Training SMOTE-RF folds: 100%|██████████| 5/5 [04:06<00:00, 49.21s/it]


===== Fold 5 =====
              precision    recall  f1-score   support

           0     0.9532    0.9368    0.9449     14692
           1     0.2840    0.3525    0.3145      1044

    accuracy                         0.8981     15736
   macro avg     0.6186    0.6447    0.6297     15736
weighted avg     0.9088    0.8981    0.9031     15736

F1 (macro): 0.6297

F1 Scores (macro): [0.6282 0.6247 0.63   0.6215 0.6297]
Mean F1: 0.6268 ± 0.0033





In [None]:
# !pip install imbalanced-learn tqdm
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

# =============================
# PREPARE DATA
# =============================
# Assume df already loaded
X = df[FEATURES].copy()
y = df['status_label'].copy()

# =============================
# DEFINE PIPELINE
# =============================
# Scaling, SMOTE oversampling and the random forest live in one imblearn
# pipeline that is refitted on each fold's training rows.
pipe = ImbPipeline(steps=[
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42, k_neighbors=5, sampling_strategy='auto')),
    ('clf', RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        min_samples_leaf=3,
        n_jobs=-1,
        random_state=42
    ))
])

# =============================
# STRATIFIED CV WITH PROGRESS BAR
# =============================
# NOTE(review): rows are split individually, while `df` also carries a
# `company_name` column with several yearly rows per company. The same company
# can therefore land in both train and test folds; confirm this is intended.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

f1_scores = []
# Take the bar length from the splitter itself. The previous hard-coded
# total=5 did not match n_splits=10, so the bar overran ("6it") after fold 5.
for fold, (train_idx, test_idx) in enumerate(
    tqdm(skf.split(X, y), total=skf.get_n_splits(), desc="Training SMOTE-RF folds")
):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Macro F1 weights both classes equally regardless of their support.
    f1 = f1_score(y_test, y_pred, average='macro')
    f1_scores.append(f1)

    print(f"\n===== Fold {fold+1} =====")
    print(classification_report(y_test, y_pred, digits=4))
    print(f"F1 (macro): {f1:.4f}\n")

# =============================
# FINAL SUMMARY
# =============================
print("================ FINAL CV SUMMARY ================")
print(f"F1 Scores (macro): {np.round(f1_scores, 4)}")
print(f"Mean F1: {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")


Training SMOTE-RF folds:  20%|██        | 1/5 [01:52<07:29, 112.29s/it]


===== Fold 1 =====
              precision    recall  f1-score   support

           0     0.9536    0.9151    0.9339      7347
           1     0.2381    0.3736    0.2908       522

    accuracy                         0.8791      7869
   macro avg     0.5959    0.6443    0.6124      7869
weighted avg     0.9062    0.8791    0.8913      7869

F1 (macro): 0.6124



Training SMOTE-RF folds:  40%|████      | 2/5 [03:45<05:39, 113.08s/it]


===== Fold 2 =====
              precision    recall  f1-score   support

           0     0.9580    0.9196    0.9384      7347
           1     0.2766    0.4330    0.3376       522

    accuracy                         0.8873      7869
   macro avg     0.6173    0.6763    0.6380      7869
weighted avg     0.9128    0.8873    0.8985      7869

F1 (macro): 0.6380



Training SMOTE-RF folds:  60%|██████    | 3/5 [05:36<03:43, 111.99s/it]


===== Fold 3 =====
              precision    recall  f1-score   support

           0     0.9560    0.9179    0.9366      7346
           1     0.2601    0.4061    0.3171       522

    accuracy                         0.8840      7868
   macro avg     0.6081    0.6620    0.6269      7868
weighted avg     0.9099    0.8840    0.8955      7868

F1 (macro): 0.6269



Training SMOTE-RF folds:  80%|████████  | 4/5 [07:26<01:51, 111.31s/it]


===== Fold 4 =====
              precision    recall  f1-score   support

           0     0.9545    0.9251    0.9396      7346
           1     0.2647    0.3793    0.3118       522

    accuracy                         0.8889      7868
   macro avg     0.6096    0.6522    0.6257      7868
weighted avg     0.9087    0.8889    0.8979      7868

F1 (macro): 0.6257



Training SMOTE-RF folds: 100%|██████████| 5/5 [09:16<00:00, 110.56s/it]


===== Fold 5 =====
              precision    recall  f1-score   support

           0     0.9571    0.9255    0.9410      7346
           1     0.2840    0.4157    0.3375       522

    accuracy                         0.8917      7868
   macro avg     0.6205    0.6706    0.6393      7868
weighted avg     0.9124    0.8917    0.9010      7868

F1 (macro): 0.6393



Training SMOTE-RF folds: 6it [11:06, 110.61s/it]                       


===== Fold 6 =====
              precision    recall  f1-score   support

           0     0.9586    0.9227    0.9403      7346
           1     0.2873    0.4387    0.3472       522

    accuracy                         0.8906      7868
   macro avg     0.6229    0.6807    0.6438      7868
weighted avg     0.9140    0.8906    0.9009      7868

F1 (macro): 0.6438

