<a href="https://colab.research.google.com/github/nullvoid-ky/introduction-to-machine-learning-and-deep-learning/blob/main/OOP_ML_Model_Controller_for_Bankruptcy_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# OOP ML Model Controller for Bankruptcy Prediction

เวอร์ชันนี้ **ออกแบบให้รันต่อจากตัวแปร `df` ที่คุณโหลดจากไฟล์ CSV เอง**

➡️ ขั้นตอนการใช้งานสั้นๆ:
1. รันเซลล์ **Load your DataFrame (`df`)** แล้วแก้ไข path ให้ถูก (หรือวางโค้ดโหลดของคุณเอง)
2. รันเซลล์ **Feature selection (X, y)** เพื่อเลือกคอลัมน์ `X8, X17, X3, X11, X15, X1, X6` เป็น X และ `status_label` เป็น y
3. รันเซลล์ **Model Classes** (OOP) และ **Controller & Run**
4. ดูผลใน **Benchmark Table**, **ROC Curves**, และ **MLP Loss Curve**

In [1]:
# ===== Setup & Installs (Colab/Kaggle usually ship most of these; safe to re-run) =====
!pip -q install kagglehub shap lightgbm xgboost

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from typing import List

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.inspection import permutation_importance

import shap
import warnings
warnings.filterwarnings('ignore')


In [2]:
import kagglehub

# Download the latest version of the bankruptcy dataset through kagglehub.
# `path` holds the value that is printed below as the dataset's file location.
path = kagglehub.dataset_download("utkarshx27/american-companies-bankruptcy-prediction-dataset")

# Show where the files ended up so the CSV location can be verified.
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/utkarshx27/american-companies-bankruptcy-prediction-dataset?dataset_version_number=3...


100%|██████████| 4.47M/4.47M [00:00<00:00, 45.4MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/utkarshx27/american-companies-bankruptcy-prediction-dataset/versions/3


In [3]:
import os

from kagglehub import KaggleDatasetAdapter, load_dataset

# ------------------------------------------------------------
# Locate the CSV file inside the downloaded dataset directory.
# The directory is taken from `path` (assigned by kagglehub.dataset_download
# in the download cell), so a new dataset version or a different cache
# location keeps working. If that cell was skipped, fall back to the
# previously hard-coded cache directory. Adjust CSV_NAME if the file inside
# the dataset has a different name.
# ------------------------------------------------------------
DEFAULT_DATASET_DIR = "/root/.cache/kagglehub/datasets/utkarshx27/american-companies-bankruptcy-prediction-dataset/versions/3"
CSV_NAME = "american_bankruptcy.csv"

dataset_dir = globals().get("path") or DEFAULT_DATASET_DIR
file_path = os.path.join(str(dataset_dir), CSV_NAME)

df = pd.read_csv(file_path)

print("Loaded shape:", df.shape)
print("Columns:\n", list(df.columns))
df.head()

Loaded shape: (78682, 21)
Columns:
 ['company_name', 'status_label', 'year', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18']


Unnamed: 0,company_name,status_label,year,X1,X2,X3,X4,X5,X6,X7,...,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18
0,C_1,alive,1999,511.267,833.107,18.373,89.031,336.018,35.163,128.348,...,1024.333,740.998,180.447,70.658,191.226,163.816,201.026,1024.333,401.483,935.302
1,C_1,alive,2000,485.856,713.811,18.577,64.367,320.59,18.531,115.187,...,874.255,701.854,179.987,45.79,160.444,125.392,204.065,874.255,361.642,809.888
2,C_1,alive,2001,436.656,526.477,22.496,27.207,286.588,-58.939,77.528,...,638.721,710.199,217.699,4.711,112.244,150.464,139.603,638.721,399.964,611.514
3,C_1,alive,2002,396.412,496.747,27.172,30.745,259.954,-12.41,66.322,...,606.337,686.621,164.658,3.573,109.59,203.575,124.106,606.337,391.633,575.592
4,C_1,alive,2003,432.204,523.302,26.68,47.491,247.245,3.504,104.661,...,651.958,709.292,248.666,20.811,128.656,131.261,131.884,651.958,407.608,604.467


In [4]:
# ==============================
# Load your DataFrame (df)
# ==============================
# You can replace the code below with your own DataFrame-loading code.
# Example:
# import pandas as pd
# df = pd.read_csv("/path/to/your.csv")
#
# Note: if you already created `df` earlier, you can skip this cell.

if "df" in globals():
    # A previously loaded `df` is reused untouched.
    print("✅ Found existing `df` in memory.")
else:
    import pandas as pd
    print("ℹ️ No existing `df` found. Creating a tiny placeholder DataFrame just for sanity checks.")
    # Four-row stand-in carrying exactly the columns the later cells select.
    df = pd.DataFrame(
        {
            "X8": [0.1, 0.2, 0.3, 0.4],
            "X17": [1, 2, 3, 4],
            "X3": [5, 6, 7, 8],
            "X11": [0, 1, 0, 1],
            "X15": [10, 11, 12, 13],
            "X1": [2, 3, 4, 5],
            "X6": [9, 8, 7, 6],
            "status_label": [0, 1, 0, 1],
        }
    )
    print("➡️ Replace this with your real CSV load.")
print("df shape:", df.shape)


✅ Found existing `df` in memory.
df shape: (78682, 21)


In [5]:
# ==============================
# Feature selection (X, y)
# ==============================
import numpy as np
import pandas as pd

FEATURES = ["X8","X17","X3","X11","X15","X1","X6"]
TARGET   = "status_label"

# Fail early, with the full list, if any required column is absent.
missing = [c for c in FEATURES + [TARGET] if c not in df.columns]
if missing:
    raise ValueError(f"❌ Missing columns in df: {missing}. Please ensure your CSV contains them.")

X = df[FEATURES].copy()
y = df[TARGET].copy()

# Convert a string target (e.g. 'yes'/'no', 'bankrupt'/'normal',
# 'failed'/'alive') to integers. Known label pairs are mapped explicitly so
# the positive class (1) never depends on which label happens to appear first
# in the data.
if pd.api.types.is_object_dtype(y) or pd.api.types.is_string_dtype(y):
    y_clean = y.astype(str).str.strip().str.lower()
    unique_vals = sorted(y_clean.unique())
    mapping_candidates = [
        {"yes": 1, "no": 0},
        {"true": 1, "false": 0},
        {"bankrupt": 1, "normal": 0},
        {"failed": 1, "alive": 0},
        {"1": 1, "0": 0},
    ]
    # First candidate whose keys cover every observed label, if any.
    used_map = next((m for m in mapping_candidates if set(unique_vals) <= set(m)), None)
    if used_map is not None:
        y = y_clean.map(used_map).astype(int)
        print(f"🔁 Auto-mapped target using: {used_map}")
    else:
        # Fallback: integer codes in sorted label order (deterministic, unlike
        # order-of-appearance codes). Kept as a Series so `y` has the same type
        # and row index on every path.
        codes, _ = pd.factorize(y_clean, sort=True)
        y = pd.Series(codes, index=y_clean.index, name=TARGET)
        print("🔁 Target factorized (0..K-1).")
else:
    # Ensure integer labels where possible; report (rather than hide) a failure.
    try:
        y = y.astype(int)
    except (TypeError, ValueError) as exc:
        print(f"⚠️ Could not cast target to int ({exc}); keeping dtype {y.dtype}.")

print("✅ Feature/Target ready.")
print("X shape:", X.shape, "| y shape:", y.shape, "| y value counts:", pd.Series(y).value_counts().to_dict())


🔁 Target factorized (0..K-1).
✅ Feature/Target ready.
X shape: (78682, 7) | y shape: (78682,) | y value counts: {0: np.int64(73462), 1: np.int64(5220)}


In [6]:
# ==============================
# Imports for modeling
# ==============================
import matplotlib.pyplot as plt
from abc import ABC, abstractmethod
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Classification
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Unsupervised & Dim. Reduction
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA

# Metrics
from sklearn.metrics import (
    accuracy_score, confusion_matrix, roc_auc_score, f1_score,
    precision_score, recall_score, roc_curve, auc
)

plt.style.use("ggplot")
RANDOM_STATE = 42


In [7]:
  # ==============================
  # Base & Subclass Models (OOP)
  # ==============================
  import numpy as np
  import pandas as pd

  class BaseModel(ABC):
      """Shared split / fit / evaluate / plot scaffolding for the model wrappers.

      Subclasses implement ``build_model`` to assign an estimator to
      ``self.model``; everything else is provided here.
      """

      def __init__(self, X: pd.DataFrame, y: pd.Series, model_params: dict = None, random_state: int = RANDOM_STATE):
          """Store the data and estimator settings; nothing is trained here.

          Args:
              X: Feature table.
              y: Target labels aligned row-by-row with ``X``.
              model_params: Extra keyword arguments for the estimator.
              random_state: Seed used for the split and passed to estimators.
          """
          self.X = X
          self.y = y
          self.random_state = random_state
          self.model = None
          # Copy the dict so a params dict handed to several wrappers is never
          # shared (and therefore never mutated) between them.
          self.model_params = dict(model_params) if model_params is not None else {}
          self.X_train = None
          self.X_test  = None
          self.y_train = None
          self.y_test  = None
          self.scaler = StandardScaler()
          self.history = None
          self.is_fitted = False

      def split_data(self, test_size: float = 0.2, scale: bool = True):
          """Create a stratified train/test split, optionally standardised.

          Args:
              test_size: Fraction of rows placed in the test split.
              scale: When True, fit ``self.scaler`` on the training rows only
                  and apply it to both splits.
          """
          X_train_raw, X_test_raw, self.y_train, self.y_test = train_test_split(
              self.X, self.y, test_size=test_size, random_state=self.random_state, stratify=self.y
          )
          if scale:
              # Keep the original row index so the scaled frames still line up
              # with y_train / y_test when the target is a pandas Series.
              self.X_train = pd.DataFrame(
                  self.scaler.fit_transform(X_train_raw),
                  columns=X_train_raw.columns,
                  index=X_train_raw.index,
              )
              self.X_test = pd.DataFrame(
                  self.scaler.transform(X_test_raw),
                  columns=X_test_raw.columns,
                  index=X_test_raw.index,
              )
          else:
              self.X_train = X_train_raw
              self.X_test  = X_test_raw

      @abstractmethod
      def build_model(self):
          """Assign a fresh, unfitted estimator to ``self.model``."""
          pass

      def fit(self):
          """Build the estimator and train it on the training split.

          Raises:
              ValueError: If no training data has been set yet.
          """
          if self.X_train is None:
              raise ValueError("Data not split. Call .split_data() first.")
          self.build_model()
          self.model.fit(self.X_train, self.y_train)
          self.is_fitted = True
          # Estimators exposing loss_curve_ (e.g. MLPClassifier) get it recorded.
          if hasattr(self.model, "loss_curve_"):
              self.history = {"loss_curve": self.model.loss_curve_}
          print(f"✅ {self.__class__.__name__} trained.")

      def evaluate(self):
          """Score the fitted model on the test split.

          Returns:
              dict: Classification metrics for a binary target, accuracy only
              for other targets, a ``"Note"`` entry for clustering models, or
              an ``"Error"`` entry when evaluation is not possible.
          """
          # Clustering models are skipped: classification metrics do not apply.
          if isinstance(self.model, (KMeans, AgglomerativeClustering)):
              return {"Note": "Unsupervised Model: Use specific clustering metrics."}

          if self.y_test is None or not self.is_fitted:
              return {"Error": "Model not fitted or Test data not available."}

          y_pred = self.model.predict(self.X_test)

          if len(np.unique(self.y_test)) != 2:
              # Multi-class (kept for possible future use).
              return {"Accuracy": accuracy_score(self.y_test, y_pred)}

          try:
              # Computed once and reused for both the rates and the report.
              cm = confusion_matrix(self.y_test, y_pred)
              tn, fp, fn, tp = cm.ravel()
              sensitivity = tp / (tp + fn) if (tp + fn) else 0.0
              specificity = tn / (tn + fp) if (tn + fp) else 0.0
              results = {
                  "Loss (MLP only)": getattr(self.model, "loss_", "N/A"),
                  "Confusion Matrix": cm.tolist(),
                  "Accuracy": accuracy_score(self.y_test, y_pred),
                  "Precision": precision_score(self.y_test, y_pred, zero_division=0),
                  "Sensitivity (Recall)": sensitivity,
                  "Specificity (TNR)": specificity,
                  "F1-score": f1_score(self.y_test, y_pred, zero_division=0),
                  "R-Square (N/A)": "N/A",
              }
              if hasattr(self.model, "predict_proba"):
                  y_proba = self.model.predict_proba(self.X_test)[:, 1]
                  results["ROC AUC"] = roc_auc_score(self.y_test, y_proba)
              else:
                  results["ROC AUC"] = "N/A"
          except Exception as e:
              # Best-effort boundary: one failing model must not abort a run.
              return {"Error": f"Evaluation failed: {e}"}
          return results

      def plot_roc(self, ax=None, label_prefix=""):
          """Draw this model's ROC curve (plus the chance diagonal) on ``ax``.

          Does nothing when the model has no ``predict_proba`` or there is no
          test target. A new figure is created when ``ax`` is None.
          """
          if not hasattr(self.model, "predict_proba") or self.y_test is None:
              return
          y_proba = self.model.predict_proba(self.X_test)[:, 1]
          fpr, tpr, _ = roc_curve(self.y_test, y_proba)
          roc_auc = auc(fpr, tpr)
          if ax is None:
              fig, ax = plt.subplots(1, 1)
          ax.plot(fpr, tpr, label=f"{label_prefix} (AUC = {roc_auc:.4f})")
          ax.plot([0, 1], [0, 1], "r--")

      def plot_performance_curve(self, ax=None, label_prefix=""):
          """Plot the estimator's ``loss_curve_`` when it has one."""
          if hasattr(self.model, "loss_curve_") and self.model.loss_curve_ is not None:
              if ax is None:
                  fig, ax = plt.subplots(1, 1)
              ax.plot(self.model.loss_curve_, label=f"{label_prefix} Loss")
              ax.set_title("Model Loss Curve")
              ax.set_xlabel("Epochs")
              ax.set_ylabel("Loss")
              ax.legend()


  class LogisticRegressionModel(BaseModel):
      """Wrapper whose estimator is a scikit-learn ``LogisticRegression``."""

      def build_model(self):
          """Create the estimator from ``random_state`` and ``model_params``."""
          extra_kwargs = self.model_params
          self.model = LogisticRegression(random_state=self.random_state, **extra_kwargs)

  class DecisionTreeModel(BaseModel):
      """Wrapper whose estimator is a scikit-learn ``DecisionTreeClassifier``."""

      def build_model(self):
          """Create the estimator from ``random_state`` and ``model_params``."""
          extra_kwargs = self.model_params
          self.model = DecisionTreeClassifier(random_state=self.random_state, **extra_kwargs)

  class RandomForestModel(BaseModel):
      """Wrapper whose estimator is a scikit-learn ``RandomForestClassifier``."""

      def build_model(self):
          """Create the estimator from ``random_state`` and ``model_params``."""
          extra_kwargs = self.model_params
          self.model = RandomForestClassifier(random_state=self.random_state, **extra_kwargs)

  class NaiveBayesModel(BaseModel):
      """Wrapper whose estimator is a scikit-learn ``GaussianNB``."""

      def build_model(self):
          """Create the estimator from ``model_params`` (no seed is passed)."""
          extra_kwargs = self.model_params
          self.model = GaussianNB(**extra_kwargs)

  class SupportVectorMachineModel(BaseModel):
      """Wrapper whose estimator is a scikit-learn ``SVC``."""

      def build_model(self):
          """Create the SVC with ``probability=True`` plus ``model_params``."""
          extra_kwargs = self.model_params
          # probability=True is always requested; evaluate() and plot_roc()
          # look for predict_proba on the model.
          self.model = SVC(random_state=self.random_state, probability=True, **extra_kwargs)

  class PerceptronModel(BaseModel):
      """Wrapper whose estimator is a scikit-learn ``Perceptron``."""

      def build_model(self):
          """Create the estimator from ``random_state`` and ``model_params``."""
          extra_kwargs = self.model_params
          self.model = Perceptron(random_state=self.random_state, **extra_kwargs)

  class MLPModel(BaseModel):
      """Wrapper whose estimator is a scikit-learn ``MLPClassifier``."""

      def build_model(self):
          """Create the estimator from ``random_state`` and ``model_params``."""
          extra_kwargs = self.model_params
          self.model = MLPClassifier(random_state=self.random_state, **extra_kwargs)

  class KMeanClustering(BaseModel):
      """Wrapper whose estimator is a scikit-learn ``KMeans`` (unsupervised)."""

      def build_model(self):
          """Create the KMeans estimator without altering ``self.model_params``.

          ``n_clusters`` defaults to 2 and ``n_init`` to 10 unless supplied in
          ``model_params``.
          """
          # Work on a copy: popping from self.model_params itself would drop
          # n_clusters after the first build, so a second fit() would silently
          # fall back to the default.
          params = dict(self.model_params)
          n_clusters = params.pop("n_clusters", 2)
          # Explicit n_init for broader sklearn compatibility; callers may override.
          params.setdefault("n_init", 10)
          self.model = KMeans(n_clusters=n_clusters, random_state=self.random_state, **params)

      def fit(self):
          """Fit on the training features only (labels are not used).

          Raises:
              ValueError: If no training data has been set yet.
          """
          if self.X_train is None:
              raise ValueError("Data not split.")
          self.build_model()
          self.model.fit(self.X_train)
          self.is_fitted = True
          print(f"✅ {self.__class__.__name__} trained.")

  class AgglomerativeClusteringModel(BaseModel):
      """Wrapper whose estimator is a scikit-learn ``AgglomerativeClustering``."""

      def build_model(self):
          """Create the estimator without altering ``self.model_params``.

          ``n_clusters`` defaults to 2 unless supplied in ``model_params``.
          """
          # Work on a copy: popping from self.model_params itself would drop
          # n_clusters after the first build.
          params = dict(self.model_params)
          n_clusters = params.pop("n_clusters", 2)
          self.model = AgglomerativeClustering(n_clusters=n_clusters, **params)

      def fit(self):
          """Fit on the training features only (labels are not used).

          Raises:
              ValueError: If no training data has been set yet.
          """
          if self.X_train is None:
              raise ValueError("Data not split.")
          self.build_model()
          # NOTE(review): the captured notebook output stops at this model
          # without a "trained" line; hierarchical clustering on the full
          # training split may be too expensive here — consider fitting on a
          # subsample. TODO confirm.
          self.model.fit(self.X_train)
          self.is_fitted = True
          print(f"✅ {self.__class__.__name__} trained.")

  class ReducedClassifierModel(BaseModel):
      """PCA followed by a configurable classifier, combined in a Pipeline."""

      def __init__(self, X, y, classifier_class, n_components, model_params=None, random_state=RANDOM_STATE):
          """Remember which classifier to build and how many components to keep.

          Args:
              X: Feature table.
              y: Target labels.
              classifier_class: Estimator class instantiated after the PCA step.
              n_components: Number of principal components kept by PCA.
              model_params: Extra keyword arguments for the classifier.
              random_state: Seed passed to both PCA and the classifier.
          """
          super().__init__(X, y, model_params, random_state)
          self.n_components = n_components
          self.classifier_class = classifier_class

      def build_model(self):
          """Assemble the ``pca`` -> ``classifier`` pipeline into ``self.model``."""
          reducer = PCA(n_components=self.n_components, random_state=self.random_state)
          # SVC is always built with probability=True; other classifiers get
          # only the seed and the user-supplied parameters.
          forced_kwargs = {"probability": True} if self.classifier_class == SVC else {}
          estimator = self.classifier_class(
              random_state=self.random_state, **forced_kwargs, **self.model_params
          )
          self.model = Pipeline(steps=[("pca", reducer), ("classifier", estimator)])


In [8]:
# ==============================
# Controller
# ==============================
class ModelController:
    """Build, train, evaluate and compare every model wrapper on one shared split."""

    def __init__(self, X: pd.DataFrame, y: pd.Series, test_size: float = 0.2, random_state: int = RANDOM_STATE):
        """Store the data and settings and instantiate all model wrappers.

        Args:
            X: Feature table.
            y: Target labels.
            test_size: Fraction of rows held out for testing.
            random_state: Seed forwarded to every model wrapper.
        """
        self.X = X
        self.y = y
        self.test_size = test_size
        self.random_state = random_state
        self.models = self._initialize_models()
        self.results = {}

    def _initialize_models(self):
        """Return the name -> wrapper mapping of all models to benchmark."""
        svm_params = {"C": 10, "gamma": "auto"}
        mlp_params = {"hidden_layer_sizes": (50,), "max_iter": 500}
        # Forward the controller's seed; otherwise every wrapper would fall
        # back to the module default and ignore the value given here.
        rs = self.random_state
        return {
            "Logistic Regression": LogisticRegressionModel(self.X, self.y, random_state=rs),
            "Decision Tree": DecisionTreeModel(self.X, self.y, random_state=rs),
            "Random Forest": RandomForestModel(self.X, self.y, model_params={"n_estimators": 150}, random_state=rs),
            "Naïve Bayesian": NaiveBayesModel(self.X, self.y, random_state=rs),
            # Each SVM wrapper gets its own copy of the shared parameter dict.
            "Support Vector Machine": SupportVectorMachineModel(self.X, self.y, model_params=dict(svm_params), random_state=rs),
            "Perceptron (SLP)": PerceptronModel(self.X, self.y, random_state=rs),
            "Multi-layer Perceptron (MLP)": MLPModel(self.X, self.y, model_params=mlp_params, random_state=rs),
            "PCA(5) + RF": ReducedClassifierModel(self.X, self.y, RandomForestClassifier, n_components=5, random_state=rs),
            "PCA(5) + SVM": ReducedClassifierModel(self.X, self.y, SVC, n_components=5, model_params=dict(svm_params), random_state=rs),
            "K-Means Clustering (k=2)": KMeanClustering(self.X, self.y, model_params={"n_clusters": 2}, random_state=rs),
            "Agglomerative Clustering (k=2)": AgglomerativeClusteringModel(self.X, self.y, model_params={"n_clusters": 2}, random_state=rs),
        }

    def run_all(self):
        """Train and evaluate every model on the same scaled split.

        Returns:
            dict: Model name -> metrics dict. A model that raises is recorded
            as ``{"Error": message}`` and the run continues.
        """
        print("="*60)
        print("🚀 Starting Model Training and Evaluation for all models...")
        print("="*60)

        # Split once with the first model and share that split with the rest,
        # so all models are compared on identical rows.
        first = next(iter(self.models.values()))
        first.split_data(test_size=self.test_size, scale=True)
        print(f"Data split: Train={first.X_train.shape}, Test={first.X_test.shape}")

        for name, m in self.models.items():
            print(f"\n--- Running {name} ---")
            try:
                m.X_train, m.X_test = first.X_train, first.X_test
                m.y_train, m.y_test = first.y_train, first.y_test
                # Share the fitted scaler too, so each wrapper holds the
                # transform that produced its training data.
                m.scaler = first.scaler
                m.fit()
                self.results[name] = m.evaluate()
            except Exception as e:
                print(f"❌ Error running {name}: {e}")
                self.results[name] = {"Error": str(e)}

        print("\n" + "="*60)
        print("✨ All models finished running.")
        return self.results

    def get_results(self, model_name: str = None):
        """Return one model's results, or all results when no name is given."""
        if model_name:
            return self.results.get(model_name, "Model not found or not run.")
        return self.results

    def show_benchmarking(self):
        """Print the classification metrics table, best accuracy first."""
        classification = {k: v for k, v in self.results.items() if isinstance(v, dict) and ('Accuracy' in v or 'ROC AUC' in v)}
        if not classification:
            print("\n⚠️ No classification results to show benchmarking.")
            return
        df_results = pd.DataFrame.from_dict(classification, orient="index")
        metrics_order = ["Accuracy", "ROC AUC", "F1-score", "Precision", "Sensitivity (Recall)", "Specificity (TNR)"]
        cols = [m for m in metrics_order if m in df_results.columns]
        table = df_results[cols]
        # Sort only when the column exists, so a result set lacking
        # "Accuracy" does not raise KeyError.
        if "Accuracy" in table.columns:
            table = table.sort_values(by="Accuracy", ascending=False)

        print("\n" + "="*70)
        print("📊 QUANTITATIVE BENCHMARKING RESULTS (Classification Models)")
        print("="*70)
        # use to_string to avoid truncation
        print(table.to_string(float_format="{:.4f}".format))
        print("\n* ROC AUC may be 'N/A' for models without predict_proba (e.g., Perceptron).")

    def plot_all_roc(self):
        """Overlay the ROC curves of every model that produced a ROC AUC."""
        fig = plt.figure(figsize=(12, 8))
        ax = plt.gca()
        plotted = False
        for name, m in self.models.items():
            res = self.results.get(name, {})
            # Compare against the wrapper classes: entries of self.models are
            # wrappers, not raw sklearn estimators.
            if isinstance(m, (KMeanClustering, AgglomerativeClusteringModel)):
                continue
            score = res.get("ROC AUC")
            if score is None or isinstance(score, str):
                continue
            try:
                m.plot_roc(ax=ax, label_prefix=name)
                plotted = True
            except Exception as e:
                # Keep going, but say which curve is missing and why.
                print(f"⚠️ Could not plot ROC for {name}: {e}")
        if plotted:
            ax.set_title("Receiver Operating Characteristic (ROC) Curve Comparison", fontsize=16)
            ax.set_xlabel("False Positive Rate (1 - Specificity)")
            ax.set_ylabel("True Positive Rate (Sensitivity)")
            ax.legend(loc="lower right")
            plt.show()
        else:
            # Do not leave an empty figure open.
            plt.close(fig)
            print("\n⚠️ No ROC Curves available for comparison.")

    def plot_loss_curves(self):
        """Plot the training loss curve of every fitted MLP model."""
        fig = plt.figure(figsize=(8, 6))
        ax = plt.gca()
        found = False
        for name, m in self.models.items():
            if isinstance(m, MLPModel) and hasattr(m.model, "loss_curve_") and m.model.loss_curve_ is not None:
                m.plot_performance_curve(ax=ax, label_prefix=name)
                found = True
        if found:
            ax.set_title("Performance Curve: MLP Loss vs. Epochs", fontsize=16)
            plt.show()
        else:
            # Do not leave an empty figure open.
            plt.close(fig)
            print("\n⚠️ No performance (loss) curves found for plotting.")


In [None]:
# ==============================
# Run Controller
# ==============================
# Build the controller on the selected features/target with a 20% test split.
controller = ModelController(X=X, y=y, test_size=0.2, random_state=RANDOM_STATE)
# run_all() trains and evaluates every model and returns the results dict
# (the same dict is kept on controller.results).
all_results = controller.run_all()

print("\n✅ Finished. You can inspect `all_results` dict or use helpers below.")


🚀 Starting Model Training and Evaluation for all models...
Data split: Train=(62945, 7), Test=(15737, 7)

--- Running Logistic Regression ---
✅ LogisticRegressionModel trained.

--- Running Decision Tree ---
✅ DecisionTreeModel trained.

--- Running Random Forest ---
✅ RandomForestModel trained.

--- Running Naïve Bayesian ---
✅ NaiveBayesModel trained.

--- Running Support Vector Machine ---
✅ SupportVectorMachineModel trained.

--- Running Perceptron (SLP) ---
✅ PerceptronModel trained.

--- Running Multi-layer Perceptron (MLP) ---
✅ MLPModel trained.

--- Running PCA(5) + RF ---
✅ ReducedClassifierModel trained.

--- Running PCA(5) + SVM ---
✅ ReducedClassifierModel trained.

--- Running K-Means Clustering (k=2) ---
✅ KMeanClustering trained.

--- Running Agglomerative Clustering (k=2) ---


In [None]:
# ==============================
# Benchmarking Table
# ==============================
# Prints the classification metrics table, sorted by accuracy (descending).
controller.show_benchmarking()


In [None]:
# ==============================
# ROC Curves (if available)
# ==============================
# Overlays ROC curves for the models whose results contain a ROC AUC value.
controller.plot_all_roc()


In [None]:
# ==============================
# MLP Loss Curve (if available)
# ==============================
# Plots loss vs. epochs for MLP models that expose a loss curve after fitting.
controller.plot_loss_curves()
