<a href="https://colab.research.google.com/github/muajnstu/Bank-Marketing-using-rough-set-approach/blob/main/Implementation_Pipeline_using_AL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Run this only once in your environment (e.g., terminal or notebook cell)
!pip install -q imbalanced-learn catboost xgboost lightgbm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.1/97.1 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, matthews_corrcoef, classification_report, roc_curve, auc
)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Lasso, ElasticNet, Perceptron
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    StackingClassifier, VotingClassifier,
    RandomForestClassifier, GradientBoostingClassifier,
    ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier
)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

import warnings
warnings.filterwarnings("ignore", message="'force_all_finite' was renamed to 'ensure_all_finite'")
warnings.filterwarnings("ignore", message="Parameters: { \"use_label_encoder\" } are not used.")
warnings.filterwarnings("ignore", message=".*does not have valid feature names.*")


In [4]:

# --- Base estimators reused across ensembles ---
base_estimators = [
    ("lr",  LogisticRegression(max_iter=1000)),
    ("rf",  RandomForestClassifier(random_state=42)),
    ("gb",  GradientBoostingClassifier(random_state=42)),
    ("xgb", XGBClassifier(eval_metric='logloss', random_state=42, use_label_encoder=False)),
    ("lgbm",LGBMClassifier(verbosity=-1)),
    ("svm", SVC(probability=True)),
    ("et",  ExtraTreesClassifier(random_state=42)),
    ("knn", KNeighborsClassifier()),
]

classifiers = {
    # ── Original classifiers ──────────────────────────────────────────────────
    "Logistic Regression":   LogisticRegression(max_iter=1000),
    "SVM":                   SVC(),
    "Naive Bayes":           GaussianNB(),
    "K-Nearest Neighbors":   KNeighborsClassifier(),
    "Decision Tree":         DecisionTreeClassifier(),
    "Random Forest":         RandomForestClassifier(random_state=42),
    "Gradient Boosting":     GradientBoostingClassifier(random_state=42),
    "XGBoost":               XGBClassifier(eval_metric='logloss', random_state=42, enable_categorical=True),
    "LightGBM":              LGBMClassifier(verbosity=-1),
    "CatBoost":              CatBoostClassifier(verbose=0, random_state=42),
    "AdaBoost":              AdaBoostClassifier(random_state=42),
    "Extra Trees":           ExtraTreesClassifier(random_state=42),
    "Ridge Classifier":      RidgeClassifier(),
    "Perceptron":            Perceptron(),
    "Bagging Classifier":    BaggingClassifier(random_state=42),
    "GaussianNB":            GaussianNB(),

    # ── Stacking ──────────────────────────────────────────────────────────────
    "Stacking": StackingClassifier(
        estimators=base_estimators,
        final_estimator=LogisticRegression(max_iter=1000),
        cv=5,
        passthrough=False,
        n_jobs=-1
    ),

    # ── Hard Voting ───────────────────────────────────────────────────────────
    "Hard Voting": VotingClassifier(
        estimators=base_estimators,
        voting='hard',
        n_jobs=-1
    ),

    # ── Soft Voting ───────────────────────────────────────────────────────────
    "Soft Voting": VotingClassifier(
        estimators=base_estimators,
        voting='soft',
        n_jobs=-1
    ),

    # ── Weighted Soft Voting ──────────────────────────────────────────────────
    "Weighted Voting": VotingClassifier(
        estimators=base_estimators,
        voting='soft',
        weights=[
            3,   # lr   – reliable baseline
            4,   # rf   – strong ensemble
            4,   # gb   – strong ensemble
            5,   # xgb  – typically best single model
            5,   # lgbm – typically best single model
            2,   # svm  – good but slower to calibrate
            3,   # et   – good diversity
            2,   # knn  – useful diversity, lower weight
        ],
        n_jobs=-1
    ),

    # ── Adding / Averaging (soft alias) ──────────────────────────────────────
    "Adding (Avg Probabilities)": VotingClassifier(
        estimators=base_estimators,
        voting='soft',
        weights=[1, 1, 1, 1, 1, 1, 1, 1],
        n_jobs=-1
    ),
}

# **Step 2: Load Dataset, Remove Outliers, Drop Features, and Scale Data**

In [5]:
# Load dataset
data = pd.read_csv('https://raw.githubusercontent.com/muajnstu/Bank-Marketing-using-rough-set-approach/refs/heads/main/MicroClustered_bank_data.csv')


# Separate features and target
X = data.drop('Personal_Loan', axis=1)
y = data['Personal_Loan']

# **Step 3: Handle Class Imbalance with Multiple Sampling Techniques**

In [6]:
# Random Undersampling
rus = RandomUnderSampler(random_state=42)
ros = RandomOverSampler(random_state=42)
smote = SMOTE(random_state=42)
# adasyn = ADASYN(random_state=42) # Removed ADASYN due to error

X_rus, y_rus = rus.fit_resample(X, y)
X_ros, y_ros = ros.fit_resample(X, y)
X_smote, y_smote = smote.fit_resample(X, y)
# X_adasyn, y_adasyn = adasyn.fit_resample(X_scaled, y) # Removed ADASYN due to error


# Print class distributions
print(f"Original class distribution: {np.bincount(y.astype(int))}")
print(f"After Random Undersampling: {np.bincount(y_rus.astype(int))}")
print(f"After Random Oversampling: {np.bincount(y_ros.astype(int))}")
print(f"After SMOTE: {np.bincount(y_smote.astype(int))}")
# print(f"After ADASYN: {np.bincount(y_adasyn.astype(int))}") # Removed ADASYN due to error

Original class distribution: [333 375 160 265 234 390 248 264 325 251 262 159 345 207 297 405 232 248]
After Random Undersampling: [159 159 159 159 159 159 159 159 159 159 159 159 159 159 159 159 159 159]
After Random Oversampling: [405 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405]
After SMOTE: [405 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405]


# **Step 5: Run Active Learning on Different sampled Dataset for All Classifiers**

In [7]:
def remove_instance(X, y, idx):
    X_new = np.delete(X, idx, axis=0)
    y_new = np.delete(y, idx, axis=0)
    return X_new, y_new


def g_mean(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    recalls = []
    for i in range(cm.shape[0]):
        tp = cm[i, i]
        fn = np.sum(cm[i, :]) - tp
        recall_i = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        recalls.append(recall_i)

    valid_recalls = [r for r in recalls if r > 0]
    if not valid_recalls:
        return 0.0

    product = 1.0
    for r in valid_recalls:
        product *= r
    return product ** (1.0 / len(valid_recalls))


# Define result dictionaries
results_active_learning         = {}
results_active_learning_rus     = {}
results_active_learning_ros     = {}
results_active_learning_smote   = {}

# Dictionary mapping sampling method names to data and result storage
sampling_methods = {
    "Original"             : (X, y, results_active_learning),
    "Random Undersampling" : (X_rus, y_rus, results_active_learning_rus),
    "Random Oversampling"  : (X_ros, y_ros, results_active_learning_ros),
    "SMOTE"                : (X_smote, y_smote, results_active_learning_smote)
}

# Loop through each sampling method
for method_name, (X_data, y_data, result_dict) in sampling_methods.items():

    # y is already 0/1 integers — no LabelEncoder needed
    y_data = np.array(y_data, dtype=int)
    unique_classes = np.unique(y_data)

    for name, clf in classifiers.items():
        print(f"\n=== Active Learning: {name} | {method_name} ===")

        X_train_full, X_test, y_train_full, y_test = train_test_split(
            X_data, y_data,
            test_size=0.20,
            random_state=42,
            stratify=y_data        # ensures class balance in split
        )

        np.random.seed(42)

        # --- Seed: one sample per class to guarantee all classes from iteration 1 ---
        seed_idx = []
        for c in unique_classes:
            class_indices = np.where(y_train_full == c)[0]
            seed_idx.append(class_indices[0])
        seed_idx = np.array(seed_idx)

        # --- Fill remaining slots up to initial pool size of 60 ---
        remaining_pool = np.array([i for i in range(len(X_train_full)) if i not in seed_idx])
        n_extra = max(0, 500 - len(seed_idx))
        extra_idx = np.random.choice(remaining_pool, size=n_extra, replace=False)
        labeled_idx = np.concatenate([seed_idx, extra_idx])

        # --- Build labeled and unlabeled sets ---
        if hasattr(X_train_full, 'iloc'):
            X_labeled   = X_train_full.iloc[labeled_idx].values
            X_unlabeled = X_train_full.iloc[[i for i in range(len(X_train_full)) if i not in labeled_idx]].values
        else:
            X_labeled   = X_train_full[labeled_idx]
            X_unlabeled = X_train_full[[i for i in range(len(X_train_full)) if i not in labeled_idx]]

        y_labeled   = y_train_full[labeled_idx]
        y_unlabeled = y_train_full[np.array([i for i in range(len(X_train_full)) if i not in labeled_idx])]

        if hasattr(X_test, 'values'):
            X_test = X_test.values

        history = []

        # --- Active Learning Loop ---
        for i in range(10):
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")

                clf.fit(X_labeled, y_labeled)

                # Uncertainty sampling
                if hasattr(clf, "predict_proba"):
                    probs       = clf.predict_proba(X_unlabeled)
                    uncertainty = 1 - np.max(probs, axis=1)
                else:
                    uncertainty = np.random.rand(len(X_unlabeled))

                # Query most uncertain sample
                query_idx = np.argmax(uncertainty)

                # Move queried sample from unlabeled → labeled
                X_labeled   = np.vstack([X_labeled, X_unlabeled[query_idx].reshape(1, -1)])
                y_labeled   = np.append(y_labeled, y_unlabeled[query_idx])
                X_unlabeled, y_unlabeled = remove_instance(X_unlabeled, y_unlabeled, query_idx)

                # Evaluate
                y_pred      = clf.predict(X_test)
                acc         = accuracy_score(y_test, y_pred)
                prec        = precision_score(y_test, y_pred, average='weighted', zero_division=0)
                rec         = recall_score(y_test, y_pred, average='weighted', zero_division=0)
                f1          = f1_score(y_test, y_pred, average='weighted', zero_division=0)
                gmean_score = g_mean(y_test, y_pred)
                mcc         = matthews_corrcoef(y_test, y_pred)

                history.append({
                    'iteration'      : i + 1,
                    'accuracy'       : acc,
                    'precision'      : prec,
                    'recall'         : rec,
                    'f1_score'       : f1,
                    'gmean'          : gmean_score,
                    'mcc'            : mcc,
                    'labeled_samples': len(X_labeled)
                })

                print(f"  [{name}] Iter {i+1:>2} | "
                      f"Labeled={len(X_labeled):>3} | "
                      f"Acc={acc:.4f} | F1={f1:.4f} | "
                      f"G-Mean={gmean_score:.4f} | MCC={mcc:.4f}")

        result_dict[name] = history


=== Active Learning: Logistic Regression | Original ===
  [Logistic Regression] Iter  1 | Labeled=501 | Acc=0.8540 | F1=0.8502 | G-Mean=0.8092 | MCC=0.8451
  [Logistic Regression] Iter  2 | Labeled=502 | Acc=0.8510 | F1=0.8472 | G-Mean=0.8061 | MCC=0.8419
  [Logistic Regression] Iter  3 | Labeled=503 | Acc=0.8520 | F1=0.8481 | G-Mean=0.8068 | MCC=0.8430
  [Logistic Regression] Iter  4 | Labeled=504 | Acc=0.8490 | F1=0.8446 | G-Mean=0.7974 | MCC=0.8398
  [Logistic Regression] Iter  5 | Labeled=505 | Acc=0.8540 | F1=0.8501 | G-Mean=0.8054 | MCC=0.8451
  [Logistic Regression] Iter  6 | Labeled=506 | Acc=0.8560 | F1=0.8521 | G-Mean=0.8070 | MCC=0.8472
  [Logistic Regression] Iter  7 | Labeled=507 | Acc=0.8540 | F1=0.8498 | G-Mean=0.8052 | MCC=0.8451
  [Logistic Regression] Iter  8 | Labeled=508 | Acc=0.8520 | F1=0.8473 | G-Mean=0.7999 | MCC=0.8430
  [Logistic Regression] Iter  9 | Labeled=509 | Acc=0.8550 | F1=0.8502 | G-Mean=0.8035 | MCC=0.8462
  [Logistic Regression] Iter 10 | Labeled=5