In [23]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN, SMOTETomek
from sklearnex import patch_sklearn

In [24]:
patch_sklearn()

Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [25]:
df = pd.read_csv("bank-additional.csv", sep=';')
df = df.drop("duration", axis=1)
X = df.drop("y", axis=1)
y = (df["y"] == "yes").astype(int)
X = pd.get_dummies(X, drop_first=True)

In [26]:
target_count = df['y'].value_counts()
print(target_count)

y
no     36548
yes     4640
Name: count, dtype: int64


In [27]:
COST_MODEL = {'TP': 100, 'FP': -5, 'FN': -100, 'TN': 0}

In [28]:
def make_ann(random_state=42):
    return Pipeline(
        [
            ("scaler", StandardScaler()),
            ("clf", MLPClassifier(random_state=random_state)),
        ]
    )


def compute_business_value(cm, cost_structure):
    tn, fp, fn, tp = cm.ravel()
    return (
        tp * cost_structure["TP"]
        + tn * cost_structure["TN"]
        + fp * cost_structure["FP"]
        + fn * cost_structure["FN"]
    )


def evaluate_model_with_business(
    model, X_train, y_train, X_test, y_test, cost_model, label=None
):
    print(f"\n=== Evaluating {label} ===")

    print("[STEP] Fitting model...")
    model.fit(X_train, y_train)

    print("[STEP] Predicting probabilities...")
    y_proba = model.predict_proba(X_test)[:, 1]

    print("[STEP] Finding optimal threshold for business value...")
    best_t, best_val = find_best_threshold(y_test, y_proba, cost_model)

    print(f"[STEP] Applying threshold {best_t:.2f} to predictions...")
    y_pred = (y_proba >= best_t).astype(int)

    print("[STEP] Calculating metrics...")
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    bacc = balanced_accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    df_cm = pd.DataFrame(
        cm, index=["Actual No", "Actual Yes"], columns=["Pred No", "Pred Yes"]
    )
    print("[RESULT] Confusion Matrix:")
    print(df_cm)

    print(f"[RESULT] Business Value at optimal threshold: {best_val}")

    return {
        "Model": label,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1,
        "BalancedAcc": bacc,
        "BusinessValue": best_val,
        "Threshold": best_t,
    }


def grid_search_model(model_class, param_grid, X_train, y_train):
    grid = GridSearchCV(model_class, param_grid, cv=2, scoring="balanced_accuracy")
    grid.fit(X_train, y_train)

    return grid.best_estimator_, grid.best_params_


def find_best_threshold(y_true, y_proba, cost_model):
    print("\n[INFO] Searching best threshold...")
    thresholds = np.linspace(0, 1, 101)

    best_value = -np.inf
    best_threshold = 0.5

    for t in thresholds:
        y_pred = (y_proba >= t).astype(int)
        cm = confusion_matrix(y_true, y_pred)
        value = compute_business_value(cm, cost_model)

        if value > best_value:
            best_value = value
            best_threshold = t

    print(
        f"[INFO] Best threshold found: {best_threshold:.2f} with BusinessValue {best_value}"
    )
    return best_threshold, best_value


def run_model(clf_label, random_state=42):
    """
    Run one model (by label) across all split_sets with grid search
    and business-metric evaluation.
    """

    if clf_label not in model_classes:
        raise ValueError(f"Unknown model: {clf_label}")

    results = []

    clf_factory = model_classes[clf_label]
    param_grid = param_grids.get(clf_label)

    for split_label, (X_tr, X_te, y_tr, y_te) in split_sets.items():
        print("\n" + "=" * 50)
        print(f"Model: {clf_label} | Split: {split_label}")
        print("=" * 50)

        base_model = clf_factory(random_state)

        # Grid search if params exist
        if param_grid:
            model, best_params = grid_search_model(
                base_model,
                param_grid,
                X_tr,
                y_tr,
            )
            print("[INFO] Best params:", best_params)
        else:
            model = base_model
            model.fit(X_tr, y_tr)
            best_params = None

        result = evaluate_model_with_business(
            model,
            X_tr,
            y_tr,
            X_te,
            y_te,
            COST_MODEL,
            label=f"{clf_label} ({split_label})",
        )

        result.update(
            {
                "Model": clf_label,
                "Split": split_label,
                "BestParams": best_params,
            }
        )

        results.append(result)

    return pd.DataFrame(results)

In [29]:
model_classes = {
    "DecisionTree": lambda rs: DecisionTreeClassifier(random_state=rs),
    "RandomForest": lambda rs: RandomForestClassifier(random_state=rs),
    "GradientBoosting": lambda rs: GradientBoostingClassifier(random_state=rs),
    "AdaBoost": lambda rs: AdaBoostClassifier(random_state=rs),
    "XGBoost": lambda rs: XGBClassifier(eval_metric="logloss", random_state=rs),
    "ANN": make_ann,
}

In [None]:
param_grids = {
    "DecisionTree": {
        "criterion": ["gini", "entropy"],
        "max_depth": [3, 5, 7, 10, 15],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 5],
        "ccp_alpha": [0.0, 0.001, 0.01],
    },
    "RandomForest": {
        "n_estimators": [100, 200],
        "max_depth": [5, 10, 20],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2],
    },
    "GradientBoosting": {
        "n_estimators": [100, 200],
        "learning_rate": [0.05, 0.1, 0.2],
        "max_depth": [3, 5, 7],
    },
    "AdaBoost": {"n_estimators": [50, 100, 200], "learning_rate": [0.5, 1.0, 1.5]},
    "XGBoost": {
        "n_estimators": [100, 200],
        "max_depth": [3, 5, 7],
        "learning_rate": [0.05, 0.1, 0.2],
        "subsample": [0.8, 1.0],
        "colsample_bytree": [0.8, 1.0],
        # "n_jobs": 1
    },
    "ANN": {
        "clf__hidden_layer_sizes": [(32,), (64,), (32, 16)],
        "clf__activation": ["relu", "tanh"],
        "clf__alpha": [0.0001, 0.001],
        "clf__learning_rate_init": [0.001, 0.01],
        "clf__max_iter": [200],
    },
}

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)
split_sets = {}

# Original
split_sets['Original'] = (X_train, X_test, y_train, y_test)

# SMOTE
X_train_sm, y_train_sm = SMOTE(random_state=42).fit_resample(X_train, y_train)
split_sets['SMOTE'] = (X_train_sm, X_test, y_train_sm, y_test)

# SMOTE_ENN
X_train_se, y_train_se = SMOTEENN(random_state=42).fit_resample(X_train, y_train)
split_sets['SMOTE_ENN'] = (X_train_se, X_test, y_train_se, y_test)

# SMOTE_Tomek
X_train_st, y_train_st = SMOTETomek(random_state=42).fit_resample(X_train, y_train)
split_sets['SMOTE_Tomek'] = (X_train_st, X_test, y_train_st, y_test)

# Random OverSampler
X_train_over, y_train_over = RandomOverSampler(random_state=42).fit_resample(X_train, y_train)
split_sets['Over'] = (X_train_over, X_test, y_train_over, y_test)

# Random UnderSampler
X_train_under, y_train_under = RandomUnderSampler(random_state=42).fit_resample(X_train, y_train)
split_sets['Under'] = (X_train_under, X_test, y_train_under, y_test)

In [32]:
dt_results = run_model("DecisionTree")


Model: DecisionTree | Split: Original
[INFO] Best params: {'ccp_alpha': 0.0, 'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 2}

=== Evaluating DecisionTree (Original) ===
[STEP] Fitting model...
[STEP] Predicting probabilities...
[STEP] Finding optimal threshold for business value...

[INFO] Searching best threshold...
[INFO] Best threshold found: 0.00 with BusinessValue 84375
[STEP] Applying threshold 0.00 to predictions...
[STEP] Calculating metrics...
[RESULT] Confusion Matrix:
            Pred No  Pred Yes
Actual No         0     10965
Actual Yes        0      1392
[RESULT] Business Value at optimal threshold: 84375

Model: DecisionTree | Split: SMOTE
[INFO] Best params: {'ccp_alpha': 0.0, 'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2}

=== Evaluating DecisionTree (SMOTE) ===
[STEP] Fitting model...
[STEP] Predicting probabilities...
[STEP] Finding optimal threshold for business value...

[INFO] Searching best

In [33]:
rf_results = run_model("RandomForest")


Model: RandomForest | Split: Original
[INFO] Best params: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}

=== Evaluating RandomForest (Original) ===
[STEP] Fitting model...
[STEP] Predicting probabilities...
[STEP] Finding optimal threshold for business value...

[INFO] Searching best threshold...
[INFO] Best threshold found: 0.01 with BusinessValue 84425
[STEP] Applying threshold 0.01 to predictions...
[STEP] Calculating metrics...
[RESULT] Confusion Matrix:
            Pred No  Pred Yes
Actual No       250     10715
Actual Yes        6      1386
[RESULT] Business Value at optimal threshold: 84425

Model: RandomForest | Split: SMOTE
[INFO] Best params: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}

=== Evaluating RandomForest (SMOTE) ===
[STEP] Fitting model...
[STEP] Predicting probabilities...
[STEP] Finding optimal threshold for business value...

[INFO] Searching best threshold...
[INFO] Best threshold

In [35]:
gb_results = run_model("GradientBoosting")


Model: GradientBoosting | Split: Original
[INFO] Best params: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}

=== Evaluating GradientBoosting (Original) ===
[STEP] Fitting model...
[STEP] Predicting probabilities...
[STEP] Finding optimal threshold for business value...

[INFO] Searching best threshold...
[INFO] Best threshold found: 0.00 with BusinessValue 84375
[STEP] Applying threshold 0.00 to predictions...
[STEP] Calculating metrics...
[RESULT] Confusion Matrix:
            Pred No  Pred Yes
Actual No         0     10965
Actual Yes        0      1392
[RESULT] Business Value at optimal threshold: 84375

Model: GradientBoosting | Split: SMOTE
[INFO] Best params: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200}

=== Evaluating GradientBoosting (SMOTE) ===
[STEP] Fitting model...
[STEP] Predicting probabilities...
[STEP] Finding optimal threshold for business value...

[INFO] Searching best threshold...
[INFO] Best threshold found: 0.00 with BusinessValue 8437

In [36]:
ab_results = run_model("AdaBoost")


Model: AdaBoost | Split: Original
[INFO] Best params: {'learning_rate': 1.5, 'n_estimators': 200}

=== Evaluating AdaBoost (Original) ===
[STEP] Fitting model...
[STEP] Predicting probabilities...
[STEP] Finding optimal threshold for business value...

[INFO] Searching best threshold...
[INFO] Best threshold found: 0.39 with BusinessValue 85080
[STEP] Applying threshold 0.39 to predictions...
[STEP] Calculating metrics...
[RESULT] Confusion Matrix:
            Pred No  Pred Yes
Actual No       421     10544
Actual Yes        7      1385
[RESULT] Business Value at optimal threshold: 85080

Model: AdaBoost | Split: SMOTE
[INFO] Best params: {'learning_rate': 1.5, 'n_estimators': 200}

=== Evaluating AdaBoost (SMOTE) ===
[STEP] Fitting model...
[STEP] Predicting probabilities...
[STEP] Finding optimal threshold for business value...

[INFO] Searching best threshold...
[INFO] Best threshold found: 0.41 with BusinessValue 84385
[STEP] Applying threshold 0.41 to predictions...
[STEP] Calcul

In [37]:
xgb_results = run_model("XGBoost")


Model: XGBoost | Split: Original
[INFO] Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}

=== Evaluating XGBoost (Original) ===
[STEP] Fitting model...
[STEP] Predicting probabilities...
[STEP] Finding optimal threshold for business value...

[INFO] Searching best threshold...
[INFO] Best threshold found: 0.00 with BusinessValue 84375
[STEP] Applying threshold 0.00 to predictions...
[STEP] Calculating metrics...
[RESULT] Confusion Matrix:
            Pred No  Pred Yes
Actual No         0     10965
Actual Yes        0      1392
[RESULT] Business Value at optimal threshold: 84375

Model: XGBoost | Split: SMOTE
[INFO] Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}

=== Evaluating XGBoost (SMOTE) ===
[STEP] Fitting model...
[STEP] Predicting probabilities...
[STEP] Finding optimal threshold for business value...

[INFO] Searching best threshold...
[INFO] 

In [38]:
ann_results = run_model("ANN")


Model: ANN | Split: Original




[INFO] Best params: {'clf__activation': 'relu', 'clf__alpha': 0.001, 'clf__hidden_layer_sizes': (32, 16), 'clf__learning_rate_init': 0.001, 'clf__max_iter': 200}

=== Evaluating ANN (Original) ===
[STEP] Fitting model...




[STEP] Predicting probabilities...
[STEP] Finding optimal threshold for business value...

[INFO] Searching best threshold...
[INFO] Best threshold found: 0.00 with BusinessValue 84375
[STEP] Applying threshold 0.00 to predictions...
[STEP] Calculating metrics...
[RESULT] Confusion Matrix:
            Pred No  Pred Yes
Actual No         0     10965
Actual Yes        0      1392
[RESULT] Business Value at optimal threshold: 84375

Model: ANN | Split: SMOTE




[INFO] Best params: {'clf__activation': 'relu', 'clf__alpha': 0.001, 'clf__hidden_layer_sizes': (64,), 'clf__learning_rate_init': 0.01, 'clf__max_iter': 200}

=== Evaluating ANN (SMOTE) ===
[STEP] Fitting model...
[STEP] Predicting probabilities...
[STEP] Finding optimal threshold for business value...

[INFO] Searching best threshold...
[INFO] Best threshold found: 0.00 with BusinessValue 84375
[STEP] Applying threshold 0.00 to predictions...
[STEP] Calculating metrics...
[RESULT] Confusion Matrix:
            Pred No  Pred Yes
Actual No         0     10965
Actual Yes        0      1392
[RESULT] Business Value at optimal threshold: 84375

Model: ANN | Split: SMOTE_ENN




[INFO] Best params: {'clf__activation': 'relu', 'clf__alpha': 0.001, 'clf__hidden_layer_sizes': (32,), 'clf__learning_rate_init': 0.001, 'clf__max_iter': 200}

=== Evaluating ANN (SMOTE_ENN) ===
[STEP] Fitting model...




[STEP] Predicting probabilities...
[STEP] Finding optimal threshold for business value...

[INFO] Searching best threshold...
[INFO] Best threshold found: 0.00 with BusinessValue 84375
[STEP] Applying threshold 0.00 to predictions...
[STEP] Calculating metrics...
[RESULT] Confusion Matrix:
            Pred No  Pred Yes
Actual No         0     10965
Actual Yes        0      1392
[RESULT] Business Value at optimal threshold: 84375

Model: ANN | Split: SMOTE_Tomek




[INFO] Best params: {'clf__activation': 'relu', 'clf__alpha': 0.0001, 'clf__hidden_layer_sizes': (64,), 'clf__learning_rate_init': 0.001, 'clf__max_iter': 200}

=== Evaluating ANN (SMOTE_Tomek) ===
[STEP] Fitting model...




[STEP] Predicting probabilities...
[STEP] Finding optimal threshold for business value...

[INFO] Searching best threshold...
[INFO] Best threshold found: 0.00 with BusinessValue 84375
[STEP] Applying threshold 0.00 to predictions...
[STEP] Calculating metrics...
[RESULT] Confusion Matrix:
            Pred No  Pred Yes
Actual No         0     10965
Actual Yes        0      1392
[RESULT] Business Value at optimal threshold: 84375

Model: ANN | Split: Over




[INFO] Best params: {'clf__activation': 'tanh', 'clf__alpha': 0.0001, 'clf__hidden_layer_sizes': (64,), 'clf__learning_rate_init': 0.01, 'clf__max_iter': 200}

=== Evaluating ANN (Over) ===
[STEP] Fitting model...




[STEP] Predicting probabilities...
[STEP] Finding optimal threshold for business value...

[INFO] Searching best threshold...
[INFO] Best threshold found: 0.00 with BusinessValue 84375
[STEP] Applying threshold 0.00 to predictions...
[STEP] Calculating metrics...
[RESULT] Confusion Matrix:
            Pred No  Pred Yes
Actual No         0     10965
Actual Yes        0      1392
[RESULT] Business Value at optimal threshold: 84375

Model: ANN | Split: Under




[INFO] Best params: {'clf__activation': 'relu', 'clf__alpha': 0.001, 'clf__hidden_layer_sizes': (32,), 'clf__learning_rate_init': 0.001, 'clf__max_iter': 200}

=== Evaluating ANN (Under) ===
[STEP] Fitting model...




[STEP] Predicting probabilities...
[STEP] Finding optimal threshold for business value...

[INFO] Searching best threshold...
[INFO] Best threshold found: 0.00 with BusinessValue 84375
[STEP] Applying threshold 0.00 to predictions...
[STEP] Calculating metrics...
[RESULT] Confusion Matrix:
            Pred No  Pred Yes
Actual No         0     10965
Actual Yes        0      1392
[RESULT] Business Value at optimal threshold: 84375


In [45]:
results = pd.concat([dt_results, rf_results, gb_results, ab_results, xgb_results, ann_results])


In [46]:
results_df = pd.DataFrame(results)
results_df_sorted = results_df.sort_values(by="BusinessValue", ascending=False)
print("\nSorted by Business Value:")
print(results_df_sorted.reset_index(drop=True))


Sorted by Business Value:
               Model  Accuracy  Precision    Recall        F1  BalancedAcc  \
0           AdaBoost  0.146152   0.116104  0.994971  0.207942     0.516683   
1            XGBoost  0.137978   0.115193  0.995690  0.206496     0.512391   
2       RandomForest  0.116938   0.113134  1.000000  0.203271     0.502417   
3       RandomForest  0.138869   0.115234  0.994971  0.206547     0.512579   
4           AdaBoost  0.119851   0.113403  0.999282  0.203690     0.503745   
5       RandomForest  0.157482   0.117093  0.990661  0.209431     0.521185   
6       RandomForest  0.115643   0.112987  1.000000  0.203034     0.501687   
7       RandomForest  0.115238   0.112941  1.000000  0.202960     0.501459   
8   GradientBoosting  0.113701   0.112767  1.000000  0.202679     0.500593   
9           AdaBoost  0.113458   0.112740  1.000000  0.202635     0.500456   
10      RandomForest  0.132395   0.114536  0.995690  0.205440     0.509245   
11          AdaBoost  0.113296   0.11

In [47]:
results_df_sorted.reset_index(drop=True)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,BalancedAcc,BusinessValue,Threshold,Split,BestParams
0,AdaBoost,0.146152,0.116104,0.994971,0.207942,0.516683,85080,0.39,Original,"{'learning_rate': 1.5, 'n_estimators': 200}"
1,XGBoost,0.137978,0.115193,0.99569,0.206496,0.512391,84770,0.14,Under,"{'colsample_bytree': 1.0, 'learning_rate': 0.1..."
2,RandomForest,0.116938,0.113134,1.0,0.203271,0.502417,84640,0.01,SMOTE_Tomek,"{'max_depth': 20, 'min_samples_leaf': 1, 'min_..."
3,RandomForest,0.138869,0.115234,0.994971,0.206547,0.512579,84630,0.17,Under,"{'max_depth': 10, 'min_samples_leaf': 1, 'min_..."
4,AdaBoost,0.119851,0.113403,0.999282,0.20369,0.503745,84625,0.42,Over,"{'learning_rate': 1.5, 'n_estimators': 200}"
5,RandomForest,0.157482,0.117093,0.990661,0.209431,0.521185,84610,0.03,SMOTE_ENN,"{'max_depth': 20, 'min_samples_leaf': 1, 'min_..."
6,RandomForest,0.115643,0.112987,1.0,0.203034,0.501687,84560,0.02,Over,"{'max_depth': 20, 'min_samples_leaf': 1, 'min_..."
7,RandomForest,0.115238,0.112941,1.0,0.20296,0.501459,84535,0.01,SMOTE,"{'max_depth': 20, 'min_samples_leaf': 1, 'min_..."
8,GradientBoosting,0.113701,0.112767,1.0,0.202679,0.500593,84440,0.08,Under,"{'learning_rate': 0.05, 'max_depth': 5, 'n_est..."
9,AdaBoost,0.113458,0.11274,1.0,0.202635,0.500456,84425,0.43,SMOTE_ENN,"{'learning_rate': 1.5, 'n_estimators': 200}"


In [49]:
file_content = results.to_csv('./results.csv')

In [50]:
dt_results.to_csv('./dt_results.csv')