In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN, SMOTETomek
from sklearnex import patch_sklearn

In [3]:
patch_sklearn()

Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [4]:
df = pd.read_csv("bank-additional.csv", sep=';')
df = df.drop("duration", axis=1)
X = df.drop("y", axis=1)
y = (df["y"] == "yes").astype(int)
X = pd.get_dummies(X, drop_first=True)

In [12]:
target_count = df['y'].value_counts()
print(target_count)

y
no     36548
yes     4640
Name: count, dtype: int64


In [5]:
COST_MODEL = {'TP': 100, 'FP': -5, 'FN': -100, 'TN': 0}

In [6]:
def compute_business_value(cm, cost_structure):
    tn, fp, fn, tp = cm.ravel()
    return (tp * cost_structure['TP'] +
            tn * cost_structure['TN'] +
            fp * cost_structure['FP'] +
            fn * cost_structure['FN'])

def evaluate_model_with_business(model, X_train, y_train, X_test, y_test, cost_model, label=None):
    print(f"\n=== Evaluating {label} ===")

    print("[STEP] Fitting model...")
    model.fit(X_train, y_train)

    print("[STEP] Predicting probabilities...")
    y_proba = model.predict_proba(X_test)[:, 1]

    print("[STEP] Finding optimal threshold for business value...")
    best_t, best_val = find_best_threshold(y_test, y_proba, cost_model)

    print(f"[STEP] Applying threshold {best_t:.2f} to predictions...")
    y_pred = (y_proba >= best_t).astype(int)

    print("[STEP] Calculating metrics...")
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    bacc = balanced_accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    df_cm = pd.DataFrame(cm, index=["Actual No", "Actual Yes"], columns=["Pred No", "Pred Yes"])
    print("[RESULT] Confusion Matrix:")
    print(df_cm)

    print(f"[RESULT] Business Value at optimal threshold: {best_val}")

    return {
        'Model': label,
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1': f1,
        'BalancedAcc': bacc,
        'BusinessValue': best_val,
        'Threshold': best_t
    }

def grid_search_model(model_class, param_grid, X_train, y_train):
    grid = GridSearchCV(model_class, param_grid, cv=2, scoring='balanced_accuracy', n_jobs=-1)
    grid.fit(X_train, y_train)

    return grid.best_estimator_, grid.best_params_

def find_best_threshold(y_true, y_proba, cost_model):
    print("\n[INFO] Searching best threshold...")
    thresholds = np.linspace(0, 1, 101)

    best_value = -np.inf
    best_threshold = 0.5

    for t in thresholds:
        y_pred = (y_proba >= t).astype(int)
        cm = confusion_matrix(y_true, y_pred)
        value = compute_business_value(cm, cost_model)

        if value > best_value:
            best_value = value
            best_threshold = t

    print(f"[INFO] Best threshold found: {best_threshold:.2f} with BusinessValue {best_value}")
    return best_threshold, best_value


In [7]:
param_grids = {
    'DecisionTree': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [3, 5, 7, 10, 15],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 5],
        'ccp_alpha': [0.0, 0.001, 0.01]
    },
    'RandomForest': {
        'n_estimators': [100, 200],
        'max_depth': [5, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    },
    'GradientBoosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },
    # 'AdaBoost': {
    #     'n_estimators': [50, 100, 200],
    #     'learning_rate': [0.5, 1.0, 1.5]
    # },
    # 'XGBoost': {
    #     'n_estimators': [100, 200],
    #     'max_depth': [3, 5, 7],
    #     'learning_rate': [0.05, 0.1, 0.2],
    #     'subsample': [0.8, 1.0],
    #     'colsample_bytree': [0.8, 1.0]
    # }
}

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)
split_sets = {}

# Original
split_sets['Original'] = (X_train, X_test, y_train, y_test)

# SMOTE
X_train_sm, y_train_sm = SMOTE(random_state=42).fit_resample(X_train, y_train)
split_sets['SMOTE'] = (X_train_sm, X_test, y_train_sm, y_test)

# SMOTE_ENN
X_train_se, y_train_se = SMOTEENN(random_state=42).fit_resample(X_train, y_train)
split_sets['SMOTE_ENN'] = (X_train_se, X_test, y_train_se, y_test)

# SMOTE_Tomek
X_train_st, y_train_st = SMOTETomek(random_state=42).fit_resample(X_train, y_train)
split_sets['SMOTE_Tomek'] = (X_train_st, X_test, y_train_st, y_test)

# Random OverSampler
X_train_over, y_train_over = RandomOverSampler(random_state=42).fit_resample(X_train, y_train)
split_sets['Over'] = (X_train_over, X_test, y_train_over, y_test)

# Random UnderSampler
X_train_under, y_train_under = RandomUnderSampler(random_state=42).fit_resample(X_train, y_train)
split_sets['Under'] = (X_train_under, X_test, y_train_under, y_test)

In [9]:
results = []
selected_for_grid = ['Original', 'SMOTE', 'SMOTE_ENN', 'SMOTE_Tomek', 'Over', 'Under']

In [10]:
for split_label, (X_tr, X_te, y_tr, y_te) in split_sets.items():
    for clf_label, clf_class in {
        "DecisionTree": DecisionTreeClassifier,
        "RandomForest": RandomForestClassifier,
        "GradientBoosting": GradientBoostingClassifier,
        # "AdaBoost": AdaBoostClassifier,
        # "XGBoost": XGBClassifier
    }.items():
        if split_label in selected_for_grid:
            model, best_params = grid_search_model(clf_class(random_state=42), param_grids[clf_label], X_tr, y_tr)
        else:
            if clf_label == "XGBoost":
                model = clf_class(use_label_encoder=False, eval_metric='logloss', random_state=42)
            else:
                model = clf_class(random_state=42)
        name = f"{clf_label} ({split_label})"
        result = evaluate_model_with_business(model, X_tr, y_tr, X_te, y_te, COST_MODEL, label=name)
        results.append(result)


=== Evaluating DecisionTree (Original) ===
[STEP] Fitting model...
[STEP] Predicting probabilities...
[STEP] Finding optimal threshold for business value...

[INFO] Searching best threshold...
[INFO] Best threshold found: 0.00 with BusinessValue 84375
[STEP] Applying threshold 0.00 to predictions...
[STEP] Calculating metrics...
[RESULT] Confusion Matrix:
            Pred No  Pred Yes
Actual No         0     10965
Actual Yes        0      1392
[RESULT] Business Value at optimal threshold: 84375

=== Evaluating RandomForest (Original) ===
[STEP] Fitting model...
[STEP] Predicting probabilities...
[STEP] Finding optimal threshold for business value...

[INFO] Searching best threshold...
[INFO] Best threshold found: 0.01 with BusinessValue 84425
[STEP] Applying threshold 0.01 to predictions...
[STEP] Calculating metrics...
[RESULT] Confusion Matrix:
            Pred No  Pred Yes
Actual No       250     10715
Actual Yes        6      1386
[RESULT] Business Value at optimal threshold: 8442

In [11]:
results_df = pd.DataFrame(results)
results_df_sorted = results_df.sort_values(by="BusinessValue", ascending=False)
print("\nSorted by Business Value:")
print(results_df_sorted.reset_index(drop=True))


Sorted by Business Value:
                             Model  Accuracy  Precision    Recall        F1  \
0       RandomForest (SMOTE_Tomek)  0.116938   0.113134  1.000000  0.203271   
1             RandomForest (Under)  0.138869   0.115234  0.994971  0.206547   
2         RandomForest (SMOTE_ENN)  0.157482   0.117093  0.990661  0.209431   
3              RandomForest (Over)  0.115643   0.112987  1.000000  0.203034   
4             RandomForest (SMOTE)  0.115238   0.112941  1.000000  0.202960   
5         GradientBoosting (Under)  0.113701   0.112767  1.000000  0.202679   
6          RandomForest (Original)  0.132395   0.114536  0.995690  0.205440   
7          DecisionTree (Original)  0.112649   0.112649  1.000000  0.202487   
8             DecisionTree (SMOTE)  0.112649   0.112649  1.000000  0.202487   
9      GradientBoosting (Original)  0.112649   0.112649  1.000000  0.202487   
10      DecisionTree (SMOTE_Tomek)  0.112649   0.112649  1.000000  0.202487   
11    GradientBoosting (S

In [13]:
results_df_sorted.reset_index(drop=True)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,BalancedAcc,BusinessValue,Threshold
0,RandomForest (SMOTE_Tomek),0.116938,0.113134,1.0,0.203271,0.502417,84640,0.01
1,RandomForest (Under),0.138869,0.115234,0.994971,0.206547,0.512579,84630,0.17
2,RandomForest (SMOTE_ENN),0.157482,0.117093,0.990661,0.209431,0.521185,84610,0.03
3,RandomForest (Over),0.115643,0.112987,1.0,0.203034,0.501687,84560,0.02
4,RandomForest (SMOTE),0.115238,0.112941,1.0,0.20296,0.501459,84535,0.01
5,GradientBoosting (Under),0.113701,0.112767,1.0,0.202679,0.500593,84440,0.08
6,RandomForest (Original),0.132395,0.114536,0.99569,0.20544,0.509245,84425,0.01
7,DecisionTree (Original),0.112649,0.112649,1.0,0.202487,0.5,84375,0.0
8,DecisionTree (SMOTE),0.112649,0.112649,1.0,0.202487,0.5,84375,0.0
9,GradientBoosting (Original),0.112649,0.112649,1.0,0.202487,0.5,84375,0.0
