## Codebase for bank marketing dataset classification using boosting techniques

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from dataset import get_data

# Pull the dataset through the project's loader and preview the first five rows.
df = get_data()
print(df.head(n=5))

   age        job  marital    education  default housing loan    contact  \
0   56  housemaid  married     basic.4y       no      no   no  telephone   
1   57   services  married  high.school  unknown      no   no  telephone   
2   37   services  married  high.school       no     yes   no  telephone   
3   40     admin.  married     basic.6y       no      no   no  telephone   
4   56   services  married  high.school       no      no  yes  telephone   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         0  nonexistent          1.1   
1   may         mon  ...         1    999         0  nonexistent          1.1   
2   may         mon  ...         1    999         0  nonexistent          1.1   
3   may         mon  ...         1    999         0  nonexistent          1.1   
4   may         mon  ...         1    999         0  nonexistent          1.1   

   cons.price.idx  cons.conf.idx  euribor3m  nr.employed

Pipeline preprocessing

In [4]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek

# The 'y' column is the target; every remaining column is a predictor.
y = df['y']
X = df.drop(columns='y')

def preprocessing_pipeline(X, y, test_size=0.3, random_state=42):
    """Split the data, then rebalance only the training portion.

    A stratified train/test split is taken first; SMOTETomek is then fitted
    and applied to the training part alone, so the test part is returned
    exactly as produced by the split.

    Parameters
    ----------
    X : features passed to ``train_test_split``.
    y : target passed to ``train_test_split``; also used for stratification.
    test_size : forwarded to ``train_test_split`` (default 0.3).
    random_state : seed shared by the split and the SMOTETomek resampler.

    Returns
    -------
    tuple
        ``(X_resampled, X_test, y_resampled, y_test)`` — resampled training
        features, untouched test features, resampled training target,
        untouched test target.
    """
    split_parts = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )
    X_train, X_test, y_train, y_test = split_parts

    # Resample after the split so the held-out rows never reach the resampler.
    resampler = SMOTETomek(random_state=random_state)
    X_balanced, y_balanced = resampler.fit_resample(X_train, y_train)

    return X_balanced, X_test, y_balanced, y_test

# Shared cross-validation splitter: 5 stratified folds, shuffled with a fixed seed.
cv = StratifiedKFold(
    shuffle=True,
    n_splits=5,
    random_state=42,
)


### Baseline

Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

# This cell needs the six-value, DataFrame-only pipeline, which is NOT the
# `preprocessing_pipeline(X, y)` defined earlier in this notebook (that one
# requires `y` and returns four values). Import it explicitly, under an alias
# so the notebook's own function is not overwritten, instead of relying on
# whatever happens to be left in the kernel namespace.
# NOTE(review): presumably provided by the project's `preprocessing` module — confirm.
from preprocessing import preprocessing_pipeline as module_preprocessing_pipeline

# `encoders` and `scalers` are returned by the helper but not used in this cell.
X_resampled, X_test, y_resampled, y_test, encoders, scalers = module_preprocessing_pipeline(df)

# Baseline model: logistic regression with a fixed seed.
lr = LogisticRegression(max_iter=100, random_state=42)
lr.fit(X_resampled, y_resampled)

y_pred = lr.predict(X_test)

# Hold-out metrics; confusion_matrix(...).ravel() yields tn, fp, fn, tp for a binary target.
acc = accuracy_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f"Accuracy: {acc:.4f}")
print(f"Recall: {rec:.4f}")
print(f"Precision: {prec:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Confusion Matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}")

Accuracy: 0.8599
Recall: 0.8405
Precision: 0.4367
F1 Score: 0.5748
Confusion Matrix: TN=9456, FP=1509, FN=222, TP=1170


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
# Import only the helper this cell needs, under an alias. A wildcard import
# here would rebind `preprocessing_pipeline` (and any other exported name),
# silently replacing the notebook's own `preprocessing_pipeline(X, y)` that
# the boosting cells call with four return values.
# NOTE(review): assumes `preprocessing` exports `preprocessing_pipeline` — confirm.
from preprocessing import preprocessing_pipeline as module_preprocessing_pipeline

# `encoders` and `scalers` are returned by the helper but not used in this cell.
X_resampled, X_test, y_resampled, y_test, encoders, scalers = module_preprocessing_pipeline(df)

# Baseline model: logistic regression with a fixed seed.
lr = LogisticRegression(max_iter=100, random_state=42)
lr.fit(X_resampled, y_resampled)

y_pred = lr.predict(X_test)

# Hold-out metrics; confusion_matrix(...).ravel() yields tn, fp, fn, tp for a binary target.
acc = accuracy_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f"Accuracy: {acc:.4f}")
print(f"Recall: {rec:.4f}")
print(f"Precision: {prec:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Confusion Matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}")

Accuracy: 0.8600
Recall: 0.8405
Precision: 0.4369
F1 Score: 0.5749
Confusion Matrix: TN=9457, FP=1508, FN=222, TP=1170


### Boosting Model

AdaBoost

In [6]:
#Base Model
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

# Fresh split and resampled training data from preprocessing_pipeline.
X_resampled, X_test, y_resampled, y_test = preprocessing_pipeline(X, y)

# Untuned AdaBoost reference: 50 estimators, fixed seed. fit() returns the
# estimator itself, so training and prediction can be chained.
adb = AdaBoostClassifier(random_state=42, n_estimators=50)
y_pred = adb.fit(X_resampled, y_resampled).predict(X_test)

# Score the held-out predictions with each metric in turn.
acc, rec, prec, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, recall_score, precision_score, f1_score)
)
# Flattened binary confusion matrix comes out as tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(
    f"Accuracy: {acc:.4f}",
    f"Recall: {rec:.4f}",
    f"Precision: {prec:.4f}",
    f"F1 Score: {f1:.4f}",
    f"Confusion Matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}",
    sep="\n",
)

Accuracy: 0.8717
Recall: 0.8218
Precision: 0.4609
F1 Score: 0.5906
Confusion Matrix: TN=9627, FP=1338, FN=248, TP=1144


In [9]:
#Optimized Model GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

# Fresh split and resampled training data from preprocessing_pipeline.
X_resampled, X_test, y_resampled, y_test = preprocessing_pipeline(X, y)

# Search space: 5 ensemble sizes x 5 learning rates = 25 candidates.
param_grid = {
    'n_estimators': [50, 100, 200, 300, 500],
    'learning_rate': [ 0.01, 0.1, 0.3, 0.5, 1.0],
}

adb = AdaBoostClassifier(random_state=42)

# NOTE(review): the folds are drawn from data that has already been resampled,
# so the cross-validated F1 is measured on resampled rows rather than on the
# original class balance; consider resampling inside each fold (e.g. an
# imblearn Pipeline) — confirm whether that is intended.
grid_search = GridSearchCV(adb, param_grid, cv=cv, scoring='f1', n_jobs=-1)
grid_search.fit(X_resampled, y_resampled)


best_adb = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

# GridSearchCV refits the winning configuration on the full training data
# (refit=True is the default), so best_estimator_ is already the model a
# manual rebuild-and-fit with the same parameters and seed would produce.
# Reuse it rather than paying for a second identical fit.
opt_adb = best_adb

y_pred = opt_adb.predict(X_test)

# Hold-out metrics; confusion_matrix(...).ravel() yields tn, fp, fn, tp for a binary target.
acc = accuracy_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f"Accuracy: {acc:.4f}")
print(f"Recall: {rec:.4f}")
print(f"Precision: {prec:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Confusion Matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}")

Best Parameters: {'learning_rate': 1.0, 'n_estimators': 500}
Accuracy: 0.8943
Recall: 0.7522
Precision: 0.5214
F1 Score: 0.6159
Confusion Matrix: TN=10004, FP=961, FN=345, TP=1047


Gradient Boosting

In [10]:
#Base Model
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

# Fresh split and resampled training data from preprocessing_pipeline.
X_resampled, X_test, y_resampled, y_test = preprocessing_pipeline(X, y)

# Untuned gradient-boosting reference: 100 estimators, learning rate 0.1,
# fixed seed. fit() returns the estimator, so prediction is chained onto it.
gb = GradientBoostingClassifier(
    random_state=42,
    learning_rate=0.1,
    n_estimators=100,
)
y_pred = gb.fit(X_resampled, y_resampled).predict(X_test)

# Score the held-out predictions with each metric in turn.
acc, rec, prec, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, recall_score, precision_score, f1_score)
)
# Flattened binary confusion matrix comes out as tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(
    f"Accuracy: {acc:.4f}",
    f"Recall: {rec:.4f}",
    f"Precision: {prec:.4f}",
    f"F1 Score: {f1:.4f}",
    f"Confusion Matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}",
    sep="\n",
)

Accuracy: 0.8832
Recall: 0.8463
Precision: 0.4894
F1 Score: 0.6202
Confusion Matrix: TN=9736, FP=1229, FN=214, TP=1178


In [11]:
#Optimized Model using GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

# Fresh split and resampled training data from preprocessing_pipeline.
X_resampled, X_test, y_resampled, y_test = preprocessing_pipeline(X, y)

# Search space: 5 ensemble sizes x 5 learning rates x 3 tree depths = 75 candidates.
param_grid = {
    'n_estimators': [50, 100, 200, 300, 500],
    'learning_rate': [0.01, 0.1, 0.3, 0.5, 1.0],
    'max_depth': [3, 5, 7]
}

gb = GradientBoostingClassifier(random_state=42)
# NOTE(review): the folds are drawn from data that has already been resampled,
# so the cross-validated F1 is measured on resampled rows rather than on the
# original class balance; consider resampling inside each fold (e.g. an
# imblearn Pipeline) — confirm whether that is intended.
grid_search = GridSearchCV(gb, param_grid, cv=cv, scoring='f1', n_jobs=-1)
grid_search.fit(X_resampled, y_resampled)

best_gb = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

# GridSearchCV refits the winning configuration on the full training data
# (refit=True is the default), so best_estimator_ is already the model a
# manual rebuild-and-fit with the same parameters and seed would produce.
# Reuse it rather than paying for a second identical fit.
opt_gb = best_gb

y_pred = opt_gb.predict(X_test)

# Hold-out metrics; confusion_matrix(...).ravel() yields tn, fp, fn, tp for a binary target.
acc = accuracy_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f"Accuracy: {acc:.4f}")
print(f"Recall: {rec:.4f}")
print(f"Precision: {prec:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Confusion Matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}")

Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 500}
Accuracy: 0.9120
Recall: 0.6466
Precision: 0.6020
F1 Score: 0.6235
Confusion Matrix: TN=10370, FP=595, FN=492, TP=900


XGBoost

In [14]:
#Base Model
import xgboost as xgb
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

# Fresh split and resampled training data from preprocessing_pipeline.
X_resampled, X_test, y_resampled, y_test = preprocessing_pipeline(X, y)

# Keep the estimator under its own name. Binding it to `xgb` would overwrite
# the `xgboost` module alias, after which any `xgb.XGBClassifier(...)` call
# fails until the module is imported again.
xgb_clf = xgb.XGBClassifier(tree_method="hist", device="cuda", random_state=42)
xgb_clf.fit(X_resampled, y_resampled)

y_pred = xgb_clf.predict(X_test)

# Hold-out metrics; confusion_matrix(...).ravel() yields tn, fp, fn, tp for a binary target.
acc = accuracy_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f"Accuracy: {acc:.4f}")
print(f"Recall: {rec:.4f}")
print(f"Precision: {prec:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Confusion Matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}")

Accuracy: 0.9119
Recall: 0.6825
Precision: 0.5949
F1 Score: 0.6357
Confusion Matrix: TN=10318, FP=647, FN=442, TP=950


In [None]:
#Optimized Model GridSearchCV
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
# Import exactly the two names this cell uses. The pipeline is aliased so the
# notebook's own `preprocessing_pipeline(X, y)` is not overwritten, which a
# wildcard import would do.
# NOTE(review): assumes `preprocessing` exports both names — confirm.
from preprocessing import cv_5fold
from preprocessing import preprocessing_pipeline as module_preprocessing_pipeline

# `encoders` and `scalers` are returned by the helper but not used in this cell.
X_resampled, X_test, y_resampled, y_test, encoders, scalers = module_preprocessing_pipeline(df)

# Search space: 5 ensemble sizes x 5 learning rates x 3 subsample ratios = 75 candidates.
param_grid = {
    'n_estimators': [50, 100, 200, 300, 500],
    'learning_rate': [ 0.01, 0.1, 0.3, 0.5, 1.0],
    'subsample': [0.5, 0.8, 1.0]
}

xgbc = xgb.XGBClassifier(tree_method="hist", device="cuda", random_state=42)
# NOTE(review): `cv_5fold` is passed uncalled here; if it is a factory function
# rather than a splitter object it must be called (`cv_5fold()`) — confirm
# against the preprocessing module.
grid_search = GridSearchCV(xgbc, param_grid, cv=cv_5fold, scoring='f1', n_jobs=-1)

grid_search.fit(X_resampled, y_resampled)
best_xgb = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

# GridSearchCV refits the winning configuration on the full training data
# (refit=True is the default), so best_estimator_ already carries the chosen
# n_estimators / learning_rate / subsample together with the base tree_method,
# device and seed. Reuse it rather than rebuilding and refitting the same model.
opt_xgb = best_xgb
y_pred = opt_xgb.predict(X_test)


acc = accuracy_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# For a binary target, ravel() flattens the matrix row by row as tn, fp, fn, tp.
# Unpacking in any other order mislabels the counts in the report below.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f"Accuracy: {acc}")
print(f"Recall: {rec}")
print(f"Precision: {prec}")
print(f"F1 Score: {f1}")
print(f"Confusion Matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}")

Best Parameters: {'learning_rate': 0.1, 'n_estimators': 500, 'subsample': 0.8}
Accuracy: 0.9113862588006798
Recall: 0.6436781609195402
Precision: 0.5993311036789297
F1 Score: 0.6207135434707308
Confusion Matrix:
[[10366   599]
 [  496   896]]


In [4]:
#Optimized Model GridSearchCV
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
# Import exactly the two names this cell uses. The pipeline is aliased so the
# notebook's own `preprocessing_pipeline(X, y)` is not overwritten, which a
# wildcard import would do.
# NOTE(review): assumes `preprocessing` exports both names — confirm.
from preprocessing import cv_5fold
from preprocessing import preprocessing_pipeline as module_preprocessing_pipeline

# `encoders` and `scalers` are returned by the helper but not used in this cell.
X_resampled, X_test, y_resampled, y_test, encoders, scalers = module_preprocessing_pipeline(df)

# Search space: 5 ensemble sizes x 5 learning rates x 3 subsample ratios = 75 candidates.
param_grid = {
    'n_estimators': [50, 100, 200, 300, 500],
    'learning_rate': [ 0.01, 0.1, 0.3, 0.5, 1.0],
    'subsample': [0.5, 0.8, 1.0]
}

xgbc = xgb.XGBClassifier(tree_method="hist", device="cuda", random_state=42)
# `cv_5fold()` is called here to obtain the cross-validation splitter.
grid_search = GridSearchCV(xgbc, param_grid, cv=cv_5fold(), scoring='f1', n_jobs=-1)

grid_search.fit(X_resampled, y_resampled)
best_xgb = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

# GridSearchCV refits the winning configuration on the full training data
# (refit=True is the default), so best_estimator_ already carries the chosen
# n_estimators / learning_rate / subsample together with the base tree_method,
# device and seed. Reuse it rather than rebuilding and refitting the same model.
opt_xgb = best_xgb
y_pred = opt_xgb.predict(X_test)


acc = accuracy_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# For a binary target, ravel() flattens the matrix row by row as tn, fp, fn, tp.
# Unpacking in any other order mislabels the counts in the report below.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f"Accuracy: {acc}")
print(f"Recall: {rec}")
print(f"Precision: {prec}")
print(f"F1 Score: {f1}")
print(f"Confusion Matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}")

Best Parameters: {'learning_rate': 0.1, 'n_estimators': 500, 'subsample': 0.5}
Accuracy: 0.910091446143886
Recall: 0.6429597701149425
Precision: 0.5931080185553347
F1 Score: 0.6170286108238539
Confusion Matrix: TN=10351, FP=614, FN=497, TP=895


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
