In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)

In [11]:
# Load the train / test splits (semicolon-delimited CSV files).
train_df = pd.read_csv("Data/bank-additional-train.csv", sep=';')
test_df  = pd.read_csv("Data/bank-additional-test.csv", sep=';')

# Encode target: 'no' -> 0, 'yes' -> 1. Series.map turns any other label into
# NaN, so fail loudly here instead of deep inside an estimator's fit().
train_df['y'] = train_df['y'].map({'no': 0, 'yes': 1})
test_df['y']  = test_df['y'].map({'no': 0, 'yes': 1})
if train_df['y'].isna().any() or test_df['y'].isna().any():
    raise ValueError("Target column 'y' contains labels other than 'no'/'yes'.")

# One-hot encoding. The training frame defines the feature space:
# drop_first=True removes the first level of every categorical column there.
train_df = pd.get_dummies(train_df, drop_first=True)

# Encode the test frame WITHOUT drop_first and then align it to the training
# columns. Dropping the first level independently on the test frame would drop
# a *different* level whenever the test split lacks the level dropped in
# training, and reindex would then zero-fill a column that should contain 1s.
# Aligning afterwards discards exactly the levels dropped (or never seen) in
# training and zero-fills levels that are absent from the test split.
test_df = pd.get_dummies(test_df)
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

# Split features / target
X_train = train_df.drop('y', axis=1)
y_train = train_df['y']
X_test  = test_df.drop('y', axis=1)
y_test  = test_df['y']

# Scaling: fit the scaler once on the raw training features and reuse its
# statistics for the test features. Standardising a second time (as this cell
# used to do) leaves `scaler` fitted on already-scaled data, so it could no
# longer be applied to raw rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)

# No model is trained here: the logistic-regression model is built and fitted
# in the following cell, so fitting one in this cell was duplicated work.



In [12]:
# Baseline classifier: logistic regression with a raised iteration cap
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Hard labels plus the probability column for class index 1 on the test set
y_pred, y_prob = model.predict(X_test), model.predict_proba(X_test)[:, 1]

# Label-based metrics all share the (y_true, y_pred) call shape
accuracy, precision, recall, f1, mcc = (
    scorer(y_test, y_pred)
    for scorer in (
        accuracy_score,
        precision_score,
        recall_score,
        f1_score,
        matthews_corrcoef,
    )
)

# ROC AUC is the only metric computed from the probabilities
auc = roc_auc_score(y_test, y_prob)


Decision Tree Classifier

In [13]:
# Decision tree with a fixed seed; max_depth caps the tree depth to limit overfitting
dt_model = DecisionTreeClassifier(max_depth=10, random_state=42).fit(X_train, y_train)

# Test-set labels and the probability column for class index 1
y_pred_dt, y_prob_dt = (
    dt_model.predict(X_test),
    dt_model.predict_proba(X_test)[:, 1],
)


In [14]:
# Decision-tree metrics: label-based scores first, then ROC AUC from probabilities
accuracy_dt, precision_dt, recall_dt, f1_dt, mcc_dt = (
    scorer(y_test, y_pred_dt)
    for scorer in (
        accuracy_score,
        precision_score,
        recall_score,
        f1_score,
        matthews_corrcoef,
    )
)
auc_dt = roc_auc_score(y_test, y_prob_dt)

K-Nearest Neighbors

In [15]:
# 5-neighbour classifier using the Minkowski metric, fitted on the training set
knn_model = KNeighborsClassifier(metric='minkowski', n_neighbors=5).fit(X_train, y_train)

# Bare name as the last expression so the fitted estimator is still displayed
knn_model


0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [16]:
# KNN test-set labels and the probability column for class index 1
y_pred_knn, y_prob_knn = (
    knn_model.predict(X_test),
    knn_model.predict_proba(X_test)[:, 1],
)


In [17]:
# KNN metrics: label-based scores first, then ROC AUC from probabilities
accuracy_knn, precision_knn, recall_knn, f1_knn, mcc_knn = (
    scorer(y_test, y_pred_knn)
    for scorer in (
        accuracy_score,
        precision_score,
        recall_score,
        f1_score,
        matthews_corrcoef,
    )
)
auc_knn = roc_auc_score(y_test, y_prob_knn)


Naive Bayes Classifier

In [18]:
# Gaussian Naive Bayes with default settings, fitted on the training set
nb_model = GaussianNB().fit(X_train, y_train)

# Bare name as the last expression so the fitted estimator is still displayed
nb_model


0,1,2
,priors,
,var_smoothing,1e-09


In [19]:
# Naive Bayes test-set labels and the probability column for class index 1
y_pred_nb, y_prob_nb = (
    nb_model.predict(X_test),
    nb_model.predict_proba(X_test)[:, 1],
)


In [20]:
# Naive Bayes metrics: label-based scores first, then ROC AUC from probabilities
accuracy_nb, precision_nb, recall_nb, f1_nb, mcc_nb = (
    scorer(y_test, y_pred_nb)
    for scorer in (
        accuracy_score,
        precision_score,
        recall_score,
        f1_score,
        matthews_corrcoef,
    )
)
auc_nb = roc_auc_score(y_test, y_prob_nb)


Ensemble Model – Random Forest

In [21]:
# Random forest: 200 trees, depth capped at 15, fixed seed, n_jobs=-1 for parallel fitting
rf_model = RandomForestClassifier(
    n_estimators=200, max_depth=15, n_jobs=-1, random_state=42
).fit(X_train, y_train)

# Bare name as the last expression so the fitted estimator is still displayed
rf_model


0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,15
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [22]:
# Random-forest test-set labels and the probability column for class index 1
y_pred_rf, y_prob_rf = (
    rf_model.predict(X_test),
    rf_model.predict_proba(X_test)[:, 1],
)

In [23]:
# Random-forest metrics: label-based scores first, then ROC AUC from probabilities
accuracy_rf, precision_rf, recall_rf, f1_rf, mcc_rf = (
    scorer(y_test, y_pred_rf)
    for scorer in (
        accuracy_score,
        precision_score,
        recall_score,
        f1_score,
        matthews_corrcoef,
    )
)
auc_rf = roc_auc_score(y_test, y_prob_rf)


XGBoost

In [24]:
# Gradient-boosted trees; hyperparameters grouped by role
xgb_model = XGBClassifier(
    # ensemble size, tree depth and step size
    n_estimators=200, max_depth=6, learning_rate=0.1,
    # row / column subsampling fractions
    subsample=0.8, colsample_bytree=0.8,
    # evaluation metric and seed
    eval_metric='logloss', random_state=42,
)

# Fit call kept as the last expression so the estimator is displayed
xgb_model.fit(X_train, y_train)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [25]:
# XGBoost test-set labels and the probability column for class index 1
y_pred_xgb, y_prob_xgb = (
    xgb_model.predict(X_test),
    xgb_model.predict_proba(X_test)[:, 1],
)


In [26]:
# XGBoost metrics: label-based scores first, then ROC AUC from probabilities
accuracy_xgb, precision_xgb, recall_xgb, f1_xgb, mcc_xgb = (
    scorer(y_test, y_pred_xgb)
    for scorer in (
        accuracy_score,
        precision_score,
        recall_score,
        f1_score,
        matthews_corrcoef,
    )
)
auc_xgb = roc_auc_score(y_test, y_prob_xgb)


In [27]:
# Comparison table, one row per model:
# (name, accuracy, AUC, precision, recall, F1, MCC)
results = pd.DataFrame(
    [
        ("Logistic Regression", accuracy,     auc,     precision,     recall,     f1,     mcc),
        ("Decision Tree",       accuracy_dt,  auc_dt,  precision_dt,  recall_dt,  f1_dt,  mcc_dt),
        ("K-Nearest Neighbors", accuracy_knn, auc_knn, precision_knn, recall_knn, f1_knn, mcc_knn),
        ("Naive Bayes",         accuracy_nb,  auc_nb,  precision_nb,  recall_nb,  f1_nb,  mcc_nb),
        ("Random Forest",       accuracy_rf,  auc_rf,  precision_rf,  recall_rf,  f1_rf,  mcc_rf),
        ("XGBoost",             accuracy_xgb, auc_xgb, precision_xgb, recall_xgb, f1_xgb, mcc_xgb),
    ],
    columns=["ML Model", "Accuracy", "AUC", "Precision", "Recall", "F1 Score", "MCC"],
).round(4)  # four decimals for neat display

results


Unnamed: 0,ML Model,Accuracy,AUC,Precision,Recall,F1 Score,MCC
0,Logistic Regression,0.9138,0.9385,0.6667,0.4257,0.5196,0.4892
1,Decision Tree,0.9378,0.9658,0.7671,0.6208,0.6863,0.6567
2,K-Nearest Neighbors,0.9235,0.9568,0.7361,0.4701,0.5737,0.5502
3,Naive Bayes,0.7429,0.8365,0.2711,0.7982,0.4047,0.357
4,Random Forest,0.9709,0.989,0.9853,0.745,0.8485,0.8427
5,XGBoost,0.9517,0.9763,0.8298,0.7029,0.7611,0.7376
