# Bank-Marketing-Campaign-Predictive-Model-Project


### Here we are importing the necessary librabries

In [2]:
import os
import numpy as np
import pandas as pd
import shap
from pandas import read_csv, get_dummies, Series
from datetime import datetime
from matplotlib.pyplot import *
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
from imblearn.over_sampling import SMOTE
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, ExtraTreesClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve, roc_auc_score, confusion_matrix, plot_confusion_matrix, classification_report
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# from IPython.display import set_matplotlib_formats
# set_matplotlib_formats('png', 'pdf')

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

#### Read in the dataset 

In [77]:
# # Path to the CSV file in the sub-directory
# file_path = "data/bank-additional/bank-additional-full.csv"

# # Read the CSV file into a DataFrame
# df = pd.read_csv(file_path, sep=";")

# #Check the top five rows in the dataset
# df.head()

In [55]:
# Read the CSV file into a DataFrame
df = pd.read_csv("data/df_cleaned.csv")
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,subscription
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,59,admin.,married,professional.course,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


### Checking the dataset shape in rows and columns

In [56]:
df.shape

(30478, 21)

# Data Preparation 

In [57]:
df.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
subscription       object
dtype: object

## 1. Data Cleaning

### Missing values, unknown entries, nulls and any NaN have been removed in the EDA file

## 2. Data Encoding

#### Create a list of categorical values that need encoding

In [58]:
''' Creating a list of the categorical features '''
obj_cols = []
for i in df:
     if df[i].dtypes == 'object':
        obj_cols.append(i)

#### Let's find out the number of classes in each categorical feature

In [59]:
for k in obj_cols:
    print(k, ":", df[k].nunique())

job : 11
marital : 3
education : 7
default : 2
housing : 2
loan : 2
contact : 2
month : 10
day_of_week : 5
poutcome : 3
subscription : 2


#### Let's print out the number of classes in each categorical feature

In [60]:
for k in obj_cols:
    print(k, ":", df[k].unique(), '\n')
   

job : ['housemaid' 'services' 'admin.' 'technician' 'blue-collar' 'unemployed'
 'retired' 'entrepreneur' 'management' 'student' 'self-employed'] 

marital : ['married' 'single' 'divorced'] 

education : ['basic.4y' 'high.school' 'basic.6y' 'professional.course' 'basic.9y'
 'university.degree' 'illiterate'] 

default : ['no' 'yes'] 

housing : ['no' 'yes'] 

loan : ['no' 'yes'] 

contact : ['telephone' 'cellular'] 

month : ['may' 'jun' 'jul' 'aug' 'oct' 'nov' 'dec' 'mar' 'apr' 'sep'] 

day_of_week : ['mon' 'tue' 'wed' 'thu' 'fri'] 

poutcome : ['nonexistent' 'failure' 'success'] 

subscription : ['no' 'yes'] 



##### Label encoding

In [61]:
le = LabelEncoder()
df.marital = le.fit_transform(df.marital)
df.housing = le.fit_transform(df.housing)
df.subscription = le.fit_transform(df.subscription)
df.loan = le.fit_transform(df.loan)
df.default = le.fit_transform(df.default)
df.contact = le.fit_transform(df.contact)
df.poutcome = le.fit_transform(df.poutcome)
df.day_of_week = le.fit_transform(df.day_of_week)
df.month = le.fit_transform(df.month)

##### Encode Job and education with One-hot encoding

In [62]:
categorical_features = ['job','education']

dataset = pd.get_dummies(df, columns=categorical_features)
dataset.head()

Unnamed: 0,age,marital,default,housing,loan,contact,month,day_of_week,duration,campaign,...,job_student,job_technician,job_unemployed,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree
0,56,1,0,0,0,1,6,1,261,1,...,0,0,0,1,0,0,0,0,0,0
1,37,1,0,1,0,1,6,1,226,1,...,0,0,0,0,0,0,1,0,0,0
2,40,1,0,0,0,1,6,1,151,1,...,0,0,0,0,1,0,0,0,0,0
3,56,1,0,0,1,1,6,1,307,1,...,0,0,0,0,0,0,1,0,0,0
4,59,1,0,0,0,1,6,1,139,1,...,0,0,0,0,0,0,0,0,1,0


In [63]:
dataset.shape

(30478, 37)

## 3. Data Splitting

In [64]:
X = dataset.drop('subscription', axis=1) # Obtain Feature Set
y = dataset['subscription'] # Extract Labels

In [65]:
print(X.shape)
print(y.shape)

(30478, 36)
(30478,)


In [66]:
# obtain a list of dataset columns
dataset.columns

Index(['age', 'marital', 'default', 'housing', 'loan', 'contact', 'month',
       'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
       'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m',
       'nr.employed', 'subscription', 'job_admin.', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'education_basic.4y', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree'],
      dtype='object')

In [67]:
dataset_cols = ['age', 'marital', 'default', 'housing', 'loan', 'contact', 'month',
       'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
       'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m',
       'nr.employed', 'job_admin.', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'education_basic.4y', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree']

## 4. Data Standardization

##### The features are standardized using the Scikit-Learn StandardScaler method in Python. 
Data scaling or standardization is essential to ensure that each contributing feature has a fair and balanced impact, preventing any feature from dominating the others. This process effectively eliminates potential biases. The StandardScaler method scales the feature set by subtracting the mean and normalizing it to have unit variance.


In [68]:
X_scaled = StandardScaler().fit_transform(X.astype(float))
X_scaled = pd.DataFrame(X_scaled,columns=dataset_cols)
X_scaled.head()

Unnamed: 0,age,marital,default,housing,loan,contact,month,day_of_week,duration,campaign,...,job_student,job_technician,job_unemployed,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree
0,1.642408,-0.309721,-0.009922,-1.087624,-0.430643,1.426447,0.734364,-0.725499,0.005681,-0.559363,...,-0.14291,-0.467634,-0.157528,3.435969,-0.218435,-0.403972,-0.581265,-0.019001,-0.406277,-0.720128
1,-0.196436,-0.309721,-0.009922,0.919436,-0.430643,1.426447,0.734364,-0.725499,-0.128039,-0.559363,...,-0.14291,-0.467634,-0.157528,-0.291039,-0.218435,-0.403972,1.720385,-0.019001,-0.406277,-0.720128
2,0.093908,-0.309721,-0.009922,-1.087624,-0.430643,1.426447,0.734364,-0.725499,-0.41458,-0.559363,...,-0.14291,-0.467634,-0.157528,-0.291039,4.578014,-0.403972,-0.581265,-0.019001,-0.406277,-0.720128
3,1.642408,-0.309721,-0.009922,-1.087624,2.322111,1.426447,0.734364,-0.725499,0.181426,-0.559363,...,-0.14291,-0.467634,-0.157528,-0.291039,-0.218435,-0.403972,1.720385,-0.019001,-0.406277,-0.720128
4,1.932752,-0.309721,-0.009922,-1.087624,-0.430643,1.426447,0.734364,-0.725499,-0.460427,-0.559363,...,-0.14291,-0.467634,-0.157528,-0.291039,-0.218435,-0.403972,-0.581265,-0.019001,2.461374,-0.720128


### Here, the dataset is split to training and testing data

#### The dataset is split into two set: the training data and the testing data using a ratio of $70$\% to $30$\%. 

The $70$\% in the training data will be used for model training. The remaining $30$\% has been set aside for testing the data after training. 

This is done to evaluate the model's performance when presented with unseen data 

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
print(X_train.shape)
print(X_test.shape)

(21334, 36)
(9144, 36)


In [70]:
print(y_train.shape)
print(y_test.shape)

(21334,)
(9144,)


## 5. Data Balancing

### Synthetic Minority Oversampling Technique (SMOTE) is used to balance the training data.

In [71]:
# Before applying SMOTE, let's check the class distribution in the training set
print("Before applying SMOTE, number in each class of the target: \n", pd.Series(y_train).value_counts())

Before applying SMOTE, number in each class of the target: 
 0    18661
1     2673
Name: subscription, dtype: int64


In [72]:
# Applying SMOTE to oversample the minority class
smote = SMOTE(sampling_strategy='auto', random_state=10)

In [73]:
X_train, y_train = smote.fit_resample(X_train, y_train)

In [74]:
print("After applying SMOTE, number in each class of the target: \n", pd.Series(y_train).value_counts())

After applying SMOTE, number in each class of the target: 
 0    18661
1    18661
Name: subscription, dtype: int64


## Create a directory to save cleaned, encoded and balanced dataset

In [75]:
# if not os.path.exists("cleaned_data_final"):
#     os.makedirs("cleaned_data_final")

In [None]:
# Saving data
# X_train.to_csv("cleaned_data_final/train_features.csv", index=False)
# X_test.to_csv("cleaned_data_final/test_features.csv", index=False)

# y_train.to_csv("cleaned_data_final/train_labels.csv", index=False)
# y_test.to_csv("cleaned_data_final/test_labels.csv", index=False)

# Read in saved data

In [3]:
X_train = pd.read_csv("cleaned_data/train_features.csv")
X_test = pd.read_csv("cleaned_data/test_features.csv")

y_train = pd.read_csv("cleaned_data/train_labels.csv")
y_test = pd.read_csv("cleaned_data/test_labels.csv")

In [4]:
print(X_train.shape)
print(X_test.shape)

(37322, 36)
(9144, 36)


In [5]:
print(y_train.shape)
print(y_test.shape)

(37322, 1)
(9144, 1)


## Change the labels back to the default "Series" type

In [6]:
y_train = y_train.squeeze()
y_test = y_test.squeeze()

In [7]:
print(y_train.shape)
print(y_test.shape)

(37322,)
(9144,)


# Machine Learning (ML) Models

### Three classes of ML models were used in this project. In general, eight (8) ML models were employed for the study.

- **Single-Base Classifiers**
    - Decision Tree (DT)
    - Support Vector Classifier (SVC)
    - Naïve Bayes (NB)
    - Logistic Regression (LR)


- __Traditional Ensemble Model__
    - Random Forest Classifier (RFC)
    
    
- **Boosting Models**
    - Gradient Boosting Machines (GBM)
    - Light Gradient Boosting Machines (LightGBM)
    - eXtreme Gradient Boosting (XGBoost)

'''   
Ensemble models are a machine learning technique that combines the predictions from multiple individual models to improve overall predictive performance. The idea behind ensemble methods is to leverage the wisdom of the crowd – by combining the opinions of multiple models, the ensemble can often make more accurate predictions than any individual model.

There are several popular ensemble methods, including:

**Bagging (Bootstrap Aggregating)**: Bagging involves training multiple instances of the same base model on different subsets of the training data. These subsets are created through resampling with replacement (bootstrap samples). The predictions from each model are then averaged (for regression) or voted upon (for classification) to make the final prediction. Random Forest is a well-known algorithm that uses bagging.

**Boosting**: Boosting aims to correct the errors of previous models by giving more weight to misclassified data points. Popular boosting algorithms include AdaBoost, Gradient Boosting, and XGBoost.

'''

# Model Evaluation Strategies

### Six Model Evaluation Strategies were adopted for this work
1. Models were evaluated using the default settings
2. Models were evaluated using the bagging technique
3. Models were evaluated using the reduced dataset
4. Models were evaluated by applying hyperparameter tuning
5. Models were evaluated using the Cross-Validation via the GridSearchCV (5-fold)
6. Models were evaluated using the top ten features selected by RFC and GBM

## 1. Evaluating the models using the default settings

In [39]:
# Set the random seed for reproduability
default = 24

In [40]:
# # Create a directory to store the figures
if not os.path.exists("default_settings"):
    os.makedirs("default_settings")

# Define the models to be used in the pipeline
models = [
    ('Random Forest', RandomForestClassifier(random_state=default)),
    ('LightGBM', LGBMClassifier(random_state=default)),
    ('SVC', SVC(random_state=default)),
    ('Naive Bayes', GaussianNB()),
    ('Decision Tree', DecisionTreeClassifier(random_state=default)),
    ('XGBoost', XGBClassifier(random_state=default)),
    ('Logistic Regression', LogisticRegression(random_state=default)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=default))
]


# Create a dictionary to store the results
results = {}

# Train and test each model, and compute metrics
for model_name, model in models:
    print(f"Training and testing {model_name}...")
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred, output_dict=True)
    
    results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Confusion Matrix': conf_matrix,
        'Classification Report': classification_rep
    }

       
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.set(style='white')
    sns.heatmap(np.eye(2), annot=conf_matrix, fmt='g', annot_kws={'size': 20},
            cmap=sns.color_palette("light:b", as_cmap=True), cbar=False, ax=ax)
    plt.title(f'Confusion Matrix - {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    additional_texts = ['(TN)', '(FP)', '(FN)', '(TP)']
    for text_elt, additional_text in zip(ax.texts, additional_texts):
        ax.text(*text_elt.get_position(), '\n' + additional_text, color=text_elt.get_color(),
            ha='center', va='top', size=15)
    plt.tight_layout()
    plt.savefig(f'default_settings/confusion_matrix_{model_name}.pdf')
    plt.close()

# Convert results to a DataFrame and print

results_df = pd.DataFrame(results).T[['Accuracy', 'Precision', 'Recall', 'F1-Score']]
print(results_df)


Training and testing Random Forest...
Training and testing LightGBM...
Training and testing SVC...
Training and testing Naive Bayes...
Training and testing Decision Tree...
Training and testing XGBoost...
Training and testing Logistic Regression...
Training and testing Gradient Boosting...
                     Accuracy Precision    Recall  F1-Score
Random Forest        0.902668  0.620212  0.642194  0.631012
LightGBM             0.908683  0.646321  0.652321  0.649307
SVC                   0.85783  0.471689  0.808439  0.595771
Naive Bayes          0.393045  0.170665   0.95443  0.289555
Decision Tree        0.873469  0.510355  0.582278   0.54395
XGBoost              0.901794  0.634489  0.571308  0.601243
Logistic Regression  0.857393  0.471545  0.832068  0.601954
Gradient Boosting    0.895669  0.575244  0.745148  0.649265


# 2. Bagging

#### Bagging (Bootstrap Aggregating) is an ensemble technique that involves training multiple instances of the same model on different bootstrap samples of the training data and then combining their predictions to make a final decision. 

First, we create a BaggingClassifier for each base model defined in the models list. We then fit the BaggingClassifier on the training data and make predictions on the test data. We compute various classification metrics for the bagging ensemble and store the results in the results dictionary. The confusion matrix plots for each bagging ensemble are also saved in the "bagged_mod" folder.

The BaggingClassifier automatically performs bootstrap aggregation, creating multiple instances of the base model, each trained on a different bootstrap sample of the training data. It aggregates the predictions of the individual models to make a final decision.

In [42]:
# Set the random seed for reproduability
bagged = 42

In [45]:
# Create a directory to store the figures
if not os.path.exists("bagged_model"):
    os.makedirs("bagged_model")

# Define the models to be used in the pipeline
models = [
    ('Random Forest', RandomForestClassifier(random_state=bagged)),
    ('LightGBM', LGBMClassifier(random_state=bagged)),
    ('SVC', SVC(random_state=bagged)),
    ('Naive Bayes', GaussianNB()),
    ('Decision Tree', DecisionTreeClassifier(random_state=bagged)),
    ('XGBoost', XGBClassifier(random_state=bagged)),
    ('Logistic Regression', LogisticRegression(random_state=bagged)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=bagged))
]

# Create a dictionary to store the results
results = {}

# Train and test each model with Bagging, and compute metrics
for model_name, model in models:
    print(f"Training and testing {model_name} with Bagging...")
    
    # Applying Bagging to the model
    bagging_model = BaggingClassifier(base_estimator=model, n_estimators=10, random_state=42)
    
    bagging_model.fit(X_train, y_train)
    y_pred = bagging_model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred, output_dict=True)
    
    results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Confusion Matrix': conf_matrix,
        'Classification Report': classification_rep
    }

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.set(style='white')
    sns.heatmap(np.eye(2), annot=conf_matrix, fmt='g', annot_kws={'size': 20},
            cmap=sns.color_palette("light:b", as_cmap=True), cbar=False, ax=ax)
    plt.title(f'Confusion Matrix - {model_name} with Bagging')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    additional_texts = ['(TN)', '(FP)', '(FN)', '(TP)']
    for text_elt, additional_text in zip(ax.texts, additional_texts):
        ax.text(*text_elt.get_position(), '\n' + additional_text, color=text_elt.get_color(),
            ha='center', va='top', size=15)
    plt.tight_layout()
    plt.savefig(f'bagged_model/confusion_matrix_{model_name}_with_bagging.pdf')
    plt.close()

# Convert results to a DataFrame and print
bag_results_df = pd.DataFrame(results).T[['Accuracy', 'Precision', 'Recall', 'F1-Score']]
print(bag_results_df)


Training and testing Random Forest with Bagging...
Training and testing LightGBM with Bagging...
Training and testing SVC with Bagging...
Training and testing Naive Bayes with Bagging...
Training and testing Decision Tree with Bagging...
Training and testing XGBoost with Bagging...
Training and testing Logistic Regression with Bagging...
Training and testing Gradient Boosting with Bagging...
                     Accuracy Precision    Recall  F1-Score
Random Forest        0.901465  0.605341  0.688608  0.644295
LightGBM             0.909121  0.644608  0.665823  0.655044
SVC                   0.85958  0.475063  0.795781  0.594953
Naive Bayes          0.401903  0.172377  0.951055  0.291855
Decision Tree        0.894685  0.590244  0.612658  0.601242
XGBoost              0.907043  0.652689  0.604219   0.62752
Logistic Regression  0.858049  0.472928  0.832911  0.603301
Gradient Boosting    0.895997   0.57529   0.75443  0.652793


## 3. Reduced dataset by half (0.5)

In [15]:
# # Assuming X and y are your feature and target data in a DataFrame
# subset_size = int(len(X_train) * 0.5)  # Choose the desired subset size (e.g., 50% of the data)

# # Sample a random subset of data
# subset_indices = X_train.sample(n=subset_size, random_state=100).index

# # Select the subset of data
# X_subset = X_train.loc[subset_indices]
# y_subset = y_train.loc[subset_indices]

In [None]:
# Random selection of a susbet of data

In [20]:
# Assuming X and y are your feature and target data in a DataFrame
subset_size = int(len(X_train) * 0.5)  # Choose the desired subset size (e.g., 50% of the data)

In [21]:
# Sample a random subset of data
subset_indices = X_train.sample(n=subset_size, random_state=100).index

In [22]:
# Select the subset of data -- Training data
X_subset = X_train.loc[subset_indices]
y_subset = y_train.loc[subset_indices]

In [23]:
print(X_subset.shape)

(18661, 36)


In [24]:
print(y_subset.shape)

(18661,)


## Subset of Testing Data

In [50]:
X_test.shape

(9144, 36)

In [51]:
test_sub = int(len(X_test) * 0.5)
# Sample a random subset of data
sub_ind = X_test.sample(n=test_sub, random_state=200).index

In [52]:
# Select the subset of data -- Training data
X_subset_test = X_test.loc[sub_ind]
y_subset_test = y_test.loc[sub_ind]

In [54]:
print(X_subset_test.shape)
print(y_subset_test.shape)

(4572, 36)
(4572,)


## 3.1 Default with reduced dataset (0.5)

In [56]:
def_half = 48

In [58]:
# # Create a directory to store the figures
if not os.path.exists("default_half"):
    os.makedirs("default_half")

# Define the models to be used in the pipeline
models = [
    ('Random Forest', RandomForestClassifier(random_state=def_half)),
    ('LightGBM', LGBMClassifier(random_state=def_half)),
    ('SVC', SVC(random_state=def_half)),
    ('Naive Bayes', GaussianNB()),
    ('Decision Tree', DecisionTreeClassifier(random_state=def_half)),
    ('XGBoost', XGBClassifier(random_state=def_half)),
    ('Logistic Regression', LogisticRegression(random_state=def_half)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=def_half))
]


# Create a dictionary to store the results
results = {}

# Train and test each model, and compute metrics
for model_name, model in models:
    print(f"Training and testing {model_name}...")
    
    model.fit(X_subset, y_subset)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred, output_dict=True)
    
    results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Confusion Matrix': conf_matrix,
        'Classification Report': classification_rep
    }

       
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.set(style='white')
    sns.heatmap(np.eye(2), annot=conf_matrix, fmt='g', annot_kws={'size': 20},
            cmap=sns.color_palette("light:b", as_cmap=True), cbar=False, ax=ax)
    plt.title(f'Confusion Matrix - {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    additional_texts = ['(TN)', '(FP)', '(FN)', '(TP)']
    for text_elt, additional_text in zip(ax.texts, additional_texts):
        ax.text(*text_elt.get_position(), '\n' + additional_text, color=text_elt.get_color(),
            ha='center', va='top', size=15)
    plt.tight_layout()
    plt.savefig(f'default_half/confusion_matrix_{model_name}.pdf')
    plt.close()

# Convert results to a DataFrame and print

results_def_half = pd.DataFrame(results).T[['Accuracy', 'Precision', 'Recall', 'F1-Score']]
print(results_def_half)


Training and testing Random Forest...
Training and testing LightGBM...
Training and testing SVC...
Training and testing Naive Bayes...
Training and testing Decision Tree...
Training and testing XGBoost...
Training and testing Logistic Regression...
Training and testing Gradient Boosting...
                     Accuracy Precision    Recall  F1-Score
Random Forest        0.901137  0.599013    0.7173  0.652842
LightGBM             0.905074  0.629599  0.649789  0.639535
SVC                  0.854003  0.464455  0.827004  0.594841
Naive Bayes          0.814414  0.376089  0.655696  0.478007
Decision Tree        0.867345  0.490331  0.599156  0.539309
XGBoost              0.900262  0.620053  0.594937  0.607235
Logistic Regression  0.857065  0.470897  0.832911  0.601646
Gradient Boosting    0.895232  0.573845  0.744304  0.648053


# Hyperparameter Tuning 

## 4.1 Hyperparameter Tuning -- LightGBM, XGB and SVC

In [13]:
tuned = 24

In [18]:
# Create a directory to store the figures
if not os.path.exists("tuned_mod"):
    os.makedirs("tuned_mod")

# Define the models to be used in the pipeline
models = [
    ('LightGBM', LGBMClassifier(random_state=tuned)),
    ('XGBoost', XGBClassifier(random_state=tuned)),
    ('SVC', SVC(random_state=tuned))   
]

# Create a dictionary to store the results
results = {}

# Hyperparameter tuning for LightGBM
lgb_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 10]
}


# Hyperparameter tuning for XGBoost
xgb_params = {
    'n_estimators': [50, 100, 200, 300, 400],
    'learning_rate': [0.01, 0.1, 0.5, 0.7, 0.99],
    'max_depth': [3, 5, 10, 12, 15]
}

# Hyperparameter tuning for SVC
svc_params = {
    'kernel': ['linear', 'rbf', 'poly'],
    'C': [0.1, 1, 10]
}

for model_name, model in models:
    print(f"Training and testing {model_name}...")
    
    if  model_name == 'LightGBM':
        grid_search = GridSearchCV(model, lgb_params, cv=5, n_jobs=-1)
    elif model_name == 'XGBoost':
        grid_search = GridSearchCV(model, xgb_params, cv=5, n_jobs=-1)
    elif model_name == 'SVC':
        grid_search = GridSearchCV(model, svc_params, cv=5, n_jobs=-1)
    else:
        grid_search = model
    
    grid_search.fit(X_subset, y_subset)
    model = grid_search.best_estimator_
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred, output_dict=True)
    
    results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Confusion Matrix': conf_matrix,
        'Classification Report': classification_rep
    }
    
    print(f"The best parameters for the {model_name}...", "are ", grid_search.best_params_, "\n")

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.set(style='white')
    sns.heatmap(np.eye(2), annot=conf_matrix, fmt='g', annot_kws={'size': 20},
            cmap=sns.color_palette("light:b", as_cmap=True), cbar=False, ax=ax)
    plt.title(f'Confusion Matrix - {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    additional_texts = ['(TN)', '(FP)', '(FN)', '(TP)']
    for text_elt, additional_text in zip(ax.texts, additional_texts):
        ax.text(*text_elt.get_position(), '\n' + additional_text, color=text_elt.get_color(),
            ha='center', va='top', size=15)
    plt.tight_layout()
    plt.savefig(f'tuned_mod/confusion_matrix_{model_name}_with_tuning.pdf')
    plt.close()

# Convert results to a DataFrame and print
results_df = pd.DataFrame(results).T[['Accuracy', 'Precision', 'Recall', 'F1-Score']]
print(results_df)


Training and testing LightGBM...
The best parameters for the LightGBM... are  {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 100} 

Training and testing XGBoost...
The best parameters for the XGBoost... are  {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300} 

Training and testing SVC...
The best parameters for the SVC... are  {'C': 10, 'kernel': 'rbf'} 

          Accuracy Precision    Recall  F1-Score
LightGBM   0.90584  0.629808  0.663291  0.646116
XGBoost   0.904528  0.635182  0.618565  0.626764
SVC       0.867563  0.492672  0.737553   0.59074


## 4.2 Hyperparameter Tuning -- RFC and DT

In [19]:
# Create a directory to store the figures
if not os.path.exists("tuned_mod"):
    os.makedirs("tuned_mod")

# Define the models to be used in the pipeline
models = [
    ('Random Forest', RandomForestClassifier(random_state=tuned)),
    ('Decision Tree', DecisionTreeClassifier(random_state=tuned))
   ]

# Create a dictionary to store the results
results = {}

# Hyperparameter tuning for RandomForestClassifier
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Hyperparameter tuning for DecisionTreeClassifier
dt_params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10]
}



for model_name, model in models:
    print(f"Training and testing {model_name}...")
    
    if model_name == 'Random Forest':
        grid_search = GridSearchCV(model, rf_params, cv=5, n_jobs=-1)
    elif model_name == 'Decision Tree':
        grid_search = GridSearchCV(model, dt_params, cv=5, n_jobs=-1)
    else:
        grid_search = model
    
    grid_search.fit(X_subset, y_subset)
    model = grid_search.best_estimator_
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred, output_dict=True)
    
    results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Confusion Matrix': conf_matrix,
        'Classification Report': classification_rep
    }
    
    print(f"The best parameters for the {model_name}...", "are ", grid_search.best_params_, "\n")
    
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.set(style='white')
    sns.heatmap(np.eye(2), annot=conf_matrix, fmt='g', annot_kws={'size': 20},
            cmap=sns.color_palette("light:b", as_cmap=True), cbar=False, ax=ax)
    plt.title(f'Confusion Matrix - {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    additional_texts = ['(TN)', '(FP)', '(FN)', '(TP)']
    for text_elt, additional_text in zip(ax.texts, additional_texts):
        ax.text(*text_elt.get_position(), '\n' + additional_text, color=text_elt.get_color(),
            ha='center', va='top', size=15)
    plt.tight_layout()
    plt.savefig(f'tuned_mod/confusion_matrix_{model_name}_with_tuning.pdf')
    plt.close()

# Convert results to a DataFrame and print
results_df = pd.DataFrame(results).T[['Accuracy', 'Precision', 'Recall', 'F1-Score']]
print(results_df)


Training and testing Random Forest...
The best parameters for the Random Forest... are  {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100} 

Training and testing Decision Tree...
The best parameters for the Decision Tree... are  {'criterion': 'gini', 'max_depth': 10} 

               Accuracy Precision    Recall  F1-Score
Random Forest  0.897638  0.588865  0.696203  0.638051
Decision Tree  0.876094  0.515099  0.748523  0.610251


## 4.3 Hyperparameter Tuning -- Logistic Regression and GBM

In [14]:
# Create a directory to store the figures
if not os.path.exists("tuned_mod"):
    os.makedirs("tuned_mod")

# Define the models to be used in the pipeline
models = [
    ('Logistic Regression', LogisticRegression(random_state=tuned)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=tuned))
]

# Create a dictionary to store the results
results = {}

# Hyperparameter tuning for GradientBoostingClassifier
gb_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [None, 5, 10]
}

# Hyperparameter tuning for Logistic Regression
log_reg_params = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

for model_name, model in models:
    print(f"Training and testing {model_name}...")
    
    if model_name == 'Gradient Boosting':
        grid_search = GridSearchCV(model, gb_params, cv=5, n_jobs=-1)
    elif model_name == 'Logistic Regression':
        grid_search = GridSearchCV(model, log_reg_params, cv=5, n_jobs=-1)
    else:
        grid_search = model
    
    grid_search.fit(X_subset, y_subset)
    model = grid_search.best_estimator_
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred, output_dict=True)
    
    results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Confusion Matrix': conf_matrix,
        'Classification Report': classification_rep
    }
    
    print(f"The best parameters for the {model_name}...", "are ", grid_search.best_params_, "\n")
        
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.set(style='white')
    sns.heatmap(np.eye(2), annot=conf_matrix, fmt='g', annot_kws={'size': 20},
            cmap=sns.color_palette("light:b", as_cmap=True), cbar=False, ax=ax)
    plt.title(f'Confusion Matrix - {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    additional_texts = ['(TN)', '(FP)', '(FN)', '(TP)']
    for text_elt, additional_text in zip(ax.texts, additional_texts):
        ax.text(*text_elt.get_position(), '\n' + additional_text, color=text_elt.get_color(),
            ha='center', va='top', size=15)
    plt.tight_layout()
    plt.savefig(f'tuned_mod/confusion_matrix_{model_name}_with_tuning.pdf')
    plt.close()

# Convert results to a DataFrame and print
results_df = pd.DataFrame(results).T[['Accuracy', 'Precision', 'Recall', 'F1-Score']]
print(results_df)


Training and testing Logistic Regression...
The best parameters for the Logistic Regression... are  {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'} 

Training and testing Gradient Boosting...
The best parameters for the Gradient Boosting... are  {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200} 

                     Accuracy Precision    Recall  F1-Score
Logistic Regression  0.857065  0.470925  0.833755  0.601889
Gradient Boosting    0.898731  0.597736  0.668354  0.631076


# Cross-Validation using GridSearchCV

## 5.0 Finding the best number of trees using GridSearchCV

### XGBoost, LightGBM, GBM and RFC

In [45]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

# Create a dictionary of classifiers
classifiers = {
    'XGBoost': XGBClassifier(random_state=103),
    'GBM': GradientBoostingClassifier(random_state=103),
    'LightGBM': LGBMClassifier(random_state=103),
    'RandomForest': RandomForestClassifier(criterion='entropy', max_features='auto', random_state=103)
}

# Create a dictionary of hyperparameter grids for each classifier
param_grids = {
    'XGBoost': {'classifier__n_estimators': [200, 250, 300, 350, 400, 450, 500]},  # Set n_estimators for XGBoost
    'GBM': {'classifier__n_estimators': [200, 250, 300, 350, 400, 450, 500]},
    'LightGBM': {'classifier__n_estimators': [200, 250, 300, 350, 400, 450, 500]},
    'RandomForest': {'classifier__n_estimators': [200, 250, 300, 350, 400, 450, 500]}
}

# Create a pipeline with each classifier and perform grid search
best_params = {}
best_scores = {}

for clf_name, clf in classifiers.items():
    param_grid = param_grids[clf_name]
    pipeline = Pipeline([
        ('classifier', clf)
    ])
    
    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='recall', cv=5)
    grid_search.fit(X_subset, y_subset)
    
    best_parameters = grid_search.best_params_
    best_result = grid_search.best_score_
    
    best_params[clf_name] = best_parameters
    best_scores[clf_name] = best_result

# Print the best parameters and scores for each classifier
for clf_name in classifiers.keys():
    print(f"Best Parameters for {clf_name}: {best_params[clf_name]}")
    print(f"Best Score for {clf_name}: {best_scores[clf_name]}")


Best Parameters for XGBoost: {'classifier__n_estimators': 200}
Best Score for XGBoost: 0.9449090289561031
Best Parameters for GBM: {'classifier__n_estimators': 200}
Best Score for GBM: 0.9487756082176917
Best Parameters for LightGBM: {'classifier__n_estimators': 200}
Best Score for LightGBM: 0.9448021941211436
Best Parameters for RandomForest: {'classifier__n_estimators': 450}
Best Score for RandomForest: 0.9618770298186229


## 5.1 Fitting models with the best number of trees obtained via GridSearchCV

In [55]:
# # Create a directory to store the figures
if not os.path.exists("grid_search"):
    os.makedirs("grid_search")

# Define the models to be used in the pipeline
models = [
    ('Random Forest', RandomForestClassifier(n_estimators=450, random_state=103)),
    ('LightGBM', LGBMClassifier(n_estimators=200, random_state=103)),
    ('XGBoost', XGBClassifier(n_estimators=200, random_state=103)),
    ('Gradient Boosting', GradientBoostingClassifier(n_estimators=200, random_state=103))
]


# Create a dictionary to store the results
results = {}

# Train and test each model, and compute metrics
for model_name, model in models:
    print(f"Training and testing {model_name}...")
    
    model.fit(X_subset, y_subset)
    y_pred = model.predict(X_subset_test)
    
    accuracy = accuracy_score(y_subset_test, y_pred)
    precision = precision_score(y_subset_test, y_pred)
    recall = recall_score(y_subset_test, y_pred)
    f1 = f1_score(y_subset_test, y_pred)
    conf_matrix = confusion_matrix(y_subset_test, y_pred)
    classification_rep = classification_report(y_subset_test, y_pred, output_dict=True)
    
    results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Confusion Matrix': conf_matrix,
        'Classification Report': classification_rep
    }

       
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.set(style='white')
    sns.heatmap(np.eye(2), annot=conf_matrix, fmt='g', annot_kws={'size': 20},
            cmap=sns.color_palette("light:b", as_cmap=True), cbar=False, ax=ax)
    plt.title(f'Confusion Matrix - {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    additional_texts = ['(TN)', '(FP)', '(FN)', '(TP)']
    for text_elt, additional_text in zip(ax.texts, additional_texts):
        ax.text(*text_elt.get_position(), '\n' + additional_text, color=text_elt.get_color(),
            ha='center', va='top', size=15)
    plt.tight_layout()
    plt.savefig(f'grid_search/confusion_matrix_{model_name}_with_GridSearch_CV.pdf')
    plt.close()

# Convert results to a DataFrame and print

results_grid = pd.DataFrame(results).T[['Accuracy', 'Precision', 'Recall', 'F1-Score']]
print(results_grid)


Training and testing Random Forest...
Training and testing LightGBM...
Training and testing XGBoost...
Training and testing Gradient Boosting...
                   Accuracy Precision    Recall  F1-Score
Random Forest      0.893045   0.57784  0.686667   0.62757
LightGBM             0.9007  0.624573      0.61  0.617201
XGBoost            0.894357   0.60354  0.568333  0.585408
Gradient Boosting  0.899825  0.606607  0.673333  0.638231


## 6 Feature Selection for the top boosting model, GBM and RFC

## 6.1 RFC Model with Top 10 Features

In [61]:
# Let's obtain the most important features
imp_features = Series(rfc.feature_importances_, index=list(X_subset)).sort_values(ascending=False)
print(imp_features)

duration                         0.322755
euribor3m                        0.094318
nr.employed                      0.075100
emp.var.rate                     0.057475
campaign                         0.051207
cons.conf.idx                    0.046478
age                              0.042119
day_of_week                      0.040727
cons.price.idx                   0.038487
month                            0.038290
contact                          0.025187
marital                          0.023795
housing                          0.020140
pdays                            0.017629
poutcome                         0.016371
previous                         0.010853
loan                             0.009429
education_university.degree      0.008291
job_admin.                       0.007291
education_high.school            0.006987
job_technician                   0.005926
education_professional.course    0.005411
job_blue-collar                  0.005394
education_basic.9y               0

In [68]:
# Train a Random Forest model
rf_classifier = RandomForestClassifier(n_estimators=450, random_state=42)
rf_classifier.fit(X_subset, y_subset)

# Get feature importances
feature_importances = rf_classifier.feature_importances_

# Pair feature importances with their corresponding feature names
feature_names = X_subset.columns

# Create a DataFrame with feature names and importances
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Select the top ten features
top_n = 10
top_features = feature_importance_df.head(top_n)['Feature'].tolist()

# Create new datasets with only the top features
X_train_top = X_train[top_features]
X_valid_top = X_subset_test[top_features]

# Train a new Random Forest model using only the top features
rf_classifier_top = RandomForestClassifier(n_estimators=450, random_state=42)
rf_classifier_top.fit(X_train_top, y_train)

# Make predictions on the validation set using the new model
y_pred_top = rf_classifier_top.predict(X_valid_top)

# Calculate evaluation metrics
accuracy = accuracy_score(y_subset_test, y_pred_top)
recall = recall_score(y_subset_test, y_pred_top)
precision = precision_score(y_subset_test, y_pred_top)
f1 = f1_score(y_subset_test, y_pred_top)

# Create a DataFrame to store the metrics
metrics_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Recall', 'Precision', 'F1-Score'],
    'Value': [accuracy, recall, precision, f1]
})

# Print the metrics DataFrame
print("--------RFC Model with top ten features----------")
print("\n")
print(metrics_df)
print("\n")
print(top_features)


--------RFC Model with top ten features----------


      Metric     Value
0   Accuracy  0.888889
1     Recall  0.640000
2  Precision  0.568047
3   F1-Score  0.601881


['duration', 'euribor3m', 'nr.employed', 'emp.var.rate', 'cons.conf.idx', 'campaign', 'age', 'day_of_week', 'cons.price.idx', 'month']


## 6.2 GBM Model with Top 10 Features

In [69]:
# Assuming you have X and y as your feature matrix and target variable

# # Split your data into training and validation sets
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Gradient Boosting Machine (GBM) model
gbm_classifier = GradientBoostingClassifier(n_estimators=200, random_state=42)
gbm_classifier.fit(X_subset, y_subset)

# Get feature importances
feature_importances = gbm_classifier.feature_importances_

# Pair feature importances with their corresponding feature names
feature_names = X_subset.columns

# Create a DataFrame with feature names and importances
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Select the top ten features
top_n = 10
top_features = feature_importance_df.head(top_n)['Feature'].tolist()

# Create new datasets with only the top features
X_train_top = X_subset[top_features]
X_valid_top = X_subset_test[top_features]

# Train a new GBM model using only the top features
gbm_classifier_top = GradientBoostingClassifier(n_estimators=200, random_state=42)
gbm_classifier_top.fit(X_train_top, y_subset)

# Make predictions on the validation set using the new model
y_pred_top = gbm_classifier_top.predict(X_valid_top)

# Calculate evaluation metrics
accuracy = accuracy_score(y_subset_test, y_pred_top)
recall = recall_score(y_subset_test, y_pred_top)
precision = precision_score(y_subset_test, y_pred_top)
f1 = f1_score(y_subset_test, y_pred_top)

# Create a DataFrame to store the metrics
metrics_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Recall', 'Precision', 'F1-Score'],
    'Value': [accuracy, recall, precision, f1]
})

# Print the metrics DataFrame
print("-------- GBM Model with top ten features----------")
print("\n")
print(metrics_df)
print("\n")
print(top_features)


-------- GBM Model with top ten features----------


      Metric     Value
0   Accuracy  0.900481
1     Recall  0.708333
2  Precision  0.602837
3   F1-Score  0.651341


['duration', 'nr.employed', 'euribor3m', 'cons.conf.idx', 'campaign', 'day_of_week', 'month', 'contact', 'poutcome', 'emp.var.rate']


# THE END 