# MODEL BUILDING

In [1]:
#Importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

In [2]:
# Load the prepared churn dataset and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="bank_churn.csv")
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,CreditScore,Age,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male,Tenure_Group_0 - 4,Tenure_Group_5 - 9,Tenure_Group_10 - 14
0,0,619,42,0.0,1,1,1,101348.88,1,1,0,0,1,0,1,0,0
1,1,608,41,83807.86,1,0,1,112542.58,0,0,0,1,1,0,1,0,0
2,2,502,42,159660.8,3,1,0,113931.57,1,1,0,0,1,0,0,1,0
3,3,699,39,0.0,2,0,0,93826.63,0,1,0,0,1,0,1,0,0
4,4,850,43,125510.82,1,1,1,79084.1,0,0,0,1,1,0,1,0,0


In [3]:
# Both "Unnamed" columns simply mirror the row number in the preview above, i.e.
# they look like saved-index artefacts rather than customer attributes. Drop both
# so the row position is not fed to the models as a feature; this leaves 15
# feature columns once 'Exited' is removed.
# errors='ignore' keeps the cell re-runnable (a second run would otherwise raise
# KeyError because the columns are already gone).
df = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')

In [4]:
# Separate the predictors from the churn label.
x = df.drop(columns=['Exited'])  # independent variables: every column except the target
y = df.loc[:, 'Exited']          # dependent variable

In [5]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=10,
)

In [6]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply that
# same fitted transformation to both splits.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

# Decision Tree Classifier

In [7]:
# Gini-based decision tree, capped at depth 6 with at least 8 samples per leaf.
model_dt = DecisionTreeClassifier(
    criterion="gini",
    max_depth=6,
    min_samples_leaf=8,
    random_state=100,
)

In [8]:
# Train the baseline tree on the scaled training split.
model_dt.fit(X=x_train, y=y_train)

In [9]:
# Predicted class labels for the held-out test rows.
y_pred = model_dt.predict(X=x_test)

In [10]:
# Mean accuracy of the baseline tree on the test split.
model_dt.score(X=x_test, y=y_test)

0.847

In [11]:
# Per-class precision / recall / F1 for the baseline tree.
print(classification_report(y_true=y_test, y_pred=y_pred, labels=[0, 1]))

              precision    recall  f1-score   support

           0       0.87      0.96      0.91      1578
           1       0.72      0.44      0.55       422

    accuracy                           0.85      2000
   macro avg       0.79      0.70      0.73      2000
weighted avg       0.84      0.85      0.83      2000



# MODEL OPTIMISATION

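Applying SMOTEENN (SMOTE over-sampling followed by Edited Nearest Neighbours cleaning) to rebalance the classes in the training data and improve performance on the minority (churned) class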

In [12]:
# SMOTEENN generates synthetic samples at random, so seed it: without a fixed
# random_state the resampled set (and every score derived from it) changes on
# each run. Only the training split is resampled; x_test is left untouched.
sm = SMOTEENN(random_state=100)
X_resampled, y_resampled = sm.fit_resample(x_train, y_train)

In [13]:
# Seed the split so scores on xr_test are repeatable after a kernel restart
# (an unseeded split hands out different rows every time the cell is run).
# NOTE: xr_test is carved out of the resampled data, so scores measured on it
# are not directly comparable with scores measured on the original x_test/y_test.
xr_train, xr_test, yr_train, yr_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=100,
)

In [14]:
# Same tree configuration as the baseline, to be trained on the resampled data.
model_dt_smote = DecisionTreeClassifier(
    criterion="gini",
    max_depth=6,
    min_samples_leaf=8,
    random_state=100,
)

In [15]:
# Train on the resampled training portion, then report accuracy and the
# per-class metrics on the resampled hold-out portion.
model_dt_smote.fit(X=xr_train, y=yr_train)
yr_predict = model_dt_smote.predict(X=xr_test)
model_score_r = model_dt_smote.score(X=xr_test, y=yr_test)
print(model_score_r)
print(metrics.classification_report(y_true=yr_test, y_pred=yr_predict))

0.8939554612937434
              precision    recall  f1-score   support

           0       0.89      0.87      0.88       830
           1       0.90      0.91      0.91      1056

    accuracy                           0.89      1886
   macro avg       0.89      0.89      0.89      1886
weighted avg       0.89      0.89      0.89      1886



Performing grid search with 5-fold cross-validation for hyperparameter tuning

In [16]:
from sklearn.model_selection import GridSearchCV

In [17]:
# Candidate values explored by the grid search (3 x 3 x 3 = 27 combinations).
param_grid = dict(
    max_depth=[6, 8, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [18]:
# Exhaustive search over param_grid, scored by accuracy with 5-fold CV.
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=100),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)


In [19]:
# Run the search on the resampled training portion.
grid_search.fit(X=xr_train, y=yr_train)


In [20]:
# Parameter combination selected by the grid search (scored on CV accuracy).
best_params = grid_search.best_params_


In [21]:
# Rebuild a tree with the winning hyperparameters and train it on the
# resampled training portion.
best_model = DecisionTreeClassifier(
    **best_params,
    random_state=100,
)
best_model.fit(X=xr_train, y=yr_train)

In [22]:
# Predicted labels of the tuned tree on the resampled hold-out portion.
yr_pred = best_model.predict(X=xr_test)

In [23]:
# Mean accuracy of the tuned tree on the resampled hold-out portion.
best_model.score(X=xr_test, y=yr_test)

0.9146341463414634

In [24]:
# Per-class precision / recall / F1 for the tuned tree.
print(classification_report(y_true=yr_test, y_pred=yr_pred, labels=[0, 1]))

              precision    recall  f1-score   support

           0       0.90      0.91      0.90       830
           1       0.93      0.92      0.92      1056

    accuracy                           0.91      1886
   macro avg       0.91      0.91      0.91      1886
weighted avg       0.91      0.91      0.91      1886



# Random Forest Classifier

In [25]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
# Forest of 100 Gini trees using the same depth / leaf limits as the single tree.
model_rf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=6,
    min_samples_leaf=8,
    random_state=100,
)

In [27]:
# Train the forest on the scaled (non-resampled) training split.
model_rf.fit(X=x_train, y=y_train)

In [28]:
# Predicted class labels of the forest for the test rows.
y_pred = model_rf.predict(X=x_test)

In [29]:
# Mean accuracy of the forest on the test split.
model_rf.score(X=x_test, y=y_test)

0.838

In [30]:
# Per-class precision / recall / F1 for the random forest.
print(classification_report(y_true=y_test, y_pred=y_pred, labels=[0, 1]))

              precision    recall  f1-score   support

           0       0.84      0.99      0.91      1578
           1       0.84      0.29      0.43       422

    accuracy                           0.84      2000
   macro avg       0.84      0.64      0.67      2000
weighted avg       0.84      0.84      0.80      2000



# Artificial Neural Network

In [31]:
import tensorflow as tf
from tensorflow import keras

In [35]:
# Define a sequential feed-forward network for binary churn prediction.
# The input width is read from the training matrix instead of being hard-coded,
# so the network keeps matching the data if feature columns are added or removed
# upstream (a fixed width fails at fit time as soon as the column count differs).
model = keras.Sequential([
    # first hidden layer; also declares the input shape
    keras.layers.Dense(19, input_shape=(x_train.shape[1],), activation='relu'),
    keras.layers.Dense(15, activation='relu'),
    keras.layers.Dense(10, activation='relu'),
    # output layer: a single sigmoid unit for the binary target
    keras.layers.Dense(1, activation='sigmoid'),
])

In [36]:
# Adam optimiser with binary cross-entropy loss; track accuracy while training.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [37]:
# Train the network for 10 passes over the scaled training split.
model.fit(x=x_train, y=y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2228d91df50>

In [38]:
# Raw outputs of the sigmoid unit for the test rows (thresholded in the next cell).
y_pred = model.predict(x_test)



In [39]:
# Turn each network output into a hard 0/1 label using a 0.5 cut-off.
ypred_lis = [1 if prob > 0.5 else 0 for prob in y_pred]

In [40]:
# Per-class precision / recall / F1 for the neural network.
print(classification_report(y_true=y_test, y_pred=ypred_lis, zero_division=1))

              precision    recall  f1-score   support

           0       0.87      0.95      0.91      1578
           1       0.72      0.46      0.56       422

    accuracy                           0.85      2000
   macro avg       0.79      0.71      0.73      2000
weighted avg       0.84      0.85      0.84      2000



# XGBoost

In [41]:
import xgboost as xgb
from xgboost import XGBClassifier

In [42]:
# Gradient-boosted trees with a logistic objective, using all available cores.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.08,
    max_depth=5,
    n_jobs=-1,
)
xgb_model = xgb_model.fit(x_train, y_train)

In [43]:
# Predicted class labels of the boosted model for the test rows.
y_pred = xgb_model.predict(X=x_test)

In [44]:
# Per-class precision / recall / F1 for XGBoost.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.87      0.96      0.91      1578
           1       0.77      0.45      0.57       422

    accuracy                           0.86      2000
   macro avg       0.82      0.71      0.74      2000
weighted avg       0.85      0.86      0.84      2000



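Since the decision tree classifier after model optimisation has the highest accuracy compared to the other models, this model is used for pickling. Note that its accuracy was measured on a held-out portion of the SMOTEENN-resampled data, whereas the other models were scored on the original test set, so the figures are not strictly like-for-like.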

In [42]:
#Pickling the model
import pickle

In [43]:
# Path of the file the selected model is serialised to.
filename = "model.sav"

In [44]:
# Serialise the tuned decision tree. The context manager guarantees the file
# handle is flushed and closed even if pickling raises (the bare open() call
# left closing to the garbage collector).
# NOTE(review): only the classifier is saved; the StandardScaler `sc` that
# produced its training features is not — confirm whether consumers of the
# pickle need the scaler as well.
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [45]:
# Read the model back to verify the round trip; `with` closes the handle
# deterministically instead of leaking it.
# Security: pickle.load can execute arbitrary code, so only load files that
# were produced by a trusted source (here: the file written by this notebook).
with open(filename, 'rb') as model_file:
    load_model = pickle.load(model_file)

In [52]:
# Accuracy of the reloaded model on the resampled hold-out portion. load_model
# is the unpickled best_model, so this should equal best_model.score(xr_test,
# yr_test); a different value means xr_test / yr_test changed in between
# (e.g. cells were re-run out of order).
model_score_r1 = load_model.score(X=xr_test, y=yr_test)

In [53]:
# Display the reloaded model's accuracy.
model_score_r1

0.5541275333662877