# Customer Churn Predictor

Dataset: https://www.kaggle.com/datasets/saurabhbadole/bank-customer-churn-prediction-dataset/data

Imports

In [138]:
import sklearn as sk
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras import regularizers
from tensorflow.keras import models, layers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
import os

Reading the csv + dropping unnecessary columns

In [139]:
dataset = pd.read_csv('datasets/Churn_Modelling.csv')
dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [140]:
X = dataset.drop(['RowNumber', 'CustomerId', 'Surname','Exited'], axis = 1)
X.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.0,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.8,3,1,0,113931.57
3,699,France,Female,39,1,0.0,2,0,0,93826.63
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1


In [141]:
Y = dataset[['Exited']]
Y.head()

Unnamed: 0,Exited
0,1
1,0
2,1
3,0
4,0


Label encoding the categorical 'Geography' and 'Gender' columns

In [143]:
country_mapping = {
    'France': 1,
    'Germany': 2,
    'Spain': 3
}
gender_mapping = {
    'Male': 1,
    'Female': 2
}

X['Geography'] = X['Geography'].replace(country_mapping)
X['Gender'] = X['Gender'].replace(gender_mapping)
X.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,1,2,42,2,0.0,1,1,1,101348.88
1,608,3,2,41,1,83807.86,1,0,1,112542.58
2,502,1,2,42,8,159660.8,3,1,0,113931.57
3,699,1,2,39,1,0.0,2,0,0,93826.63
4,850,3,2,43,2,125510.82,1,1,1,79084.1


Scaling the columns with a higher range of values such as Balance and EstimatedSalary

In [144]:
scaler = StandardScaler()
X_scaled = pd.DataFrame(X)
selected_cols = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary']
X_scaled[selected_cols] = scaler.fit_transform(X_scaled[selected_cols])
X_scaled.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,-0.326221,1,2,0.293517,2,-1.225848,1,1,1,0.021886
1,-0.440036,3,2,0.198164,1,0.11735,1,0,1,0.216534
2,-1.536794,1,2,0.293517,8,1.333053,3,1,0,0.240687
3,0.501521,1,2,0.007457,1,-1.225848,2,0,0,-0.108918
4,2.063884,3,2,0.388871,2,0.785728,1,1,1,-0.365276


Formatting data into arrays so it can be fed into the model

In [198]:
X_np = X_scaled.values
Y_np = Y.values

In [199]:
X_np[0:5]

array([[-3.26221422e-01,  1.00000000e+00,  2.00000000e+00,
         2.93517423e-01,  2.00000000e+00, -1.22584767e+00,
         1.00000000e+00,  1.00000000e+00,  1.00000000e+00,
         2.18864940e-02],
       [-4.40035955e-01,  3.00000000e+00,  2.00000000e+00,
         1.98163832e-01,  1.00000000e+00,  1.17350021e-01,
         1.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         2.16533752e-01],
       [-1.53679418e+00,  1.00000000e+00,  2.00000000e+00,
         2.93517423e-01,  8.00000000e+00,  1.33305335e+00,
         3.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         2.40686900e-01],
       [ 5.01520635e-01,  1.00000000e+00,  2.00000000e+00,
         7.45665079e-03,  1.00000000e+00, -1.22584767e+00,
         2.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        -1.08917917e-01],
       [ 2.06388377e+00,  3.00000000e+00,  2.00000000e+00,
         3.88871014e-01,  2.00000000e+00,  7.85727900e-01,
         1.00000000e+00,  1.00000000e+00,  1.00000000e+00,
        -3.

Splitting train and test sets

In [200]:
X_train = X_np[:9000]
Y_train = Y_np[:9000]
X_test = X_np[9000:]
Y_test = Y_np[9000:]
print(f"X_train size:", X_train.shape[0])
print(f"X_test size:", X_test.shape[0])
print(f"Y_train size:", Y_train.shape[0])
print(f"Y_test size:", Y_test.shape[0])

X_train size: 9000
X_test size: 1000
Y_train size: 9000
Y_test size: 1000


In [201]:
Y_train[0:5]

array([[1],
       [0],
       [1],
       [0],
       [0]], dtype=int64)

In [202]:
# Hyperparameters
NUM_EPOCHS = 10
LEARNING_RATE = .001
BATCH_SIZE = 24

Defining the model with Tensorflow Keras

In [203]:
model = models.Sequential([
    layers.Dense(128, input_dim=X_train.shape[1], activation = 'relu'),
    layers.Dense(32),
    layers.Dense(1, activation='sigmoid')
])
model.summary()

In [204]:
loss = keras.losses.BinaryCrossentropy(from_logits=False)
optim = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
metrics = ["accuracy"]

Running the model on the train set for 10 epochs

In [205]:
model.compile(loss=loss, optimizer=optim, metrics=metrics)
model.fit(X_train, Y_train, batch_size = BATCH_SIZE, epochs=NUM_EPOCHS, verbose=1)

Epoch 1/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 423us/step - accuracy: 0.7939 - loss: 0.4847
Epoch 2/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 423us/step - accuracy: 0.8242 - loss: 0.4256
Epoch 3/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 425us/step - accuracy: 0.8320 - loss: 0.4061
Epoch 4/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 428us/step - accuracy: 0.8431 - loss: 0.3824
Epoch 5/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 441us/step - accuracy: 0.8297 - loss: 0.3887
Epoch 6/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 431us/step - accuracy: 0.8402 - loss: 0.3769
Epoch 7/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 433us/step - accuracy: 0.8476 - loss: 0.3730
Epoch 8/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 425us/step - accuracy: 0.8544 - loss: 0.3554
Epoch 9/10
[1m375/375[

<keras.src.callbacks.history.History at 0x184fc7c4910>

Running the model on test set

In [206]:
test_loss, test_accuracy = model.evaluate(X_test, Y_test, verbose=1)

print(f"Test Loss: {round(test_loss, 2)}")
print(f"Test Accuracy: {round(test_accuracy, 2)}")

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 419us/step - accuracy: 0.8506 - loss: 0.3690
Test Loss: 0.35
Test Accuracy: 0.86


Outputting a subset of values to see individual predictions

In [219]:
pred_array = model.predict(X_test)
pred_binary = [1 if pred >= 0.5 else 0 for pred in pred_array]

print(f"1 = Churned, 0 = Didn't Churn")
print("")
for i in range(30, 35):
    print(f"Customer name: {dataset['Surname'][9000 + i]}")
    print(f"Predicted: {pred_binary[i]}")
    print(f"Actual: {Y_test[i]}")
    print("")

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 387us/step
1 = Exited, 0 = Didn't exit

Customer name: Arbour
Predicted: 1
Actual: [1]

Customer name: Barese
Predicted: 0
Actual: [0]

Customer name: Hingston
Predicted: 0
Actual: [0]

Customer name: Davis
Predicted: 0
Actual: [0]

Customer name: Lawrence
Predicted: 1
Actual: [1]



### rohan11parekh@gmail.com
### LinkedIn: https://www.linkedin.com/in/rohan-parekh-39b070225/