In [57]:
import pandas as pd 
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix

In [4]:
tf.__version__

'2.13.0'

Churn model with ANN

# Data Preprocessing

In [33]:
dataset = pd.read_csv("Churn_Modelling.csv")


In [34]:
X = dataset.iloc[:, 3:-1].values
y = dataset.iloc[:, -1].values

In [39]:
print(X)

[[1.0 0.0 0.0 ... 1 1 101348.88]
 [0.0 0.0 1.0 ... 0 1 112542.58]
 [1.0 0.0 0.0 ... 1 0 113931.57]
 ...
 [1.0 0.0 0.0 ... 0 1 42085.58]
 [0.0 1.0 0.0 ... 1 0 92888.52]
 [1.0 0.0 0.0 ... 1 0 38190.78]]


In [36]:
# Use LabelEncoder to encode "Male"/"Female" into binary 1/0
le = LabelEncoder()
X[:, 2] = le.fit_transform(X[:, 2])

In [38]:
# Use OneHotEncoding to encode "Geography" field of the data
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
# Note: dummy vars are moved to the first cols of the matrix of features. 

In [41]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

Feature scaling is fundamental and essentially compulsory for all deep learning, so much so that you do it to all fields in your data

In [43]:
# Feature scaling for DL
sc = StandardScaler()
X_train = sc.fit_transform(X_train) # apply to all features of both sets 
X_test = sc.fit_transform(X_test)

# Building and training an ANN

In [44]:
ann = tf.keras.models.Sequential() # create an instances of the sequential() class

In [45]:
ann.add(tf.keras.layers.Dense(units=6, activation='relu')) # add a fully connected layer with dense() class, the hidden layer
# Number of neurons is a "hyperparameter", i.e., it wont be 'trained' in the training process

In [46]:
# add a second hidden layer
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

In [47]:
# add an output layer 
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid')) # sigmoid gives the probability the customer churns as well as the classification

In [48]:
# Compiling the ANN with optimizer, loss funciton and metric, which will be the accuracy
ann.compile(optimizer='adam' , loss='binary_crossentropy', metrics=['accuracy']) # adam is a variant of stochastic gradient descent (SGD)
# loss function must be "binary_crossentropy" whenever you do classification problems. For non-binary output you would use "categorical_crossentropy"

In [50]:
# Training the ANN
ann.fit(X_train, y_train, batch_size=32, epochs=100) # batch_size and epochs are hyperparameters

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.src.callbacks.History at 0x247026ddc90>

# Predictions and Model Evaluation

In [54]:
print(ann.predict(sc.transform([[1.0, 0.0, 0.0, 600, 1, 40, 3, 60000, 2, 1, 1, 50000]]))) # input must be 2d array
# prints output in probability
print(ann.predict(sc.transform([[1.0, 0.0, 0.0, 600, 1, 40, 3, 60000, 2, 1, 1, 50000]])) > 0.5) # if predicted prob > 0.5 => 1, < 0.5 => 0

[[0.01781454]]
[[False]]


In [56]:
# Accuracy and Confusion matrix 
y_pred = ann.predict(X_test)
y_pred = (y_pred > 0.5) # returns true if likely to leave, and false if likely to stay
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

[[0 0]
 [0 1]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


In [61]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
ac = accuracy_score(y_test, y_pred)
print('accuracy score =', ac*100, '%')

[[1515   80]
 [ 195  210]]
accuracy score = 86.25 %
