In [1]:
# Classification template

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Churn_Modelling.csv')
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values


dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [2]:
# Encoding categorical data
# Encoding the Independent Variable
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder() # Encode countries
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1]) # Will encode the 3 country strings to 0,1,2

labelencoder_X_2 = LabelEncoder() # Encode gender
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2]) # Will encode country gender to 0,1


In [3]:
# Don't need to onehotencode gender because its binary

onehotencoder = OneHotEncoder(categorical_features = [1])
X = onehotencoder.fit_transform(X).toarray()
X = X[:, 1:]

# Encoding the Dependent Variable
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [4]:
y

array([1, 0, 1, ..., 1, 1, 0])

In [5]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [6]:
# Feature Scaling
# Very important for NN calculations
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


In [7]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

Using TensorFlow backend.
  return f(*args, **kwds)


In [8]:
# Initialize the Classifier ANN
classifier = Sequential()

# Adding input layer AND first hidden layer
# output_dim = (11 inputs nodes + 1 output node )/2 = 6
# rectifier activation

classifier.add(Dense(activation="relu", input_dim=11, units=6, kernel_initializer="uniform"))
# Adding Dropout 
classifier.add(Dropout(p = 0.1))


'''
Dense parameters:
output_dim: number of nodes you want to add to the next HIDDEN layer
Determining the number of nodes in the hidden layer is an art, but one tip
could be to have the number of nodes in the hidden layer be equal to the average of the nodes in 
the input layer and output layer. You could also determine this number by doing some form of parameter tuning

init: Will randomly initizalize weights to 0, using the glorot_uniform function be default

activation: What activation function to use

input_dim: Number of input nodes in our INPUT layer, so we only need to specify this parameter ONCE
'''


'\nDense parameters:\noutput_dim: number of nodes you want to add to the next HIDDEN layer\nDetermining the number of nodes in the hidden layer is an art, but one tip\ncould be to have the number of nodes in the hidden layer be equal to the average of the nodes in \nthe input layer and output layer. You could also determine this number by doing some form of parameter tuning\n\ninit: Will randomly initizalize weights to 0, using the glorot_uniform function be default\n\nactivation: What activation function to use\n\ninput_dim: Number of input nodes in our INPUT layer, so we only need to specify this parameter ONCE\n'

In [9]:
# Adding second hidden layer

classifier.add(Dense(activation="relu", units=6, kernel_initializer="uniform"))
classifier.add(Dropout(p = 0.1))

# Adding output layer
# Sigmoid activation will output possibility of the classification
# We can use softmax activation function that is like sigmoid, but works for multiclass-classification
classifier.add(Dense(activation="sigmoid", units=1, kernel_initializer="uniform"))

In [10]:
# Compiling the ANN
# Compiling is the "training" or "learning" process for our ANN
# Stochastic Gradient descent will utilze a logartimic loss function NOT MSE, this is based on the 
# sigmoid activation of our output layer
# loss function for binary logarithmic: binary_crossentropy, if not binary, then: categorical_crossentropy

classifier.compile(optimizer= 'adam', loss = 'binary_crossentropy', metrics=['accuracy'])
'''
Compile parameters:

optimizer: algorithm to find the optimial set of weights - How we tune our weights - ie: Adam Gradient Descent Algo.

loss: Loss function to determine the error between ypred and ytrue

metrics: list of metrics to be evaluated by the model and then improved upon, typically 'accuracy'

epochs: number of training iterations, one full cycle of foward and back propagations
'''


"\nCompile parameters:\n\noptimizer: algorithm to find the optimial set of weights - How we tune our weights - ie: Adam Gradient Descent Algo.\n\nloss: Loss function to determine the error between ypred and ytrue\n\nmetrics: list of metrics to be evaluated by the model and then improved upon, typically 'accuracy'\n\nepochs: number of training iterations, one full cycle of foward and back propagations\n"

In [11]:
# Fitting the ANN to the training set
classifier.fit(X_train, y_train, batch_size = 10, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100


Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x12be12390>

In [12]:
# Fitting classifier to the Training set

# Predicting the Test set results
y_pred = classifier.predict(X_test) # Returns ONLY probabilities of people leaving the bank

# Convert probabilities to True/False using a threshold value in order to evaluate the model. Threshold value will
# be dependent on the sensitivity of the results

y_pred = (y_pred > 0.5) # Compares each value to 0.5 threshold and replaces with bool


# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [13]:
cm

array([[1887,  104],
       [ 248,  261]])

In [14]:
1923 + 183

2106

In [15]:
1923 + 183 + 68 + 326

2500

In [16]:
2106/2500 # Accuracy score

0.8424

## Evaluate the ANN

* We need a keras wrapper for scikit_learn's cross validation so that we can utilize cross valudation on our keras model

In [19]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score

In [20]:
def build_classifier():
    classifier = Sequential()
    classifier.add(Dense(activation="relu", input_dim=11, units=6, kernel_initializer="uniform"))
    classifier.add(Dense(activation="relu", units=6, kernel_initializer="uniform"))
    classifier.add(Dense(activation="sigmoid", units=1, kernel_initializer="uniform"))
    classifier.compile(optimizer= 'adam', loss = 'binary_crossentropy', metrics=['accuracy'])
    return classifier

In [21]:
classifier = KerasClassifier(build_fn = build_classifier, batch_size=10, epochs=100)

# Will contain all the accuracies generated by k-fold cross validation
# CV is the number of K-folds we want
# nj_jobs will allow us to do parallel computing so that we can train 10 k folds in a reasonable about of time,
# nj_jobs = -1 essentially tells the computer to use all available CPUs
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv=10, n_jobs= -1) 

In [27]:
mean = accuracies.mean()
variance = accuracies.std()
mean

0.8437333288987476

In [28]:
variance

0.019329194618068445

### Low bias!, low variance!

## Tuning ANN hyperparameters using GridSearchCV

* **Warning:** Grid Search w/ 10 k-folds will take hours to train

In [29]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

* If you want to tune hyperparameters that are in the architecture of your ANN, then you need to pass it along as function paramter

In [None]:
def build_classifier(optimizer):
    classifier = Sequential()
    classifier.add(Dense(activation="relu", input_dim=11, units=6, kernel_initializer="uniform"))
    classifier.add(Dense(activation="relu", units=6, kernel_initializer="uniform"))
    classifier.add(Dense(activation="sigmoid", units=1, kernel_initializer="uniform"))
    classifier.compile(optimizer= optimizer, loss = 'binary_crossentropy', metrics=['accuracy'])
    return classifier

In [None]:
classifier = KerasClassifier(build_fn = build_classifier)

In [None]:
parameters = {'batch_size': [25, 32], 'epochs': [100, 250], 'optimizer': ['adam', 'rmsprop']}

In [None]:
# GridSearch hyperparameters w/ 10 folds
grid_search = GridSearchCV(estimator = classifier, param_grid = parameters, scoring = 'accuracy', cv = 10)

In [None]:
grid_search = grid_search.fit(X_train, y_train)

In [None]:
best_parameters = grid_search.best_parameters_
best_accuracy = grid_search.best_accuracy