# Data Preprocessing for Churn Modeling data

### Import usual suspects

In [1]:
import numpy as np
np.set_printoptions(formatter={'float': lambda x: "{0:0.2f}".format(x)})
import matplotlib.pyplot as plt
import pandas as pd

### Importing the dataset


In [2]:
dataset = pd.read_csv('Churn_Modelling.csv')

In [3]:
dataset.head(20)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
8,9,15792365,He,501,France,Male,44,4,142051.07,2,0,1,74940.5,0
9,10,15592389,H?,684,France,Male,27,2,134603.88,1,1,1,71725.73,0


In [4]:
dataset.tail(20)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9980,9981,15719276,T'ao,741,Spain,Male,35,6,74371.49,1,0,0,99595.67,0
9981,9982,15672754,Burbidge,498,Germany,Male,42,3,152039.7,1,1,1,53445.17,1
9982,9983,15768163,Griffin,655,Germany,Female,46,7,137145.12,1,1,0,115146.4,1
9983,9984,15656710,Cocci,613,France,Male,40,4,0.0,1,0,0,151325.24,0
9984,9985,15696175,Echezonachukwu,602,Germany,Male,35,7,90602.42,2,1,1,51695.41,0
9985,9986,15586914,Nepean,659,France,Male,36,6,123841.49,2,1,0,96833.0,0
9986,9987,15581736,Bartlett,673,Germany,Male,47,1,183579.54,2,0,1,34047.54,0
9987,9988,15588839,Mancini,606,Spain,Male,30,8,180307.73,2,1,1,1914.41,0
9988,9989,15589329,Pirozzi,775,France,Male,30,4,0.0,2,1,0,49337.84,0
9989,9990,15605622,McMillan,841,Spain,Male,28,4,0.0,2,1,1,179436.6,0


In [5]:
# Creating X by defining all indepdendent variables from dataset and using it values as numpy array
X = dataset.iloc[:,3:13].values

# Creating y by defining dependent variable from dataset and using it values as numpy array
y = dataset.iloc[:,13].values

In [6]:
# Let's see first 5 rows
print (X[0:9,:])

[[619 'France' 'Female' 42 2 0.0 1 1 1 101348.88]
 [608 'Spain' 'Female' 41 1 83807.86 1 0 1 112542.58]
 [502 'France' 'Female' 42 8 159660.8 3 1 0 113931.57]
 [699 'France' 'Female' 39 1 0.0 2 0 0 93826.63]
 [850 'Spain' 'Female' 43 2 125510.82 1 1 1 79084.1]
 [645 'Spain' 'Male' 44 8 113755.78 2 1 0 149756.71]
 [822 'France' 'Male' 50 7 0.0 2 1 1 10062.8]
 [376 'Germany' 'Female' 29 4 115046.74 4 1 0 119346.88]
 [501 'France' 'Male' 44 4 142051.07 2 0 1 74940.5]]


In [7]:
# Binary variable showing 1 == yes, customer exited the bank and 0 == no, custome didn't exited bank
print (y[0:5])

[1 0 1 0 0]


### Now, since our dataset contains few categorical variables with string/character dtypes, we need to do encoding to convert them into numerical dtypes

In [8]:
# let's use sklearn preprocessing based LabelEncode and OneHotEncoder method to do this
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# get copy of X to do encoding
# get copy of y to do encoding
X_encoded = X
y_encoded = y

# now, let's do encoding for country categorical variable at 1 position
labelencoder_X_1 = LabelEncoder()
X_encoded[:,1] = labelencoder_X_1.fit_transform(X_encoded[:,1])

# encoding for Sex categorical variable at 2 position
labelencoder_X_2 = LabelEncoder()
X_encoded[:,2] = labelencoder_X_2.fit_transform(X_encoded[:,2])


### Now, IMPORTANT, we know our categorical variables are not ordinal type mean they don't have any order so country france = 0, spain = 2, germany = 1 has no order

### To acheive above, we need to use one hot encoding method from sklearn preprocessing, which creates dummy variables for each category type of data so, 3 dummy columns will be created aka dummy_france_0, dummy_spain_2 and dummy_germany_1 

### and wherever their is true value in those dummy columns, 1 will be putted as one hot encoding ideally same for sex and 0 = female and 1 = male has no order but since it's only 0 and 1, sex variable can be treated as binary variable

In [9]:
# so let's use OneHotEncoder method and give it to only country variable at index position 1
onehotencoder = OneHotEncoder(categorical_features=[1])
# now, use fit transform to apply one hot encoding and then cast it into numpy array
X_encoded = onehotencoder.fit_transform(X_encoded).toarray()

print (type(X_encoded))
print (X_encoded.shape)
print (X_encoded[0:9,:])

<class 'numpy.ndarray'>
(10000, 12)
[[1.00 0.00 0.00 619.00 0.00 42.00 2.00 0.00 1.00 1.00 1.00 101348.88]
 [0.00 0.00 1.00 608.00 0.00 41.00 1.00 83807.86 1.00 0.00 1.00 112542.58]
 [1.00 0.00 0.00 502.00 0.00 42.00 8.00 159660.80 3.00 1.00 0.00 113931.57]
 [1.00 0.00 0.00 699.00 0.00 39.00 1.00 0.00 2.00 0.00 0.00 93826.63]
 [0.00 0.00 1.00 850.00 0.00 43.00 2.00 125510.82 1.00 1.00 1.00 79084.10]
 [0.00 0.00 1.00 645.00 1.00 44.00 8.00 113755.78 2.00 1.00 0.00 149756.71]
 [1.00 0.00 0.00 822.00 1.00 50.00 7.00 0.00 2.00 1.00 1.00 10062.80]
 [0.00 1.00 0.00 376.00 0.00 29.00 4.00 115046.74 4.00 1.00 0.00 119346.88]
 [1.00 0.00 0.00 501.00 1.00 44.00 4.00 142051.07 2.00 0.00 1.00 74940.50]]


In [10]:
# can u see first 3 columns as dummy columns for dummy_france_0, dummy_germany_1 and dummy_spain_2

### Now, IMPORTANT: to handle dummy variable trap, we need to drop one of 3 newly created dummy column
### Why because, we only need 2 dummy varibles to represent 3 countries true and false values
### meaning if values in dummy_germany_1 is 0.00 and dummy_spain_2 is 0.00 it mean that that row obviously belong to france, so we don't need dummy_france_0

In [11]:
# so let's drop first column dummy_france_0 as 2 dummy columns are sufficient to reprsent 3 countries presense in dataset
X_encoded = X_encoded[:,1:]

print ("X_encoded", type(X_encoded))
print (X_encoded.shape)
print (X_encoded[0:9,:])

X_encoded <class 'numpy.ndarray'>
(10000, 11)
[[0.00 0.00 619.00 0.00 42.00 2.00 0.00 1.00 1.00 1.00 101348.88]
 [0.00 1.00 608.00 0.00 41.00 1.00 83807.86 1.00 0.00 1.00 112542.58]
 [0.00 0.00 502.00 0.00 42.00 8.00 159660.80 3.00 1.00 0.00 113931.57]
 [0.00 0.00 699.00 0.00 39.00 1.00 0.00 2.00 0.00 0.00 93826.63]
 [0.00 1.00 850.00 0.00 43.00 2.00 125510.82 1.00 1.00 1.00 79084.10]
 [0.00 1.00 645.00 1.00 44.00 8.00 113755.78 2.00 1.00 0.00 149756.71]
 [0.00 0.00 822.00 1.00 50.00 7.00 0.00 2.00 1.00 1.00 10062.80]
 [1.00 0.00 376.00 0.00 29.00 4.00 115046.74 4.00 1.00 0.00 119346.88]
 [0.00 0.00 501.00 1.00 44.00 4.00 142051.07 2.00 0.00 1.00 74940.50]]


In [12]:
# Now, let's Split encoded X and y into Training set and Test set via sklearn model selection train_test_split method
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size = 0.20, random_state = 0)

### Let's see shape and sample from X_train and X_test

In [13]:
print ("X_train", type(X_train))
print (X_train.shape)
print (X_train[0:9,:])

print ("\n X_test", type(X_test))
print (X_test.shape)
print (X_test[0:9,:])

X_train <class 'numpy.ndarray'>
(8000, 11)
[[0.00 1.00 667.00 0.00 34.00 5.00 0.00 2.00 1.00 0.00 163830.64]
 [1.00 0.00 427.00 1.00 42.00 1.00 75681.52 1.00 1.00 1.00 57098.00]
 [0.00 0.00 535.00 0.00 29.00 2.00 112367.34 1.00 1.00 0.00 185630.76]
 [0.00 1.00 654.00 1.00 40.00 5.00 105683.63 1.00 1.00 0.00 173617.09]
 [0.00 1.00 850.00 0.00 57.00 8.00 126776.30 2.00 1.00 1.00 132298.49]
 [1.00 0.00 776.00 0.00 37.00 2.00 103769.22 2.00 1.00 0.00 194099.12]
 [0.00 0.00 807.00 1.00 47.00 1.00 95120.59 1.00 0.00 0.00 127875.10]
 [0.00 1.00 598.00 1.00 41.00 8.00 0.00 2.00 1.00 1.00 161954.43]
 [0.00 1.00 636.00 1.00 76.00 9.00 126534.60 1.00 1.00 1.00 39789.62]]

 X_test <class 'numpy.ndarray'>
(2000, 11)
[[1.00 0.00 597.00 0.00 35.00 8.00 131101.04 1.00 1.00 1.00 192852.67]
 [0.00 0.00 523.00 0.00 40.00 2.00 102967.41 1.00 1.00 0.00 128702.10]
 [0.00 1.00 706.00 0.00 42.00 8.00 95386.82 1.00 1.00 1.00 75732.25]
 [0.00 0.00 788.00 1.00 32.00 4.00 112079.58 1.00 0.00 0.00 89368.59]
 [1.00

### Now, IMPORTANT: let's do Feature Scaling for our future ANN model

In [14]:
# we will use here sklearn preprocessing StandardScaler method to do feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

print (type(X_train))
print (X_train.shape)
print (X_train[0:9,:])
print (type(X_test))
print (X_test.shape)
print (X_test[0:9,:])

<class 'numpy.ndarray'>
(8000, 11)
[[-0.57 1.74 0.17 -1.09 -0.46 0.01 -1.22 0.81 0.64 -1.03 1.11]
 [1.75 -0.57 -2.30 0.92 0.30 -1.38 -0.01 -0.92 0.64 0.97 -0.75]
 [-0.57 -0.57 -1.19 -1.09 -0.94 -1.03 0.58 -0.92 0.64 -1.03 1.49]
 [-0.57 1.74 0.04 0.92 0.11 0.01 0.47 -0.92 0.64 -1.03 1.28]
 [-0.57 1.74 2.06 -1.09 1.74 1.04 0.81 0.81 0.64 0.97 0.56]
 [1.75 -0.57 1.29 -1.09 -0.18 -1.03 0.44 0.81 0.64 -1.03 1.63]
 [-0.57 -0.57 1.61 0.92 0.78 -1.38 0.30 -0.92 -1.56 -1.03 0.48]
 [-0.57 1.74 -0.54 0.92 0.21 1.04 -1.22 0.81 0.64 0.97 1.07]
 [-0.57 1.74 -0.15 0.92 3.55 1.39 0.81 -0.92 0.64 0.97 -1.05]]
<class 'numpy.ndarray'>
(2000, 11)
[[1.63 -0.57 -0.56 -1.11 -0.39 0.99 0.86 -0.87 0.66 0.98 1.62]
 [-0.61 -0.57 -1.34 -1.11 0.08 -1.08 0.40 -0.87 0.66 -1.02 0.50]
 [-0.61 1.74 0.58 -1.11 0.26 0.99 0.28 -0.87 0.66 0.98 -0.42]
 [-0.61 -0.57 1.44 0.90 -0.68 -0.39 0.55 -0.87 -1.51 -1.02 -0.18]
 [1.63 -0.57 0.58 0.90 -0.11 -0.05 1.38 0.80 0.66 0.98 0.63]
 [-0.61 1.74 0.21 -1.11 1.67 -0.74 1.58 0.80 0.6

### Can you see how all long range integer features values like credit score at index 2 and Age at index 4 and Tenure at index 5 and Balance at index 6 and Estimated Salary at index 10 are all now in lower range

# DATA IS PREPROCESSED NOW AND READY FOR ANN MODELING

### Import the keras libaray and make sure to use Tensorflow background

### to change keras background make sure to edit

```
nano ~/.keras/keras.json
```

### and make sure it look like below

```
{
    "image_dim_ordering": "tf", 
    "epsilon": 1e-07, 
    "floatx": "float32", 
    "backend": "tensorflow"
}
```

In [15]:
import keras

Using TensorFlow backend.


In [16]:
# now let's use keras sequential module to initialize sequential (not a graph) ANN
from keras.models import Sequential

# and dense layer to build layers
from keras.layers import Dense

In [17]:
# Now, To build and training a Ann with Stochastic Gradient Descent, we will follow step by step

# Step 1
# Weight Initialization:
# Randomly Initialize the weights to small numbers close to 0 (but not 0)

# Step 2
# Input Assigenment:
# Input the first observation of your dataset in the input layer, each feature is one input node

# Step 3
# Forward-Propogation:
# from left to right, the neurons are activated in a way that the impact of each neurons'
# activation is limited by the weights. 
# Propogate the activation until getting the predicted results y

# Step 4
# Cost Calculation:
# Compare the predicted result to actual result. Measure the generated error.

# Step 5
# Back Propogation:
# From right to left, the error is backpropogated. Updates the weight according to how much proportionally
# they are responsible for the error. 

# Step 6
# Batch Processing:
# Repeat Step 1 to 5 and update the weights after a batch of observations
# Use learning rate to decide how much gradually in each batch a weight should update

# Step 7
# Epoch Training:
# When the whole training set passed through the ANN, that makes an epoch. 
# Redo the whole training again and again on many epochs


In [18]:
#  Now, let's initialize sequential based model (or graph based model)
#  The Sequential model is a linear stack of layers.
classifier = Sequential()

In [19]:
# You can also simply add layers via the .add() method

# Adding the Input layer and the first hidden layer in Dense method
# choosing unit =6 is an art, basically u average total dimensions of output and input layer
# so, total dim in input layer is 11 and total dim in output layer is 1, so (11 + 1) / number of layers = 12/2 = 6
classifier.add(Dense(units=6, kernel_initializer= 'glorot_uniform' , activation='relu', input_shape=(11,)))

In [20]:
# Adding the second hidden layer
classifier.add(Dense(units=6, kernel_initializer= 'glorot_uniform' , activation='relu', input_shape=(6,)))

In [21]:
# Adding the output layer
classifier.add(Dense(units=1, kernel_initializer= 'glorot_uniform' , activation='sigmoid', input_shape=(6,)))

In [22]:
# Compilation
# Before training a model, you need to configure the learning process, which is done via the compile method.
# It receives three arguments:
# An optimizer. This could be the string identifier of an existing optimizer (such as rmsprop or adagrad), or an instance of the Optimizer class. See: optimizers.
# A loss function. This is the objective that the model will try to minimize. It can be the string identifier of an existing loss function (such as categorical_crossentropy or mse), or it can be an objective function. See: losses.
# A list of metrics. For any classification problem you will want to set this to metrics=['accuracy']. A metric could be the string identifier of an existing metric or a custom metric function.

#  Compling the ANN
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [23]:
# Training
# Keras models are trained on Numpy arrays of input data and labels.
# For training a model, you will typically use the  fit function.

classifier.fit(X_test, y_test, batch_size= 10 ,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x11df1a8d0>

In [24]:
score = classifier.evaluate(X_test, y_test)

  32/2000 [..............................] - ETA: 1s

In [25]:
print (score)

[0.31067434525489807, 0.87150000000000005]


In [26]:
# let's see confusion metrix
from sklearn.metrics import confusion_matrix

In [27]:
y_pred = classifier.predict(X_test)
print (y_pred)
print (type(y_pred))

[[0.20]
 [0.41]
 [0.14]
 ..., 
 [0.10]
 [0.16]
 [0.18]]
<class 'numpy.ndarray'>


In [28]:
y_pred = (y_pred > 0.5)
print (y_pred)

[[False]
 [False]
 [False]
 ..., 
 [False]
 [False]
 [False]]


In [29]:
cm = confusion_matrix(y_test, y_pred)
print (cm)

[[1539   56]
 [ 201  204]]


In [48]:
#  Calculate Accuracy
print (cm[[0][0]])
print (cm[[1]])
# print ((cm[[0,0]]+cm[[1,1]])/(2000))

[1539   56]
[[201 204]]


## Now, Predict a single customer with given data, whether he is goint to leave or stay in bank

### Here is data

Geography: France

Credit Score: 600

Gender: Male

Age: 40 years old

Tenure: 3 years

Balance: $60000

Number of Products: 2

Does this customer have a credit card ? Yes

Is this customer an Active Member: Yes

Estimated Salary: $50000

In [31]:
print (type(X_test))

<class 'numpy.ndarray'>


[[0.00 0.00 619.00 0.00 42.00 2.00 0.00 1.00 1.00 1.00 101348.88]

 [0.00 1.00 608.00 0.00 41.00 1.00 83807.86 1.00 0.00 1.00 112542.58]
 
 [0.00 0.00 502.00 0.00 42.00 8.00 159660.80 3.00 1.00 0.00 113931.57]
 
 [0.00 0.00 699.00 0.00 39.00 1.00 0.00 2.00 0.00 0.00 93826.63]
 
 [0.00 1.00 850.00 0.00 43.00 2.00 125510.82 1.00 1.00 1.00 79084.10]
 
 [0.00 1.00 645.00 1.00 44.00 8.00 113755.78 2.00 1.00 0.00 149756.71]
 
 [0.00 0.00 822.00 1.00 50.00 7.00 0.00 2.00 1.00 1.00 10062.80]
 
 [1.00 0.00 376.00 0.00 29.00 4.00 115046.74 4.00 1.00 0.00 119346.88]
 
 [0.00 0.00 501.00 1.00 44.00 4.00 142051.07 2.00 0.00 1.00 74940.50]]

In [32]:
# let's create 2 dimension numpy array with one row representing this customer
X_sample = np.array([[0.0,0.0,600,1,40,3,60000,2,1,1,50000]])
print (X_sample)

[[0.00 0.00 600.00 1.00 40.00 3.00 60000.00 2.00 1.00 1.00 50000.00]]


In [33]:
# IMPORTANT since our model is trained on scaled values of X
# let's scale X_sample numpy array before we use this for predection with model,
X_sample = sc.transform(X_sample)
print (X_sample)

[[-0.61 -0.57 -0.53 0.90 0.08 -0.74 -0.29 0.80 0.66 0.98 -0.87]]


In [34]:
# Now, let's predict, whether this customer will leave or remain on bank
y_sample = classifier.predict(X_sample)
print (y_sample)
y_sample = (y_sample > 0.5)
print (y_sample)

[[0.02]]
[[False]]


In [35]:
# so, False, customer will not leave the bank