### Import Modules and Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split,cross_validate,GridSearchCV
KFoldrom sklearn.metrics import precision_recall_fscore_support as score,confusion_matrix,accuracy_score
from sklearn.preprocessing import StandardScaler

### Importing Dataset

In [2]:
dataset = pd.read_csv("Churn_Modelling.csv")
dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### Dropping of Columns which is no use for model i.e RowNumber, CustomerID and Surname

In [3]:
dataset.drop(columns=["RowNumber","CustomerId","Surname"],axis=1,inplace=True)
dataset.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### Factorising and Extracting Dummy variables

In [4]:
dataset.Gender = pd.factorize(dataset.Gender)[0]
Dummies = pd.get_dummies(dataset["Geography"],drop_first=True)
dataset = pd.concat([Dummies,dataset],axis=1)
dataset.drop(["Geography"],axis=1,inplace=True)
dataset.head()

Unnamed: 0,Germany,Spain,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,0,619,0,42,2,0.0,1,1,1,101348.88,1
1,0,1,608,0,41,1,83807.86,1,0,1,112542.58,0
2,0,0,502,0,42,8,159660.8,3,1,0,113931.57,1
3,0,0,699,0,39,1,0.0,2,0,0,93826.63,0
4,0,1,850,0,43,2,125510.82,1,1,1,79084.1,0


### Scaling of Features

In [5]:
sc = StandardScaler()
dataset.iloc[:,:-1] = sc.fit_transform(dataset.iloc[:,:-1])
dataset.head()

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Unnamed: 0,Germany,Spain,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,-0.578736,-0.573809,-0.326221,-1.095988,0.293517,-1.04176,-1.225848,-0.911583,0.646092,0.970243,0.021886,1
1,-0.578736,1.74274,-0.440036,-1.095988,0.198164,-1.387538,0.11735,-0.911583,-1.547768,0.970243,0.216534,0
2,-0.578736,-0.573809,-1.536794,-1.095988,0.293517,1.032908,1.333053,2.527057,0.646092,-1.03067,0.240687,1
3,-0.578736,-0.573809,0.501521,-1.095988,0.007457,-1.387538,-1.225848,0.807737,-1.547768,-1.03067,-0.108918,0
4,-0.578736,1.74274,2.063884,-1.095988,0.388871,-1.04176,0.785728,-0.911583,0.646092,0.970243,-0.365276,0


### Model Selection and hyperparameter Tuning

In [21]:
rf = RandomForestClassifier()
param = {
    'n_estimators':[10,150,300],
    'max_depth': [30, 60, 90, None]
}
gs = GridSearchCV(rf,param,n_jobs=-1,cv=5,return_train_score=True)
gs_fit = gs.fit(dataset.iloc[:,:-1],dataset.iloc[:,11])
pd.DataFrame(gs_fit.cv_results_).sort_values("mean_test_score",ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
8,10.679311,0.712148,0.382403,0.030249,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.86057,0.872064,0.8645,...,0.8654,0.003715,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5,8.43847,0.365831,0.403109,0.045958,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.86007,0.875062,0.8635,...,0.8653,0.005103,2,1.0,1.0,1.0,1.0,1.0,1.0,0.0
11,7.741111,0.802797,0.334981,0.05052,,300,"{'max_depth': None, 'n_estimators': 300}",0.861569,0.874063,0.8635,...,0.8647,0.004935,3,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,3.740457,0.028173,0.169903,0.011552,30.0,150,"{'max_depth': 30, 'n_estimators': 150}",0.864068,0.869065,0.862,...,0.8645,0.00252,4,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,8.207299,0.930997,0.392776,0.086116,30.0,300,"{'max_depth': 30, 'n_estimators': 300}",0.862069,0.872564,0.859,...,0.8645,0.004674,4,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [22]:
gb = GradientBoostingClassifier()
param = {
    'n_estimators':[10,150,300],
    'max_depth': [30, 60, 90, None],
    'learning_rate':[0.1]
}
gb = GridSearchCV(rf,param,n_jobs=-1,cv=5,return_train_score=True)
gb_fit = gs.fit(dataset.iloc[:,:-1],dataset.iloc[:,11])
pd.DataFrame(gs_fit.cv_results_).sort_values("mean_test_score",ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
2,8.306593,0.099197,0.429103,0.078053,30.0,300,"{'max_depth': 30, 'n_estimators': 300}",0.863568,0.875562,0.8625,...,0.866,0.005242,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,4.452223,0.310173,0.1894,0.004994,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.865067,0.869565,0.863,...,0.866,0.002561,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5,8.418355,0.544996,0.387834,0.018236,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.86007,0.872064,0.8645,...,0.865,0.003898,3,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,4.125515,0.045548,0.190402,0.004649,30.0,150,"{'max_depth': 30, 'n_estimators': 150}",0.86057,0.872064,0.863,...,0.8645,0.004364,4,1.0,1.0,1.0,1.0,1.0,1.0,0.0
10,3.828467,0.136144,0.189092,0.021689,,150,"{'max_depth': None, 'n_estimators': 150}",0.862069,0.870565,0.8615,...,0.8641,0.003479,5,1.0,1.0,1.0,1.0,1.0,1.0,0.0


### Splitting The Datasets

In [8]:
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,11].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Train and fitting of models

In [23]:
rf = RandomForestClassifier(n_estimators=300,max_depth=90,n_jobs=-1)
rf_fit = rf.fit(X_train,y_train)
rf_pred = rf_fit.predict(X_test)
precision, recall, fscore, support = score(y_test,rf_pred,average='binary')
print("Precision: {}/ Recall: {}/ Fscore: {}/ Accuracy: {}".format(
    round(precision,3),round(recall,3),round(fscore,3),round(accuracy_score(y_test,rf_pred),3)))

Precision: 0.739/ Recall: 0.523/ Fscore: 0.613/ Accuracy: 0.866


In [24]:
gb = GradientBoostingClassifier(n_estimators=300,max_depth=30,learning_rate=0.1)
gb_fit = gb.fit(X_train,y_train)
gb_pred = gb_fit.predict(X_test)
precision, recall, fscore, support = score(y_test,rf_pred,average='binary')
print("Precision: {}/ Recall: {}/ Fscore: {}/ Accuracy: {}".format(
    round(precision,3),round(recall,3),round(fscore,3),round(accuracy_score(y_test,rf_pred),3)))

Precision: 0.739/ Recall: 0.523/ Fscore: 0.613/ Accuracy: 0.866


### Confusion matrix of Both Models

In [25]:
print("RF CM: \n{}".format(confusion_matrix(y_test,rf_pred)))
print("GB CM: \n{}".format(confusion_matrix(y_test,gb_pred)))

RF CM: 
[[1520   75]
 [ 193  212]]
GB CM: 
[[1397  198]
 [ 177  228]]


### ANN

In [12]:
import keras
from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.


In [36]:
# Initialising the ANN
classifier = Sequential()

# Adding the input layer and the first hidden layer
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu', input_dim = 11))

# Adding the second hidden layer
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu'))

# Adding the output layer
classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))

# Compiling the ANN
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# Fitting the ANN to the Training set
classifier.fit(X_train, y_train, batch_size = 10, epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1f6dbbdcf98>

### Confusion matrix of ANN

In [35]:
ann_pred = classifier.predict(X_test)
ann_pred = (ann_pred > 0.5)
confusion_matrix(y_test,ann_pred)

array([[1544,   51],
       [ 259,  146]], dtype=int64)

### Stats of All the models

In [27]:
#RF Stats
print("RF || Accuracy: {}, Precision: {}, Recall: {}".format(
    round((1519+217)/(2000),3),round((1519/(1519+217)),3),round((1519/(1519+217)),3)))
#GB Stats
print("GB || Accuracy: {}, Precision: {}, Recall: {}".format(
    round((1397+227)/(2000),3),round((1397/(1397+198)),3),round((1397/(1397+178)),3)))
#ANN Stats
print("ANN || Accuracy: {}, Precision: {}, Recall: {}".format(
    round((1523+203)/(2000),3),round((1523/(1523+72)),3),round((1523/(1523+202)),3)))

RF || Accuracy: 0.868, Precision: 0.875, Recall: 0.875
GB || Accuracy: 0.812, Precision: 0.876, Recall: 0.887
ANN || Accuracy: 0.863, Precision: 0.955, Recall: 0.883


### Prediction from all the models on Values
<font color='green'>
Geography = France<br>
Credit Score = 600<br>
Gender = Male<br>
Age = 40<br>
Tenure = 3<br>
Balance = 60000<br>
Number of products = 2<br>
Has Credit Card = Yes<br>
Is active member = Yes<br>
Estimated Salary = 50000
</font>

In [18]:
rf_predict = rf_fit.predict(sc.transform(np.array([[0,0,600,1,40,3,60000,2,1,1,50000]])))
gb_predict = gb_fit.predict(sc.transform(np.array([[0,0,600,1,40,3,60000,2,1,1,50000]])))
ann_predict = classifier.predict(sc.transform(np.array([[0,0,600,1,40,3,60000,2,1,1,50000]])))
print("Rf: {}/ GB: {}/ ANN: {}".format(rf_predict,gb_predict,ann_predict>0.5))



Rf: [0]/ GB: [0]/ ANN: [[False]]




#### So Answer is False or Customer will not Leave the bank

### KFold Cross Validation inside Keras

In [37]:
from sklearn.model_selection import cross_val_score
from keras.wrappers.scikit_learn import KerasClassifier

In [38]:
def build_classifier():
    classifier = Sequential()
    classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu', input_dim = 11))
    classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu'))
    classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))
    classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
    return classifier

In [39]:
classifier = KerasClassifier(build_fn=build_classifier,batch_size = 10, epochs = 100)
accuracies = cross_val_score(estimator=classifier,X=X_train,y=y_train,cv=10,n_jobs=-1)

In [48]:
mean = accuracies.mean()
variance = accuracies.std()
print("Mean: {}/ Variance: {}".format(mean,variance))

Mean: 0.8387499953061341/ Variance: 0.011765946925744116
