In [97]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
from keras.models import load_model
from keras import backend as K
from werkzeug import secure_filename
import json
import csv
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [75]:
dataset = pd.read_csv('data.csv')
data_re=dataset[dataset['Exited']==1]
data_re.set_index('RowNumber',inplace=True)
dataset = data_re

In [76]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2037 entries, 1 to 9999
Data columns (total 14 columns):
CustomerId                    2037 non-null int64
Surname                       2037 non-null object
CreditScore                   2037 non-null int64
Geography                     2037 non-null object
Gender                        2037 non-null object
Age                           2037 non-null int64
Tenure                        2037 non-null int64
Balance                       2037 non-null float64
NumOfProducts                 2037 non-null int64
HasCrCard                     2037 non-null int64
IsActiveMember                2037 non-null int64
EstimatedSalary               2037 non-null float64
Exited                        2037 non-null int64
Reason for exiting company    2037 non-null object
dtypes: float64(2), int64(8), object(4)
memory usage: 238.7+ KB


In [77]:
print(dataset.isnull().values.any())

False


In [78]:
dataset['Geography'] = dataset['Geography'].astype('category')
dataset['Gender'] = dataset['Gender'].astype('category')
dataset['EstimatedSalary'] = dataset['EstimatedSalary'].astype('float32')
dataset['Balance'] = dataset['Balance'].astype('float32')
dataset['Reason for exiting company'] = dataset['Reason for exiting company'].astype('category')



In [79]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2037 entries, 1 to 9999
Data columns (total 14 columns):
CustomerId                    2037 non-null int64
Surname                       2037 non-null object
CreditScore                   2037 non-null int64
Geography                     2037 non-null category
Gender                        2037 non-null category
Age                           2037 non-null int64
Tenure                        2037 non-null int64
Balance                       2037 non-null float32
NumOfProducts                 2037 non-null int64
HasCrCard                     2037 non-null int64
IsActiveMember                2037 non-null int64
EstimatedSalary               2037 non-null float32
Exited                        2037 non-null int64
Reason for exiting company    2037 non-null category
dtypes: category(3), float32(2), int64(8), object(1)
memory usage: 181.4+ KB


In [80]:
from scipy.stats import kurtosis, skew
print('before converting',dataset.head())
cat_columns = dataset.select_dtypes(['category']).columns
dataset[cat_columns] = dataset[cat_columns].apply(lambda x: x.cat.codes)
print('************************************')
print('after converting',dataset.tail())


before converting            CustomerId   Surname  CreditScore Geography  Gender  Age  Tenure  \
RowNumber                                                                     
1            15634602  Hargrave          619    France  Female   42       2   
3            15619304      Onio          502    France  Female   42       8   
6            15574012       Chu          645     Spain    Male   44       8   
8            15656148    Obinna          376   Germany  Female   29       4   
17           15737452     Romeo          653   Germany    Male   58       1   

                 Balance  NumOfProducts  HasCrCard  IsActiveMember  \
RowNumber                                                            
1               0.000000              1          1               1   
3          159660.796875              3          1               0   
6          113755.781250              2          1               0   
8          115046.742188              4          1               0   
17      

In [81]:
X = dataset.iloc[:, 3:13].values
Y= dataset.iloc[:, 13].values
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.25, random_state = 0,stratify=Y)


In [82]:
print("X:",X_train)
print("Number of training records",len(X_train))
print("**************************************************")
print("X_test:",X_test)
print("Number of testing records",len(X_test))

X: [[0.00000000e+00 0.00000000e+00 3.90000000e+01 ... 0.00000000e+00
  1.76614859e+05 1.00000000e+00]
 [2.00000000e+00 0.00000000e+00 5.60000000e+01 ... 0.00000000e+00
  1.40991234e+05 1.00000000e+00]
 [0.00000000e+00 1.00000000e+00 4.90000000e+01 ... 1.00000000e+00
  1.00075102e+05 1.00000000e+00]
 ...
 [1.00000000e+00 0.00000000e+00 4.20000000e+01 ... 0.00000000e+00
  1.03516078e+05 1.00000000e+00]
 [0.00000000e+00 0.00000000e+00 3.90000000e+01 ... 0.00000000e+00
  1.15287992e+05 1.00000000e+00]
 [0.00000000e+00 0.00000000e+00 5.50000000e+01 ... 1.00000000e+00
  1.96794109e+05 1.00000000e+00]]
Number of training records 1527
**************************************************
X_test: [[0.00000000e+00 0.00000000e+00 5.50000000e+01 ... 1.00000000e+00
  6.75398516e+04 1.00000000e+00]
 [1.00000000e+00 0.00000000e+00 4.70000000e+01 ... 1.00000000e+00
  4.77771484e+04 1.00000000e+00]
 [0.00000000e+00 1.00000000e+00 3.40000000e+01 ... 1.00000000e+00
  2.57441309e+04 1.00000000e+00]
 ...
 [1.

In [83]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.25882352941176473

In [84]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(y_pred)
print(y_test)

0.24509803921568626
[3 2 3 2 0 3 0 3 2 3 2 2 3 2 0 3 2 3 0 3 0 2 3 3 3 0 0 0 0 3 3 2 3 2 2 3 3
 0 0 3 3 3 3 3 0 3 0 0 3 2 3 3 0 2 0 0 2 0 2 3 1 2 3 3 2 3 2 0 0 0 3 0 2 2
 3 2 3 0 3 3 3 3 3 3 2 2 2 0 2 2 2 0 3 0 2 3 3 0 1 0 3 2 0 2 3 2 2 3 3 0 0
 3 0 3 3 2 3 3 2 0 0 2 3 2 3 0 1 3 2 3 3 3 3 0 0 2 2 2 2 3 3 0 3 2 3 0 0 3
 3 2 3 0 2 0 0 3 0 1 2 0 3 0 3 3 2 3 0 3 3 0 3 3 3 3 0 0 0 2 3 3 2 2 0 0 0
 0 2 0 3 0 3 2 3 0 2 2 3 2 3 3 0 0 3 0 0 3 3 2 3 0 0 0 2 3 2 3 3 0 3 2 3 0
 3 0 0 3 2 2 3 2 3 3 0 2 0 3 0 3 0 3 0 3 0 3 0 3 0 2 2 0 0 3 3 3 3 2 3 2 3
 0 2 3 3 0 3 3 3 3 3 2 0 0 3 2 0 3 0 2 3 3 0 0 2 3 3 0 2 0 2 2 3 2 3 2 2 0
 0 3 2 0 3 0 0 3 3 2 2 3 3 2 0 3 3 0 2 3 2 2 3 2 0 3 0 2 3 2 2 0 0 0 3 3 2
 2 2 2 1 3 0 3 2 0 3 2 3 2 2 3 3 3 3 3 1 0 3 0 0 3 3 3 3 2 3 3 0 1 2 3 3 3
 3 3 0 2 0 3 2 3 3 3 3 0 0 3 0 0 3 3 3 0 3 0 3 3 0 2 0 3 0 1 3 3 3 0 2 0 3
 0 3 2 3 1 0 3 0 0 3 0 0 3 0 3 0 0 3 2 3 3 3 3 3 2 0 3 3 2 2 2 0 2 3 3 2 2
 3 2 3 0 0 3 3 2 2 3 0 3 3 2 3 3 2 3 0 2 2 2 0 3 2 2 0 2 0 2 2 1 3 0 2 2 3
 3 0 

In [85]:
from sklearn.naive_bayes import ComplementNB
clf = ComplementNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(y_pred)
print(y_test)

0.25882352941176473
[0 2 2 2 2 0 0 2 2 2 2 2 2 2 0 0 0 0 0 2 0 2 0 2 2 0 2 0 0 0 0 2 0 2 2 2 0
 2 0 2 2 0 0 0 0 0 2 2 2 2 0 0 0 2 0 2 2 0 2 0 0 0 0 2 2 2 2 0 0 0 0 0 2 2
 0 2 2 2 0 2 0 2 0 2 0 2 2 0 0 2 2 2 0 0 2 0 2 0 0 0 0 0 0 2 0 2 2 0 2 0 0
 0 2 0 0 2 0 2 0 0 0 0 2 2 2 2 0 2 0 0 2 2 0 0 0 2 2 2 2 0 0 2 0 2 2 0 0 0
 2 2 0 0 2 0 2 0 0 0 2 0 2 2 0 2 2 0 0 0 2 2 2 0 0 2 2 2 2 2 0 0 2 2 0 2 2
 2 2 0 2 2 0 2 0 0 0 2 2 0 2 0 2 0 0 2 2 0 3 0 2 2 0 2 2 0 0 0 2 0 2 2 0 2
 2 0 2 2 2 2 2 2 2 0 0 2 0 2 0 2 2 2 2 0 0 2 2 0 0 0 2 2 0 0 0 2 0 0 0 2 0
 2 2 0 0 0 0 2 0 0 0 2 2 2 0 2 0 2 0 2 0 2 2 2 2 0 0 0 2 0 0 2 0 0 2 2 2 0
 2 0 0 2 0 0 0 2 0 2 0 0 0 2 0 2 0 2 0 0 2 0 0 2 2 0 0 0 2 2 2 0 0 0 2 0 2
 0 0 2 0 2 0 0 2 0 0 0 0 2 2 0 2 2 2 2 0 0 0 0 0 0 2 0 0 2 0 0 0 2 2 0 2 0
 2 2 2 2 2 0 0 0 0 0 0 0 0 0 2 0 2 0 2 0 0 0 0 2 0 0 0 2 2 2 2 2 2 0 2 2 0
 2 0 2 2 2 0 2 2 2 0 0 2 2 0 2 0 2 2 2 0 0 2 0 2 2 0 2 2 0 0 2 0 0 0 0 0 2
 2 2 0 2 0 2 0 2 2 2 0 0 0 2 0 2 2 0 2 2 2 2 0 2 0 2 0 2 2 2 2 2 0 0 2 0 2
 0 0 

In [177]:
dataset = pd.read_csv('data.csv')
data_re=dataset[dataset['Exited']==1]
data_re.set_index('RowNumber',inplace=True)
dataset = data_re

In [180]:
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values

'High Service Charges/Rate of Interest'

In [181]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
onehot_encoder = OneHotEncoder(sparse=False)
y = y.reshape(len(y), 1)
Y = onehot_encoder.fit_transform(y)
print(Y[2])
print(Y.shape)

[0. 1. 0. 0.]
(2037, 4)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [182]:
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])


In [183]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.25, random_state = 0,stratify=Y)


In [184]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [199]:
from keras.models import load_model

from keras.models import Sequential
from keras.layers import Dense

classifier = Sequential() 
classifier.add(Dense(output_dim = 6, init = 'uniform', activation = 'relu', input_dim = 10))
classifier.add(Dense(output_dim = 6, init = 'uniform', activation = 'sigmoid'))
classifier.add(Dense(output_dim = 6, init = 'uniform', activation = 'sigmoid'))
classifier.add(Dense(output_dim = 4, init = 'uniform', activation = 'sigmoid'))
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy')



  import sys
  
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


In [200]:
classifier.fit(X_train, y_train,batch_size=10,nb_epoch=50)

  """Entry point for launching an IPython kernel.


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f28d3cd3240>