In [14]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
from keras.models import load_model
from keras import backend as K
from werkzeug import secure_filename
import json
import csv
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score

In [15]:
# Load the customer table from the CSV file in the working directory.
dataset = pd.read_csv('data.csv')

# Rows where Exited == 1, indexed by RowNumber. set_index is used in its
# non-mutating form so the filtered selection is never modified in place.
data_re = dataset[dataset['Exited'] == 1].set_index('RowNumber')

# Last expression of the cell: display the column labels of the full table.
dataset.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited',
       'Reason for exiting company'],
      dtype='object')

In [16]:
# Drop every row that contains at least one missing value.
dataset = dataset.dropna()

# Sanity check: prints False when no null values remain.
print(dataset.isnull().values.any())

# Print the dtype / non-null summary first, then leave the preview as the
# cell's last expression so it is actually rendered. A bare `dataset.head()`
# in the middle of a cell is evaluated and silently discarded.
dataset.info()
dataset.head()


False
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 15 columns):
RowNumber                     10000 non-null int64
CustomerId                    10000 non-null int64
Surname                       10000 non-null object
CreditScore                   10000 non-null int64
Geography                     10000 non-null object
Gender                        10000 non-null object
Age                           10000 non-null int64
Tenure                        10000 non-null int64
Balance                       10000 non-null float64
NumOfProducts                 10000 non-null int64
HasCrCard                     10000 non-null int64
IsActiveMember                10000 non-null int64
EstimatedSalary               10000 non-null float64
Exited                        10000 non-null int64
Reason for exiting company    10000 non-null object
dtypes: float64(2), int64(9), object(4)
memory usage: 1.2+ MB


In [17]:
# Cast Geography / Gender to pandas categoricals and the two float columns
# to 32-bit floats in a single astype call.
dataset = dataset.astype({
    'Geography': 'category',
    'Gender': 'category',
    'EstimatedSalary': 'float32',
    'Balance': 'float32',
})



In [18]:
from scipy.stats import kurtosis, skew

# Preview of the first rows while the categorical columns still hold labels.
print('before converting', dataset.head())

# Replace every category-typed column with its integer category codes.
cat_columns = dataset.select_dtypes(['category']).columns
category_frame = dataset[cat_columns]
dataset[cat_columns] = category_frame.apply(lambda column: column.cat.codes)

print('************************************')

# Preview of the last rows after the conversion.
print('after converting', dataset.tail())


before converting    RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure        Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.000000              1          1               1   
1       1   83807.859375              1          0               1   
2       8  159660.796875              3          1               0   
3       1       0.000000              2          0               0   
4       2  125510.820312              1          1               1   

   EstimatedSalary  Exited             Reason for exiting company  
0    101348.882812       1  High Service Charges

In [19]:
from sklearn.model_selection import train_test_split

# Model inputs, selected by name rather than by position (previously
# iloc[:, 3:13]) so the selection cannot silently shift if the CSV gains,
# loses or reorders columns.
FEATURE_COLUMNS = [
    'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
    'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
]
# Label column (previously iloc[:, 13]).
TARGET_COLUMN = 'Exited'

X = dataset[FEATURE_COLUMNS].values
Y = dataset[TARGET_COLUMN].values

# Splitting the dataset into the Training set and Test set
# (25% held out, fixed seed, stratified on the label).
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=0, stratify=Y
)

# Last expression of the cell: display one held-out feature row.
X_test[2]

array([5.63000000e+02, 2.00000000e+00, 1.00000000e+00, 2.40000000e+01,
       7.00000000e+00, 0.00000000e+00, 2.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.63195596e+04])

In [20]:
# Show the training matrix and the number of rows it holds.
print("X:", X_train)
print("Number of training records", X_train.shape[0])
print("**************************************************")
# Same summary for the held-out matrix.
print("X_test:", X_test)
print("Number of testing records", X_test.shape[0])

X: [[5.44000000e+02 2.00000000e+00 0.00000000e+00 ... 1.00000000e+00
  0.00000000e+00 1.25692070e+05]
 [7.47000000e+02 1.00000000e+00 1.00000000e+00 ... 1.00000000e+00
  0.00000000e+00 8.92895391e+04]
 [6.42000000e+02 0.00000000e+00 1.00000000e+00 ... 1.00000000e+00
  1.00000000e+00 1.38052516e+05]
 ...
 [6.97000000e+02 1.00000000e+00 0.00000000e+00 ... 1.00000000e+00
  1.00000000e+00 7.38779980e+03]
 [7.22000000e+02 0.00000000e+00 0.00000000e+00 ... 1.00000000e+00
  1.00000000e+00 1.50681797e+04]
 [7.43000000e+02 2.00000000e+00 1.00000000e+00 ... 1.00000000e+00
  0.00000000e+00 1.29740109e+05]]
Number of training records 7500
**************************************************
X_test: [[6.97000000e+02 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 8.78033203e+04]
 [5.95000000e+02 2.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 1.05149797e+05]
 [5.63000000e+02 2.00000000e+00 1.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 1.63195596e+04]
 ...
 [6.

In [21]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training matrix only, then apply that same fitted
# transform to both the training and the held-out matrix.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [22]:
# Print the scaled training matrix followed by the scaled test matrix.
print(X_train, X_test, sep='\n')


[[-1.09958704  1.51817902 -1.09429439 ...  0.64510708 -1.03694883
   0.44273632]
 [ 1.00236569  0.30969434  0.91383087 ...  0.64510708 -1.03694883
  -0.18928067]
 [-0.08485124 -0.89879034  0.91383087 ...  0.64510708  0.96436774
   0.65733709]
 ...
 [ 0.48464334  0.30969434 -1.09429439 ...  0.64510708  0.96436774
  -1.61125022]
 [ 0.74350452 -0.89879034 -1.09429439 ...  0.64510708  0.96436774
  -1.47790425]
 [ 0.9609479   1.51817902  0.91383087 ...  0.64510708 -1.03694883
   0.51301796]]
[[ 0.48464334 -0.89879034 -1.09429439 ... -1.55013026 -1.03694883
  -0.21508425]
 [-0.57151025  1.51817902 -1.09429439 ... -1.55013026 -1.03694883
   0.08608349]
 [-0.90285255  1.51817902  0.91383087 ... -1.55013026 -1.03694883
  -1.45617792]
 ...
 [-0.34371242 -0.89879034  0.91383087 ...  0.64510708  0.96436774
   0.11903854]
 [-1.43092935  0.30969434 -1.09429439 ... -1.55013026 -1.03694883
  -1.33843698]
 [-0.14697792 -0.89879034 -1.09429439 ...  0.64510708 -1.03694883
   1.46708294]]


In [23]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 20 trees, depth capped at 20, fixed seed.
RFclassifier = RandomForestClassifier(
    n_estimators=20,
    max_depth=20,
    random_state=34,
).fit(X_train, y_train)

# Predict on the held-out rows; y_pred is overwritten by each classifier cell.
y_pred = RFclassifier.predict(X_test)

print("=== Accuracy Score ===")
# Last expression of the cell: the accuracy is shown as the cell output.
accuracy_score(y_true=y_test, y_pred=y_pred)

=== Accuracy Score ===


0.846

In [24]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline with the library's default settings.
clf_lr = LogisticRegression()
model = clf_lr.fit(X=X_train, y=y_train)

# Predict on the held-out rows; y_pred is overwritten by each classifier cell.
y_pred = model.predict(X=X_test)

# Last expression of the cell: the accuracy is shown as the cell output.
accuracy_score(y_true=y_test, y_pred=y_pred)



0.8036

In [31]:
import pickle

from sklearn.svm import SVC

# Support-vector classifier with a fixed seed and probability estimates on.
SVCclassifier = SVC(random_state=42, probability=True)
SVCclassifier.fit(X_train, y_train)

# Predict on the held-out rows; y_pred is overwritten by each classifier cell.
y_pred = SVCclassifier.predict(X_test)
print("=== Accuracy Score ===")
print(accuracy_score(y_test, y_pred))

# Persist the fitted model. The context manager closes the file handle even
# if pickling raises; the previous bare open(...) call never closed it.
filename = 'SVCclassifier.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(SVCclassifier, model_file)

=== Accuracy Score ===
0.8484


In [26]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree using the entropy split criterion and a fixed seed.
classifier = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)

# Predict on the held-out rows; y_pred is overwritten by each classifier cell.
y_pred = classifier.predict(X_test)

print("=== Accuracy Score ===")
print(accuracy_score(y_true=y_test, y_pred=y_pred))

=== Accuracy Score ===
0.79


In [29]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the held-out labels against y_pred.
# NOTE(review): y_pred is reassigned by every classifier cell, so this matrix
# describes whichever classifier cell was executed most recently -- confirm
# which model is intended before relying on the numbers.
results = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(results)

[[1945   46]
 [ 333  176]]


In [33]:

import sklearn

# Report the installed scikit-learn release.
version_message = 'The scikit-learn version is {}.'.format(sklearn.__version__)
print(version_message)


The scikit-learn version is 0.21.2.
