In [None]:
import pandas as pd
import numpy as np
from numpy.core.fromnumeric import mean

from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler

from sklearn import metrics

In [None]:
## Read dataset
# Load the churn data from CSV; the 'RowNumber' column becomes the DataFrame index.
d = pd.read_csv(
    'churn_modeling.csv',
    index_col='RowNumber',
)

In [None]:
## Encoding

# Encode Geography into three 0/1 indicator columns, one per country.
# A vectorised comparison replaces the row-wise ``apply``; the resulting values are
# the same (1 where the row matches the country, 0 otherwise).
for country in ('Spain', 'France', 'Germany'):
    d[country] = (d['Geography'] == country).astype('int64')

# Encode Gender as a single 0/1 indicator: 1 for 'Female', 0 for any other value.
d['Female'] = (d['Gender'] == 'Female').astype('int64')

In [None]:
# Remove the raw 'Geography' and 'Gender' columns (replaced by the indicator columns)
# together with 'Surname' and 'CustomerId', which are not used for prediction.
columns_to_drop = ['Geography', 'Gender', 'Surname', 'CustomerId']
d = d.drop(columns=columns_to_drop)

In [None]:
# Feature transformation of Balance and Estimated Salary
# Null check (the original author's note records that none were found).
# As a mid-cell expression its result is only visible when run on its own.
d.isnull().sum()

# The natural logarithm is only defined for positive values, so every row whose
# Balance or EstimatedSalary is not strictly positive is removed from the dataset.
# NOTE(review): this discards all zero-balance customers -- confirm that is intended.
positive_rows = (d['Balance'] > 0) & (d['EstimatedSalary'] > 0)
# .copy() yields an independent frame, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning (the original assigned into a filtered slice).
d = d.loc[positive_rows].copy()

# Transform both columns to their natural logarithm with one vectorised call each.
d['EstimatedSalary'] = np.log(d['EstimatedSalary'])
d['Balance'] = np.log(d['Balance'])
d

Unnamed: 0_level_0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Spain,France,Germany,Female
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2,608,41,1,11.336282,1,0,1,11.631087,0,1,0,0,1
3,502,42,8,11.980807,3,1,0,11.643353,1,0,1,0,1
5,850,43,2,11.740147,1,1,1,11.278267,0,1,0,0,1
6,645,44,8,11.641809,2,1,0,11.916767,1,1,0,0,0
8,376,29,4,11.653094,4,1,0,11.689789,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9992,597,53,4,11.389415,1,1,0,11.147422,1,0,1,0,1
9994,644,28,7,11.951570,1,1,0,10.281222,0,0,1,0,0
9997,516,35,10,10.957270,1,1,1,11.529780,0,0,1,0,0
9999,772,42,3,11.226247,2,1,0,11.439155,1,0,0,1,0


In [None]:
# Dataset balancing for 'Exited'

## See whether the dataset needs to be rebalanced with regard to exit rates
# (the name `exit` is kept from the original cell; note that it shadows the builtin)
exit = d[d['Exited'] == 1]
no_exit = d[d['Exited'] == 0]
print(len(exit)/(len(exit)+len(no_exit))) # Share of exited customers before balancing

## Balancing the data by random oversampling of the exited customers:
## draw (with replacement) as many exited rows as there are non-exited rows.
# NOTE(review): the duplication happens before the train/test split, so copies of
# the same customer can end up in both partitions (and in different CV folds), which
# can inflate the reported scores -- consider oversampling the training data only.
exit_index = exit.index
np.random.seed(86) #Random seed to reproduce results
random_exit_indexes = np.random.choice(exit_index, len(no_exit))
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
balanced_data = pd.concat([no_exit, d.loc[random_exit_indexes]])

## Test whether the rebalancing worked.
# The ratio is computed from balanced_data itself; the original re-printed the
# pre-balancing ratio, so the check could never show the effect of the rebalancing.
balanced_exit = balanced_data[balanced_data['Exited'] == 1]
print(len(balanced_exit)/len(balanced_data)) # Share of exited customers after balancing

0.2407958640137866
0.2407958640137866


In [None]:
## Test-Training Dataset

# Separate the 'Exited' target from the predictor columns, then hold out a test
# partition using train_test_split's default split size and a fixed seed.
target_column = 'Exited'
X = balanced_data.drop(columns=[target_column])
Y = balanced_data[target_column]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=17,
)


In [None]:
## Logistic Regression not tuned
# Logistic regression with built-in cross-validation, scored by ROC AUC.
logres = LogisticRegressionCV(max_iter=500, scoring='roc_auc')
logres.fit(X_train, y_train)

# NOTE(review): cross_val_score clones `logres` and refits the clone on folds of the
# test partition, so the fit on the training data above does not feed into this score.
n_folds = 5
logres_model_scores = cross_val_score(logres, X_test, y_test, cv=n_folds, scoring='roc_auc')

logres_mean_auc = round(np.mean(logres_model_scores), 4)
print('The average roc_auc is {0} from the {1}-folded cross-validated Logistic Regression'.format(str(logres_mean_auc), str(n_folds)))


The average roc_auc is 0.7443 from the 5-folded cross-validated Logistic Regression


In [None]:
### SVC Model not tuned
# Standardise the features, then fit a support vector classifier with default settings.
# The step names are the ones make_pipeline would generate (lower-cased class names).
svc_model = Pipeline([
    ('standardscaler', StandardScaler()),
    ('svc', SVC()),
])
svc_model.fit(X_train, y_train)

# 5-fold cross-validated ROC AUC, computed on the test partition.
svc_model_scores = cross_val_score(svc_model, X_test, y_test, scoring='roc_auc', cv=5)
svc_mean_auc = round(np.mean(svc_model_scores), 3)
print('From the not tuned SVC StandardScaler the average roc_auc is {0}'.format(str(svc_mean_auc)))

From the not tuned SVC StandardScaler the average roc_auc is 0.832


In [None]:
### SVC Model tuned with GridSearch
# Parameters that should be tuned.
# The grid is split per kernel so that only parameters the kernel actually uses are
# searched: 'gamma' has no effect on the linear kernel, and 'degree' only applies to
# the 'poly' kernel (excluded because it scored low and takes long). Searching those
# ignored parameters only repeated identical fits (90 candidates for 18 distinct models).
svc_c_values = [0.1, 1, 10] #The larger the C the higher the penalty for mistakes, suggested [0.1:100]
svc_gamma_values = [0.0001, 0.001, 0.01, 0.1, 1] #The larger the gamma the more overfitted, suggested [0.001:10]
tuned_parameter = [
    {'svc__kernel': ['rbf'],
     'svc__C': svc_c_values,
     'svc__gamma': svc_gamma_values},
    {'svc__kernel': ['linear'],
     'svc__C': svc_c_values},
]

## Constructing the grid search around the SVC pipeline
# With a single scoring metric, refit is a plain boolean (a metric name is only
# meaningful for multi-metric scoring).
gs_scv_model = GridSearchCV(svc_model,
                      param_grid=tuned_parameter,
                      scoring='roc_auc',
                      refit = True,
                      cv = 5)

#Fitting and testing the model
# Nested cross-validation: each of the 5 outer folds of the test partition runs the
# full inner grid search, so this cell is the most expensive one in the notebook.
gs_scv_model_scores = cross_val_score(gs_scv_model, X_test, y_test, scoring='roc_auc', cv=5)

print('From the tuned SVC StandardScaler the average roc_auc is {0}'.format(str(round(np.mean(gs_scv_model_scores),3))))

Fitting 3 folds for each of 90 candidates, totalling 270 fits
[CV 1/3] END svc__C=0.1, svc__degree=1, svc__gamma=0.0001, svc__kernel=rbf;, score=0.718 total time=   0.2s
[CV 2/3] END svc__C=0.1, svc__degree=1, svc__gamma=0.0001, svc__kernel=rbf;, score=0.746 total time=   0.2s
[CV 3/3] END svc__C=0.1, svc__degree=1, svc__gamma=0.0001, svc__kernel=rbf;, score=0.732 total time=   0.2s
[CV 1/3] END svc__C=0.1, svc__degree=1, svc__gamma=0.0001, svc__kernel=linear;, score=0.733 total time=   0.1s
[CV 2/3] END svc__C=0.1, svc__degree=1, svc__gamma=0.0001, svc__kernel=linear;, score=0.761 total time=   0.1s
[CV 3/3] END svc__C=0.1, svc__degree=1, svc__gamma=0.0001, svc__kernel=linear;, score=0.733 total time=   0.1s
[CV 1/3] END svc__C=0.1, svc__degree=1, svc__gamma=0.001, svc__kernel=rbf;, score=0.719 total time=   0.2s
[CV 2/3] END svc__C=0.1, svc__degree=1, svc__gamma=0.001, svc__kernel=rbf;, score=0.746 total time=   0.2s
[CV 3/3] END svc__C=0.1, svc__degree=1, svc__gamma=0.001, svc__kern

In [None]:
##GradientBoostingClassifier
# random_state makes the row subsampling (subsample < 1) and the internal validation
# split used for early stopping (n_iter_no_change) repeatable between runs.
gbc_model = GradientBoostingClassifier(learning_rate=0.01,
                                       max_depth=50,
                                       subsample=0.9,
                                       n_estimators = 1000,
                                       loss = 'exponential',
                                       n_iter_no_change = 10,
                                       random_state = 17)
# Fit on the training partition, like the other model cells; the original fitted
# this model on X_test/y_test, i.e. on the held-out data.
model = gbc_model.fit(X_train, y_train)
# cross_val_score clones the estimator and refits it per fold, so these scores come
# from 5-fold cross-validation on the test partition, as for the other models.
gbc_model_scores = cross_val_score(model, X_test, y_test, scoring='roc_auc', cv=5)
print('From the Gradient Boosting Classfier the average roc_auc score is {0}'.format(str(round(np.mean(gbc_model_scores), 4))))

From the Gradient Boosting Classfier the average roc_auc score is 0.8687


In [None]:
## Results
# Display labels and the matching arrays of cross-validation scores, in the same order.
models = ['Logistic Regression', 'Not tuned StandardScaler', 'StandardScaler tuned with Grid Search', 'Gradient Boosting Classifier']
scores = [logres_model_scores, svc_model_scores, gs_scv_model_scores, gbc_model_scores]

# Report the mean ROC AUC of each model, pairing every label with its scores.
for model_name, model_scores in zip(models, scores):
    mean_auc = round(np.mean(model_scores), 3)
    print('From the model {0} the average auc_score from a 5-folded cross valudation is {1}'.format(model_name, str(mean_auc)))

From the model Logistic Regression the average auc_score from a 5-folded cross valudation is 0.744
From the model Not tuned StandardScaler the average auc_score from a 5-folded cross valudation is 0.832
From the model StandardScaler tuned with Grid Search the average auc_score from a 5-folded cross valudation is 0.834
From the model Gradient Boosting Classifier the average auc_score from a 5-folded cross valudation is 0.869
