In [1]:
import pandas as pd
import numpy as np

from sklearn.utils import resample
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC


from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [2]:
## Load the raw churn table; the CSV's RowNumber column is used as the row index
d = pd.read_csv(
    'churn_modeling.csv',
    index_col='RowNumber',
)

In [3]:
## Encoding of the categorical columns

# Geography -> one 0/1 indicator column per country (Spain, France, Germany).
# The comparison yields a boolean Series which is cast to integer 0/1.
d['Geography'].unique()
d['Spain'] = d['Geography'].eq('Spain').astype('int64')
d['France'] = d['Geography'].eq('France').astype('int64')
d['Germany'] = d['Geography'].eq('Germany').astype('int64')

# Gender -> single indicator: 1 for 'Female', 0 for every other value
d['Female'] = d['Gender'].eq('Female').astype('int64')

In [4]:
# Remove the raw text columns (already encoded above, or not used as features)
# together with the CustomerId identifier.
d = d.drop(
    labels=['Geography', 'Gender', 'Surname', 'CustomerId'],
    axis='columns',
)

In [5]:
# Feature transformation of Balance and Estimated Salary
# Checking for Null Values # None-There
d.isnull().sum()

# Transforming Balance and Estimated Salary to a logarithm.
# The logarithm is only defined for strictly positive values, so every row whose
# Balance or EstimatedSalary is <= 0 is REMOVED from the dataset here
# (this shrinks the data that all later cells work on).
positive_rows = (d['Balance'] > 0) & (d['EstimatedSalary'] > 0)

# .copy() gives an independent frame; assigning columns into the result of a
# boolean filter without it raises pandas' SettingWithCopyWarning.
d = d.loc[positive_rows].copy()

# np.log is vectorised, so the whole column is transformed in one call.
# NOTE: this cell is not idempotent - re-running it takes the log of the log.
d['EstimatedSalary'] = np.log(d['EstimatedSalary'])
d['Balance'] = np.log(d['Balance'])
d

Unnamed: 0_level_0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Spain,France,Germany,Female
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2,608,41,1,11.336282,1,0,1,11.631087,0,1,0,0,1
3,502,42,8,11.980807,3,1,0,11.643353,1,0,1,0,1
5,850,43,2,11.740147,1,1,1,11.278267,0,1,0,0,1
6,645,44,8,11.641809,2,1,0,11.916767,1,1,0,0,0
8,376,29,4,11.653094,4,1,0,11.689789,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9992,597,53,4,11.389415,1,1,0,11.147422,1,0,1,0,1
9994,644,28,7,11.951570,1,1,0,10.281222,0,0,1,0,0
9997,516,35,10,10.957270,1,1,1,11.529780,0,0,1,0,0
9999,772,42,3,11.226247,2,1,0,11.439155,1,0,0,1,0


In [6]:
# Dataset balancing for 'Exited'

## See whether dataset needs to be rebalanced in regards to exit rates
# NOTE: `exit` shadows the Python builtin of the same name; the variable name is
# kept unchanged so that anything else reading `exit` keeps working.
exit = d[d['Exited'] == 1]
no_exit = d[d['Exited'] == 0]
print(len(exit)/(len(exit)+len(no_exit))) # Share of customers that exited (minority class)

## Balancing the data by oversampling the exited customers
exit_index = exit.index
np.random.seed(86) #Random seed to reproduce results
# np.random.choice draws WITH replacement by default, so exited rows are repeated
# until there are exactly as many of them as non-exited rows.
random_exit_indexes = np.random.choice(exit_index, len(no_exit))
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat builds the same stacked frame and works on old and new versions.
balanced_data = pd.concat([no_exit, d.loc[random_exit_indexes]])

## Test whether the rebalancing worked: both classes must now be equally frequent
assert (balanced_data['Exited'] == 1).sum() == (balanced_data['Exited'] == 0).sum()

# NOTE(review): the duplicated rows are created before any train/test split or
# cross-validation, so copies of the same customer can end up on both sides of
# a split and make the reported scores optimistic. Consider oversampling only
# the training portion.

0.2407958640137866


In [7]:
## Logistic Regression baseline
## (LogisticRegressionCV selects the regularisation strength C by its own
##  internal cross-validation; no other hyper-parameter is tuned here)

Y = balanced_data['Exited']
X = balanced_data.drop(columns=['Exited'])

## Choosing a 3-folded Cross Validation
N_FOLDS = 3

# The estimator is deliberately NOT fitted on the full data first:
# cross_val_score clones it and refits the clone on every fold, so a prior
# .fit(X, Y) would only cost time and its result would be thrown away.
clf = LogisticRegressionCV(scoring='roc_auc', max_iter=500)

# No `scoring` argument is given, so every fold is evaluated with clf.score(),
# which for LogisticRegressionCV uses its own `scoring` option (ROC AUC here).
CVC_balance = cross_val_score(clf, X, Y, cv=N_FOLDS)
average = CVC_balance.mean()


print(f'The average CVC score is {round(average, 4)} from the {N_FOLDS}-folded cross-validated Logistic Regression')
print(f'The maximal CVC score is {round(CVC_balance.max(), 4)} from the {N_FOLDS}-folded cross-validated Logistic Regression')



The average CVC score is 0.7609 from the 3-folded cross-validated Logistic Regression
The maximal CVC score is 0.7737 from the 3-folded cross-validated Logistic Regression


In [8]:
# Hold-out split with a fixed seed (default test share of train_test_split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=86,
)

# Standardise the features, then fit an SVC with its default settings.
# The step names are spelled out and are the same ones make_pipeline would
# generate ('standardscaler', 'svc'), so parameter names like 'svc__C' apply.
clf = Pipeline(steps=[
    ('standardscaler', StandardScaler()),
    ('svc', SVC()),
])
clf.fit(X_train, y_train)

# Accuracy on the held-out data (value is computed but not displayed, because
# only the last expression of a cell is rendered)
clf.score(X_test, y_test)

# List every parameter name that can be addressed in a grid search
clf.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'standardscaler', 'svc', 'standardscaler__copy', 'standardscaler__with_mean', 'standardscaler__with_std', 'svc__C', 'svc__break_ties', 'svc__cache_size', 'svc__class_weight', 'svc__coef0', 'svc__decision_function_shape', 'svc__degree', 'svc__gamma', 'svc__kernel', 'svc__max_iter', 'svc__probability', 'svc__random_state', 'svc__shrinking', 'svc__tol', 'svc__verbose'])

In [None]:
## Parameters chosen to be tuned

# One sub-grid per kernel: `gamma` and `degree` are ignored by the linear
# kernel, so crossing them with kernel='linear' would only refit identical
# models (16 candidates in a single crossed grid vs. 10 distinct ones here).
tuned_parameter = [
    {'svc__kernel': ['linear'],
     'svc__C': [0.1, 1]},
    {'svc__kernel': ['poly'],
     'svc__C': [0.1, 1],
     'svc__gamma': [1, 2],
     'svc__degree': [1, 2]},
]


gs_scv = GridSearchCV(clf,
                      param_grid=tuned_parameter,
                      scoring='roc_auc',
                      cv = 3) #Three folded Cross Validation
gs_scv.fit(X_train, y_train)

# With scoring='roc_auc', GridSearchCV.score evaluates the refitted best
# pipeline with that same metric on the held-out data.
score = gs_scv.score(X_test, y_test)

# Report the outcome explicitly: a bare expression in the middle of a cell is
# not rendered, so the scores have to be printed to become visible.
print(gs_scv.best_params_)
print('The best mean cross-validated ROC AUC of the search is {0}'.format(round(gs_scv.best_score_, 4)))
print('The ROC AUC of the tuned model on the test set is {0}'.format(round(score, 4)))