In [1]:
import pandas as pd
import numpy as np

from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import cross_val_score
from sklearn import metrics


In [2]:
# Read dataset
d = pd.read_csv('churn_modeling.csv')
d

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [3]:
# Feature transformation: some suggested tasks
# - Detect and impute null values
# - Detect and encode categorical columns
# - Drop the columns not used in the model
# - Log transformations (if needed)d = d.drop(columns=['RowNumber', 'CustomerId', 'Surname'])
d = d.drop(columns=['RowNumber', 'CustomerId', 'Surname'])
for c in ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary']:
    d[c] = (d[c] - d[c].mean()) / d[c].std()
d = pd.get_dummies(d, prefix = 'oh', columns = ['Geography', 'Gender'])
d

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,oh_France,oh_Germany,oh_Spain,oh_Female,oh_Male
0,-0.326205,0.293503,2,-1.225786,-0.911538,1,1,0.021885,1,1,0,0,1,0
1,-0.440014,0.198154,1,0.117344,-0.911538,0,1,0.216523,0,0,0,1,1,0
2,-1.536717,0.293503,8,1.332987,2.526930,1,0,0.240675,1,1,0,0,1,0
3,0.501496,0.007456,1,-1.225786,0.807696,0,0,-0.108912,0,1,0,0,1,0
4,2.063781,0.388852,2,0.785689,-0.911538,1,1,-0.365258,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1.246426,0.007456,5,-1.225786,0.807696,1,0,-0.066416,0,1,0,0,0,1
9996,-1.391870,-0.373939,10,-0.306363,-0.911538,1,1,0.027987,0,1,0,0,0,1
9997,0.604958,-0.278590,7,-1.225786,-0.911538,0,1,-1.008593,1,1,0,0,1,0
9998,1.256772,0.293503,3,-0.022606,0.807696,1,0,-0.125224,1,0,1,0,0,1


In [4]:
# Dataset balancing
# -----------------
# In the last notebook I included a small program to show how undersample/oversampling can be implemented
# You can use the same program chunk or try to use the resampling function provided by scikit-learn:
# https://scikit-learn.org/stable/modules/generated/sklearn.utils.resample.html
# this function can be used both for undersample and onversampling
d_no_churn = d[d['Exited'] == 0]
d_churn = d[d['Exited'] == 1]
print(d_churn.shape[0])
d_no_churn_sampled = resample(d, replace=False, n_samples=d_churn.shape[0], random_state=42)

d = pd.concat([d_churn, d_no_churn_sampled])
d


2037


Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,oh_France,oh_Germany,oh_Spain,oh_Female,oh_Male
0,-0.326205,0.293503,2,-1.225786,-0.911538,1,1,0.021885,1,1,0,0,1,0
2,-1.536717,0.293503,8,1.332987,2.526930,1,0,0.240675,1,1,0,0,1,0
5,-0.057202,0.484200,8,0.597299,0.807696,1,0,0.863607,1,0,0,1,0,1
7,-2.840346,-0.946032,4,0.617988,4.246164,1,0,0.334837,1,0,1,0,1,0
16,0.025568,1.819084,1,0.899348,-0.911538,1,0,-1.651743,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,0.335955,-0.850683,10,-0.301750,-0.911538,0,0,-0.362531,0,1,0,0,0,1
9193,-0.553823,-1.232078,8,1.177990,-0.911538,1,0,-1.004333,0,1,0,0,1,0
4617,1.670623,-1.422776,10,-1.225786,0.807696,1,1,0.987923,0,1,0,0,0,1
3128,1.877548,-0.469288,6,0.988891,0.807696,0,1,0.807174,0,0,0,1,1,0


In [1]:
# At this point you have to include the code to train the models, tune their metaparameters and crossvalidate the results
# Try the following models:
# - Logistic regression (sklearn.linear_model.LinearRegression) No parameter to tune
# - Support Vector Machine (sklearn.svm.SVC) Choose three parameters to tune
# - Gradient Boost (sklearn.ensemble.GradientBoostingClassifier)
# Use the AUC as the score to compare the results

y = d['Exited']
X = d.drop(['Exited'], axis=1)

scores = cross_val_score(LogisticRegression(), X, y, cv=5, scoring='roc_auc')
print(scores)
"""

params = [{'C': [1, 2], 'kernel': ['poly'], 'degree': [1, 2]}]
models = GridSearchCV(SVC(), params, scoring='roc_auc')
models.fit(X, y)
print(models.best_score_)
print(models.best_params_)

params = [{'learning_rate': [0.02, 0.05, 0.1, 0.2], 'n_estimators': [200, 300], 'criterion': ['mse']}]
models = GridSearchCV(GradientBoostingClassifier(), params, scoring='roc_auc')
models.fit(X, y)
print(models.best_score_)
print(models.best_params_)
"""

NameError: name 'd' is not defined