# Machine learning classification model

##  1) Imports

In [3]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [7]:
df = pd.read_csv('Data/churn_data_scaled_minmax.csv', index_col=0)

In [10]:
df

Unnamed: 0,creditscore,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited,France,Germany,Spain
0,0.538,0.0,0.324324,0.2,0.000000,0.000000,1.0,1.0,0.506735,1.0,1.0,0.0,0.0
1,0.516,0.0,0.310811,0.1,0.334031,0.000000,0.0,1.0,0.562709,0.0,0.0,0.0,1.0
2,0.304,0.0,0.324324,0.8,0.636357,0.666667,1.0,0.0,0.569654,1.0,1.0,0.0,0.0
3,1.000,0.0,0.337838,0.2,0.500246,0.000000,1.0,1.0,0.395400,0.0,0.0,0.0,1.0
4,0.590,1.0,0.351351,0.8,0.453394,0.333333,1.0,0.0,0.748797,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9785,0.842,1.0,0.283784,0.5,0.000000,0.333333,1.0,0.0,0.481341,0.0,1.0,0.0,0.0
9786,0.332,1.0,0.229730,1.0,0.228657,0.000000,1.0,1.0,0.508490,0.0,1.0,0.0,0.0
9787,0.718,0.0,0.243243,0.7,0.000000,0.000000,0.0,1.0,0.210390,1.0,1.0,0.0,0.0
9788,0.844,1.0,0.324324,0.3,0.299226,0.333333,1.0,0.0,0.464429,1.0,0.0,1.0,0.0


## 2) Model selection

In [14]:

# I will perform a kfold cross validation
# dividing the dataframe into the data and the target 
X, y = df.drop('exited', axis=1).copy(), df['exited'].copy()

In [15]:
#Gradient boosting
cross_val_score(GradientBoostingClassifier(),X, y, cv = 5)

array([0.86159346, 0.86874362, 0.85444331, 0.86925434, 0.85086823])

In [16]:
#Random Forest
cross_val_score(RandomForestClassifier(), X, y, cv = 5)

array([0.86057201, 0.86210419, 0.85648621, 0.86465781, 0.85597549])

The Gradient Boosting classifier performs slightly better

## 3) Hyperparameter optimization


In [18]:
# Spliting the data into train, val and test
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# train is now 75% of the entire data set
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=1 - train_ratio)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio)) 

print(x_train.shape, x_val.shape, x_test.shape)


(7342, 12) (1468, 12) (980, 12)


In [19]:
# Setting all the parameters for the model
scoring='accuracy'
parameters = {
              'learning_rate': [0.01,0.05,0.1], 
              'max_depth': [3,4,5],
              'min_samples_leaf': [4,5,6],
              'n_estimators': [3,5,10,12],
              }

clf_gbm = GridSearchCV(GradientBoostingClassifier(), parameters, cv = 3, scoring=scoring)

clf_gbm.fit(x_train,y_train)


print(clf_gbm.best_params_)
print(clf_gbm.best_score_)

feature_importances = pd.DataFrame(clf_gbm.best_estimator_.feature_importances_,
                                   index = x_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances)
depth=clf_gbm.cv_results_["param_max_depth"]
score=clf_gbm.cv_results_["mean_test_score"]
params=clf_gbm.cv_results_["params"]

{'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 6, 'n_estimators': 12}
0.852901055758997
                 importance
age                0.423988
numofproducts      0.292322
isactivemember     0.146243
balance            0.058538
Germany            0.047308
creditscore        0.010689
estimatedsalary    0.008938
gender             0.004578
tenure             0.004338
France             0.003059
hascrcard          0.000000
Spain              0.000000


In [20]:
GB = GradientBoostingClassifier(learning_rate=0.1, max_depth=5, min_samples_leaf=6, n_estimators=12)
GB.fit(x_train, y_train)

GradientBoostingClassifier(max_depth=5, min_samples_leaf=6, n_estimators=12)

In [21]:
# Making predictions
val_pred = GB.predict(x_val)
test_pred = GB.predict(x_test)

In [22]:
# Evaluating the model
from sklearn.metrics import accuracy_score

print(GB.score(x_train, y_train))
print(accuracy_score(y_val, val_pred))
print(accuracy_score(y_test, test_pred))

0.8572596022882049
0.8487738419618529
0.8520408163265306
