### XGBoost

In [10]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pandas import DataFrame
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBClassifier

In [2]:
# Read the churn dataset from the working directory and preview its first rows.
dataset = pd.read_csv(filepath_or_buffer="Churn_Modelling.csv")
dataset.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [8]:
# Split the frame by position: the last column is the target,
# columns 3 through the second-to-last are the features.
y = dataset.iloc[:, -1].values
X = dataset.iloc[:, 3:-1].values

In [9]:
# Encode the categorical feature columns, then split into train/test sets.
# Positions 1 and 2 of X are the string-valued columns (Geography and Gender
# according to the head() preview of the dataset).
label_encoder_x1 = LabelEncoder()
X[:, 1] = label_encoder_x1.fit_transform(X[:, 1])

label_encoder_x2 = LabelEncoder()
X[:, 2] = label_encoder_x2.fit_transform(X[:, 2])

# `OneHotEncoder(categorical_features=[1])` was deprecated in scikit-learn 0.20
# and removed in 0.22; routing column 1 through a ColumnTransformer is the
# supported replacement. The dummy columns come first and the untouched columns
# follow, matching the layout the old API produced.
# drop="first" removes the first dummy column (avoids the dummy-variable trap),
# which is what the former `X = X[:, 1:]` slice did.
onehot_encoder = OneHotEncoder(drop="first")
column_transformer = ColumnTransformer(
    transformers=[("onehot", onehot_encoder, [1])],
    remainder="passthrough",  # keep every other column, appended after the dummies
    sparse_threshold=0,       # always return a dense ndarray
)
# NOTE: ColumnTransformer fits a clone, so the fitted encoder lives in
# `column_transformer.named_transformers_["onehot"]`, not in `onehot_encoder`.
# X mixes strings and numbers (object dtype), so cast the stacked result to float.
X = column_transformer.fit_transform(X).astype(float)

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [11]:
# Baseline model: XGBoost with its default hyperparameters, trained on the training split.
classifier = XGBClassifier()
classifier.fit(X=X_train, y=y_train)

# Class predictions for the held-out test split.
y_pred = classifier.predict(X_test)

In [12]:
# Confusion matrix of the held-out labels against the current predictions
# (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[2291,   88],
       [ 305,  316]])

In [13]:
# 10-fold cross-validation of the baseline classifier on the training split.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10, n_jobs=-1)

# Report the per-fold accuracy summary as percentages.
for label_fmt, fold_stat in (
    ("Min: %.2f", min(accuracies)),
    ("Max: %.2f", max(accuracies)),
    ("Avg: %.2f", accuracies.mean()),
    ("Std Dev: %.2f", accuracies.std()),
):
    print(label_fmt % (fold_stat * 100))

Min: 85.12
Max: 88.16
Avg: 86.30
Std Dev: 0.97


In [16]:
# Exhaustive grid search over tree depth, learning rate, and number of estimators,
# scored by 10-fold cross-validated accuracy on the training split.
parameters = [
    dict(
        max_depth=[5, 10],
        learning_rate=[0.01, 0.005],
        n_estimators=[200, 300, 400],
    )
]

grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring="accuracy",
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)  # fit() returns the same search object

# Best hyperparameter combination and its mean cross-validated accuracy.
best_parameters = grid_search.best_params_
best_score = grid_search.best_score_

In [31]:
best_parameters

{'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 200}

In [24]:
# Rebuild the classifier from the three tuned hyperparameters and refit it
# on the training split.
tuned_kwargs = {
    name: best_parameters[name]
    for name in ("learning_rate", "max_depth", "n_estimators")
}
classifier_best = XGBClassifier(**tuned_kwargs)
classifier_best.fit(X_train, y_train)

# Predictions of the tuned model on the held-out test split
# (this overwrites the baseline model's y_pred).
y_pred = classifier_best.predict(X_test)

In [25]:
# 10-fold cross-validation of the tuned classifier on the training split.
accuracies = cross_val_score(classifier_best, X_train, y_train, cv=10, n_jobs=-1)

# Report the per-fold accuracy summary as percentages.
for label_fmt, fold_stat in (
    ("Min: %.2f", min(accuracies)),
    ("Max: %.2f", max(accuracies)),
    ("Avg: %.2f", accuracies.mean()),
    ("Std Dev: %.2f", accuracies.std()),
):
    print(label_fmt % (fold_stat * 100))

Min: 84.57
Max: 87.45
Avg: 86.10
Std Dev: 0.81
