# XG-Boost Model

In [1]:
# Importing all the libraries (kept together in this first cell so a fresh
# kernel can run the notebook top to bottom).
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
# train_test_split and cross_val_score are imported from sklearn.model_selection:
# the old sklearn.cross_validation module was deprecated in scikit-learn 0.18
# and removed in 0.20, so importing from it fails on current releases.
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation notices from the libraries above; consider narrowing
# it to specific warning categories.
warnings.filterwarnings("ignore")



In [2]:
# Importing the dataset
import os

# The CSV location can be overridden through the CHURN_DATA_PATH environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "CHURN_DATA_PATH",
    "/Users/omkarmutreja/Downloads/XGBoost/Churn_Modelling.csv",
)
df = pd.read_csv(DATA_PATH)
print(df.head())

# Creating feature and target array.
# Selection is positional: columns 3..12 become the feature matrix and
# column 13 the target. In the printed preview these are CreditScore through
# EstimatedSalary and Exited respectively; the first three columns
# (RowNumber, CustomerId, Surname) are left out.
X = df.iloc[:, 3:13].values
y = df.iloc[:, 13].values
print(X[:5])
print(y[:5])

   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         790

In [3]:
# Encoding categorical data
#
# The encoded matrix is rebuilt from `df` instead of overwriting X column by
# column, so re-running this cell always yields the same X (the in-place
# version re-encoded already-encoded columns on a second run).
# The one-hot step is done with numpy because
# OneHotEncoder(categorical_features=...) was removed in scikit-learn 0.22.
features_raw = df.iloc[:, 3:13].values

# Integer-code the two string columns (feature column 1 and feature column 2).
labelEncoder_X1 = LabelEncoder()
geography_codes = labelEncoder_X1.fit_transform(features_raw[:, 1])
labelEncoder_X2 = LabelEncoder()
gender_codes = labelEncoder_X2.fit_transform(features_raw[:, 2])

# One indicator column per class of feature column 1: row i of the identity
# matrix is the one-hot vector for code i.
geography_dummies = np.eye(len(labelEncoder_X1.classes_))[geography_codes]

# Final layout matches the previous encoder output: indicator columns first
# (minus the first one, dropped to avoid the dummy-variable trap), followed by
# the remaining features in their original order, all as floats.
X = np.column_stack([
    geography_dummies[:, 1:],
    features_raw[:, 0],
    gender_codes,
    features_raw[:, 3:],
]).astype(float)

In [4]:
# Splitting the dataset into training (80%) and testing (20%) sets.
# random_state is fixed so the split, and every score computed from it below,
# is the same on each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fitting the XG-Boost model (default hyperparameters) to the training set
xgBoost = XGBClassifier()
xgBoost.fit(X_train, y_train)
y_pred = xgBoost.predict(X_test)

# Confusion matrix on the held-out test set (rows: true class, columns: predicted class)
print(confusion_matrix(y_test, y_pred))

# Applying 10-fold cross validation on the training set only;
# the test set is not touched by this estimate.
accuracy = cross_val_score(estimator=xgBoost, X=X_train, y=y_train, cv=10)
print(accuracy)
print("Mean accuracy: ", accuracy.mean())

[[1555   38]
 [ 229  178]]
[0.87875 0.86    0.87125 0.85375 0.865   0.86375 0.87875 0.845   0.86625
 0.86   ]
Mean accuracy:  0.86425


In [5]:
# Comparing XG-Boost with other classification models.
# Each entry is [label printed in the report, estimator with default settings].
# The tree-based models are seeded so their scores are reproducible.
# NOTE(review): features are not scaled before KNN / SVM / logistic regression;
# that likely depresses their scores relative to the tree models - confirm
# before drawing conclusions from this table.
classifiers = [['DecisionTree :', DecisionTreeClassifier(random_state=42)],
               ['RandomForest :', RandomForestClassifier(random_state=42)],
               ['KNeighbours :', KNeighborsClassifier()],
               ['Support Vector Machine :', SVC()],
               ['Naive Bayes  :', GaussianNB()],
               ['LogisticReg :', LogisticRegression()]]

for name, classifier in classifiers:
    # cross_val_score trains its own clones, so this fit is not needed for the
    # score; it is kept so the estimators in `classifiers` end up trained on
    # the full training set, as before.
    classifier.fit(X_train, y_train)
    # Mean accuracy over 10 cross-validation folds of the training set.
    print(name, (cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10).mean()))

DecisionTree : 0.7935000000000001
RandomForest : 0.851375
KNeighbours : 0.759625
Support Vector Machine : 0.7962499999999999
Naive Bayes  : 0.7856250000000001
LogisticReg : 0.790625


In [6]:
# In the comparison above, XG-Boost has the highest mean cross-validated
# accuracy of the models tried. Any of these models could be improved further
# by tuning hyperparameters with a grid search; here that is done for XG-Boost.

# Candidate hyperparameter values: every combination is evaluated.
parameters = [
    {
        'max_depth': [2, 3, 4, 5, 6],
        'learning_rate': [0.5, 0.1, 0.01],
        'n_estimators': [10, 100, 200],
    }
]

# Exhaustive search scored by accuracy with 10-fold cross validation on the
# training set, using all available CPU cores.
grid_search = GridSearchCV(
    estimator=xgBoost,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best mean cross-validated accuracy and the combination that achieved it.
print(grid_search.best_score_)
print(grid_search.best_params_)

0.864875
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100}


In [None]:
# After tuning the parameters of the XG-Boost model, the mean cross-validated accuracy increases only marginally, from 0.86425 to 0.864875 (about 86.4% to 86.5%)