In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df=pd.read_csv('Churn_Modelling.csv')

In [3]:
df.head(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [4]:
#Taking Independent and Dependent Features separtely
X = df.iloc[:,3:13]
y = df.iloc[:,13]

In [5]:
X.head(3)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.0,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.8,3,1,0,113931.57


converting the categorical feature column (Geography, Gender) into numeric using one - hot encoding

In [6]:
geography=pd.get_dummies(X['Geography'],drop_first=True)

In [7]:
geography.head()

Unnamed: 0,Germany,Spain
0,0,0
1,0,1
2,0,0
3,0,0
4,0,1


In [8]:
gender=pd.get_dummies(X['Gender'],drop_first=True)

In [9]:
gender.head()

Unnamed: 0,Male
0,0
1,0
2,0
3,0
4,0


In [10]:
X.head(3)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.0,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.8,3,1,0,113931.57


In [11]:
# Drop Categorical Features
X=X.drop(['Geography','Gender'],axis=1)

In [12]:
# concatinating the one hot encoded datframe
X=pd.concat([X,geography,gender],axis=1)

In [13]:
X.head(3)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Germany,Spain,Male
0,619,42,2,0.0,1,1,1,101348.88,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,0,0,0


## Train test split

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# Xg Boost

In [15]:
import xgboost
xg = xgboost.XGBClassifier()
xgboost_model = xg.fit(X_train, y_train)





In [16]:
y_pred = xgboost_model.predict(X_test)

from sklearn.metrics import accuracy_score
score = accuracy_score(y_pred, y_test)
print(score)

0.865


# XgBoost with  Hyper Parameter Optimization

In [17]:
import xgboost
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [18]:
classifier = xgboost.XGBClassifier()

In [19]:
params={
 "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
 "min_child_weight" : [ 1, 3, 5, 7 ],
 "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]  
}

In [20]:
random_search = RandomizedSearchCV(classifier, param_distributions=params, n_iter=5, scoring='roc_auc', n_jobs=1, cv=5, verbose=3)

In [21]:
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] min_child_weight=7, max_depth=8, learning_rate=0.1, gamma=0.4, colsample_bytree=0.7 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  min_child_weight=7, max_depth=8, learning_rate=0.1, gamma=0.4, colsample_bytree=0.7, score=0.857, total=   0.5s
[CV] min_child_weight=7, max_depth=8, learning_rate=0.1, gamma=0.4, colsample_bytree=0.7 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV]  min_child_weight=7, max_depth=8, learning_rate=0.1, gamma=0.4, colsample_bytree=0.7, score=0.849, total=   0.5s
[CV] min_child_weight=7, max_depth=8, learning_rate=0.1, gamma=0.4, colsample_bytree=0.7 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.9s remaining:    0.0s


[CV]  min_child_weight=7, max_depth=8, learning_rate=0.1, gamma=0.4, colsample_bytree=0.7, score=0.863, total=   0.5s
[CV] min_child_weight=7, max_depth=8, learning_rate=0.1, gamma=0.4, colsample_bytree=0.7 




[CV]  min_child_weight=7, max_depth=8, learning_rate=0.1, gamma=0.4, colsample_bytree=0.7, score=0.852, total=   0.5s
[CV] min_child_weight=7, max_depth=8, learning_rate=0.1, gamma=0.4, colsample_bytree=0.7 




[CV]  min_child_weight=7, max_depth=8, learning_rate=0.1, gamma=0.4, colsample_bytree=0.7, score=0.860, total=   0.5s
[CV] min_child_weight=1, max_depth=8, learning_rate=0.15, gamma=0.4, colsample_bytree=0.5 




[CV]  min_child_weight=1, max_depth=8, learning_rate=0.15, gamma=0.4, colsample_bytree=0.5, score=0.849, total=   0.5s
[CV] min_child_weight=1, max_depth=8, learning_rate=0.15, gamma=0.4, colsample_bytree=0.5 




[CV]  min_child_weight=1, max_depth=8, learning_rate=0.15, gamma=0.4, colsample_bytree=0.5, score=0.842, total=   0.5s
[CV] min_child_weight=1, max_depth=8, learning_rate=0.15, gamma=0.4, colsample_bytree=0.5 




[CV]  min_child_weight=1, max_depth=8, learning_rate=0.15, gamma=0.4, colsample_bytree=0.5, score=0.854, total=   0.5s
[CV] min_child_weight=1, max_depth=8, learning_rate=0.15, gamma=0.4, colsample_bytree=0.5 




[CV]  min_child_weight=1, max_depth=8, learning_rate=0.15, gamma=0.4, colsample_bytree=0.5, score=0.842, total=   0.5s
[CV] min_child_weight=1, max_depth=8, learning_rate=0.15, gamma=0.4, colsample_bytree=0.5 




[CV]  min_child_weight=1, max_depth=8, learning_rate=0.15, gamma=0.4, colsample_bytree=0.5, score=0.853, total=   0.5s
[CV] min_child_weight=7, max_depth=10, learning_rate=0.15, gamma=0.0, colsample_bytree=0.4 




[CV]  min_child_weight=7, max_depth=10, learning_rate=0.15, gamma=0.0, colsample_bytree=0.4, score=0.851, total=   0.4s
[CV] min_child_weight=7, max_depth=10, learning_rate=0.15, gamma=0.0, colsample_bytree=0.4 




[CV]  min_child_weight=7, max_depth=10, learning_rate=0.15, gamma=0.0, colsample_bytree=0.4, score=0.843, total=   0.5s
[CV] min_child_weight=7, max_depth=10, learning_rate=0.15, gamma=0.0, colsample_bytree=0.4 




[CV]  min_child_weight=7, max_depth=10, learning_rate=0.15, gamma=0.0, colsample_bytree=0.4, score=0.856, total=   0.5s
[CV] min_child_weight=7, max_depth=10, learning_rate=0.15, gamma=0.0, colsample_bytree=0.4 




[CV]  min_child_weight=7, max_depth=10, learning_rate=0.15, gamma=0.0, colsample_bytree=0.4, score=0.842, total=   0.5s
[CV] min_child_weight=7, max_depth=10, learning_rate=0.15, gamma=0.0, colsample_bytree=0.4 




[CV]  min_child_weight=7, max_depth=10, learning_rate=0.15, gamma=0.0, colsample_bytree=0.4, score=0.848, total=   0.5s
[CV] min_child_weight=7, max_depth=8, learning_rate=0.3, gamma=0.1, colsample_bytree=0.7 




[CV]  min_child_weight=7, max_depth=8, learning_rate=0.3, gamma=0.1, colsample_bytree=0.7, score=0.840, total=   0.4s
[CV] min_child_weight=7, max_depth=8, learning_rate=0.3, gamma=0.1, colsample_bytree=0.7 




[CV]  min_child_weight=7, max_depth=8, learning_rate=0.3, gamma=0.1, colsample_bytree=0.7, score=0.832, total=   0.4s
[CV] min_child_weight=7, max_depth=8, learning_rate=0.3, gamma=0.1, colsample_bytree=0.7 




[CV]  min_child_weight=7, max_depth=8, learning_rate=0.3, gamma=0.1, colsample_bytree=0.7, score=0.852, total=   0.4s
[CV] min_child_weight=7, max_depth=8, learning_rate=0.3, gamma=0.1, colsample_bytree=0.7 




[CV]  min_child_weight=7, max_depth=8, learning_rate=0.3, gamma=0.1, colsample_bytree=0.7, score=0.839, total=   0.4s
[CV] min_child_weight=7, max_depth=8, learning_rate=0.3, gamma=0.1, colsample_bytree=0.7 




[CV]  min_child_weight=7, max_depth=8, learning_rate=0.3, gamma=0.1, colsample_bytree=0.7, score=0.845, total=   0.4s
[CV] min_child_weight=3, max_depth=12, learning_rate=0.1, gamma=0.4, colsample_bytree=0.4 




[CV]  min_child_weight=3, max_depth=12, learning_rate=0.1, gamma=0.4, colsample_bytree=0.4, score=0.847, total=   0.6s
[CV] min_child_weight=3, max_depth=12, learning_rate=0.1, gamma=0.4, colsample_bytree=0.4 




[CV]  min_child_weight=3, max_depth=12, learning_rate=0.1, gamma=0.4, colsample_bytree=0.4, score=0.839, total=   0.6s
[CV] min_child_weight=3, max_depth=12, learning_rate=0.1, gamma=0.4, colsample_bytree=0.4 




[CV]  min_child_weight=3, max_depth=12, learning_rate=0.1, gamma=0.4, colsample_bytree=0.4, score=0.853, total=   0.6s
[CV] min_child_weight=3, max_depth=12, learning_rate=0.1, gamma=0.4, colsample_bytree=0.4 




[CV]  min_child_weight=3, max_depth=12, learning_rate=0.1, gamma=0.4, colsample_bytree=0.4, score=0.843, total=   0.6s
[CV] min_child_weight=3, max_depth=12, learning_rate=0.1, gamma=0.4, colsample_bytree=0.4 




[CV]  min_child_weight=3, max_depth=12, learning_rate=0.1, gamma=0.4, colsample_bytree=0.4, score=0.847, total=   0.7s


[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:   12.5s finished


RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100,...
                                           reg_lambda=None,
                                           scale_pos_weight=None,
                                           subsample=No

In [22]:
random_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0.4, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=8,
              min_child_weight=7, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [23]:
random_search.best_params_

{'min_child_weight': 7,
 'max_depth': 8,
 'learning_rate': 0.1,
 'gamma': 0.4,
 'colsample_bytree': 0.7}

In [24]:
best_classifier_model = xgboost.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0.1, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=np.nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [25]:
best_classifier_model.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0.1, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [26]:
y_pred = best_classifier_model.predict(X_test)

from sklearn.metrics import accuracy_score
score = accuracy_score(y_pred, y_test)
print(score)

0.8825
