In [24]:
import numpy as np
import pandas as pd

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [11]:
diabetes = pd.read_csv('diabetes.csv')

In [12]:
diabetes

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [13]:
feature_columns = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']
X = diabetes.loc[:,feature_columns]
y = diabetes.loc[:,'Outcome']

X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2)


In [14]:
model=XGBClassifier(learning_rate=0.1,max_depth=5,n_estimators=400,subsample=1,colsample_bytree=1,eval_metric='auc',verbosity=1)

model.fit(X_train,y_train,early_stopping_rounds=20,eval_set = [(X_test,y_test)],verbose=True)

[0]	validation_0-auc:0.76328
[1]	validation_0-auc:0.77641
[2]	validation_0-auc:0.78888
[3]	validation_0-auc:0.78431
[4]	validation_0-auc:0.78365
[5]	validation_0-auc:0.78155
[6]	validation_0-auc:0.78631
[7]	validation_0-auc:0.78555
[8]	validation_0-auc:0.79155
[9]	validation_0-auc:0.79555
[10]	validation_0-auc:0.79602
[11]	validation_0-auc:0.79536
[12]	validation_0-auc:0.79697
[13]	validation_0-auc:0.79945
[14]	validation_0-auc:0.79526
[15]	validation_0-auc:0.79735
[16]	validation_0-auc:0.79393
[17]	validation_0-auc:0.79355
[18]	validation_0-auc:0.79298
[19]	validation_0-auc:0.79107
[20]	validation_0-auc:0.79716
[21]	validation_0-auc:0.79783
[22]	validation_0-auc:0.79631
[23]	validation_0-auc:0.79802
[24]	validation_0-auc:0.79973
[25]	validation_0-auc:0.79497
[26]	validation_0-auc:0.79650
[27]	validation_0-auc:0.79802




[28]	validation_0-auc:0.79631
[29]	validation_0-auc:0.79726
[30]	validation_0-auc:0.79669
[31]	validation_0-auc:0.79783
[32]	validation_0-auc:0.79973
[33]	validation_0-auc:0.80088
[34]	validation_0-auc:0.79973
[35]	validation_0-auc:0.80030
[36]	validation_0-auc:0.80049
[37]	validation_0-auc:0.79973
[38]	validation_0-auc:0.79935
[39]	validation_0-auc:0.79707
[40]	validation_0-auc:0.79555
[41]	validation_0-auc:0.79421
[42]	validation_0-auc:0.79459
[43]	validation_0-auc:0.79440
[44]	validation_0-auc:0.79402
[45]	validation_0-auc:0.79174
[46]	validation_0-auc:0.79345
[47]	validation_0-auc:0.79402
[48]	validation_0-auc:0.79250
[49]	validation_0-auc:0.79383
[50]	validation_0-auc:0.79516
[51]	validation_0-auc:0.79631
[52]	validation_0-auc:0.79593
[53]	validation_0-auc:0.79650


XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='auc', gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=400,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [15]:
y_train_pred = model.predict_proba(X_train)[:,1]
y_test_pred = model.predict_proba(X_test)[:,1]

print("AUC Train: {:.4f}\nAUC Test:{:.4f}".format(roc_auc_score(y_train,y_train_pred),roc_auc_score(y_test,y_test_pred)))

AUC Train: 0.9827
AUC Test:0.8009


In [16]:
hyper_params = {"learning_rate":[0.01,0.02,0.05,0.1],
                "max_depth":[3,5,6,7],
                "n_estimators":[100,300,500,700]
                }
hyper_params

{'learning_rate': [0.01, 0.02, 0.05, 0.1],
 'max_depth': [3, 5, 6, 7],
 'n_estimators': [100, 300, 500, 700]}

In [17]:
def my_roc_auc_score(model,X,y):
  return roc_auc_score(y,model.predict_proba(X)[:,1])

model_hyper = GridSearchCV(estimator=XGBClassifier(subsample=1,colsample_bytree=1,
                                                   eval_metric='auc',
                                                   use_label_encoder=False), param_grid = hyper_params, cv=3,scoring=my_roc_auc_score,return_train_score=True,verbose=True)
model_hyper.fit(X,y)

Fitting 3 folds for each of 64 candidates, totalling 192 fits


GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None, colsample_bytree=1,
                                     early_stopping_rounds=None,
                                     enable_categorical=False,
                                     eval_metric='auc', gamma=None, gpu_id=None,
                                     grow_policy=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_bin=None,
                                     max_cat_...
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, predictor=None,
                                     random_state=None, reg_a

In [18]:
hyper_results = pd.DataFrame(model_hyper.cv_results_)
hyper_results = hyper_results[['rank_test_score','mean_train_score','mean_test_score','param_learning_rate','param_max_depth','param_n_estimators']].sort_values(by='rank_test_score')

In [19]:
hyper_results


Unnamed: 0,rank_test_score,mean_train_score,mean_test_score,param_learning_rate,param_max_depth,param_n_estimators
3,1,0.953422,0.834445,0.01,3,700
17,2,0.947109,0.834443,0.02,3,300
32,3,0.941252,0.832620,0.05,3,100
2,4,0.940378,0.832250,0.01,3,500
18,5,0.969013,0.832118,0.02,3,500
...,...,...,...,...,...,...
58,60,1.000000,0.795994,0.1,6,500
62,61,1.000000,0.795359,0.1,7,500
12,62,0.979210,0.795316,0.01,7,100
59,63,1.000000,0.793057,0.1,6,700


In [20]:
#final model
final_model=XGBClassifier(learning_rate=0.01,max_depth=3,n_estimators=700,subsample=1,colsample_bytree=1,eval_metric='auc',verbosity=1)

final_model.fit(X_train,y_train,early_stopping_rounds=20,eval_set = [(X_test,y_test)],verbose=True)

[0]	validation_0-auc:0.72682
[1]	validation_0-auc:0.72682
[2]	validation_0-auc:0.72682
[3]	validation_0-auc:0.72682
[4]	validation_0-auc:0.72682
[5]	validation_0-auc:0.76994
[6]	validation_0-auc:0.76994
[7]	validation_0-auc:0.77927
[8]	validation_0-auc:0.77432
[9]	validation_0-auc:0.77870
[10]	validation_0-auc:0.77356
[11]	validation_0-auc:0.77775
[12]	validation_0-auc:0.78041
[13]	validation_0-auc:0.78441
[14]	validation_0-auc:0.78022
[15]	validation_0-auc:0.78098
[16]	validation_0-auc:0.78127
[17]	validation_0-auc:0.78146
[18]	validation_0-auc:0.78108
[19]	validation_0-auc:0.78079
[20]	validation_0-auc:0.78079
[21]	validation_0-auc:0.78060
[22]	validation_0-auc:0.78184
[23]	validation_0-auc:0.78279
[24]	validation_0-auc:0.78527
[25]	validation_0-auc:0.78774
[26]	validation_0-auc:0.78812
[27]	validation_0-auc:0.78450
[28]	validation_0-auc:0.78469
[29]	validation_0-auc:0.78622
[30]	validation_0-auc:0.78565
[31]	validation_0-auc:0.78460
[32]	validation_0-auc:0.78498
[33]	validation_0-au



[41]	validation_0-auc:0.78860
[42]	validation_0-auc:0.78765
[43]	validation_0-auc:0.78784
[44]	validation_0-auc:0.78707
[45]	validation_0-auc:0.78612
[46]	validation_0-auc:0.78936
[47]	validation_0-auc:0.78612
[48]	validation_0-auc:0.78660
[49]	validation_0-auc:0.78679
[50]	validation_0-auc:0.78546
[51]	validation_0-auc:0.78565
[52]	validation_0-auc:0.78546
[53]	validation_0-auc:0.78546
[54]	validation_0-auc:0.78717
[55]	validation_0-auc:0.78717
[56]	validation_0-auc:0.78717
[57]	validation_0-auc:0.78755
[58]	validation_0-auc:0.78755
[59]	validation_0-auc:0.78831
[60]	validation_0-auc:0.78831
[61]	validation_0-auc:0.78964
[62]	validation_0-auc:0.78917
[63]	validation_0-auc:0.78926
[64]	validation_0-auc:0.78983
[65]	validation_0-auc:0.78945
[66]	validation_0-auc:0.78945
[67]	validation_0-auc:0.78945
[68]	validation_0-auc:0.78964
[69]	validation_0-auc:0.79098
[70]	validation_0-auc:0.79193
[71]	validation_0-auc:0.79193
[72]	validation_0-auc:0.79107
[73]	validation_0-auc:0.79183
[74]	valid

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='auc', gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.01, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=700,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [21]:
predictions = final_model.predict(X_test)

In [22]:
values  = pd.DataFrame({'Actual': y_test[0:],'Predicted':predictions})
values

Unnamed: 0,Actual,Predicted
80,0,0
72,1,0
67,0,0
196,0,0
141,0,0
...,...,...
279,0,0
327,0,1
733,0,0
61,1,1


In [30]:
accuracy = accuracy_score(y_test,predictions)
print(accuracy)

0.7402597402597403
