In [1]:
import warnings
warnings.filterwarnings('ignore')
# data cleaning
import numpy as np
import pandas as pd
from collections import Counter
# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

#data preprocessing
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold

#data modeling
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb

#Hyperparameter tuning
from sklearn.model_selection import GridSearchCV 

#model evaluation
from sklearn.metrics import precision_recall_fscore_support, classification_report
from sklearn.metrics import confusion_matrix,plot_confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, average_precision_score, f1_score, log_loss
from sklearn.metrics import roc_curve, auc, plot_roc_curve, roc_auc_score, plot_precision_recall_curve

**Read and inspect data**

In [2]:
df= pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
x= df.iloc[:,:-1]

In [4]:
y= df['Outcome']

In [5]:
x_train,x_test,y_train,y_test= train_test_split(x,y,test_size= 0.3, random_state= 123)

# Boosted tree with default hyperparameter

In [6]:
model1= xgb.XGBClassifier(objective ='reg:logistic',random_state= 42)
#objective: determines the loss function to be used like 
#reg:linear for regression problems, 
#reg:logistic for classification problems with only decision
#binary:logistic for classification problems with probability.

In [7]:
model1.fit(x_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='reg:logistic', predictor='auto',
              random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [8]:
model1.score(x_train,y_train)

1.0

In [9]:
model1.score(x_test,y_test)

0.7575757575757576

# Boosted tree with cross-validation

In [10]:
score= cross_val_score(model1,x,y,cv=20,scoring="accuracy")

In [11]:
score.mean()

0.7333333333333335

In [12]:
score.std()

0.07196621209343816

# Boosted tree with hyperparameter

**Commom hyperparameter**
- booster:  gbtree (default),gblinear or dart
- learning_rate: step size shrinkage used to prevent overfitting. Range is [0,1]
- max_depth: determines how deeply each tree is allowed to grow during any boosting round
- subsample: percentage of samples used per tree. Low value can lead to underfitting
- colsample_bytree: percentage of features used per tree. High value can lead to overfitting.
- n_estimators: number of trees you want to build.
- objective: determines the loss function to be used like 
    - reg:linear for regression problems, 
    - reg:logistic for classification problems with only decision
    - binary:logistic for classification problems with probability.

**regularization parameters to penalize models as they become more complex and reduce them to simple (parsimonious) model**
- gamma: controls whether a given node will split based on the expected reduction in loss after the split. A higher value leads to fewer splits. Supported only for tree-based learners.
- alpha: L1 regularization on leaf weights. A large value leads to more regularization.
- gambda: L2 regularization on leaf weights and is smoother than L1 regularization.

In [13]:
model2= xgb.XGBClassifier(learning_rate=0.1, max_depth=3,
                          random_state= 42,objective ='reg:logistic',
                          alpha=8,
                          subsample= 0.9)

In [14]:
model2.fit(x_train,y_train)

XGBClassifier(alpha=8, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='reg:logistic', predictor='auto',
              random_state=42, reg_alpha=8, reg_lambda=1, scale_pos_weight=1,
              subsample=0.9, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [15]:
model2.score(x_train,y_train)

0.8175046554934823

In [16]:
model2.score(x_test,y_test)

0.7748917748917749

In [17]:
score= cross_val_score(model2,x,y,cv=20,scoring="accuracy")
score.mean()

0.7620782726045884