In [1]:
import pandas as pd
import numpy as np 
import xgboost as xgb

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

%run util.ipynb

# Gradient Boosting Classifier & XGBoost

In [2]:
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
# https://xgboost.readthedocs.io/en/stable/python/sklearn_estimator.html

## Load Data

In [3]:
X, Y, df = get_data()

In [4]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3, random_state=42)

s = StandardScaler()
Xtrain = s.fit_transform(Xtrain)
Xtest = s.transform(Xtest)

### Gradient Boosting 

In [5]:
clf = GradientBoostingClassifier()

In [6]:
clf.fit(Xtrain, Ytrain)

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [7]:
YtestPred = clf.predict(Xtest)
accuracy_score(Ytest, YtestPred)

0.8229166666666666

In [8]:
print(classification_report(Ytest, YtestPred))

              precision    recall  f1-score   support

           0       0.82      0.84      0.83        50
           1       0.82      0.80      0.81        46

    accuracy                           0.82        96
   macro avg       0.82      0.82      0.82        96
weighted avg       0.82      0.82      0.82        96



In [9]:
### check for overfitting in training 

YtrainPred = clf.predict(Xtrain)
accuracy_score(Ytrain, YtrainPred)

1.0

### hyperparam tuning 

In [10]:
# NOTE no hyperparam tuning mentioned in paper for this model

In [11]:
clf.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'log_loss',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [None]:
# to reduce overfitting, we might try adjusting boosting rounds (estimators), max tree depth, minimum leaves and max features 

param_grid_gb = {
    'n_estimators': [1, 3, 5, 10],
    'max_depth': [1, 2, 3, 4, 5, 10, 15, 20, 25, 50, 100, 200],
    'min_samples_leaf': range(1,4),
    'max_features': range(1,31)
}

gridSearchGb = GridSearchCV(estimator=GradientBoostingClassifier(random_state=42), 
                            param_grid=param_grid_gb, cv=10, scoring='accuracy',
                             n_jobs=-1)

In [13]:
#gridSearchGb.fit(Xtrain, Ytrain)

# check best paramters and cv score

gridSearchGb.best_params_, gridSearchGb.best_score_

YtestPredGrid = gridSearchGb.predict(Xtest)
accuracy_score(YtestPredGrid, Ytest)

### check for overfitting

YtrainPredGrid = gridSearchGb.predict(Xtrain)
accuracy_score(Ytrain, YtrainPredGrid)

# show feature importances  

# TODO redo code and drop in models in future

importances = gridSearchGb.best_estimator_.feature_importances_
features = X.columns

feature_importance_pairs = sorted(zip(features, importances), key=lambda x: x[1], reverse=True)

# Print
for feature, importance in feature_importance_pairs:
    print(f"{feature}: {importance:.4f}")

### XGboost 

In [15]:
clfXG = xgb.XGBClassifier()


In [None]:
clfXG.fit(Xtrain, Ytrain, eval_set=[(Xtest, Ytest)])

In [17]:
YtestPredGX = clfXG.predict(Xtest)
accuracy_score(Ytest, YtestPredGX)

0.8020833333333334

In [19]:
print(classification_report(Ytest, YtestPred))

              precision    recall  f1-score   support

           0       0.82      0.84      0.83        50
           1       0.82      0.80      0.81        46

    accuracy                           0.82        96
   macro avg       0.82      0.82      0.82        96
weighted avg       0.82      0.82      0.82        96



In [20]:
### check for overfitting in training 

YtrainPredGX = clfXG.predict(Xtrain)
accuracy_score(Ytrain, YtrainPredGX)

1.0

### hyperparam tuning

In [24]:
clfXG.get_params()

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'feature_weights': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [None]:
param_grid_gx = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [1, 5, 10, 25, 50, 100],
    # L1 regularisation
    'reg_alpha': [0, 0.1, 1, 5],  
    # L2 regularisation  
    'reg_lambda': [1, 5, 10],      
}


clfXG = xgb.XGBClassifier()

grid_search_gx = GridSearchCV(estimator=xgb.XGBClassifier(random_state=42), param_grid=param_grid_gx,
                              scoring='accuracy', cv=10, n_jobs=-1)

In [41]:
grid_search_gx.fit(Xtrain, Ytrain)

0,1,2
,estimator,"XGBClassifier...ree=None, ...)"
,param_grid,"{'gamma': [0, 0.1, ...], 'learning_rate': [0.01, 0.1, ...], 'max_depth': [3, 4, ...], 'n_estimators': [1, 5, ...], ...}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,10
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [42]:
grid_search_gx.best_params_, grid_search_gx.best_score_

({'gamma': 0.1,
  'learning_rate': 0.2,
  'max_depth': 3,
  'n_estimators': 100,
  'reg_alpha': 0.1,
  'reg_lambda': 5},
 np.float64(0.7851778656126482))

In [43]:
YtestPredGXg = grid_search_gx.predict(Xtest)
accuracy_score(Ytest, YtestPredGXg)

0.8125

In [44]:
print(classification_report(Ytest, YtestPredGXg))

              precision    recall  f1-score   support

           0       0.82      0.82      0.82        50
           1       0.80      0.80      0.80        46

    accuracy                           0.81        96
   macro avg       0.81      0.81      0.81        96
weighted avg       0.81      0.81      0.81        96



In [45]:
### check for overfitting in training 

YtrainPredGXg = grid_search_gx.predict(Xtrain)
accuracy_score(Ytrain, YtrainPredGXg)

1.0