For XGB reference 
    
- https://towardsdatascience.com/beginners-guide-to-xgboost-for-classification-problems-50f75aac5390


- Using XGBoost in Pipelines : https://goodboychan.github.io/python/datacamp/machine_learning/2020/07/07/03-Using-XGBoost-in-pipelines.html

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_score,confusion_matrix, plot_confusion_matrix
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, StratifiedKFold

import xgboost as xgb

In [3]:
pd.options.display.max_columns = None

In [4]:
data= pd.read_csv('fetal_health.csv')
data.head()

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,mean_value_of_long_term_variability,histogram_width,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,73.0,0.5,43.0,2.4,64.0,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,0.006,0.0,0.006,0.003,0.0,0.0,17.0,2.1,0.0,10.4,130.0,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.1,0.0,13.4,130.0,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.4,0.0,23.0,117.0,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,0.007,0.0,0.008,0.0,0.0,0.0,16.0,2.4,0.0,19.9,117.0,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0


In [5]:
data.shape

(2126, 22)

In [6]:
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

X.shape, y.shape

((2126, 21), (2126,))

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=45)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1700, 21), (426, 21), (1700,), (426,))

In [8]:
model = xgb.XGBClassifier(n_jobs=-1)

In [9]:
pipe = Pipeline(steps = [
    ('scaler',MinMaxScaler()),
    ('model',model)
])

In [10]:
skf = StratifiedKFold(n_splits=10)

In [11]:
np.mean(cross_val_score(pipe,X,y,cv=skf,n_jobs=-1,verbose=2))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    6.7s remaining:   15.8s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    7.1s finished


0.8907653467977678

In [12]:
model.get_params

<bound method XGBModel.get_params of XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, gamma=None, gpu_id=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=-1, num_parallel_tree=None,
              predictor=None, random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=None, subsample=None,
              tree_method=None, validate_parameters=None, verbosity=None)>

In [13]:
max_depth = [3, 4, 5, 7]
learning_rate = [0.1, 0.01, 0.05]
gamma = [0, 0.25, 1]
scale_pos_weight = [1, 3, 5]
subsample = [0.8]
colsample_bytree = [0.6,0.8]
n_estimators = [200,500,600]
min_child_weight = [50,70,100]

In [14]:
param_grid = {
    "max_depth": max_depth,
    "learning_rate": learning_rate,
    "gamma": gamma,
    "scale_pos_weight": scale_pos_weight,
    "subsample": subsample,
    "colsample_bytree": colsample_bytree,
    "n_estimators" : n_estimators,
    "min_child_weight" : min_child_weight
}

In [15]:
model = xgb.XGBClassifier()

In [33]:
label_encoder = LabelEncoder()

In [48]:
pipe = Pipeline(steps = [
    ('scaler',MinMaxScaler()),
    ('model',model)
])

In [44]:
grid_xgb = GridSearchCV(pipe,param_grid=param_grid,n_jobs=-1,verbose=1,return_train_score=True,cv=skf)

In [45]:
grid_xgb = GridSearchCV(estimator=xgb.XGBClassifier(),param_grid=param_grid,scoring='accuracy',n_jobs=-1,verbose=1,return_train_score=True,cv=skf)

In [19]:
grid_xgb.fit(X,y)

Fitting 10 folds for each of 324 candidates, totalling 3240 fits


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Parameters: { "scale_pos_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     enable_categorical=False, gamma=None,
                                     gpu_id=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, m...
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=None, tree_method=None,
                                     validate_parameters=None, verbosity=None),
             n_jobs=-1,
             param_grid={'colsample_bytree': [0.5], 'gamma': [0, 0.25

In [20]:
grid_xgb.best_score_

0.9025445123571618

In [21]:
grid_xgb.best_params_

{'colsample_bytree': 0.5,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_depth': 3,
 'n_estimators': 200,
 'scale_pos_weight': 1,
 'subsample': 0.8}

In [22]:
hyper_df = pd.DataFrame(grid_xgb.cv_results_).sort_values(by='mean_test_score',ascending=False)
hyper_df.to_csv('hyperparamter_xgb.csv')

In [23]:
hyper_df.head(6)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_gamma,param_learning_rate,param_max_depth,param_n_estimators,param_scale_pos_weight,param_subsample,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,13.1524,1.194542,0.011808,0.003599,0.5,0.0,0.1,3,200,1,0.8,"{'colsample_bytree': 0.5, 'gamma': 0, 'learnin...",0.896714,0.934272,0.938967,0.929577,0.948357,0.957746,0.910377,0.933962,0.858491,0.716981,0.902545,0.067577,1,0.995295,0.994773,0.993204,0.993727,0.992682,0.993727,0.994775,0.994253,0.993208,0.996343,0.994199,0.001057
2,12.451464,0.179077,0.011255,0.003458,0.5,0.0,0.1,3,200,5,0.8,"{'colsample_bytree': 0.5, 'gamma': 0, 'learnin...",0.896714,0.934272,0.938967,0.929577,0.948357,0.957746,0.910377,0.933962,0.858491,0.716981,0.902545,0.067577,1,0.995295,0.994773,0.993204,0.993727,0.992682,0.993727,0.994775,0.994253,0.993208,0.996343,0.994199,0.001057
1,13.715602,0.987348,0.017714,0.0082,0.5,0.0,0.1,3,200,3,0.8,"{'colsample_bytree': 0.5, 'gamma': 0, 'learnin...",0.896714,0.934272,0.938967,0.929577,0.948357,0.957746,0.910377,0.933962,0.858491,0.716981,0.902545,0.067577,1,0.995295,0.994773,0.993204,0.993727,0.992682,0.993727,0.994775,0.994253,0.993208,0.996343,0.994199,0.001057
108,11.691577,0.103476,0.009166,0.000935,0.5,0.25,0.1,3,200,1,0.8,"{'colsample_bytree': 0.5, 'gamma': 0.25, 'lear...",0.892019,0.924883,0.938967,0.924883,0.953052,0.957746,0.910377,0.938679,0.858491,0.721698,0.902079,0.066366,4,0.99425,0.99425,0.992682,0.992682,0.991636,0.992682,0.994253,0.99373,0.993208,0.99582,0.993519,0.001127
109,12.591749,0.354897,0.010607,0.004559,0.5,0.25,0.1,3,200,3,0.8,"{'colsample_bytree': 0.5, 'gamma': 0.25, 'lear...",0.892019,0.924883,0.938967,0.924883,0.953052,0.957746,0.910377,0.938679,0.858491,0.721698,0.902079,0.066366,4,0.99425,0.99425,0.992682,0.992682,0.991636,0.992682,0.994253,0.99373,0.993208,0.99582,0.993519,0.001127
110,13.259716,0.206036,0.00938,0.00063,0.5,0.25,0.1,3,200,5,0.8,"{'colsample_bytree': 0.5, 'gamma': 0.25, 'lear...",0.892019,0.924883,0.938967,0.924883,0.953052,0.957746,0.910377,0.938679,0.858491,0.721698,0.902079,0.066366,4,0.99425,0.99425,0.992682,0.992682,0.991636,0.992682,0.994253,0.99373,0.993208,0.99582,0.993519,0.001127


In [61]:
model = xgb.XGBClassifier(colsample_bytree=0.5,
                         n_jobs=-1,
                         verbose=2,
                          n_estimators=200,
                         gamma=1,
                         learning_rate=0.1,
                         max_depth=3,
                         scale_pos_weight = 1,
                          subsample = 0.8)

In [62]:
pipe.fit(X_train,y_train)



Parameters: { "scale_pos_weight", "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




Pipeline(steps=[('scaler', MinMaxScaler()),
                ('model',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=0.5, enable_categorical=False,
                               gamma=0, gpu_id=-1, importance_type=None,
                               interaction_constraints='', learning_rate=0.01,
                               max_delta_step=0, max_depth=3,
                               min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=200,
                               n_jobs=-1, num_parallel_tree=1,
                               objective='multi:softprob', predictor='auto',
                               random_state=0, reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=1, subsample=0.8,
                               tree_method='exact', validate_parameters=1,
     

In [63]:
y_pred = pipe.predict(X_test)
y_pred

array([3., 2., 1., 1., 3., 3., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 3., 1., 1., 2., 1., 1., 3., 1., 1., 1., 1., 1., 2., 1.,
       1., 1., 1., 1., 2., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1.,
       1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 2., 1., 3., 1., 1., 1., 1.,
       1., 1., 1., 1., 3., 1., 3., 1., 1., 1., 1., 1., 2., 1., 1., 2., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 2., 3., 1., 2.,
       1., 1., 1., 1., 1., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       3., 1., 1., 1., 1., 2., 1., 3., 1., 1., 1., 1., 1., 1., 2., 1., 1.,
       1., 1., 1., 1., 1., 3., 1., 1., 1., 1., 1., 1., 3., 1., 3., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 2., 1., 1., 1., 3., 3., 1., 3., 1., 2., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1.,
       1., 1., 1., 2., 2., 2., 1., 1., 1., 3., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [64]:
print("testing score of gradient Boosting : ","%.3f"%(accuracy_score(y_test,y_pred)*100),"%")

testing score of gradient Boosting :  95.070 %


In [65]:
print("training score of gradient Boosting : ","%.3f"%(accuracy_score(y_train,pipe.predict(X_train))*100),"%")

training score of gradient Boosting :  94.059 %


In [60]:
xgb_classifcation = classification_report(y_test,y_pred)
print(xgb_classifcation)

              precision    recall  f1-score   support

         1.0       0.95      0.99      0.97       332
         2.0       0.98      0.75      0.85        59
         3.0       0.94      0.91      0.93        35

    accuracy                           0.95       426
   macro avg       0.96      0.88      0.91       426
weighted avg       0.95      0.95      0.95       426

