# Boosting Machines

In [1]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier

In [2]:
import warnings

warnings.filterwarnings('ignore')

import pandas as pd 
import numpy as np

from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

from mypipes import *

In [3]:
train_file=r'C:\\Users\\Larry Williams\\Desktop\\Mamata\\Edvancer\\data\\data\\rg_train.csv'
test_file=r'C:\\Users\\Larry Williams\\Desktop\\Mamata\\Edvancer\\data\\data\\rg_test.csv'
bd_train=pd.read_csv(train_file)

bd_test=pd.read_csv(test_file)

In [4]:
num_vars=list(bd_train.select_dtypes(exclude=['object']).columns)

In [5]:
num_vars=[_ for _ in num_vars if _ not in ['REF_NO','Revenue.Grid']]

In [6]:
cat_vars=list(bd_train.select_dtypes(include=['object']).columns)

In [7]:
cat_vars=[_ for _ in cat_vars if _ not in 
          ['children','age_band', 'post_code','post_area','family_income']]

In [8]:
p1=pdPipeline([
    ('var_select',VarSelector(num_vars)),
    ('missing_trt',DataFrameImputer())
])

In [9]:
p2=pdPipeline([
    ('var_select',VarSelector(cat_vars)),
    ('missing_trt',DataFrameImputer()),
    ('create_dummies',get_dummies_Pipe(70))
])

In [10]:
p3=pdPipeline([
    ('var_select',VarSelector(['age_band'])),
    ('custom_fico',custom_age_band()),
    ('missing_trt',DataFrameImputer())
])

p4=pdPipeline([
    ('var_select',VarSelector(['family_income'])),
    ('custom_fico',custom_family_income()),
    ('missing_trt',DataFrameImputer())
])

p5=pdPipeline([
    ('var_select',VarSelector(['children'])),
    ('string_clean1',string_clean(replace_it='Zero',replace_with='0')),
    ('string_clean2',string_clean(replace_it='4+',replace_with='4')),
    ('convert_to_numeric',convert_to_numeric()),
    ('missing_trt',DataFrameImputer())
])

In [11]:
data_pipe=FeatureUnion([
    ('num',p1),
    ('obj_to_dum',p2),
    ('age_band',p3),
    ('family_income',p4),
    ('children',p5)
])

In [12]:
x_train=pd.DataFrame(data=data_pipe.fit_transform(bd_train),
                     columns=data_pipe.get_feature_names())


In [13]:
x_test=pd.DataFrame(data=data_pipe.transform(bd_test),
                     columns=data_pipe.get_feature_names())

In [14]:
y_train=(bd_train['Revenue.Grid']==1).astype(int)

In [15]:
x_train.shape

(8124, 71)

In [16]:
gbm_params={'n_estimators':[50,100,200,500,700],
           'learning_rate': [0.01,.05,0.1,0.4,0.8,1],
            'max_depth':[1,2,3,4,5,6],
#             'min_samples_split':[2,5,10,20],
#             'min_samples_leaf':[2,5,10,20],
            'subsample':[0.5,0.8,1],
            'max_features':[5,10,15,20,30,45,55,65]
           }


In [17]:
gbm=GradientBoostingClassifier()

In [18]:
random_search=RandomizedSearchCV(gbm,
                                 scoring='roc_auc',
                                 param_distributions=gbm_params,
                                 cv=10,
                                 n_iter=10,
                                 n_jobs=-1,
                                verbose=20)

In [19]:
random_search.fit(x_train,y_train)


Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

RandomizedSearchCV(cv=10, error_score=nan,
                   estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                        criterion='friedman_mse',
                                                        init=None,
                                                        learning_rate=0.1,
                                                        loss='deviance',
                                                        max_depth=3,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                   

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='deviance', max_depth=5,
              max_features=20, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=700,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=0.5, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
              
use the above result in the class, its a result from previous run. This can be definitely different on a rerun. use this to save time in class so that you dont have to wait for the randomised search to finish

In [20]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.5f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [21]:
report(random_search.cv_results_,5)

Model with rank: 1
Mean validation score: 0.990 (std: 0.00225)
Parameters: {'subsample': 0.8, 'n_estimators': 500, 'max_features': 45, 'max_depth': 5, 'learning_rate': 0.1}

Model with rank: 2
Mean validation score: 0.984 (std: 0.00564)
Parameters: {'subsample': 1, 'n_estimators': 700, 'max_features': 5, 'max_depth': 4, 'learning_rate': 0.4}

Model with rank: 3
Mean validation score: 0.973 (std: 0.00478)
Parameters: {'subsample': 1, 'n_estimators': 50, 'max_features': 55, 'max_depth': 6, 'learning_rate': 0.01}

Model with rank: 4
Mean validation score: 0.966 (std: 0.03738)
Parameters: {'subsample': 1, 'n_estimators': 700, 'max_features': 45, 'max_depth': 4, 'learning_rate': 0.8}

Model with rank: 5
Mean validation score: 0.958 (std: 0.01015)
Parameters: {'subsample': 0.5, 'n_estimators': 200, 'max_features': 55, 'max_depth': 1, 'learning_rate': 0.1}



top 5 classfiers from the previous run were as follows : 

Model with rank: 1

Mean validation score: 0.925 (std: 0.00188)

Parameters: {'max_features': 20, 'max_depth': 3, 'subsample': 0.8, 'learning_rate': 0.4, 'n_estimators': 100}

~~~~~~~~~~

Model with rank: 2

Mean validation score: 0.924 (std: 0.00121)

Parameters: {'max_features': 15, 'max_depth': 4, 'subsample': 1, 'learning_rate': 0.4, 'n_estimators': 100}

~~~~~~~~~~

Model with rank: 3

Mean validation score: 0.923 (std: 0.00250)

Parameters: {'max_features': 5, 'max_depth': 4, 'subsample': 0.5, 'learning_rate': 0.05, 'n_estimators': 500}

~~~~~~~~~~

Model with rank: 4

Mean validation score: 0.914 (std: 0.00290)

Parameters: {'max_features': 10, 'max_depth': 5, 'subsample': 1, 'learning_rate': 0.05, 'n_estimators': 50}

~~~~~~~~~~

Model with rank: 5

Mean validation score: 0.913 (std: 0.00174)

Parameters: {'max_features': 30, 'max_depth': 5, 'subsample': 0.8, 'learning_rate': 0.4, 'n_estimators': 200}

tentative performance : 0.925 for the best classfier 

**Note: you can use the random search predict,predict_proba function to make prediction as randomisedsearchcv automatically fits the best candidate on complete data. If you want to look into feature_importance etc, then fit the best estimator separately**

# Xgboost

In [22]:
xgb_params = {  
                "learning_rate":[0.01,0.05,0.1,0.3,0.5],
                "gamma":[i/10.0 for i in range(0,5)],
                "max_depth": [2,3,4,5,6,7,8],
                "min_child_weight":[1,2,5,10],
                "max_delta_step":[0,1,2,5,10],
                "subsample":[i/10.0 for i in range(5,10)],
                "colsample_bytree":[i/10.0 for i in range(5,10)],
                "colsample_bylevel":[i/10.0 for i in range(5,10)],
                "reg_lambda":[1e-5, 1e-2, 0.1, 1, 100], 
                "reg_alpha":[1e-5, 1e-2, 0.1, 1, 100],
                "scale_pos_weight":[1,2,3,4,5,6,7,8,9],
                "n_estimators":[100,500,700,1000]
             }


In [24]:
xgb=XGBClassifier(objective='binary:logistic')

In [25]:
n_iter=10

random_search=RandomizedSearchCV(xgb,n_jobs=-1,cv=10,n_iter=n_iter,scoring='roc_auc',
                                 param_distributions=xgb_params)

In [26]:
random_search.fit(x_train,y_train)



RandomizedSearchCV(cv=10, error_score=nan,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           enable_categorical=False, gamma=None,
                                           gpu_id=None, importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           mon...
                                        'max_delta_step': [0, 1, 2, 5, 10],
                                        'max_depth': [2, 3, 4, 5, 6, 7, 8],
                            

In [27]:
report(random_search.cv_results_,5)

Model with rank: 1
Mean validation score: 0.991 (std: 0.00198)
Parameters: {'subsample': 0.9, 'scale_pos_weight': 8, 'reg_lambda': 0.01, 'reg_alpha': 1, 'n_estimators': 700, 'min_child_weight': 5, 'max_depth': 5, 'max_delta_step': 1, 'learning_rate': 0.05, 'gamma': 0.0, 'colsample_bytree': 0.7, 'colsample_bylevel': 0.5}

Model with rank: 2
Mean validation score: 0.991 (std: 0.00197)
Parameters: {'subsample': 0.9, 'scale_pos_weight': 5, 'reg_lambda': 100, 'reg_alpha': 1, 'n_estimators': 700, 'min_child_weight': 2, 'max_depth': 4, 'max_delta_step': 0, 'learning_rate': 0.5, 'gamma': 0.2, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.7}

Model with rank: 3
Mean validation score: 0.990 (std: 0.00198)
Parameters: {'subsample': 0.5, 'scale_pos_weight': 7, 'reg_lambda': 100, 'reg_alpha': 0.1, 'n_estimators': 700, 'min_child_weight': 10, 'max_depth': 6, 'max_delta_step': 10, 'learning_rate': 0.3, 'gamma': 0.1, 'colsample_bytree': 0.9, 'colsample_bylevel': 0.7}

Model with rank: 4
Mean validat

top 5 classfiers from the previous run 

Model with rank: 1

Mean validation score: 0.928 (std: 0.00232)

Parameters: {'reg_lambda': 1e-05, 'subsample': 0.9, 'reg_alpha': 1, 'max_depth': 8, 'min_child_weight': 10, 'n_estimators': 1000, 'gamma': 0.1, 'colsample_bylevel': 0.8, 'scale_pos_weight': 2, 'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_delta_step': 10}

____

Model with rank: 2

Mean validation score: 0.927 (std: 0.00160)

Parameters: {'reg_lambda': 1, 'subsample': 0.6, 'reg_alpha': 0.1, 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 1000, 'gamma': 0.3, 'colsample_bylevel': 0.9, 'scale_pos_weight': 2, 'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_delta_step': 5}

____

Model with rank: 3

Mean validation score: 0.926 (std: 0.00101)

Parameters: {'reg_lambda': 0.1, 'subsample': 0.7, 'reg_alpha': 1e-05, 'max_depth': 5, 'min_child_weight': 5, 'n_estimators': 500, 'gamma': 0.2, 'colsample_bylevel': 0.5, 'scale_pos_weight': 3, 'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_delta_step': 1}

____

Model with rank: 4

Mean validation score: 0.925 (std: 0.00104)

Parameters: {'reg_lambda': 0.1, 'subsample': 0.9, 'reg_alpha': 0.01, 'max_depth': 6, 'min_child_weight': 2, 'n_estimators': 1000, 'gamma': 0.2, 'colsample_bylevel': 0.8, 'scale_pos_weight': 8, 'colsample_bytree': 0.5, 'learning_rate': 0.05, 'max_delta_step': 0}

____

Model with rank: 5

Mean validation score: 0.920 (std: 0.00278)

Parameters: {'reg_lambda': 1, 'subsample': 0.8, 'reg_alpha': 0.1, 'max_depth': 5, 'min_child_weight': 5, 'n_estimators': 500, 'gamma': 0.0, 'colsample_bylevel': 0.5, 'scale_pos_weight': 8, 'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_delta_step': 5}

____

tentative performance of best estimator : 0.928


In [28]:
random_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
              colsample_bynode=1, colsample_bytree=0.7,
              enable_categorical=False, gamma=0.0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.05, max_delta_step=1, max_depth=5,
              min_child_weight=5, missing=nan, monotone_constraints='()',
              n_estimators=700, n_jobs=4, num_parallel_tree=1,
              objective='binary:logistic', predictor='auto', random_state=0,
              reg_alpha=1, reg_lambda=0.01, scale_pos_weight=8, subsample=0.9,
              tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=None)

best estimator from the previous run can be copied from here :

XGBClassifier(base_score=0.5, colsample_bylevel=0.8, colsample_bytree=0.5,
       gamma=0.1, learning_rate=0.01, max_delta_step=10, max_depth=8,
       min_child_weight=10, missing=None, n_estimators=1000, nthread=-1,
       objective='binary:logistic', reg_alpha=1, reg_lambda=1e-05,
       scale_pos_weight=2, seed=0, silent=True, subsample=0.9)

## Sequential Parameter tuning for xgboost

If we tune all the paramters together , there are chances that our results will be much far from the best. There are many parameters where variation doesnt impact the performance too much and we can tune them later once we have fixed values of parameters with volatile performance.

As a general strtaegy we can start with tuning numer of trees or n_estimators , in case of boosting machines , learning_rate is directly related with n_estimators . A very low learning_rate will need high number of n_estimators . We can start with a decent fixed learning rate and tune n_estimaors for it. 

All can be left as default for now except subsample , colsample_bytree and colsample_bylevel, these are set to default=1, we'll take a more conservative value 0.8

In [29]:
XGBClassifier?

In [30]:
xgb_params = {  
                "n_estimators":[100,500,700,900,1000,1200,1500]
             }

In [31]:
xgb1=XGBClassifier(subsample=0.8,
                   colsample_bylevel=0.8,
                   colsample_bytree=0.8)

In [32]:
import os

os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [33]:
from sklearn.model_selection import GridSearchCV

In [34]:
grid_search=GridSearchCV(xgb1,cv=10,
                         param_grid=xgb_params,
                         scoring='roc_auc',
                         verbose=20)

# two issues : currently xgboost is not running with multicores 
# mac issue:

In [35]:
grid_search.fit(x_train,y_train)

Fitting 10 folds for each of 7 candidates, totalling 70 fits
[CV] n_estimators=100 ................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .................... n_estimators=100, score=0.992, total=   1.2s
[CV] n_estimators=100 ................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s


[CV] .................... n_estimators=100, score=0.987, total=   1.2s
[CV] n_estimators=100 ................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.4s remaining:    0.0s


[CV] .................... n_estimators=100, score=0.990, total=   1.2s
[CV] n_estimators=100 ................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    3.6s remaining:    0.0s


[CV] .................... n_estimators=100, score=0.992, total=   1.2s
[CV] n_estimators=100 ................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    4.8s remaining:    0.0s


[CV] .................... n_estimators=100, score=0.992, total=   1.2s
[CV] n_estimators=100 ................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    6.0s remaining:    0.0s


[CV] .................... n_estimators=100, score=0.992, total=   1.3s
[CV] n_estimators=100 ................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    7.3s remaining:    0.0s


[CV] .................... n_estimators=100, score=0.990, total=   1.2s
[CV] n_estimators=100 ................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    8.5s remaining:    0.0s


[CV] .................... n_estimators=100, score=0.987, total=   1.4s
[CV] n_estimators=100 ................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    9.8s remaining:    0.0s


[CV] .................... n_estimators=100, score=0.989, total=   1.2s
[CV] n_estimators=100 ................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   11.1s remaining:    0.0s


[CV] .................... n_estimators=100, score=0.990, total=   1.1s
[CV] n_estimators=500 ................................................


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   12.2s remaining:    0.0s


[CV] .................... n_estimators=500, score=0.989, total=   4.6s
[CV] n_estimators=500 ................................................


[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:   16.9s remaining:    0.0s


[CV] .................... n_estimators=500, score=0.987, total=   5.0s
[CV] n_estimators=500 ................................................


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:   21.9s remaining:    0.0s


[CV] .................... n_estimators=500, score=0.990, total=   4.9s
[CV] n_estimators=500 ................................................


[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:   26.8s remaining:    0.0s


[CV] .................... n_estimators=500, score=0.993, total=   4.7s
[CV] n_estimators=500 ................................................


[Parallel(n_jobs=1)]: Done  14 out of  14 | elapsed:   31.6s remaining:    0.0s


[CV] .................... n_estimators=500, score=0.991, total=   4.5s
[CV] n_estimators=500 ................................................


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   36.1s remaining:    0.0s


[CV] .................... n_estimators=500, score=0.993, total=   4.4s
[CV] n_estimators=500 ................................................


[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:   40.5s remaining:    0.0s


[CV] .................... n_estimators=500, score=0.990, total=   4.4s
[CV] n_estimators=500 ................................................


[Parallel(n_jobs=1)]: Done  17 out of  17 | elapsed:   44.9s remaining:    0.0s


[CV] .................... n_estimators=500, score=0.991, total=   5.1s
[CV] n_estimators=500 ................................................


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:   50.0s remaining:    0.0s


[CV] .................... n_estimators=500, score=0.990, total=   4.4s
[CV] n_estimators=500 ................................................


[Parallel(n_jobs=1)]: Done  19 out of  19 | elapsed:   54.4s remaining:    0.0s


[CV] .................... n_estimators=500, score=0.991, total=   5.1s
[CV] n_estimators=700 ................................................
[CV] .................... n_estimators=700, score=0.989, total=   7.9s
[CV] n_estimators=700 ................................................
[CV] .................... n_estimators=700, score=0.987, total=   6.9s
[CV] n_estimators=700 ................................................
[CV] .................... n_estimators=700, score=0.989, total=   8.6s
[CV] n_estimators=700 ................................................
[CV] .................... n_estimators=700, score=0.993, total=   6.3s
[CV] n_estimators=700 ................................................
[CV] .................... n_estimators=700, score=0.991, total=   7.1s
[CV] n_estimators=700 ................................................
[CV] .................... n_estimators=700, score=0.993, total=   6.9s
[CV] n_estimators=700 ................................................
[CV] .

[CV] .................... n_estimators=900, score=0.990, total=   7.8s
[CV] n_estimators=900 ................................................
[CV] .................... n_estimators=900, score=0.990, total=   7.8s
[CV] n_estimators=1000 ...............................................
[CV] ................... n_estimators=1000, score=0.988, total=   8.8s
[CV] n_estimators=1000 ...............................................
[CV] ................... n_estimators=1000, score=0.986, total=   8.5s
[CV] n_estimators=1000 ...............................................
[CV] ................... n_estimators=1000, score=0.989, total=   9.0s
[CV] n_estimators=1000 ...............................................
[CV] ................... n_estimators=1000, score=0.994, total=   9.4s
[CV] n_estimators=1000 ...............................................
[CV] ................... n_estimators=1000, score=0.991, total=   9.5s
[CV] n_estimators=1000 ...............................................
[CV] .

[CV] ................... n_estimators=1200, score=0.990, total=  10.0s
[CV] n_estimators=1200 ...............................................
[CV] ................... n_estimators=1200, score=0.990, total=  10.0s
[CV] n_estimators=1200 ...............................................
[CV] ................... n_estimators=1200, score=0.990, total=   9.7s
[CV] n_estimators=1500 ...............................................
[CV] ................... n_estimators=1500, score=0.987, total=  11.2s
[CV] n_estimators=1500 ...............................................
[CV] ................... n_estimators=1500, score=0.986, total=  11.2s
[CV] n_estimators=1500 ...............................................
[CV] ................... n_estimators=1500, score=0.989, total=  12.0s
[CV] n_estimators=1500 ...............................................
[CV] ................... n_estimators=1500, score=0.994, total=  11.1s
[CV] n_estimators=1500 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  70 out of  70 | elapsed:  8.9min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=0.8,
                                     colsample_bynode=None,
                                     colsample_bytree=0.8,
                                     enable_categorical=False, gamma=None,
                                     gpu_id=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_co...
                                     predictor=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=0.8,
                                     tree_method=None, use_label_e

In [36]:
report(grid_search.cv_results_,3)

Model with rank: 1
Mean validation score: 0.990 (std: 0.00170)
Parameters: {'n_estimators': 500}

Model with rank: 2
Mean validation score: 0.990 (std: 0.00183)
Parameters: {'n_estimators': 700}

Model with rank: 3
Mean validation score: 0.990 (std: 0.00188)
Parameters: {'n_estimators': 900}



we got n_estimator=500 as best with learning_rate=0.1  . Next we'll tune max_depth,gamma and min_child_weight, which control overfit by controlling size of individual trees

In [37]:
xgb_params={
            "gamma":[0,2,5,8,10],
            "max_depth": [2,3,4,5,6,7,8],
            "min_child_weight":[0.5,1,2,5,10]
            }

In [38]:
xgb2=XGBClassifier(learning_rate=0.1,n_estimators=500,
                   subsample=0.8,colsample_bylevel=0.8,colsample_bytree=0.8)

In [39]:
random_search=RandomizedSearchCV(xgb2,
                                 param_distributions=xgb_params,n_iter=20,cv=5,scoring='roc_auc',
                                 n_jobs=-1,verbose=2)

In [40]:
random_search.fit(x_train,y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  6.9min finished




RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=0.8,
                                           colsample_bynode=None,
                                           colsample_bytree=0.8,
                                           enable_categorical=False, gamma=None,
                                           gpu_id=None, importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate=0.1,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monoton...
                                           scale_pos_weight=None, subsample=0.8,
                                           tree_method=None,
                                      

In [41]:
report(random_search.cv_results_,3)

Model with rank: 1
Mean validation score: 0.991 (std: 0.00136)
Parameters: {'min_child_weight': 2, 'max_depth': 6, 'gamma': 0}

Model with rank: 2
Mean validation score: 0.991 (std: 0.00124)
Parameters: {'min_child_weight': 2, 'max_depth': 7, 'gamma': 0}

Model with rank: 3
Mean validation score: 0.989 (std: 0.00235)
Parameters: {'min_child_weight': 2, 'max_depth': 3, 'gamma': 0}



we got best values for parameters being tuned as follows : {'min_child_weight': 1, 'gamma': 0, 'max_depth': 3}

Since there is imbalance in the data , we'll look into max_delta_step and scale_pos_weight next

In [42]:
y_train.value_counts()

0    7261
1     863
Name: Revenue.Grid, dtype: int64

In [43]:
24720/7841

3.152659099604642

In [44]:
xgb_params={
            'max_delta_step':[0,1,3,6,10],
            'scale_pos_weight':[1,2,3,4]
            }

In [45]:
xgb3=XGBClassifier(learning_rate=0.1,
                   n_estimators=500,min_child_weight=1,
                   gamma=0,max_depth=3,
                   
                  subsample=0.8,colsample_bylevel=0.8,colsample_bytree=0.8)

In [46]:
grid_search=GridSearchCV(xgb3,param_grid=xgb_params,cv=5,scoring='roc_auc',n_jobs=-1,
                         verbose=10)

In [47]:
grid_search.fit(x_train,y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   22.6s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   37.5s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   58.2s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  4.4min finished




GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=0.8,
                                     colsample_bynode=None,
                                     colsample_bytree=0.8,
                                     enable_categorical=False, gamma=0,
                                     gpu_id=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=0.1, max_delta_step=None,
                                     max_depth=3, min_child_weight=1,
                                     missing=nan, monotone_constraints=N...
                                     predictor=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=0.8,
                                     tree_method=None, use_label_e

In [48]:
report(grid_search.cv_results_,3)

Model with rank: 1
Mean validation score: 0.990 (std: 0.00205)
Parameters: {'max_delta_step': 0, 'scale_pos_weight': 2}

Model with rank: 1
Mean validation score: 0.990 (std: 0.00205)
Parameters: {'max_delta_step': 6, 'scale_pos_weight': 2}

Model with rank: 1
Mean validation score: 0.990 (std: 0.00205)
Parameters: {'max_delta_step': 10, 'scale_pos_weight': 2}



it turns out that , since imbalance was not that severe , defaults come out as best choices {'scale_pos_weight': 1, 'max_delta_step': 0}

Next we check the effect of the noise on data and tune , subsample , colsample_bytree and colsample_bylevel

In [49]:
xgb_params={
            'subsample':[i/10 for i in range(5,11)],
            'colsample_bytree':[i/10 for i in range(5,11)],
            'colsample_bylevel':[i/10 for i in range(5,11)]
            }

In [50]:
xgb4=XGBClassifier(learning_rate=0.1,n_estimators=500,min_child_weight=1,gamma=0,max_depth=3,
                        scale_pos_weight=1,max_delta_step=0
                  )

In [51]:
random_search=RandomizedSearchCV(xgb4,param_distributions=xgb_params,cv=5,n_iter=20,scoring='roc_auc',
                                n_jobs=-1,verbose=20)

In [52]:
random_search.fit(x_train,y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   16.9s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   16.9s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   16.9s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   24.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   24.4s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   24.4s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   24.5s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   31.4s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   31.5s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   



RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           enable_categorical=False, gamma=0,
                                           gpu_id=None, importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=nan,
                                           monotone_constrai...
                                           use_label_encoder=True,
                                           validate_parameters=None,
                                           v

In [53]:
report(random_search.cv_results_,3)

Model with rank: 1
Mean validation score: 0.989 (std: 0.00156)
Parameters: {'subsample': 0.8, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.7}

Model with rank: 2
Mean validation score: 0.989 (std: 0.00152)
Parameters: {'subsample': 0.7, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.9}

Model with rank: 3
Mean validation score: 0.989 (std: 0.00182)
Parameters: {'subsample': 0.7, 'colsample_bytree': 1.0, 'colsample_bylevel': 0.9}



bets values that we got for paramaeters are as follows : {'colsample_bylevel': 0.5, 'colsample_bytree': 0.6, 'subsample': 1.0}

lastly we can work on L2 and L1 penalty on leaf node score to further reduce overfit if there is any

In [54]:
xgb5=XGBClassifier(learning_rate=0.1,n_estimators=500,min_child_weight=1,gamma=0,max_depth=3,
                        scale_pos_weight=1,max_delta_step=0,
                   colsample_bylevel= 0.5, colsample_bytree= 0.6, subsample= 1.0
                  )

In [55]:
xgb_params={
            'reg_lambda':[i/10 for i in range(0,50)],
            'reg_alpha':[i/10 for i in range(0,50)]
            }

In [56]:
random_search=RandomizedSearchCV(xgb5,param_distributions=xgb_params,cv=5,n_iter=20,scoring='roc_auc',
                                n_jobs=-1,verbose=10)

In [57]:
random_search.fit(x_train,y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   26.0s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   32.6s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   48.4s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.3min finished




RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=0.5,
                                           colsample_bynode=None,
                                           colsample_bytree=0.6,
                                           enable_categorical=False, gamma=0,
                                           gpu_id=None, importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=nan,
                                           monotone_constraint...
                                                      0.5, 0.6, 0.7, 0.8, 0.9,
                                                      1.0, 1.1, 1.2, 1.3, 1.4,
                      

In [58]:
report(random_search.cv_results_,3)

Model with rank: 1
Mean validation score: 0.987 (std: 0.00205)
Parameters: {'reg_lambda': 1.4, 'reg_alpha': 0.2}

Model with rank: 2
Mean validation score: 0.987 (std: 0.00178)
Parameters: {'reg_lambda': 3.1, 'reg_alpha': 0.0}

Model with rank: 3
Mean validation score: 0.987 (std: 0.00223)
Parameters: {'reg_lambda': 0.4, 'reg_alpha': 2.5}



The best value that we got here is {'reg_lambda': 1.5, 'reg_alpha': 0.0}, but the performance has gone down. May be the default was doing better and wasnt picked as one of the candidates here in the random_search. we'll go with those defaults values instead

In [59]:
xgb6=XGBClassifier(learning_rate=0.1,n_estimators=500,min_child_weight=1,gamma=0,max_depth=3,
                        scale_pos_weight=1,max_delta_step=0,
                   colsample_bylevel= 0.5, colsample_bytree= 0.6, subsample= 1.0,
                  reg_lambda=1,reg_alpha=0)

If we want to simply get cv performance of a model , without having to select any parameters we can make use of cross_validation_score function

In [60]:
from sklearn.model_selection import cross_val_score

In [61]:
cross_val_score(xgb6,x_train,y_train,scoring='roc_auc',verbose=10,n_jobs=-1,cv=10)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   12.5s remaining:   12.5s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   12.6s remaining:    5.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   16.0s finished


array([0.99009948, 0.98758747, 0.98340775, 0.99056395, 0.98930104,
       0.99173554, 0.98973349, 0.9880838 , 0.98875649, 0.98630598])

In [62]:
scores=[0.92951477, 0.92590096, 0.93070889, 0.92176974, 0.92882013,
       0.93128318, 0.93018259, 0.93297173, 0.93256565, 0.92947388]
# these are from an earlier iteration , need not match with your current run

In [63]:
import numpy as np

In [64]:
np.mean(scores)

0.929319152

In [65]:
np.std(scores)

0.0031528442142034264