# Gradient Boosting

In [None]:
# Read the Higgs dataset from a CSV file located relative to the working directory.
import pandas as pd

HIGGS_CSV = 'higgs_data.csv'
df = pd.read_csv(filepath_or_buffer=HIGGS_CSV)

In [None]:
# The meaning of each feature is documented at:
# http://opendata.cern.ch/record/328
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497,0.002653,s
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226,2.233584,b
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251,2.347389,b
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,5.446378,b
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,6.245333,b


In [None]:
# data preparation
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Build the feature matrix: remove the event identifier, the target and the
# event weight. 'Unnamed: 0' (visible in df.head() with values 0, 1, 2, ...)
# looks like a row counter written into the CSV rather than a measured
# quantity, so it is removed as well instead of being fed to the models.
# errors='ignore' on that second drop keeps the cell working if the CSV is
# re-exported without the extra column.
X = (
    df.drop(columns=['EventId', 'Label', 'Weight'])
      .drop(columns=['Unnamed: 0'], errors='ignore')
)

# Encode the string labels as integers (LabelEncoder assigns the integer
# codes in sorted order of the class names).
y = LabelEncoder().fit_transform(df['Label'])

# handle missing values (encoded as -999.0) by replacing them with the
# median of the observed values in the same column
# NOTE(review): the imputer is fit on all rows before cross-validation, so the
# medians also see the validation folds; wrap imputer + model in a Pipeline if
# strictly leak-free scores are required.
imputer = SimpleImputer(missing_values=-999.0, strategy='median')
X = imputer.fit_transform(X.values)

X.shape

(250000, 30)

In [None]:
# Class balance: fraction of samples whose encoded label equals 1.
import numpy as np
y.mean()

0.342668

In [None]:
from time import time
from sklearn.model_selection import cross_val_score

def cross_validate(model, X_data=None, y_data=None, cv=3):
    """Cross-validate ``model`` and print the fold scores plus a summary line.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible estimator handed to
        ``cross_val_score``.
    X_data, y_data : array-like, optional
        Features and targets to evaluate on. When omitted, the notebook-level
        ``X`` and ``y`` are used, so ``cross_validate(model)`` behaves exactly
        as it did before these parameters existed.
    cv : int or cross-validation splitter, default=3
        Forwarded unchanged to ``cross_val_score``.

    Returns
    -------
    None
        Output is printed: first the array of per-fold scores, then their
        mean ± standard deviation together with the elapsed wall-clock
        time in seconds.
    """
    # Fall back to the notebook globals only when no explicit data is given.
    features = X if X_data is None else X_data
    targets = y if y_data is None else y_data

    t0 = time()
    scores = cross_val_score(model, features, targets, cv=cv)
    # Stop the clock right after the fits so printing is not part of the timing.
    elapsed = time() - t0

    print(scores)
    print("=> Accuracy = %.3f ± %.3f (%.1f sec)" % (np.mean(scores), np.std(scores), elapsed))

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost baseline with default hyperparameters.
# A fixed random_state (same seed as the XGBoost cells) makes repeated runs
# reproducible: per the scikit-learn docs the tree base learners may pick
# between equally good splits at random when no seed is given.
model = AdaBoostClassifier(random_state=42)

cross_validate(model)

[0.80158159 0.80161521 0.8006072 ]
=> Accuracy = 0.801 ± 0.000 (135.9 sec)


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# scikit-learn gradient boosting baseline with default hyperparameters.
# A fixed random_state (same seed as the XGBoost cells) makes repeated runs
# reproducible: per the scikit-learn docs the underlying trees may pick
# between equally good splits at random when no seed is given.
model = GradientBoostingClassifier(random_state=42)

cross_validate(model)

[0.82921737 0.83049932 0.83273133]
=> Accuracy = 0.831 ± 0.001 (717.7 sec)


In [None]:
from xgboost import XGBClassifier

# XGBoost baseline: only the settings below are fixed explicitly; every other
# hyperparameter keeps its library default.
xgb_settings = dict(
    booster='gbtree',
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
)
model = XGBClassifier(**xgb_settings)

cross_validate(model)

[0.83837329 0.83941536 0.84015936]
=> Accuracy = 0.839 ± 0.001 (3.9 sec)


In [None]:
# --->>> Your Turn <<<---
# Change the following hyperparameters:
# - n_estimators
# - learning_rate
# - max_depth
# - gamma
# What do you observe?

In [None]:
# Keep only the first 10000 rows as a smaller working set to save time.
n_subset = 10000
X2 = X[:n_subset]
y2 = y[:n_subset]
print(X.shape, X2.shape)

(250000, 30) (10000, 30)


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values per hyperparameter; the randomized search draws
# combinations from these lists.
params = dict(
    n_estimators=[100, 200, 400, 800],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
    max_depth=[2, 3, 5, 6, 8],
    gamma=[0, 0.01, 0.1, 0.5, 1, 2],
    min_child_weight=[1, 2, 3, 4, 5],
    subsample=[0.5, 0.7, 0.8, 0.9, 1],
    colsample_bytree=[0.5, 0.7, 0.8, 0.9, 1],
)

# Same fixed settings as the baseline XGBoost model.
model = XGBClassifier(
    booster='gbtree',
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
)

# 20 sampled configurations, each scored with 3-fold cross-validation;
# n_jobs=-1 lets scikit-learn use all available cores.
rs = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    n_iter=20,
    cv=3,
    n_jobs=-1,
    random_state=2,
)

# Run the search on the reduced dataset and report the elapsed seconds.
t0 = time()
ret = rs.fit(X2, y2)
print(time() - t0)

# Best sampled configuration and its mean cross-validated score.
print(ret.best_params_)
ret.best_score_

21.482486486434937
{'subsample': 0.9, 'n_estimators': 100, 'min_child_weight': 1, 'max_depth': 5, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 0.9}


0.8285998665853388

In [None]:
# Rebuild the classifier from the fixed settings plus the best hyperparameters
# found by the randomized search, then cross-validate it.
fixed_settings = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'random_state': 42,
}
model = XGBClassifier(**fixed_settings, **ret.best_params_)

cross_validate(model)

[0.83390933 0.83163933 0.83618734]
=> Accuracy = 0.834 ± 0.002 (7.0 sec)
