# Gradient Boosting

In [1]:
# Load the Higgs data set from the local CSV file into a pandas DataFrame.
# `df` is the raw table that the later cells build on.
import pandas as pd
df = pd.read_csv('higgs_data.csv')

In [2]:
# Preview the first rows of the raw table.
# Documentation of the features:
# http://opendata.cern.ch/record/328
df.head()

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497,0.002653,s
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226,2.233584,b
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251,2.347389,b
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,5.446378,b
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,6.245333,b


In [3]:
# data preparation
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Target: encode the string labels as integers
# (LabelEncoder sorts the classes, so presumably 'b' -> 0 and 's' -> 1).
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Label'])

# Features: every column except the ones listed here.
non_feature_columns = ['EventId', 'Label', 'Weight']
X = df.drop(columns=non_feature_columns)

# Handle missing values: entries equal to -999.0 are treated as missing and
# replaced by the median of their column.
# NOTE: the medians are fitted on all rows at once, i.e. before any
# cross-validation split is made.
imputer = SimpleImputer(missing_values=-999.0, strategy='median')
X = imputer.fit_transform(X.values)

X.shape

(250000, 30)

In [4]:
# Class balance: the mean of the encoded labels is the fraction of samples
# in class 1 (assuming the labels are encoded as 0/1).
import numpy as np
y.mean()

0.342668

In [5]:
from time import perf_counter, time
from sklearn.model_selection import cross_val_score

def cross_validate(model, cv=3):
    """Cross-validate ``model`` on the notebook-level ``X`` / ``y`` and print a summary.

    Parameters
    ----------
    model : estimator
        Estimator object that is handed to ``cross_val_score``.
    cv : int or cross-validation splitter, default=3
        Forwarded unchanged to ``cross_val_score``. The default keeps the
        original 3-fold behaviour.

    Returns
    -------
    None
        The per-fold scores are printed first, followed by one line with
        their mean, their standard deviation and the elapsed time in seconds.
    """
    # perf_counter() is a monotonic clock, so the measured duration cannot be
    # distorted by system clock adjustments while the folds are running.
    t0 = perf_counter()
    scores = cross_val_score(model, X, y, cv=cv)
    print(scores)
    print("=> Accuracy = %.3f ± %.3f (%.1f sec)"%(np.mean(scores),np.std(scores),perf_counter()-t0))

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with default hyperparameters. The seed is fixed so that repeated
# runs of this cell give the same scores (42 is the seed the XGBoost model
# in this notebook uses as well).
model = AdaBoostClassifier(random_state=42)

cross_validate(model)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# scikit-learn gradient boosting with default hyperparameters. The seed is
# fixed so that repeated runs of this cell give the same scores (42 is the
# seed the XGBoost model in this notebook uses as well).
model = GradientBoostingClassifier(random_state=42)

cross_validate(model)

[0.82921737 0.83049932 0.83273133]
=> Accuracy = 0.831 ± 0.001 (647.3 sec)


In [None]:
from xgboost import XGBClassifier

# XGBoost hyperparameters, collected in one place so they are easy to tweak.
xgb_params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'random_state': 42,
}
model = XGBClassifier(**xgb_params)

cross_validate(model)

[0.83837329 0.83941536 0.84015936]
=> Accuracy = 0.839 ± 0.001 (6.9 sec)


In [None]:
# --->>> Your Turn <<<---
# Change the following hyperparameters:
# - n_estimators
# - learning_rate
# - max_depth
# - gamma
# What do you observe?