In [1]:
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

import util

# Single seed value: applied to NumPy's global RNG here and passed as
# `random_state` to the train/test split further down.
seed = 229
np.random.seed(seed)



In [None]:
# Read the training CSV through the project helper, which hands back the
# header, the row ids, the feature matrix X and the labels Y.
trainfile = 'data/train.csv'
header, ids, X, Y = util.fetch_data(trainfile)

# Hold out 20% of the rows as a test set; the split is seeded so it is
# identical on every run.
test_size = 0.20
X_train, X_test, y_train, y_test = train_test_split(
    X, Y,
    test_size=test_size,
    random_state=seed,
)

# Randomized Search
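Use the first 10% of the training split to run a randomized search over XGBoost hyperparameters, then evaluate the best estimator with learning curves on the full train/test split.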

In [None]:
# Randomized hyperparameter search for XGBoost on the first 10% of the
# training split. Produces `cv_random` (the fitted search object) and
# `xgb_model` (its best estimator).

# Floor division keeps the row count an int: with `/` Python 3 produces a
# float, and using a float as a slice bound raises TypeError.
m_search = X_train.shape[0] // 10
X_paramsearch = X_train[:m_search, :]
Y_paramsearch = y_train[:m_search]

print(X_paramsearch.shape)

# Sampling distributions for the tuned parameters.
# scipy.stats.uniform(loc, scale) is uniform on [loc, loc + scale].
param_distros = {
    'learning_rate': scipy.stats.uniform(loc=0.1, scale=0.1),      # [0.1, 0.2]
    'min_child_weight': scipy.stats.uniform(loc=0.1, scale=0.9),   # [0.1, 1.0]
    # Integer draws in 0..10 centred on 6.
    # NOTE(review): this distribution can (rarely) draw 0 -- confirm that
    # max_depth=0 is acceptable to the installed XGBoost version.
    'max_depth': scipy.stats.binom(n=10, p=0.6),
    #'gamma': scipy.stats.uniform(loc=0, scale=1),
    'subsample': scipy.stats.uniform(loc=0.5, scale=0.5),          # [0.5, 1.0]
    'colsample_bytree': scipy.stats.uniform(loc=0.5, scale=0.5),   # [0.5, 1.0]
}
xgb = XGBClassifier(silent=0)

# 9.7s for one candidate
# random_state pins the sampled candidates so that re-running this cell
# evaluates the same 1000 parameter settings.
cv_random = RandomizedSearchCV(
    xgb,
    n_iter=1000,
    param_distributions=param_distros,
    scoring=util.gini_scorer,
    verbose=2,
    n_jobs=-1,
    random_state=seed,
)

cv_random.fit(X_paramsearch, Y_paramsearch)
xgb_model = cv_random.best_estimator_

(47616, 57)
Fitting 3 folds for each of 1000 candidates, totalling 3000 fits
[CV] subsample=0.889730870102, learning_rate=0.173855720703, colsample_bytree=0.54851249975, max_depth=5, min_child_weight=0.322788105024 
[CV] subsample=0.889730870102, learning_rate=0.173855720703, colsample_bytree=0.54851249975, max_depth=5, min_child_weight=0.322788105024 
[CV] subsample=0.889730870102, learning_rate=0.173855720703, colsample_bytree=0.54851249975, max_depth=5, min_child_weight=0.322788105024 
[CV] subsample=0.982999550166, learning_rate=0.137186577101, colsample_bytree=0.735231720695, max_depth=5, min_child_weight=0.418744731918 
[CV]  subsample=0.889730870102, learning_rate=0.173855720703, colsample_bytree=0.54851249975, max_depth=5, min_child_weight=0.322788105024, total=   7.4s
[CV] subsample=0.982999550166, learning_rate=0.137186577101, colsample_bytree=0.735231720695, max_depth=5, min_child_weight=0.418744731918 
[CV]  subsample=0.889730870102, learning_rate=0.173855720703, colsample_

[CV]  subsample=0.561122243029, learning_rate=0.143046835273, colsample_bytree=0.52923777809, max_depth=6, min_child_weight=0.973872991351, total=  11.9s
[CV] subsample=0.533261059725, learning_rate=0.17718667901, colsample_bytree=0.630179376688, max_depth=3, min_child_weight=0.875647373597 
[CV]  subsample=0.777015072759, learning_rate=0.191522506061, colsample_bytree=0.658817213399, max_depth=7, min_child_weight=0.724202968472, total=  16.3s
[CV] subsample=0.533261059725, learning_rate=0.17718667901, colsample_bytree=0.630179376688, max_depth=3, min_child_weight=0.875647373597 
[CV]  subsample=0.561122243029, learning_rate=0.143046835273, colsample_bytree=0.52923777809, max_depth=6, min_child_weight=0.973872991351, total=  14.0s
[CV] subsample=0.533261059725, learning_rate=0.17718667901, colsample_bytree=0.630179376688, max_depth=3, min_child_weight=0.875647373597 
[CV]  subsample=0.561122243029, learning_rate=0.143046835273, colsample_bytree=0.52923777809, max_depth=6, min_child_wei

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.7min


[CV]  subsample=0.801945281925, learning_rate=0.154408375081, colsample_bytree=0.564876245659, max_depth=7, min_child_weight=0.317725550604, total=  13.6s
[CV] subsample=0.809532949307, learning_rate=0.13793884758, colsample_bytree=0.5909635491, max_depth=6, min_child_weight=0.828339309912 
[CV]  subsample=0.801945281925, learning_rate=0.154408375081, colsample_bytree=0.564876245659, max_depth=7, min_child_weight=0.317725550604, total=  13.9s
[CV] subsample=0.809532949307, learning_rate=0.13793884758, colsample_bytree=0.5909635491, max_depth=6, min_child_weight=0.828339309912 
[CV]  subsample=0.809532949307, learning_rate=0.13793884758, colsample_bytree=0.5909635491, max_depth=6, min_child_weight=0.828339309912, total=  12.0s
[CV]  subsample=0.801945281925, learning_rate=0.154408375081, colsample_bytree=0.564876245659, max_depth=7, min_child_weight=0.317725550604, total=  13.9s
[CV] subsample=0.956398038061, learning_rate=0.160424324996, colsample_bytree=0.597792021882, max_depth=8, mi

[CV]  subsample=0.82926808507, learning_rate=0.116508635242, colsample_bytree=0.687990741379, max_depth=2, min_child_weight=0.916433283003, total=   6.6s
[CV] subsample=0.986374147801, learning_rate=0.138610669489, colsample_bytree=0.720476399962, max_depth=7, min_child_weight=0.596487831092 
[CV]  subsample=0.664026972365, learning_rate=0.195744262214, colsample_bytree=0.653144811908, max_depth=8, min_child_weight=0.563616119745, total=  17.7s
[CV] subsample=0.640801766241, learning_rate=0.140289752909, colsample_bytree=0.602312306636, max_depth=3, min_child_weight=0.982119376533 
[CV]  subsample=0.640801766241, learning_rate=0.140289752909, colsample_bytree=0.602312306636, max_depth=3, min_child_weight=0.982119376533, total=   5.7s
[CV] subsample=0.640801766241, learning_rate=0.140289752909, colsample_bytree=0.602312306636, max_depth=3, min_child_weight=0.982119376533 
[CV]  subsample=0.986374147801, learning_rate=0.138610669489, colsample_bytree=0.720476399962, max_depth=7, min_chil

[CV]  subsample=0.712528724392, learning_rate=0.185407640793, colsample_bytree=0.792774596976, max_depth=4, min_child_weight=0.867925888153, total=  14.3s
[CV] subsample=0.838780464982, learning_rate=0.113079881248, colsample_bytree=0.966509169639, max_depth=5, min_child_weight=0.790786805381 
[CV]  subsample=0.679845810832, learning_rate=0.154326019624, colsample_bytree=0.689102190004, max_depth=5, min_child_weight=0.982427942053, total=  15.6s
[CV] subsample=0.838780464982, learning_rate=0.113079881248, colsample_bytree=0.966509169639, max_depth=5, min_child_weight=0.790786805381 
[CV]  subsample=0.679845810832, learning_rate=0.154326019624, colsample_bytree=0.689102190004, max_depth=5, min_child_weight=0.982427942053, total=  15.3s
[CV] subsample=0.838780464982, learning_rate=0.113079881248, colsample_bytree=0.966509169639, max_depth=5, min_child_weight=0.790786805381 
[CV]  subsample=0.679845810832, learning_rate=0.154326019624, colsample_bytree=0.689102190004, max_depth=5, min_chi

[CV]  subsample=0.674005481638, learning_rate=0.156419684677, colsample_bytree=0.886360558524, max_depth=8, min_child_weight=0.474253032078, total=  20.9s
[CV] subsample=0.837835048755, learning_rate=0.122572133757, colsample_bytree=0.705239868182, max_depth=5, min_child_weight=0.389013903525 
[CV]  subsample=0.642446584853, learning_rate=0.128790512765, colsample_bytree=0.951406375269, max_depth=3, min_child_weight=0.201064128751, total=   8.7s
[CV] subsample=0.837835048755, learning_rate=0.122572133757, colsample_bytree=0.705239868182, max_depth=5, min_child_weight=0.389013903525 
[CV]  subsample=0.642446584853, learning_rate=0.128790512765, colsample_bytree=0.951406375269, max_depth=3, min_child_weight=0.201064128751, total=   8.9s
[CV] subsample=0.921087991869, learning_rate=0.104619219356, colsample_bytree=0.953528944288, max_depth=5, min_child_weight=0.133733737884 
[CV]  subsample=0.837835048755, learning_rate=0.122572133757, colsample_bytree=0.705239868182, max_depth=5, min_chi

[CV]  subsample=0.906118712523, learning_rate=0.163327978557, colsample_bytree=0.921813158736, max_depth=5, min_child_weight=0.682443203531, total=  11.3s
[CV] subsample=0.70346977932, learning_rate=0.113690295187, colsample_bytree=0.827861236154, max_depth=6, min_child_weight=0.889394827625 
[CV]  subsample=0.906118712523, learning_rate=0.163327978557, colsample_bytree=0.921813158736, max_depth=5, min_child_weight=0.682443203531, total=  11.5s
[CV] subsample=0.678596649316, learning_rate=0.155326601316, colsample_bytree=0.802146628293, max_depth=7, min_child_weight=0.609632923774 
[CV]  subsample=0.70346977932, learning_rate=0.113690295187, colsample_bytree=0.827861236154, max_depth=6, min_child_weight=0.889394827625, total=  13.6s
[CV] subsample=0.678596649316, learning_rate=0.155326601316, colsample_bytree=0.802146628293, max_depth=7, min_child_weight=0.609632923774 
[CV]  subsample=0.70346977932, learning_rate=0.113690295187, colsample_bytree=0.827861236154, max_depth=6, min_child_

[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  8.2min


[CV]  subsample=0.678040090946, learning_rate=0.156102966497, colsample_bytree=0.874973031054, max_depth=5, min_child_weight=0.557258853067, total=  12.9s
[CV] subsample=0.922584509597, learning_rate=0.177201673878, colsample_bytree=0.645972842052, max_depth=7, min_child_weight=0.775338822964 
[CV]  subsample=0.678040090946, learning_rate=0.156102966497, colsample_bytree=0.874973031054, max_depth=5, min_child_weight=0.557258853067, total=  12.5s
[CV] subsample=0.617467392502, learning_rate=0.173732056798, colsample_bytree=0.531852864139, max_depth=4, min_child_weight=0.529951084511 
[CV]  subsample=0.922584509597, learning_rate=0.177201673878, colsample_bytree=0.645972842052, max_depth=7, min_child_weight=0.775338822964, total=  12.2s
[CV] subsample=0.617467392502, learning_rate=0.173732056798, colsample_bytree=0.531852864139, max_depth=4, min_child_weight=0.529951084511 
[CV]  subsample=0.922584509597, learning_rate=0.177201673878, colsample_bytree=0.645972842052, max_depth=7, min_chi

[CV]  subsample=0.602044804011, learning_rate=0.107704380385, colsample_bytree=0.741999662908, max_depth=5, min_child_weight=0.219388578863, total=  10.5s
[CV] subsample=0.606307956298, learning_rate=0.11963776228, colsample_bytree=0.532697652385, max_depth=7, min_child_weight=0.849208031747 
[CV]  subsample=0.711825992564, learning_rate=0.105949159125, colsample_bytree=0.815538386919, max_depth=8, min_child_weight=0.207072128068, total=  18.3s
[CV] subsample=0.606307956298, learning_rate=0.11963776228, colsample_bytree=0.532697652385, max_depth=7, min_child_weight=0.849208031747 
[CV]  subsample=0.606307956298, learning_rate=0.11963776228, colsample_bytree=0.532697652385, max_depth=7, min_child_weight=0.849208031747, total=  13.0s
[CV] subsample=0.606307956298, learning_rate=0.11963776228, colsample_bytree=0.532697652385, max_depth=7, min_child_weight=0.849208031747 
[CV]  subsample=0.711825992564, learning_rate=0.105949159125, colsample_bytree=0.815538386919, max_depth=8, min_child_w

[CV]  subsample=0.768361548557, learning_rate=0.152806784273, colsample_bytree=0.841886791805, max_depth=5, min_child_weight=0.741035956203, total=  12.7s
[CV] subsample=0.995298608702, learning_rate=0.116367200414, colsample_bytree=0.749497318359, max_depth=7, min_child_weight=0.337422094379 
[CV]  subsample=0.768361548557, learning_rate=0.152806784273, colsample_bytree=0.841886791805, max_depth=5, min_child_weight=0.741035956203, total=  11.9s
[CV]  subsample=0.768361548557, learning_rate=0.152806784273, colsample_bytree=0.841886791805, max_depth=5, min_child_weight=0.741035956203, total=  12.1s
[CV] subsample=0.995298608702, learning_rate=0.116367200414, colsample_bytree=0.749497318359, max_depth=7, min_child_weight=0.337422094379 
[CV] subsample=0.517963109656, learning_rate=0.193995399475, colsample_bytree=0.575911788142, max_depth=5, min_child_weight=0.137969394778 
[CV]  subsample=0.995298608702, learning_rate=0.116367200414, colsample_bytree=0.749497318359, max_depth=7, min_chi

[CV]  subsample=0.798018583074, learning_rate=0.103819956492, colsample_bytree=0.987697751988, max_depth=4, min_child_weight=0.962260757894, total=  12.4s
[CV] subsample=0.63778406659, learning_rate=0.140588318571, colsample_bytree=0.948018059626, max_depth=5, min_child_weight=0.843116187809 
[CV]  subsample=0.798018583074, learning_rate=0.103819956492, colsample_bytree=0.987697751988, max_depth=4, min_child_weight=0.962260757894, total=  11.6s
[CV] subsample=0.763840094645, learning_rate=0.104068665754, colsample_bytree=0.898572507966, max_depth=7, min_child_weight=0.166559775427 
[CV]  subsample=0.63778406659, learning_rate=0.140588318571, colsample_bytree=0.948018059626, max_depth=5, min_child_weight=0.843116187809, total=  14.7s
[CV] subsample=0.763840094645, learning_rate=0.104068665754, colsample_bytree=0.898572507966, max_depth=7, min_child_weight=0.166559775427 
[CV]  subsample=0.63778406659, learning_rate=0.140588318571, colsample_bytree=0.948018059626, max_depth=5, min_child_

[CV]  subsample=0.791298734167, learning_rate=0.168112480913, colsample_bytree=0.594965871172, max_depth=6, min_child_weight=0.494217232405, total=   9.3s
[CV] subsample=0.648635095042, learning_rate=0.141683030923, colsample_bytree=0.960316491539, max_depth=7, min_child_weight=0.334068812783 
[CV]  subsample=0.876253694029, learning_rate=0.127027579596, colsample_bytree=0.604495919764, max_depth=9, min_child_weight=0.414208439034, total=  13.9s
[CV] subsample=0.648635095042, learning_rate=0.141683030923, colsample_bytree=0.960316491539, max_depth=7, min_child_weight=0.334068812783 
[CV]  subsample=0.791298734167, learning_rate=0.168112480913, colsample_bytree=0.594965871172, max_depth=6, min_child_weight=0.494217232405, total=   9.6s
[CV] subsample=0.648635095042, learning_rate=0.141683030923, colsample_bytree=0.960316491539, max_depth=7, min_child_weight=0.334068812783 
[CV]  subsample=0.791298734167, learning_rate=0.168112480913, colsample_bytree=0.594965871172, max_depth=6, min_chi

In [None]:
# Display the estimator chosen by the randomized search
# (assigned from cv_random.best_estimator_ in the search cell).
xgb_model

In [None]:
# Run the project's learning-curve helper on the tuned model; the three
# returned sequences are plotted later as train/test Gini against
# training-set size.
trainsizes, traingini, testgini = util.learning_curves(
    xgb_model,
    X_train, X_test,
    y_train, y_test,
)

In [None]:
# Dump the raw learning-curve scores, train first and then test.
for gini_curve in (traingini, testgini):
    print(gini_curve)

In [None]:
# Plot train vs. test Gini against training-set size.
# `trainsizes`, `traingini` and `testgini` come from the
# util.learning_curves(...) cell earlier in the notebook; the identical call
# is not repeated here so the curves are only computed once.
fig, ax = plt.subplots()
ax.plot(trainsizes, traingini, label='train gini')
ax.plot(trainsizes, testgini, label='test gini')
ax.set_xlabel('Training set size')
ax.set_ylabel('Normalized Gini coefficient')
ax.set_title('XGBoost')
ax.legend()
plt.show()