In [1]:
import numpy as np
import util
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Single source of randomness for the notebook: seed NumPy's global RNG here
# and reuse the same value wherever an explicit random_state is accepted.
seed = 229
np.random.seed(seed)



In [2]:
# Location of the labelled training data and the fraction held out for testing.
trainfile = 'data/train.csv'
test_size = 0.20

# util.fetch_data is a project helper; it hands back the header, the row ids,
# the feature matrix X and the labels Y.
header, ids, X, Y = util.fetch_data(trainfile)

# Hold out `test_size` of the rows; the split is pinned to `seed` so it is
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# Randomized Search
Use 10% of the training data to run a randomized search over XGBoost hyperparameters, then evaluate the best model found with learning curves on the full train/test split.

In [3]:
import scipy.stats

# Carve out the first 10% of the training rows for the hyper-parameter search.
# Floor division keeps the slice bound an int: with `/` the result is a float
# on Python 3 (or under `from __future__ import division`), and NumPy rejects
# float slice bounds with a TypeError.
m_search = X_train.shape[0] // 10
X_paramsearch = X_train[:m_search, :]
Y_paramsearch = y_train[:m_search]

print(X_paramsearch.shape)

# Distributions the randomized search samples from.
# scipy.stats.uniform(loc, scale) is uniform on [loc, loc + scale].
# NOTE(review): binom(n=10, p=0.6) can (rarely) draw max_depth=0 -- confirm
# that the installed XGBoost handles that value as intended.
param_distros = {'learning_rate': scipy.stats.uniform(loc=0.1, scale=0.1),
                 'min_child_weight': scipy.stats.uniform(loc=0.1, scale=0.9),
                 'max_depth': scipy.stats.binom(n=10, p=0.6),
                 #'gamma': scipy.stats.uniform(loc=0, scale=1),
                 'subsample': scipy.stats.uniform(loc=0.5, scale=0.5),
                 'colsample_bytree': scipy.stats.uniform(loc=0.5, scale=0.5)}
xgb = XGBClassifier(silent=0)

# 9.7s for one candidate
# n_iter=1 evaluates a single sampled parameter set; raise it for a real search.
# No random_state is passed, so candidate sampling draws from NumPy's global
# RNG, which the first cell seeds.
cv_random = RandomizedSearchCV(xgb, n_iter=1, param_distributions=param_distros,
                               scoring=util.gini_scorer, verbose=2, n_jobs=-1)

cv_random.fit(X_paramsearch, Y_paramsearch)

# Best candidate found by the search; used by the cells that follow.
xgb_model = cv_random.best_estimator_

(47616, 57)
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] subsample=0.889730870102, learning_rate=0.173855720703, colsample_bytree=0.54851249975, max_depth=5, min_child_weight=0.322788105024 
[CV] subsample=0.889730870102, learning_rate=0.173855720703, colsample_bytree=0.54851249975, max_depth=5, min_child_weight=0.322788105024 
[CV] subsample=0.889730870102, learning_rate=0.173855720703, colsample_bytree=0.54851249975, max_depth=5, min_child_weight=0.322788105024 
[CV]  subsample=0.889730870102, learning_rate=0.173855720703, colsample_bytree=0.54851249975, max_depth=5, min_child_weight=0.322788105024, total=   5.1s
[CV]  subsample=0.889730870102, learning_rate=0.173855720703, colsample_bytree=0.54851249975, max_depth=5, min_child_weight=0.322788105024, total=   5.1s
[CV]  subsample=0.889730870102, learning_rate=0.173855720703, colsample_bytree=0.54851249975, max_depth=5, min_child_weight=0.322788105024, total=   5.2s


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    5.7s finished


In [None]:
# Display the best estimator selected by the randomized search
# (cv_random.best_estimator_) together with its hyper-parameters.
xgb_model

XGBClassifier(base_score=0.5, colsample_bylevel=1,
       colsample_bytree=0.5485124997498374, gamma=0,
       learning_rate=0.17385572070311084, max_delta_step=0, max_depth=5,
       min_child_weight=0.3227881050237511, missing=None, n_estimators=100,
       nthread=-1, objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=0, subsample=0.88973087010218777)

In [None]:
# util.learning_curves is a project helper defined outside this notebook.
# Judging by its log output it evaluates the model on increasing training-set
# sizes; presumably it returns the train/test Gini scores per size
# (TODO: confirm against util).
traingini, testgini = util.learning_curves(
    xgb_model,
    X_train,
    X_test,
    y_train,
    y_test,
)

Evaluating model on training set size 23808
Evaluating model on training set size 47616
Evaluating model on training set size 71425
Evaluating model on training set size 95233
Evaluating model on training set size 119042
Evaluating model on training set size 142850
Evaluating model on training set size 166659
Evaluating model on training set size 190467
Evaluating model on training set size 214276
Evaluating model on training set size 238084
Evaluating model on training set size 261892
Evaluating model on training set size 285701
Evaluating model on training set size 309509
