In [7]:
import numpy as np
import util
import pickle
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import matplotlib.pyplot as plt
import scipy.stats

In [2]:
# --- Configuration -----------------------------------------------------
seed = 229                      # shared by numpy's global RNG and the split below
trainfile = 'data/train.csv'    # training CSV, relative to the notebook
test_size = 0.20                # fraction of rows held out as the test split

# Seed numpy's global RNG before any data is loaded or sampled.
np.random.seed(seed)

# --- Load and split ----------------------------------------------------
# util.fetch_data is a project helper returning four values
# (presumably column names, row ids, features, labels -- confirm in util).
header, ids, X, Y = util.fetch_data(trainfile)

X_train, X_test, y_train, y_test = train_test_split(
    X, Y,
    test_size=test_size,
    random_state=seed,
)

# Randomized Search
Use 10% of the training data to run a randomized search over XGBoost hyperparameters, then evaluate the best model found on the full train/test split.

In [None]:
# Best model saved by a previous run, if any; stays None when nothing is found.
prev_best_xgb = None

try:
    # Pickle data is binary, so the file must be opened in 'rb' mode, and the
    # context manager guarantees the handle is closed even if unpickling fails.
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file -- only load pickles that this project wrote itself.
    with open('models/xgb.pickle', 'rb') as model_file:
        prev_best_xgb = pickle.load(model_file)
except IOError:
    # No saved model is available (missing or unreadable file): continue
    # without one rather than aborting the notebook.
    pass

In [5]:
# Randomized hyperparameter search on the first 10% of the training split.
# Floor division keeps the row count an integer so it is a valid slice bound
# (true division would produce a float and break the slicing below).
m_search = X_train.shape[0] // 10
X_paramsearch = X_train[:m_search, :]
Y_paramsearch = y_train[:m_search]

print(X_paramsearch.shape)

# Sampling distributions for each tuned parameter.
# scipy.stats.uniform(loc, scale) draws from [loc, loc + scale].
param_distros = {'learning_rate': scipy.stats.uniform(loc=0.1, scale=0.1),      # [0.1, 0.2]
                 'min_child_weight': scipy.stats.uniform(loc=0.1, scale=0.9),   # [0.1, 1.0]
                 # Integer draw in 0..10 centred near 6.
                 # NOTE(review): a draw of 0 is possible (rarely); confirm
                 # XGBoost handles max_depth=0 as intended.
                 'max_depth': scipy.stats.binom(n=10, p=0.6),
                 #'gamma': scipy.stats.uniform(loc=0, scale=1),
                 'subsample': scipy.stats.uniform(loc=0.5, scale=0.5),          # [0.5, 1.0]
                 'colsample_bytree': scipy.stats.uniform(loc=0.5, scale=0.5)}   # [0.5, 1.0]
xgb = XGBClassifier(silent=0)

# 9.7s for one candidate
# random_state pins the sampled candidates so re-running this cell gives the
# same search regardless of how much of the global RNG was consumed before it.
cv_random = RandomizedSearchCV(xgb, n_iter=10, param_distributions=param_distros,
                               scoring=util.gini_scorer, verbose=2, n_jobs=-1,
                               random_state=seed)

cv_random.fit(X_paramsearch, Y_paramsearch)

# Keep the estimator refit with the best-scoring parameter set.
xgb_model = cv_random.best_estimator_

(47616, 57)
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] subsample=0.775326632468, learning_rate=0.175168614473, colsample_bytree=0.755041822007, max_depth=7, min_child_weight=0.666710316511 
[CV] subsample=0.775326632468, learning_rate=0.175168614473, colsample_bytree=0.755041822007, max_depth=7, min_child_weight=0.666710316511 
[CV] subsample=0.775326632468, learning_rate=0.175168614473, colsample_bytree=0.755041822007, max_depth=7, min_child_weight=0.666710316511 
[CV] subsample=0.928857517897, learning_rate=0.15922564382, colsample_bytree=0.808875125905, max_depth=4, min_child_weight=0.401209397303 
[CV]  subsample=0.928857517897, learning_rate=0.15922564382, colsample_bytree=0.808875125905, max_depth=4, min_child_weight=0.401209397303, total=   9.1s
[CV] subsample=0.928857517897, learning_rate=0.15922564382, colsample_bytree=0.808875125905, max_depth=4, min_child_weight=0.401209397303 
[CV]  subsample=0.775326632468, learning_rate=0.175168614473, colsample_byt

[CV]  subsample=0.670978854807, learning_rate=0.130212658437, colsample_bytree=0.737658526392, max_depth=7, min_child_weight=0.368640711953, total=  15.8s
[CV]  subsample=0.549299035411, learning_rate=0.107575690345, colsample_bytree=0.739496051875, max_depth=7, min_child_weight=0.924302083877, total=  15.9s
[CV]  subsample=0.549299035411, learning_rate=0.107575690345, colsample_bytree=0.739496051875, max_depth=7, min_child_weight=0.924302083877, total=  15.3s
[CV]  subsample=0.549299035411, learning_rate=0.107575690345, colsample_bytree=0.739496051875, max_depth=7, min_child_weight=0.924302083877, total=  11.4s


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  1.8min finished


In [8]:
# Take the best estimator found by the randomized search (cv_random) and
# print its representation so the chosen hyperparameters are recorded in
# the notebook output.
xgb_model = cv_random.best_estimator_
print(xgb_model)

XGBClassifier(base_score=0.5, colsample_bylevel=1,
       colsample_bytree=0.65853149706877101, gamma=0,
       learning_rate=0.17834478300405329, max_delta_step=0, max_depth=6,
       min_child_weight=0.92624678262866966, missing=None,
       n_estimators=100, nthread=-1, objective='binary:logistic',
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=0,
       subsample=0.73427510537253016)


In [9]:
# Evaluate the selected model on the train/test split with the project helper.
# It returns three values, unpacked here as the training-set sizes and the
# matching train and test scores (presumably normalized Gini -- confirm in util).
trainsizes, traingini, testgini = util.learning_curves(
    xgb_model,
    X_train, X_test,
    y_train, y_test,
)

Evaluating model on training set size 23808
Evaluating model on training set size 47616
Evaluating model on training set size 71425
Evaluating model on training set size 95233
Evaluating model on training set size 119042
Evaluating model on training set size 142850
Evaluating model on training set size 166659
Evaluating model on training set size 190467
Evaluating model on training set size 214276
Evaluating model on training set size 238084
Evaluating model on training set size 261892
Evaluating model on training set size 285701
Evaluating model on training set size 309509
Evaluating model on training set size 333318
Evaluating model on training set size 357126
Evaluating model on training set size 380935
Evaluating model on training set size 404743
Evaluating model on training set size 428552
Evaluating model on training set size 452360
Evaluating model on training set size 476169


In [None]:
# Dump the raw score series: training scores first, then test scores.
for gini_series in (traingini, testgini):
    print(gini_series)

In [None]:
# Plot train vs. test score against training-set size.
# trainsizes / traingini / testgini come from the util.learning_curves call in
# the earlier cell; repeating that call here would refit the model for every
# training-set size again just to redraw the same numbers.
plt.figure()
plt.plot(trainsizes, traingini, label='train gini')
plt.plot(trainsizes, testgini, label='test gini')
plt.xlabel('Training set size')
plt.ylabel('Normalized Gini coefficient')
plt.title('XGBoost')
plt.legend()
plt.show()