In [None]:
import numpy as np
import util
import pickle
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import matplotlib.pyplot as plt
import scipy.stats

In [None]:
# Reproducibility: one RandomState instance, built from a fixed seed, drives
# the train/test split below.
seed = 229
state = np.random.RandomState(seed)

# Read the training CSV; util.fetch_data hands back four values, of which
# X and Y are what the rest of the notebook splits, fits and scores on.
trainfile = 'data/train.csv'
header, ids, X, Y = util.fetch_data(trainfile)

# Hold out 20% of the rows as the test split.
test_size = 0.20
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=test_size, random_state=state
)

# Training: Randomized Search
Use 10% of the training data to conduct a randomized search over XGBoost hyperparameters, then evaluate the best model found.

In [None]:
# Try to resume from the best model saved by a previous run of this notebook.
# best_xgb stays None when no cached model is available, which the search
# cell treats as "no previous best".
best_xgb = None

try:
    # Pickles are binary: the file is written with 'wb', so it must be read
    # with 'rb'. Text mode fails on Python 3 with an error that is not an
    # IOError and would therefore escape the handler below. The `with` block
    # also guarantees the handle is closed.
    # NOTE: only unpickle files this project produced; pickle.load can run
    # arbitrary code from an untrusted file.
    with open('models/xgb_randomized.pickle', 'rb') as xgb_in:
        best_xgb = pickle.load(xgb_in)
except IOError:
    # No readable cache yet (e.g. first run): deliberately start from scratch.
    pass

In [None]:
# Randomized hyper-parameter search on the first tenth of the training split.
# Floor division is required: the result is used as a slice bound, and '/'
# yields a float on Python 3, which cannot index an array.
m_search = X_train.shape[0] // 10
X_paramsearch = X_train[:m_search, :]
Y_paramsearch = y_train[:m_search]

# Sampling distributions for the search. scipy's uniform(loc, scale) draws
# from the interval [loc, loc + scale].
param_distros = {'learning_rate': scipy.stats.uniform(loc=0.1, scale=0.1),
                 'min_child_weight': scipy.stats.uniform(loc=0.1, scale=0.9),
                 # NOTE(review): binom(n=10, p=0.6) can (rarely) draw 0;
                 # confirm how the installed XGBoost treats max_depth=0.
                 'max_depth': scipy.stats.binom(n=10, p=0.6),
                 #'gamma': scipy.stats.uniform(loc=0, scale=1),
                 'subsample': scipy.stats.uniform(loc=0.5, scale=0.5),
                 'colsample_bytree': scipy.stats.uniform(loc=0.5, scale=0.5),
                 }
xgb = XGBClassifier(silent=0)
# The search is intentionally left unseeded so that re-running this cell
# explores new candidates; the best model across runs is kept below.
cv_random = RandomizedSearchCV(xgb, n_iter=500, param_distributions=param_distros,
                               scoring=util.gini_proba_scorer, verbose=2, n_jobs=-1)
cv_random.fit(X_paramsearch, Y_paramsearch)

# Score of the previously saved model (if any) on the search subset.
# NOTE(review): this is scored directly on the search subset, whereas
# cv_random.best_score_ is a cross-validated mean, so if the cached model was
# fit on this same subset the comparison favours it. Consider
# cross-validating the cached model instead.
best_xgb_score = (-float('inf') if best_xgb is None
                  else util.gini_proba_scorer(best_xgb, X_paramsearch, Y_paramsearch))

# Update best estimator
if cv_random.best_score_ > best_xgb_score:
    best_xgb = cv_random.best_estimator_

# Persist whichever model won; `with` closes the file even if dump() raises.
with open('models/xgb_randomized.pickle', 'wb') as xgb_out:
    pickle.dump(best_xgb, xgb_out)

# Evaluation

In [None]:
# Reload the persisted best model so the evaluation cells can run without
# repeating the search. The pickle is written in binary mode ('wb'), so it
# must be opened with 'rb' (text mode fails on Python 3); the `with` block
# closes the handle afterwards.
with open('models/xgb_randomized.pickle', 'rb') as xgb_in:
    best_xgb = pickle.load(xgb_in)

In [None]:
# Compute learning-curve data for the selected model, scored with the
# project's Gini scorer, then unpack the three returned series.
curve_results = util.learning_curves(
    best_xgb, X_train, X_test, y_train, y_test,
    scorer=util.gini_proba_scorer,
)
trainsizes, traingini, testgini = curve_results

In [None]:
# Show the train scores, a blank separator line, then the test scores.
for gini_scores in (traingini, '', testgini):
    print(gini_scores)

In [None]:
# Draw train and test Gini against training-set size on one set of axes,
# then save the figure as EPS.
fig, ax = plt.subplots()
for scores, series_label in ((traingini, 'train gini'), (testgini, 'test gini')):
    ax.plot(trainsizes, scores, label=series_label)
ax.set_xlabel('Training set size')
ax.set_ylabel('Normalized Gini coefficient')
ax.set_title('XGBoost with RandomizedSearchCV')
ax.legend()
fig.savefig('figures/learning_curves/xgboost_randomized.eps')

In [None]:
# Display any open matplotlib figures (here, the learning-curve plot).
plt.show()

# Output

In [None]:
# Refit the chosen model on the full data set (train and test splits
# together) before producing the submission. The score printed below is
# computed on the same data the model was just fit on, so it is an
# in-sample figure.
best_xgb.fit(X, Y)
full_data_gini = util.gini_proba_scorer(best_xgb, X, Y)
print(full_data_gini)

In [None]:
# Run the refit model on the test CSV and write its predictions to the
# output CSV, using the prediction method supplied by util.proba_method.
test_csv = 'data/test.csv'
prediction_csv = 'predictions/xgboost_randomized_171212_01.csv'
util.make_prediction(best_xgb, test_csv, prediction_csv,
                     predict_method=util.proba_method(best_xgb))