In [ ]:
import numpy as np
import util
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV



In [6]:
trainfile = 'data/train.csv'
header, ids, X, Y = util.fetch_data(trainfile)

seed = 23
test_size = 0.80
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

# Basic Classifier
with data split into training and dev sets

In [None]:
xgb = XGBClassifier(objective='binary:logistic', max_depth=10)
xgb.fit(X_train, y_train)

In [11]:
# gini score for training data
print(util.gini_normalized(y_train, xgb.predict(X_train)))

0.151970485915


In [12]:
# gini score for dev data
print(util.gini_normalized(y_test, xgb.predict(X_test)))

0.00590810496614


In [None]:
xgb.fit(X, Y) # retrain with all of data
util.make_prediction(xgb, 'data/test.csv', 'predictions/xgboost.csv')

# Grid Search
with full data set and 3-fold CV

In [14]:
params_to_try = {'max_depth': range(5,X.shape[1]+1, 10), 'learning_rate':np.linspace(0.05, 0.3, num=5)}
c_validator = GridSearchCV(xgb, params_to_try, scoring=util.gini_scorer)
c_validator.fit(X_train, y_train)

print(util.gini_normalized(y_train, c_validator.predict(X_train)))

0.00694068534706


In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [None]:
def plot_gridsearch(c_validator, param1, param2, param1name, param2name):
    cv_results = c_validator.cv_results_
    test_mean = cv_results['mean_test_score']
    train_mean = cv_results['mean_train_score']
    
    x = np.tile(param2, len(param1))
    y = np.repeat(param1, len(param2))
    
    fig = plt.figure()
    ax1 = fig.add_subplot(111, projection='3d')
    ax1.scatter(x, y, test_mean)
    ax1.title("Grid Search Test Scores")
    ax1.xlabel(param2name)
    ax1.ylabel(param1name)
    
    ax2 = fig.add_subplot(111, projection='3d')
    ax2.scatter(x, y, train_mean)
    ax2.title("Grid Search Training Scores")
    ax2.xlabel(param2name)
    ax2.ylabel(param1name)
    fig.show()

In [None]:
plot_gridsearch(c_validator, params_to_try['learning_rate'], params_to_try['max_depth'], "learning rate", "max_depth")

In [None]:
c_validator.best_params_