In [2]:
import numpy as np
import pandas as pd
import sklearn.cross_validation
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from IPython.display import display, HTML

%matplotlib inline

In [3]:
# Read the training and test CSVs; the test file's 'id' column is used as
# its row index.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv', index_col='id')

# Preview the first five training rows.
display(train.head())

# Split the training frame into the target ("dependent" variable, ACTION)
# and the inputs ("independent" variables, every other column).
X_train = train.drop("ACTION", axis=1)
y_train = train["ACTION"]
print("Number of instances:{}".format(len(train)))

from collections import Counter

def tally_predictions(predictions):
    """Print how many predictions differ from 1, then how many equal 1.

    Args:
        predictions: iterable of predicted labels; every element is compared
            against 1 with ``==``.

    Prints two lines: first the number of elements that are not equal to 1
    (tallied under key 0), then the number of elements equal to 1.
    Returns None.
    """
    # Bucket each label as 1 or 0 in a single pass; missing keys read as 0.
    tally = Counter(1 if label == 1 else 0 for label in predictions)
    print(tally[0])
    print(tally[1])

Unnamed: 0,ACTION,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,1,39353,85475,117961,118300,123472,117905,117906,290919,117908
1,1,17183,1540,117961,118343,123125,118536,118536,308574,118539
2,1,36724,14457,118219,118220,117884,117879,267952,19721,117880
3,1,36135,5396,117961,118343,119993,118321,240983,290919,118322
4,1,42680,5905,117929,117930,119569,119323,123932,19793,119325


Number of instances:32769


## XGBoost Parameter Tuning:
(Ordered from most important/succinct to least):
- http://datascience.stackexchange.com/questions/9364/hypertuning-xgboost-parameters
- http://www.slideshare.net/OwenZhang2/tips-for-data-science-competitions
- http://www.slideshare.net/odsc/owen-zhangopen-sourcetoolsanddscompetitions1
- http://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

In [5]:
import xgboost as xgb
from sklearn.grid_search import RandomizedSearchCV

# Randomized search over XGBoost hyper-parameters: 50 of the 72 possible
# combinations below are sampled, each scored by 10-fold cross-validated ROC AUC.
parameters = {
    'max_depth': [6, 7, 8],
    'learning_rate': [0.1, 0.01],
    'n_estimators': [100, 200],
    # 1/sqrt(0.05) ~= 4.47 and 1/sqrt(0.95) ~= 1.03.
    # NOTE(review): 0.05 / 0.95 presumably approximate the class proportions -- confirm.
    'min_child_weight': [1/(0.05**(1/2)), 1/(0.95**(1/2))],
    'colsample_bytree': [0.3, 0.4, 0.5]
}

# random_state pins which combinations get sampled, so re-running the cell
# evaluates the same candidates instead of a fresh random subset.
xg_clf = RandomizedSearchCV(xgb.XGBClassifier(), parameters, n_iter=50, cv=10, n_jobs=-1,
                            scoring='roc_auc', random_state=0)
xg_clf.fit(X_train, y_train)

# Report the best combination, then the mean score +/- two standard
# deviations of the per-fold scores for every sampled candidate.
print("Best parameter set found on development set with cv=10:\n")
print(xg_clf.best_params_)
print()
for params, mean_score, scores in xg_clf.grid_scores_:
    print("{0:.3f} (+/-{1:.03f}) for {2}".format(mean_score, scores.std() * 2, params))
print()

Best parameter set found on development set with cv=10:

{'min_child_weight': 1.0259783520851542, 'max_depth': 8, 'n_estimators': 200, 'learning_rate': 0.1, 'colsample_bytree': 0.4}

0.778 (+/-0.039) for {'min_child_weight': 4.47213595499958, 'max_depth': 8, 'n_estimators': 100, 'learning_rate': 0.01, 'colsample_bytree': 0.3}
0.791 (+/-0.035) for {'min_child_weight': 1.0259783520851542, 'max_depth': 7, 'n_estimators': 200, 'learning_rate': 0.01, 'colsample_bytree': 0.3}
0.840 (+/-0.024) for {'min_child_weight': 1.0259783520851542, 'max_depth': 8, 'n_estimators': 100, 'learning_rate': 0.1, 'colsample_bytree': 0.3}
0.830 (+/-0.027) for {'min_child_weight': 1.0259783520851542, 'max_depth': 7, 'n_estimators': 100, 'learning_rate': 0.1, 'colsample_bytree': 0.3}
0.765 (+/-0.041) for {'min_child_weight': 1.0259783520851542, 'max_depth': 7, 'n_estimators': 100, 'learning_rate': 0.01, 'colsample_bytree': 0.3}
0.793 (+/-0.037) for {'min_child_weight': 4.47213595499958, 'max_depth': 8, 'n_estimat

In [7]:
import xgboost as xgb
from sklearn.grid_search import GridSearchCV

# Exhaustive grid search over tree depth, learning rate, tree count,
# min_child_weight and column subsampling; every candidate is scored by
# 10-fold cross-validated ROC AUC.
parameters = {
    'max_depth': [8, 9],
    'learning_rate': [0.3, 0.4],
    'n_estimators': [200, 250, 300],
    'min_child_weight': [1, 1/(0.95**(1/2))],  # second value is ~1.026
    'colsample_bytree': [0.5, 0.6]
}

xg_clf = GridSearchCV(
    estimator=xgb.XGBClassifier(),
    param_grid=parameters,
    scoring='roc_auc',
    cv=10,
    n_jobs=-1,
)
xg_clf.fit(X_train, y_train)

# Best combination first, then mean score +/- two standard deviations of
# the per-fold scores for each candidate.
print("Best parameter set found on development set with cv=10:\n")
print(xg_clf.best_params_)
print()
for candidate, cv_mean, fold_scores in xg_clf.grid_scores_:
    print("{0:.3f} (+/-{1:.03f}) for {2}".format(cv_mean, 2 * fold_scores.std(), candidate))
print()

Best parameter set found on development set with cv=10:

{'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 200, 'learning_rate': 0.3, 'max_depth': 8}

0.864 (+/-0.032) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 200, 'learning_rate': 0.3, 'max_depth': 8}
0.864 (+/-0.032) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 250, 'learning_rate': 0.3, 'max_depth': 8}
0.863 (+/-0.032) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 300, 'learning_rate': 0.3, 'max_depth': 8}
0.864 (+/-0.032) for {'min_child_weight': 1.0259783520851542, 'colsample_bytree': 0.5, 'n_estimators': 200, 'learning_rate': 0.3, 'max_depth': 8}
0.864 (+/-0.032) for {'min_child_weight': 1.0259783520851542, 'colsample_bytree': 0.5, 'n_estimators': 250, 'learning_rate': 0.3, 'max_depth': 8}
0.863 (+/-0.031) for {'min_child_weight': 1.0259783520851542, 'colsample_bytree': 0.5, 'n_estimators': 300, 'learning_rate': 0.3, 'max_depth': 8}
0.863 (+/

In [8]:
import xgboost as xgb
from sklearn.grid_search import GridSearchCV

# Grid search varying only the number of trees (120-200 in steps of 20);
# all other hyper-parameters are held at a single value. Candidates are
# scored by 10-fold cross-validated ROC AUC.
parameters = {
    'max_depth': [8],
    'learning_rate': [0.3],
    'n_estimators': [120, 140, 160, 180, 200],
    'min_child_weight': [1],
    'colsample_bytree': [0.5]
}

xg_clf = GridSearchCV(
    estimator=xgb.XGBClassifier(),
    param_grid=parameters,
    scoring='roc_auc',
    cv=10,
    n_jobs=-1,
)
xg_clf.fit(X_train, y_train)

# Best combination first, then mean score +/- two standard deviations of
# the per-fold scores for each candidate.
print("Best parameter set found on development set with cv=10:\n")
print(xg_clf.best_params_)
print()
for candidate, cv_mean, fold_scores in xg_clf.grid_scores_:
    print("{0:.3f} (+/-{1:.03f}) for {2}".format(cv_mean, 2 * fold_scores.std(), candidate))
print()

Best parameter set found on development set with cv=10:

{'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 160, 'learning_rate': 0.3, 'max_depth': 8}

0.863 (+/-0.030) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 120, 'learning_rate': 0.3, 'max_depth': 8}
0.865 (+/-0.030) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 140, 'learning_rate': 0.3, 'max_depth': 8}
0.866 (+/-0.030) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 160, 'learning_rate': 0.3, 'max_depth': 8}
0.865 (+/-0.030) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 180, 'learning_rate': 0.3, 'max_depth': 8}
0.864 (+/-0.032) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 200, 'learning_rate': 0.3, 'max_depth': 8}



In [9]:
import xgboost as xgb
from sklearn.grid_search import GridSearchCV

# Grid search over a narrow band of tree counts (145-165 in steps of 5)
# crossed with two learning rates; depth, min_child_weight and column
# subsampling are fixed. Scored by 10-fold cross-validated ROC AUC.
parameters = {
    'max_depth': [8],
    'learning_rate': [0.2, 0.3],
    'n_estimators': [145, 150, 155, 160, 165],
    'min_child_weight': [1],
    'colsample_bytree': [0.5]
}

xg_clf = GridSearchCV(
    estimator=xgb.XGBClassifier(),
    param_grid=parameters,
    scoring='roc_auc',
    cv=10,
    n_jobs=-1,
)
xg_clf.fit(X_train, y_train)

# Best combination first, then mean score +/- two standard deviations of
# the per-fold scores for each candidate.
print("Best parameter set found on development set with cv=10:\n")
print(xg_clf.best_params_)
print()
for candidate, cv_mean, fold_scores in xg_clf.grid_scores_:
    print("{0:.3f} (+/-{1:.03f}) for {2}".format(cv_mean, 2 * fold_scores.std(), candidate))
print()

Best parameter set found on development set with cv=10:

{'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}

0.865 (+/-0.030) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 145, 'learning_rate': 0.3, 'max_depth': 8}
0.866 (+/-0.030) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 150, 'learning_rate': 0.3, 'max_depth': 8}
0.866 (+/-0.029) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}
0.866 (+/-0.030) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 160, 'learning_rate': 0.3, 'max_depth': 8}
0.866 (+/-0.030) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 165, 'learning_rate': 0.3, 'max_depth': 8}



In [12]:
import xgboost as xgb
from sklearn.grid_search import GridSearchCV

# Grid search varying only the learning rate (0.29, 0.3, 0.31) with every
# other hyper-parameter fixed. Scored by 10-fold cross-validated ROC AUC.
parameters = {
    'max_depth': [8],
    'learning_rate': [0.29, 0.3, 0.31],
    'n_estimators': [155],
    'min_child_weight': [1],
    'colsample_bytree': [0.5]
}

xg_clf = GridSearchCV(
    estimator=xgb.XGBClassifier(),
    param_grid=parameters,
    scoring='roc_auc',
    cv=10,
    n_jobs=-1,
)
xg_clf.fit(X_train, y_train)

# Best combination first, then mean score +/- two standard deviations of
# the per-fold scores for each candidate.
print("Best parameter set found on development set with cv=10:\n")
print(xg_clf.best_params_)
print()
for candidate, cv_mean, fold_scores in xg_clf.grid_scores_:
    print("{0:.3f} (+/-{1:.03f}) for {2}".format(cv_mean, 2 * fold_scores.std(), candidate))
print()

Best parameter set found on development set with cv=10:

{'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}

0.863 (+/-0.029) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.29, 'max_depth': 8}
0.866 (+/-0.029) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}
0.865 (+/-0.029) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.31, 'max_depth': 8}



In [15]:
import xgboost as xgb
from sklearn.grid_search import GridSearchCV

# Grid search varying only min_child_weight (0.99, 0.995, 1) with every
# other hyper-parameter fixed. Scored by 10-fold cross-validated ROC AUC.
parameters = {
    'max_depth': [8],
    'learning_rate': [0.3],
    'n_estimators': [155],
    'min_child_weight': [0.99, 0.995, 1],
    'colsample_bytree': [0.5]
}

xg_clf = GridSearchCV(
    estimator=xgb.XGBClassifier(),
    param_grid=parameters,
    scoring='roc_auc',
    cv=10,
    n_jobs=-1,
)
xg_clf.fit(X_train, y_train)

# Best combination first, then mean score +/- two standard deviations of
# the per-fold scores for each candidate.
print("Best parameter set found on development set with cv=10:\n")
print(xg_clf.best_params_)
print()
for candidate, cv_mean, fold_scores in xg_clf.grid_scores_:
    print("{0:.3f} (+/-{1:.03f}) for {2}".format(cv_mean, 2 * fold_scores.std(), candidate))
print()

Best parameter set found on development set with cv=10:

{'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}

0.862 (+/-0.033) for {'min_child_weight': 0.99, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}
0.863 (+/-0.031) for {'min_child_weight': 0.995, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}
0.866 (+/-0.029) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}



In [16]:
import xgboost as xgb
from sklearn.grid_search import GridSearchCV

# Grid search varying only colsample_bytree from 0.5 up to 1 with every
# other hyper-parameter fixed. Scored by 10-fold cross-validated ROC AUC.
parameters = {
    'max_depth': [8],
    'learning_rate': [0.3],
    'n_estimators': [155],
    'min_child_weight': [1],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1]
}

xg_clf = GridSearchCV(
    estimator=xgb.XGBClassifier(),
    param_grid=parameters,
    scoring='roc_auc',
    cv=10,
    n_jobs=-1,
)
xg_clf.fit(X_train, y_train)

# Best combination first, then mean score +/- two standard deviations of
# the per-fold scores for each candidate.
print("Best parameter set found on development set with cv=10:\n")
print(xg_clf.best_params_)
print()
for candidate, cv_mean, fold_scores in xg_clf.grid_scores_:
    print("{0:.3f} (+/-{1:.03f}) for {2}".format(cv_mean, 2 * fold_scores.std(), candidate))
print()

Best parameter set found on development set with cv=10:

{'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}

0.866 (+/-0.029) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}
0.860 (+/-0.029) for {'min_child_weight': 1, 'colsample_bytree': 0.6, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}
0.858 (+/-0.032) for {'min_child_weight': 1, 'colsample_bytree': 0.7, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}
0.858 (+/-0.035) for {'min_child_weight': 1, 'colsample_bytree': 0.8, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}
0.854 (+/-0.042) for {'min_child_weight': 1, 'colsample_bytree': 0.9, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}
0.854 (+/-0.039) for {'min_child_weight': 1, 'colsample_bytree': 1, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}



In [17]:
import xgboost as xgb
from sklearn.grid_search import GridSearchCV

# Grid search varying only colsample_bytree over 0.3, 0.4 and 0.5 with every
# other hyper-parameter fixed. Scored by 10-fold cross-validated ROC AUC.
parameters = {
    'max_depth': [8],
    'learning_rate': [0.3],
    'n_estimators': [155],
    'min_child_weight': [1],
    'colsample_bytree': [0.3, 0.4, 0.5]
}

xg_clf = GridSearchCV(
    estimator=xgb.XGBClassifier(),
    param_grid=parameters,
    scoring='roc_auc',
    cv=10,
    n_jobs=-1,
)
xg_clf.fit(X_train, y_train)

# Best combination first, then mean score +/- two standard deviations of
# the per-fold scores for each candidate.
print("Best parameter set found on development set with cv=10:\n")
print(xg_clf.best_params_)
print()
for candidate, cv_mean, fold_scores in xg_clf.grid_scores_:
    print("{0:.3f} (+/-{1:.03f}) for {2}".format(cv_mean, 2 * fold_scores.std(), candidate))
print()

Best parameter set found on development set with cv=10:

{'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}

0.859 (+/-0.030) for {'min_child_weight': 1, 'colsample_bytree': 0.3, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}
0.862 (+/-0.034) for {'min_child_weight': 1, 'colsample_bytree': 0.4, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}
0.866 (+/-0.029) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}



In [20]:
# Per-class probabilities for every test row from the current search object.
predictions = xg_clf.predict_proba(test)

# Spit out predictions to a file.
# Each row is labelled with the id taken from the test frame's index rather
# than a synthetic 1..N counter, so the Id column stays correct even if the
# ids in test.csv are not consecutive or do not start at 1.
pred_write = zip(test.index, predictions)
with open('output/xgb_155trees_colsampletree.5_learningrate0.3_maxdepth8.csv', 'w') as f:
    f.write('Id,Action\n')
    for instance, prediction in pred_write:
        # Column 1 of predict_proba is written as the Action score
        # (presumably the probability of ACTION == 1 -- TODO confirm class order).
        f.write('{},{}\n'.format(instance, prediction[1]))

In [22]:
# Grid search crossing tree depth (6-8), tree count (130-165 in steps of 5)
# and colsample_bytree (0.3-0.5); learning rate and min_child_weight are
# fixed. Scored by 10-fold cross-validated ROC AUC.
parameters = {
    'max_depth': [6, 7, 8],
    'learning_rate': [0.3],
    'n_estimators': [130, 135, 140, 145, 150, 155, 160, 165],
    'min_child_weight': [1],
    'colsample_bytree': [0.3, 0.4, 0.5]
}

xg_clf = GridSearchCV(
    estimator=xgb.XGBClassifier(),
    param_grid=parameters,
    scoring='roc_auc',
    cv=10,
    n_jobs=-1,
)
xg_clf.fit(X_train, y_train)

# Best combination and its score first, then mean score +/- two standard
# deviations of the per-fold scores for each candidate.
print("Best parameter set found on development set with cv=10:\n")
print(xg_clf.best_params_)
print("Best score: {}\n".format(xg_clf.best_score_))
print()
for candidate, cv_mean, fold_scores in xg_clf.grid_scores_:
    print("{0:.3f} (+/-{1:.03f}) for {2}".format(cv_mean, 2 * fold_scores.std(), candidate))
print()

Best parameter set found on development set with cv=10:

{'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}
Best score: 0.8658009059401618


0.844 (+/-0.026) for {'min_child_weight': 1, 'colsample_bytree': 0.3, 'n_estimators': 130, 'learning_rate': 0.3, 'max_depth': 6}
0.845 (+/-0.025) for {'min_child_weight': 1, 'colsample_bytree': 0.3, 'n_estimators': 135, 'learning_rate': 0.3, 'max_depth': 6}
0.846 (+/-0.026) for {'min_child_weight': 1, 'colsample_bytree': 0.3, 'n_estimators': 140, 'learning_rate': 0.3, 'max_depth': 6}
0.847 (+/-0.027) for {'min_child_weight': 1, 'colsample_bytree': 0.3, 'n_estimators': 145, 'learning_rate': 0.3, 'max_depth': 6}
0.848 (+/-0.027) for {'min_child_weight': 1, 'colsample_bytree': 0.3, 'n_estimators': 150, 'learning_rate': 0.3, 'max_depth': 6}
0.848 (+/-0.026) for {'min_child_weight': 1, 'colsample_bytree': 0.3, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 6}
0.849 (+/-0.027) for {'min_c

In [23]:
# Grid search comparing 155 trees against much larger ensembles (400 and
# 1000 trees) with every other hyper-parameter fixed. Scored by 10-fold
# cross-validated ROC AUC.
parameters = {
    'max_depth': [8],
    'learning_rate': [0.3],
    'n_estimators': [155, 400, 1000],
    'min_child_weight': [1],
    'colsample_bytree': [0.5],
}

xg_clf = GridSearchCV(
    estimator=xgb.XGBClassifier(),
    param_grid=parameters,
    scoring='roc_auc',
    cv=10,
    n_jobs=-1,
)
xg_clf.fit(X_train, y_train)

# Best combination and its score first, then mean score +/- two standard
# deviations of the per-fold scores for each candidate.
print("Best parameter set found on development set with cv=10:\n")
print(xg_clf.best_params_)
print("Best score: {}\n".format(xg_clf.best_score_))
print()
for candidate, cv_mean, fold_scores in xg_clf.grid_scores_:
    print("{0:.3f} (+/-{1:.03f}) for {2}".format(cv_mean, 2 * fold_scores.std(), candidate))
print()

Best parameter set found on development set with cv=10:

{'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}
Best score: 0.8658009059401618


0.866 (+/-0.029) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}
0.864 (+/-0.033) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 400, 'learning_rate': 0.3, 'max_depth': 8}
0.861 (+/-0.033) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 1000, 'learning_rate': 0.3, 'max_depth': 8}



In [25]:
# Grid search over small learning rates with every other hyper-parameter
# fixed. The candidate rates are 2/155, 3/155, ..., 10/155, i.e. the
# integers 2-10 divided by the number of trees (155).
learning_rate = [numerator / 155 for numerator in range(2, 11)]
parameters = {
    'max_depth': [8],
    'learning_rate': learning_rate,
    'n_estimators': [155],
    'min_child_weight': [1],
    'colsample_bytree': [0.5],
}

xg_clf = GridSearchCV(
    estimator=xgb.XGBClassifier(),
    param_grid=parameters,
    scoring='roc_auc',
    cv=10,
    n_jobs=-1,
)
xg_clf.fit(X_train, y_train)

# Best combination and its score first, then mean score +/- two standard
# deviations of the per-fold scores for each candidate.
print("Best parameter set found on development set with cv=10:\n")
print(xg_clf.best_params_)
print("Best score: {}\n".format(xg_clf.best_score_))
print()
for candidate, cv_mean, fold_scores in xg_clf.grid_scores_:
    print("{0:.3f} (+/-{1:.03f}) for {2}".format(cv_mean, 2 * fold_scores.std(), candidate))
print()

Best parameter set found on development set with cv=10:

{'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.06451612903225806, 'max_depth': 8}
Best score: 0.8516473386907805


0.818 (+/-0.030) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.012903225806451613, 'max_depth': 8}
0.831 (+/-0.031) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.01935483870967742, 'max_depth': 8}
0.838 (+/-0.027) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.025806451612903226, 'max_depth': 8}
0.843 (+/-0.025) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.03225806451612903, 'max_depth': 8}
0.844 (+/-0.025) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.03870967741935484, 'max_depth': 8}
0.846 (+/-0.027) for {'min_child_weight': 1, 'colsample_byt

In [27]:
# Grid search comparing min_child_weight 1 against 3/5 (0.6) with every
# other hyper-parameter fixed. Scored by 10-fold cross-validated ROC AUC.
# (The unused, empty `subsample` list that used to be assigned here has been
# dropped: it was never passed to the grid.)
learning_rate = [0.3]
parameters = {
    'max_depth': [8],
    'learning_rate': learning_rate,
    'n_estimators': [155],
    # NOTE(review): 3/5 presumably follows a "3 / (% of rare events)" rule of
    # thumb with ~5% rare events -- confirm.
    'min_child_weight': [1, 3/5],
    'colsample_bytree': [0.5],
}

xg_clf = GridSearchCV(xgb.XGBClassifier(), parameters, cv=10, n_jobs=-1, scoring='roc_auc')
xg_clf.fit(X_train, y_train)

# Best combination and its score first, then mean score +/- two standard
# deviations of the per-fold scores for each candidate.
print("Best parameter set found on development set with cv=10:\n")
print(xg_clf.best_params_)
print("Best score: {}\n".format(xg_clf.best_score_))
print()
for params, mean_score, scores in xg_clf.grid_scores_:
    print("{0:.3f} (+/-{1:.03f}) for {2}".format(mean_score, scores.std() * 2, params))
print()

Best parameter set found on development set with cv=10:

{'min_child_weight': 0.6, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}
Best score: 0.8667531493479463


0.866 (+/-0.029) for {'min_child_weight': 1, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}
0.867 (+/-0.027) for {'min_child_weight': 0.6, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}



In [28]:
# Show the repr of the current search object (its estimator settings,
# parameter grid, CV and scoring configuration).
xg_clf

GridSearchCV(cv=10, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'min_child_weight': [1, 0.6], 'max_depth': [8], 'n_estimators': [155], 'learning_rate': [0.3], 'colsample_bytree': [0.5]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [30]:
# Grid search crossing row subsampling (0.5, 0.75, 1.0) with column
# subsampling (0.4, 0.6, 0.8, 1.0); depth, learning rate, tree count and
# min_child_weight are fixed. Scored by 10-fold cross-validated ROC AUC.
learning_rate = [0.3]
subsample = [0.5, 0.75, 1.0]
colsample_bytree = [0.4, 0.6, 0.8, 1.0]
parameters = {
    'max_depth': [8],
    'learning_rate': learning_rate,
    'n_estimators': [155],
    'min_child_weight': [3/5],  # evaluates to 0.6
    'subsample': subsample,
    'colsample_bytree': colsample_bytree,
}

xg_clf = GridSearchCV(
    estimator=xgb.XGBClassifier(),
    param_grid=parameters,
    scoring='roc_auc',
    cv=10,
    n_jobs=-1,
)
xg_clf.fit(X_train, y_train)

# Best combination and its score first, then mean score +/- two standard
# deviations of the per-fold scores for each candidate.
print("Best parameter set found on development set with cv=10:\n")
print(xg_clf.best_params_)
print("Best score: {}\n".format(xg_clf.best_score_))
print()
for candidate, cv_mean, fold_scores in xg_clf.grid_scores_:
    print("{0:.3f} (+/-{1:.03f}) for {2}".format(cv_mean, 2 * fold_scores.std(), candidate))
print()

Best parameter set found on development set with cv=10:

{'min_child_weight': 0.6, 'max_depth': 8, 'n_estimators': 155, 'learning_rate': 0.3, 'subsample': 1.0, 'colsample_bytree': 0.4}
Best score: 0.8635881122375122


0.852 (+/-0.030) for {'min_child_weight': 0.6, 'max_depth': 8, 'n_estimators': 155, 'learning_rate': 0.3, 'subsample': 0.5, 'colsample_bytree': 0.4}
0.861 (+/-0.030) for {'min_child_weight': 0.6, 'max_depth': 8, 'n_estimators': 155, 'learning_rate': 0.3, 'subsample': 0.75, 'colsample_bytree': 0.4}
0.864 (+/-0.034) for {'min_child_weight': 0.6, 'max_depth': 8, 'n_estimators': 155, 'learning_rate': 0.3, 'subsample': 1.0, 'colsample_bytree': 0.4}
0.851 (+/-0.038) for {'min_child_weight': 0.6, 'max_depth': 8, 'n_estimators': 155, 'learning_rate': 0.3, 'subsample': 0.5, 'colsample_bytree': 0.6}
0.860 (+/-0.032) for {'min_child_weight': 0.6, 'max_depth': 8, 'n_estimators': 155, 'learning_rate': 0.3, 'subsample': 0.75, 'colsample_bytree': 0.6}
0.861 (+/-0.032) for {'min_child_we

In [34]:
# Grid search varying only min_child_weight around 0.6, with column
# subsampling at 0.45 and every other hyper-parameter fixed. Scored by
# 10-fold cross-validated ROC AUC.
learning_rate = [0.3]
subsample = [1.0]
colsample_bytree = [0.45]
parameters = {
    'max_depth': [8],
    'learning_rate': learning_rate,
    'n_estimators': [155],
    # First candidate evaluates to ~0.518. 1897 + 30872 = 32769, the number of
    # training instances, so these are presumably the two class counts and the
    # value is 3 / (percentage of the smaller class) -- TODO confirm.
    'min_child_weight': [3/(100*1897/(1897+30872)), 0.55, 3/5, 0.65],
    'subsample': subsample,
    'colsample_bytree': colsample_bytree,
}

xg_clf = GridSearchCV(
    estimator=xgb.XGBClassifier(),
    param_grid=parameters,
    scoring='roc_auc',
    cv=10,
    n_jobs=-1,
)
xg_clf.fit(X_train, y_train)

# Best combination and its score first, then mean score +/- two standard
# deviations of the per-fold scores for each candidate.
print("Best parameter set found on development set with cv=10:\n")
print(xg_clf.best_params_)
print("Best score: {}\n".format(xg_clf.best_score_))
print()
for candidate, cv_mean, fold_scores in xg_clf.grid_scores_:
    print("{0:.3f} (+/-{1:.03f}) for {2}".format(cv_mean, 2 * fold_scores.std(), candidate))
print()

Best parameter set found on development set with cv=10:

{'min_child_weight': 0.6, 'max_depth': 8, 'n_estimators': 155, 'learning_rate': 0.3, 'subsample': 1.0, 'colsample_bytree': 0.45}
Best score: 0.8667531493479463


0.865 (+/-0.032) for {'min_child_weight': 0.5182235108065366, 'max_depth': 8, 'n_estimators': 155, 'learning_rate': 0.3, 'subsample': 1.0, 'colsample_bytree': 0.45}
0.862 (+/-0.033) for {'min_child_weight': 0.55, 'max_depth': 8, 'n_estimators': 155, 'learning_rate': 0.3, 'subsample': 1.0, 'colsample_bytree': 0.45}
0.867 (+/-0.027) for {'min_child_weight': 0.6, 'max_depth': 8, 'n_estimators': 155, 'learning_rate': 0.3, 'subsample': 1.0, 'colsample_bytree': 0.45}
0.865 (+/-0.032) for {'min_child_weight': 0.65, 'max_depth': 8, 'n_estimators': 155, 'learning_rate': 0.3, 'subsample': 1.0, 'colsample_bytree': 0.45}



In [36]:
# Per-class probabilities for every test row from the current search object.
predictions = xg_clf.predict_proba(test)

# Spit out predictions to a file.
# Each row is labelled with the id taken from the test frame's index rather
# than a synthetic 1..N counter, so the Id column stays correct even if the
# ids in test.csv are not consecutive or do not start at 1.
pred_write = zip(test.index, predictions)
with open('output/xgb_155trees_minchildweight.6_colsampletree.45_learningrate0.3_maxdepth8.csv', 'w') as f:
    f.write('Id,Action\n')
    for instance, prediction in pred_write:
        # Column 1 of predict_proba is written as the Action score
        # (presumably the probability of ACTION == 1 -- TODO confirm class order).
        f.write('{},{}\n'.format(instance, prediction[1]))