In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append("/home/mizworski/Repos/xgboost/python-package/")
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from datetime import datetime
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.utils.validation import column_or_1d
from scipy import interp

In [2]:
date_parse_format = '%Y-%m-%d'


def preprocess_data(data):
    categorical_cols = list(set(data.columns) - set(['birth_date', 'contact_date', 'pdays', 'campaign', 'y']))
    
    X = pd.get_dummies(data, columns=categorical_cols, drop_first=True)
    X['contacted'] = pd.get_dummies(X.pdays < 999, drop_first=True)
    now = datetime.strptime(X.contact_date.max(), date_parse_format)
    X['age'] = X.apply(lambda r: int((now - datetime.strptime(r.birth_date, date_parse_format)).days / 365.25), axis=1)
    X.drop('birth_date', axis=1, inplace=True)
    X['days_since_contact'] =  X.apply(lambda r: int((now - datetime.strptime(r.contact_date, date_parse_format)).days), axis=1)
    X.drop('contact_date', axis=1, inplace=True)
    
    train_x = X[X.y != 'unknown']
    test_x = X[X.y == 'unknown']
    train_y = train_x['y']
    train_y = pd.get_dummies(train_y, drop_first=True)
    train_x = train_x.drop('y', axis=1)
    test_x = test_x.drop('y', axis=1)
    
#     train_y = column_or_1d(train_y)
#     train_x = train_x.as_matrix()
#     test_x = test_x.as_matrix()
    
    return train_x, train_y, test_x
    

In [3]:
df = pd.read_csv('../data/bank-classification.csv', index_col=0)
df = df.sample(frac=1)

In [4]:
X_train, Y_train, X_test = preprocess_data(df)

In [5]:

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [6]:

X_train = X_train.as_matrix()
Y_train = column_or_1d(Y_train)

In [18]:
from numpy import random

In [65]:
clf_xgb = XGBClassifier(objective = 'binary:logistic')
param_dist = {'n_estimators': stats.randint(200, 1000),
              'learning_rate': np.arange(0.01, 0.7, 0.01),
              'subsample': np.arange(0.95, 1.0, 0.001),
              'max_depth': [3, 4, 5, 6, 7, 9],
              'colsample_bytree': np.arange(0.95, 1.0, 0.001),
              'min_child_weight': [1, 2, 3, 4]
             }
param_dist = {'n_estimators': stats.randint(200, 1000),
              'learning_rate': 10 ** np.arange(-3, -1, 0.01),
              'subsample': [1],
              'max_depth': [3, 4, 5, 6, 7, 8, 9],
              'colsample_bytree': [1],
              'min_child_weight': [1, 2, 3],
              'gamma' : np.arange(0, 1.488, 0.01),
              'reg_alpha':np.arange(0, 1, 0.01), 
              'reg_lambda':np.arange(0, 1, 0.01)
             }

# numFolds = 5
# kfold_5 = cross_validation.KFold(n = len(X), shuffle = True, n_folds = numFolds)

clf = RandomizedSearchCV(clf_xgb, 
                         param_distributions = param_dist,
                         cv = 6,  
                         n_iter = 25, # you want 5 here not 25 if I understand you correctly 
                         scoring = 'roc_auc', 
                         error_score = 0, 
                         verbose = 3, 
                         n_jobs = -1)

In [67]:
clf.fit(X_train, Y_train)

Fitting 6 folds for each of 25 candidates, totalling 150 fits
[CV] colsample_bytree=1, gamma=1.21, learning_rate=0.00354813389234, max_depth=9, min_child_weight=2, n_estimators=550, reg_alpha=0.5, reg_lambda=0.78, subsample=1 
[CV] colsample_bytree=1, gamma=1.21, learning_rate=0.00354813389234, max_depth=9, min_child_weight=2, n_estimators=550, reg_alpha=0.5, reg_lambda=0.78, subsample=1 
[CV] colsample_bytree=1, gamma=1.21, learning_rate=0.00354813389234, max_depth=9, min_child_weight=2, n_estimators=550, reg_alpha=0.5, reg_lambda=0.78, subsample=1 
[CV] colsample_bytree=1, gamma=1.21, learning_rate=0.00354813389234, max_depth=9, min_child_weight=2, n_estimators=550, reg_alpha=0.5, reg_lambda=0.78, subsample=1 
[CV] colsample_bytree=1, gamma=1.21, learning_rate=0.00354813389234, max_depth=9, min_child_weight=2, n_estimators=550, reg_alpha=0.5, reg_lambda=0.78, subsample=1 
[CV] colsample_bytree=1, gamma=1.21, learning_rate=0.00354813389234, max_depth=9, min_child_weight=2, n_estimator

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   53.7s


[CV]  colsample_bytree=1, gamma=0.71, learning_rate=0.00181970085861, max_depth=4, min_child_weight=2, n_estimators=422, reg_alpha=0.83, reg_lambda=0.88, subsample=1, score=0.760944, total=  12.4s
[CV] colsample_bytree=1, gamma=0.44, learning_rate=0.0251188643151, max_depth=7, min_child_weight=2, n_estimators=512, reg_alpha=0.75, reg_lambda=0.57, subsample=1 
[CV]  colsample_bytree=1, gamma=0.71, learning_rate=0.00181970085861, max_depth=4, min_child_weight=2, n_estimators=422, reg_alpha=0.83, reg_lambda=0.88, subsample=1, score=0.786929, total=  13.5s
[CV] colsample_bytree=1, gamma=0.44, learning_rate=0.0251188643151, max_depth=7, min_child_weight=2, n_estimators=512, reg_alpha=0.75, reg_lambda=0.57, subsample=1 
[CV]  colsample_bytree=1, gamma=0.11, learning_rate=0.00512861383991, max_depth=5, min_child_weight=3, n_estimators=940, reg_alpha=0.72, reg_lambda=0.35, subsample=1, score=0.794637, total=  33.6s
[CV] colsample_bytree=1, gamma=0.44, learning_rate=0.0251188643151, max_depth=7

[CV]  colsample_bytree=1, gamma=0.82, learning_rate=0.00389045144994, max_depth=5, min_child_weight=3, n_estimators=295, reg_alpha=0.63, reg_lambda=0.47, subsample=1, score=0.792486, total=  13.0s
[CV] colsample_bytree=1, gamma=0.82, learning_rate=0.00389045144994, max_depth=5, min_child_weight=3, n_estimators=295, reg_alpha=0.63, reg_lambda=0.47, subsample=1 
[CV]  colsample_bytree=1, gamma=1.15, learning_rate=0.00338844156139, max_depth=6, min_child_weight=3, n_estimators=914, reg_alpha=0.88, reg_lambda=0.3, subsample=1, score=0.807335, total=  43.0s
[CV] colsample_bytree=1, gamma=0.79, learning_rate=0.00660693448008, max_depth=5, min_child_weight=3, n_estimators=766, reg_alpha=0.79, reg_lambda=0.71, subsample=1 
[CV]  colsample_bytree=1, gamma=1.15, learning_rate=0.00338844156139, max_depth=6, min_child_weight=3, n_estimators=914, reg_alpha=0.88, reg_lambda=0.3, subsample=1, score=0.781678, total=  42.7s
[CV] colsample_bytree=1, gamma=0.79, learning_rate=0.00660693448008, max_depth=

[CV]  colsample_bytree=1, gamma=0.31, learning_rate=0.00316227766017, max_depth=8, min_child_weight=1, n_estimators=469, reg_alpha=0.35, reg_lambda=0.66, subsample=1, score=0.778701, total=  28.1s
[CV] colsample_bytree=1, gamma=0.03, learning_rate=0.0144543977075, max_depth=9, min_child_weight=3, n_estimators=577, reg_alpha=0.06, reg_lambda=0.54, subsample=1 
[CV]  colsample_bytree=1, gamma=0.31, learning_rate=0.00316227766017, max_depth=8, min_child_weight=1, n_estimators=469, reg_alpha=0.35, reg_lambda=0.66, subsample=1, score=0.764708, total=  27.8s
[CV] colsample_bytree=1, gamma=0.03, learning_rate=0.0144543977075, max_depth=9, min_child_weight=3, n_estimators=577, reg_alpha=0.06, reg_lambda=0.54, subsample=1 
[CV]  colsample_bytree=1, gamma=0.31, learning_rate=0.00316227766017, max_depth=8, min_child_weight=1, n_estimators=469, reg_alpha=0.35, reg_lambda=0.66, subsample=1, score=0.789417, total=  29.1s
[CV] colsample_bytree=1, gamma=0.19, learning_rate=0.00912010839356, max_depth=

[CV]  colsample_bytree=1, gamma=0.78, learning_rate=0.0346736850453, max_depth=5, min_child_weight=2, n_estimators=958, reg_alpha=0.28, reg_lambda=0.24, subsample=1, score=0.785773, total=  30.4s
[CV] colsample_bytree=1, gamma=0.61, learning_rate=0.0295120922667, max_depth=7, min_child_weight=2, n_estimators=745, reg_alpha=0.31, reg_lambda=0.14, subsample=1 
[CV]  colsample_bytree=1, gamma=0.78, learning_rate=0.0346736850453, max_depth=5, min_child_weight=2, n_estimators=958, reg_alpha=0.28, reg_lambda=0.24, subsample=1, score=0.787394, total=  30.2s
[CV] colsample_bytree=1, gamma=0.61, learning_rate=0.0295120922667, max_depth=7, min_child_weight=2, n_estimators=745, reg_alpha=0.31, reg_lambda=0.14, subsample=1 
[CV]  colsample_bytree=1, gamma=0.78, learning_rate=0.0346736850453, max_depth=5, min_child_weight=2, n_estimators=958, reg_alpha=0.28, reg_lambda=0.24, subsample=1, score=0.785525, total=  30.8s
[CV] colsample_bytree=1, gamma=0.61, learning_rate=0.0295120922667, max_depth=7, m

[CV]  colsample_bytree=1, gamma=0.9, learning_rate=0.0457088189615, max_depth=4, min_child_weight=3, n_estimators=684, reg_alpha=0.31, reg_lambda=0.27, subsample=1, score=0.780468, total=  16.9s
[CV] colsample_bytree=1, gamma=0.29, learning_rate=0.0181970085861, max_depth=8, min_child_weight=2, n_estimators=834, reg_alpha=0.25, reg_lambda=0.68, subsample=1 
[CV]  colsample_bytree=1, gamma=0.9, learning_rate=0.0457088189615, max_depth=4, min_child_weight=3, n_estimators=684, reg_alpha=0.31, reg_lambda=0.27, subsample=1, score=0.763383, total=  17.1s
[CV] colsample_bytree=1, gamma=0.29, learning_rate=0.0181970085861, max_depth=8, min_child_weight=2, n_estimators=834, reg_alpha=0.25, reg_lambda=0.68, subsample=1 
[CV]  colsample_bytree=1, gamma=0.12, learning_rate=0.0208929613085, max_depth=3, min_child_weight=2, n_estimators=608, reg_alpha=0.43, reg_lambda=0.66, subsample=1, score=0.794334, total=  11.8s
[CV] colsample_bytree=1, gamma=0.29, learning_rate=0.0181970085861, max_depth=8, min

[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  5.7min


[CV]  colsample_bytree=1, gamma=0.12, learning_rate=0.0208929613085, max_depth=3, min_child_weight=2, n_estimators=608, reg_alpha=0.43, reg_lambda=0.66, subsample=1, score=0.764965, total=  11.8s
[CV] colsample_bytree=1, gamma=0.23, learning_rate=0.00194984459976, max_depth=7, min_child_weight=3, n_estimators=203, reg_alpha=0.13, reg_lambda=0.01, subsample=1 
[CV]  colsample_bytree=1, gamma=0.23, learning_rate=0.00194984459976, max_depth=7, min_child_weight=3, n_estimators=203, reg_alpha=0.13, reg_lambda=0.01, subsample=1, score=0.786453, total=   9.3s
[CV] colsample_bytree=1, gamma=0.23, learning_rate=0.00194984459976, max_depth=7, min_child_weight=3, n_estimators=203, reg_alpha=0.13, reg_lambda=0.01, subsample=1 
[CV]  colsample_bytree=1, gamma=0.23, learning_rate=0.00194984459976, max_depth=7, min_child_weight=3, n_estimators=203, reg_alpha=0.13, reg_lambda=0.01, subsample=1, score=0.784241, total=   9.5s
[CV] colsample_bytree=1, gamma=0.23, learning_rate=0.00194984459976, max_depth

[CV]  colsample_bytree=1, gamma=0.66, learning_rate=0.026302679919, max_depth=9, min_child_weight=3, n_estimators=401, reg_alpha=0.89, reg_lambda=0.72, subsample=1, score=0.765114, total=  24.6s
[CV] colsample_bytree=1, gamma=0.13, learning_rate=0.011220184543, max_depth=5, min_child_weight=3, n_estimators=345, reg_alpha=0.31, reg_lambda=0.67, subsample=1 
[CV]  colsample_bytree=1, gamma=0.38, learning_rate=0.0107151930524, max_depth=5, min_child_weight=3, n_estimators=238, reg_alpha=0.0, reg_lambda=0.14, subsample=1, score=0.761248, total=   8.0s
[CV] colsample_bytree=1, gamma=0.13, learning_rate=0.011220184543, max_depth=5, min_child_weight=3, n_estimators=345, reg_alpha=0.31, reg_lambda=0.67, subsample=1 
[CV]  colsample_bytree=1, gamma=0.13, learning_rate=0.011220184543, max_depth=5, min_child_weight=3, n_estimators=345, reg_alpha=0.31, reg_lambda=0.67, subsample=1, score=0.807052, total=  11.1s
[CV] colsample_bytree=1, gamma=0.13, learning_rate=0.011220184543, max_depth=5, min_chi

[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  7.3min finished


RandomizedSearchCV(cv=6, error_score=0,
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
          fit_params={}, iid=True, n_iter=25, n_jobs=-1,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7feda8d33dd8>, 'learning_rate': array([ 0.001  ,  0.00102, ...,  0.0955 ,  0.09772]), 'subsample': [1], 'max_depth': [3, 4, 5, 6, 7, 8, 9], 'colsample_bytree': [1], 'min_child_weight': [1, 2, 3], 'gamma': array([ 0.  ,  0.01, ...,  1.47,  1.48]), 'reg_alpha': array([ 0.  ,  0.01, ...,  0.98,  0.99]), 'reg_lambda': array([ 0.  ,  0.01, ...,  0.98,  0.99])},
          pre_dispatch='2*n_job

In [68]:

#trust your CV!
best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
print('Raw AUC score:', score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

test_probs = clf.predict_proba(X_test.as_matrix())[:,1]



Raw AUC score: 0.789855551734
colsample_bytree: 1
gamma: 0.79000000000000004
learning_rate: 0.0066069344800756945
max_depth: 5
min_child_weight: 3
n_estimators: 766
reg_alpha: 0.79000000000000004
reg_lambda: 0.70999999999999996
subsample: 1


In [69]:
clf.grid_scores_



[mean: 0.78846, std: 0.01308, params: {'colsample_bytree': 1, 'gamma': 1.21, 'learning_rate': 0.0035481338923356587, 'max_depth': 9, 'min_child_weight': 2, 'n_estimators': 550, 'reg_alpha': 0.5, 'reg_lambda': 0.78000000000000003, 'subsample': 1},
 mean: 0.78782, std: 0.01236, params: {'colsample_bytree': 1, 'gamma': 0.98999999999999999, 'learning_rate': 0.021379620895020924, 'max_depth': 6, 'min_child_weight': 2, 'n_estimators': 276, 'reg_alpha': 0.76000000000000001, 'reg_lambda': 0.52000000000000002, 'subsample': 1},
 mean: 0.78179, std: 0.01097, params: {'colsample_bytree': 1, 'gamma': 0.70999999999999996, 'learning_rate': 0.0018197008586099603, 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 422, 'reg_alpha': 0.83000000000000007, 'reg_lambda': 0.88, 'subsample': 1},
 mean: 0.78961, std: 0.01402, params: {'colsample_bytree': 1, 'gamma': 0.11, 'learning_rate': 0.0051286138399134701, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 940, 'reg_alpha': 0.71999999999999997, 'r

In [None]:
param_dist = {'n_estimators': stats.randint(200, 1000),
              'learning_rate': 10 ** np.arange(-4, -2, 0.01),
              'subsample': np.arange(0.8, 1, 0.01),
              'max_depth': [4, 5, 6],
              'colsample_bytree': [1],
              'min_child_weight': [1, 2, 3, 4],
              'gamma' : np.arange(0.3, 1.488, 0.01),
              'reg_alpha':np.arange(0.6, 1, 0.01), 
              'reg_lambda':np.arange(0.6, 1, 0.01)
             }

clf = RandomizedSearchCV(clf_xgb, 
                         param_distributions = param_dist,
                         cv = 6,  
                         n_iter = 25, # you want 5 here not 25 if I understand you correctly 
                         scoring = 'roc_auc', 
                         error_score = 0, 
                         verbose = 3, 
                         n_jobs = -1)
clf.fit(X_train, Y_train)

#trust your CV!
best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
print('Raw AUC score:', score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

test_probs = clf.predict_proba(X_test.as_matrix())[:,1]

Fitting 6 folds for each of 25 candidates, totalling 150 fits
[CV] colsample_bytree=1, gamma=0.69, learning_rate=0.0011220184543, max_depth=6, min_child_weight=2, n_estimators=824, reg_alpha=0.9, reg_lambda=0.88, subsample=0.94 
[CV] colsample_bytree=1, gamma=0.69, learning_rate=0.0011220184543, max_depth=6, min_child_weight=2, n_estimators=824, reg_alpha=0.9, reg_lambda=0.88, subsample=0.94 
[CV] colsample_bytree=1, gamma=0.69, learning_rate=0.0011220184543, max_depth=6, min_child_weight=2, n_estimators=824, reg_alpha=0.9, reg_lambda=0.88, subsample=0.94 
[CV] colsample_bytree=1, gamma=0.69, learning_rate=0.0011220184543, max_depth=6, min_child_weight=2, n_estimators=824, reg_alpha=0.9, reg_lambda=0.88, subsample=0.94 
[CV] colsample_bytree=1, gamma=0.69, learning_rate=0.0011220184543, max_depth=6, min_child_weight=2, n_estimators=824, reg_alpha=0.9, reg_lambda=0.88, subsample=0.94 
[CV] colsample_bytree=1, gamma=0.69, learning_rate=0.0011220184543, max_depth=6, min_child_weight=2, n_

In [72]:
param_dist = {'n_estimators': stats.randint(200, 1000),
              'learning_rate': 10 ** np.arange(-4, -2, 0.01),
              'subsample': [1],
              'max_depth': [4, 5, 6],
              'colsample_bytree': [1],
              'min_child_weight': [1, 2, 3, 4],
              'gamma' : np.arange(1.1, 3.488, 0.01),
              'reg_alpha':np.arange(0.6, 1, 0.01), 
              'reg_lambda':np.arange(0.6, 1, 0.01)
             }

clf = RandomizedSearchCV(clf_xgb, 
                         param_distributions = param_dist,
                         cv = 6,  
                         n_iter = 25, # you want 5 here not 25 if I understand you correctly 
                         scoring = 'roc_auc', 
                         error_score = 0, 
                         verbose = 3, 
                         n_jobs = -1)
clf.fit(X_train, Y_train)

#trust your CV!
best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
print('Raw AUC score:', score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

test_probs = clf.predict_proba(X_test.as_matrix())[:,1]

Fitting 6 folds for each of 25 candidates, totalling 150 fits
[CV] colsample_bytree=1, gamma=2.32, learning_rate=0.00234422881532, max_depth=6, min_child_weight=1, n_estimators=869, reg_alpha=0.7, reg_lambda=0.79, subsample=1 
[CV] colsample_bytree=1, gamma=2.32, learning_rate=0.00234422881532, max_depth=6, min_child_weight=1, n_estimators=869, reg_alpha=0.7, reg_lambda=0.79, subsample=1 
[CV] colsample_bytree=1, gamma=2.32, learning_rate=0.00234422881532, max_depth=6, min_child_weight=1, n_estimators=869, reg_alpha=0.7, reg_lambda=0.79, subsample=1 
[CV] colsample_bytree=1, gamma=2.32, learning_rate=0.00234422881532, max_depth=6, min_child_weight=1, n_estimators=869, reg_alpha=0.7, reg_lambda=0.79, subsample=1 
[CV] colsample_bytree=1, gamma=2.32, learning_rate=0.00234422881532, max_depth=6, min_child_weight=1, n_estimators=869, reg_alpha=0.7, reg_lambda=0.79, subsample=1 
[CV] colsample_bytree=1, gamma=2.32, learning_rate=0.00234422881532, max_depth=6, min_child_weight=1, n_estimator

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   55.7s


[CV]  colsample_bytree=1, gamma=1.38, learning_rate=0.00831763771103, max_depth=6, min_child_weight=1, n_estimators=499, reg_alpha=0.81, reg_lambda=0.89, subsample=1, score=0.785743, total=  20.7s
[CV] colsample_bytree=1, gamma=2.43, learning_rate=0.0030199517204, max_depth=5, min_child_weight=3, n_estimators=371, reg_alpha=0.95, reg_lambda=0.61, subsample=1 
[CV]  colsample_bytree=1, gamma=1.38, learning_rate=0.00831763771103, max_depth=6, min_child_weight=1, n_estimators=499, reg_alpha=0.81, reg_lambda=0.89, subsample=1, score=0.767773, total=  20.6s
[CV] colsample_bytree=1, gamma=2.43, learning_rate=0.0030199517204, max_depth=5, min_child_weight=3, n_estimators=371, reg_alpha=0.95, reg_lambda=0.61, subsample=1 
[CV]  colsample_bytree=1, gamma=2.43, learning_rate=0.0030199517204, max_depth=5, min_child_weight=3, n_estimators=371, reg_alpha=0.95, reg_lambda=0.61, subsample=1, score=0.793340, total=  12.7s
[CV] colsample_bytree=1, gamma=2.43, learning_rate=0.0030199517204, max_depth=5,

[CV]  colsample_bytree=1, gamma=1.27, learning_rate=0.0053703179637, max_depth=5, min_child_weight=2, n_estimators=668, reg_alpha=0.84, reg_lambda=0.65, subsample=1, score=0.806778, total=  21.8s
[CV] colsample_bytree=1, gamma=1.27, learning_rate=0.0053703179637, max_depth=5, min_child_weight=2, n_estimators=668, reg_alpha=0.84, reg_lambda=0.65, subsample=1 
[CV]  colsample_bytree=1, gamma=2.07, learning_rate=0.0052480746025, max_depth=6, min_child_weight=4, n_estimators=816, reg_alpha=0.63, reg_lambda=0.95, subsample=1, score=0.792517, total=  33.0s
[CV] colsample_bytree=1, gamma=1.36, learning_rate=0.0063095734448, max_depth=6, min_child_weight=4, n_estimators=302, reg_alpha=0.71, reg_lambda=0.75, subsample=1 
[CV]  colsample_bytree=1, gamma=2.07, learning_rate=0.0052480746025, max_depth=6, min_child_weight=4, n_estimators=816, reg_alpha=0.63, reg_lambda=0.95, subsample=1, score=0.795983, total=  33.7s
[CV] colsample_bytree=1, gamma=1.36, learning_rate=0.0063095734448, max_depth=6, m

[CV]  colsample_bytree=1, gamma=2.58, learning_rate=0.00316227766017, max_depth=5, min_child_weight=1, n_estimators=201, reg_alpha=0.96, reg_lambda=0.95, subsample=1, score=0.792321, total=   6.5s
[CV] colsample_bytree=1, gamma=2.58, learning_rate=0.00316227766017, max_depth=5, min_child_weight=1, n_estimators=201, reg_alpha=0.96, reg_lambda=0.95, subsample=1 
[CV]  colsample_bytree=1, gamma=2.58, learning_rate=0.00316227766017, max_depth=5, min_child_weight=1, n_estimators=201, reg_alpha=0.96, reg_lambda=0.95, subsample=1, score=0.772431, total=   6.4s
[CV] colsample_bytree=1, gamma=2.58, learning_rate=0.00316227766017, max_depth=5, min_child_weight=1, n_estimators=201, reg_alpha=0.96, reg_lambda=0.95, subsample=1 
[CV]  colsample_bytree=1, gamma=1.25, learning_rate=0.0032359365693, max_depth=5, min_child_weight=2, n_estimators=413, reg_alpha=0.79, reg_lambda=0.6, subsample=1, score=0.788204, total=  13.6s
[CV] colsample_bytree=1, gamma=2.7, learning_rate=0.000831763771103, max_depth=

[CV]  colsample_bytree=1, gamma=1.32, learning_rate=0.00063095734448, max_depth=5, min_child_weight=1, n_estimators=891, reg_alpha=0.88, reg_lambda=0.86, subsample=1, score=0.792374, total=  28.7s
[CV] colsample_bytree=1, gamma=2.89, learning_rate=0.00676082975392, max_depth=5, min_child_weight=2, n_estimators=605, reg_alpha=0.6, reg_lambda=0.79, subsample=1 
[CV]  colsample_bytree=1, gamma=2.89, learning_rate=0.00676082975392, max_depth=5, min_child_weight=2, n_estimators=605, reg_alpha=0.6, reg_lambda=0.79, subsample=1, score=0.808798, total=  19.7s
[CV] colsample_bytree=1, gamma=2.89, learning_rate=0.00676082975392, max_depth=5, min_child_weight=2, n_estimators=605, reg_alpha=0.6, reg_lambda=0.79, subsample=1 
[CV]  colsample_bytree=1, gamma=2.89, learning_rate=0.00676082975392, max_depth=5, min_child_weight=2, n_estimators=605, reg_alpha=0.6, reg_lambda=0.79, subsample=1, score=0.795460, total=  19.4s
[CV] colsample_bytree=1, gamma=2.89, learning_rate=0.00676082975392, max_depth=5,

[CV] colsample_bytree=1, gamma=2.86, learning_rate=0.000144543977075, max_depth=5, min_child_weight=2, n_estimators=738, reg_alpha=0.93, reg_lambda=0.77, subsample=1 
[CV]  colsample_bytree=1, gamma=2.72, learning_rate=0.00954992586021, max_depth=4, min_child_weight=1, n_estimators=748, reg_alpha=0.87, reg_lambda=0.67, subsample=1, score=0.798833, total=  19.0s
[CV] colsample_bytree=1, gamma=2.86, learning_rate=0.000144543977075, max_depth=5, min_child_weight=2, n_estimators=738, reg_alpha=0.93, reg_lambda=0.77, subsample=1 
[CV]  colsample_bytree=1, gamma=1.61, learning_rate=0.000660693448008, max_depth=4, min_child_weight=4, n_estimators=237, reg_alpha=0.66, reg_lambda=0.79, subsample=1, score=0.773283, total=   6.1s
[CV] colsample_bytree=1, gamma=2.86, learning_rate=0.000144543977075, max_depth=5, min_child_weight=2, n_estimators=738, reg_alpha=0.93, reg_lambda=0.77, subsample=1 
[CV]  colsample_bytree=1, gamma=2.72, learning_rate=0.00954992586021, max_depth=4, min_child_weight=1, n

[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  4.7min


[CV]  colsample_bytree=1, gamma=2.72, learning_rate=0.00954992586021, max_depth=4, min_child_weight=1, n_estimators=748, reg_alpha=0.87, reg_lambda=0.67, subsample=1, score=0.765121, total=  19.3s
[CV] colsample_bytree=1, gamma=3.4, learning_rate=0.000199526231497, max_depth=5, min_child_weight=1, n_estimators=591, reg_alpha=0.87, reg_lambda=0.89, subsample=1 
[CV]  colsample_bytree=1, gamma=1.61, learning_rate=0.000660693448008, max_depth=4, min_child_weight=4, n_estimators=237, reg_alpha=0.66, reg_lambda=0.79, subsample=1, score=0.758803, total=   6.1s
[CV] colsample_bytree=1, gamma=3.4, learning_rate=0.000199526231497, max_depth=5, min_child_weight=1, n_estimators=591, reg_alpha=0.87, reg_lambda=0.89, subsample=1 
[CV]  colsample_bytree=1, gamma=2.86, learning_rate=0.000144543977075, max_depth=5, min_child_weight=2, n_estimators=738, reg_alpha=0.93, reg_lambda=0.77, subsample=1, score=0.767494, total=  24.1s
[CV] colsample_bytree=1, gamma=3.4, learning_rate=0.000199526231497, max_de

[CV]  colsample_bytree=1, gamma=3.28, learning_rate=0.00354813389234, max_depth=5, min_child_weight=3, n_estimators=335, reg_alpha=0.7, reg_lambda=0.86, subsample=1, score=0.761734, total=  10.8s
[CV] colsample_bytree=1, gamma=1.4, learning_rate=0.00052480746025, max_depth=4, min_child_weight=1, n_estimators=341, reg_alpha=0.92, reg_lambda=0.75, subsample=1 
[CV]  colsample_bytree=1, gamma=1.84, learning_rate=0.00776247116629, max_depth=4, min_child_weight=4, n_estimators=561, reg_alpha=0.77, reg_lambda=0.91, subsample=1, score=0.780608, total=  14.7s
[CV] colsample_bytree=1, gamma=3.4, learning_rate=0.000912010839356, max_depth=5, min_child_weight=4, n_estimators=928, reg_alpha=0.86, reg_lambda=0.72, subsample=1 
[CV]  colsample_bytree=1, gamma=1.84, learning_rate=0.00776247116629, max_depth=4, min_child_weight=4, n_estimators=561, reg_alpha=0.77, reg_lambda=0.91, subsample=1, score=0.761890, total=  14.4s
[CV] colsample_bytree=1, gamma=3.4, learning_rate=0.000912010839356, max_depth=

[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  6.1min finished


Raw AUC score: 0.790423572532
colsample_bytree: 1
gamma: 2.8900000000000015
learning_rate: 0.0067608297539192104
max_depth: 5
min_child_weight: 2
n_estimators: 605
reg_alpha: 0.59999999999999998
reg_lambda: 0.79000000000000015
subsample: 1


In [74]:
test_probs

array([ 0.06310574,  0.20222679,  0.06957059, ...,  0.06254054,
        0.06809467,  0.13734066], dtype=float32)

In [75]:
test_indices = X_test.index.values
test_indices

array([22077, 40890, 23683, ..., 26870, 25976, 35620])

In [76]:
results = pd.DataFrame({'id':test_indices, 'y':test_probs})

In [77]:
results.head()

Unnamed: 0,id,y
0,22077,0.063106
1,40890,0.202227
2,23683,0.069571
3,35367,0.071503
4,33480,0.083151


In [78]:
results_sorted = results.sort_values(['id'])

In [79]:
results_sorted.head()

Unnamed: 0,id,y
1462,2,0.039669
14498,4,0.039507
782,5,0.039832
8504,7,0.039507
17492,10,0.046416


In [80]:
results_sorted.to_csv('test_submition2.csv', index=False)