In [28]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV

In [29]:
preprocessed_train_set = pd.read_csv('data/preprocessed_train_set.csv', sep=';')
train_target = pd.read_csv('data/train_target.csv', sep=';')

preprocessed_test_set = pd.read_csv('data/preprocessed_test_set.csv', sep=';')
test_target = pd.read_csv('data/test_target.csv', sep=';')

In [30]:
booster =  XGBClassifier(
     eta = 0.01,
    max_depth=1,
    base_score=0.5,
    subsample=0.8, 
        colsample_bytree=0.8,
        gamma=1, 
        min_child_weight= 12,
        n_estimators=319,
        n_jobs=-1,
        nthread=None,
        objective = 'binary:logistic',
        eval_metric ='auc')    

In [31]:
booster.fit(preprocessed_train_set, np.ravel(train_target))

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, eta=0.01, eval_metric='auc', gamma=1,
       learning_rate=0.1, max_delta_step=0, max_depth=1,
       min_child_weight=12, missing=None, n_estimators=319, n_jobs=-1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8)

In [32]:
pred_labels = booster.predict(preprocessed_test_set)

In [33]:
roc_auc =  roc_auc_score(test_target, np.ravel(pred_labels))

In [34]:
roc_auc

0.5644878204446335

# Find Best Iteration

In [105]:
par = {  
    "eta": 0.01,
    "max_depth":1,
    "min_child_weight":12,
    "base_score":0.5,
    "n_estimators":5000,
    "n_jobs":-1,
    "subsample": 0.8,
    "colsample_bytree": 1,
    "gamma": 0,
    "objective" : 'binary:logistic',
    "eval_metric" :'auc'}

In [106]:
X_train, X_validation, y_train, y_validation =  train_test_split(
        preprocessed_train_set,
        train_target,
        test_size=.25,
        stratify=train_target
    )

In [107]:
 # Translate these sets into XGBoost-compatible dense matrices
d_train = xgb.DMatrix(X_train.values, label=y_train.values, feature_names=X_train.columns)
d_validation = xgb.DMatrix(X_validation, label=y_validation.values, feature_names=X_validation.columns)

# Parametrize a watch list to follow the training performance
watch_list = [(d_train, 'train'), (d_validation, 'validation')]



In [108]:
first_model = xgb.train(
    params=par,
    dtrain=d_train,
    num_boost_round=6000,
    evals=watch_list,
    early_stopping_rounds=100
    )


[0]	train-auc:0.559593	validation-auc:0.574088
Multiple eval metrics have been passed: 'validation-auc' will be used for early stopping.

Will train until validation-auc hasn't improved in 100 rounds.
[1]	train-auc:0.592058	validation-auc:0.562946
[2]	train-auc:0.629865	validation-auc:0.582995
[3]	train-auc:0.626887	validation-auc:0.574592
[4]	train-auc:0.626712	validation-auc:0.573994
[5]	train-auc:0.639798	validation-auc:0.576071
[6]	train-auc:0.643689	validation-auc:0.59357
[7]	train-auc:0.64766	validation-auc:0.601328
[8]	train-auc:0.645406	validation-auc:0.597677
[9]	train-auc:0.645139	validation-auc:0.597048
[10]	train-auc:0.645195	validation-auc:0.597677
[11]	train-auc:0.647449	validation-auc:0.601328
[12]	train-auc:0.655177	validation-auc:0.611289
[13]	train-auc:0.658564	validation-auc:0.608504
[14]	train-auc:0.651468	validation-auc:0.593649
[15]	train-auc:0.650692	validation-auc:0.592201
[16]	train-auc:0.651081	validation-auc:0.595285
[17]	train-auc:0.650474	validation-auc:0.5

In [104]:
# Should i use validation or train?

# Find Max Depth and Min Child Weight

In [None]:
   "eta": 0.01,
    "max_depth":3,
    "base_score":0.5,
    "n_estimators":5000,
    "n_jobs":-1,
    "subsample": 0.8,
    "colsample_bytree": 1,
    "gamma": 1,
    "objective" : 'binary:logistic',
    "eval_metric" :'auc'}

In [74]:
param_test1 = {
 'max_depth':[1,2,3,4],
 'min_child_weight':[10,12,14,16]
}

gsearch1 = GridSearchCV(
    estimator = XGBClassifier(
        learning_rate =0.01, 
        max_depth=3,
        subsample=0.8, 
        colsample_bytree=1,
        gamma=1, 
        objective= 'binary:logistic', 
        n_jobs=-1, 
        seed=2019,
        n_estimators=755, ),
    
    param_grid = param_test1, 
    scoring='roc_auc',
    n_jobs=-1,
    iid=False, 
    cv=3)

gsearch1.fit(preprocessed_train_set, train_target)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=1, learning_rate=0.01, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=755,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=2019, silent=True, subsample=0.8),
       fit_params=None, iid=False, n_jobs=-1,
       param_grid={'max_depth': [1, 2, 3, 4], 'min_child_weight': [10, 12, 14, 16]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [75]:
print(gsearch1.best_params_)
print(gsearch1.best_score_)

{'max_depth': 1, 'min_child_weight': 12}
0.6505021292369021


# Tune Gamma

In [76]:
[i/10.0 for i in range(0,5)]

[0.0, 0.1, 0.2, 0.3, 0.4]

In [82]:
param_test2 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
gsearch2 = GridSearchCV(
    estimator = XGBClassifier(
        learning_rate =0.01, 
        max_depth=1,
        subsample=0.8, 
        colsample_bytree=1,
        gamma=1, 
        min_child_weight= 12,
        objective= 'binary:logistic', 
        n_jobs=-1, 
        seed=2019,
        n_estimators=755, ),
    
    param_grid = param_test2, 
    scoring='roc_auc',
    n_jobs=-1,
    iid=False, 
    cv=5)

gsearch2.fit(preprocessed_train_set, train_target)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=1, learning_rate=0.01, max_delta_step=0,
       max_depth=1, min_child_weight=12, missing=None, n_estimators=755,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=2019, silent=True, subsample=0.8),
       fit_params=None, iid=False, n_jobs=-1,
       param_grid={'gamma': [0.0, 0.1, 0.2, 0.3, 0.4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [83]:
print(gsearch2.best_params_)
print(gsearch2.best_score_)

{'gamma': 0.0}
0.655203346658076


# Subsamble and  colsample_bytree

In [115]:
param_test3 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}

gsearch3 = GridSearchCV(
    estimator = XGBClassifier(
        learning_rate =0.01, 
        max_depth=1,
        subsample=0.8, 
        colsample_bytree=1,
        gamma=1, 
        min_child_weight= 12,
        objective= 'binary:logistic', 
        n_jobs=-1, 
        seed=2019,
        n_estimators=319, ),
    
    param_grid = param_test3, 
    scoring='roc_auc',
    n_jobs=-1,
    iid=False, 
    cv=5)

gsearch3.fit(preprocessed_train_set, train_target)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=1, learning_rate=0.01, max_delta_step=0,
       max_depth=1, min_child_weight=12, missing=None, n_estimators=319,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=2019, silent=True, subsample=0.8),
       fit_params=None, iid=False, n_jobs=-1,
       param_grid={'subsample': [0.6, 0.7, 0.8, 0.9], 'colsample_bytree': [0.6, 0.7, 0.8, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [116]:
print(gsearch3.best_params_)
print(gsearch3.best_score_)

{'subsample': 0.8, 'colsample_bytree': 0.8}
0.6596705070561094
