In [1]:
#Import libraries:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional     scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4



In [2]:
train = pd.read_csv('LargeTrain.csv')
target = 'Class'

In [3]:
def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,stratified=True,
            metrics={'mlogloss'}, early_stopping_rounds=early_stopping_rounds, callbacks=[xgb.callback.print_evaluation(show_stdv=False),                                                               xgb.callback.early_stop(3)])

        print (cvresult)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['Class'],eval_metric='mlogloss')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])

    #Print model report:
    print ("\nModel Report")
    print ("Accuracy : %.4g" % metrics.accuracy_score(dtrain['Class'].values, dtrain_predictions))
    print ("Log Loss Score (Train): %f" % metrics.log_loss(dtrain['Class'], dtrain_predprob))


In [5]:
# first try with all random para

#Choose all predictors except target 
predictors = [x for x in train.columns if x not in target]

param_test1 = {
 'min_child_weight':[1, 3]
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=50, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'multi:softprob', num_class = 10, nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, n_jobs=1,iid=False, cv=5)
gsearch1.fit(train[predictors],train[target])
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



([mean: 0.99623, std: 0.00068, params: {'min_child_weight': 1},
  mean: 0.99614, std: 0.00111, params: {'min_child_weight': 3}],
 {'min_child_weight': 1},
 0.9962272969064084)

In [7]:
# choose the best n_estimators
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'multi:softprob',
 num_class = 10,
 nthread=4,
 scale_pos_weight=1,
 seed=27)
modelfit(xgb1, train, predictors)

[0]	train-mlogloss:1.83874	test-mlogloss:1.84137
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 3 rounds.
[1]	train-mlogloss:1.54737	test-mlogloss:1.55199
[2]	train-mlogloss:1.3338	test-mlogloss:1.34004
[3]	train-mlogloss:1.16554	test-mlogloss:1.17334
[4]	train-mlogloss:1.02753	test-mlogloss:1.03639
[5]	train-mlogloss:0.911387	test-mlogloss:0.921146
[6]	train-mlogloss:0.812091	test-mlogloss:0.822702
[7]	train-mlogloss:0.726076	test-mlogloss:0.73731
[8]	train-mlogloss:0.651017	test-mlogloss:0.663016
[9]	train-mlogloss:0.584863	test-mlogloss:0.597296
[10]	train-mlogloss:0.526404	test-mlogloss:0.539375
[11]	train-mlogloss:0.47434	test-mlogloss:0.48783
[12]	train-mlogloss:0.427926	test-mlogloss:0.44187
[13]	train-mlogloss:0.38654	test-mlogloss:0.40084
[14]	train-mlogloss:0.349427	test-mlogloss:0.363964
[15]	train-mlogloss:0.31608	test-mlogloss:0.330723
[16]	train-mlogloss:0.286152	test-mlogloss:0.3

In [4]:
# tune max_depth and min_child_weight use the formal result of best n_estimators=145
predictors = [x for x in train.columns if x not in target]
param_test1 = {
 'max_depth' : [3, 5, 7 , 9] ,
 'min_child_weight':[1, 3, 5]
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=145, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'multi:softprob', num_class = 10, nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, n_jobs=1,iid=False, cv=5)
gsearch1.fit(train[predictors],train[target])
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



([mean: 0.99724, std: 0.00065, params: {'max_depth': 3, 'min_child_weight': 1},
  mean: 0.99687, std: 0.00128, params: {'max_depth': 3, 'min_child_weight': 3},
  mean: 0.99641, std: 0.00128, params: {'max_depth': 3, 'min_child_weight': 5},
  mean: 0.99696, std: 0.00069, params: {'max_depth': 5, 'min_child_weight': 1},
  mean: 0.99641, std: 0.00106, params: {'max_depth': 5, 'min_child_weight': 3},
  mean: 0.99650, std: 0.00135, params: {'max_depth': 5, 'min_child_weight': 5},
  mean: 0.99678, std: 0.00092, params: {'max_depth': 7, 'min_child_weight': 1},
  mean: 0.99650, std: 0.00122, params: {'max_depth': 7, 'min_child_weight': 3},
  mean: 0.99614, std: 0.00138, params: {'max_depth': 7, 'min_child_weight': 5},
  mean: 0.99669, std: 0.00102, params: {'max_depth': 9, 'min_child_weight': 1},
  mean: 0.99669, std: 0.00128, params: {'max_depth': 9, 'min_child_weight': 3},
  mean: 0.99623, std: 0.00147, params: {'max_depth': 9, 'min_child_weight': 5}],
 {'max_depth': 3, 'min_child_weight': 1

In [4]:
# continue to find the optimal value
predictors = [x for x in train.columns if x not in target]
param_test1 = {
 'max_depth' : [2, 3 , 4] ,
 'min_child_weight':[1, 2, 3]
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=145, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'multi:softprob', num_class = 10, nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, n_jobs=1,iid=False, cv=5)
gsearch1.fit(train[predictors],train[target])
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



([mean: 0.99660, std: 0.00090, params: {'max_depth': 2, 'min_child_weight': 1},
  mean: 0.99669, std: 0.00089, params: {'max_depth': 2, 'min_child_weight': 2},
  mean: 0.99660, std: 0.00090, params: {'max_depth': 2, 'min_child_weight': 3},
  mean: 0.99724, std: 0.00065, params: {'max_depth': 3, 'min_child_weight': 1},
  mean: 0.99678, std: 0.00120, params: {'max_depth': 3, 'min_child_weight': 2},
  mean: 0.99687, std: 0.00128, params: {'max_depth': 3, 'min_child_weight': 3},
  mean: 0.99669, std: 0.00094, params: {'max_depth': 4, 'min_child_weight': 1},
  mean: 0.99678, std: 0.00127, params: {'max_depth': 4, 'min_child_weight': 2},
  mean: 0.99650, std: 0.00129, params: {'max_depth': 4, 'min_child_weight': 3}],
 {'max_depth': 3, 'min_child_weight': 1},
 0.9972395121199924)

In [4]:
# Tune gamma with the value get before
predictors = [x for x in train.columns if x not in target]
param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=145, max_depth=3,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'multi:softprob', num_class = 10, nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test3,n_jobs=1,iid=False, cv=5)
gsearch3.fit(train[predictors],train[target])
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_



([mean: 0.99724, std: 0.00065, params: {'gamma': 0.0},
  mean: 0.99696, std: 0.00062, params: {'gamma': 0.1},
  mean: 0.99669, std: 0.00089, params: {'gamma': 0.2},
  mean: 0.99678, std: 0.00092, params: {'gamma': 0.3},
  mean: 0.99660, std: 0.00080, params: {'gamma': 0.4}],
 {'gamma': 0.0},
 0.9972395121199924)

In [5]:
# re-calibrate the number of boosting rounds for the updated parameter
xgb2 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=3,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'multi:softprob',
 num_class = 10,
 nthread=4,
 scale_pos_weight=1,
 seed=27)
modelfit(xgb2, train, predictors)

[0]	train-mlogloss:1.85137	test-mlogloss:1.8535
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 3 rounds.
[1]	train-mlogloss:1.56466	test-mlogloss:1.56825
[2]	train-mlogloss:1.35123	test-mlogloss:1.35598
[3]	train-mlogloss:1.18356	test-mlogloss:1.18905
[4]	train-mlogloss:1.046	test-mlogloss:1.05221
[5]	train-mlogloss:0.930168	test-mlogloss:0.936988
[6]	train-mlogloss:0.831311	test-mlogloss:0.838819
[7]	train-mlogloss:0.745389	test-mlogloss:0.753296
[8]	train-mlogloss:0.670433	test-mlogloss:0.678771
[9]	train-mlogloss:0.604452	test-mlogloss:0.613264
[10]	train-mlogloss:0.545726	test-mlogloss:0.554717
[11]	train-mlogloss:0.493578	test-mlogloss:0.502866
[12]	train-mlogloss:0.446997	test-mlogloss:0.45661
[13]	train-mlogloss:0.405531	test-mlogloss:0.41545
[14]	train-mlogloss:0.368166	test-mlogloss:0.378257
[15]	train-mlogloss:0.334641	test-mlogloss:0.344909
[16]	train-mlogloss:0.304485	test-mlogloss:

In [4]:
# Tune subsample and colsample_bytree
predictors = [x for x in train.columns if x not in target]
param_test4 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators= 177,max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'multi:softprob', num_class = 10, nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test4 , n_jobs=1,iid=False, cv=5)
gsearch1.fit(train[predictors],train[target])
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



([mean: 0.99687, std: 0.00098, params: {'colsample_bytree': 0.6, 'subsample': 0.6},
  mean: 0.99706, std: 0.00055, params: {'colsample_bytree': 0.6, 'subsample': 0.7},
  mean: 0.99706, std: 0.00055, params: {'colsample_bytree': 0.6, 'subsample': 0.8},
  mean: 0.99724, std: 0.00058, params: {'colsample_bytree': 0.6, 'subsample': 0.9},
  mean: 0.99706, std: 0.00095, params: {'colsample_bytree': 0.7, 'subsample': 0.6},
  mean: 0.99696, std: 0.00075, params: {'colsample_bytree': 0.7, 'subsample': 0.7},
  mean: 0.99715, std: 0.00054, params: {'colsample_bytree': 0.7, 'subsample': 0.8},
  mean: 0.99696, std: 0.00075, params: {'colsample_bytree': 0.7, 'subsample': 0.9},
  mean: 0.99660, std: 0.00095, params: {'colsample_bytree': 0.8, 'subsample': 0.6},
  mean: 0.99678, std: 0.00087, params: {'colsample_bytree': 0.8, 'subsample': 0.7},
  mean: 0.99696, std: 0.00069, params: {'colsample_bytree': 0.8, 'subsample': 0.8},
  mean: 0.99696, std: 0.00047, params: {'colsample_bytree': 0.8, 'subsample'

In [4]:
# Tune subsample and colsample_bytree again
predictors = [x for x in train.columns if x not in target]
param_test5 = {
 'subsample':[i/100.0 for i in range(55,70,5)],
 'colsample_bytree':[i/100.0 for i in range(85,100,5)]
}
gsearch5 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators= 177,max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'multi:softprob', num_class = 10, nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test5 , n_jobs=1,iid=False, cv=5)
gsearch5.fit(train[predictors],train[target])
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_



([mean: 0.99678, std: 0.00109, params: {'colsample_bytree': 0.85, 'subsample': 0.55},
  mean: 0.99650, std: 0.00119, params: {'colsample_bytree': 0.85, 'subsample': 0.6},
  mean: 0.99678, std: 0.00087, params: {'colsample_bytree': 0.85, 'subsample': 0.65},
  mean: 0.99678, std: 0.00082, params: {'colsample_bytree': 0.9, 'subsample': 0.55},
  mean: 0.99678, std: 0.00082, params: {'colsample_bytree': 0.9, 'subsample': 0.6},
  mean: 0.99687, std: 0.00084, params: {'colsample_bytree': 0.9, 'subsample': 0.65},
  mean: 0.99660, std: 0.00107, params: {'colsample_bytree': 0.95, 'subsample': 0.55},
  mean: 0.99678, std: 0.00077, params: {'colsample_bytree': 0.95, 'subsample': 0.6},
  mean: 0.99669, std: 0.00089, params: {'colsample_bytree': 0.95, 'subsample': 0.65}],
 {'colsample_bytree': 0.9, 'subsample': 0.65},
 0.9968715268394035)

In [4]:
# Tuning Regularization Parameters
predictors = [x for x in train.columns if x not in target]
param_test5 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
gsearch5 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators= 177,max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.65, colsample_bytree=0.9,
 objective= 'multi:softprob', num_class = 10, nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test5 , n_jobs=1,iid=False, cv=5)
gsearch5.fit(train[predictors],train[target])
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_



([mean: 0.99687, std: 0.00084, params: {'reg_alpha': 1e-05},
  mean: 0.99678, std: 0.00101, params: {'reg_alpha': 0.01},
  mean: 0.99669, std: 0.00102, params: {'reg_alpha': 0.1},
  mean: 0.99660, std: 0.00103, params: {'reg_alpha': 1},
  mean: 0.98859, std: 0.00158, params: {'reg_alpha': 100}],
 {'reg_alpha': 1e-05},
 0.9968715268394035)

In [6]:
# Tuning Regularization Parameters
predictors = [x for x in train.columns if x not in target]
param_test5 = {
 'reg_alpha':[0, 1e-6, 5e-6, 1e-5, 5e-5]
}
gsearch5 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators= 177,max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.65, colsample_bytree=0.9,
 objective= 'multi:softprob', num_class = 10, nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test5 , n_jobs=1,iid=False, cv=5)
gsearch5.fit(train[predictors],train[target])
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_



([mean: 0.99687, std: 0.00084, params: {'reg_alpha': 0},
  mean: 0.99687, std: 0.00084, params: {'reg_alpha': 1e-06},
  mean: 0.99678, std: 0.00101, params: {'reg_alpha': 5e-06},
  mean: 0.99687, std: 0.00084, params: {'reg_alpha': 1e-05},
  mean: 0.99687, std: 0.00084, params: {'reg_alpha': 5e-05}],
 {'reg_alpha': 0},
 0.9968715268394035)

In [8]:
# See the result with new parameter
predictors = [x for x in train.columns if x not in target]
xgb3 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=3,
 min_child_weight=1,
 gamma=0,
 subsample=0.65,
 reg_alpha=0,
 colsample_bytree=0.9,
 objective= 'multi:softprob',
 num_class = 10,
 nthread=4,
 scale_pos_weight=1,
 seed=27)
modelfit(xgb3, train, predictors)

[0]	train-mlogloss:1.85114	test-mlogloss:1.85287
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 3 rounds.
[1]	train-mlogloss:1.56536	test-mlogloss:1.56829
[2]	train-mlogloss:1.35198	test-mlogloss:1.35577
[3]	train-mlogloss:1.18461	test-mlogloss:1.18935
[4]	train-mlogloss:1.047	test-mlogloss:1.05254
[5]	train-mlogloss:0.931335	test-mlogloss:0.937618
[6]	train-mlogloss:0.832657	test-mlogloss:0.83939
[7]	train-mlogloss:0.74702	test-mlogloss:0.754439
[8]	train-mlogloss:0.672019	test-mlogloss:0.679971
[9]	train-mlogloss:0.606046	test-mlogloss:0.614451
[10]	train-mlogloss:0.547283	test-mlogloss:0.55602
[11]	train-mlogloss:0.495066	test-mlogloss:0.504015
[12]	train-mlogloss:0.448499	test-mlogloss:0.457711
[13]	train-mlogloss:0.406859	test-mlogloss:0.416326
[14]	train-mlogloss:0.369493	test-mlogloss:0.379201
[15]	train-mlogloss:0.335813	test-mlogloss:0.34568
[16]	train-mlogloss:0.305589	test-mlogloss:0

In [9]:
#Reducing Learning Rate
xgb4 = XGBClassifier(
 learning_rate =0.01,
 n_estimators=5000,
 max_depth=3,
 min_child_weight=1,
 gamma=0,
 subsample=0.65,
 reg_alpha=0,
 colsample_bytree=0.9,
 objective= 'multi:softprob',
 num_class = 10,
 nthread=4,
 scale_pos_weight=1,
 seed=27)
modelfit(xgb4, train, predictors)

[0]	train-mlogloss:2.2561	test-mlogloss:2.25627
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 3 rounds.
[1]	train-mlogloss:2.21152	test-mlogloss:2.21191
[2]	train-mlogloss:2.16895	test-mlogloss:2.16951
[3]	train-mlogloss:2.12832	test-mlogloss:2.12915
[4]	train-mlogloss:2.08914	test-mlogloss:2.09015
[5]	train-mlogloss:2.0515	test-mlogloss:2.05273
[6]	train-mlogloss:2.01539	test-mlogloss:2.01684
[7]	train-mlogloss:1.98052	test-mlogloss:1.98216
[8]	train-mlogloss:1.94695	test-mlogloss:1.94869
[9]	train-mlogloss:1.91451	test-mlogloss:1.91639
[10]	train-mlogloss:1.88306	test-mlogloss:1.88508
[11]	train-mlogloss:1.85271	test-mlogloss:1.8549
[12]	train-mlogloss:1.82326	test-mlogloss:1.82562
[13]	train-mlogloss:1.79476	test-mlogloss:1.79727
[14]	train-mlogloss:1.76703	test-mlogloss:1.76966
[15]	train-mlogloss:1.74007	test-mlogloss:1.74283
[16]	train-mlogloss:1.71387	test-mlogloss:1.71674
[17]	train-ml

KeyboardInterrupt: 