In [1]:
#Import libraries:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional     scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4



In [2]:
train = pd.read_csv('LargeTrain.csv')
target = 'Class'

In [6]:
def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,stratified=True,
            metrics={'mlogloss'}, early_stopping_rounds=early_stopping_rounds, callbacks=[xgb.callback.print_evaluation(show_stdv=False),                                                               xgb.callback.early_stop(3)])

        print (cvresult)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['Class'],eval_metric='mlogloss')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])

    #Print model report:
    print ("\nModel Report")
    print ("Accuracy : %.4g" % metrics.accuracy_score(dtrain['Class'].values, dtrain_predictions))
    print ("Log Loss Score (Train): %f" % metrics.log_loss(dtrain['Class'], dtrain_predprob))


In [5]:
# first try with all random para

#Choose all predictors except target 
predictors = [x for x in train.columns if x not in target]

param_test1 = {
 'min_child_weight':[1, 3]
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=50, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'multi:softprob', num_class = 10, nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, n_jobs=1,iid=False, cv=5)
gsearch1.fit(train[predictors],train[target])
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



([mean: 0.99623, std: 0.00068, params: {'min_child_weight': 1},
  mean: 0.99614, std: 0.00111, params: {'min_child_weight': 3}],
 {'min_child_weight': 1},
 0.9962272969064084)

In [7]:
# choose the best n_estimators
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'multi:softprob',
 num_class = 10,
 nthread=4,
 scale_pos_weight=1,
 seed=27)
modelfit(xgb1, train, predictors)

[0]	train-mlogloss:1.83874	test-mlogloss:1.84137
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 3 rounds.
[1]	train-mlogloss:1.54737	test-mlogloss:1.55199
[2]	train-mlogloss:1.3338	test-mlogloss:1.34004
[3]	train-mlogloss:1.16554	test-mlogloss:1.17334
[4]	train-mlogloss:1.02753	test-mlogloss:1.03639
[5]	train-mlogloss:0.911387	test-mlogloss:0.921146
[6]	train-mlogloss:0.812091	test-mlogloss:0.822702
[7]	train-mlogloss:0.726076	test-mlogloss:0.73731
[8]	train-mlogloss:0.651017	test-mlogloss:0.663016
[9]	train-mlogloss:0.584863	test-mlogloss:0.597296
[10]	train-mlogloss:0.526404	test-mlogloss:0.539375
[11]	train-mlogloss:0.47434	test-mlogloss:0.48783
[12]	train-mlogloss:0.427926	test-mlogloss:0.44187
[13]	train-mlogloss:0.38654	test-mlogloss:0.40084
[14]	train-mlogloss:0.349427	test-mlogloss:0.363964
[15]	train-mlogloss:0.31608	test-mlogloss:0.330723
[16]	train-mlogloss:0.286152	test-mlogloss:0.3

In [8]:
# tune max_depth and min_child_weight use the formal result of best n_estimators=145

param_test1 = {
 'max_depth' : [3, 5, 7 , 9] ,
 'min_child_weight':[1, 3, 5]
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=145, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'multi:softprob', num_class = 10, nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, n_jobs=1,iid=False, cv=5)
gsearch1.fit(train[predictors],train[target])
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

KeyboardInterrupt: 