In [1]:
import xgboost as xgb
import numpy as np
from sklearn.metrics import log_loss

In [2]:
Dtrain = xgb.DMatrix('Dataset/xgboost_train.buffer')
Dtest = xgb.DMatrix('Dataset/xgboost_test.buffer')

In [3]:
type(Dtrain)

xgboost.core.DMatrix

eta [default=0.3]
Analogous to learning rate in GBM
Makes the model more robust by shrinking the weights on each step
Typical final values to be used: 0.01-0.2
min_child_weight [default=1]
Defines the minimum sum of weights of all observations required in a child.
This is similar to min_child_leaf in GBM but not exactly. This refers to min “sum of weights” of observations while GBM has min “number of observations”.
Used to control over-fitting. Higher values prevent a model from learning relations which might be highly specific to the particular sample selected for a tree.
Too high values can lead to under-fitting hence, it should be tuned using CV.
max_depth [default=6]
The maximum depth of a tree, same as GBM.
Used to control over-fitting as higher depth will allow model to learn relations very specific to a particular sample.
Should be tuned using CV.
Typical values: 3-10
max_leaf_nodes
The maximum number of terminal nodes or leaves in a tree.
Can be defined in place of max_depth. Since binary trees are created, a depth of ‘n’ would produce a maximum of 2^n leaves.
If this is defined, GBM will ignore max_depth.
gamma [default=0]
A node is split only when the resulting split gives a positive reduction in the loss function. Gamma specifies the minimum loss reduction required to make a split.
Makes the algorithm conservative. The values can vary depending on the loss function and should be tuned.
max_delta_step [default=0]
In maximum delta step we allow each tree’s weight estimation to be. If the value is set to 0, it means there is no constraint. If it is set to a positive value, it can help making the update step more conservative.
Usually this parameter is not needed, but it might help in logistic regression when class is extremely imbalanced.
This is generally not used but you can explore further if you wish.
subsample [default=1]
Same as the subsample of GBM. Denotes the fraction of observations to be randomly samples for each tree.
Lower values make the algorithm more conservative and prevents overfitting but too small values might lead to under-fitting.
Typical values: 0.5-1
colsample_bytree [default=1]
Similar to max_features in GBM. Denotes the fraction of columns to be randomly samples for each tree.
Typical values: 0.5-1
colsample_bylevel [default=1]
Denotes the subsample ratio of columns for each split, in each level.
I don’t use this often because subsample and colsample_bytree will do the job for you. but you can explore further if you feel so.
lambda [default=1]
L2 regularization term on weights (analogous to Ridge regression)
This used to handle the regularization part of XGBoost. Though many data scientists don’t use it often, it should be explored to reduce overfitting.
alpha [default=0]
L1 regularization term on weight (analogous to Lasso regression)
Can be used in case of very high dimensionality so that the algorithm runs faster when implemented
scale_pos_weight [default=1]
A value greater than 0 should be used in case of high class imbalance as it helps in faster convergence.
 

In [4]:
evallist = [(Dtrain, 'train'), (Dtest, 'eval')]

In [5]:
import time
def xgboost_train(params):
    print("====================================")
    print("Parameters: ",params)
    print("model starts")
    start = time.time()
    model = xgb.train(params=params, 
        dtrain=Dtrain, 
        num_boost_round=20,  
        early_stopping_rounds=10, 
        evals=evallist)
    score = log_loss(Dtest.get_label(), model.predict(Dtest))
    end = time.time()
    eta = str(params['eta'])
    dp = str(params['max_depth'])
    model.save_model('Dataset/0422' + eta + '_' + dp +'.model')
    print("Time pass: ", end-start)
    
    print("Score: ", score)
#     return {'loss': score, 'status': STATUS_OK}

In [6]:
space = {'eta' : 0.3,
    'max_depth' : 6,
    'num_class' : 38,
    'eval_metric': 'mlogloss',
    'objective': 'multi:softprob'}
xgboost_train(space)

Parameters:  {'eta': 0.3, 'max_depth': 6, 'num_class': 38, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob'}
model starts
[0]	train-mlogloss:2.02799	eval-mlogloss:2.03669
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 10 rounds.
[1]	train-mlogloss:1.85894	eval-mlogloss:1.86902
[2]	train-mlogloss:1.76389	eval-mlogloss:1.77561
[3]	train-mlogloss:1.6963	eval-mlogloss:1.70926
[4]	train-mlogloss:1.64497	eval-mlogloss:1.65882
[5]	train-mlogloss:1.60352	eval-mlogloss:1.61853
[6]	train-mlogloss:1.56946	eval-mlogloss:1.58567
[7]	train-mlogloss:1.5413	eval-mlogloss:1.559
[8]	train-mlogloss:1.51525	eval-mlogloss:1.53385
[9]	train-mlogloss:1.49453	eval-mlogloss:1.51416
[10]	train-mlogloss:1.47418	eval-mlogloss:1.49456
[11]	train-mlogloss:1.45657	eval-mlogloss:1.47785
[12]	train-mlogloss:1.442	eval-mlogloss:1.46415
[13]	train-mlogloss:1.42863	eval-mlogloss:1.4516
[14]	train-mlogloss:1.41564	eval-mlo

In [8]:
space = {'eta' : 0.3,
    'max_depth' : 7,
    'num_class' : 38,
    'eval_metric': 'mlogloss',
    'objective': 'multi:softprob'}
xgboost_train(space)

Parameters:  {'eta': 0.3, 'max_depth': 7, 'num_class': 38, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob'}
model starts
[0]	train-mlogloss:1.9909	eval-mlogloss:2.00193
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 10 rounds.
[1]	train-mlogloss:1.82107	eval-mlogloss:1.83512
[2]	train-mlogloss:1.72142	eval-mlogloss:1.73838
[3]	train-mlogloss:1.65013	eval-mlogloss:1.66936
[4]	train-mlogloss:1.59602	eval-mlogloss:1.61732
[5]	train-mlogloss:1.5524	eval-mlogloss:1.57606
[6]	train-mlogloss:1.51699	eval-mlogloss:1.54216
[7]	train-mlogloss:1.48364	eval-mlogloss:1.51034
[8]	train-mlogloss:1.45694	eval-mlogloss:1.48563
[9]	train-mlogloss:1.43601	eval-mlogloss:1.46631
[10]	train-mlogloss:1.41477	eval-mlogloss:1.44643
[11]	train-mlogloss:1.39582	eval-mlogloss:1.42922
[12]	train-mlogloss:1.37895	eval-mlogloss:1.41301
[13]	train-mlogloss:1.36239	eval-mlogloss:1.39791
[14]	train-mlogloss:1.35001	eva

In [7]:
space = {'eta' : 0.3,
    'max_depth' : 8,
    'num_class' : 38,
    'eval_metric': 'mlogloss',
    'objective': 'multi:softprob'}
xgboost_train(space)

Parameters:  {'eta': 0.3, 'max_depth': 8, 'num_class': 38, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob'}
model starts
[0]	train-mlogloss:1.95505	eval-mlogloss:1.97063
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 10 rounds.
[1]	train-mlogloss:1.78105	eval-mlogloss:1.8019
[2]	train-mlogloss:1.67714	eval-mlogloss:1.70209
[3]	train-mlogloss:1.60215	eval-mlogloss:1.63026
[4]	train-mlogloss:1.54288	eval-mlogloss:1.57452
[5]	train-mlogloss:1.49655	eval-mlogloss:1.53072
[6]	train-mlogloss:1.45493	eval-mlogloss:1.49156
[7]	train-mlogloss:1.42201	eval-mlogloss:1.46075
[8]	train-mlogloss:1.3935	eval-mlogloss:1.43452
[9]	train-mlogloss:1.36504	eval-mlogloss:1.4083
[10]	train-mlogloss:1.34434	eval-mlogloss:1.38951
[11]	train-mlogloss:1.32446	eval-mlogloss:1.37146
[12]	train-mlogloss:1.30467	eval-mlogloss:1.35348
[13]	train-mlogloss:1.28717	eval-mlogloss:1.33806
[14]	train-mlogloss:1.27268	eval

In [9]:
space = {'eta' : 0.3,
    'max_depth' : 9,
    'num_class' : 38,
    'eval_metric': 'mlogloss',
    'objective': 'multi:softprob'}
xgboost_train(space)

Parameters:  {'eta': 0.3, 'max_depth': 9, 'num_class': 38, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob'}
model starts
[0]	train-mlogloss:1.92237	eval-mlogloss:1.94334
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 10 rounds.
[1]	train-mlogloss:1.74138	eval-mlogloss:1.7701
[2]	train-mlogloss:1.62938	eval-mlogloss:1.66397
[3]	train-mlogloss:1.54921	eval-mlogloss:1.58812
[4]	train-mlogloss:1.48743	eval-mlogloss:1.53065
[5]	train-mlogloss:1.44066	eval-mlogloss:1.48718
[6]	train-mlogloss:1.39603	eval-mlogloss:1.44552
[7]	train-mlogloss:1.35875	eval-mlogloss:1.41155
[8]	train-mlogloss:1.32515	eval-mlogloss:1.38011
[9]	train-mlogloss:1.29895	eval-mlogloss:1.35656
[10]	train-mlogloss:1.27403	eval-mlogloss:1.334
[11]	train-mlogloss:1.25328	eval-mlogloss:1.31568
[12]	train-mlogloss:1.23218	eval-mlogloss:1.29601
[13]	train-mlogloss:1.21361	eval-mlogloss:1.27979
[14]	train-mlogloss:1.1985	eval-

In [10]:
space = {'eta' : 0.3,
    'max_depth' : 10,
    'num_class' : 38,
    'eval_metric': 'mlogloss',
    'objective': 'multi:softprob'}
xgboost_train(space)

Parameters:  {'eta': 0.3, 'max_depth': 10, 'num_class': 38, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob'}
model starts
[0]	train-mlogloss:1.88799	eval-mlogloss:1.91395
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 10 rounds.
[1]	train-mlogloss:1.69975	eval-mlogloss:1.73682
[2]	train-mlogloss:1.58218	eval-mlogloss:1.6267
[3]	train-mlogloss:1.49632	eval-mlogloss:1.5459
[4]	train-mlogloss:1.42696	eval-mlogloss:1.4818
[5]	train-mlogloss:1.37335	eval-mlogloss:1.43311
[6]	train-mlogloss:1.32998	eval-mlogloss:1.39325
[7]	train-mlogloss:1.28984	eval-mlogloss:1.357
[8]	train-mlogloss:1.25597	eval-mlogloss:1.32666
[9]	train-mlogloss:1.22266	eval-mlogloss:1.29588
[10]	train-mlogloss:1.19855	eval-mlogloss:1.27411
[11]	train-mlogloss:1.17255	eval-mlogloss:1.2509
[12]	train-mlogloss:1.148	eval-mlogloss:1.22845
[13]	train-mlogloss:1.12969	eval-mlogloss:1.21239
[14]	train-mlogloss:1.11129	eval-mlo

In [13]:
space = {'eta' : 0.3,
    'max_depth' : 15,
    'num_class' : 38,
    'eval_metric': 'mlogloss',
    'objective': 'multi:softprob'}
xgboost_train(space)

Parameters:  {'eta': 0.3, 'max_depth': 15, 'num_class': 38, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob'}
model starts
[0]	train-mlogloss:1.74298	eval-mlogloss:1.79519
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 10 rounds.
[1]	train-mlogloss:1.5149	eval-mlogloss:1.59136
[2]	train-mlogloss:1.36707	eval-mlogloss:1.46136
[3]	train-mlogloss:1.25257	eval-mlogloss:1.35891
[4]	train-mlogloss:1.15804	eval-mlogloss:1.2741
[5]	train-mlogloss:1.08457	eval-mlogloss:1.20809
[6]	train-mlogloss:1.02097	eval-mlogloss:1.1519
[7]	train-mlogloss:0.966079	eval-mlogloss:1.10401
[8]	train-mlogloss:0.923604	eval-mlogloss:1.06688
[9]	train-mlogloss:0.882565	eval-mlogloss:1.03048
[10]	train-mlogloss:0.853663	eval-mlogloss:1.00564
[11]	train-mlogloss:0.82315	eval-mlogloss:0.97885
[12]	train-mlogloss:0.796487	eval-mlogloss:0.954928
[13]	train-mlogloss:0.769416	eval-mlogloss:0.931275
[14]	train-mlogloss:0.7

In [11]:
space = {'eta' : 0.35,
    'max_depth' : 5,
    'num_class' : 38,
    'eval_metric': 'mlogloss',
    'objective': 'multi:softprob'}
xgboost_train(space)

Parameters:  {'eta': 0.35, 'max_depth': 5, 'num_class': 38, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob'}
model starts
[0]	train-mlogloss:1.99964	eval-mlogloss:2.00561
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 10 rounds.
[1]	train-mlogloss:1.81927	eval-mlogloss:1.82501
[2]	train-mlogloss:1.73642	eval-mlogloss:1.74446
[3]	train-mlogloss:1.68128	eval-mlogloss:1.69073
[4]	train-mlogloss:1.63855	eval-mlogloss:1.64903
[5]	train-mlogloss:1.60672	eval-mlogloss:1.61816
[6]	train-mlogloss:1.58001	eval-mlogloss:1.59219
[7]	train-mlogloss:1.55723	eval-mlogloss:1.57036
[8]	train-mlogloss:1.53648	eval-mlogloss:1.55008
[9]	train-mlogloss:1.51957	eval-mlogloss:1.53362
[10]	train-mlogloss:1.50424	eval-mlogloss:1.51942
[11]	train-mlogloss:1.49008	eval-mlogloss:1.50582
[12]	train-mlogloss:1.47789	eval-mlogloss:1.49433
[13]	train-mlogloss:1.46776	eval-mlogloss:1.48468
[14]	train-mlogloss:1.4592	e

In [12]:
space = {'eta' : 0.35,
    'max_depth' : 6,
    'num_class' : 38,
    'eval_metric': 'mlogloss',
    'objective': 'multi:softprob'}
xgboost_train(space)

Parameters:  {'eta': 0.35, 'max_depth': 6, 'num_class': 38, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob'}
model starts
[0]	train-mlogloss:1.95813	eval-mlogloss:1.96791
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 10 rounds.
[1]	train-mlogloss:1.77987	eval-mlogloss:1.79207
[2]	train-mlogloss:1.69105	eval-mlogloss:1.70573
[3]	train-mlogloss:1.6323	eval-mlogloss:1.64867
[4]	train-mlogloss:1.58738	eval-mlogloss:1.60488
[5]	train-mlogloss:1.55404	eval-mlogloss:1.57311
[6]	train-mlogloss:1.52419	eval-mlogloss:1.54451
[7]	train-mlogloss:1.49873	eval-mlogloss:1.51988
[8]	train-mlogloss:1.47515	eval-mlogloss:1.4976
[9]	train-mlogloss:1.45687	eval-mlogloss:1.48018
[10]	train-mlogloss:1.4379	eval-mlogloss:1.46233
[11]	train-mlogloss:1.42318	eval-mlogloss:1.44889
[12]	train-mlogloss:1.40906	eval-mlogloss:1.43553
[13]	train-mlogloss:1.39699	eval-mlogloss:1.42464
[14]	train-mlogloss:1.38436	eva

In [None]:
space = {'eta' : 0.35,
    'max_depth' : 7,
    'num_class' : 38,
    'eval_metric': 'mlogloss',
    'objective': 'multi:softprob'}
xgboost_train(space)

Parameters:  {'eta': 0.35, 'max_depth': 7, 'num_class': 38, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob'}
model starts
[0]	train-mlogloss:1.92031	eval-mlogloss:1.93285
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 10 rounds.
[1]	train-mlogloss:1.73223	eval-mlogloss:1.74967
[2]	train-mlogloss:1.64219	eval-mlogloss:1.663
[3]	train-mlogloss:1.58207	eval-mlogloss:1.6053
[4]	train-mlogloss:1.53305	eval-mlogloss:1.55801
[5]	train-mlogloss:1.49652	eval-mlogloss:1.52324
[6]	train-mlogloss:1.46579	eval-mlogloss:1.49474
[7]	train-mlogloss:1.43958	eval-mlogloss:1.47009
[8]	train-mlogloss:1.41425	eval-mlogloss:1.44646
[9]	train-mlogloss:1.39484	eval-mlogloss:1.42936
[10]	train-mlogloss:1.37125	eval-mlogloss:1.4083
[11]	train-mlogloss:1.35485	eval-mlogloss:1.39367
[12]	train-mlogloss:1.33877	eval-mlogloss:1.37921
[13]	train-mlogloss:1.32244	eval-mlogloss:1.36468
[14]	train-mlogloss:1.31014	eval

In [None]:
from hyperopt import fmin, tpe, hp, STATUS_OK 
# Define the hyperparameter space
space = {'eta' : 0.4,
    'max_depth' : hp.choice('max_depth', np.arange(2, 5, dtype=int)),
    'num_class' : 38,
    'eval_metric': 'mlogloss',
    'objective': 'multi:softprob'}

# Evaluate the function fmin over the hyperparameter space, and
# print the best hyperparameters.  
xgboost_best = fmin(xgboost_train, space=space, algo=tpe.suggest, max_evals = 100)
print ("==================best==================")
print ("Optimal parameters for dtrain1 are: ", xgboost_best)

Parameters:  {'eta': 0.4, 'eval_metric': 'mlogloss', 'max_depth': 3, 'num_class': 38, 'objective': 'multi:softprob'}
model starts
[0]	train-mlogloss:2.08344	eval-mlogloss:2.08256
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 10 rounds.
[1]	train-mlogloss:1.90168	eval-mlogloss:1.90223
[2]	train-mlogloss:1.80561	eval-mlogloss:1.80644
[3]	train-mlogloss:1.75378	eval-mlogloss:1.75452
[4]	train-mlogloss:1.7176	eval-mlogloss:1.71879
[5]	train-mlogloss:1.68896	eval-mlogloss:1.69039
[6]	train-mlogloss:1.66623	eval-mlogloss:1.66761
[7]	train-mlogloss:1.64974	eval-mlogloss:1.65131
[8]	train-mlogloss:1.63508	eval-mlogloss:1.63675
[9]	train-mlogloss:1.62183	eval-mlogloss:1.62379
[10]	train-mlogloss:1.61049	eval-mlogloss:1.61237
[11]	train-mlogloss:1.6	eval-mlogloss:1.60216
[12]	train-mlogloss:1.59083	eval-mlogloss:1.59352
[13]	train-mlogloss:1.58268	eval-mlogloss:1.58559
[14]	train-mlogloss:1.57441	eval-m

In [None]:
from hyperopt import fmin, tpe, hp, STATUS_OK 
# Define the hyperparameter space
space = {'eta' : hp.quniform('eta', 0.05, 0.35, 0.05),
    'max_depth' : hp.choice('max_depth', np.arange(1, 5, dtype=int)),
    'num_class' : 38,
    'eval_metric': 'mlogloss',
    'objective': 'multi:softprob'}

# Evaluate the function fmin over the hyperparameter space, and
# print the best hyperparameters.  
xgboost_best = fmin(xgboost_train, space=space, algo=tpe.suggest, max_evals = 100)
print ("==================best==================")
print ("Optimal parameters for dtrain1 are: ", xgboost_best)

Parameters:  {'eta': 0.30000000000000004, 'eval_metric': 'mlogloss', 'max_depth': 3, 'num_class': 38, 'objective': 'multi:softprob'}
model starts
[0]	train-mlogloss:2.17709	eval-mlogloss:2.17656
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 10 rounds.
[1]	train-mlogloss:2.00928	eval-mlogloss:2.00897
[2]	train-mlogloss:1.91196	eval-mlogloss:1.9116
[3]	train-mlogloss:1.84549	eval-mlogloss:1.84531
[4]	train-mlogloss:1.79614	eval-mlogloss:1.79591
[5]	train-mlogloss:1.75742	eval-mlogloss:1.75763
[6]	train-mlogloss:1.72727	eval-mlogloss:1.7278
[7]	train-mlogloss:1.70177	eval-mlogloss:1.70223
[8]	train-mlogloss:1.68156	eval-mlogloss:1.68219
[9]	train-mlogloss:1.66366	eval-mlogloss:1.66436
[10]	train-mlogloss:1.64836	eval-mlogloss:1.64918
[11]	train-mlogloss:1.63567	eval-mlogloss:1.63682
[12]	train-mlogloss:1.6248	eval-mlogloss:1.62616
[13]	train-mlogloss:1.61378	eval-mlogloss:1.61503
[14]	train-mlogl

In [5]:
params = {'max_depth':1, 
          'eta':0.05,
          'num_class' : 38,
          'eval_metric': 'mlogloss',
          'objective': 'multi:softprob'}

In [7]:
params['max_depth']

1

In [6]:
import time
start = time.time()
params = {'max_depth':1, 
          'eta':0.05,
          'num_class' : 38,
          'eval_metric': 'mlogloss',
          'objective': 'multi:softprob'}
model = xgb.train(params=params, 
    dtrain=Dtrain, 
    num_boost_round=10,  
#     early_stopping_rounds=10, 
    evals=evallist)
score = log_loss(Dtest.get_label(), model.predict(Dtest))
end = time.time()
print(end - start)

[0]	train-mlogloss:3.39081	eval-mlogloss:3.39046
[1]	train-mlogloss:3.24364	eval-mlogloss:3.24309
[2]	train-mlogloss:3.13172	eval-mlogloss:3.13101
[3]	train-mlogloss:3.04041	eval-mlogloss:3.03966
[4]	train-mlogloss:2.9628	eval-mlogloss:2.96181
[5]	train-mlogloss:2.89512	eval-mlogloss:2.89399
[6]	train-mlogloss:2.83529	eval-mlogloss:2.8342
[7]	train-mlogloss:2.78137	eval-mlogloss:2.78018
[8]	train-mlogloss:2.73262	eval-mlogloss:2.73145
[9]	train-mlogloss:2.68788	eval-mlogloss:2.6866
1246.0861339569092


In [6]:
model.save_model('Dataset/0416.model')

In [5]:
import pandas as pd

In [6]:
test = pd.read_csv('Dataset/real_test.csv',index_col = 0)

In [10]:
bst = xgb.Booster({'eta' : 0.3,
    'max_depth' : 15,
    'num_class' : 38,
    'eval_metric': 'mlogloss',
    'objective': 'multi:softprob'})

In [28]:
model = xgb.Booster(model_file='Dataset/04220.3_10.model')

In [29]:
log_loss(Dtest.get_label(), model.predict(Dtest))

1.1356376590077499

In [30]:
real_test = xgb.DMatrix('Dataset/dtest.buffer')

In [31]:
predict_mat = model.predict(real_test)

In [32]:
predict_mat.shape

(653646, 38)

In [33]:
result = pd.DataFrame(predict_mat)
dex = test.iloc[:,0]
submurge = pd.concat([dex,result], axis = 1)
print (submurge.shape)

(653646, 39)


In [34]:
avgmurg = submurge.groupby(submurge.VisitNumber).mean().reset_index()
avgmurg.reset_index(drop = True, inplace = True)
avgmurg.columns = ['VisitNumber', 'TripType_3','TripType_4','TripType_5','TripType_6','TripType_7',\
'TripType_8','TripType_9','TripType_12','TripType_14','TripType_15','TripType_18',\
'TripType_19','TripType_20','TripType_21','TripType_22','TripType_23','TripType_24',\
'TripType_25','TripType_26','TripType_27','TripType_28','TripType_29','TripType_30',\
'TripType_31','TripType_32','TripType_33','TripType_34','TripType_35','TripType_36',\
'TripType_37','TripType_38','TripType_39','TripType_40','TripType_41','TripType_42',\
'TripType_43','TripType_44','TripType_999']
avgmurg[['VisitNumber']] = avgmurg[['VisitNumber']].astype(int)
avgmurg.fillna(1/38, inplace=True )
avgmurg.to_csv('KaggleSub_KNN.csv', index = False, header=True)

In [23]:
from sklearn.metrics import accuracy_score

In [26]:
a=[[1,2],[3,4]]

In [29]:
max_pos = np.argmax(model.predict(Dtest),axis=1)

In [30]:
max_pos.shape

(129411,)

In [33]:
accuracy_score(Dtest.get_label(),max_pos)

0.56608016320096433

In [34]:
xgb.importance(feature_names = NULL, model = model)

AttributeError: module 'xgboost' has no attribute 'importance'

In [35]:
print(model.feature_importances_)

AttributeError: 'Booster' object has no attribute 'feature_importances_'

In [37]:
features = model.get_fscore()

In [42]:
sort_features = sorted( ((features[k],k) for k in features), reverse=True)

In [43]:
sort_features

[(184, 'f6'),
 (133, 'f9'),
 (92, 'f8'),
 (90, 'f5'),
 (87, 'f4'),
 (40, 'f0'),
 (25, 'f35'),
 (21, 'f76'),
 (21, 'f74'),
 (21, 'f62'),
 (19, 'f7'),
 (19, 'f23'),
 (17, 'f29'),
 (16, 'f53'),
 (16, 'f31'),
 (16, 'f1'),
 (15, 'f72'),
 (15, 'f50'),
 (14, 'f67'),
 (14, 'f42'),
 (13, 'f64'),
 (13, 'f36'),
 (11, 'f77'),
 (11, 'f61'),
 (11, 'f55'),
 (11, 'f30'),
 (11, 'f13'),
 (10, 'f51'),
 (10, 'f43'),
 (9, 'f59'),
 (9, 'f40'),
 (8, 'f45'),
 (8, 'f44'),
 (7, 'f60'),
 (7, 'f41'),
 (7, 'f28'),
 (7, 'f22'),
 (7, 'f14'),
 (6, 'f54'),
 (6, 'f27'),
 (6, 'f26'),
 (6, 'f16'),
 (5, 'f48'),
 (5, 'f46'),
 (5, 'f39'),
 (5, 'f17'),
 (4, 'f12'),
 (3, 'f52'),
 (3, 'f34'),
 (2, 'f75'),
 (2, 'f71'),
 (2, 'f66'),
 (2, 'f38'),
 (2, 'f24'),
 (2, 'f19'),
 (2, 'f15'),
 (2, 'f11'),
 (1, 'f79'),
 (1, 'f73'),
 (1, 'f70'),
 (1, 'f68'),
 (1, 'f65'),
 (1, 'f63'),
 (1, 'f57'),
 (1, 'f56'),
 (1, 'f49'),
 (1, 'f47'),
 (1, 'f33'),
 (1, 'f32'),
 (1, 'f21'),
 (1, 'f20'),
 (1, 'f18')]

In [None]:
params = {'max_depth':2, 
          'eta':0.2,
          'num_class' : 38,
          'eval_metric': 'mlogloss',
          'objective': 'multi:softprob'}
model2 = xgb.train(params=params, 
    dtrain=Dtrain, 
    num_boost_round=500,  
    early_stopping_rounds=10, 
    evals=evallist)
score2 = log_loss(Dtest.get_label(), model2.predict(Dtest))