In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn
from sklearn import preprocessing
import time
%matplotlib inline

In [2]:
train = pd.read_csv('input/otto_train.csv')
print(train.shape)

(61878, 95)


In [3]:
def encode_features(dat):
    df = pd.DataFrame(index=dat.index.values)
    for c in dat.columns.values:
        unq = np.unique(dat[c])
        arr = np.zeros(len(df))
        for ii, u in enumerate(unq):
            flg = (dat[c] == u).values
            arr[flg] = ii
        df[c] = arr.astype(int)
    return df

In [4]:
x = encode_features(train.drop(['id', 'target'], axis=1))
y = np.array([int(v.split('_')[1])-1 for v in train.target])
print(x.shape, y.shape)

(61878, 93) (61878,)


In [5]:
num_cls = len(np.unique(y))
print(num_cls)

9


In [6]:
prm_xgb = {
    'booster': 'gbtree',
    'objective': 'multi:softprob',
    'num_class': num_cls,
    'max_depth': 5,
    'learning_rate': 0.1,
    'colsample_bytree': 0.9,
    'subsample': 0.9,
    'eval_metric': 'mlogloss',
}
prm_lgb = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': num_cls,
    'num_leaves' : 2**5-1,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.9,
    'metric': 'multi_logloss',
}
num_round = 100

In [7]:
flg_train = np.random.choice([False, True], len(y), p=[0.3, 0.7])
flg_valid = np.logical_not(flg_train)

In [8]:
dt_xgb   = xgb.DMatrix(x[flg_train], y[flg_train])
dv_xgb   = xgb.DMatrix(x[flg_valid], y[flg_valid])
dt_lgb   = lgb.Dataset(x[flg_train], y[flg_train])
dv_lgb   = lgb.Dataset(x[flg_valid], y[flg_valid], reference=dt_lgb)
dt_lgb_c = lgb.Dataset(x[flg_train], y[flg_train], free_raw_data=False)
dv_lgb_c = lgb.Dataset(x[flg_valid], y[flg_valid], free_raw_data=False,
                       reference=dt_lgb)

In [9]:
time_s = time.time()
obj_xgb = xgb.train(
    prm_xgb, dt_xgb, num_round,
    [(dt_xgb, 'train'), (dv_xgb, 'valid')])
time_t = time.time()
print(time_t - time_s)

[0]	train-mlogloss:1.9749	valid-mlogloss:1.97786
[1]	train-mlogloss:1.81513	valid-mlogloss:1.82016
[2]	train-mlogloss:1.68753	valid-mlogloss:1.69448
[3]	train-mlogloss:1.58737	valid-mlogloss:1.59634
[4]	train-mlogloss:1.49727	valid-mlogloss:1.50756
[5]	train-mlogloss:1.41833	valid-mlogloss:1.43013
[6]	train-mlogloss:1.34989	valid-mlogloss:1.36321
[7]	train-mlogloss:1.28881	valid-mlogloss:1.30316
[8]	train-mlogloss:1.23412	valid-mlogloss:1.24999
[9]	train-mlogloss:1.1893	valid-mlogloss:1.20681
[10]	train-mlogloss:1.14526	valid-mlogloss:1.1641
[11]	train-mlogloss:1.10539	valid-mlogloss:1.12545
[12]	train-mlogloss:1.06823	valid-mlogloss:1.08933
[13]	train-mlogloss:1.035	valid-mlogloss:1.05767
[14]	train-mlogloss:1.00417	valid-mlogloss:1.02793
[15]	train-mlogloss:0.975752	valid-mlogloss:1.00083
[16]	train-mlogloss:0.949905	valid-mlogloss:0.976116
[17]	train-mlogloss:0.926771	valid-mlogloss:0.954262
[18]	train-mlogloss:0.904333	valid-mlogloss:0.932782
[19]	train-mlogloss:0.884051	valid-mlog

In [10]:
time_s = time.time()
obj_lgb = lgb.train(
    prm_lgb, dt_lgb, num_boost_round=num_round,
    valid_sets=dv_lgb)
time_t = time.time()
print(time_t - time_s)
obj_lgb.save_model('output/lgb.txt')

[1]	valid_0's multi_logloss:1.96479
[2]	valid_0's multi_logloss:1.79892
[3]	valid_0's multi_logloss:1.66818
[4]	valid_0's multi_logloss:1.56021
[5]	valid_0's multi_logloss:1.47257
[6]	valid_0's multi_logloss:1.39444
[7]	valid_0's multi_logloss:1.3271
[8]	valid_0's multi_logloss:1.26567
[9]	valid_0's multi_logloss:1.2118
[10]	valid_0's multi_logloss:1.16331
[11]	valid_0's multi_logloss:1.11929
[12]	valid_0's multi_logloss:1.08051
[13]	valid_0's multi_logloss:1.04474
[14]	valid_0's multi_logloss:1.01212
[15]	valid_0's multi_logloss:0.982286
[16]	valid_0's multi_logloss:0.955192
[17]	valid_0's multi_logloss:0.930746
[18]	valid_0's multi_logloss:0.908582
[19]	valid_0's multi_logloss:0.887337
[20]	valid_0's multi_logloss:0.868239
[21]	valid_0's multi_logloss:0.849997
[22]	valid_0's multi_logloss:0.833093
[23]	valid_0's multi_logloss:0.81763
[24]	valid_0's multi_logloss:0.803032
[25]	valid_0's multi_logloss:0.789616
[26]	valid_0's multi_logloss:0.77642
[27]	valid_0's multi_logloss:0.764128
[

In [11]:
time_s = time.time()
obj_lgb = lgb.train(
    prm_lgb, dt_lgb_c, num_boost_round=num_round,
    valid_sets=dv_lgb_c,
    categorical_feature=list(range(len(x.columns.values))))
time_t = time.time()
print(time_t - time_s)
obj_lgb.save_model('output/lgb_cat.txt')

[1]	valid_0's multi_logloss:2.0095
[2]	valid_0's multi_logloss:1.87231
[3]	valid_0's multi_logloss:1.7606
[4]	valid_0's multi_logloss:1.66921
[5]	valid_0's multi_logloss:1.59493
[6]	valid_0's multi_logloss:1.52725
[7]	valid_0's multi_logloss:1.46893
[8]	valid_0's multi_logloss:1.41559
[9]	valid_0's multi_logloss:1.36839
[10]	valid_0's multi_logloss:1.32496
[11]	valid_0's multi_logloss:1.28681
[12]	valid_0's multi_logloss:1.2513
[13]	valid_0's multi_logloss:1.21869
[14]	valid_0's multi_logloss:1.18943
[15]	valid_0's multi_logloss:1.16157
[16]	valid_0's multi_logloss:1.1372
[17]	valid_0's multi_logloss:1.11444
[18]	valid_0's multi_logloss:1.09381
[19]	valid_0's multi_logloss:1.07351
[20]	valid_0's multi_logloss:1.05467
[21]	valid_0's multi_logloss:1.03711
[22]	valid_0's multi_logloss:1.0203
[23]	valid_0's multi_logloss:1.00493
[24]	valid_0's multi_logloss:0.990066
[25]	valid_0's multi_logloss:0.977071
[26]	valid_0's multi_logloss:0.964303
[27]	valid_0's multi_logloss:0.951664
[28]	valid_