In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn
from sklearn import preprocessing
import time
%matplotlib inline

In [2]:
train = pd.read_csv('input/otto_train.csv')
print(train.shape)

(61878, 95)


In [3]:
def encode_features(dat):
    df = pd.DataFrame(index=dat.index.values)
    for c in dat.columns.values:
        unq = np.unique(dat[c])
        arr = np.zeros(len(df))
        for ii, u in enumerate(unq):
            flg = (dat[c] == u).values
            arr[flg] = ii
        df[c] = arr.astype(int)
    return df

In [4]:
x = encode_features(train.drop(['id', 'target'], axis=1))
y = np.array([int(v.split('_')[1])-1 for v in train.target])
print(x.shape, y.shape)

(61878, 93) (61878,)


In [5]:
num_cls = len(np.unique(y))
print(num_cls)

9


In [6]:
prm_xgb = {
    'booster': 'gbtree',
    'objective': 'multi:softprob',
    'num_class': num_cls,
    'max_depth': 5,
    'learning_rate': 0.1,
    'colsample_bytree': 0.9,
    'subsample': 0.9,
    'eval_metric': 'mlogloss',
}
prm_lgb = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': num_cls,
    'num_leaves' : 2**5-1,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.9,
    'metric': 'multi_logloss',
}
num_round = 10

In [7]:
flg_train = np.random.choice([False, True], len(y), p=[0.3, 0.7])
flg_valid = np.logical_not(flg_train)

In [8]:
dt_xgb   = xgb.DMatrix(x[flg_train], y[flg_train])
dv_xgb   = xgb.DMatrix(x[flg_valid], y[flg_valid])
dt_lgb   = lgb.Dataset(x[flg_train], y[flg_train])
dv_lgb   = lgb.Dataset(x[flg_valid], y[flg_valid], reference=dt_lgb)
dt_lgb_c = lgb.Dataset(x[flg_train], y[flg_train], free_raw_data=False)
dv_lgb_c = lgb.Dataset(x[flg_valid], y[flg_valid], free_raw_data=False,
                       reference=dt_lgb)

In [9]:
time_s = time.time()
obj_xgb = xgb.train(
    prm_xgb, dt_xgb, num_round,
    [(dt_xgb, 'train'), (dv_xgb, 'valid')])
time_t = time.time()
print(time_t - time_s)

[0]	train-mlogloss:1.99177	valid-mlogloss:1.99372
[1]	train-mlogloss:1.82784	valid-mlogloss:1.83223
[2]	train-mlogloss:1.70671	valid-mlogloss:1.71249
[3]	train-mlogloss:1.59923	valid-mlogloss:1.6073
[4]	train-mlogloss:1.50694	valid-mlogloss:1.51626
[5]	train-mlogloss:1.4277	valid-mlogloss:1.43853
[6]	train-mlogloss:1.36171	valid-mlogloss:1.37404
[7]	train-mlogloss:1.29922	valid-mlogloss:1.31287
[8]	train-mlogloss:1.2441	valid-mlogloss:1.25884
[9]	train-mlogloss:1.19562	valid-mlogloss:1.21179
687.2176711559296


In [10]:
time_s = time.time()
obj_lgb = lgb.train(
    prm_lgb, dt_lgb, num_boost_round=num_round,
    valid_sets=dv_lgb)
time_t = time.time()
print(time_t - time_s)
obj_lgb.save_model('output/lgb.txt')

[1]	valid_0's multi_logloss:1.96383
[2]	valid_0's multi_logloss:1.79886
[3]	valid_0's multi_logloss:1.66845
[4]	valid_0's multi_logloss:1.56013
[5]	valid_0's multi_logloss:1.47166
[6]	valid_0's multi_logloss:1.39313
[7]	valid_0's multi_logloss:1.32587
[8]	valid_0's multi_logloss:1.26509
[9]	valid_0's multi_logloss:1.211
[10]	valid_0's multi_logloss:1.16295
1.8976049423217773


In [11]:
time_s = time.time()
obj_lgb = lgb.train(
    prm_lgb, dt_lgb_c, num_boost_round=num_round,
    valid_sets=dv_lgb_c,
    categorical_feature=list(range(len(x.columns.values))))
time_t = time.time()
print(time_t - time_s)
obj_lgb.save_model('output/lgb_cat.txt')

[1]	valid_0's multi_logloss:2.00997
[2]	valid_0's multi_logloss:1.8735
[3]	valid_0's multi_logloss:1.76282
[4]	valid_0's multi_logloss:1.67073
[5]	valid_0's multi_logloss:1.59668
[6]	valid_0's multi_logloss:1.52861
[7]	valid_0's multi_logloss:1.47019
[8]	valid_0's multi_logloss:1.4177
[9]	valid_0's multi_logloss:1.37029
[10]	valid_0's multi_logloss:1.32816
2.2183749675750732
