In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import time
from tqdm import tqdm

from scipy import stats
from sklearn.model_selection import KFold

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn import linear_model


np.random.seed(42)

In [2]:
version='v2'

# Data Load

In [3]:
train = pd.read_excel("../data/train_clst.xlsx")
train = train.drop(['Unnamed: 0'], axis=1)
train

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유,clst
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0,0,3
1,1,2020-06-01 01,8135.640,17.7,2.9,91.0,0.3,0.0,0,0,3
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0,0,3
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0,0,3
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...
122395,60,2020-08-24 19,4114.368,27.8,2.3,68.0,0.0,0.7,1,1,2
122396,60,2020-08-24 20,3975.696,27.3,1.2,71.0,0.0,0.0,1,1,2
122397,60,2020-08-24 21,3572.208,27.3,1.8,71.0,0.0,0.0,1,1,2
122398,60,2020-08-24 22,3299.184,27.1,1.8,74.0,0.0,0.0,1,1,2


In [4]:
test = pd.read_excel("../data/test_clst.xlsx")
test = test.drop(['Unnamed: 0'], axis=1)
test

Unnamed: 0,num,date_time,기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유,clst
0,1,2020-08-25 00,27.8,1.5,74.0,0.0,0.0,,,3
1,1,2020-08-25 01,,,,,,,,3
2,1,2020-08-25 02,,,,,,,,3
3,1,2020-08-25 03,27.3,1.1,78.0,,0.0,,,3
4,1,2020-08-25 04,,,,,,,,3
...,...,...,...,...,...,...,...,...,...,...
10075,60,2020-08-31 19,,,,,,,,2
10076,60,2020-08-31 20,,,,,,,,2
10077,60,2020-08-31 21,27.9,4.1,68.0,,0.0,1.0,1.0,2
10078,60,2020-08-31 22,,,,,,,,2


In [5]:
#train = pd.read_csv("../data/train.csv", encoding='cp949')

In [6]:
#train.describe()

In [7]:
#train.info()

In [8]:
#test = pd.read_csv("./data/test.csv", encoding='cp949')

In [9]:
#test.describe()

In [10]:
#test.info()

In [11]:
submission = pd.read_csv('../submission/sample_submission.csv', encoding='cp949')

In [12]:
submission_cat = submission.copy()
submission_lgbm = submission.copy()
submission_xgb = submission.copy()

# Feature Engineering

In [13]:
#시간 변수와 요일 변수를 추가해봅니다.
def time(x):
    return int(x[-2:])
train['hour'] = train['date_time'].apply(lambda x: time(x))
test['hour'] = test['date_time'].apply(lambda x: time(x))

def weekday(x):
    return pd.to_datetime(x[:10]).weekday()
train['weekday'] = train['date_time'].apply(lambda x :weekday(x))
test['weekday'] = test['date_time'].apply(lambda x :weekday(x))

train['weekend'] = train['weekday'].apply(lambda x : 1 if (x==5 or x==6) else 0)
test['weekend'] = test['weekday'].apply(lambda x : 1 if (x==5 or x==6) else 0)

def month(x):
    return int(x[5:7])
train['month'] = train['date_time'].apply(lambda x :month(x))
test['month'] = test['date_time'].apply(lambda x :month(x))

In [14]:
#건물별로 '비전기냉방설비운영'과 '태양광보유'를 판단해 test set의 결측치를 보간해줍니다
train[['num', '비전기냉방설비운영','태양광보유']]
ice = {}
hot = {}
count = 0
for i in tqdm(range(0, len(train), len(train)//60)):
    count += 1
    ice[count] = train.loc[i,'비전기냉방설비운영']
    hot[count] = train.loc[i,'태양광보유']

100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 7496.30it/s]


In [15]:
for i in tqdm(range(len(test))):
    test.loc[i, '비전기냉방설비운영'] = ice[test['num'][i]]
    test.loc[i, '태양광보유'] = hot[test['num'][i]]

100%|██████████████████████████████████████████████████████████████████████████| 10080/10080 [00:09<00:00, 1051.55it/s]


In [16]:
test = test.interpolate(method='values')

In [17]:
test

Unnamed: 0,num,date_time,기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유,clst,hour,weekday,weekend,month
0,1,2020-08-25 00,27.800000,1.500000,74.000000,0.0,0.000000,0.0,0.0,3,0,1,0,8
1,1,2020-08-25 01,27.633333,1.366667,75.333333,0.0,0.000000,0.0,0.0,3,1,1,0,8
2,1,2020-08-25 02,27.466667,1.233333,76.666667,0.0,0.000000,0.0,0.0,3,2,1,0,8
3,1,2020-08-25 03,27.300000,1.100000,78.000000,0.0,0.000000,0.0,0.0,3,3,1,0,8
4,1,2020-08-25 04,26.900000,1.166667,79.666667,0.0,0.000000,0.0,0.0,3,4,1,0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10075,60,2020-08-31 19,28.633333,3.566667,66.000000,0.0,0.533333,1.0,1.0,2,19,0,0,8
10076,60,2020-08-31 20,28.266667,3.833333,67.000000,0.0,0.266667,1.0,1.0,2,20,0,0,8
10077,60,2020-08-31 21,27.900000,4.100000,68.000000,0.0,0.000000,1.0,1.0,2,21,0,0,8
10078,60,2020-08-31 22,27.900000,4.100000,68.000000,0.0,0.000000,1.0,1.0,2,22,0,0,8


In [18]:
def CDH(xs):
    ys = []
    for i in range(len(xs)):
        if i < 11:
            ys.append(np.sum(xs[:(i+1)]-26))
        else:
            ys.append(np.sum(xs[(i-11):(i+1)]-26))
    return np.array(ys)

train_cdhs = np.array([])
for num in tqdm(range(1, 61)):
    temp = train[train['num'] == num]
    cdh = CDH(temp['기온(°C)'].values)
    train_cdhs = np.concatenate([train_cdhs, cdh])
train['CDH'] = train_cdhs

test_cdhs = np.array([])
for num in tqdm(range(1, 61)):
    temp = test[test['num'] == num]
    cdh = CDH(temp['기온(°C)'].values)
    test_cdhs = np.concatenate([test_cdhs, cdh])
test['CDH'] = test_cdhs

100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 65.68it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 516.94it/s]


In [19]:
train['THI'] = 9/5*train['기온(°C)'] - 0.55*(1-train['습도(%)']/100)*(9/5*train['기온(°C)']-26)+32
train['THI_cat'] = pd.cut(train['THI'], bins = [0, 68, 75, 80, 200], labels = [1,2,3,4])

test['THI'] = 9/5*test['기온(°C)'] - 0.55*(1-test['습도(%)']/100)*(9/5*test['기온(°C)']-26)+32
test['THI_cat'] = pd.cut(test['THI'], bins = [0, 68, 75, 80, 200], labels = [1,2,3,4])

In [20]:
train['태양광'] =  train['일조(hr)'] * train['태양광보유'] * train['강수량(mm)'].map(lambda x : np.exp(-x)) * train['풍속(m/s)'] / train['습도(%)'] * 100
test['태양광'] =  test['일조(hr, 3시간)'] * test['태양광보유'] * test['강수량(mm, 6시간)'].map(lambda x : np.exp(-x)) * test['풍속(m/s)'] / test['습도(%)'] * 100

In [21]:
train

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유,clst,hour,weekday,weekend,month,CDH,THI,THI_cat,태양광
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0,0,3,0,0,0,6,-8.4,63.43008,1,0.000000
1,1,2020-06-01 01,8135.640,17.7,2.9,91.0,0.3,0.0,0,0,3,1,0,0,6,-16.7,63.56993,1,0.000000
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0,0,3,2,0,0,6,-25.2,63.22775,1,0.000000
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0,0,3,3,0,0,6,-34.1,62.54339,1,0.000000
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0,0,3,4,0,0,6,-43.1,62.39760,1,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122395,60,2020-08-24 19,4114.368,27.8,2.3,68.0,0.0,0.7,1,1,2,19,0,0,8,30.8,77.80896,3,2.367647
122396,60,2020-08-24 20,3975.696,27.3,1.2,71.0,0.0,0.0,1,1,2,20,0,0,8,32.3,77.44917,3,0.000000
122397,60,2020-08-24 21,3572.208,27.3,1.8,71.0,0.0,0.0,1,1,2,21,0,0,8,32.5,77.44917,3,0.000000
122398,60,2020-08-24 22,3299.184,27.1,1.8,74.0,0.0,0.0,1,1,2,22,0,0,8,31.3,77.52246,3,0.000000


In [22]:
train.describe()

Unnamed: 0,num,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유,clst,hour,weekday,weekend,month,CDH,THI,태양광
count,122400.0,122400.0,122400.0,122400.0,122400.0,122400.0,122400.0,122400.0,122400.0,122400.0,122400.0,122400.0,122400.0,122400.0,122400.0,122400.0,122400.0
mean,30.5,2324.830866,24.251713,2.151641,80.169848,0.514989,0.213533,0.683333,0.483333,1.95,11.5,2.964706,0.282353,6.929412,-21.071605,73.461061,0.418172
std,17.318173,2058.999326,3.407902,1.514475,15.525862,2.624505,0.370517,0.465178,0.499724,0.82513,6.922215,2.014351,0.450146,0.793925,34.111471,4.77833,1.377256
min,1.0,0.0,11.1,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,-151.3,52.34421,0.0
25%,15.75,1055.268,21.8,1.1,70.0,0.0,0.0,0.0,0.0,1.0,5.75,1.0,0.0,6.0,-45.3,70.03596,0.0
50%,30.5,1700.352,24.2,1.9,84.0,0.0,0.0,1.0,0.0,2.0,11.5,3.0,0.0,7.0,-20.8,73.61585,0.0
75%,45.25,2780.487,26.5,2.9,93.0,0.0,0.3,1.0,1.0,3.0,17.25,5.0,1.0,8.0,2.8,77.07451,0.0
max,60.0,17739.225,36.3,20.1,100.0,81.5,1.0,1.0,1.0,3.0,23.0,6.0,1.0,8.0,99.2,86.66179,20.0


In [23]:
test

Unnamed: 0,num,date_time,기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유,clst,hour,weekday,weekend,month,CDH,THI,THI_cat,태양광
0,1,2020-08-25 00,27.800000,1.500000,74.000000,0.0,0.000000,0.0,0.0,3,0,1,0,8,1.800000,78.602280,3,0.000000
1,1,2020-08-25 01,27.633333,1.366667,75.333333,0.0,0.000000,0.0,0.0,3,1,1,0,8,3.433333,78.519273,3,0.000000
2,1,2020-08-25 02,27.466667,1.233333,76.666667,0.0,0.000000,0.0,0.0,3,2,1,0,8,4.900000,78.431867,3,0.000000
3,1,2020-08-25 03,27.300000,1.100000,78.000000,0.0,0.000000,0.0,0.0,3,3,1,0,8,6.200000,78.340060,3,0.000000
4,1,2020-08-25 04,26.900000,1.166667,79.666667,0.0,0.000000,0.0,0.0,3,4,1,0,8,7.100000,77.912697,3,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10075,60,2020-08-31 19,28.633333,3.566667,66.000000,0.0,0.533333,1.0,1.0,2,19,0,0,8,22.400000,78.764020,3,2.882155
10076,60,2020-08-31 20,28.266667,3.833333,67.000000,0.0,0.266667,1.0,1.0,2,20,0,0,8,25.400000,78.364280,3,1.525705
10077,60,2020-08-31 21,27.900000,4.100000,68.000000,0.0,0.000000,1.0,1.0,2,21,0,0,8,27.700000,77.957280,3,0.000000
10078,60,2020-08-31 22,27.900000,4.100000,68.000000,0.0,0.000000,1.0,1.0,2,22,0,0,8,29.300000,77.957280,3,0.000000


In [24]:
test.describe()

Unnamed: 0,num,기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유,clst,hour,weekday,weekend,month,CDH,THI,태양광
count,10080.0,10080.0,10080.0,10080.0,10080.0,10080.0,10080.0,10080.0,10080.0,10080.0,10080.0,10080.0,10080.0,10080.0,10080.0,10080.0
mean,30.5,27.805813,2.440585,81.9625,2.191845,0.612917,0.683333,0.483333,1.95,11.5,3.0,0.285714,8.0,21.134927,79.447959,1.049931
std,17.318961,2.275416,1.785098,11.467641,5.704649,0.867991,0.465199,0.499747,0.825167,6.92253,2.000099,0.451776,0.0,21.159186,2.411122,3.267854
min,1.0,22.1,0.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,-28.6,71.78,0.0
25%,15.75,26.133333,1.2,74.666667,0.0,0.0,0.0,0.0,1.0,5.75,1.0,0.0,8.0,5.3,77.881755,0.0
50%,30.5,27.466667,2.1,83.0,0.0,0.1,1.0,0.0,2.0,11.5,3.0,0.0,8.0,17.516667,79.21422,0.0
75%,45.25,29.1,3.175,91.0,1.5,1.033333,1.0,1.0,3.0,17.25,5.0,1.0,8.0,33.45,81.01962,0.00023
max,60.0,35.4,22.5,100.0,83.5,3.0,1.0,1.0,3.0,23.0,6.0,1.0,8.0,88.6,85.5519,30.285714


# Feature Selection

In [25]:
train_x = train.drop(['date_time'], axis=1)

In [26]:
train_x = train_x.rename({'전력사용량(kWh)':'feature_1', '기온(°C)': 'feature_2', '풍속(m/s)':'feature_3', '습도(%)':'feature_4', '강수량(mm)':'feature_5', '일조(hr)':'feature_6', '비전기냉방설비운영':'feature_7', '태양광보유':'feature_8', 'hour':'feature_9', 'weekday':'feature_10', 'weekend':'feature_11', 'month':'feature_12', 'CDH':'feature_13', 'THI':'feature_14', 'THI_cat':'feature_15', '태양광':'feature_16', 'clst':'feature_17', 'num':'feature_18'}, axis='columns')

In [27]:
train_x

Unnamed: 0,feature_18,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_17,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16
0,1,8179.056,17.6,2.5,92.0,0.8,0.0,0,0,3,0,0,0,6,-8.4,63.43008,1,0.000000
1,1,8135.640,17.7,2.9,91.0,0.3,0.0,0,0,3,1,0,0,6,-16.7,63.56993,1,0.000000
2,1,8107.128,17.5,3.2,91.0,0.0,0.0,0,0,3,2,0,0,6,-25.2,63.22775,1,0.000000
3,1,8048.808,17.1,3.2,91.0,0.0,0.0,0,0,3,3,0,0,6,-34.1,62.54339,1,0.000000
4,1,8043.624,17.0,3.3,92.0,0.0,0.0,0,0,3,4,0,0,6,-43.1,62.39760,1,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122395,60,4114.368,27.8,2.3,68.0,0.0,0.7,1,1,2,19,0,0,8,30.8,77.80896,3,2.367647
122396,60,3975.696,27.3,1.2,71.0,0.0,0.0,1,1,2,20,0,0,8,32.3,77.44917,3,0.000000
122397,60,3572.208,27.3,1.8,71.0,0.0,0.0,1,1,2,21,0,0,8,32.5,77.44917,3,0.000000
122398,60,3299.184,27.1,1.8,74.0,0.0,0.0,1,1,2,22,0,0,8,31.3,77.52246,3,0.000000


In [28]:
train_x['feature_15'] = train_x['feature_15'].astype(int)

In [29]:
test_x = test.drop(['date_time'], axis=1)

In [30]:
test_x = test_x.rename({'기온(°C)': 'feature_2', '풍속(m/s)':'feature_3', '습도(%)':'feature_4', '강수량(mm, 6시간)':'feature_5', '일조(hr, 3시간)':'feature_6', '비전기냉방설비운영':'feature_7', '태양광보유':'feature_8', 'hour':'feature_9', 'weekday':'feature_10', 'weekend':'feature_11', 'month':'feature_12', 'CDH':'feature_13', 'THI':'feature_14', 'THI_cat':'feature_15', '태양광':'feature_16', 'clst':'feature_17', 'num':'feature_18'}, axis='columns')

In [31]:
test_x

Unnamed: 0,feature_18,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_17,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16
0,1,27.800000,1.500000,74.000000,0.0,0.000000,0.0,0.0,3,0,1,0,8,1.800000,78.602280,3,0.000000
1,1,27.633333,1.366667,75.333333,0.0,0.000000,0.0,0.0,3,1,1,0,8,3.433333,78.519273,3,0.000000
2,1,27.466667,1.233333,76.666667,0.0,0.000000,0.0,0.0,3,2,1,0,8,4.900000,78.431867,3,0.000000
3,1,27.300000,1.100000,78.000000,0.0,0.000000,0.0,0.0,3,3,1,0,8,6.200000,78.340060,3,0.000000
4,1,26.900000,1.166667,79.666667,0.0,0.000000,0.0,0.0,3,4,1,0,8,7.100000,77.912697,3,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10075,60,28.633333,3.566667,66.000000,0.0,0.533333,1.0,1.0,2,19,0,0,8,22.400000,78.764020,3,2.882155
10076,60,28.266667,3.833333,67.000000,0.0,0.266667,1.0,1.0,2,20,0,0,8,25.400000,78.364280,3,1.525705
10077,60,27.900000,4.100000,68.000000,0.0,0.000000,1.0,1.0,2,21,0,0,8,27.700000,77.957280,3,0.000000
10078,60,27.900000,4.100000,68.000000,0.0,0.000000,1.0,1.0,2,22,0,0,8,29.300000,77.957280,3,0.000000


In [32]:
test_x['feature_15'] = test_x['feature_15'].astype(int)

# Dataset

In [33]:
x_train = train_x.drop(['feature_1'], axis = 1)
y_train = pd.DataFrame(train_x['feature_1'])

print(x_train.shape)
print(y_train.shape)

(122400, 17)
(122400, 1)


In [34]:
test_x['index'] = np.arange(len(test_x))

print(test_x.shape)

(10080, 18)


In [35]:
x_test = test_x.drop(['index'], axis = 1)

print(x_test.shape)

(10080, 17)


# Model

## XGBoost

In [36]:
folds = KFold(n_splits = 5, shuffle = True, random_state = 42)
oof_preds_xgb = np.zeros(train_x.shape[0])

In [37]:
print("xgboost")

sub_preds_xgb = np.zeros(test_x.shape[0])

for train_idx, valid_idx in tqdm(folds.split(x_train, y_train)):
    trn_x = x_train.iloc[train_idx, :]
    trn_y = y_train.iloc[train_idx, :]
    val_x = x_train.iloc[valid_idx, :]
    val_y = y_train.iloc[valid_idx, :]
    
    params = {'objective': 'reg:linear', 'eval_metric': 'rmse', 'eta': 0.01, 'max_depth': 10,
              'subsample': 0.6, 'colsample_bytree': 0.6, 'alpha':0.001, 'random_state': 42, 'silent': True}
    
    tr_data = xgb.DMatrix(trn_x, trn_y)
    va_data = xgb.DMatrix(val_x, val_y)
    
    watchlist = [(tr_data, 'train'), (va_data, 'valid')]
    
    model_xgb = xgb.train(params, tr_data, 5000, watchlist, maximize=False, early_stopping_rounds = 30, verbose_eval=100)
    
    test_data = xgb.DMatrix(x_test)
    
    oof_preds_xgb[valid_idx] = model_xgb.predict(va_data)
    sub_preds_xgb += model_xgb.predict(test_data) / folds.n_splits
    

submission_xgb['answer'] = sub_preds_xgb
submission_xgb.to_csv('../submission/xgb_{}.csv'.format(version), index = False)

0it [00:00, ?it/s]

xgboost
Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:3087.02466	valid-rmse:3084.08960
[100]	train-rmse:1398.12903	valid-rmse:1409.56641
[200]	train-rmse:742.85358	valid-rmse:766.05042
[300]	train-rmse:465.77506	valid-rmse:496.52387
[400]	train-rmse:337.47287	valid-rmse:373.13550
[500]	train-rmse:275.82675	valid-rmse:315.66043
[600]	train-rmse:239.39679	valid-rmse:283.59607
[700]	train-rmse:217.04985	valid-rmse:265.05039
[800]	train-rmse:199.85179	valid-rmse:251.69089
[900]	train-rmse:187.19598	valid-rmse:242.63916
[1000]	train-rmse:177.13902	valid-rmse:236.34459
[1100]	train-rmse:168.58247	valid-rmse:231.40202
[1200]	train-rmse:161.57620	valid-rmse:227.56120
[1300]	train-rmse:154.79654	valid-rmse:223.84541
[1400]	train-rmse:149.67546	va

1it [04:09, 249.05s/it]

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:3086.34155	valid-rmse:3086.75439
[100]	train-rmse:1400.98791	valid-rmse:1415.72815
[200]	train-rmse:744.66540	valid-rmse:770.68158
[300]	train-rmse:465.95898	valid-rmse:501.18173
[400]	train-rmse:339.02600	valid-rmse:381.99200
[500]	train-rmse:276.16058	valid-rmse:325.58826
[600]	train-rmse:240.04349	valid-rmse:294.64941
[700]	train-rmse:217.68242	valid-rmse:277.15390
[800]	train-rmse:200.70659	valid-rmse:264.80054
[900]	train-rmse:187.59163	valid-rmse:256.21060
[1000]	train-rmse:177.48093	valid-rmse:250.07884
[1100]	train-rmse:169.19685	valid-rmse:245.26209
[1200]	train-rmse:162.54492	valid-rmse:241.62874
[1300]	train-rmse:156.55316	valid-rmse:238.56476
[1400]	train-rmse:151.80089	valid-rmse

2it [08:16, 248.56s/it]

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:3094.93066	valid-rmse:3051.97998
[100]	train-rmse:1408.46167	valid-rmse:1394.13000
[200]	train-rmse:750.96332	valid-rmse:756.33173
[300]	train-rmse:471.69217	valid-rmse:490.73120
[400]	train-rmse:342.36212	valid-rmse:369.39444
[500]	train-rmse:279.18448	valid-rmse:311.79010
[600]	train-rmse:243.29248	valid-rmse:280.45532
[700]	train-rmse:220.34314	valid-rmse:261.79709
[800]	train-rmse:202.81070	valid-rmse:248.66025
[900]	train-rmse:189.31808	valid-rmse:239.73416
[1000]	train-rmse:178.92219	valid-rmse:233.35172
[1100]	train-rmse:170.32817	valid-rmse:228.34731
[1200]	train-rmse:163.17438	valid-rmse:224.33322
[1300]	train-rmse:156.55839	valid-rmse:220.85088
[1400]	train-rmse:151.57294	valid-rmse

3it [12:13, 245.24s/it]

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:3084.34619	valid-rmse:3094.82788
[100]	train-rmse:1401.36706	valid-rmse:1423.55920
[200]	train-rmse:747.19812	valid-rmse:780.17883
[300]	train-rmse:468.49033	valid-rmse:506.21494
[400]	train-rmse:341.28986	valid-rmse:380.00299
[500]	train-rmse:279.08728	valid-rmse:319.15476
[600]	train-rmse:242.95281	valid-rmse:284.72937
[700]	train-rmse:220.89696	valid-rmse:265.67227
[800]	train-rmse:204.33491	valid-rmse:252.19684
[900]	train-rmse:191.45946	valid-rmse:242.64726
[1000]	train-rmse:181.03140	valid-rmse:235.40469
[1100]	train-rmse:172.78848	valid-rmse:230.33438
[1200]	train-rmse:165.49742	valid-rmse:225.76930
[1300]	train-rmse:158.83676	valid-rmse:221.92722
[1400]	train-rmse:153.03734	valid-rmse

4it [16:16, 244.42s/it]

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:3079.12329	valid-rmse:3115.06226
[100]	train-rmse:1396.71777	valid-rmse:1422.45850
[200]	train-rmse:743.05756	valid-rmse:771.23016
[300]	train-rmse:466.68543	valid-rmse:501.46115
[400]	train-rmse:338.78091	valid-rmse:381.09042
[500]	train-rmse:277.22784	valid-rmse:326.13828
[600]	train-rmse:240.80243	valid-rmse:295.59738
[700]	train-rmse:217.94026	valid-rmse:278.05316
[800]	train-rmse:200.86760	valid-rmse:265.87042
[900]	train-rmse:187.62617	valid-rmse:257.37170
[1000]	train-rmse:177.02718	valid-rmse:251.07436
[1100]	train-rmse:168.93530	valid-rmse:246.80009
[1200]	train-rmse:161.84215	valid-rmse:243.21863
[1300]	train-rmse:155.72095	valid-rmse:240.08084
[1400]	train-rmse:150.52196	valid-rmse

5it [21:15, 255.09s/it]


## Catboost

In [38]:
folds = KFold(n_splits = 5, shuffle = True, random_state = 42)
oof_preds_catboost = np.zeros(train_x.shape[0])

In [39]:
print("using catboost")

sub_preds_catboost = np.zeros(test_x.shape[0])

for train_idx, valid_idx in tqdm(folds.split(x_train, y_train)):
    trn_x = x_train.iloc[train_idx, :]
    trn_y = y_train.iloc[train_idx, :]
    val_x = x_train.iloc[valid_idx, :]
    val_y = y_train.iloc[valid_idx, :]
    
    cb_model = CatBoostRegressor(iterations=5000, learning_rate=0.05, depth=11, l2_leaf_reg=20,
                                 bootstrap_type='Bernoulli', subsample=0.6, eval_metric='RMSE',
                                 metric_period=50, od_type='Iter', od_wait=20, random_seed=42,
                                 allow_writing_files=False)
    
    cb_model.fit(trn_x, trn_y, eval_set=(val_x, val_y), cat_features=[], use_best_model=True, verbose=True)
    
    oof_preds_catboost[valid_idx] = cb_model.predict(val_x)
    sub_preds_catboost += cb_model.predict(x_test) / folds.n_splits
    

submission_cat['answer'] = sub_preds_catboost
submission_cat.to_csv('../submission/catboost_{}.csv'.format(version), index = False)

0it [00:00, ?it/s]

using catboost




0:	learn: 1991.8882557	test: 1999.1310713	best: 1999.1310713 (0)	total: 216ms	remaining: 18m 1s
50:	learn: 786.5660202	test: 780.4456691	best: 780.4456691 (50)	total: 5.08s	remaining: 8m 12s
100:	learn: 464.9470902	test: 459.6201188	best: 459.6201188 (100)	total: 11.8s	remaining: 9m 33s
150:	learn: 367.2025676	test: 363.3838771	best: 363.3838771 (150)	total: 18.8s	remaining: 10m 2s
200:	learn: 323.0813022	test: 319.9343488	best: 319.9343488 (200)	total: 25.8s	remaining: 10m 15s
250:	learn: 293.9530227	test: 291.8468672	best: 291.8468672 (250)	total: 32.7s	remaining: 10m 19s
300:	learn: 273.5993690	test: 272.9019346	best: 272.9019346 (300)	total: 40s	remaining: 10m 24s
350:	learn: 259.3861676	test: 260.3097272	best: 260.3097272 (350)	total: 47.1s	remaining: 10m 23s
400:	learn: 248.0536983	test: 250.4791009	best: 250.4791009 (400)	total: 54s	remaining: 10m 19s
450:	learn: 239.1391356	test: 243.9664572	best: 243.9664572 (450)	total: 1m 1s	remaining: 10m 17s
500:	learn: 231.4171764	test: 2

4150:	learn: 115.8688664	test: 179.4404473	best: 179.4404473 (4150)	total: 9m 51s	remaining: 2m
4200:	learn: 115.2219247	test: 179.1942269	best: 179.1942269 (4200)	total: 9m 58s	remaining: 1m 53s
4250:	learn: 114.6566052	test: 179.0522926	best: 179.0522926 (4250)	total: 10m 5s	remaining: 1m 46s
4300:	learn: 114.0572784	test: 178.8742179	best: 178.8742179 (4300)	total: 10m 13s	remaining: 1m 39s
4350:	learn: 113.4877962	test: 178.6937603	best: 178.6936291 (4349)	total: 10m 20s	remaining: 1m 32s
4400:	learn: 112.9038526	test: 178.5008846	best: 178.5001210 (4398)	total: 10m 27s	remaining: 1m 25s
4450:	learn: 112.2883880	test: 178.2678383	best: 178.2678383 (4450)	total: 10m 34s	remaining: 1m 18s
4500:	learn: 111.7279388	test: 178.0772136	best: 178.0772136 (4500)	total: 10m 42s	remaining: 1m 11s
4550:	learn: 111.2138729	test: 177.9223066	best: 177.9223066 (4550)	total: 10m 49s	remaining: 1m 4s
4600:	learn: 110.6452001	test: 177.7427368	best: 177.7427368 (4600)	total: 10m 56s	remaining: 56.9s



0:	learn: 1991.9988707	test: 1992.1013288	best: 1992.1013288 (0)	total: 30.3ms	remaining: 2m 31s
50:	learn: 815.7599316	test: 824.5396214	best: 824.5396214 (50)	total: 4.81s	remaining: 7m 47s
100:	learn: 463.0937382	test: 475.9553169	best: 475.9553169 (100)	total: 11.4s	remaining: 9m 13s
150:	learn: 360.6799718	test: 376.5914982	best: 376.5914982 (150)	total: 18.4s	remaining: 9m 50s
200:	learn: 313.9607152	test: 333.3152108	best: 333.3152108 (200)	total: 25.4s	remaining: 10m 5s
250:	learn: 286.9853147	test: 308.2980693	best: 308.2980693 (250)	total: 32.3s	remaining: 10m 11s
300:	learn: 269.1704519	test: 292.3725140	best: 292.3725140 (300)	total: 39.2s	remaining: 10m 12s
350:	learn: 257.8416824	test: 282.0821243	best: 282.0821243 (350)	total: 46.1s	remaining: 10m 10s
400:	learn: 247.6165208	test: 272.8379791	best: 272.8379791 (400)	total: 53.3s	remaining: 10m 10s
450:	learn: 239.1161232	test: 265.3852234	best: 265.3852234 (450)	total: 1m	remaining: 10m 9s
500:	learn: 232.3098975	test: 2

4150:	learn: 115.9570288	test: 188.2294073	best: 188.2270579 (4148)	total: 9m 48s	remaining: 2m
4200:	learn: 115.2842699	test: 187.9464541	best: 187.9464541 (4200)	total: 9m 55s	remaining: 1m 53s
4250:	learn: 114.6567365	test: 187.7321106	best: 187.7321106 (4250)	total: 10m 2s	remaining: 1m 46s
4300:	learn: 114.1091714	test: 187.5361378	best: 187.5361017 (4299)	total: 10m 9s	remaining: 1m 39s
4350:	learn: 113.4942226	test: 187.3235546	best: 187.3235546 (4350)	total: 10m 17s	remaining: 1m 32s
4400:	learn: 112.8890162	test: 187.1218620	best: 187.1218620 (4400)	total: 10m 24s	remaining: 1m 24s
4450:	learn: 112.2607810	test: 186.9569288	best: 186.9569288 (4450)	total: 10m 31s	remaining: 1m 17s
4500:	learn: 111.6745531	test: 186.7789380	best: 186.7789380 (4500)	total: 10m 38s	remaining: 1m 10s
4550:	learn: 111.0682710	test: 186.5596582	best: 186.5596582 (4550)	total: 10m 45s	remaining: 1m 3s
4600:	learn: 110.3798668	test: 186.3402613	best: 186.3402613 (4600)	total: 10m 52s	remaining: 56.6s




0:	learn: 2008.4967485	test: 1966.6033618	best: 1966.6033618 (0)	total: 154ms	remaining: 12m 50s
50:	learn: 818.2485637	test: 811.2776013	best: 811.2776013 (50)	total: 4.85s	remaining: 7m 50s
100:	learn: 466.7453968	test: 459.8286643	best: 459.8286643 (100)	total: 11.8s	remaining: 9m 31s
150:	learn: 362.8667418	test: 357.1524573	best: 357.1524573 (150)	total: 18.5s	remaining: 9m 55s
200:	learn: 317.0886837	test: 312.8970125	best: 312.8970125 (200)	total: 25.4s	remaining: 10m 6s
250:	learn: 287.7079231	test: 285.6766537	best: 285.6766537 (250)	total: 32.3s	remaining: 10m 11s
300:	learn: 269.3115894	test: 270.0379213	best: 270.0379213 (300)	total: 39.1s	remaining: 10m 10s
350:	learn: 257.0020871	test: 260.1915397	best: 260.1915397 (350)	total: 46.1s	remaining: 10m 10s
400:	learn: 246.5270422	test: 251.1996136	best: 251.1996136 (400)	total: 53.1s	remaining: 10m 9s
450:	learn: 237.8073519	test: 244.8658539	best: 244.8658539 (450)	total: 60s	remaining: 10m 4s
500:	learn: 231.4058975	test: 2

4150:	learn: 113.2576350	test: 179.5146736	best: 179.5145218 (4149)	total: 9m 50s	remaining: 2m
4200:	learn: 112.5975275	test: 179.2774954	best: 179.2755223 (4198)	total: 9m 57s	remaining: 1m 53s
4250:	learn: 111.9764859	test: 179.1157416	best: 179.1142489 (4248)	total: 10m 5s	remaining: 1m 46s
4300:	learn: 111.3000677	test: 178.9024353	best: 178.9024353 (4300)	total: 10m 12s	remaining: 1m 39s
4350:	learn: 110.6285149	test: 178.6924617	best: 178.6924617 (4350)	total: 10m 19s	remaining: 1m 32s
4400:	learn: 110.0266894	test: 178.4801994	best: 178.4801994 (4400)	total: 10m 26s	remaining: 1m 25s
4450:	learn: 109.4319512	test: 178.2738039	best: 178.2738039 (4450)	total: 10m 34s	remaining: 1m 18s
4500:	learn: 108.8933374	test: 178.1225389	best: 178.1225389 (4500)	total: 10m 41s	remaining: 1m 11s
4550:	learn: 108.2968318	test: 177.9299185	best: 177.9265908 (4549)	total: 10m 48s	remaining: 1m 3s
4600:	learn: 107.7495856	test: 177.7498368	best: 177.7453170 (4599)	total: 10m 55s	remaining: 56.8s



0:	learn: 1990.7830203	test: 2003.0931880	best: 2003.0931880 (0)	total: 146ms	remaining: 12m 8s
50:	learn: 813.0487083	test: 825.2621291	best: 825.2621291 (50)	total: 4.87s	remaining: 7m 52s
100:	learn: 472.6104316	test: 477.3121454	best: 477.3121454 (100)	total: 11.4s	remaining: 9m 12s
150:	learn: 369.0865588	test: 371.4445146	best: 371.4445146 (150)	total: 18.4s	remaining: 9m 49s
200:	learn: 320.7982940	test: 323.2975099	best: 323.2975099 (200)	total: 25.3s	remaining: 10m 5s
250:	learn: 292.6552527	test: 295.5553290	best: 295.5553290 (250)	total: 32.5s	remaining: 10m 15s
300:	learn: 271.8102702	test: 274.9626760	best: 274.9626760 (300)	total: 39.6s	remaining: 10m 18s
350:	learn: 257.4063546	test: 261.3069789	best: 261.3069789 (350)	total: 46.7s	remaining: 10m 18s
400:	learn: 246.2830896	test: 251.4488248	best: 251.4488248 (400)	total: 53.9s	remaining: 10m 18s
450:	learn: 237.2139566	test: 243.4924207	best: 243.4924207 (450)	total: 1m 1s	remaining: 10m 16s
500:	learn: 229.3398148	test

4150:	learn: 114.8002594	test: 171.7889142	best: 171.7889142 (4150)	total: 9m 53s	remaining: 2m 1s
4200:	learn: 114.2268426	test: 171.6146704	best: 171.6146704 (4200)	total: 10m	remaining: 1m 54s
4250:	learn: 113.6369401	test: 171.4331950	best: 171.4331950 (4250)	total: 10m 8s	remaining: 1m 47s
4300:	learn: 113.0578098	test: 171.2812392	best: 171.2812392 (4300)	total: 10m 15s	remaining: 1m 39s
4350:	learn: 112.4458428	test: 171.1016489	best: 171.1016489 (4350)	total: 10m 22s	remaining: 1m 32s
4400:	learn: 111.7573773	test: 170.8568056	best: 170.8568056 (4400)	total: 10m 29s	remaining: 1m 25s
4450:	learn: 111.0972232	test: 170.6358324	best: 170.6358324 (4450)	total: 10m 37s	remaining: 1m 18s
4500:	learn: 110.4924898	test: 170.4586054	best: 170.4586054 (4500)	total: 10m 44s	remaining: 1m 11s
4550:	learn: 109.9261554	test: 170.3142026	best: 170.3142026 (4550)	total: 10m 51s	remaining: 1m 4s
4600:	learn: 109.2495247	test: 170.0738291	best: 170.0738291 (4600)	total: 10m 58s	remaining: 57.1s



0:	learn: 1988.3399329	test: 2012.2179081	best: 2012.2179081 (0)	total: 145ms	remaining: 12m 4s
50:	learn: 794.0009399	test: 794.9028952	best: 794.9028952 (50)	total: 4.74s	remaining: 7m 40s
100:	learn: 464.5971884	test: 470.0854584	best: 470.0854584 (100)	total: 11.4s	remaining: 9m 12s
150:	learn: 359.2810714	test: 369.4895419	best: 369.4895419 (150)	total: 18.4s	remaining: 9m 49s
200:	learn: 316.8440620	test: 330.6369771	best: 330.6369771 (200)	total: 25.2s	remaining: 10m 2s
250:	learn: 288.5750404	test: 305.5242224	best: 305.5242224 (250)	total: 32.4s	remaining: 10m 13s
300:	learn: 269.6650441	test: 288.7862388	best: 288.7862388 (300)	total: 39.4s	remaining: 10m 14s
350:	learn: 257.6163536	test: 278.5892474	best: 278.5892474 (350)	total: 46.3s	remaining: 10m 13s
400:	learn: 246.5672812	test: 269.6288008	best: 269.6288008 (400)	total: 53.5s	remaining: 10m 14s
450:	learn: 238.5374039	test: 262.6868909	best: 262.6868909 (450)	total: 1m	remaining: 10m 12s
500:	learn: 231.8229464	test: 2

4150:	learn: 115.3089065	test: 193.2557071	best: 193.2557071 (4150)	total: 9m 50s	remaining: 2m
4200:	learn: 114.7300669	test: 193.0708077	best: 193.0707313 (4199)	total: 9m 57s	remaining: 1m 53s
4250:	learn: 114.0186851	test: 192.8727510	best: 192.8727510 (4250)	total: 10m 4s	remaining: 1m 46s
4300:	learn: 113.4631995	test: 192.7056206	best: 192.7056206 (4300)	total: 10m 12s	remaining: 1m 39s
4350:	learn: 112.9074867	test: 192.4914728	best: 192.4914728 (4350)	total: 10m 19s	remaining: 1m 32s
4400:	learn: 112.3714951	test: 192.2899571	best: 192.2899571 (4400)	total: 10m 26s	remaining: 1m 25s
4450:	learn: 111.8426441	test: 192.1280105	best: 192.1280105 (4450)	total: 10m 33s	remaining: 1m 18s
4500:	learn: 111.1767224	test: 191.8849793	best: 191.8849793 (4500)	total: 10m 40s	remaining: 1m 11s
4550:	learn: 110.5897117	test: 191.6811948	best: 191.6811948 (4550)	total: 10m 48s	remaining: 1m 3s
4600:	learn: 109.9935523	test: 191.5161745	best: 191.5161745 (4600)	total: 10m 55s	remaining: 56.8s

5it [59:48, 717.72s/it]


## LGBM

In [40]:
folds = KFold(n_splits = 5, shuffle = True, random_state = 42)
oof_preds_lgm = np.zeros(train_x.shape[0])

In [41]:
print("lightgbm")

sub_preds_lgm = np.zeros(test_x.shape[0])

for train_idx, valid_idx in tqdm(folds.split(x_train, y_train)):
    trn_x = x_train.iloc[train_idx, :]
    trn_y = y_train.iloc[train_idx, :]
    val_x = x_train.iloc[valid_idx, :]
    val_y = y_train.iloc[valid_idx, :]
    
    params = {'learning_rate': 0.005, 'max_depth': 16, 'boosting': 'dart', 'objective': 'regression',
              'metric': 'rmse', 'is_training_metric': True, 'num_leaves': 144, 'feature_fraction': 0.9,
              'bagging_fraction': 0.7, 'bagging_freq': 6, 'seed':42}
    
    train_T = lgb.Dataset(trn_x, label=trn_y)
    val_T = lgb.Dataset(val_x, label=val_y)
    
    model1 = lgb.train(params, train_T, 5000, val_T, verbose_eval=100, early_stopping_rounds=30)
    
    oof_preds_lgm[valid_idx] = model1.predict(val_x)
    sub_preds_lgm += model1.predict(x_test) / folds.n_splits
    

submission_lgbm['answer'] = sub_preds_lgm
submission_lgbm.to_csv('../submission/lgbm_{}.csv'.format(version), index = False)

0it [00:00, ?it/s]

lightgbm
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1553
[LightGBM] [Info] Number of data points in the train set: 97920, number of used features: 17
[LightGBM] [Info] Start training from score 2327.154754




[100]	valid_0's rmse: 2122.71
[200]	valid_0's rmse: 2171.96
[300]	valid_0's rmse: 1966.69
[400]	valid_0's rmse: 1866.9
[500]	valid_0's rmse: 1720.83
[600]	valid_0's rmse: 1620.83
[700]	valid_0's rmse: 1415.09
[800]	valid_0's rmse: 1239.16
[900]	valid_0's rmse: 1094.77
[1000]	valid_0's rmse: 1039.72
[1100]	valid_0's rmse: 914.78
[1200]	valid_0's rmse: 902.126
[1300]	valid_0's rmse: 863.107
[1400]	valid_0's rmse: 792.114
[1500]	valid_0's rmse: 770.399
[1600]	valid_0's rmse: 670.548
[1700]	valid_0's rmse: 638.855
[1800]	valid_0's rmse: 638.666
[1900]	valid_0's rmse: 605.353
[2000]	valid_0's rmse: 561.828
[2100]	valid_0's rmse: 555.896
[2200]	valid_0's rmse: 550.916
[2300]	valid_0's rmse: 515.479
[2400]	valid_0's rmse: 511.393
[2500]	valid_0's rmse: 500.674
[2600]	valid_0's rmse: 481.796
[2700]	valid_0's rmse: 457.883
[2800]	valid_0's rmse: 463.827
[2900]	valid_0's rmse: 438.389
[3000]	valid_0's rmse: 414.933
[3100]	valid_0's rmse: 406.068
[3200]	valid_0's rmse: 394.084
[3300]	valid_0's rm

1it [03:36, 216.29s/it]

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1554
[LightGBM] [Info] Number of data points in the train set: 97920, number of used features: 17
[LightGBM] [Info] Start training from score 2324.635440
[100]	valid_0's rmse: 2121.07
[200]	valid_0's rmse: 2170.67
[300]	valid_0's rmse: 1964.6
[400]	valid_0's rmse: 1863.93
[500]	valid_0's rmse: 1717.34
[600]	valid_0's rmse: 1617.58
[700]	valid_0's rmse: 1411.61
[800]	valid_0's rmse: 1235.6
[900]	valid_0's rmse: 1090.81
[1000]	valid_0's rmse: 1036.26
[1100]	valid_0's rmse: 911.565
[1200]	valid_0's rmse: 899.461
[1300]	valid_0's rmse: 860.231
[1400]	valid_0's rmse: 789.517
[1500]	valid_0's rmse: 767.722
[1600]	valid_0's rmse: 668.544
[1700]	valid_0's rmse: 637.053
[1800]	valid_0's rmse: 636.532
[1900]	valid_0's rmse: 603.179
[2000]	valid_0's rmse: 559.994
[2100]	valid_0's rmse: 554.008
[2200]	valid_0's rmse: 548.935
[2300]	valid_0's rmse: 5

2it [07:16, 217.50s/it]

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1555
[LightGBM] [Info] Number of data points in the train set: 97920, number of used features: 17
[LightGBM] [Info] Start training from score 2328.294847
[100]	valid_0's rmse: 2089.72
[200]	valid_0's rmse: 2142.68
[300]	valid_0's rmse: 1939.6
[400]	valid_0's rmse: 1841.56
[500]	valid_0's rmse: 1697.04
[600]	valid_0's rmse: 1599.05
[700]	valid_0's rmse: 1396.66
[800]	valid_0's rmse: 1223.45
[900]	valid_0's rmse: 1080.7
[1000]	valid_0's rmse: 1026.72
[1100]	valid_0's rmse: 903.385
[1200]	valid_0's rmse: 891.235
[1300]	valid_0's rmse: 852.719
[1400]	valid_0's rmse: 782.283
[1500]	valid_0's rmse: 760.928
[1600]	valid_0's rmse: 661.907
[1700]	valid_0's rmse: 630.645
[1800]	valid_0's rmse: 630.412
[1900]	valid_0's rmse: 597.32
[2000]	valid_0's rmse: 554.067
[2100]	valid_0's rmse: 548.083
[2200]	valid_0's rmse: 543.031
[2300]	valid_0's rmse: 50

3it [10:40, 213.32s/it]

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1551
[LightGBM] [Info] Number of data points in the train set: 97920, number of used features: 17
[LightGBM] [Info] Start training from score 2324.180464
[100]	valid_0's rmse: 2130.94
[200]	valid_0's rmse: 2180.02
[300]	valid_0's rmse: 1973.82
[400]	valid_0's rmse: 1873.06
[500]	valid_0's rmse: 1726.31
[600]	valid_0's rmse: 1626.35
[700]	valid_0's rmse: 1420.26
[800]	valid_0's rmse: 1244
[900]	valid_0's rmse: 1099.78
[1000]	valid_0's rmse: 1044.07
[1100]	valid_0's rmse: 919.073
[1200]	valid_0's rmse: 906.528
[1300]	valid_0's rmse: 866.714
[1400]	valid_0's rmse: 795.047
[1500]	valid_0's rmse: 772.655
[1600]	valid_0's rmse: 672.103
[1700]	valid_0's rmse: 640.461
[1800]	valid_0's rmse: 640.281
[1900]	valid_0's rmse: 606.764
[2000]	valid_0's rmse: 562.886
[2100]	valid_0's rmse: 556.751
[2200]	valid_0's rmse: 551.644
[2300]	valid_0's rmse: 51

4it [14:09, 212.02s/it]

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1552
[LightGBM] [Info] Number of data points in the train set: 97920, number of used features: 17
[LightGBM] [Info] Start training from score 2319.888826
[100]	valid_0's rmse: 2145.86
[200]	valid_0's rmse: 2193.78
[300]	valid_0's rmse: 1984.86
[400]	valid_0's rmse: 1882.64
[500]	valid_0's rmse: 1733.89
[600]	valid_0's rmse: 1632.95
[700]	valid_0's rmse: 1424.84
[800]	valid_0's rmse: 1247.28
[900]	valid_0's rmse: 1100.65
[1000]	valid_0's rmse: 1044.32
[1100]	valid_0's rmse: 918.418
[1200]	valid_0's rmse: 905.687
[1300]	valid_0's rmse: 866.379
[1400]	valid_0's rmse: 794.619
[1500]	valid_0's rmse: 772.489
[1600]	valid_0's rmse: 671.863
[1700]	valid_0's rmse: 640.451
[1800]	valid_0's rmse: 640.132
[1900]	valid_0's rmse: 606.641
[2000]	valid_0's rmse: 562.664
[2100]	valid_0's rmse: 556.676
[2200]	valid_0's rmse: 551.901
[2300]	valid_0's rmse:

5it [17:52, 214.41s/it]


## Stacking

### validation prediction

In [42]:
new_data_v = np.array([oof_preds_catboost, oof_preds_lgm, oof_preds_xgb])
new_data_v = np.transpose(new_data_v)
new_data_v.shape

(122400, 3)

In [43]:
regr = linear_model.LinearRegression(fit_intercept = False)
stackedmodel = regr.fit(new_data_v, y_train)

In [44]:
# validation prediction
stackedmodel_pred_v = stackedmodel.predict(new_data_v)
stackedmodel_pred_v.shape

(122400, 1)

In [45]:
def smape(a, f):
    return 1/len(a) * np.sum(2 * np.abs(f-a) / (np.abs(a) + np.abs(f))*100)

In [46]:
smape(y_train, stackedmodel_pred_v)

feature_1    5.696281
dtype: float64

### test prediction

In [47]:
new_data_t = np.array([sub_preds_catboost, sub_preds_lgm, sub_preds_xgb])
new_data_t = np.transpose(new_data_t)
new_data_t.shape

(10080, 3)

In [48]:
# test prediction
stackedmodel_pred_t = stackedmodel.predict(new_data_t)
stackedmodel_pred_t.shape

(10080, 1)

In [49]:
submission_stack = submission.copy()

In [50]:
submission_stack['answer'] = stackedmodel_pred_t

In [51]:
submission_stack.to_csv('../submission/cat_lgbm_xgb_stack_{}.csv'.format(version), index=False)

In [52]:
submission_stack.describe()

Unnamed: 0,answer
count,10080.0
mean,2555.028797
std,2203.750303
min,237.730665
25%,1218.586374
50%,1898.767462
75%,3035.646768
max,17304.385637
