In [46]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

In [47]:
# Column labels handed to data_load, which drops them from the train and test frames.
columns = [
    'date_time',
    'hour_assignment',
    'perceived_temperature',
    'discomfort_index',
]

In [48]:
def data_load(columns, train=None, test=None):
    """Read the prepared train/test CSVs and split the training frame into features and target.

    Parameters
    ----------
    columns : list of str
        Column labels dropped from both the train and the test frame.
    train, test : ignored
        Accepted only so that existing ``data_load(columns, train, test)``
        calls keep working. Both frames are always re-read from
        ``edit_train.csv`` / ``edit_test.csv``; whatever is passed here is
        never used, so callers do not need these variables to exist.

    Returns
    -------
    train : pandas.DataFrame
        Training data without ``columns`` (still contains the target column).
    test : pandas.DataFrame
        Test data without ``columns``.
    features : pandas.DataFrame
        ``train`` without the target column.
    labels : pandas.DataFrame
        Single-column frame holding the target column.

    Raises
    ------
    KeyError
        If a label in ``columns`` (or the target column) is missing from a frame.
    """
    target = '전력사용량(kWh)'

    # Always start from the files on disk: a frame returned by an earlier call
    # has already lost its dropped columns and could not be dropped again.
    train = pd.read_csv('edit_train.csv').drop(columns, axis=1)
    test = pd.read_csv('edit_test.csv').drop(columns, axis=1)

    features = train.drop(target, axis=1)
    labels = train[[target]]

    return train, test, features, labels

In [49]:
def Kfold(train, N=None, features=None, labels=None):
    """Fit one LightGBM regressor per shuffled K-fold split.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame that still contains the target column. Used to derive
        ``features`` / ``labels`` when they are not passed explicitly.
    N : int, optional
        Number of folds. ``None`` falls back to 5.
    features : pandas.DataFrame, optional
        Model inputs. Defaults to ``train`` without the target column.
    labels : pandas.DataFrame, optional
        Target values. Defaults to the target column of ``train`` as a
        single-column frame.

    Returns
    -------
    dict
        Maps the 1-based fold number to the model fitted on that fold.
    """
    target = '전력사용량(kWh)'

    # KFold rejects n_splits=None, so give the optional argument a usable value.
    if N is None:
        N = 5

    # Build the inputs from the argument rather than reading notebook globals,
    # so the result does not depend on which cells were run beforehand.
    if features is None:
        features = train.drop(target, axis=1)
    if labels is None:
        labels = train[[target]]

    # LightGBM 4.0 removed fit(early_stopping_rounds=..., verbose=...) in favour
    # of callbacks; keep the keyword form only for releases without them.
    try:
        from lightgbm import early_stopping, log_evaluation
    except ImportError:
        early_stopping = log_evaluation = None

    kfold = KFold(n_splits=N, shuffle=True, random_state=0)
    models = {}

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(features), start=1):
        print('\n ================== Fold {} =================='.format(fold))

        X_train, X_valid = features.iloc[train_idx], features.iloc[valid_idx]
        y_train, y_valid = labels.iloc[train_idx], labels.iloc[valid_idx]

        if log_evaluation is not None:
            # Fresh callback objects for every fold.
            fit_kwargs = {'callbacks': [early_stopping(30), log_evaluation(100)]}
        else:
            fit_kwargs = {'early_stopping_rounds': 30, 'verbose': 100}

        model = LGBMRegressor(n_estimators=10000, random_state=0)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            **fit_kwargs
        )
        models[fold] = model

    return models

In [50]:
# data_load re-reads both CSVs and never uses its last two arguments, so pass
# None instead of `train`/`test`, which are not defined yet on a fresh kernel.
train, test, features, labels = data_load(columns, None, None)
models = Kfold(train, 5)


Training until validation scores don't improve for 30 rounds
[100]	training's l2: 88063.2	valid_1's l2: 96292.9
[200]	training's l2: 64388.6	valid_1's l2: 75092.9
[300]	training's l2: 52802.5	valid_1's l2: 65223.9
[400]	training's l2: 46369.3	valid_1's l2: 60144
[500]	training's l2: 42075.9	valid_1's l2: 56647.8
[600]	training's l2: 38535.9	valid_1's l2: 54372.6
[700]	training's l2: 35708.1	valid_1's l2: 52649.5
[800]	training's l2: 33618.3	valid_1's l2: 51377.7
[900]	training's l2: 31993.4	valid_1's l2: 50560.5
[1000]	training's l2: 30488.3	valid_1's l2: 49868.7
[1100]	training's l2: 29025	valid_1's l2: 48976.7
[1200]	training's l2: 27757.1	valid_1's l2: 48406.9
[1300]	training's l2: 26665.2	valid_1's l2: 47911.9
[1400]	training's l2: 25423.8	valid_1's l2: 47282.2
[1500]	training's l2: 24520.2	valid_1's l2: 46901.7
[1600]	training's l2: 23691.2	valid_1's l2: 46617.2
[1700]	training's l2: 22728.7	valid_1's l2: 46163.2
[1800]	training's l2: 21965.1	valid_1's l2: 45832.6
[1900]	training

In [18]:
# Use the sample submission file as the template for the output.
submission = pd.read_csv('energy/sample_submission.csv', encoding='cp949')

# Average over every fold model that was actually trained, and assign the
# result instead of adding onto whatever the template's 'answer' column holds.
fold_predictions = [model.predict(test) for model in models.values()]
submission['answer'] = np.mean(fold_predictions, axis=0)

submission.to_csv('lgbm_day2_1.csv', index=False)

In [51]:
# Column labels handed to data_load, which drops them from the train and test frames.
columns = [
    'date_time',
    'perceived_temperature',
    'discomfort_index',
]

In [52]:
# data_load re-reads both CSVs and never uses its last two arguments, so pass
# None rather than relying on frames left in the kernel by an earlier cell.
train, test, features, labels = data_load(columns, None, None)
models = Kfold(train, 5)


Training until validation scores don't improve for 30 rounds
[100]	training's l2: 88822.1	valid_1's l2: 97096.3
[200]	training's l2: 64636.4	valid_1's l2: 75471.5
[300]	training's l2: 52643.6	valid_1's l2: 65707.8
[400]	training's l2: 45842.4	valid_1's l2: 60218.7
[500]	training's l2: 41486.4	valid_1's l2: 56735.5
[600]	training's l2: 38349.4	valid_1's l2: 54538.8
[700]	training's l2: 35775.4	valid_1's l2: 53040.7
[800]	training's l2: 33490.6	valid_1's l2: 51714
[900]	training's l2: 31833.9	valid_1's l2: 50726.7
[1000]	training's l2: 30167	valid_1's l2: 49968.3
[1100]	training's l2: 28867.7	valid_1's l2: 49499.4
[1200]	training's l2: 27645.8	valid_1's l2: 48904.8
[1300]	training's l2: 26484.2	valid_1's l2: 48415.2
Early stopping, best iteration is:
[1363]	training's l2: 25831.1	valid_1's l2: 48157.9

Training until validation scores don't improve for 30 rounds
[100]	training's l2: 86616.1	valid_1's l2: 85906.2
[200]	training's l2: 62731.6	valid_1's l2: 65319.6
[300]	training's l2: 523

In [53]:
# Use the sample submission file as the template for the output.
submission = pd.read_csv('energy/sample_submission.csv', encoding='cp949')

# Average over every fold model that was actually trained, and assign the
# result instead of adding onto whatever the template's 'answer' column holds.
fold_predictions = [model.predict(test) for model in models.values()]
submission['answer'] = np.mean(fold_predictions, axis=0)

submission.to_csv('lgbm_day2_2.csv', index=False)

In [55]:
# Column labels handed to data_load, which drops them from the train and test frames.
columns = [
    'date_time',
]

In [56]:
# data_load re-reads both CSVs and never uses its last two arguments, so pass
# None rather than relying on frames left in the kernel by an earlier cell.
train, test, features, labels = data_load(columns, None, None)
models = Kfold(train, 5)


Training until validation scores don't improve for 30 rounds
[100]	training's l2: 84066.2	valid_1's l2: 92596.6
[200]	training's l2: 60504.3	valid_1's l2: 71512.6
[300]	training's l2: 50717	valid_1's l2: 63494.5
[400]	training's l2: 44724	valid_1's l2: 58769.2
[500]	training's l2: 40340	valid_1's l2: 55600
[600]	training's l2: 37203.9	valid_1's l2: 53443.7
[700]	training's l2: 34808.7	valid_1's l2: 51818.3
[800]	training's l2: 32557.8	valid_1's l2: 50506.6
[900]	training's l2: 30992.7	valid_1's l2: 49691
[1000]	training's l2: 29349.4	valid_1's l2: 48995.8
[1100]	training's l2: 27619.3	valid_1's l2: 48219.6
[1200]	training's l2: 26357.2	valid_1's l2: 47713.3
[1300]	training's l2: 25150.2	valid_1's l2: 47080.5
[1400]	training's l2: 24216.4	valid_1's l2: 46702.9
[1500]	training's l2: 23241.5	valid_1's l2: 46236.1
Early stopping, best iteration is:
[1536]	training's l2: 22963.1	valid_1's l2: 46109.6

Training until validation scores don't improve for 30 rounds
[100]	training's l2: 84763.9

In [57]:
# Use the sample submission file as the template for the output.
submission = pd.read_csv('energy/sample_submission.csv', encoding='cp949')

# Average over every fold model that was actually trained, and assign the
# result instead of adding onto whatever the template's 'answer' column holds.
fold_predictions = [model.predict(test) for model in models.values()]
submission['answer'] = np.mean(fold_predictions, axis=0)

submission.to_csv('lgbm_day2_3.csv', index=False)