In [22]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor


from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [23]:
# train = pd.read_csv('edit_train.csv')
# test = pd.read_csv('edit_test.csv')
# submission = pd.read_csv('energy/sample_submission.csv', encoding='cp949')

In [24]:
def data_load(columns, train_path='edit_train.csv', test_path='edit_test.csv',
              submission_path='energy/sample_submission.csv',
              target='전력사용량(kWh)'):
    """Load the train/test/submission CSVs and split train into features and labels.

    Parameters
    ----------
    columns : list of str
        Column names removed from both the train and the test frame. Each name
        must be present in both files, otherwise ``DataFrame.drop`` raises
        ``KeyError``.
    train_path, test_path, submission_path : str, optional
        Locations of the three CSV files. The defaults are the paths this
        notebook has always read from.
    target : str, optional
        Name of the label column inside the train frame.

    Returns
    -------
    tuple
        ``(train, test, submission, features, labels)``. ``features`` is
        ``train`` without the ``target`` column; ``labels`` is a one-column
        DataFrame holding only ``target``.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    # The sample submission is decoded as cp949, unlike the two data files.
    submission = pd.read_csv(submission_path, encoding='cp949')

    train = train.drop(columns, axis=1)
    test = test.drop(columns, axis=1)

    features = train.drop(target, axis=1)
    # Double brackets keep labels as a DataFrame rather than a Series.
    labels = train[[target]]

    return train, test, submission, features, labels

## KFold

In [25]:
def Kfold(train, N=None):
    """Fit one LightGBM regressor per shuffled K-fold split of ``train``.

    Features and labels are derived from the ``train`` argument itself, so the
    function no longer depends on notebook-level ``features``/``labels``
    variables being in sync with the frame that is passed in.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame that still contains the target column
        '전력사용량(kWh)'; every other column is used as a feature.
    N : int, optional
        Number of folds. ``None`` falls back to 5 (``KFold`` itself rejects
        ``None``).

    Returns
    -------
    dict
        Maps the 1-based fold number to the model fitted on that fold.
    """
    target = '전력사용량(kWh)'
    n_splits = 5 if N is None else N

    features = train.drop(target, axis=1)
    labels = train[[target]]

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    models = {}

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(train), start=1):
        print('\n ================== Fold {} =================='.format(fold))

        X_train, X_valid = features.iloc[train_idx, :], features.iloc[valid_idx, :]
        y_train, y_valid = labels.iloc[train_idx, :], labels.iloc[valid_idx, :]

        model = LGBMRegressor(n_estimators=10000, random_state=0)
        # NOTE(review): `early_stopping_rounds`/`verbose` as fit() keywords are
        # believed to be replaced by callbacks in newer LightGBM releases --
        # confirm against the installed version before upgrading.
        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric= 'rmse', early_stopping_rounds=100, verbose=500)
        models[fold] = model

        pred = model.predict(X_valid)

        mae = mean_absolute_error(y_valid, pred)
        mse = mean_squared_error(y_valid, pred)
        # RMSE is the root of the MSE already computed; no second metric call.
        rmse = np.sqrt(mse)

        print('='*30)
        print('MAE: {0:.4f} \nMSE: {1:.4f} \nRMSE: {2:.4f}'.format(mae, mse, rmse))
        print('='*30)

    return models

In [26]:
# NOTE(review): in file order `train` is first assigned by the data_load(...)
# call in a later cell (the read_csv lines near the top are commented out), so
# this cell looks like it fails on a fresh top-to-bottom run -- confirm the
# intended cell order.
train.columns

Index(['전력사용량(kWh)', '강수량(mm)', '일조(hr)', '비전기냉방설비운영', '태양광보유', 'hour',
       'weekday', 'holiday', 'perceived_temperature', 'discomfort_index',
       'ref_day', 'hour_assignment', 'gmm_num_assignment'],
      dtype='object')

In [28]:
# Columns removed from both the train and test frames before modelling.
columns = [
    'date_time',
    'hour_assignment',
    'gmm_num_assignment',
]

# Reload the data without those columns, then fit one LightGBM model per fold.
train, test, submission, features, labels = data_load(columns)
models = Kfold(train, N=5)


Training until validation scores don't improve for 100 rounds
[500]	training's rmse: 179.77	training's l2: 32317.2	valid_1's rmse: 230.582	valid_1's l2: 53168
[1000]	training's rmse: 147.807	training's l2: 21847	valid_1's rmse: 214.227	valid_1's l2: 45893.4
[1500]	training's rmse: 129.112	training's l2: 16670	valid_1's rmse: 207.368	valid_1's l2: 43001.5
[2000]	training's rmse: 116.128	training's l2: 13485.8	valid_1's rmse: 203.269	valid_1's l2: 41318.3
[2500]	training's rmse: 106.013	training's l2: 11238.7	valid_1's rmse: 200.64	valid_1's l2: 40256.3
[3000]	training's rmse: 97.5821	training's l2: 9522.26	valid_1's rmse: 198.939	valid_1's l2: 39576.8
[3500]	training's rmse: 91.109	training's l2: 8300.85	valid_1's rmse: 197.746	valid_1's l2: 39103.6
[4000]	training's rmse: 84.859	training's l2: 7201.06	valid_1's rmse: 196.557	valid_1's l2: 38634.6
[4500]	training's rmse: 79.3607	training's l2: 6298.12	valid_1's rmse: 195.864	valid_1's l2: 38362.8
[5000]	training's rmse: 74.7118	trainin

## StratifiedKFold

In [13]:
def sKfold(train, N=None):
    """Fit one LightGBM regressor per stratified fold, stratifying on ``'num'``.

    The original body ignored ``train`` and read the notebook-level
    ``features``/``labels`` variables; both are now derived from the argument
    so the function works on whatever frame it is given.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame containing the target column '전력사용량(kWh)' and a
        ``'num'`` column used as the stratification key. ``'num'`` also stays
        in the feature matrix, as before.
    N : int, optional
        Number of folds. ``None`` falls back to 5 (``StratifiedKFold`` itself
        rejects ``None``).

    Returns
    -------
    dict
        Maps the 1-based fold number to the model fitted on that fold.

    Raises
    ------
    KeyError
        If ``train`` has no ``'num'`` column to stratify on.
    """
    target = '전력사용량(kWh)'
    n_splits = 5 if N is None else N

    features = train.drop(target, axis=1)
    labels = train[[target]]

    if 'num' not in features.columns:
        # Same exception type as the bare lookup, but says what went wrong.
        raise KeyError("sKfold stratifies on the 'num' column, which is missing from train")

    skfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    models = {}

    splits = skfold.split(features, features['num'])
    for fold, (train_idx, valid_idx) in enumerate(splits, start=1):
        print('\n ================== sKFold {} =================='.format(fold))

        X_train, X_valid = features.iloc[train_idx, :], features.iloc[valid_idx, :]
        y_train, y_valid = labels.iloc[train_idx, :], labels.iloc[valid_idx, :]

        model = LGBMRegressor(n_estimators=10000, random_state=0)
        # NOTE(review): `early_stopping_rounds`/`verbose` as fit() keywords are
        # believed to be replaced by callbacks in newer LightGBM releases --
        # confirm against the installed version before upgrading.
        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric= 'rmse', early_stopping_rounds=30, verbose=500)
        models[fold] = model

        pred = model.predict(X_valid)

        mae = mean_absolute_error(y_valid, pred)
        mse = mean_squared_error(y_valid, pred)
        # RMSE is the root of the MSE already computed; no second metric call.
        rmse = np.sqrt(mse)

        print('='*30)
        print('MAE: {0:.4f} \nMSE: {1:.4f} \nRMSE: {2:.4f}'.format(mae, mse, rmse))
        print('='*30)

    return models

In [14]:
# Reload the data with the current `columns` list, then fit one LightGBM model
# per stratified fold.
train, test, submission, features, labels = data_load(columns)
models = sKfold(train, N=5)


Training until validation scores don't improve for 30 rounds
[500]	training's rmse: 173.69	training's l2: 30168.3	valid_1's rmse: 210.902	valid_1's l2: 44479.5
[1000]	training's rmse: 143.653	training's l2: 20636.1	valid_1's rmse: 197.576	valid_1's l2: 39036.4
[1500]	training's rmse: 125.376	training's l2: 15719.1	valid_1's rmse: 191.488	valid_1's l2: 36667.5
[2000]	training's rmse: 112.224	training's l2: 12594.3	valid_1's rmse: 187.661	valid_1's l2: 35216.8
[2500]	training's rmse: 102.496	training's l2: 10505.5	valid_1's rmse: 185.366	valid_1's l2: 34360.6
Early stopping, best iteration is:
[2829]	training's rmse: 97.1903	training's l2: 9445.96	valid_1's rmse: 184.55	valid_1's l2: 34058.7
MAE: 99.9892 
MSE: 34058.7068 
RMSE: 184.5500

Training until validation scores don't improve for 30 rounds
[500]	training's rmse: 171.586	training's l2: 29441.9	valid_1's rmse: 219.79	valid_1's l2: 48307.6
[1000]	training's rmse: 141.764	training's l2: 20096.9	valid_1's rmse: 206.108	valid_1's l2: 

## RandomForest

In [37]:
def KfoldForest(train, N=None):
    """Fit one random-forest regressor per shuffled K-fold split of ``train``.

    Features and labels are derived from the ``train`` argument itself rather
    than from notebook-level ``features``/``labels`` variables.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame that still contains the target column
        '전력사용량(kWh)'; every other column is used as a feature.
    N : int, optional
        Number of folds. ``None`` falls back to 5 (``KFold`` itself rejects
        ``None``).

    Returns
    -------
    dict
        Maps the 1-based fold number to the model fitted on that fold.
    """
    target = '전력사용량(kWh)'
    n_splits = 5 if N is None else N

    features = train.drop(target, axis=1)
    labels = train[[target]]

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    models = {}

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(train), start=1):
        print('\n ================== Fold {} =================='.format(fold))

        X_train, X_valid = features.iloc[train_idx, :], features.iloc[valid_idx, :]
        y_train, y_valid = labels.iloc[train_idx, :], labels.iloc[valid_idx, :]

        model = RandomForestRegressor(random_state=0)
        # Hand the forest a 1-D target: a one-column frame makes scikit-learn
        # emit a DataConversionWarning before it flattens the values itself.
        model.fit(X_train, y_train.values.ravel())
        pred = model.predict(X_valid)

        mae = mean_absolute_error(y_valid, pred)
        mse = mean_squared_error(y_valid, pred)
        # RMSE is the root of the MSE already computed; no second metric call.
        rmse = np.sqrt(mse)

        print('MAE: {0:.4f} \nMSE: {1:.4f} \nRMSE: {2:.4f}'.format(mae, mse, rmse))

        models[fold] = model

    return models

In [38]:
# Reload the data with the current `columns` list, then fit one random forest
# per fold.
train, test, submission, features, labels = data_load(columns)
models = KfoldForest(train, N=5)


MAE: 121.1460 
MSE: 70082.4526 
RMSE: 264.7309

MAE: 118.6476 
MSE: 63287.0983 
RMSE: 251.5693

MAE: 119.4404 
MSE: 54014.8691 
RMSE: 232.4110

MAE: 117.9968 
MSE: 56927.5535 
RMSE: 238.5950

MAE: 118.4614 
MSE: 57056.5039 
RMSE: 238.8650


In [53]:
# Start from a freshly read sample submission so that re-running this cell
# never adds the predictions on top of a previous run.
submission = pd.read_csv('energy/sample_submission.csv', encoding='cp949')

# Average the per-fold predictions. The divisor is taken from `models` itself
# instead of a hard-coded 5, so the cell stays correct when the fold count
# passed to the K-fold helper changes.
# NOTE(review): in file order `models` was last assigned by KfoldForest, while
# the output file name says "lgbm" -- confirm which models are meant here.
n_models = len(models)
for fold_model in models.values():
    submission['answer'] += fold_model.predict(test) / n_models

submission.to_csv('lgbm_day3_3.csv', index=False)