In [8]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor


from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [9]:
# train = pd.read_csv('edit_train.csv')
# test = pd.read_csv('edit_test.csv')
# submission = pd.read_csv('energy/sample_submission.csv', encoding='cp949')

In [10]:
def data_load(columns):
    """Read the train / test / sample-submission CSVs and split train into X and y.

    Parameters
    ----------
    columns : label or list of labels
        Column(s) dropped from both the train and the test frame.

    Returns
    -------
    tuple
        ``(train, test, submission, features, labels)``. ``features`` is
        ``train`` without the '전력사용량(kWh)' column and ``labels`` is a
        single-column DataFrame holding only that column.
    """
    target = '전력사용량(kWh)'

    # Read all three files first, in the same order as before.
    raw_train = pd.read_csv('edit_train.csv')
    raw_test = pd.read_csv('edit_test.csv')
    submission = pd.read_csv('energy/sample_submission.csv', encoding='cp949')

    # The same columns are removed from train and test.
    train, test = (frame.drop(columns=columns) for frame in (raw_train, raw_test))

    # Double brackets keep `labels` two-dimensional (a one-column DataFrame).
    labels = train[[target]]
    features = train.drop(columns=target)

    return train, test, submission, features, labels

## KFold

In [19]:
def Kfold(train, N=None):
    """Fit one LightGBM regressor per shuffled K-fold split and print validation errors.

    Features and target are derived from ``train`` itself, so the function no
    longer depends on notebook-level ``features`` / ``labels`` variables.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame that still contains the target column
        '전력사용량(kWh)'; every other column is used as a feature.
    N : int, optional
        Number of folds. ``None`` falls back to 5 folds.

    Returns
    -------
    dict
        Maps the 1-based fold number to the model fitted on that fold.
    """
    # Local import: the file only imports `LGBMRegressor` from lightgbm.
    import lightgbm

    target = '전력사용량(kWh)'
    features = train.drop(columns=target)
    labels = train[target]  # 1-D target, the shape the estimator expects

    kfold = KFold(n_splits=5 if N is None else N, shuffle=True, random_state=0)
    models = {}

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(features), start=1):
        print('\n ================== Fold {} =================='.format(fold))

        X_train, X_valid = features.iloc[train_idx], features.iloc[valid_idx]
        y_train, y_valid = labels.iloc[train_idx], labels.iloc[valid_idx]

        # LightGBM 4.0 removed the `early_stopping_rounds` / `verbose` fit
        # keywords in favour of callbacks. Use callbacks when this install
        # provides them, otherwise keep the legacy keywords. The callbacks are
        # built fresh for every fold so no early-stopping state is carried over.
        if hasattr(lightgbm, 'log_evaluation'):
            fit_kwargs = {'callbacks': [lightgbm.early_stopping(30),
                                        lightgbm.log_evaluation(500)]}
        else:
            fit_kwargs = {'early_stopping_rounds': 30, 'verbose': 500}

        model = LGBMRegressor(n_estimators=10000, random_state=0)
        model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_valid, y_valid)],
                  eval_metric='rmse', **fit_kwargs)
        models[fold] = model

        pred = model.predict(X_valid)

        mae = mean_absolute_error(y_valid, pred)
        mse = mean_squared_error(y_valid, pred)
        rmse = np.sqrt(mse)

        print('='*30)
        print('MAE: {0:.4f} \nMSE: {1:.4f} \nRMSE: {2:.4f}'.format(mae, mse, rmse))
        print('='*30)

    return models

In [20]:
# Columns removed from both train and test before modelling.
columns = ['date_time']

# Load the frames, then fit one LightGBM model per fold (5 shuffled folds).
train, test, submission, features, labels = data_load(columns)
models = Kfold(train, N=5)


Training until validation scores don't improve for 30 rounds
[500]	training's rmse: 161.135	training's l2: 25964.6	valid_1's rmse: 197.555	valid_1's l2: 39027.9
[1000]	training's rmse: 133.006	training's l2: 17690.5	valid_1's rmse: 184.088	valid_1's l2: 33888.5
[1500]	training's rmse: 115.561	training's l2: 13354.3	valid_1's rmse: 178.047	valid_1's l2: 31700.7
[2000]	training's rmse: 103.169	training's l2: 10643.9	valid_1's rmse: 175.133	valid_1's l2: 30671.7
[2500]	training's rmse: 93.7954	training's l2: 8797.59	valid_1's rmse: 172.959	valid_1's l2: 29914.9
[3000]	training's rmse: 86.0275	training's l2: 7400.73	valid_1's rmse: 171.583	valid_1's l2: 29440.8
[3500]	training's rmse: 79.4252	training's l2: 6308.37	valid_1's rmse: 170.572	valid_1's l2: 29094.8
[4000]	training's rmse: 73.6084	training's l2: 5418.19	valid_1's rmse: 169.623	valid_1's l2: 28772.1
Early stopping, best iteration is:
[4212]	training's rmse: 71.5247	training's l2: 5115.78	valid_1's rmse: 169.445	valid_1's l2: 287

## StratifiedKFold

In [21]:
def sKfold(train, N=None):
    """Fit one LightGBM regressor per stratified fold and print validation errors.

    Folds are stratified on the ``num`` feature column. Features and target
    are derived from ``train`` itself, so the function no longer depends on
    notebook-level ``features`` / ``labels`` variables.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame containing the target column '전력사용량(kWh)' and a
        ``num`` column; every non-target column is used as a feature.
    N : int, optional
        Number of folds. ``None`` falls back to 5 folds.

    Returns
    -------
    dict
        Maps the 1-based fold number to the model fitted on that fold.
    """
    # Local import: the file only imports `LGBMRegressor` from lightgbm.
    import lightgbm

    target = '전력사용량(kWh)'
    features = train.drop(columns=target)
    labels = train[target]  # 1-D target, the shape the estimator expects

    skfold = StratifiedKFold(n_splits=5 if N is None else N, shuffle=True, random_state=0)
    models = {}

    splits = skfold.split(features, features['num'])
    for fold, (train_idx, valid_idx) in enumerate(splits, start=1):
        print('\n ================== sKFold {} =================='.format(fold))

        X_train, X_valid = features.iloc[train_idx], features.iloc[valid_idx]
        y_train, y_valid = labels.iloc[train_idx], labels.iloc[valid_idx]

        # LightGBM 4.0 removed the `early_stopping_rounds` / `verbose` fit
        # keywords in favour of callbacks. Use callbacks when this install
        # provides them, otherwise keep the legacy keywords. The callbacks are
        # built fresh for every fold so no early-stopping state is carried over.
        if hasattr(lightgbm, 'log_evaluation'):
            fit_kwargs = {'callbacks': [lightgbm.early_stopping(30),
                                        lightgbm.log_evaluation(500)]}
        else:
            fit_kwargs = {'early_stopping_rounds': 30, 'verbose': 500}

        model = LGBMRegressor(n_estimators=10000, random_state=0)
        model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_valid, y_valid)],
                  eval_metric='rmse', **fit_kwargs)
        models[fold] = model

        pred = model.predict(X_valid)

        mae = mean_absolute_error(y_valid, pred)
        mse = mean_squared_error(y_valid, pred)
        rmse = np.sqrt(mse)

        print('='*30)
        print('MAE: {0:.4f} \nMSE: {1:.4f} \nRMSE: {2:.4f}'.format(mae, mse, rmse))
        print('='*30)

    return models

In [22]:
# Reload the frames and repeat the LightGBM experiment with stratified folds.
train, test, submission, features, labels = data_load(columns)
models = sKfold(train, N=5)


Training until validation scores don't improve for 30 rounds
[500]	training's rmse: 160.541	training's l2: 25773.5	valid_1's rmse: 207.524	valid_1's l2: 43066.2
[1000]	training's rmse: 132.107	training's l2: 17452.2	valid_1's rmse: 194.592	valid_1's l2: 37866.2
[1500]	training's rmse: 115.599	training's l2: 13363.1	valid_1's rmse: 188.509	valid_1's l2: 35535.5
[2000]	training's rmse: 103.215	training's l2: 10653.3	valid_1's rmse: 185.234	valid_1's l2: 34311.5
[2500]	training's rmse: 93.5116	training's l2: 8744.41	valid_1's rmse: 183.14	valid_1's l2: 33540.1
[3000]	training's rmse: 85.9424	training's l2: 7386.1	valid_1's rmse: 181.831	valid_1's l2: 33062.3
Early stopping, best iteration is:
[2999]	training's rmse: 85.9553	training's l2: 7388.31	valid_1's rmse: 181.823	valid_1's l2: 33059.7
MAE: 96.2148 
MSE: 33059.6609 
RMSE: 181.8232

Training until validation scores don't improve for 30 rounds
[500]	training's rmse: 160.554	training's l2: 25777.5	valid_1's rmse: 207.763	valid_1's l2:

## RandomForest

In [15]:
def KfoldForest(train, N=None):
    """Fit one random-forest regressor per shuffled K-fold split and print validation errors.

    Features and target are derived from ``train`` itself, so the function no
    longer depends on notebook-level ``features`` / ``labels`` variables.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame that still contains the target column
        '전력사용량(kWh)'; every other column is used as a feature.
    N : int, optional
        Number of folds. ``None`` falls back to 5 folds.

    Returns
    -------
    dict
        Maps the 1-based fold number to the model fitted on that fold.
    """
    target = '전력사용량(kWh)'
    features = train.drop(columns=target)
    # A 1-D target avoids the column-vector conversion scikit-learn otherwise
    # has to perform (and warn about) when `y` has shape (n, 1).
    labels = train[target]

    kfold = KFold(n_splits=5 if N is None else N, shuffle=True, random_state=0)
    models = {}

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(features), start=1):
        print('\n ================== Fold {} =================='.format(fold))

        X_train, X_valid = features.iloc[train_idx], features.iloc[valid_idx]
        y_train, y_valid = labels.iloc[train_idx], labels.iloc[valid_idx]

        model = RandomForestRegressor(random_state=0)
        model.fit(X_train, y_train)
        pred = model.predict(X_valid)

        mae = mean_absolute_error(y_valid, pred)
        mse = mean_squared_error(y_valid, pred)
        rmse = np.sqrt(mse)

        print('MAE: {0:.4f} \nMSE: {1:.4f} \nRMSE: {2:.4f}'.format(mae, mse, rmse))

        models[fold] = model

    return models

In [16]:
# Random-forest baseline, kept under its own name. The submission cell further
# down averages `models` and writes an 'lgbm_...' file, so the LightGBM fold
# models stored in `models` must not be overwritten here.
train, test, submission, features, labels = data_load(columns)
rf_models = KfoldForest(train, 5)


MAE: 112.4019 
MSE: 49774.5470 
RMSE: 223.1021

MAE: 116.6566 
MSE: 68836.8379 
RMSE: 262.3678

MAE: 116.0191 
MSE: 55604.7868 
RMSE: 235.8067

MAE: 114.9101 
MSE: 57782.0016 
RMSE: 240.3789

MAE: 114.5944 
MSE: 53244.0506 
RMSE: 230.7467


In [32]:
# Build the submission by averaging the test-set predictions of every fold model.
# NOTE(review): `models` is whichever fold -> model dict the most recently
# executed training cell assigned; confirm it is the one the output file name
# ('lgbm_...') refers to.
submission = pd.read_csv('energy/sample_submission.csv', encoding='cp949')

# Average over however many folds were trained instead of assuming exactly 5,
# and assign the result so it does not depend on the sample file's 'answer'
# column starting at zero.
fold_preds = [model.predict(test) for model in models.values()]
submission['answer'] = np.mean(fold_preds, axis=0)

submission.to_csv('lgbm_day4_3.csv', index=False)