In [73]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor


from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [74]:
# train = pd.read_csv('edit_train.csv')
# test = pd.read_csv('edit_test.csv')
# submission = pd.read_csv('energy/sample_submission.csv', encoding='cp949')

In [75]:
def data_load(columns):
    """Read the train/test/submission CSVs and split train into X and y.

    Parameters
    ----------
    columns : label or list of labels
        Column name(s) dropped from both the train and the test frame.

    Returns
    -------
    tuple
        ``(train, test, submission, features, labels)`` where ``train`` and
        ``test`` have ``columns`` removed, ``features`` is ``train`` without
        the target column and ``labels`` is a one-column DataFrame holding
        the target column '전력사용량(kWh)'.
    """
    target = '전력사용량(kWh)'

    # Read every file up front so a missing file is reported before any
    # column handling takes place.
    raw_train = pd.read_csv('edit_train.csv')
    raw_test = pd.read_csv('edit_test.csv')
    submission = pd.read_csv('energy/sample_submission.csv', encoding='cp949')

    train = raw_train.drop(columns=columns)
    test = raw_test.drop(columns=columns)

    # Double brackets keep the target as a DataFrame rather than a Series.
    labels = train[[target]]
    features = train.drop(columns=target)

    return train, test, submission, features, labels

## KFold

In [71]:
def Kfold(train, N=None):
    """Fit one LightGBM regressor per shuffled K-fold split and print its errors.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame that still contains the target column
        '전력사용량(kWh)'. Every other column is used as a feature.
    N : int, optional
        Number of folds. ``None`` falls back to 5 (``KFold`` itself rejects
        ``None``).

    Returns
    -------
    dict
        Maps the 1-based fold number to the ``LGBMRegressor`` fitted on that
        fold's training part.
    """
    target = '전력사용량(kWh)'

    # Build X / y from the argument instead of reading notebook-level
    # `features` / `labels`, so the result depends only on what is passed in
    # and not on which cell happened to run last.
    features = train.drop(target, axis=1)
    labels = train[[target]]

    n_splits = 5 if N is None else N
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    models = {}

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(features), start=1):
        print('\n ================== Fold {} =================='.format(fold))

        X_train, X_valid = features.iloc[train_idx, :], features.iloc[valid_idx, :]
        y_train, y_valid = labels.iloc[train_idx, :], labels.iloc[valid_idx, :]

        # n_estimators is only an upper bound; early stopping on the
        # validation part decides the real number of trees.
        # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords
        # depend on the installed LightGBM version (newer releases expect
        # callbacks instead) - confirm against the pinned version.
        model = LGBMRegressor(n_estimators=10000, random_state=0)
        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric= 'rmse', early_stopping_rounds=100, verbose=500)
        models[fold] = model

        pred = model.predict(X_valid)

        mae = mean_absolute_error(y_valid, pred)
        mse = mean_squared_error(y_valid, pred)
        rmse = np.sqrt(mse)  # reuse the MSE rather than computing it twice

        print('='*30)
        print('MAE: {0:.4f} \nMSE: {1:.4f} \nRMSE: {2:.4f}'.format(mae, mse, rmse))
        print('='*30)

    return models

In [72]:
# Column(s) removed from both the train and test frames by data_load().
columns = ['date_time']

# Reload the data, then fit one LightGBM model per fold (5 shuffled K-fold
# splits). `models` is a notebook-level name that later training cells
# overwrite, so anything reading it depends on which cell ran last.
train, test, submission, features, labels = data_load(columns)
models = Kfold(train, 5)


Training until validation scores don't improve for 100 rounds
[500]	training's rmse: 172.825	training's l2: 29868.4	valid_1's rmse: 218.461	valid_1's l2: 47725.3
[1000]	training's rmse: 141.719	training's l2: 20084.3	valid_1's rmse: 203.324	valid_1's l2: 41340.6
[1500]	training's rmse: 124.142	training's l2: 15411.1	valid_1's rmse: 197.813	valid_1's l2: 39130
[2000]	training's rmse: 111.882	training's l2: 12517.7	valid_1's rmse: 194.25	valid_1's l2: 37733.2
[2500]	training's rmse: 102.173	training's l2: 10439.2	valid_1's rmse: 192.228	valid_1's l2: 36951.7
[3000]	training's rmse: 94.2455	training's l2: 8882.22	valid_1's rmse: 190.697	valid_1's l2: 36365.3
[3500]	training's rmse: 87.3962	training's l2: 7638.1	valid_1's rmse: 189.433	valid_1's l2: 35884.7
[4000]	training's rmse: 81.6608	training's l2: 6668.49	valid_1's rmse: 188.591	valid_1's l2: 35566.6
[4500]	training's rmse: 76.7445	training's l2: 5889.72	valid_1's rmse: 187.865	valid_1's l2: 35293.1
[5000]	training's rmse: 72.335	tr

## StratifiedKFold

In [65]:
def sKfold(train, N=None):
    """Fit one LightGBM regressor per stratified fold and print its errors.

    Folds are stratified on the 'num' feature column, so each fold keeps
    roughly the same proportion of rows for every 'num' value.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame that still contains the target column
        '전력사용량(kWh)' and a 'num' column. Every column except the
        target is used as a feature.
    N : int, optional
        Number of folds. ``None`` falls back to 5 (``StratifiedKFold``
        itself rejects ``None``).

    Returns
    -------
    dict
        Maps the 1-based fold number to the ``LGBMRegressor`` fitted on that
        fold's training part.
    """
    target = '전력사용량(kWh)'

    # Build X / y from the argument; previously `train` was ignored and the
    # notebook-level `features` / `labels` were used instead.
    features = train.drop(target, axis=1)
    labels = train[[target]]

    n_splits = 5 if N is None else N
    skfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    models = {}

    splits = skfold.split(features, features['num'])
    for fold, (train_idx, valid_idx) in enumerate(splits, start=1):
        print('\n ================== sKFold {} =================='.format(fold))

        X_train, X_valid = features.iloc[train_idx, :], features.iloc[valid_idx, :]
        y_train, y_valid = labels.iloc[train_idx, :], labels.iloc[valid_idx, :]

        # n_estimators is only an upper bound; early stopping on the
        # validation part decides the real number of trees.
        # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords
        # depend on the installed LightGBM version (newer releases expect
        # callbacks instead) - confirm against the pinned version.
        model = LGBMRegressor(n_estimators=10000, random_state=0)
        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric= 'rmse', early_stopping_rounds=30, verbose=500)
        models[fold] = model

        pred = model.predict(X_valid)

        mae = mean_absolute_error(y_valid, pred)
        mse = mean_squared_error(y_valid, pred)
        rmse = np.sqrt(mse)  # reuse the MSE rather than computing it twice

        print('='*30)
        print('MAE: {0:.4f} \nMSE: {1:.4f} \nRMSE: {2:.4f}'.format(mae, mse, rmse))
        print('='*30)

    return models

In [66]:
# Reload the data (uses `columns` defined in an earlier cell) and fit one
# LightGBM model per stratified fold. This rebinds the notebook-level
# `models`, replacing whatever a previous training cell stored there.
train, test, submission, features, labels = data_load(columns)
models = sKfold(train, 5)


Training until validation scores don't improve for 30 rounds
[500]	training's rmse: 173.69	training's l2: 30168.3	valid_1's rmse: 210.902	valid_1's l2: 44479.5
[1000]	training's rmse: 143.653	training's l2: 20636.1	valid_1's rmse: 197.576	valid_1's l2: 39036.4
[1500]	training's rmse: 125.376	training's l2: 15719.1	valid_1's rmse: 191.488	valid_1's l2: 36667.5
[2000]	training's rmse: 112.224	training's l2: 12594.3	valid_1's rmse: 187.661	valid_1's l2: 35216.8
[2500]	training's rmse: 102.496	training's l2: 10505.5	valid_1's rmse: 185.366	valid_1's l2: 34360.6
Early stopping, best iteration is:
[2829]	training's rmse: 97.1903	training's l2: 9445.96	valid_1's rmse: 184.55	valid_1's l2: 34058.7
MAE: 99.9892 
MSE: 34058.7068 
RMSE: 184.5500

Training until validation scores don't improve for 30 rounds
[500]	training's rmse: 171.586	training's l2: 29441.9	valid_1's rmse: 219.79	valid_1's l2: 48307.6
[1000]	training's rmse: 141.764	training's l2: 20096.9	valid_1's rmse: 206.108	valid_1's l2: 

## RandomForest

In [37]:
def KfoldForest(train, N=None):
    """Fit one random-forest regressor per shuffled K-fold split and print its errors.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame that still contains the target column
        '전력사용량(kWh)'. Every other column is used as a feature.
    N : int, optional
        Number of folds. ``None`` falls back to 5 (``KFold`` itself rejects
        ``None``).

    Returns
    -------
    dict
        Maps the 1-based fold number to the ``RandomForestRegressor`` fitted
        on that fold's training part.
    """
    target = '전력사용량(kWh)'

    # Build X / y from the argument instead of reading notebook-level
    # `features` / `labels`, so the result depends only on what is passed in.
    features = train.drop(target, axis=1)
    labels = train[[target]]

    n_splits = 5 if N is None else N
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    models = {}

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(features), start=1):
        print('\n ================== Fold {} =================='.format(fold))

        X_train, X_valid = features.iloc[train_idx, :], features.iloc[valid_idx, :]
        y_train, y_valid = labels.iloc[train_idx, :], labels.iloc[valid_idx, :]

        model = RandomForestRegressor(random_state=0)
        model.fit(X_train, y_train)
        pred = model.predict(X_valid)

        mae = mean_absolute_error(y_valid, pred)
        mse = mean_squared_error(y_valid, pred)
        rmse = np.sqrt(mse)  # reuse the MSE rather than computing it twice

        print('MAE: {0:.4f} \nMSE: {1:.4f} \nRMSE: {2:.4f}'.format(mae, mse, rmse))

        models[fold] = model

    return models

In [38]:
# Reload the data (uses `columns` defined in an earlier cell) and fit one
# random forest per fold. This rebinds the notebook-level `models`, so any
# later cell that reads `models` now gets the random-forest models.
train, test, submission, features, labels = data_load(columns)
models = KfoldForest(train, 5)


MAE: 121.1460 
MSE: 70082.4526 
RMSE: 264.7309

MAE: 118.6476 
MSE: 63287.0983 
RMSE: 251.5693

MAE: 119.4404 
MSE: 54014.8691 
RMSE: 232.4110

MAE: 117.9968 
MSE: 56927.5535 
RMSE: 238.5950

MAE: 118.4614 
MSE: 57056.5039 
RMSE: 238.8650


In [59]:
# Average the per-fold predictions on `test` into the sample submission and
# save the result.
# NOTE(review): `models` and `test` are whatever the most recently executed
# training cell left in the kernel. In top-to-bottom order the last cell that
# assigns `models` is the RandomForest run, while the output file name says
# "lgbm" - confirm the intended models are in scope before running this.
submission = pd.read_csv('energy/sample_submission.csv', encoding='cp949')

# Weight every fold by 1/len(models) instead of a hard-coded 5, so the
# average stays correct when the number of folds changes.
n_models = len(models)
for fold_model in models.values():
    submission['answer'] += fold_model.predict(test) / n_models

submission.to_csv('lgbm_day3_1.csv', index=False)

In [67]:
# Average the per-fold predictions on `test` into the sample submission and
# save the result.
# NOTE(review): `models` and `test` are whatever the most recently executed
# training cell left in the kernel. In top-to-bottom order the last cell that
# assigns `models` is the RandomForest run, while the output file name says
# "lgbm" - confirm the intended models are in scope before running this.
submission = pd.read_csv('energy/sample_submission.csv', encoding='cp949')

# Weight every fold by 1/len(models) instead of a hard-coded 5, so the
# average stays correct when the number of folds changes.
n_models = len(models)
for fold_model in models.values():
    submission['answer'] += fold_model.predict(test) / n_models

submission.to_csv('lgbm_day3_2.csv', index=False)