In [66]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

import xgboost as xgb
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [54]:
# Column names that data_load() drops from both the train and the test frame.
columns = ["date_time"]

In [55]:
def data_load(columns):
    """Read the prepared CSV files and split the training frame into X / y.

    Parameters
    ----------
    columns : list of str
        Column names removed from both the train and the test frame.

    Returns
    -------
    tuple
        ``(train, test, submission, features, labels)`` where ``features`` is
        ``train`` without the target column and ``labels`` is a one-column
        DataFrame holding only the target column.
    """
    target_col = '전력사용량(kWh)'

    raw_train = pd.read_csv('edit_train.csv')
    raw_test = pd.read_csv('edit_test.csv')
    # The sample submission is read with the cp949 codec.
    submission = pd.read_csv('energy/sample_submission.csv', encoding='cp949')

    train = raw_train.drop(columns=columns)
    test = raw_test.drop(columns=columns)

    features = train.drop(columns=target_col)
    # Double brackets keep the labels as a DataFrame rather than a Series.
    labels = train.loc[:, [target_col]]

    return train, test, submission, features, labels

In [56]:
def lgbm_sKfold(train, N=None, target='전력사용량(kWh)'):
    """Fit one LightGBM regressor per stratified fold and return the fitted models.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame containing the feature columns (including ``'num'``,
        which is used as the stratification key) and the ``target`` column.
    N : int, optional
        Number of folds. ``None`` falls back to 5, because
        ``StratifiedKFold`` does not accept ``n_splits=None``.
    target : str, optional
        Name of the label column inside ``train``.

    Returns
    -------
    dict
        Maps the fold number (starting at 1) to the model fitted on that fold.
    """
    n_splits = 5 if N is None else N

    # Build X / y from the argument instead of reading notebook-level globals,
    # so the result depends only on what the caller passes in.
    features = train.drop(columns=target)
    labels = train[target]  # 1-D target

    skfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    models = {}

    for fold, (train_idx, valid_idx) in enumerate(skfold.split(features, features['num']), start=1):
        print('\n ================== sKFold {} =================='.format(fold))

        X_train, X_valid = features.iloc[train_idx], features.iloc[valid_idx]
        y_train, y_valid = labels.iloc[train_idx], labels.iloc[valid_idx]

        model = LGBMRegressor(n_estimators=10000, random_state=0)
        # NOTE(review): early_stopping_rounds / verbose as fit() keywords depend on the
        # installed LightGBM version - confirm before upgrading the library.
        model.fit(
            X_train, y_train,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            eval_metric='rmse',
            early_stopping_rounds=50,
            verbose=500,
        )
        models[fold] = model

        pred = model.predict(X_valid)

        mae = mean_absolute_error(y_valid, pred)
        mse = mean_squared_error(y_valid, pred)
        rmse = np.sqrt(mse)

        print('='*30)
        print('MAE: {0:.4f} \nMSE: {1:.4f} \nRMSE: {2:.4f}'.format(mae, mse, rmse))
        print('='*30)

    return models

In [60]:
def xgb_sKfold(train, N=None, target='전력사용량(kWh)'):
    """Fit one XGBoost regressor per stratified fold and return the fitted models.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame containing the feature columns (including ``'num'``,
        which is used as the stratification key) and the ``target`` column.
    N : int, optional
        Number of folds. ``None`` falls back to 5, because
        ``StratifiedKFold`` does not accept ``n_splits=None``.
    target : str, optional
        Name of the label column inside ``train``.

    Returns
    -------
    dict
        Maps the fold number (starting at 1) to the model fitted on that fold.
    """
    n_splits = 5 if N is None else N

    # Build X / y from the argument instead of reading notebook-level globals,
    # so the result depends only on what the caller passes in.
    features = train.drop(columns=target)
    labels = train[target]  # 1-D target

    skfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    models = {}

    for fold, (train_idx, valid_idx) in enumerate(skfold.split(features, features['num']), start=1):
        print('\n ================== sKFold {} =================='.format(fold))

        X_train, X_valid = features.iloc[train_idx], features.iloc[valid_idx]
        y_train, y_valid = labels.iloc[train_idx], labels.iloc[valid_idx]

        model = xgb.XGBRegressor(n_estimators=10000, random_state=0)
        # NOTE(review): eval_metric / early_stopping_rounds as fit() keywords depend on the
        # installed XGBoost version - confirm before upgrading the library.
        model.fit(
            X_train, y_train,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            eval_metric='rmse',
            early_stopping_rounds=50,
            verbose=500,
        )
        models[fold] = model

        pred = model.predict(X_valid)

        mae = mean_absolute_error(y_valid, pred)
        mse = mean_squared_error(y_valid, pred)
        rmse = np.sqrt(mse)

        print('='*30)
        print('MAE: {0:.4f} \nMSE: {1:.4f} \nRMSE: {2:.4f}'.format(mae, mse, rmse))
        print('='*30)

    return models

In [67]:
def rf_sKfold(train, N=None, target='전력사용량(kWh)'):
    """Fit one random-forest regressor per stratified fold and return the fitted models.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame containing the feature columns (including ``'num'``,
        which is used as the stratification key) and the ``target`` column.
    N : int, optional
        Number of folds. ``None`` falls back to 5, because
        ``StratifiedKFold`` does not accept ``n_splits=None``.
    target : str, optional
        Name of the label column inside ``train``.

    Returns
    -------
    dict
        Maps the fold number (starting at 1) to the model fitted on that fold.
    """
    n_splits = 5 if N is None else N

    # Build X / y from the argument instead of reading notebook-level globals,
    # so the result depends only on what the caller passes in.
    features = train.drop(columns=target)
    labels = train[target]  # 1-D target, the shape scikit-learn regressors expect

    skfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    models = {}

    for fold, (train_idx, valid_idx) in enumerate(skfold.split(features, features['num']), start=1):
        print('\n ================== sKFold {} =================='.format(fold))

        X_train, X_valid = features.iloc[train_idx], features.iloc[valid_idx]
        y_train, y_valid = labels.iloc[train_idx], labels.iloc[valid_idx]

        # n_jobs=-1 builds the trees on all cores; random_state keeps the fit reproducible.
        model = RandomForestRegressor(n_estimators=500, random_state=0, n_jobs=-1)
        model.fit(X_train, y_train)
        # Keep the fitted model: without this the function returned an empty dict.
        models[fold] = model

        pred = model.predict(X_valid)

        mae = mean_absolute_error(y_valid, pred)
        mse = mean_squared_error(y_valid, pred)
        rmse = np.sqrt(mse)

        print('='*30)
        print('MAE: {0:.4f} \nMSE: {1:.4f} \nRMSE: {2:.4f}'.format(mae, mse, rmse))
        print('='*30)

    return models

In [59]:
# Load the prepared frames and fit one LightGBM model per fold (5 folds).
train, test, submission, features, labels = data_load(columns)
# The trainer is defined as lgbm_sKfold; the former name `sKfold` is not defined in this
# notebook and raises NameError on a fresh kernel.
lgbm_models = lgbm_sKfold(train, 5)


Training until validation scores don't improve for 50 rounds
[500]	training's rmse: 168.879	training's l2: 28520	valid_1's rmse: 205.207	valid_1's l2: 42109.9
[1000]	training's rmse: 137.899	training's l2: 19016.2	valid_1's rmse: 189.35	valid_1's l2: 35853.3
[1500]	training's rmse: 119.709	training's l2: 14330.2	valid_1's rmse: 182.658	valid_1's l2: 33363.9
[2000]	training's rmse: 107.247	training's l2: 11501.9	valid_1's rmse: 179.062	valid_1's l2: 32063.3
[2500]	training's rmse: 98.2856	training's l2: 9660.05	valid_1's rmse: 176.774	valid_1's l2: 31248.9
[3000]	training's rmse: 90.4979	training's l2: 8189.87	valid_1's rmse: 174.848	valid_1's l2: 30571.9
[3500]	training's rmse: 83.698	training's l2: 7005.35	valid_1's rmse: 173.561	valid_1's l2: 30123.3
[4000]	training's rmse: 77.9714	training's l2: 6079.54	valid_1's rmse: 172.541	valid_1's l2: 29770.5
[4500]	training's rmse: 72.965	training's l2: 5323.89	valid_1's rmse: 171.711	valid_1's l2: 29484.7
[5000]	training's rmse: 68.46	train

In [62]:
# Load the prepared frames and fit one XGBoost model per fold (5 folds).
train, test, submission, features, labels = data_load(columns)
xgb_models = xgb_sKfold(train, N=5)


[0]	validation_0-rmse:2212.52759	validation_1-rmse:2201.59717
[500]	validation_0-rmse:113.15136	validation_1-rmse:198.55585
[1000]	validation_0-rmse:82.70358	validation_1-rmse:194.00102
[1437]	validation_0-rmse:66.79118	validation_1-rmse:192.85071
MAE: 106.3645 
MSE: 37173.9907 
RMSE: 192.8056

[0]	validation_0-rmse:2208.79175	validation_1-rmse:2210.80420
[500]	validation_0-rmse:114.07649	validation_1-rmse:210.87251
[1000]	validation_0-rmse:82.37148	validation_1-rmse:205.96202
[1382]	validation_0-rmse:67.79047	validation_1-rmse:205.13373
MAE: 108.2642 
MSE: 42057.2240 
RMSE: 205.0786

[0]	validation_0-rmse:2208.20459	validation_1-rmse:2212.38550
[500]	validation_0-rmse:113.83881	validation_1-rmse:222.46339
[1000]	validation_0-rmse:82.66311	validation_1-rmse:216.94667
[1500]	validation_0-rmse:63.81551	validation_1-rmse:215.14682
[1592]	validation_0-rmse:61.61703	validation_1-rmse:215.08632
MAE: 110.9533 
MSE: 46235.2086 
RMSE: 215.0237

[0]	validation_0-rmse:2207.89233	validation_1-rms

In [68]:
# Load the prepared frames and fit one random-forest model per fold (5 folds).
train, test, submission, features, labels = data_load(columns)
rf_models = rf_sKfold(train, N=5)


MAE: 115.3020 
MSE: 58696.9433 
RMSE: 242.2745



KeyboardInterrupt: 

In [74]:
submission = pd.read_csv('energy/sample_submission.csv', encoding='cp949')

# Fold models whose predictions are averaged for the submission.
# Swap in lgbm_models (or another {fold: model} dict) to submit a different ensemble.
ensemble_models = xgb_models
if not ensemble_models:
    raise ValueError('no fitted fold models to average')

# Assign the fold mean outright instead of accumulating with `+=`: the result then does not
# depend on the placeholder values stored in the sample file or on a hard-coded fold count.
fold_predictions = [model.predict(test) for model in ensemble_models.values()]
submission['answer'] = np.mean(fold_predictions, axis=0)

# NOTE(review): the file name says "lgbm" although XGBoost predictions are written here;
# the name is kept unchanged - confirm whether it should be renamed.
submission.to_csv('lgbm_day6_3.csv', index=False)