In [4]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor


from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [5]:
# train = pd.read_csv('edit_train.csv')
# test = pd.read_csv('edit_test.csv')
# submission = pd.read_csv('energy/sample_submission.csv', encoding='cp949')

In [6]:
def data_load(columns):
    """Read the train / test / sample-submission CSVs and split off the target.

    Parameters
    ----------
    columns : label or list of labels
        Columns removed from both the train and the test frame.

    Returns
    -------
    tuple
        ``(train, test, submission, features, labels)``. ``train`` and ``test``
        have ``columns`` dropped, ``features`` is ``train`` without the target
        column, and ``labels`` is a one-column DataFrame holding the target.
    """
    target = '전력사용량(kWh)'

    # Read everything first, then prune, so a bad path fails before any drop.
    raw_train = pd.read_csv('edit_train.csv')
    raw_test = pd.read_csv('edit_test.csv')
    submission = pd.read_csv('energy/sample_submission.csv', encoding='cp949')

    train = raw_train.drop(columns=columns)
    test = raw_test.drop(columns=columns)

    # Double brackets keep the labels as a DataFrame rather than a Series.
    labels = train.loc[:, [target]]
    features = train.drop(columns=target)

    return train, test, submission, features, labels

## KFold

In [27]:
def Kfold(train, N=None, target='전력사용량(kWh)'):
    """Fit one LightGBM regressor per shuffled K-fold split of ``train``.

    Features and labels are derived from ``train`` itself (every column except
    ``target`` is a feature), so the result does not depend on notebook-level
    ``features`` / ``labels`` variables being in sync with the frame passed in.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame that contains the ``target`` column.
    N : int, optional
        Number of folds. ``None`` falls back to 5, the value used by the
        notebook's calls.
    target : str, optional
        Name of the label column.

    Returns
    -------
    dict
        ``{fold_number: fitted LGBMRegressor}``, fold numbers starting at 1.
    """
    n_splits = 5 if N is None else N

    features = train.drop(columns=[target])
    # 1-D labels: regressors expect a vector, not an (n, 1) frame.
    labels = train[target]

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    models = {}

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(features), start=1):
        print('\n ================== Fold {} =================='.format(fold))

        X_train, X_valid = features.iloc[train_idx], features.iloc[valid_idx]
        y_train, y_valid = labels.iloc[train_idx], labels.iloc[valid_idx]

        # NOTE(review): early_stopping_rounds / verbose as fit() kwargs are only
        # accepted by older LightGBM releases; newer ones expect callbacks.
        # Confirm against the installed version before upgrading.
        model = LGBMRegressor(n_estimators=10000, random_state=0)
        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric= 'rmse', early_stopping_rounds=100, verbose=500)
        models[fold] = model

        pred = model.predict(X_valid)

        mae = mean_absolute_error(y_valid, pred)
        mse = mean_squared_error(y_valid, pred)
        rmse = np.sqrt(mse)  # reuse the MSE instead of recomputing it

        print('='*30)
        print('MAE: {0:.4f} \nMSE: {1:.4f} \nRMSE: {2:.4f}'.format(mae, mse, rmse))
        print('='*30)

    return models

In [28]:
# Show the column names of the training frame currently held in the kernel.
# NOTE(review): `train` is first assigned in a later cell of this notebook, so
# this cell depends on a previous kernel run - confirm under Restart & Run All.
train.columns

Index(['num', '전력사용량(kWh)', '기온(°C)', '풍속(m/s)', '습도(%)', '강수량(mm)', '일조(hr)',
       '비전기냉방설비운영', '태양광보유', 'hour', 'weekday', 'holiday',
       'perceived_temperature', 'discomfort_index', 'ref_day',
       'hour_assignment', 'gmm_num_assignment'],
      dtype='object')

In [29]:
# Columns dropped from both the train and the test frame before modelling.
columns = ['date_time']

# Reload the data, then fit one LightGBM model per fold (5 shuffled folds).
train, test, submission, features, labels = data_load(columns)
models = Kfold(train, 5)


Training until validation scores don't improve for 100 rounds
[500]	training's rmse: 181.166	training's l2: 32821.3	valid_1's rmse: 205.574	valid_1's l2: 42260.5
[1000]	training's rmse: 146.075	training's l2: 21337.8	valid_1's rmse: 186.351	valid_1's l2: 34726.6
[1500]	training's rmse: 125.686	training's l2: 15797	valid_1's rmse: 178.099	valid_1's l2: 31719.3
[2000]	training's rmse: 112.176	training's l2: 12583.4	valid_1's rmse: 173.976	valid_1's l2: 30267.7
[2500]	training's rmse: 101.763	training's l2: 10355.8	valid_1's rmse: 171.439	valid_1's l2: 29391.2
[3000]	training's rmse: 93.2564	training's l2: 8696.76	valid_1's rmse: 169.576	valid_1's l2: 28756
[3500]	training's rmse: 86.0226	training's l2: 7399.89	valid_1's rmse: 168.24	valid_1's l2: 28304.5
[4000]	training's rmse: 80.1308	training's l2: 6420.95	valid_1's rmse: 167.354	valid_1's l2: 28007.3
[4500]	training's rmse: 74.5037	training's l2: 5550.8	valid_1's rmse: 166.63	valid_1's l2: 27765.6
[5000]	training's rmse: 69.6434	trai

## StratifiedKFold

In [21]:
def sKfold(train, N=None, target='전력사용량(kWh)'):
    """Fit one LightGBM regressor per stratified fold, stratifying on ``num``.

    Features and labels are derived from ``train`` itself (every column except
    ``target`` is a feature), so the result does not depend on notebook-level
    ``features`` / ``labels`` variables being in sync with the frame passed in.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame that contains the ``target`` column and a ``num``
        column, which is used as the stratification key.
    N : int, optional
        Number of folds. ``None`` falls back to 5, the value used by the
        notebook's calls.
    target : str, optional
        Name of the label column.

    Returns
    -------
    dict
        ``{fold_number: fitted LGBMRegressor}``, fold numbers starting at 1.
    """
    n_splits = 5 if N is None else N

    features = train.drop(columns=[target])
    # 1-D labels: regressors expect a vector, not an (n, 1) frame.
    labels = train[target]

    skfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    models = {}

    # Stratifying on 'num' gives every fold the same share of rows per 'num' value.
    splits = skfold.split(features, features['num'])
    for fold, (train_idx, valid_idx) in enumerate(splits, start=1):
        print('\n ================== sKFold {} =================='.format(fold))

        X_train, X_valid = features.iloc[train_idx], features.iloc[valid_idx]
        y_train, y_valid = labels.iloc[train_idx], labels.iloc[valid_idx]

        # NOTE(review): early_stopping_rounds / verbose as fit() kwargs are only
        # accepted by older LightGBM releases; newer ones expect callbacks.
        # Confirm against the installed version before upgrading.
        model = LGBMRegressor(n_estimators=10000, random_state=0)
        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric= 'rmse', early_stopping_rounds=30, verbose=500)
        models[fold] = model

        pred = model.predict(X_valid)

        mae = mean_absolute_error(y_valid, pred)
        mse = mean_squared_error(y_valid, pred)
        rmse = np.sqrt(mse)  # reuse the MSE instead of recomputing it

        print('='*30)
        print('MAE: {0:.4f} \nMSE: {1:.4f} \nRMSE: {2:.4f}'.format(mae, mse, rmse))
        print('='*30)

    return models

In [22]:
# Reload the data (reusing the notebook-level `columns` drop list), then fit
# one LightGBM model per stratified fold (5 folds).
train, test, submission, features, labels = data_load(columns)
models = sKfold(train, 5)


Training until validation scores don't improve for 30 rounds
[500]	training's rmse: 179.894	training's l2: 32362	valid_1's rmse: 208.694	valid_1's l2: 43553.4
[1000]	training's rmse: 145.059	training's l2: 21042.2	valid_1's rmse: 189.995	valid_1's l2: 36098.2
[1500]	training's rmse: 125.867	training's l2: 15842.6	valid_1's rmse: 182.954	valid_1's l2: 33472
[2000]	training's rmse: 112.119	training's l2: 12570.7	valid_1's rmse: 179.164	valid_1's l2: 32099.8
[2500]	training's rmse: 101.415	training's l2: 10285.1	valid_1's rmse: 176.552	valid_1's l2: 31170.8
[3000]	training's rmse: 92.4616	training's l2: 8549.15	valid_1's rmse: 174.333	valid_1's l2: 30391.8
Early stopping, best iteration is:
[3335]	training's rmse: 87.8491	training's l2: 7717.47	valid_1's rmse: 173.683	valid_1's l2: 30165.7
MAE: 101.9673 
MSE: 30165.6612 
RMSE: 173.6826

Training until validation scores don't improve for 30 rounds
[500]	training's rmse: 177.604	training's l2: 31543	valid_1's rmse: 202.433	valid_1's l2: 40

## RandomForest

In [13]:
def KfoldForest(train, N=None, target='전력사용량(kWh)'):
    """Fit one random-forest regressor per shuffled K-fold split of ``train``.

    Features and labels are derived from ``train`` itself (every column except
    ``target`` is a feature), so the result does not depend on notebook-level
    ``features`` / ``labels`` variables being in sync with the frame passed in.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame that contains the ``target`` column.
    N : int, optional
        Number of folds. ``None`` falls back to 5, the value used by the
        notebook's calls.
    target : str, optional
        Name of the label column.

    Returns
    -------
    dict
        ``{fold_number: fitted RandomForestRegressor}``, fold numbers starting
        at 1.
    """
    n_splits = 5 if N is None else N

    features = train.drop(columns=[target])
    # 1-D labels: regressors expect a vector, not an (n, 1) frame.
    labels = train[target]

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    models = {}

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(features), start=1):
        print('\n ================== Fold {} =================='.format(fold))

        X_train, X_valid = features.iloc[train_idx], features.iloc[valid_idx]
        y_train, y_valid = labels.iloc[train_idx], labels.iloc[valid_idx]

        model = RandomForestRegressor(random_state=0)
        model.fit(X_train, y_train)
        pred = model.predict(X_valid)

        mae = mean_absolute_error(y_valid, pred)
        mse = mean_squared_error(y_valid, pred)
        rmse = np.sqrt(mse)  # reuse the MSE instead of recomputing it

        print('MAE: {0:.4f} \nMSE: {1:.4f} \nRMSE: {2:.4f}'.format(mae, mse, rmse))

        models[fold] = model

    return models

In [31]:
# Reload the data (reusing the notebook-level `columns` drop list), then fit
# one random-forest model per fold (5 shuffled folds).
train, test, submission, features, labels = data_load(columns)
models = KfoldForest(train, 5)


MAE: 88.9583 
MSE: 32198.0299 
RMSE: 179.4381

MAE: 94.4796 
MSE: 48452.1476 
RMSE: 220.1185

MAE: 91.2638 
MSE: 31570.0613 
RMSE: 177.6797

MAE: 89.8084 
MSE: 35248.1979 
RMSE: 187.7450

MAE: 89.8402 
MSE: 30111.9460 
RMSE: 173.5279


In [7]:
def xgb_Kfold(train, N=None, target='전력사용량(kWh)'):
    """Fit one XGBoost regressor per shuffled K-fold split of ``train``.

    Features and labels are derived from ``train`` itself (every column except
    ``target`` is a feature), so the result does not depend on notebook-level
    ``features`` / ``labels`` variables being in sync with the frame passed in.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame that contains the ``target`` column.
    N : int, optional
        Number of folds. ``None`` falls back to 5, the value used by the
        notebook's calls.
    target : str, optional
        Name of the label column.

    Returns
    -------
    dict
        ``{fold_number: fitted XGBRegressor}``, fold numbers starting at 1.
    """
    n_splits = 5 if N is None else N

    features = train.drop(columns=[target])
    # 1-D labels: regressors expect a vector, not an (n, 1) frame.
    labels = train[target]

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    models = {}

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(features), start=1):
        print('\n ================== Fold {} =================='.format(fold))

        X_train, X_valid = features.iloc[train_idx], features.iloc[valid_idx]
        y_train, y_valid = labels.iloc[train_idx], labels.iloc[valid_idx]

        # NOTE(review): eval_metric / early_stopping_rounds as fit() kwargs are
        # tied to older XGBoost releases; newer ones take them in the
        # constructor. Confirm against the installed version before upgrading.
        model = XGBRegressor(n_estimators=10000, random_state=0)
        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric= 'rmse', early_stopping_rounds=100, verbose=500)
        models[fold] = model

        pred = model.predict(X_valid)

        mae = mean_absolute_error(y_valid, pred)
        mse = mean_squared_error(y_valid, pred)
        rmse = np.sqrt(mse)  # reuse the MSE instead of recomputing it

        print('='*30)
        print('MAE: {0:.4f} \nMSE: {1:.4f} \nRMSE: {2:.4f}'.format(mae, mse, rmse))
        print('='*30)

    return models

In [9]:
# Columns dropped from both the train and the test frame before modelling.
columns = ['date_time']

# Reload the data, then fit one XGBoost model per fold (5 shuffled folds).
train, test, submission, features, labels = data_load(columns)
models = xgb_Kfold(train, 5)


[0]	validation_0-rmse:2247.24927	validation_1-rmse:2256.78711
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[500]	validation_0-rmse:130.78369	validation_1-rmse:206.19284
[1000]	validation_0-rmse:93.39452	validation_1-rmse:200.37775
[1500]	validation_0-rmse:72.16432	validation_1-rmse:198.74883
[2000]	validation_0-rmse:56.46673	validation_1-rmse:197.77647
[2500]	validation_0-rmse:45.91515	validation_1-rmse:197.36523
Stopping. Best iteration:
[2584]	validation_0-rmse:44.43705	validation_1-rmse:197.33546

MAE: 121.0998 
MSE: 38941.2813 
RMSE: 197.3355

[0]	validation_0-rmse:2252.35083	validation_1-rmse:2230.90405
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[500]	validation_0-rmse:128.20734	validation_1-rmse:238.43959
[1000]	validation_0-rmse:92.28738	validation_1

In [10]:
def xgb_sKfold(train, N=None, target='전력사용량(kWh)'):
    """Fit one XGBoost regressor per stratified fold, stratifying on ``num``.

    Features and labels are derived from ``train`` itself (every column except
    ``target`` is a feature), so the result does not depend on notebook-level
    ``features`` / ``labels`` variables being in sync with the frame passed in.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame that contains the ``target`` column and a ``num``
        column, which is used as the stratification key.
    N : int, optional
        Number of folds. ``None`` falls back to 5, the value used by the
        notebook's calls.
    target : str, optional
        Name of the label column.

    Returns
    -------
    dict
        ``{fold_number: fitted XGBRegressor}``, fold numbers starting at 1.
    """
    n_splits = 5 if N is None else N

    features = train.drop(columns=[target])
    # 1-D labels: regressors expect a vector, not an (n, 1) frame.
    labels = train[target]

    skfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    models = {}

    # Stratifying on 'num' gives every fold the same share of rows per 'num' value.
    splits = skfold.split(features, features['num'])
    for fold, (train_idx, valid_idx) in enumerate(splits, start=1):
        print('\n ================== sKFold {} =================='.format(fold))

        X_train, X_valid = features.iloc[train_idx], features.iloc[valid_idx]
        y_train, y_valid = labels.iloc[train_idx], labels.iloc[valid_idx]

        # NOTE(review): eval_metric / early_stopping_rounds as fit() kwargs are
        # tied to older XGBoost releases; newer ones take them in the
        # constructor. Confirm against the installed version before upgrading.
        model = XGBRegressor(n_estimators=10000, random_state=0)
        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric= 'rmse', early_stopping_rounds=30, verbose=500)
        models[fold] = model

        pred = model.predict(X_valid)

        mae = mean_absolute_error(y_valid, pred)
        mse = mean_squared_error(y_valid, pred)
        rmse = np.sqrt(mse)  # reuse the MSE instead of recomputing it

        print('='*30)
        print('MAE: {0:.4f} \nMSE: {1:.4f} \nRMSE: {2:.4f}'.format(mae, mse, rmse))
        print('='*30)

    return models

In [None]:
# Reload the data (reusing the notebook-level `columns` drop list), then fit
# one XGBoost model per stratified fold (5 folds).
# NOTE(review): this cell has no execution count in the saved notebook, so it
# appears not to have been run - confirm before relying on its `models`.
train, test, submission, features, labels = data_load(columns)
models = xgb_sKfold(train, 5)

In [32]:
# Build the submission: average the per-fold predictions on `test` and save.
# NOTE(review): `models` is whichever cross-validation cell ran last in the
# kernel, so confirm it matches the model family named in the output file.
submission = pd.read_csv('energy/sample_submission.csv', encoding='cp949')

# Average over however many fold models exist instead of assuming exactly 5,
# and assign the result so it does not depend on the sample file's initial
# 'answer' values.
fold_predictions = [model.predict(test) for model in models.values()]
submission['answer'] = np.mean(fold_predictions, axis=0)

submission.to_csv('lgbm_day4_3.csv', index=False)