In [15]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor


from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [16]:
# def smape(true, pred):
#     return np.mean((np.abs(true-pred))/(np.abs(true) + np.abs(pred)))

In [17]:
def data_load(columns, train_path='edit_train.csv', test_path='edit_test.csv',
              submission_path='energy/sample_submission.csv', target='전력사용량(kWh)'):
    """Read the train/test/submission CSVs and split train into features and labels.

    Parameters
    ----------
    columns : list of str
        Column names removed from both the train and the test frame.
    train_path, test_path, submission_path : str, optional
        CSV locations. The defaults are the paths this notebook has always used.
    target : str, optional
        Name of the label column inside the training frame.

    Returns
    -------
    tuple
        ``(train, test, submission, features, labels)``. ``train`` and ``test``
        have ``columns`` removed, ``features`` is ``train`` without ``target``
        and ``labels`` is a single-column DataFrame holding ``target``.

    Raises
    ------
    KeyError
        If any entry of ``columns`` (or ``target``) is absent from the frame
        it is dropped/selected from.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    # The sample submission is the only file read with an explicit cp949 encoding.
    submission = pd.read_csv(submission_path, encoding='cp949')

    train = train.drop(columns, axis=1)
    test = test.drop(columns, axis=1)

    features = train.drop(target, axis=1)
    # Double brackets keep the labels two-dimensional (one-column DataFrame).
    labels = train[[target]]

    return train, test, submission, features, labels

In [12]:
def Kfold(train, N=None, target='전력사용량(kWh)'):
    """Train one LGBMRegressor per shuffled K-fold split and print its validation errors.

    The feature matrix and the labels are derived from ``train`` itself, so
    the function no longer depends on notebook-level ``features``/``labels``
    variables being in sync with the frame that is passed in.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame containing the feature columns and the ``target`` column.
    N : int, optional
        Number of folds. ``None`` falls back to 5 folds.
    target : str, optional
        Name of the label column in ``train``.

    Returns
    -------
    dict
        Maps the 1-based fold number to the model fitted on that fold.
    """
    n_splits = 5 if N is None else N
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=0)

    features = train.drop(target, axis=1)
    labels = train[target]
    models = {}

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(train), start=1):
        print('\n ================== Fold {} =================='.format(fold))

        # split() yields positional indices, hence .iloc on both frames.
        X_train, X_valid = features.iloc[train_idx], features.iloc[valid_idx]
        y_train, y_valid = labels.iloc[train_idx], labels.iloc[valid_idx]

        model = LGBMRegressor(n_estimators=10000, random_state=0)
        # NOTE(review): the early_stopping_rounds/verbose fit kwargs were removed in
        # LightGBM 4.0 -- confirm the installed version or migrate to callbacks.
        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric= 'rmse', early_stopping_rounds=100, verbose=500)
        models[fold] = model

        pred = model.predict(X_valid)

        mae = mean_absolute_error(y_valid, pred)
        mse = mean_squared_error(y_valid, pred)
        rmse = np.sqrt(mse)

        print('='*30)
        print('MAE: {0:.4f} \nMSE: {1:.4f} \nRMSE: {2:.4f}'.format(mae, mse, rmse))
        print('='*30)

    return models

In [13]:
# Columns excluded from both train and test for this run.
columns = [
    'date_time',
    'perceived_temperature',
    'discomfort_index',
    'gmm_hour_assignment',
    'gmm_num_assignment',
]

# Reload the data without those columns, then fit one LightGBM model per fold (5 folds).
train, test, submission, features, labels = data_load(columns)
models = Kfold(train, 5)


Training until validation scores don't improve for 100 rounds
[500]	training's rmse: 203.682	training's l2: 41486.4	valid_1's rmse: 238.192	valid_1's l2: 56735.5
[1000]	training's rmse: 173.687	training's l2: 30167	valid_1's rmse: 223.536	valid_1's l2: 49968.3
[1500]	training's rmse: 156.303	training's l2: 24430.7	valid_1's rmse: 218.656	valid_1's l2: 47810.4
[2000]	training's rmse: 142.683	training's l2: 20358.3	valid_1's rmse: 214.577	valid_1's l2: 46043.4
[2500]	training's rmse: 132.577	training's l2: 17576.7	valid_1's rmse: 212.467	valid_1's l2: 45142.4
[3000]	training's rmse: 124.533	training's l2: 15508.4	valid_1's rmse: 211.257	valid_1's l2: 44629.6
Early stopping, best iteration is:
[3076]	training's rmse: 123.464	training's l2: 15243.5	valid_1's rmse: 211.082	valid_1's l2: 44555.6
MAE: 111.1085 
MSE: 44555.6451 
RMSE: 211.0821

Training until validation scores don't improve for 100 rounds
[500]	training's rmse: 206.154	training's l2: 42499.5	valid_1's rmse: 224.753	valid_1's 

In [14]:
# Same run as the previous cell, but dropping the non-"gmm_" assignment columns instead.
columns = [
    'date_time',
    'perceived_temperature',
    'discomfort_index',
    'hour_assignment',
    'num_assignment',
]

# Reload the data without those columns, then fit one LightGBM model per fold (5 folds).
train, test, submission, features, labels = data_load(columns)
models = Kfold(train, 5)


Training until validation scores don't improve for 100 rounds
[500]	training's rmse: 201.456	training's l2: 40584.6	valid_1's rmse: 235.506	valid_1's l2: 55463
[1000]	training's rmse: 171.211	training's l2: 29313.1	valid_1's rmse: 221.35	valid_1's l2: 48995.8
[1500]	training's rmse: 155.213	training's l2: 24091.2	valid_1's rmse: 216.876	valid_1's l2: 47035.2
[2000]	training's rmse: 142.159	training's l2: 20209.1	valid_1's rmse: 213.766	valid_1's l2: 45695.9
[2500]	training's rmse: 132.931	training's l2: 17670.7	valid_1's rmse: 211.941	valid_1's l2: 44918.9
[3000]	training's rmse: 124.764	training's l2: 15565.9	valid_1's rmse: 210.57	valid_1's l2: 44339.8
[3500]	training's rmse: 117.915	training's l2: 13904	valid_1's rmse: 209.889	valid_1's l2: 44053.6
Early stopping, best iteration is:
[3623]	training's rmse: 116.486	training's l2: 13568.9	valid_1's rmse: 209.777	valid_1's l2: 44006.3
MAE: 109.5050 
MSE: 44006.2554 
RMSE: 209.7767

Training until validation scores don't improve for 10

In [7]:
def s_data_load(columns, train_path='edit_train.csv', test_path='edit_test.csv',
                submission_path='energy/sample_submission.csv', target='전력사용량(kWh)'):
    """Read the train/test/submission CSVs and return the labels as a 1-D Series.

    Parameters
    ----------
    columns : list of str
        Column names removed from both the train and the test frame.
    train_path, test_path, submission_path : str, optional
        CSV locations. The defaults are the paths this notebook has always used.
    target : str, optional
        Name of the label column inside the training frame.

    Returns
    -------
    tuple
        ``(train, test, submission, features, labels)``. ``train`` and ``test``
        have ``columns`` removed, ``features`` is ``train`` without ``target``
        and ``labels`` is the ``target`` column as a pandas Series.

    Raises
    ------
    KeyError
        If any entry of ``columns`` (or ``target``) is absent from the frame
        it is dropped/selected from.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    # The sample submission is the only file read with an explicit cp949 encoding.
    submission = pd.read_csv(submission_path, encoding='cp949')

    train = train.drop(columns, axis=1)
    test = test.drop(columns, axis=1)

    features = train.drop(target, axis=1)
    # Single brackets: the labels come back as a Series, not a one-column frame.
    labels = train[target]

    return train, test, submission, features, labels

In [8]:
def sKfold(train, N=None, target='전력사용량(kWh)'):
    """Train one LGBMRegressor per stratified fold (stratified on the ``num`` column).

    The feature matrix and the labels are derived from ``train`` itself, so
    the function no longer depends on notebook-level ``features``/``labels``
    variables being in sync with the frame that is passed in.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame containing the feature columns (including ``num``)
        and the ``target`` column.
    N : int, optional
        Number of folds. ``None`` falls back to 5 folds.
    target : str, optional
        Name of the label column in ``train``.

    Returns
    -------
    dict
        Maps the 1-based fold number to the model fitted on that fold.
    """
    n_splits = 5 if N is None else N
    # No random_state: the splitter is not shuffled, and scikit-learn rejects a
    # seed on an unshuffled splitter (older releases silently ignored it), so
    # the resulting folds are the same as before.
    skfold = StratifiedKFold(n_splits=n_splits)

    features = train.drop(target, axis=1)
    labels = train[target]
    models = {}

    for fold, (train_idx, valid_idx) in enumerate(skfold.split(features, features['num']), start=1):
        print('\n ================== sKFold {} =================='.format(fold))

        # split() yields positional indices; .iloc keeps the label lookup
        # positional even when the frame's index is not 0..n-1.
        X_train, X_valid = features.iloc[train_idx], features.iloc[valid_idx]
        y_train, y_valid = labels.iloc[train_idx], labels.iloc[valid_idx]

        model = LGBMRegressor(n_estimators=10000, random_state=0)
        # NOTE(review): the early_stopping_rounds/verbose fit kwargs were removed in
        # LightGBM 4.0 -- confirm the installed version or migrate to callbacks.
        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric= 'rmse', early_stopping_rounds=30, verbose=500)
        models[fold] = model

        pred = model.predict(X_valid)

        mae = mean_absolute_error(y_valid, pred)
        mse = mean_squared_error(y_valid, pred)
        rmse = np.sqrt(mse)

        print('='*30)
        print('MAE: {0:.4f} \nMSE: {1:.4f} \nRMSE: {2:.4f}'.format(mae, mse, rmse))
        print('='*30)

    return models

In [9]:
# Columns excluded from both train and test for this run.
columns = [
    'date_time',
    'perceived_temperature',
    'discomfort_index',
    'gmm_hour_assignment',
    'gmm_num_assignment',
]

# s_data_load returns the labels as a Series; sKfold then fits one model per stratified fold.
train, test, submission, features, labels = s_data_load(columns)
models = sKfold(train, 5)


Training until validation scores don't improve for 30 rounds
[500]	training's rmse: 197.629	training's l2: 39057.3	valid_1's rmse: 301.987	valid_1's l2: 91195.9
Early stopping, best iteration is:
[800]	training's rmse: 176.135	training's l2: 31023.5	valid_1's rmse: 297.27	valid_1's l2: 88369.4
MAE: 163.4508 
MSE: 88369.3634 
RMSE: 297.2698

Training until validation scores don't improve for 30 rounds
[500]	training's rmse: 210.058	training's l2: 44124.4	valid_1's rmse: 222.281	valid_1's l2: 49409
Early stopping, best iteration is:
[675]	training's rmse: 196.044	training's l2: 38433.1	valid_1's rmse: 219.125	valid_1's l2: 48015.7
MAE: 132.3548 
MSE: 48015.6788 
RMSE: 219.1248

Training until validation scores don't improve for 30 rounds
[500]	training's rmse: 206.667	training's l2: 42711.2	valid_1's rmse: 244.002	valid_1's l2: 59537
Early stopping, best iteration is:
[628]	training's rmse: 194.824	training's l2: 37956.6	valid_1's rmse: 241.254	valid_1's l2: 58203.4
MAE: 136.2680 
MSE: 

In [11]:
# Same stratified run, but dropping the non-"gmm_" assignment columns instead.
columns = [
    'date_time',
    'perceived_temperature',
    'discomfort_index',
    'hour_assignment',
    'num_assignment',
]

# s_data_load returns the labels as a Series; sKfold then fits one model per stratified fold.
train, test, submission, features, labels = s_data_load(columns)
models = sKfold(train, 5)


Training until validation scores don't improve for 30 rounds
[500]	training's rmse: 196.898	training's l2: 38769	valid_1's rmse: 303.109	valid_1's l2: 91875.2
Early stopping, best iteration is:
[755]	training's rmse: 176.359	training's l2: 31102.7	valid_1's rmse: 298.24	valid_1's l2: 88947.3
MAE: 163.7473 
MSE: 88947.3227 
RMSE: 298.2404

Training until validation scores don't improve for 30 rounds
[500]	training's rmse: 211.217	training's l2: 44612.4	valid_1's rmse: 222.491	valid_1's l2: 49502.3
Early stopping, best iteration is:
[819]	training's rmse: 185.975	training's l2: 34586.8	valid_1's rmse: 216.548	valid_1's l2: 46892.9
MAE: 130.8452 
MSE: 46892.8919 
RMSE: 216.5477

Training until validation scores don't improve for 30 rounds
[500]	training's rmse: 205.283	training's l2: 42141.3	valid_1's rmse: 247.501	valid_1's l2: 61256.5
Early stopping, best iteration is:
[785]	training's rmse: 184.123	training's l2: 33901.3	valid_1's rmse: 243.404	valid_1's l2: 59245.6
MAE: 135.6657 
MSE

In [None]:
def KfoldForest(train, N=None, target='전력사용량(kWh)'):
    """Train one RandomForestRegressor per shuffled K-fold split and print its validation errors.

    The feature matrix and the labels are derived from ``train`` itself, so
    the function no longer depends on notebook-level ``features``/``labels``
    variables being in sync with the frame that is passed in. The labels are
    handed to the forest as a 1-D Series rather than a one-column frame.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame containing the feature columns and the ``target`` column.
    N : int, optional
        Number of folds. ``None`` falls back to 5 folds.
    target : str, optional
        Name of the label column in ``train``.

    Returns
    -------
    dict
        Maps the 1-based fold number to the model fitted on that fold.
    """
    n_splits = 5 if N is None else N
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=0)

    features = train.drop(target, axis=1)
    labels = train[target]
    models = {}

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(train), start=1):
        print('\n ================== Fold {} =================='.format(fold))

        # split() yields positional indices, hence .iloc on both frames.
        X_train, X_valid = features.iloc[train_idx], features.iloc[valid_idx]
        y_train, y_valid = labels.iloc[train_idx], labels.iloc[valid_idx]

        model = RandomForestRegressor(random_state=0)
        model.fit(X_train, y_train)
        pred = model.predict(X_valid)

        mae = mean_absolute_error(y_valid, pred)
        mse = mean_squared_error(y_valid, pred)
        rmse = np.sqrt(mse)

        print('MAE: {0:.4f} \nMSE: {1:.4f} \nRMSE: {2:.4f}'.format(mae, mse, rmse))

        models[fold] = model

    return models

In [None]:
# Columns excluded from both train and test for this run.
columns = [
    'date_time',
    'perceived_temperature',
    'discomfort_index',
    'gmm_hour_assignment',
    'gmm_num_assignment',
]

# Reload the data without those columns, then fit one random forest per fold (5 folds).
train, test, submission, features, labels = data_load(columns)
models = KfoldForest(train, 5)

In [None]:
# Same random-forest run, but dropping the non-"gmm_" assignment columns instead.
columns = [
    'date_time',
    'perceived_temperature',
    'discomfort_index',
    'hour_assignment',
    'num_assignment',
]

# Reload the data without those columns, then fit one random forest per fold (5 folds).
train, test, submission, features, labels = data_load(columns)
models = KfoldForest(train, 5)

In [None]:
# def KfoldModel(train, N=None, model=None):
#     kfold = KFold(n_splits=N, shuffle=True, random_state=0)
#     fold = 1
#     models = {}

#     for train_idx, valid_idx in kfold.split(train):
#         print('\n ================== Fold {} =================='.format(fold))

#         X_train, X_valid = features.iloc[train_idx, :], features.iloc[valid_idx, :]
#         y_train, y_valid = labels.iloc[train_idx, :], labels.iloc[valid_idx, :]
        
#         model.fit(X_train, y_train)
#         pred = model.predict(X_valid)
        
#         mae = mean_absolute_error(y_valid, pred)
#         mse = mean_squared_error(y_valid, pred)
#         rmse = np.sqrt(mean_squared_error(y_valid, pred))
        
#         print('MAE: {0:.4f} \nMSE: {1:.4f} \nRMSE: {2:.4f}'.format(mae, mse, rmse))
        
#         models[fold] = model

#         fold += 1
    
#     return models

In [None]:
# columns = ['date_time', 'perceived_temperature', 'discomfort_index']
# train, test, submission, features, labels = data_load(columns)

# model = [SVR(), KNeighborsRegressor()]
# dic = {}

# for m in model:
#     print('\nModel {}   '.format(str(m)[:-2]))
#     models = KfoldModel(train, 5, m)
#     dic[str(m)[:-2]] = models