# 변수변환이 된 데이터를 사용하며, 0이 아닌 데이터를 기준으로 cross validation을 해주어 최적의 모델을 적용한다. 또한 AutoCorrelation을 제거하기 위해 Shuffle해준다.


In [10]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go

from datetime import timedelta

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score

## R의 influence함수를 기준으로 influence 데이터들을 제거하였으며 해당 데이터를 train데이터로 불러온다

In [11]:
train_us=pd.read_csv("us_noinf.csv")
train_dj_or=pd.read_csv("dj_or_noinf.csv")
train_dj_wh=pd.read_csv("dj_wh_noinf.csv")
train_dj_ft=pd.read_csv("dj_ft_noinf.csv")


In [12]:
train_us

Unnamed: 0.1,Unnamed: 0,X,Humidity,WindSpeed,Temperature,Temp2,Temp3,Cloud_1.0,Cloud_2.0,Cloud_3.0,...,month_5,month_6,month_7,month_8,month_9,month_10,month_11,day_hour_int,hour,energy
0,1,8,43,1.7,-1,1,-1,1,0,0,...,0,0,0,0,0,0,0,2018030208,8,18
1,2,32,63,1.2,5,25,125,0,1,0,...,0,0,0,0,0,0,0,2018030308,8,14
2,3,56,84,1.5,9,81,729,0,0,1,...,0,0,0,0,0,0,0,2018030408,8,20
3,4,80,84,7.9,6,36,216,0,0,0,...,0,0,0,0,0,0,0,2018030508,8,0
4,5,104,61,3.3,5,25,125,0,0,1,...,0,0,0,0,0,0,0,2018030608,8,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12124,106311,25507,53,3.1,6,36,216,0,1,0,...,0,0,0,0,0,0,0,2021012719,19,0
12125,106411,25531,36,8.6,1,1,1,0,0,1,...,0,0,0,0,0,0,0,2021012819,19,0
12126,10659,25555,27,3.5,-1,1,-1,1,0,0,...,0,0,0,0,0,0,0,2021012919,19,0
12127,106611,25579,64,2.0,5,25,125,1,0,0,...,0,0,0,0,0,0,0,2021013019,19,0


#### 데이터를 확인해보니 앞서 데이터를 생성할 떄 index를 제거하지 않았음을 알 수 있다. -> 이를 제거해준다.
#### 또한 hour과 day_hour_int는 데이터 구분을 위해 아직 제거하지 않는다

In [13]:
# 앞이 두 열을 제거(인덱스 제거)
train_us=train_us.iloc[:,2:]
train_dj_or=train_dj_or.iloc[:,2:]
train_dj_wh=train_dj_wh.iloc[:,2:]
train_dj_ft=train_dj_ft.iloc[:,2:]

train_us

Unnamed: 0,Humidity,WindSpeed,Temperature,Temp2,Temp3,Cloud_1.0,Cloud_2.0,Cloud_3.0,month_1,month_2,...,month_5,month_6,month_7,month_8,month_9,month_10,month_11,day_hour_int,hour,energy
0,43,1.7,-1,1,-1,1,0,0,0,0,...,0,0,0,0,0,0,0,2018030208,8,18
1,63,1.2,5,25,125,0,1,0,0,0,...,0,0,0,0,0,0,0,2018030308,8,14
2,84,1.5,9,81,729,0,0,1,0,0,...,0,0,0,0,0,0,0,2018030408,8,20
3,84,7.9,6,36,216,0,0,0,0,0,...,0,0,0,0,0,0,0,2018030508,8,0
4,61,3.3,5,25,125,0,0,1,0,0,...,0,0,0,0,0,0,0,2018030608,8,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12124,53,3.1,6,36,216,0,1,0,1,0,...,0,0,0,0,0,0,0,2021012719,19,0
12125,36,8.6,1,1,1,0,0,1,1,0,...,0,0,0,0,0,0,0,2021012819,19,0
12126,27,3.5,-1,1,-1,1,0,0,1,0,...,0,0,0,0,0,0,0,2021012919,19,0
12127,64,2.0,5,25,125,1,0,0,1,0,...,0,0,0,0,0,0,0,2021013019,19,0


# Cross Validation

## 선형회귀를 바탕으로 데이터를 구성했지만, 그럼에도 몇몇 시간대는 회귀 가정을 따르지 않는다. 
## 그러한 데이터는 어떤 모델이 가장 fitting이 잘되는지 살펴보고자 cross validation을 실시한다.
### 또한, 에너지가 생성되는 상황에서 에너지 생산량을 예측하는 모델이기에 energy=9인 case를 제외한다.

In [15]:
from sklearn.model_selection import cross_val_score

from sklearn.neighbors import KNeighborsRegressor # k-최근접 이웃 회귀 #분류대신 값을 예측
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor



#개별 모델 성능 (MAE) 확인

knn_model = KNeighborsRegressor()
svr_model = SVR()
lr_model = LinearRegression()
rf_model = RandomForestRegressor()
lgb_model = LGBMRegressor()

## ulsan-crossvalidation

In [62]:
data=train_us[train_us['energy']>0]
data=data.sample(frac=1).reset_index(drop=True)#자기 상관성 제거를 위한 shuffle
#########################################################3###############################
for h in range(8,20):
    y=data[data['hour']==h].energy
    X_train_m=data[data['hour']==h].drop(['energy','day_hour_int','hour'],axis=1)

    cross_score_knn = cross_val_score(knn_model, X_train_m, y, scoring='neg_mean_squared_error', cv=5)
    cross_score_lr = cross_val_score(lr_model, X_train_m, y, scoring='neg_mean_squared_error', cv=5)
    cross_score_svr = cross_val_score(svr_model, X_train_m, y, scoring='neg_mean_squared_error', cv=5)
    cross_score_rf = cross_val_score(rf_model, X_train_m, y, scoring='neg_mean_squared_error', cv=5)
    cross_score_lgb = cross_val_score(lgb_model, X_train_m, y, scoring='neg_mean_squared_error', cv=5)


    r_score_knn = -cross_score_knn/X_train_m.shape[0]*5
    r_score_lr = -cross_score_lr/X_train_m.shape[0]*5
    r_score_svr = -cross_score_svr/X_train_m.shape[0]*5
    r_score_rf = -cross_score_rf/X_train_m.shape[0]*5
    r_score_lgb = -cross_score_lgb/X_train_m.shape[0]*5

    print(" ")
    print("Cross Validation of h=",h,"size=",X_train_m.shape[0])
### 각 validaion의 점수를 보기 힘들어 평균과 표준편차를 추출한다.    
    #print("KNN score :",r_score_knn )
    #print("LR score :",r_score_lr) 
    #print("SVR score :",r_score_svr )
    #print("RF score :",r_score_rf)
    #print("LGB score :",r_score_lgb)
    print("  KNN mean ", round(r_score_knn.mean(),4),"  LR mean ", round(r_score_lr.mean(),4), 
          "  SVR mean ", round(r_score_svr.mean(),4),"  RF mean ", round(r_score_rf.mean(),4)
         , " LGB mean ", round(r_score_lgb.mean(),4))
    print("  KNN std ", round(r_score_knn.std(),4),"   LR std ", round(r_score_lr.std(),4), 
          "   SVR std ", round(r_score_svr.std(),4),"   RF std ", round(r_score_rf.std(),4)
         , "  LGB std ", round(r_score_lgb.std(),4))    

 
Cross Validation of h= 8 size= 828
  KNN mean  2.4036   LR mean  1.1156   SVR mean  3.779   RF mean  1.1906  LGB mean  1.147
  KNN std  0.2603    LR std  0.0797    SVR std  0.3125    RF std  0.1592   LGB std  0.107
 
Cross Validation of h= 9 size= 978
  KNN mean  6.2728   LR mean  3.6496   SVR mean  12.2306   RF mean  3.8814  LGB mean  4.0645
  KNN std  0.7963    LR std  0.354    SVR std  0.6503    RF std  0.4433   LGB std  0.5574
 
Cross Validation of h= 10 size= 999
  KNN mean  14.1888   LR mean  8.3879   SVR mean  28.1941   RF mean  9.2951  LGB mean  9.6099
  KNN std  1.1419    LR std  0.6889    SVR std  2.7838    RF std  1.7109   LGB std  0.8035
 
Cross Validation of h= 11 size= 994
  KNN mean  20.8307   LR mean  10.9633   SVR mean  43.5139   RF mean  11.8909  LGB mean  12.6032
  KNN std  2.4166    LR std  1.205    SVR std  3.3411    RF std  0.7221   LGB std  0.8874
 
Cross Validation of h= 12 size= 1000
  KNN mean  26.4335   LR mean  14.7531   SVR mean  56.6319   RF mean  16.723

## dj_or Cross Validation

In [63]:
data=train_dj_or[train_dj_or['energy']>0]
data=data.sample(frac=1).reset_index(drop=True)#자기 상관성 제거를 위한 shuffle
#########################################################3###############################
for h in range(8,20):
    y=data[data['hour']==h].energy
    X_train_m=data[data['hour']==h].drop(['energy','day_hour_int','hour'],axis=1)

    cross_score_knn = cross_val_score(knn_model, X_train_m, y, scoring='neg_mean_squared_error', cv=5)
    cross_score_lr = cross_val_score(lr_model, X_train_m, y, scoring='neg_mean_squared_error', cv=5)
    cross_score_svr = cross_val_score(svr_model, X_train_m, y, scoring='neg_mean_squared_error', cv=5)
    cross_score_rf = cross_val_score(rf_model, X_train_m, y, scoring='neg_mean_squared_error', cv=5)
    cross_score_lgb = cross_val_score(lgb_model, X_train_m, y, scoring='neg_mean_squared_error', cv=5)


    r_score_knn = -cross_score_knn/X_train_m.shape[0]*5   # validation의 데이터 개수 만큼 나누어주어 NMAE를 만들어준다
    r_score_lr = -cross_score_lr/X_train_m.shape[0]*5
    r_score_svr = -cross_score_svr/X_train_m.shape[0]*5
    r_score_rf = -cross_score_rf/X_train_m.shape[0]*5
    r_score_lgb = -cross_score_lgb/X_train_m.shape[0]*5

    print(" ")
    print("Cross Validation of h=",h,"size=",X_train_m.shape[0])
### 각 validaion의 점수를 보기 힘들어 평균과 표준편차를 추출한다.    
    #print("KNN score :",r_score_knn )
    #print("LR score :",r_score_lr) 
    #print("SVR score :",r_score_svr )
    #print("RF score :",r_score_rf)
    #print("LGB score :",r_score_lgb)
    print("  KNN mean ", round(r_score_knn.mean(),4),"  LR mean ", round(r_score_lr.mean(),4), 
          "  SVR mean ", round(r_score_svr.mean(),4),"  RF mean ", round(r_score_rf.mean(),4)
         , " LGB mean ", round(r_score_lgb.mean(),4))
    print("  KNN std ", round(r_score_knn.std(),4),"   LR std ", round(r_score_lr.std(),4), 
          "   SVR std ", round(r_score_svr.std(),4),"   RF std ", round(r_score_rf.std(),4)
         , "  LGB std ", round(r_score_lgb.std(),4))     

 
Cross Validation of h= 8 size= 674
  KNN mean  4.9942   LR mean  2.9169   SVR mean  5.2438   RF mean  3.005  LGB mean  3.0033
  KNN std  0.2333    LR std  0.2997    SVR std  0.5909    RF std  0.2503   LGB std  0.1799
 
Cross Validation of h= 9 size= 942
  KNN mean  21.2817   LR mean  11.3848   SVR mean  29.1102   RF mean  12.7404  LGB mean  12.6921
  KNN std  1.3316    LR std  0.5177    SVR std  2.9825    RF std  0.7623   LGB std  1.0137
 
Cross Validation of h= 10 size= 1001
  KNN mean  57.3689   LR mean  35.8393   SVR mean  85.0523   RF mean  41.0747  LGB mean  41.7114
  KNN std  6.2216    LR std  3.1818    SVR std  4.827    RF std  3.4325   LGB std  3.4316
 
Cross Validation of h= 11 size= 1010
  KNN mean  103.4709   LR mean  56.8592   SVR mean  164.4558   RF mean  64.3462  LGB mean  63.9472
  KNN std  7.4238    LR std  4.814    SVR std  12.2138    RF std  5.9494   LGB std  5.9427
 
Cross Validation of h= 12 size= 1009
  KNN mean  124.0491   LR mean  75.0178   SVR mean  230.3125  

## dj_wh CrossValidation


In [65]:
data=train_dj_wh[train_dj_wh['energy']>0]
data=data.sample(frac=1).reset_index(drop=True)#자기 상관성 제거를 위한 shuffle
#########################################################3###############################
for h in range(8,20):
    y=data[data['hour']==h].energy
    X_train_m=data[data['hour']==h].drop(['energy','day_hour_int','hour'],axis=1)

    cross_score_knn = cross_val_score(knn_model, X_train_m, y, scoring='neg_mean_squared_error', cv=5)
    cross_score_lr = cross_val_score(lr_model, X_train_m, y, scoring='neg_mean_squared_error', cv=5)
    cross_score_svr = cross_val_score(svr_model, X_train_m, y, scoring='neg_mean_squared_error', cv=5)
    cross_score_rf = cross_val_score(rf_model, X_train_m, y, scoring='neg_mean_squared_error', cv=5)
    cross_score_lgb = cross_val_score(lgb_model, X_train_m, y, scoring='neg_mean_squared_error', cv=5)


    r_score_knn = -cross_score_knn/X_train_m.shape[0]*5   # validation의 데이터 개수 만큼 나누어주어 NMAE를 만들어준다
    r_score_lr = -cross_score_lr/X_train_m.shape[0]*5
    r_score_svr = -cross_score_svr/X_train_m.shape[0]*5
    r_score_rf = -cross_score_rf/X_train_m.shape[0]*5
    r_score_lgb = -cross_score_lgb/X_train_m.shape[0]*5

    print(" ")
    print("Cross Validation of h=",h," size=",X_train_m.shape[0])
### 각 validaion의 점수를 보기 힘들어 평균과 표준편차를 추출한다.    
    #print("KNN score :",r_score_knn )
    #print("LR score :",r_score_lr) 
    #print("SVR score :",r_score_svr )
    #print("RF score :",r_score_rf)
    #print("LGB score :",r_score_lgb)
    print("  KNN mean ", round(r_score_knn.mean(),4),"  LR mean ", round(r_score_lr.mean(),4), 
          "  SVR mean ", round(r_score_svr.mean(),4),"  RF mean ", round(r_score_rf.mean(),4)
         , " LGB mean ", round(r_score_lgb.mean(),4))
    print("  KNN std ", round(r_score_knn.std(),4),"   LR std ", round(r_score_lr.std(),4), 
          "   SVR std ", round(r_score_svr.std(),4),"   RF std ", round(r_score_rf.std(),4)
         , "  LGB std ", round(r_score_lgb.std(),4))     

 
Cross Validation of h= 8  size= 734
  KNN mean  3.0138   LR mean  1.0674   SVR mean  3.7549   RF mean  1.2082  LGB mean  1.111
  KNN std  0.0728    LR std  0.1257    SVR std  0.3595    RF std  0.2414   LGB std  0.1103
 
Cross Validation of h= 9  size= 982
  KNN mean  9.0937   LR mean  4.9085   SVR mean  13.7465   RF mean  5.4377  LGB mean  5.1866
  KNN std  0.5592    LR std  0.318    SVR std  1.4484    RF std  0.6214   LGB std  0.6758
 
Cross Validation of h= 10  size= 1007
  KNN mean  24.1595   LR mean  14.9722   SVR mean  38.8077   RF mean  16.2473  LGB mean  16.5693
  KNN std  1.3527    LR std  0.665    SVR std  2.431    RF std  1.4093   LGB std  1.4407
 
Cross Validation of h= 11  size= 1014
  KNN mean  45.6866   LR mean  26.6135   SVR mean  74.424   RF mean  27.6216  LGB mean  28.0974
  KNN std  5.3801    LR std  2.3265    SVR std  3.9176    RF std  3.1918   LGB std  2.4854
 
Cross Validation of h= 12  size= 1007
  KNN mean  61.3223   LR mean  36.9963   SVR mean  101.8242   RF m

## dj_ft CrossValidation

In [67]:
data=train_dj_ft[train_dj_ft['energy']>0]
data=data.sample(frac=1).reset_index(drop=True)#자기 상관성 제거를 위한 shuffle
#########################################################3###############################
for h in range(8,20):
    y=data[data['hour']==h].energy
    X_train_m=data[data['hour']==h].drop(['energy','day_hour_int','hour'],axis=1)

    cross_score_knn = cross_val_score(knn_model, X_train_m, y, scoring='neg_mean_squared_error', cv=5)
    cross_score_lr = cross_val_score(lr_model, X_train_m, y, scoring='neg_mean_squared_error', cv=5)
    cross_score_svr = cross_val_score(svr_model, X_train_m, y, scoring='neg_mean_squared_error', cv=5)
    cross_score_rf = cross_val_score(rf_model, X_train_m, y, scoring='neg_mean_squared_error', cv=5)
    cross_score_lgb = cross_val_score(lgb_model, X_train_m, y, scoring='neg_mean_squared_error', cv=5)


    r_score_knn = -cross_score_knn/X_train_m.shape[0]*5   # validation의 데이터 개수 만큼 나누어주어 NMAE를 만들어준다
    r_score_lr = -cross_score_lr/X_train_m.shape[0]*5
    r_score_svr = -cross_score_svr/X_train_m.shape[0]*5
    r_score_rf = -cross_score_rf/X_train_m.shape[0]*5
    r_score_lgb = -cross_score_lgb/X_train_m.shape[0]*5

    print(" ")
    print("Cross Validation of h=",h,"size=",X_train_m.shape[0])
### 각 validaion의 점수를 보기 힘들어 평균과 표준편차를 추출한다.    
    #print("KNN score :",r_score_knn )
    #print("LR score :",r_score_lr) 
    #print("SVR score :",r_score_svr )
    #print("RF score :",r_score_rf)
    #print("LGB score :",r_score_lgb)
    print("  KNN mean ", round(r_score_knn.mean(),4),"  LR mean ", round(r_score_lr.mean(),4), 
          "  SVR mean ", round(r_score_svr.mean(),4),"  RF mean ", round(r_score_rf.mean(),4)
         , " LGB mean ", round(r_score_lgb.mean(),4))
    print("  KNN std ", round(r_score_knn.std(),4),"   LR std ", round(r_score_lr.std(),4), 
          "   SVR std ", round(r_score_svr.std(),4),"   RF std ", round(r_score_rf.std(),4)
         , "  LGB std ", round(r_score_lgb.std(),4))     

 
Cross Validation of h= 8 size= 788
  KNN mean  3.2272   LR mean  1.5731   SVR mean  4.2855   RF mean  1.6875  LGB mean  1.6513
  KNN std  0.2708    LR std  0.1158    SVR std  0.4047    RF std  0.1754   LGB std  0.1434
 
Cross Validation of h= 9 size= 1004
  KNN mean  13.5744   LR mean  7.5671   SVR mean  20.8549   RF mean  8.198  LGB mean  8.1204
  KNN std  1.634    LR std  0.8252    SVR std  0.8361    RF std  1.1238   LGB std  0.9832
 
Cross Validation of h= 10 size= 1011
  KNN mean  39.1022   LR mean  24.6288   SVR mean  63.5437   RF mean  26.5801  LGB mean  26.9183
  KNN std  3.8359    LR std  1.66    SVR std  4.304    RF std  1.5583   LGB std  1.7704
 
Cross Validation of h= 11 size= 1013
  KNN mean  74.1267   LR mean  44.8753   SVR mean  123.868   RF mean  47.9263  LGB mean  50.4697
  KNN std  7.1906    LR std  3.3182    SVR std  6.5072    RF std  4.6381   LGB std  4.8724
 
Cross Validation of h= 12 size= 1017
  KNN mean  100.963   LR mean  62.919   SVR mean  172.4061   RF mean 

## 다양하게 고려해봐야 하나 우선 평균을 기준으로 best model을 선정
### + RF 와 LGB는 성능이 계속 바뀜  흠.. 큰차이가 나지 않는다면 데이터를 정리해서 LR을 쓰고 싶음
- us :   18->RF     /나머지 ->LR
- dj_or: 17,18->RF     /나머지 ->LR
- dj_wh: 17->RF, 18->LGB     /나머지 ->LR
- dj_ft: 18->RF     /나머지 ->LR

# test 생성

In [69]:
test_us=pd.read_csv("test_us_with_energy0.csv")
test_dj=pd.read_csv("test_dj_with_energy0.csv")


In [70]:
test_us

Unnamed: 0,Humidity,WindSpeed,Temperature,Temp2,Temp3,Cloud_1.0,Cloud_2.0,Cloud_3.0,month_1,month_2,...,month_5,month_6,month_7,month_8,month_9,month_10,month_11,day_hour_int,hour,energy
0,65.0,4.7,11.0,121.0,1331.0,0,0,0,0,1,...,0,0,0,0,0,0,0,2021020100,0,0
1,75.0,6.2,11.0,121.0,1331.0,0,0,0,0,1,...,0,0,0,0,0,0,0,2021020101,1,0
2,75.0,6.2,11.0,121.0,1331.0,0,0,0,0,1,...,0,0,0,0,0,0,0,2021020102,2,0
3,75.0,6.2,11.0,121.0,1331.0,0,0,0,0,1,...,0,0,0,0,0,0,0,2021020103,3,0
4,79.0,6.7,12.0,144.0,1728.0,0,0,0,0,1,...,0,0,0,0,0,0,0,2021020104,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,72.0,3.5,14.0,196.0,2744.0,0,0,0,0,1,...,0,0,0,0,0,0,0,2021022819,19,0
668,74.0,3.9,14.0,196.0,2744.0,0,0,0,0,1,...,0,0,0,0,0,0,0,2021022820,20,0
669,75.0,4.3,14.0,196.0,2744.0,0,0,0,0,1,...,0,0,0,0,0,0,0,2021022821,21,0
670,77.0,4.0,14.0,196.0,2744.0,0,0,0,0,1,...,0,0,0,0,0,0,0,2021022822,22,0


In [71]:
test_dj

Unnamed: 0,Humidity,WindSpeed,Temperature,Temp2,Temp3,Cloud_1.0,Cloud_2.0,Cloud_3.0,month_1,month_2,...,month_5,month_6,month_7,month_8,month_9,month_10,month_11,day_hour_int,hour,energy
0,80.0,6.0,7.0,49.0,343.0,0,0,0,0,1,...,0,0,0,0,0,0,0,2021020100,0,0
1,90.0,3.4,7.0,49.0,343.0,0,0,0,0,1,...,0,0,0,0,0,0,0,2021020101,1,0
2,90.0,3.4,7.0,49.0,343.0,0,0,0,0,1,...,0,0,0,0,0,0,0,2021020102,2,0
3,90.0,3.4,7.0,49.0,343.0,0,0,0,0,1,...,0,0,0,0,0,0,0,2021020103,3,0
4,89.0,3.1,7.0,49.0,343.0,0,0,0,0,1,...,0,0,0,0,0,0,0,2021020104,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,65.0,1.0,8.0,64.0,512.0,0,0,0,0,1,...,0,0,0,0,0,0,0,2021022819,19,0
668,67.0,1.1,7.0,49.0,343.0,0,0,0,0,1,...,0,0,0,0,0,0,0,2021022820,20,0
669,70.0,1.3,7.0,49.0,343.0,0,0,0,0,1,...,0,0,0,0,0,0,0,2021022821,21,0
670,73.0,1.3,7.0,49.0,343.0,0,0,0,0,1,...,0,0,0,0,0,0,0,2021022822,22,0


# 예측
### 위의 cross validation의 결과로 찾은 최적 model을 적용하여 예측

## ulsan
- 18 -> RF
- 나머지-> LR

In [87]:
PRED_us=pd.DataFrame(np.zeros((28,24))) # 예측 값을 담아줄 행렬을 생성: 28 x 24 (28일 x 24시간)

data=train_us[train_us['energy']>0]
data=data.sample(frac=1).reset_index(drop=True)#자기 상관성 제거를 위한 shuffle

for h in range(24):
########################  평가기준 아닌 시간은 0########################3    
    if h<=7:
        PRED_us.iloc[:,h]=0
    elif h>=20:
        PRED_us.iloc[:,h]=0

#######################  18 ->  RF ############################        

    elif h==18:
        train=data[data['hour']==h]
        
        train_X=train.drop(['energy','day_hour_int','hour'],axis=1) # data 분류를 위해 남겼던 hour,day_hour_int & target:energy 제거
        train_y=train['energy'] # target만 추출

        # test-시간 맞춰 추출 & 설명변수만 남김
        test=test_us[test_us['hour']==h].reset_index(drop=True).drop(['energy','day_hour_int','hour'],axis=1) 

        #모델 fitting
        rf_model = RandomForestRegressor()
        rf_model.fit(train_X,train_y)

        #예측값
        energy=rf_model.predict(test)
        PRED_us.iloc[:,h]=energy
        
        
####################### 나머지-> LR ###############################
    else:
        train=data[data['hour']==h]
        
        train_X=train.drop(['energy','day_hour_int','hour'],axis=1) # data 분류를 위해 남겼던 hour,day_hour_int & target:energy 제거
        train_y=train['energy'] # target만 추출
        
        # test-시간 맞춰 추출 & 설명변수만 남김
        test=test_us[test_us['hour']==h].reset_index(drop=True).drop(['energy','day_hour_int','hour'],axis=1)

        #모델 fitting
        lr_model = LinearRegression()
        lr_model.fit(train_X,train_y)

        #예측값
        energy=lr_model.predict(test)
        PRED_us.iloc[:,h]=energy

## dj_or
- 17,18 -> RF
- 나머지 -> LR

In [88]:
PRED_dj_or=pd.DataFrame(np.zeros((28,24))) # 예측 값을 담아줄 행렬을 생성: 28 x 24 (28일 x 24시간)

data=train_dj_or[train_dj_or['energy']>0]
data=data.sample(frac=1).reset_index(drop=True)#자기 상관성 제거를 위한 shuffle

for h in range(24):
########################  평가기준 아닌 시간은 0########################3    
    if h<=7:
        PRED_dj_or.iloc[:,h]=0
    elif h>=20:
        PRED_dj_or.iloc[:,h]=0

#######################  17,18 ->  RF ############################        

    elif h==17:
        train=data[data['hour']==h]
        
        train_X=train.drop(['energy','day_hour_int','hour'],axis=1) # data 분류를 위해 남겼던 hour,day_hour_int & target:energy 제거
        train_y=train['energy'] # target만 추출

        # test-시간 맞춰 추출 & 설명변수만 남김
        test=test_dj[test_dj['hour']==h].reset_index(drop=True).drop(['energy','day_hour_int','hour'],axis=1) 

        #모델 fitting
        rf_model = RandomForestRegressor()
        rf_model.fit(train_X,train_y)

        #예측값
        energy=rf_model.predict(test)
        PRED_dj_or.iloc[:,h]=energy

    elif h==18:
        train=data[data['hour']==h]
        
        train_X=train.drop(['energy','day_hour_int','hour'],axis=1) # data 분류를 위해 남겼던 hour,day_hour_int & target:energy 제거
        train_y=train['energy'] # target만 추출

        # test-시간 맞춰 추출 & 설명변수만 남김
        test=test_dj[test_dj['hour']==h].reset_index(drop=True).drop(['energy','day_hour_int','hour'],axis=1) 

        #모델 fitting
        rf_model = RandomForestRegressor()
        rf_model.fit(train_X,train_y)

        #예측값
        energy=rf_model.predict(test)
        PRED_dj_or.iloc[:,h]=energy
        
        
####################### 나머지-> LR ###############################
    else:
        train=data[data['hour']==h]
        
        train_X=train.drop(['energy','day_hour_int','hour'],axis=1) # data 분류를 위해 남겼던 hour,day_hour_int & target:energy 제거
        train_y=train['energy'] # target만 추출
        
        # test-시간 맞춰 추출 & 설명변수만 남김
        test=test_dj[test_dj['hour']==h].reset_index(drop=True).drop(['energy','day_hour_int','hour'],axis=1)

        #모델 fitting
        lr_model = LinearRegression()
        lr_model.fit(train_X,train_y)

        #예측값
        energy=lr_model.predict(test)
        PRED_dj_or.iloc[:,h]=energy

## dj_wh
- 17 -> RF
- 18 -> LGB
- 나머지 -> LR

In [89]:
PRED_dj_wh=pd.DataFrame(np.zeros((28,24))) # 예측 값을 담아줄 행렬을 생성: 28 x 24 (28일 x 24시간)

data=train_dj_wh[train_dj_wh['energy']>0]
data=data.sample(frac=1).reset_index(drop=True)#자기 상관성 제거를 위한 shuffle

for h in range(24):
########################  평가기준 아닌 시간은 0########################3    
    if h<=7:
        PRED_dj_wh.iloc[:,h]=0
    elif h>=20:
        PRED_dj_wh.iloc[:,h]=0

#######################  17 ->  RF ############################        

    elif h==17:
        train=data[data['hour']==h]
        
        train_X=train.drop(['energy','day_hour_int','hour'],axis=1) # data 분류를 위해 남겼던 hour,day_hour_int & target:energy 제거
        train_y=train['energy'] # target만 추출

        # test-시간 맞춰 추출 & 설명변수만 남김
        test=test_dj[test_dj['hour']==h].reset_index(drop=True).drop(['energy','day_hour_int','hour'],axis=1) 

        #모델 fitting
        rf_model = RandomForestRegressor()
        rf_model.fit(train_X,train_y)

        #예측값
        energy=rf_model.predict(test)
        PRED_dj_wh.iloc[:,h]=energy

#####################  18->LGB ###############################
    elif h==18:
        train=data[data['hour']==h]
        
        train_X=train.drop(['energy','day_hour_int','hour'],axis=1) # data 분류를 위해 남겼던 hour,day_hour_int & target:energy 제거
        train_y=train['energy'] # target만 추출

        # test-시간 맞춰 추출 & 설명변수만 남김
        test=test_dj[test_dj['hour']==h].reset_index(drop=True).drop(['energy','day_hour_int','hour'],axis=1) 

        #모델 fitting
        lr_model = LinearRegression()
        lr_model.fit(train_X,train_y)

        #예측값
        energy=lr_model.predict(test)
        PRED_dj_wh.iloc[:,h]=energy
        
        
####################### 나머지-> LR ###############################
    else:
        train=data[data['hour']==h]
        
        train_X=train.drop(['energy','day_hour_int','hour'],axis=1) # data 분류를 위해 남겼던 hour,day_hour_int & target:energy 제거
        train_y=train['energy'] # target만 추출
        
        # test-시간 맞춰 추출 & 설명변수만 남김
        test=test_dj[test_dj['hour']==h].reset_index(drop=True).drop(['energy','day_hour_int','hour'],axis=1)

        #모델 fitting
        lr_model = LinearRegression()
        lr_model.fit(train_X,train_y)

        #예측값
        energy=lr_model.predict(test)
        PRED_dj_wh.iloc[:,h]=energy

## dj_ft
- 18 -> LGB
- 나머지 -> LR

In [90]:
PRED_dj_ft=pd.DataFrame(np.zeros((28,24))) # 예측 값을 담아줄 행렬을 생성: 28 x 24 (28일 x 24시간)

data=train_dj_ft[train_dj_ft['energy']>0]
data=data.sample(frac=1).reset_index(drop=True)#자기 상관성 제거를 위한 shuffle

for h in range(24):
########################  평가기준 아닌 시간은 0########################3    
    if h<=7:
        PRED_dj_ft.iloc[:,h]=0
    elif h>=20:
        PRED_dj_ft.iloc[:,h]=0


#####################  18->LGB ###############################
    elif h==18:
        train=data[data['hour']==h]
        
        train_X=train.drop(['energy','day_hour_int','hour'],axis=1) # data 분류를 위해 남겼던 hour,day_hour_int & target:energy 제거
        train_y=train['energy'] # target만 추출

        # test-시간 맞춰 추출 & 설명변수만 남김
        test=test_dj[test_dj['hour']==h].reset_index(drop=True).drop(['energy','day_hour_int','hour'],axis=1) 

        #모델 fitting
        lr_model = LinearRegression()
        lr_model.fit(train_X,train_y)

        #예측값
        energy=lr_model.predict(test)
        PRED_dj_ft.iloc[:,h]=energy
        
        
####################### 나머지-> LR ###############################
    else:
        train=data[data['hour']==h]
        
        train_X=train.drop(['energy','day_hour_int','hour'],axis=1) # data 분류를 위해 남겼던 hour,day_hour_int & target:energy 제거
        train_y=train['energy'] # target만 추출
        
        # test-시간 맞춰 추출 & 설명변수만 남김
        test=test_dj[test_dj['hour']==h].reset_index(drop=True).drop(['energy','day_hour_int','hour'],axis=1)

        #모델 fitting
        lr_model = LinearRegression()
        lr_model.fit(train_X,train_y)

        #예측값
        energy=lr_model.predict(test)
        PRED_dj_ft.iloc[:,h]=energy

In [91]:
PRED_np_us=PRED_us.to_numpy()
PRED_np_dj_or=PRED_dj_or.to_numpy()
PRED_np_dj_ft=PRED_dj_ft.to_numpy()
PRED_np_dj_wh=PRED_dj_wh.to_numpy()


In [92]:
test_dj['dangjin']=PRED_np_dj_or.reshape(-1,1)
test_dj['dangjin_warehouse']=PRED_np_dj_wh.reshape(-1,1)
test_dj['dangjin_floating']=PRED_np_dj_ft.reshape(-1,1)
test_us['ulsan']=PRED_np_us.reshape(-1,1)

In [93]:
test_us

Unnamed: 0,Humidity,WindSpeed,Temperature,Temp2,Temp3,Cloud_1.0,Cloud_2.0,Cloud_3.0,month_1,month_2,...,month_6,month_7,month_8,month_9,month_10,month_11,day_hour_int,hour,energy,ulsan
0,65.0,4.7,11.0,121.0,1331.0,0,0,0,0,1,...,0,0,0,0,0,0,2021020100,0,0,0.000000
1,75.0,6.2,11.0,121.0,1331.0,0,0,0,0,1,...,0,0,0,0,0,0,2021020101,1,0,0.000000
2,75.0,6.2,11.0,121.0,1331.0,0,0,0,0,1,...,0,0,0,0,0,0,2021020102,2,0,0.000000
3,75.0,6.2,11.0,121.0,1331.0,0,0,0,0,1,...,0,0,0,0,0,0,2021020103,3,0,0.000000
4,79.0,6.7,12.0,144.0,1728.0,0,0,0,0,1,...,0,0,0,0,0,0,2021020104,4,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,72.0,3.5,14.0,196.0,2744.0,0,0,0,0,1,...,0,0,0,0,0,0,2021022819,19,0,3.194249
668,74.0,3.9,14.0,196.0,2744.0,0,0,0,0,1,...,0,0,0,0,0,0,2021022820,20,0,0.000000
669,75.0,4.3,14.0,196.0,2744.0,0,0,0,0,1,...,0,0,0,0,0,0,2021022821,21,0,0.000000
670,77.0,4.0,14.0,196.0,2744.0,0,0,0,0,1,...,0,0,0,0,0,0,2021022822,22,0,0.000000


In [95]:
sub=pd.read_csv('sample_submission.csv')
sub_2=sub.iloc[:671,:]
sub_2
sub_67=sub.iloc[671:,:]
sub.head(10)

Unnamed: 0,time,dangjin_floating,dangjin_warehouse,dangjin,ulsan
0,2021-02-01 01:00:00,0,0,0,0
1,2021-02-01 02:00:00,0,0,0,0
2,2021-02-01 03:00:00,0,0,0,0
3,2021-02-01 04:00:00,0,0,0,0
4,2021-02-01 05:00:00,0,0,0,0
5,2021-02-01 06:00:00,0,0,0,0
6,2021-02-01 07:00:00,0,0,0,0
7,2021-02-01 08:00:00,0,0,0,0
8,2021-02-01 09:00:00,0,0,0,0
9,2021-02-01 10:00:00,0,0,0,0


In [97]:
sub_2['dangjin']=list(test_dj.iloc[1:,:].dangjin)
sub_2['dangjin_warehouse']=list(test_dj.iloc[1:,:].dangjin_warehouse)
sub_2['dangjin_floating']=list(test_dj.iloc[1:,:].dangjin_floating)
sub_2['ulsan']=list(test_us.iloc[1:,:].ulsan)



sub_final=pd.concat([sub_2,sub_67],axis=0)
sub_final.to_csv('sub_noinf_shuffle.csv',index=False)