## 모듈 임포트

In [None]:
import pandas as pd
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb

## 데이터 로드

In [None]:
# Download the data archive with wget (IPython shell escape); the file is then
# opened below under the name '3dD5MU9'.
!wget 'https://bit.ly/3dD5MU9'

import zipfile
# Unpack every member of the downloaded zip archive into the local 'data' directory.
with zipfile.ZipFile('3dD5MU9', 'r') as existing_zip:
    existing_zip.extractall('data')

--2021-06-16 09:19:54--  https://bit.ly/3dD5MU9
Resolving bit.ly (bit.ly)... 67.199.248.10, 67.199.248.11
Connecting to bit.ly (bit.ly)|67.199.248.10|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://drive.google.com/uc?export=download&id=1kkF00wW8v0npJ8S2nA7--eMTH3gOL03z [following]
--2021-06-16 09:19:54--  https://drive.google.com/uc?export=download&id=1kkF00wW8v0npJ8S2nA7--eMTH3gOL03z
Resolving drive.google.com (drive.google.com)... 108.177.126.101, 108.177.126.100, 108.177.126.113, ...
Connecting to drive.google.com (drive.google.com)|108.177.126.101|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-14-94-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/qfmb2abe79fo41o7r2io3s7sins7k5pl/1623835200000/00192245294648390361/*/1kkF00wW8v0npJ8S2nA7--eMTH3gOL03z?e=download [following]
--2021-06-16 09:20:01--  https://doc-14-94-docs.googleusercontent.com/docs/secur

In [None]:
# Load the three CSV tables from the extracted 'data' directory:
# the two per-site forecast tables and the energy table.
ulsan_fcst = pd.read_csv('./data/ulsan_fcst_data.csv')
dangjin_fcst = pd.read_csv('./data/dangjin_fcst_data.csv')
energy = pd.read_csv('data/energy.csv')

## 데이터 병합 
- 가장 나중의 예측 데이터가 가장 좋다고 가정.

In [None]:
def to_date(x):
    """Turn a number of hours into a ``timedelta``.

    Despite the name, the result is a duration, not a date; callers add it to
    a timestamp to shift that timestamp ``x`` hours forward.
    """
    offset = timedelta(hours=x)
    return offset

In [None]:
def generate_df(df_):
    """Build one forecast row per target timestamp from raw forecast data.

    Two subsets of ``df_`` are kept:

    * rows whose ``'Forecast time'`` text contains ``20:00:00`` and whose
      ``forecast`` horizon is exactly 4 hours, and
    * rows whose ``'Forecast time'`` text contains ``23:00:00`` and whose
      ``forecast`` horizon is at most 22 hours.

    For every kept row the parsed issue time is shifted forward by the
    ``forecast`` horizon (in hours) and stored in ``'Forecast_time'``.

    Args:
        df_: DataFrame with a string column ``'Forecast time'``, a numeric
            column ``'forecast'`` (hours ahead) and the columns
            ``Temperature``, ``Humidity``, ``WindSpeed``, ``WindDirection``
            and ``Cloud``. It is not modified.

    Returns:
        DataFrame sorted by ``'Forecast_time'`` that contains only
        ``'Forecast_time'`` and the five weather columns.

    Side effects:
        Prints the number of rows taken from each issue time and in total.
    """
    df = df_.copy()
    df['Forecast_time'] = pd.to_datetime(df['Forecast time'])

    def _select(issue_clock, horizon_mask):
        """Return rows issued at ``issue_clock`` matching ``horizon_mask``, shifted to their target time."""
        issued_then = df['Forecast time'].str.contains(issue_clock, regex=False, na=False)
        # Work on an explicit copy so the assignment below never writes into a
        # view of ``df`` (avoids pandas' chained-assignment ambiguity).
        picked = df[issued_then & horizon_mask].copy()
        picked['Forecast_time'] = picked['Forecast_time'] + pd.to_timedelta(picked['forecast'], unit='h')
        return picked

    # Issued at 20:00: only the 4-hour-ahead forecast is used.
    a = _select('20:00:00', df['forecast'] == 4.0)

    # Issued at 23:00: every forecast up to 22 hours ahead is used.
    b = _select('23:00:00', df['forecast'] <= 22)

    # Merge both subsets and report their sizes.
    c = pd.concat([a, b])
    print(f"20시 사용 데이터 길이 : {len(a)}")
    print(f"23시 사용 데이터 길이 : {len(b)}")
    print(f"합친 데이터 길이 : {len(c)}")
    print()

    # Order by target timestamp and keep only the model inputs.
    c = c.sort_values(by=['Forecast_time'])
    c = c[['Forecast_time', 'Temperature', 'Humidity', 'WindSpeed', 'WindDirection', 'Cloud']]

    return c

In [None]:
# Reduce each site's raw forecast table to one row per target timestamp.
# Dangjin is processed first, then Ulsan, so the printed summaries keep that order.
dangjin_filled, ulsan_filled = (
    generate_df(raw_fcst) for raw_fcst in (dangjin_fcst, ulsan_fcst)
)

20시 사용 데이터 길이 : 1096
23시 사용 데이터 길이 : 7672
합친 데이터 길이 : 8768

20시 사용 데이터 길이 : 1096
23시 사용 데이터 길이 : 7672
합친 데이터 길이 : 8768



## 보간

In [None]:
def interpolate_df(df, method='linear'):
    """Put forecast rows on an hourly grid and fill the missing hours.

    Args:
        df: DataFrame with a datetime column ``'Forecast_time'``; its first
            and last rows define the start and end of the hourly grid.
        method: Interpolation method forwarded to ``DataFrame.interpolate``.

    Returns:
        A new DataFrame whose first column is ``'Forecast_time'`` (every hour
        between the first and last timestamp, plus any original timestamps
        that are off that grid) followed by the remaining columns of ``df``
        with gaps interpolated.
    """
    # A Timedelta step is used instead of the 'H' alias, which newer pandas
    # versions deprecate.
    hourly = pd.DataFrame({
        'Forecast_time': pd.date_range(
            start=df['Forecast_time'].iloc[0],
            end=df['Forecast_time'].iloc[-1],
            freq=pd.Timedelta(hours=1),
        )
    })
    merged = pd.merge(hourly, df, on='Forecast_time', how='outer')

    # The timestamp key never contains gaps, so it is kept out of the
    # interpolation: only the value columns are filled.
    filled = merged.drop(columns=['Forecast_time']).interpolate(method=method)
    filled.insert(0, 'Forecast_time', merged['Forecast_time'])
    return filled

In [None]:
# Expand both per-site forecast tables to hourly resolution with linear interpolation.
dangjin_interpolated, ulsan_interpolated = (
    interpolate_df(filled, method='linear') for filled in (dangjin_filled, ulsan_filled)
)

## 학습 데이터 전처리

In [None]:
def train_datast(energy_df, fcst_df, target, val_days=90):
    """Join generation and forecast rows and split them into train/validation arrays.

    The function name (including its spelling) is kept for existing callers.

    Args:
        energy_df: Generation table containing the column named by ``target``.
            Its first 24 rows are skipped before joining.
        fcst_df: Hourly forecast table with ``'Forecast_time'`` (strings or
            datetimes) and the columns ``Temperature``, ``Humidity``,
            ``WindSpeed``, ``WindDirection`` and ``Cloud``. Rows are matched
            to the generation rows purely by position.
        target: Name of the generation column to predict.
        val_days: Number of trailing days (24 rows each) used for validation.
            Defaults to 90, the value that was previously hard-coded.

    Returns:
        ``(train_x, train_y, val_x, val_y)`` as NumPy arrays. The feature
        columns are year, month, day, hour and the five weather columns, in
        that order.

    Raises:
        ValueError: If ``val_days`` is negative.
    """
    if val_days < 0:
        raise ValueError("val_days must be non-negative")

    # Keep only generation rows for which a forecast exists (skip the first 24 rows).
    energy = energy_df.iloc[24:].reset_index(drop=True)

    # Keep only as many forecast rows as there are generation rows
    # (this replaces the hard-coded row count 25608).
    fcst = fcst_df.iloc[:len(energy)].reset_index(drop=True)

    # Align generation and forecast side by side, row by row.
    concat_df = pd.concat([energy, fcst], axis=1)

    # Derive calendar features from the forecast timestamp. Parsing with
    # to_datetime accepts both string and datetime columns.
    stamps = pd.to_datetime(concat_df['Forecast_time'])
    concat_df['year'] = stamps.dt.year.astype(int)
    concat_df['month'] = stamps.dt.month.astype(int)
    concat_df['day'] = stamps.dt.day.astype(int)
    concat_df['hour'] = stamps.dt.hour.astype(int)

    # Calendar features, weather forecast and the target column.
    feature_cols = ['year', 'month', 'day', 'hour',
                    'Temperature', 'Humidity', 'WindSpeed', 'WindDirection', 'Cloud']
    feature_df = concat_df[feature_cols + [target]]

    # The last `val_days` days form the validation set, everything before is training data.
    split_at = max(len(feature_df) - 24 * val_days, 0)
    train_df = feature_df.iloc[:split_at]
    val_df = feature_df.iloc[split_at:]

    # Rows with zero generation are kept; the filter that drops them is left disabled.
    #train_df = train_df[train_df[target]!=0]

    train_x = train_df[feature_cols].to_numpy()
    train_y = train_df[target].to_numpy()
    val_x = val_df[feature_cols].to_numpy()
    val_y = val_df[target].to_numpy()

    return train_x, train_y, val_x, val_y

## bayesian optimization 
---
- bayesian optimization 를 이용하여 하이퍼 파라미터 튜닝
- lgbm만 적용.

In [None]:
# Install the bayesian-optimization package (IPython shell escape), then
# import the optimizer class used by the tuning cells.
!pip install bayesian-optimization
from bayes_opt import BayesianOptimization

Collecting bayesian-optimization
  Downloading https://files.pythonhosted.org/packages/bb/7a/fd8059a3881d3ab37ac8f72f56b73937a14e8bb14a9733e68cc8b17dbe3c/bayesian-optimization-1.2.0.tar.gz
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py) ... [?25l[?25hdone
  Created wheel for bayesian-optimization: filename=bayesian_optimization-1.2.0-cp37-none-any.whl size=11686 sha256=13a90544c739290a7e66731b876402ba580036175f0326b3cd917bcb864bb332
  Stored in directory: /root/.cache/pip/wheels/5a/56/ae/e0e3c1fc1954dc3ec712e2df547235ed072b448094d8f94aec
Successfully built bayesian-optimization
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.2.0


In [None]:
# Install CatBoost (IPython shell escape), then import the two regressor
# classes used by the training cells.
!pip install Catboost
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

Collecting Catboost
[?25l  Downloading https://files.pythonhosted.org/packages/5a/41/24e14322b9986cf72a8763e0a0a69cc256cf963cf9502c8f0044a62c1ae8/catboost-0.26-cp37-none-manylinux1_x86_64.whl (69.2MB)
[K     |████████████████████████████████| 69.2MB 62kB/s 
Installing collected packages: Catboost
Successfully installed Catboost-0.26


In [None]:
# Store the forecast timestamps as text in both hourly tables.
ulsan_interpolated['Forecast_time'] = ulsan_interpolated['Forecast_time'].astype(str)
dangjin_interpolated['Forecast_time'] = dangjin_interpolated['Forecast_time'].astype(str)

In [None]:
# Replace every missing value in the generation table with 0.
energy = energy.fillna(value=0)

## 당진 수상태양광 예측 모델 학습

In [None]:
# Search bounds (low, high) given to BayesianOptimization for each LightGBM hyper-parameter.
bayesian_params = {
    'max_depth': (5, 20),
    'num_leaves': (24, 64),
    'min_child_samples': (10, 200),
    'min_child_weight': (1, 50),
    'subsample': (0.5, 1.0),
    'colsample_bytree': (0.5, 1.0),
    'max_bin': (10, 500),
    'reg_lambda': (0.001, 10),
    'reg_alpha': (0.01, 50),
}


def lgb_eval(max_depth, num_leaves, min_child_samples, min_child_weight, subsample,
             colsample_bytree, max_bin, reg_lambda, reg_alpha):
    """Objective for the optimizer: negated best mean CV RMSE of one configuration.

    The optimizer passes every value as a float, so integer hyper-parameters
    are rounded and the others are clamped before 5-fold ``lgb.cv`` is run on
    the notebook-global ``train_data``.

    Returns:
        The maximum of ``-1 * rmse-mean`` over the boosting rounds, i.e. the
        negated lowest mean RMSE, so that maximizing it minimizes the error.
    """
    params = {
        "n_estimators": 1000, "learning_rate": 0.01,
        # Values arrive as floats; integer hyper-parameters are converted to int.
        'max_depth': int(round(max_depth)),
        'num_leaves': int(round(num_leaves)),
        'min_child_samples': int(round(min_child_samples)),
        'min_child_weight': int(round(min_child_weight)),
        'subsample': max(min(subsample, 1), 0),
        'colsample_bytree': max(min(colsample_bytree, 1), 0),
        'max_bin': max(int(round(max_bin)), 10),
        'reg_lambda': max(reg_lambda, 0),
        'reg_alpha': max(reg_alpha, 0),
        "metric": 'rmse'
    }
    cv_result = lgb.cv(params, train_data, nfold=5, seed=0, verbose_eval=200, stratified=False)
    return (-1.0 * np.array(cv_result['rmse-mean'])).max()


train_x, train_y, val_x, val_y = train_datast(energy, dangjin_interpolated, target='dangjin_floating')
train_data = lgb.Dataset(train_x, train_y)
lgbBO = BayesianOptimization(lgb_eval, bayesian_params, random_state=42)

lgbBO.maximize(init_points=5, n_iter=25)

# Collect the score of every evaluated point. A comprehension is used so the
# notebook-global name `target` (used as the column name by the training
# cells) is no longer overwritten with a float score.
target_list = [result['target'] for result in lgbBO.res]
print(target_list)

# Position of the highest score.
best_index = np.argmax(np.array(target_list))
print('maximum target index:', best_index)

# Full optimizer record (score and parameters) at that position.
max_dict = lgbBO.res[best_index]
print(max_dict)

## K-fold dataset

In [None]:
from sklearn.model_selection import StratifiedKFold
import random

In [None]:
def kfold_datast(energy_df, fcst_df, target):
    """Join generation and forecast rows into one feature table for K-fold training.

    The function name (including its spelling) is kept for existing callers.

    Args:
        energy_df: Generation table containing the column named by ``target``.
            Its first 24 rows are skipped before joining.
        fcst_df: Hourly forecast table with ``'Forecast_time'`` (strings or
            datetimes) and the columns ``Temperature``, ``Humidity``,
            ``WindSpeed``, ``WindDirection`` and ``Cloud``. Rows are matched
            to the generation rows purely by position.
        target: Name of the generation column to predict.

    Returns:
        DataFrame with the columns year, month, day, hour, the five weather
        columns and ``target`` (last), indexed 0..n-1.
    """
    # Keep only generation rows for which a forecast exists (skip the first 24 rows).
    energy = energy_df.iloc[24:].reset_index(drop=True)

    # Keep only as many forecast rows as there are generation rows
    # (this replaces the hard-coded row count 25608).
    fcst = fcst_df.iloc[:len(energy)].reset_index(drop=True)

    # Align generation and forecast side by side, row by row.
    concat_df = pd.concat([energy, fcst], axis=1)

    # Derive calendar features from the forecast timestamp. Parsing with
    # to_datetime accepts both string and datetime columns.
    stamps = pd.to_datetime(concat_df['Forecast_time'])
    concat_df['year'] = stamps.dt.year.astype(int)
    concat_df['month'] = stamps.dt.month.astype(int)
    concat_df['day'] = stamps.dt.day.astype(int)
    concat_df['hour'] = stamps.dt.hour.astype(int)

    # Calendar features, weather forecast and the target column.
    feature_df = concat_df[['year', 'month', 'day', 'hour',
                            'Temperature', 'Humidity', 'WindSpeed', 'WindDirection', 'Cloud',
                            target]]
    return feature_df

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package);
# later cells write model files under ./drive/MyDrive/model/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 10-fold training of LightGBM regressors for the 'dangjin_floating' target.
random.seed(42)
lgb1_models = {}  # fold number (0-based) -> fitted model
target = 'dangjin_floating'

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
train = kfold_datast(energy, dangjin_interpolated, target=target)

# (train indices, validation indices) of every fold, stratified on the target values.
folds = list(skf.split(train, train[target]))

# Split the table once into model inputs and labels instead of once per fold.
features = train.drop([target], axis=1)
labels = train[target]

for fold, (train_idx, valid_idx) in enumerate(folds):
    print(f'===================================={fold+1}============================================')
    train_x, val_x = features.iloc[train_idx].values, features.iloc[valid_idx].values
    train_y, val_y = labels.iloc[train_idx].values, labels.iloc[valid_idx].values

    # The estimator is bound to `model`, not `lgb`: rebinding `lgb` would hide
    # the `lightgbm` module alias that the tuning cells use (`lgb.cv`, `lgb.Dataset`).
    # NOTE(review): `depth` and `loss_function` mirror the CatBoost arguments used
    # elsewhere in this notebook; confirm LightGBM honours them (the logged metric is l2).
    model = LGBMRegressor(
        n_estimators=5000, random_seed=42, learning_rate=0.005, depth=10, loss_function='MultiRMSE',
        colsample_bytree=1, max_depth=17, min_child_samples=17, min_child_weight=14, num_leaves=63,
        reg_alpha=49.75855787791481, reg_lambda=0.026935271560602856, subsample=1.0,
    )
    model.fit(train_x, train_y,
              eval_set=[(train_x, train_y), (val_x, val_y)],
              early_stopping_rounds=50, verbose=100)
    lgb1_models[fold] = model
    print(f'================================================================================\n\n')
    #2364



Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 16952.6	valid_1's l2: 17107.2
[200]	training's l2: 9245.21	valid_1's l2: 9551.91
[300]	training's l2: 6013.8	valid_1's l2: 6437.25
[400]	training's l2: 4562.8	valid_1's l2: 5054.99
[500]	training's l2: 3811.96	valid_1's l2: 4389.5
[600]	training's l2: 3332.44	valid_1's l2: 3977.49
[700]	training's l2: 3025.76	valid_1's l2: 3742.32
[800]	training's l2: 2785.18	valid_1's l2: 3537.93
[900]	training's l2: 2604.97	valid_1's l2: 3418.26
[1000]	training's l2: 2448.42	valid_1's l2: 3317.05
[1100]	training's l2: 2341.12	valid_1's l2: 3233.69
[1200]	training's l2: 2245.14	valid_1's l2: 3166.08
[1300]	training's l2: 2152.17	valid_1's l2: 3112.7
[1400]	training's l2: 2068.38	valid_1's l2: 3048.41
[1500]	training's l2: 2003.07	valid_1's l2: 2993.17
[1600]	training's l2: 1945.74	valid_1's l2: 2950.43
[1700]	training's l2: 1897.28	valid_1's l2: 2917.6
[1800]	training's l2: 1846.5	valid_1's l2: 2892.55
[1900]	training's

### 모델저장

In [None]:
# `sklearn.externals.joblib` no longer exists in current scikit-learn releases;
# use the standalone joblib package and fall back only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Write one pickle per fold; file names are numbered from 1.
for fold_no, fold_model in lgb1_models.items():
    joblib.dump(fold_model, './drive/MyDrive/model/dangjin_floating_{0}.pkl'.format(fold_no + 1))

### cat boost 사용 x 

In [None]:
# Disabled CatBoost experiment for 'dangjin_floating': the code is wrapped in a
# string literal, so nothing below is executed.
'''train_x, train_y, val_x, val_y = train_datast(energy, dangjin_interpolated, target='dangjin_floating')
cat1 = CatBoostRegressor(n_estimators=5000 ,random_seed=42, learning_rate=0.005,depth = 10,loss_function='MultiRMSE')
cat1.fit(train_x, train_y,
                  eval_set=[(train_x, train_y), (val_x, val_y)],
                  early_stopping_rounds=100,verbose=100)'''

### lgbm 사용

In [None]:
# Hold-out split for the 'dangjin_floating' target.
train_x, train_y, val_x, val_y = train_datast(energy, dangjin_interpolated, target='dangjin_floating')

# NOTE(review): `depth` and `loss_function` mirror the CatBoost arguments used
# elsewhere in this notebook; confirm LightGBM honours them.
dangjin_floating_model = LGBMRegressor(
    n_estimators=5000,
    random_seed=42,
    learning_rate=0.005,
    depth=10,
    loss_function='MultiRMSE',
    colsample_bytree=1,
    max_depth=17,
    min_child_samples=17,
    min_child_weight=14,
    num_leaves=63,
    reg_alpha=49.75855787791481,
    reg_lambda=0.026935271560602856,
    subsample=1.0,
)
# Both the training and the validation split are evaluated while fitting.
dangjin_floating_model.fit(
    train_x,
    train_y,
    eval_set=[(train_x, train_y), (val_x, val_y)],
    early_stopping_rounds=100,
    verbose=100,
)

Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 17468.2	valid_1's l2: 12314.7
[200]	training's l2: 9505.46	valid_1's l2: 7518.53
[300]	training's l2: 6160.28	valid_1's l2: 5552.15
[400]	training's l2: 4621.98	valid_1's l2: 4734.6
[500]	training's l2: 3850.62	valid_1's l2: 4397.85
[600]	training's l2: 3360.87	valid_1's l2: 4265.15
[700]	training's l2: 3025.62	valid_1's l2: 4199.41
[800]	training's l2: 2769.75	valid_1's l2: 4179.53
[900]	training's l2: 2594.36	valid_1's l2: 4162.22
[1000]	training's l2: 2454.82	valid_1's l2: 4151.01
[1100]	training's l2: 2326.93	valid_1's l2: 4141.93
[1200]	training's l2: 2230.02	valid_1's l2: 4123.69
[1300]	training's l2: 2152.63	valid_1's l2: 4103.76
[1400]	training's l2: 2065.5	valid_1's l2: 4105.27
Early stopping, best iteration is:
[1309]	training's l2: 2144.3	valid_1's l2: 4102.56


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1,
              depth=10, importance_type='split', learning_rate=0.005,
              loss_function='MultiRMSE', max_depth=17, min_child_samples=17,
              min_child_weight=14, min_split_gain=0.0, n_estimators=5000,
              n_jobs=-1, num_leaves=63, objective=None, random_seed=42,
              random_state=None, reg_alpha=49.75855787791481,
              reg_lambda=0.026935271560602856, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)

## 당진 자재 창고 태양광 예측 모델 학습

In [None]:
# Search bounds (low, high) given to BayesianOptimization for each LightGBM hyper-parameter.
bayesian_params = {
    'max_depth': (5, 20),
    'num_leaves': (24, 64),
    'min_child_samples': (10, 200),
    'min_child_weight': (1, 50),
    'subsample': (0.5, 1.0),
    'colsample_bytree': (0.5, 1.0),
    'max_bin': (10, 500),
    'reg_lambda': (0.001, 10),
    'reg_alpha': (0.01, 50),
}


def lgb_eval(max_depth, num_leaves, min_child_samples, min_child_weight, subsample,
             colsample_bytree, max_bin, reg_lambda, reg_alpha):
    """Objective for the optimizer: negated best mean CV RMSE of one configuration.

    The optimizer passes every value as a float, so integer hyper-parameters
    are rounded and the others are clamped before 5-fold ``lgb.cv`` is run on
    the notebook-global ``train_data``.

    Returns:
        The maximum of ``-1 * rmse-mean`` over the boosting rounds, i.e. the
        negated lowest mean RMSE, so that maximizing it minimizes the error.
    """
    params = {
        "n_estimators": 1000, "learning_rate": 0.01,
        # Values arrive as floats; integer hyper-parameters are converted to int.
        'max_depth': int(round(max_depth)),
        'num_leaves': int(round(num_leaves)),
        'min_child_samples': int(round(min_child_samples)),
        'min_child_weight': int(round(min_child_weight)),
        'subsample': max(min(subsample, 1), 0),
        'colsample_bytree': max(min(colsample_bytree, 1), 0),
        'max_bin': max(int(round(max_bin)), 10),
        'reg_lambda': max(reg_lambda, 0),
        'reg_alpha': max(reg_alpha, 0),
        "metric": 'rmse'
    }
    cv_result = lgb.cv(params, train_data, nfold=5, seed=0, verbose_eval=200, stratified=False)
    return (-1.0 * np.array(cv_result['rmse-mean'])).max()


train_x, train_y, val_x, val_y = train_datast(energy, dangjin_interpolated, target='dangjin_warehouse')
train_data = lgb.Dataset(train_x, train_y)
lgbBO = BayesianOptimization(lgb_eval, bayesian_params, random_state=42)

lgbBO.maximize(init_points=5, n_iter=25)

# Collect the score of every evaluated point. A comprehension is used so the
# notebook-global name `target` (used as the column name by the training
# cells) is no longer overwritten with a float score.
target_list = [result['target'] for result in lgbBO.res]
print(target_list)

# Position of the highest score.
best_index = np.argmax(np.array(target_list))
print('maximum target index:', best_index)

# Full optimizer record (score and parameters) at that position.
max_dict = lgbBO.res[best_index]
print(max_dict)

In [None]:
# 10-fold training of CatBoost regressors for the 'dangjin_warehouse' target.
random.seed(42)
cat2_models = {}  # fold number (0-based) -> fitted model
target = 'dangjin_warehouse'

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
train = kfold_datast(energy, dangjin_interpolated, target=target)

# (train indices, validation indices) of every fold.
folds = [
    (train_idx, valid_idx)
    for train_idx, valid_idx in skf.split(train, train['dangjin_warehouse'])
]

# Model inputs and labels, separated once up front.
features = train.drop([target], axis=1)
labels = train[target]

for fold in range(10):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    train_x = features.iloc[train_idx].values
    val_x = features.iloc[valid_idx].values
    train_y = labels[train_idx].values
    val_y = labels[valid_idx].values

    cat = CatBoostRegressor(
        n_estimators=5000,
        random_seed=42,
        learning_rate=0.005,
        depth=10,
        loss_function='MultiRMSE',
    )
    cat.fit(
        train_x,
        train_y,
        eval_set=[(train_x, train_y), (val_x, val_y)],
        early_stopping_rounds=100,
        verbose=100,
    )
    cat2_models[fold] = cat
    print(f'================================================================================\n\n')



0:	learn: 145.7419910	test: 145.7419910	test1: 145.6543400	best: 145.6543400 (0)	total: 80.5ms	remaining: 6m 42s
100:	learn: 102.5902427	test: 102.5902427	test1: 103.1662505	best: 103.1662505 (100)	total: 2.35s	remaining: 1m 54s
200:	learn: 78.5393241	test: 78.5393241	test1: 79.7530000	best: 79.7530000 (200)	total: 4.6s	remaining: 1m 49s
300:	learn: 65.5506712	test: 65.5506712	test1: 67.3306372	best: 67.3306372 (300)	total: 6.88s	remaining: 1m 47s
400:	learn: 58.5652532	test: 58.5652532	test1: 60.8296179	best: 60.8296179 (400)	total: 9.13s	remaining: 1m 44s
500:	learn: 54.6415664	test: 54.6415664	test1: 57.2948176	best: 57.2948176 (500)	total: 11.4s	remaining: 1m 42s
600:	learn: 52.1372775	test: 52.1372775	test1: 55.1789102	best: 55.1789102 (600)	total: 13.7s	remaining: 1m 40s
700:	learn: 50.3467585	test: 50.3467585	test1: 53.7532371	best: 53.7532371 (700)	total: 15.9s	remaining: 1m 37s
800:	learn: 49.0042941	test: 49.0042941	test1: 52.7443760	best: 52.7443760 (800)	total: 18.2s	remain

### 모델 저장

In [None]:
# `sklearn.externals.joblib` no longer exists in current scikit-learn releases;
# use the standalone joblib package and fall back only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Write one pickle per fold; file names are numbered from 1.
for fold_no, fold_model in cat2_models.items():
    joblib.dump(fold_model, './drive/MyDrive/model/dangjin_warehouse{0}.pkl'.format(fold_no + 1))

### cat 사용

In [None]:
# Hold-out split for the 'dangjin_warehouse' target.
train_x, train_y, val_x, val_y = train_datast(energy, dangjin_interpolated, target='dangjin_warehouse')

cat2 = CatBoostRegressor(
    n_estimators=5000,
    random_seed=42,
    learning_rate=0.005,
    depth=10,
    loss_function='MultiRMSE',
)
# Both the training and the validation split are evaluated while fitting.
cat2.fit(
    train_x,
    train_y,
    eval_set=[(train_x, train_y), (val_x, val_y)],
    early_stopping_rounds=100,
    verbose=100,
)

0:	learn: 147.2777493	test: 147.2777493	test1: 127.8906322	best: 127.8906322 (0)	total: 75.1ms	remaining: 6m 15s
100:	learn: 103.6944464	test: 103.6944464	test1: 91.7248245	best: 91.7248245 (100)	total: 2.45s	remaining: 1m 59s
200:	learn: 79.2017736	test: 79.2017736	test1: 72.3140566	best: 72.3140566 (200)	total: 4.76s	remaining: 1m 53s
300:	learn: 65.9537681	test: 65.9537681	test1: 62.8602773	best: 62.8602773 (300)	total: 7.06s	remaining: 1m 50s
400:	learn: 58.8730529	test: 58.8730529	test1: 58.3327560	best: 58.3327560 (400)	total: 9.37s	remaining: 1m 47s
500:	learn: 54.7668253	test: 54.7668253	test1: 56.4786361	best: 56.4786361 (500)	total: 11.7s	remaining: 1m 44s
600:	learn: 52.1889608	test: 52.1889608	test1: 55.4925947	best: 55.4925947 (600)	total: 14s	remaining: 1m 42s
700:	learn: 50.3383994	test: 50.3383994	test1: 54.9446819	best: 54.9446819 (700)	total: 16.3s	remaining: 1m 39s
800:	learn: 48.9941662	test: 48.9941662	test1: 54.7032289	best: 54.6956555 (792)	total: 18.6s	remaining

<catboost.core.CatBoostRegressor at 0x7f2909420650>


### lgbm 사용 X — bayesian optimization을 적용했지만 성능 하락

In [None]:
# Hyper-parameters recorded from an optimizer run. The line previously ended
# with an unmatched '}' (a SyntaxError); no score is recorded with it.
{'colsample_bytree': 0.7840043724584478, 'max_bin': 109.42082906265013, 'max_depth': 11.939485103471496, 'min_child_samples': 47.27436173622021, 'min_child_weight': 44.12615815161272, 'num_leaves': 61.168181706089655, 'reg_alpha': 7.967786769707356, 'reg_lambda': 1.4818125424587414, 'subsample': 0.835744966051806}

In [None]:
# Hold-out split for the 'dangjin_warehouse' target.
train_x, train_y, val_x, val_y = train_datast(energy, dangjin_interpolated, target='dangjin_warehouse')

# NOTE(review): `depth` and `loss_function` mirror the CatBoost arguments used
# elsewhere in this notebook; confirm LightGBM honours them.
dangjin_warehouse_model = LGBMRegressor(
    n_estimators=5000,
    random_seed=42,
    learning_rate=0.005,
    depth=8,
    loss_function='MultiRMSE',
    colsample_bytree=1,
    max_depth=17,
    min_child_samples=17,
    min_child_weight=14,
    num_leaves=63,
    reg_alpha=49.75855787791481,
    reg_lambda=0.026935271560602856,
    subsample=1.0,
)
# Both the training and the validation split are evaluated while fitting.
dangjin_warehouse_model.fit(
    train_x,
    train_y,
    eval_set=[(train_x, train_y), (val_x, val_y)],
    early_stopping_rounds=100,
    verbose=100,
)

Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 10057.3	valid_1's l2: 8089.53
[200]	training's l2: 5471.98	valid_1's l2: 4970.63
[300]	training's l2: 3539.38	valid_1's l2: 3813.67
[400]	training's l2: 2674.14	valid_1's l2: 3368.68
[500]	training's l2: 2229.72	valid_1's l2: 3195.32
[600]	training's l2: 1963	valid_1's l2: 3129.7
[700]	training's l2: 1785.8	valid_1's l2: 3105.45
[800]	training's l2: 1651.59	valid_1's l2: 3057.7
[900]	training's l2: 1547.15	valid_1's l2: 3032.77
[1000]	training's l2: 1460.65	valid_1's l2: 3021.56
[1100]	training's l2: 1380.61	valid_1's l2: 3024.25
Early stopping, best iteration is:
[1051]	training's l2: 1417.22	valid_1's l2: 3020.03


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1,
              depth=8, importance_type='split', learning_rate=0.005,
              loss_function='MultiRMSE', max_depth=17, min_child_samples=17,
              min_child_weight=14, min_split_gain=0.0, n_estimators=5000,
              n_jobs=-1, num_leaves=63, objective=None, random_seed=42,
              random_state=None, reg_alpha=49.75855787791481,
              reg_lambda=0.026935271560602856, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)

## 당진 태양광 예측 모델 학습

In [None]:
# Search bounds (low, high) given to BayesianOptimization for each LightGBM hyper-parameter.
bayesian_params = {
    'max_depth': (5, 20),
    'num_leaves': (24, 64),
    'min_child_samples': (10, 200),
    'min_child_weight': (1, 50),
    'subsample': (0.5, 1.0),
    'colsample_bytree': (0.5, 1.0),
    'max_bin': (10, 500),
    'reg_lambda': (0.001, 10),
    'reg_alpha': (0.01, 50),
}


def lgb_eval(max_depth, num_leaves, min_child_samples, min_child_weight, subsample,
             colsample_bytree, max_bin, reg_lambda, reg_alpha):
    """Objective for the optimizer: negated best mean CV RMSE of one configuration.

    The optimizer passes every value as a float, so integer hyper-parameters
    are rounded and the others are clamped before 5-fold ``lgb.cv`` is run on
    the notebook-global ``train_data``.

    Returns:
        The maximum of ``-1 * rmse-mean`` over the boosting rounds, i.e. the
        negated lowest mean RMSE, so that maximizing it minimizes the error.
    """
    params = {
        "n_estimators": 1000, "learning_rate": 0.01,
        # Values arrive as floats; integer hyper-parameters are converted to int.
        'max_depth': int(round(max_depth)),
        'num_leaves': int(round(num_leaves)),
        'min_child_samples': int(round(min_child_samples)),
        'min_child_weight': int(round(min_child_weight)),
        'subsample': max(min(subsample, 1), 0),
        'colsample_bytree': max(min(colsample_bytree, 1), 0),
        'max_bin': max(int(round(max_bin)), 10),
        'reg_lambda': max(reg_lambda, 0),
        'reg_alpha': max(reg_alpha, 0),
        "metric": 'rmse'
    }
    cv_result = lgb.cv(params, train_data, nfold=5, seed=0, verbose_eval=200, stratified=False)
    return (-1.0 * np.array(cv_result['rmse-mean'])).max()


train_x, train_y, val_x, val_y = train_datast(energy, dangjin_interpolated, target='dangjin')
train_data = lgb.Dataset(train_x, train_y)
lgbBO = BayesianOptimization(lgb_eval, bayesian_params, random_state=42)

lgbBO.maximize(init_points=5, n_iter=25)

# Collect the score of every evaluated point. A comprehension is used so the
# notebook-global name `target` (used as the column name by the training
# cells) is no longer overwritten with a float score.
target_list = [result['target'] for result in lgbBO.res]
print(target_list)

# Position of the highest score.
best_index = np.argmax(np.array(target_list))
print('maximum target index:', best_index)

# Full optimizer record (score and parameters) at that position.
max_dict = lgbBO.res[best_index]
print(max_dict)

In [None]:
# 10-fold training of CatBoost regressors for the 'dangjin' target.
random.seed(42)
cat3_models = {}  # fold number (0-based) -> fitted model
target = 'dangjin'

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
train = kfold_datast(energy, dangjin_interpolated, target=target)

# (train indices, validation indices) of every fold.
folds = [
    (train_idx, valid_idx)
    for train_idx, valid_idx in skf.split(train, train['dangjin'])
]

# Model inputs and labels, separated once up front.
features = train.drop([target], axis=1)
labels = train[target]

for fold in range(10):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    train_x = features.iloc[train_idx].values
    val_x = features.iloc[valid_idx].values
    train_y = labels[train_idx].values
    val_y = labels[valid_idx].values

    cat = CatBoostRegressor(
        n_estimators=5000,
        random_seed=42,
        learning_rate=0.005,
        depth=10,
        loss_function='MultiRMSE',
    )
    cat.fit(
        train_x,
        train_y,
        eval_set=[(train_x, train_y), (val_x, val_y)],
        early_stopping_rounds=50,
        verbose=100,
    )
    cat3_models[fold] = cat
    print(f'================================================================================\n\n')



0:	learn: 219.5145900	test: 219.5145900	test1: 220.0727538	best: 220.0727538 (0)	total: 24.1ms	remaining: 2m
100:	learn: 153.1183740	test: 153.1183740	test1: 153.9422478	best: 153.9422478 (100)	total: 2.48s	remaining: 2m
200:	learn: 115.7346251	test: 115.7346251	test1: 117.0363390	best: 117.0363390 (200)	total: 5.44s	remaining: 2m 9s
300:	learn: 95.5252689	test: 95.5252689	test1: 97.4526104	best: 97.4526104 (300)	total: 9.82s	remaining: 2m 33s
400:	learn: 84.8733626	test: 84.8733626	test1: 87.2673274	best: 87.2673274 (400)	total: 12.7s	remaining: 2m 26s
500:	learn: 79.1199229	test: 79.1199229	test1: 81.9631548	best: 81.9631548 (500)	total: 15.1s	remaining: 2m 15s
600:	learn: 75.7200842	test: 75.7200842	test1: 79.0088483	best: 79.0088483 (600)	total: 17.5s	remaining: 2m 7s
700:	learn: 73.4349653	test: 73.4349653	test1: 77.1386720	best: 77.1386720 (700)	total: 19.8s	remaining: 2m 1s
800:	learn: 71.6572808	test: 71.6572808	test1: 75.8225932	best: 75.8225932 (800)	total: 22.2s	remaining: 1

### 모델 저장

In [None]:
# `sklearn.externals.joblib` no longer exists in current scikit-learn releases;
# use the standalone joblib package and fall back only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Write one pickle per fold; file names are numbered from 1.
for fold_no, fold_model in cat3_models.items():
    joblib.dump(fold_model, './drive/MyDrive/model/dangjin{0}.pkl'.format(fold_no + 1))

### cat 사용

In [None]:
# Hold-out split for the 'dangjin' target.
train_x, train_y, val_x, val_y = train_datast(energy, dangjin_interpolated, target='dangjin')

cat3 = CatBoostRegressor(
    n_estimators=5000,
    random_seed=42,
    learning_rate=0.005,
    depth=10,
    loss_function='MultiRMSE',
)
# Both the training and the validation split are evaluated while fitting.
cat3.fit(
    train_x,
    train_y,
    eval_set=[(train_x, train_y), (val_x, val_y)],
    early_stopping_rounds=100,
    verbose=100,
)

0:	learn: 221.6627612	test: 221.6627612	test1: 195.8197076	best: 195.8197076 (0)	total: 24.2ms	remaining: 2m 1s
100:	learn: 154.3536660	test: 154.3536660	test1: 140.5623408	best: 140.5623408 (100)	total: 2.38s	remaining: 1m 55s
200:	learn: 116.2758635	test: 116.2758635	test1: 111.7606646	best: 111.7606646 (200)	total: 4.71s	remaining: 1m 52s
300:	learn: 95.6359842	test: 95.6359842	test1: 97.2736954	best: 97.2736954 (300)	total: 7s	remaining: 1m 49s
400:	learn: 84.8683633	test: 84.8683633	test1: 90.2021170	best: 90.2021170 (400)	total: 9.29s	remaining: 1m 46s
500:	learn: 78.9467163	test: 78.9467163	test1: 86.7140935	best: 86.7140935 (500)	total: 11.6s	remaining: 1m 44s
600:	learn: 75.4799602	test: 75.4799602	test1: 85.0015122	best: 85.0015122 (600)	total: 13.9s	remaining: 1m 41s
700:	learn: 73.1411108	test: 73.1411108	test1: 84.1336129	best: 84.1215786 (699)	total: 16.2s	remaining: 1m 39s
800:	learn: 71.4765393	test: 71.4765393	test1: 83.5197950	best: 83.5197950 (800)	total: 18.5s	remai

<catboost.core.CatBoostRegressor at 0x7f290a0d7210>


### lgbm 사용 X — bayesian optimization을 적용했지만 성능 하락

In [None]:
# Optimizer record kept for reference: the score ('target') and the
# hyper-parameters ('params') that produced it.
{
    'target': -67.37835160276464,
    'params': {
        'colsample_bytree': 0.8803371255183132,
        'max_bin': 366.3312849753787,
        'max_depth': 14.426679148899519,
        'min_child_samples': 24.334988996735756,
        'min_child_weight': 14.85309783894488,
        'num_leaves': 41.115245176939176,
        'reg_alpha': 1.1811080746620453,
        'reg_lambda': 9.672295351116277,
        'subsample': 0.6051769440265253,
    },
}

In [None]:
# Hold-out split for the 'dangjin' target.
train_x, train_y, val_x, val_y = train_datast(energy, dangjin_interpolated, target='dangjin')

# NOTE(review): these values look like rounded entries of the dict recorded in
# the warehouse section (e.g. max_bin 109, num_leaves 61), not of the record
# with target -67.378 in this section; confirm which one was intended.
# NOTE(review): `depth` and `loss_function` mirror the CatBoost arguments used
# elsewhere in this notebook; confirm LightGBM honours them.
dangjin_model = LGBMRegressor(
    n_estimators=5000,
    random_seed=42,
    learning_rate=0.005,
    depth=8,
    loss_function='MultiRMSE',
    colsample_bytree=0.7840043724584478,
    max_bin=109,
    num_leaves=61,
    max_depth=12,
    min_child_samples=47,
    min_child_weight=44,
    reg_alpha=7.96,
    reg_lambda=1.48,
    subsample=0.835744966051806,
)
# Both the training and the validation split are evaluated while fitting.
dangjin_model.fit(
    train_x,
    train_y,
    eval_set=[(train_x, train_y), (val_x, val_y)],
    early_stopping_rounds=100,
    verbose=100,
)

Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 23972.9	valid_1's l2: 20038.7
[200]	training's l2: 13505.7	valid_1's l2: 12722.1
[300]	training's l2: 8680.92	valid_1's l2: 9453.09
[400]	training's l2: 6494.53	valid_1's l2: 8076.26
[500]	training's l2: 5396.67	valid_1's l2: 7532.7
[600]	training's l2: 4788.04	valid_1's l2: 7312.69
[700]	training's l2: 4394.52	valid_1's l2: 7184.58
[800]	training's l2: 4115.53	valid_1's l2: 7123.25
[900]	training's l2: 3903.11	valid_1's l2: 7085.49
[1000]	training's l2: 3718.13	valid_1's l2: 7095.42
Early stopping, best iteration is:
[912]	training's l2: 3878.86	valid_1's l2: 7080.32


LGBMRegressor(boosting_type='gbdt', class_weight=None,
              colsample_bytree=0.7840043724584478, depth=8,
              importance_type='split', learning_rate=0.005,
              loss_function='MultiRMSE', max_bin=109, max_depth=12,
              min_child_samples=47, min_child_weight=44, min_split_gain=0.0,
              n_estimators=5000, n_jobs=-1, num_leaves=61, objective=None,
              random_seed=42, random_state=None, reg_alpha=7.96,
              reg_lambda=1.48, silent=True, subsample=0.835744966051806,
              subsample_for_bin=200000, subsample_freq=0)

## 울산 태양광 예측 모델 학습

In [None]:
bayesian_params = {
    'max_depth': (5, 20), 
    'num_leaves': (24, 64), 
    'min_child_samples': (10, 200), 
    'min_child_weight':(1, 50),
    'subsample':(0.5, 1.0),
    'colsample_bytree': (0.5, 1.0),
    'max_bin':(10, 500),
    'reg_lambda':(0.001, 10),
    'reg_alpha': (0.01, 50) 
}

def lgb_eval(max_depth, num_leaves, min_child_samples, min_child_weight, subsample,
             colsample_bytree, max_bin, reg_lambda, reg_alpha):
    """Objective for the Bayesian optimiser: 5-fold LightGBM CV on ``train_data``.

    The optimiser supplies every argument as a float, so integer
    hyper-parameters are rounded and the fractional ones are clamped to
    [0, 1] before being handed to ``lgb.cv``. ``train_data`` is read from
    the enclosing (global) scope.

    Returns the largest value of ``-rmse-mean`` over the boosting rounds,
    i.e. the negated best mean RMSE, so that maximising it minimises RMSE.
    """
    def as_int(value):
        # Integer hyper-parameters arrive as floats from the optimiser.
        return int(round(value))

    def clamp_unit(value):
        # Keep sampling fractions inside [0, 1].
        return max(min(value, 1), 0)

    params = {
        "n_estimators": 1000,
        "learning_rate": 0.01,
        'max_depth': as_int(max_depth),
        'num_leaves': as_int(num_leaves),
        'min_child_samples': as_int(min_child_samples),
        'min_child_weight': as_int(min_child_weight),
        'subsample': clamp_unit(subsample),
        'colsample_bytree': clamp_unit(colsample_bytree),
        'max_bin': max(as_int(max_bin), 10),
        'reg_lambda': max(reg_lambda, 0),
        'reg_alpha': max(reg_alpha, 0),
        "metric": 'rmse',
    }
    cv_result = lgb.cv(params, train_data, nfold=5, seed=0, verbose_eval=200, stratified=False)
    negated_rmse = -1.0 * np.array(cv_result['rmse-mean'])
    return negated_rmse.max()

# Build the Ulsan training split; lgb_eval reads `train_data` as a global.
train_x, train_y, val_x, val_y = train_datast(energy, ulsan_interpolated, target='ulsan')
train_data = lgb.Dataset(train_x, train_y)

lgbB1 = BayesianOptimization(lgb_eval, bayesian_params, random_state=42)

# 5 initial probes followed by 25 optimisation iterations.
lgbB1.maximize(init_points=5, n_iter=25)

####################################################
# Collect the objective value ("target") of every evaluated point.
target_list = [result['target'] for result in lgbB1.res]
print(target_list)

# Position of the largest target value; computed once and reused below.
best_idx = np.argmax(np.array(target_list))
print('maximum target index:', best_idx)

#################################################################
# Look up the evaluated point (target + params) with the largest target.
max_dict = lgbB1.res[best_idx]
print(max_dict)

In [None]:
# Train one CatBoost regressor per fold for the Ulsan target and keep all ten
# models in `cat4_models` (keys 0..9).
random.seed(42)
cat4_models = {}
target = 'ulsan'

# NOTE(review): StratifiedKFold stratifies on class labels and scikit-learn
# rejects continuous targets — confirm train[target] is suitable, or use KFold.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Fix: the Ulsan target must be paired with the Ulsan forecast features. The
# original passed `dangjin_interpolated` here, while the single-model cell and
# the inference cells both use Ulsan forecasts for this target.
train = kfold_datast(energy, ulsan_interpolated, target=target)

features = train.drop([target], axis=1)
labels = train[target]

# Each entry is a (train_idx, valid_idx) pair of positional index arrays.
folds = list(skf.split(train, labels))

for fold, (train_idx, valid_idx) in enumerate(folds):
    print(f'===================================={fold+1}============================================')
    # The split yields positions, so index with .iloc; label-based
    # `labels[train_idx]` is only correct when the index is 0..n-1.
    train_x, val_x = features.iloc[train_idx].values, features.iloc[valid_idx].values
    train_y, val_y = labels.iloc[train_idx].values, labels.iloc[valid_idx].values
    cat = CatBoostRegressor(n_estimators=5000, random_seed=42, learning_rate=0.005,
                            depth=10, loss_function='MultiRMSE')
    cat.fit(train_x, train_y,
            eval_set=[(train_x, train_y), (val_x, val_y)],
            early_stopping_rounds=50, verbose=100)
    cat4_models[fold] = cat
    print('================================================================================\n\n')



0:	learn: 100.5878403	test: 100.5878403	test1: 100.8131765	best: 100.8131765 (0)	total: 23.8ms	remaining: 1m 59s
100:	learn: 72.0170838	test: 72.0170838	test1: 72.6069122	best: 72.6069122 (100)	total: 2.38s	remaining: 1m 55s
200:	learn: 56.4072071	test: 56.4072071	test1: 57.5670471	best: 57.5670471 (200)	total: 6.46s	remaining: 2m 34s
300:	learn: 48.2702535	test: 48.2702535	test1: 49.9878460	best: 49.9878460 (300)	total: 10.2s	remaining: 2m 39s
400:	learn: 43.9877746	test: 43.9877746	test1: 46.1504309	best: 46.1504309 (400)	total: 12.6s	remaining: 2m 24s
500:	learn: 41.5582378	test: 41.5582378	test1: 44.0562000	best: 44.0562000 (500)	total: 14.9s	remaining: 2m 13s
600:	learn: 40.0135269	test: 40.0135269	test1: 42.7995297	best: 42.7995297 (600)	total: 17.3s	remaining: 2m 6s
700:	learn: 38.8846599	test: 38.8846599	test1: 41.8638786	best: 41.8638786 (700)	total: 19.6s	remaining: 2m
800:	learn: 38.0255983	test: 38.0255983	test1: 41.2198795	best: 41.2198795 (800)	total: 22s	remaining: 1m 55

### 모델 저장


In [None]:
# `sklearn.externals.joblib` no longer exists in current scikit-learn, so
# import the standalone package first and fall back only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the ten fold models; file names are numbered 1..10.
for i in range(10):
    joblib.dump(cat4_models[i],'./drive/MyDrive/model/ulsan_{0}.pkl'.format(i+1))

## CatBoost 사용

In [None]:
# Single CatBoost model for Ulsan, trained on the train split and monitored
# on both the train and validation sets with early stopping.
train_x, train_y, val_x, val_y = train_datast(energy, ulsan_interpolated, target='ulsan')
cat4 = CatBoostRegressor(
    n_estimators=5000,
    random_seed=42,
    learning_rate=0.005,
    depth=10,
    loss_function='MultiRMSE',
)
cat4.fit(
    train_x,
    train_y,
    eval_set=[(train_x, train_y), (val_x, val_y)],
    early_stopping_rounds=100,
    verbose=100,
)

0:	learn: 100.9963701	test: 100.9963701	test1: 96.1193850	best: 96.1193850 (0)	total: 24.2ms	remaining: 2m 1s
100:	learn: 70.2557123	test: 70.2557123	test1: 65.9418300	best: 65.9418300 (100)	total: 2.3s	remaining: 1m 51s
200:	learn: 52.9166766	test: 52.9166766	test1: 48.9462749	best: 48.9462749 (200)	total: 4.6s	remaining: 1m 49s
300:	learn: 43.6255030	test: 43.6255030	test1: 39.6344758	best: 39.6344758 (300)	total: 6.9s	remaining: 1m 47s
400:	learn: 38.5923310	test: 38.5923310	test1: 34.3495232	best: 34.3495232 (400)	total: 9.2s	remaining: 1m 45s
500:	learn: 35.7032955	test: 35.7032955	test1: 31.5021386	best: 31.5021386 (500)	total: 11.5s	remaining: 1m 43s
600:	learn: 33.9247382	test: 33.9247382	test1: 29.8900932	best: 29.8900932 (600)	total: 13.8s	remaining: 1m 40s
700:	learn: 32.6665984	test: 32.6665984	test1: 28.8877333	best: 28.8877333 (700)	total: 16.1s	remaining: 1m 38s
800:	learn: 31.7384537	test: 31.7384537	test1: 28.2436411	best: 28.2436411 (800)	total: 18.3s	remaining: 1m 36

<catboost.core.CatBoostRegressor at 0x7f290a60a890>


### lgbm 사용 X (bayesian optimizer를 적용했지만 성능 하락)

In [None]:
# LightGBM model for Ulsan using the hyper-parameters found by the Bayesian search.
train_x, train_y, val_x, val_y = train_datast(energy, ulsan_interpolated, target='ulsan')

# The original call also passed `depth=8` and `loss_function='MultiRMSE'`.
# Those are CatBoost arguments that LightGBM does not recognise, so they are
# dropped; tree depth is controlled by `max_depth`.
ulsan_model = LGBMRegressor(
    n_estimators=5000,
    random_seed=42,
    learning_rate=0.005,
    colsample_bytree=1,
    max_depth=17,
    min_child_samples=17,
    min_child_weight=14,
    num_leaves=63,
    reg_alpha=49.75855787791481,
    reg_lambda=0.026935271560602856,
    subsample=1.0,
)
ulsan_model.fit(train_x, train_y,
                eval_set=[(train_x, train_y), (val_x, val_y)],
                early_stopping_rounds=100, verbose=100)

## private 제출

In [None]:
import pandas as pd
import urllib
import urllib.request
import json

In [None]:
def private(nx,ny,fcst_day,date):
    """Fetch the village forecast issued at 17:00 on ``fcst_day`` as 24 hourly rows.

    Parameters
    ----------
    nx, ny : str
        Forecast grid coordinates, passed to the API unchanged.
    fcst_day : str
        Value for the API's ``base_date`` parameter (the forecast issue date).
    date : str
        Used only to label the rows as ``'{date} {hour}:00'`` for hours 0..23.

    Returns
    -------
    pandas.DataFrame
        The first 24 rows, holding ``Forecast_time`` plus whichever of
        Humidity / Temperature / Cloud / WindDirection / WindSpeed appeared
        in the response, with gaps filled by ``DataFrame.interpolate()``.
    """
    # Import both submodules explicitly instead of relying on urllib.request
    # having pulled in urllib.parse as a side effect.
    import urllib.parse
    import urllib.request

    url = 'http://apis.data.go.kr/1360000/VilageFcstInfoService/getVilageFcst'

    # urlencode() percent-encodes keys and values itself, so the keys do not
    # need a separate quote_plus() call.
    queryParams = '?' + urllib.parse.urlencode(
        {
            # NOTE(review): this service key is a credential committed to source;
            # consider loading it from an environment variable or secret store.
            'ServiceKey': 'k4ddN+RdAKoBExDTI0PS+2M3QtLVZZkW7Bd8in1h4j5sgfrd2cyVRmYieYHcJNTTnPd+b7X8epYmZS5Ngj18Ww==',
            # 14 categories are returned per 3-hour step; fetch only as many
            # rows as the next day's 24-hour forecast needs.
            'numOfRows': '113',
            # The API offers JSON and XML.
            'dataType': 'JSON',
            # Forecast issue date; only the most recent day is served.
            'base_date': fcst_day,
            # Forecast issue time; valid values start at 02:00 in 3-hour steps.
            'base_time': '1700',
            # Grid x / y coordinates of the plant (listed in the Excel file
            # shipped with the API usage guide).
            'nx': nx,
            'ny': ny,
        }
    )

    # Close the HTTP response deterministically.
    with urllib.request.urlopen(url + queryParams) as http_response:
        response = json.loads(http_response.read())

    fcst_df = pd.DataFrame()
    fcst_df['Forecast_time'] = [f'{date} {hour}:00' for hour in range(24)]

    # API category code -> output column name.
    category_columns = {
        'REH': 'Humidity',
        'T3H': 'Temperature',
        'SKY': 'Cloud',
        'VEC': 'WindDirection',
        'WSD': 'WindSpeed',
    }

    # NOTE(review): the first kept forecast step is written to row 0 and each
    # later step three rows further on; confirm this lines up with the hour
    # labels in Forecast_time.
    row_idx = 0

    # The first 20 items are skipped, as in the original `if i > 19` check.
    for data in response['response']['body']['items']['item'][20:]:
        column = category_columns.get(data['category'])
        if column is None:
            continue  # category not used as a model feature

        fcst_df.loc[row_idx, column] = float(data['fcstValue'])
        log_fields = (
            f'category:{column},', data['category'],
            'baseTime:', data['baseTime'],
            ', fcstTime:', data['fcstTime'],
            ', fcstValue:', data['fcstValue'],
        )
        if column == 'WindSpeed':
            # WSD is treated as the last category of a forecast step: print a
            # blank separator and advance to the next 3-hourly row.
            print(*log_fields, '\n')
            row_idx += 3
        else:
            print(*log_fields)

    # Fill the hours between the 3-hourly values, then drop any rows that
    # were appended past hour 23.
    fcst_df = fcst_df.interpolate()
    fcst_df = fcst_df.iloc[:24]
    return fcst_df

In [None]:
# Forecast grid coordinates (nx, ny) of the two sites:
#dangjin = 53 , 44
#ulsan = 102, 83
#nx, ny = '53','44'

fcst_day='20210616'# today's date (forecast issue date, sent as base_date)
date = '2021-06-17'  # tomorrow's date (labels the 24 forecast rows)
day = 8 # original note: "June 8 = 1" — NOTE(review): meaning of this offset is unclear; confirm
ulsan = private('102' ,'83' ,fcst_day,date)
dangjin = private('53','44' ,fcst_day,date)


category:Humidity, REH baseTime: 1700 , fcstTime: 0300 , fcstValue: 90
category:Cloud, SKY baseTime: 1700 , fcstTime: 0300 , fcstValue: 4
category:Temperature, T3H baseTime: 1700 , fcstTime: 0300 , fcstValue: 19
category:WindDirection, VEC baseTime: 1700 , fcstTime: 0300 , fcstValue: 33
category:WindSpeed, WSD baseTime: 1700 , fcstTime: 0300 , fcstValue: 3.7 

category:Humidity, REH baseTime: 1700 , fcstTime: 0600 , fcstValue: 90
category:Cloud, SKY baseTime: 1700 , fcstTime: 0600 , fcstValue: 4
category:Temperature, T3H baseTime: 1700 , fcstTime: 0600 , fcstValue: 19
category:WindDirection, VEC baseTime: 1700 , fcstTime: 0600 , fcstValue: 27
category:WindSpeed, WSD baseTime: 1700 , fcstTime: 0600 , fcstValue: 3.3 

category:Humidity, REH baseTime: 1700 , fcstTime: 0900 , fcstValue: 85
category:Cloud, SKY baseTime: 1700 , fcstTime: 0900 , fcstValue: 4
category:Temperature, T3H baseTime: 1700 , fcstTime: 0900 , fcstValue: 20
category:WindDirection, VEC baseTime: 1700 , fcstTime: 0900 , 

In [None]:
def test_datast(fcst_df):
    start = '2021-06-17 0:00'
    end = '2021-06-17 23:00'
    
    start_idx = fcst_df[fcst_df['Forecast_time']==start].index[0]
    end_idx = fcst_df[fcst_df['Forecast_time']==end].index[0]
    
    test_df = fcst_df.loc[start_idx:end_idx, :].copy()
    
    test_df['date'] = test_df['Forecast_time'].str.split(' ').str[0]
    test_df['hour'] = test_df['Forecast_time'].str.split(' ').str[1].str.split(':').str[0].astype(int)
    
    test_df['year'] = test_df['date'].str.split('-').str[0].astype(int)
    test_df['month'] = test_df['date'].str.split('-').str[1].astype(int)
    test_df['day'] = test_df['date'].str.split('-').str[2].astype(int)
    
    test_df = test_df[['year', 'month', 'day', 'hour', 'Temperature', 'Humidity', 'WindSpeed', 'WindDirection', 'Cloud']]
    
    test_x = test_df.to_numpy()
    
    return test_x

In [None]:
# Build the model input matrices for the target day from the fetched forecasts.
dangjin_test = test_datast(dangjin)
ulsan_test = test_datast(ulsan)

## 각 발전소 발전량 추론

In [None]:
# Load the submission template; the cells below write into it by column position.
submission = pd.read_csv('./data/sample_submission.csv')

In [None]:
# `sklearn.externals.joblib` no longer exists in current scikit-learn, so
# import the standalone package first and fall back only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Load the ten fold models for dangjin_floating (files numbered 1..10, keys 0..9).
# NOTE(review): joblib.load unpickles its input; only load trusted files.
lgb1_models = {
    i: joblib.load('./drive/MyDrive/model/dangjin_floating_{0}.pkl'.format(i+1))
    for i in range(10)
}



In [None]:
# Average the ten fold models once. The inputs (dangjin_test) and the base
# rows submission.iloc[:24, 1] do not change while the day slots are filled
# (only rows >= 24*28 are written), so recomputing per day gave the same
# values 29 times over.
dangjin_floating_pred = submission.iloc[:24, 1].copy()
for fold in range(10):
    dangjin_floating_pred += lgb1_models[fold].predict(dangjin_test)/10
#dangjin_floating_pred += cat1_models[fold].predict(dangjin_test)/20
#dangjin_floating_pred += dangjin_floating_model.predict(dangjin_test)

# Write the same 24-hour forecast into every daily slot after the first 28 days.
# NOTE(review): range(1, 30) fills 29 slots; confirm whether a final slot
# beyond row 24*28 + 24*29 is meant to stay untouched.
for day in range(1,30):
    submission.iloc[24*28+24*(day-1):24*28+24*day, 1] = dangjin_floating_pred.to_list()

In [None]:
# Display the 24-hour dangjin_floating prediction computed above.
dangjin_floating_pred

0     38.635726
1     38.640497
2     38.640497
3      4.167042
4      3.796771
5      1.433166
6      2.166614
7     15.232325
8     47.207103
9     47.062288
10    47.378860
11    57.515075
12    66.634690
13    59.146267
14    27.181423
15    25.792464
16    24.048130
17    23.837229
18     5.538723
19     1.620031
20    -0.033543
21    -4.788303
22    -6.194315
23    -1.809115
Name: dangjin_floating, dtype: float64

In [None]:
# Load the ten fold models for dangjin_warehouse (files numbered 1..10, keys 0..9).
cat2_models = {
    fold: joblib.load('./drive/MyDrive/model/dangjin_warehouse{0}.pkl'.format(fold + 1))
    for fold in range(10)
}
cat2_models

{0: <catboost.core.CatBoostRegressor at 0x7f9a8353cf10>,
 1: <catboost.core.CatBoostRegressor at 0x7f9a83537d90>,
 2: <catboost.core.CatBoostRegressor at 0x7f9a8353ce90>,
 3: <catboost.core.CatBoostRegressor at 0x7f9a824e4790>,
 4: <catboost.core.CatBoostRegressor at 0x7f9a824e4c50>,
 5: <catboost.core.CatBoostRegressor at 0x7f9a824e4750>,
 6: <catboost.core.CatBoostRegressor at 0x7f9a824e4a50>,
 7: <catboost.core.CatBoostRegressor at 0x7f9a83534dd0>,
 8: <catboost.core.CatBoostRegressor at 0x7f9a824e4b10>,
 9: <catboost.core.CatBoostRegressor at 0x7f9a83545650>}

In [None]:
# Average the ten fold models once. The inputs (dangjin_test) and the base
# rows submission.iloc[:24, 2] do not change while the day slots are filled
# (only rows >= 24*28 are written), so the per-day recomputation was redundant.
dangjin_warehouse_pred = submission.iloc[:24, 2].copy()
for fold in range(10):
    dangjin_warehouse_pred += cat2_models[fold].predict(dangjin_test)/10
#dangjin_warehouse_pred += cat2.predict(dangjin_test)

# Write the same 24-hour forecast into every daily slot after the first 28 days.
for day in range(1,30):
    submission.iloc[24*28+24*(day-1):24*28+24*day, 2] = dangjin_warehouse_pred.to_list()

In [None]:
# Load the ten fold models for dangjin (files numbered 1..10, keys 0..9).
cat3_models = {
    fold: joblib.load('./drive/MyDrive/model/dangjin{0}.pkl'.format(fold + 1))
    for fold in range(10)
}
cat3_models

{0: <catboost.core.CatBoostRegressor at 0x7f9a8253dc50>,
 1: <catboost.core.CatBoostRegressor at 0x7f9a8253dbd0>,
 2: <catboost.core.CatBoostRegressor at 0x7f9a83544c10>,
 3: <catboost.core.CatBoostRegressor at 0x7f9a83534450>,
 4: <catboost.core.CatBoostRegressor at 0x7f9a82628850>,
 5: <catboost.core.CatBoostRegressor at 0x7f9a82628610>,
 6: <catboost.core.CatBoostRegressor at 0x7f9a82628450>,
 7: <catboost.core.CatBoostRegressor at 0x7f9a824e48d0>,
 8: <catboost.core.CatBoostRegressor at 0x7f9a826286d0>,
 9: <catboost.core.CatBoostRegressor at 0x7f9a8253dd10>}

In [None]:
# Average the ten fold models once. The inputs (dangjin_test) and the base
# rows submission.iloc[:24, 3] do not change while the day slots are filled
# (only rows >= 24*28 are written), so the per-day recomputation was redundant.
dangjin_pred = submission.iloc[:24, 3].copy()
for fold in range(10):
    dangjin_pred += cat3_models[fold].predict(dangjin_test)/10
#dangjin_pred += cat3.predict(dangjin_test)

# Write the same 24-hour forecast into every daily slot after the first 28 days.
for day in range(1,30):
    submission.iloc[24*28+24*(day-1):24*28+24*day, 3] = dangjin_pred.to_list()

In [None]:
# Load the ten fold models for ulsan (files numbered 1..10, keys 0..9).
cat4_models = {
    fold: joblib.load('./drive/MyDrive/model/ulsan_{0}.pkl'.format(fold + 1))
    for fold in range(10)
}
cat4_models

{0: <catboost.core.CatBoostRegressor at 0x7f9a8253d550>,
 1: <catboost.core.CatBoostRegressor at 0x7f9a8253db10>,
 2: <catboost.core.CatBoostRegressor at 0x7f9a8355b810>,
 3: <catboost.core.CatBoostRegressor at 0x7f9a83534050>,
 4: <catboost.core.CatBoostRegressor at 0x7f9a8355ba50>,
 5: <catboost.core.CatBoostRegressor at 0x7f9a83534650>,
 6: <catboost.core.CatBoostRegressor at 0x7f9a83534310>,
 7: <catboost.core.CatBoostRegressor at 0x7f9a825b6190>,
 8: <catboost.core.CatBoostRegressor at 0x7f9a825b6c50>,
 9: <catboost.core.CatBoostRegressor at 0x7f9a825b6210>}

In [None]:
# Average the ten fold models once. The inputs (ulsan_test) and the base rows
# submission.iloc[:24, 4] do not change while the day slots are filled (only
# rows >= 24*28 are written), so the per-day recomputation was redundant.
ulsan_pred = submission.iloc[:24, 4].copy()
for fold in range(10):
    #ulsan_pred += lgb4_models[fold].predict(ulsan_test)/20
    ulsan_pred += cat4_models[fold].predict(ulsan_test)/10
#ulsan_pred += cat4.predict(ulsan_test)

# Write the same 24-hour forecast into every daily slot after the first 28 days.
for day in range(1,30):
    submission.iloc[24*28+24*(day-1):24*28+24*day, 4] = ulsan_pred.to_list()

In [None]:
# Inspect the last 48 rows of column 4 (ulsan).
submission.iloc[-48:, 4]

1344     1.549593
1345     2.270297
1346     3.013953
1347     3.838883
1348     4.025745
1349     4.993543
1350     9.093555
1351    25.403615
1352    52.501692
1353    69.201106
1354    80.464603
1355    86.298162
1356    88.824341
1357    86.458819
1358    73.122279
1359    58.969188
1360    35.573585
1361    13.291083
1362     2.452554
1363     1.083316
1364     0.510781
1365    -0.584521
1366    -2.534282
1367    -2.665737
1368     0.000000
1369     0.000000
1370     0.000000
1371     0.000000
1372     0.000000
1373     0.000000
1374     0.000000
1375     0.000000
1376     0.000000
1377     0.000000
1378     0.000000
1379     0.000000
1380     0.000000
1381     0.000000
1382     0.000000
1383     0.000000
1384     0.000000
1385     0.000000
1386     0.000000
1387     0.000000
1388     0.000000
1389     0.000000
1390     0.000000
1391     0.000000
Name: ulsan, dtype: float64

In [None]:
# Show the 24-row slot for the current `day` (whatever value earlier cells left in it).
submission.iloc[24*28+24*(day-1):24*28+24*day,:]

Unnamed: 0,time,dangjin_floating,dangjin_warehouse,dangjin,ulsan
1344,2021-07-07 01:00:00,36.77108,18.735408,21.104692,9.792052
1345,2021-07-07 02:00:00,36.77108,18.739366,21.114555,9.727772
1346,2021-07-07 03:00:00,36.751588,18.801735,21.257653,9.653187
1347,2021-07-07 04:00:00,1.426765,0.653759,4.638194,2.282487
1348,2021-07-07 05:00:00,2.001451,1.724496,5.53551,2.898698
1349,2021-07-07 06:00:00,2.058491,2.593909,6.463861,4.319726
1350,2021-07-07 07:00:00,10.436691,5.791048,8.492165,8.203019
1351,2021-07-07 08:00:00,34.393841,17.653872,21.71182,24.878586
1352,2021-07-07 09:00:00,76.819238,51.890587,71.064772,52.821271
1353,2021-07-07 10:00:00,117.865206,84.465371,118.497016,71.074429


In [None]:
# Show the 24-row slot for day 9, i.e. rows 24*28 + 24*8 = 864 onward.
day= 9
submission.iloc[24*28+24*(day-1):24*28+24*day,:]

Unnamed: 0,time,dangjin_floating,dangjin_warehouse,dangjin,ulsan
864,2021-06-17 01:00:00,38.635726,18.01751,20.105727,1.549593
865,2021-06-17 02:00:00,38.640497,17.964849,19.963319,2.270297
866,2021-06-17 03:00:00,38.640497,18.043539,20.006648,3.013953
867,2021-06-17 04:00:00,4.167042,3.475805,7.842921,3.838883
868,2021-06-17 05:00:00,3.796771,3.321273,7.420531,4.025745
869,2021-06-17 06:00:00,1.433166,2.533738,6.074404,4.993543
870,2021-06-17 07:00:00,2.166614,1.229911,2.355794,9.093555
871,2021-06-17 08:00:00,15.232325,5.531541,5.124246,25.403615
872,2021-06-17 09:00:00,47.207103,29.024194,35.668597,52.501692
873,2021-06-17 10:00:00,47.062288,47.483223,61.877877,69.201106


In [None]:
# Write the submission to '<fcst_day>.csv' without the index column.
submission.to_csv(fcst_day+'.csv',index=False)

## 테스트 데이터 전처리

## 제출

In [None]:
def test_datast(fcst_df):
    start = '2021-02-01 00:00:00'
    end = '2021-02-28 23:00:00'
    
    start_idx = fcst_df[fcst_df['Forecast_time']==start].index[0]
    end_idx = fcst_df[fcst_df['Forecast_time']==end].index[0]
    
    test_df = fcst_df.loc[start_idx:end_idx, :].copy()
    
    test_df['date'] = test_df['Forecast_time'].str.split(' ').str[0]
    test_df['hour'] = test_df['Forecast_time'].str.split(' ').str[1].str.split(':').str[0].astype(int)
    
    test_df['year'] = test_df['date'].str.split('-').str[0].astype(int)
    test_df['month'] = test_df['date'].str.split('-').str[1].astype(int)
    test_df['day'] = test_df['date'].str.split('-').str[2].astype(int)
    
    test_df = test_df[['year', 'month', 'day', 'hour', 'Temperature', 'Humidity', 'WindSpeed', 'WindDirection', 'Cloud']]
    
    test_x = test_df.to_numpy()
    
    return test_x

In [None]:
# Build the model input matrices from the interpolated forecast tables.
dangjin_test = test_datast(dangjin_interpolated)
ulsan_test = test_datast(ulsan_interpolated)

In [None]:
# Reload a fresh submission template before filling the first 24*28 rows.
submission = pd.read_csv('./data/sample_submission.csv')

In [None]:
# Work on a copy so the in-place `+=` below cannot write through to
# `submission` via a shared view; the result is assigned back explicitly.
# NOTE(review): the base values are read from `submission`, so re-running this
# cell without reloading the template adds the predictions a second time.
dangjin_floating_pred = submission.iloc[:24*28, 1].copy()

# Average of the ten fold models.
for fold in range(10):
    dangjin_floating_pred += lgb1_models[fold].predict(dangjin_test)/10

#dangjin_floating_pred = dangjin_floating_model.predict(dangjin_test)
submission.iloc[:24*28, 1] = dangjin_floating_pred

In [None]:
# Work on a copy so the in-place `+=` below cannot write through to
# `submission` via a shared view; the result is assigned back explicitly.
# NOTE(review): the base values are read from `submission`, so re-running this
# cell without reloading the template adds the predictions a second time.
dangjin_warehouse_pred = submission.iloc[:24*28, 2].copy()

# Average of the ten fold models.
for fold in range(10):
    dangjin_warehouse_pred += cat2_models[fold].predict(dangjin_test)/10
#dangjin_warehouse_pred = cat2.predict(dangjin_test)
submission.iloc[:24*28, 2] = dangjin_warehouse_pred

In [None]:
# Fix: start from column 3 (dangjin), not column 2 (dangjin_warehouse). Reading
# column 2 seeded this prediction with the warehouse values, which made the
# two columns come out identical in the saved file. The copy also keeps the
# in-place `+=` from writing through to `submission`.
dangjin_pred = submission.iloc[:24*28, 3].copy()

# NOTE(review): only the first 5 of the 10 loaded models are averaged here,
# whereas the other cells use all 10 — confirm this is intentional.
for fold in range(5):
    #dangjin_pred += lgb3_models[fold].predict(dangjin_test)/20
    dangjin_pred += cat3_models[fold].predict(dangjin_test)/5
#dangjin_pred = cat3.predict(dangjin_test)
submission.iloc[:24*28, 3] = dangjin_pred

In [None]:
# Ulsan uses the single CatBoost model (cat4) instead of the fold ensemble.
# The initial read of column 4 was dead code (overwritten before use) and the
# disabled ensemble was held in a string literal; it is kept here as comments:
#for fold in range(10):
#    #ulsan_pred += lgb4_models[fold].predict(ulsan_test)/20
#    ulsan_pred += cat4_models[fold].predict(ulsan_test)/10
ulsan_pred = cat4.predict(ulsan_test)
submission.iloc[:24*28, 4] = ulsan_pred

In [None]:
# Write the filled submission to 'final2222.csv' without the index column.
submission.to_csv('final2222.csv', index=False)

In [None]:
# Preview the first 20 rows of the submission.
submission[:20]

Unnamed: 0,time,dangjin_floating,dangjin_warehouse,dangjin,ulsan
0,2021-02-01 01:00:00,-14.030844,-3.92923,-3.92923,-3.411006
1,2021-02-01 02:00:00,-10.785423,-0.544544,-0.544544,-1.206727
2,2021-02-01 03:00:00,-13.93963,-1.851262,-1.851262,0.020413
3,2021-02-01 04:00:00,-16.172595,-7.913679,-7.913679,0.057985
4,2021-02-01 05:00:00,-13.610172,-7.420329,-7.420329,0.052356
5,2021-02-01 06:00:00,-14.192924,-10.949913,-10.949913,0.50838
6,2021-02-01 07:00:00,-17.096378,-15.462109,-15.462109,2.761307
7,2021-02-01 08:00:00,-6.982919,-9.54443,-9.54443,6.430773
8,2021-02-01 09:00:00,17.643115,49.958906,49.958906,18.850496
9,2021-02-01 10:00:00,21.13281,77.03343,77.03343,35.471467


In [None]:
# Write the four current prediction variables into columns 1-4 of the first 24*28 rows.
submission.iloc[:24*28, 1] = dangjin_floating_pred
submission.iloc[:24*28, 2] = dangjin_warehouse_pred
submission.iloc[:24*28, 3] = dangjin_pred
submission.iloc[:24*28, 4] = ulsan_pred

In [None]:
# Second submission frame, filled from the same four prediction variables.
# NOTE(review): presumably the prediction variables are recomputed with a
# different model set before this cell runs; otherwise it duplicates `submission`.
submission2 = pd.read_csv('./data/sample_submission.csv')
submission2.iloc[:24*28, 1] = dangjin_floating_pred
submission2.iloc[:24*28, 2] = dangjin_warehouse_pred
submission2.iloc[:24*28, 3] = dangjin_pred
submission2.iloc[:24*28, 4] = ulsan_pred

In [None]:
# Blend: element-wise mean of the four prediction columns of `submission` and `submission2`.
submission[['dangjin_floating','dangjin_warehouse','dangjin','ulsan']] = (submission[['dangjin_floating','dangjin_warehouse','dangjin','ulsan']] + submission2[['dangjin_floating','dangjin_warehouse','dangjin','ulsan']]) / 2

In [None]:
# Display the blended submission.
submission

Unnamed: 0,time,dangjin_floating,dangjin_warehouse,dangjin,ulsan
0,2021-02-01 01:00:00,-6.904098,-0.891064,0.659742,-1.984275
1,2021-02-01 02:00:00,-6.691012,-0.693226,0.816881,-1.073283
2,2021-02-01 03:00:00,-6.429722,-1.361885,0.809705,-0.190118
3,2021-02-01 04:00:00,-5.708050,-1.485486,0.548285,-0.191640
4,2021-02-01 05:00:00,-3.146568,-1.651132,0.530432,-0.169720
...,...,...,...,...,...
1387,2021-07-08 20:00:00,0.000000,0.000000,0.000000,0.000000
1388,2021-07-08 21:00:00,0.000000,0.000000,0.000000,0.000000
1389,2021-07-08 22:00:00,0.000000,0.000000,0.000000,0.000000
1390,2021-07-08 23:00:00,0.000000,0.000000,0.000000,0.000000


In [None]:
# Write the blended submission to 'lgb+cat.csv' without the index column.
submission.to_csv('lgb+cat.csv', index=False)