In [1]:
%load_ext autoreload
%autoreload 2

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import datetime
import math
pd.options.mode.chained_assignment = None

def warn(*args, **kwargs):
    """Accept any positional/keyword arguments and do nothing.

    This function is assigned to ``warnings.warn`` further down in this cell,
    so warning calls routed through that attribute are silently discarded.
    """
    return None
import warnings
warnings.warn = warn

from preprocess.functions.date_inspector import load_files
from functions.feature_engineering import wind_cos_sin, new_wind_speed_direction, make_moving_average_df, load_power_ma_forecast
from functions.feature_engineering import load_power_ma_forecast_mean, fe_add_timestep, fe_add_previous_n_hours_mean_kpx, add_time_feature

In [3]:
# Folder with the finalised KPX inputs, resolved relative to the current working directory.
final_data_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'data', 'data_to_final'))

# NOTE: pickle files can execute arbitrary code when loaded; only read files produced by this project.
df_final_power = pd.read_pickle(os.path.join(final_data_dir, 'df_kpx_wind_final.pkl'))
df_final_forecast_kpx = pd.read_pickle(os.path.join(final_data_dir, 'final_kpx_forecast.pkl'))

In [4]:
# Generation table without the observed wind speed / wind direction columns.
df_power = df_final_power.drop(columns=['풍속(m/s)', '풍향(16방위)']).copy()

In [5]:
def preprocess_forecast(df_forecast) :
    """Return a cleaned copy of the forecast table.

    Steps, applied to a copy so the caller's frame is left untouched (which
    also makes re-running the calling cell safe: the values are not halved a
    second time):

    1. Convert '6시간강수량' to a numeric dtype.
    2. Halve '6시간강수량' and '6시간적설'.
       NOTE(review): presumably the 6-hour totals are being spread over two
       3-hour forecast steps — confirm with the data owner.
    3. Forward-fill missing values in '6시간강수량', '6시간적설', '일최고기온'
       and '일최저기온'.

    Parameters
    ----------
    df_forecast : pandas.DataFrame
        Must contain the four columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df_forecast``.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If '6시간강수량' holds values ``pd.to_numeric`` cannot parse.
    """
    cleaned = df_forecast.copy()

    cleaned['6시간강수량'] = pd.to_numeric(cleaned['6시간강수량']) / 2
    cleaned['6시간적설'] = cleaned['6시간적설'] / 2

    # Series.ffill() is the supported spelling; fillna(method='ffill') is deprecated in recent pandas.
    for column in ['6시간강수량', '6시간적설', '일최고기온', '일최저기온'] :
        cleaned[column] = cleaned[column].ffill()

    return cleaned

In [6]:
def merge_forecast_power_hotfix(df_forecast_original, df_power_original, power) :
    """Join each forecast row with the generation observed 0, 1 and 2 hours after its target time.

    Parameters
    ----------
    df_forecast_original : pandas.DataFrame
        Forecast table. The code reads the columns 'datetime(forecast)',
        'date', 'date(forecast)' and 'location'.
    df_power_original : pandas.DataFrame
        Generation table with 'datetime' and '발전량(kW)' columns.
    power : str
        'solar' drops 'date', 'location' and '일사량(MJ/m^2)' from the
        generation table; 'wind' drops 'date' and 'location'. Any other value
        drops nothing.

    Returns
    -------
    pandas.DataFrame
        Inner-joined frame with English column names and the targets
        'Power Generation(kW)+0', '+1' and '+2'. Neither input is modified.
    """
    forecast = df_forecast_original.copy()
    generation = df_power_original.copy()

    if power == 'solar' :
        generation = generation.drop(['date', 'location', '일사량(MJ/m^2)'], axis=1)
    if power == 'wind' :
        generation = generation.drop(['date', 'location'], axis=1)

    # One inner join per horizon: the observation at time t + offset is attached to forecast time t.
    merged = forecast
    for offset in (0, 1, 2) :
        shifted = generation.rename(columns={'datetime':'datetime(forecast)',
                                             '발전량(kW)':'발전량(kW)+' + str(offset)})
        if offset :
            # Move the observation timestamps back so they line up with the earlier forecast time.
            shifted['datetime(forecast)'] -= datetime.timedelta(hours=offset)
        merged = pd.merge(merged, shifted, how='inner', on='datetime(forecast)')

    # Convert the date columns to datetime objects.
    for date_column in ('date', 'date(forecast)') :
        merged[date_column] = pd.to_datetime(merged[date_column])

    # Drop rows without a current-hour generation value, then rows without a location.
    merged = merged[merged['발전량(kW)+0'].notnull()]
    merged = merged[merged['location'].notnull()]

    # Korean source column names -> English feature names.
    english_names = {
        '하늘상태':'Weather Condition',
        '풍속':'Wind Speed(m/s)',
        '습도':'Humidity(%)',
        '3시간기온':'Celsius(°C)',
        '풍향':'Wind Direction(16)',
        '강수형태':'Rain Condition',
        '강수확률':'Rainfall Probability',
        '6시간강수량':'Rainfall',
        '6시간적설':'Snowfall',
        '일최고기온':'Celsius(Highest)',
        '일최저기온':'Celsius(Lowest)',
        '발전량(kW)+0':'Power Generation(kW)+0',
        '발전량(kW)+1':'Power Generation(kW)+1',
        '발전량(kW)+2':'Power Generation(kW)+2',
    }
    return merged.rename(columns=english_names)

## data 합치기

In [23]:
# Clean the KPX forecast table, then join it with the generation table using the 'wind' branch of the merge helper.
df_final_forecast = preprocess_forecast(df_final_forecast_kpx)
df = merge_forecast_power_hotfix(df_final_forecast, df_power,'wind')

In [8]:
# Korean forecast column names -> English feature names.
# merge_forecast_power_hotfix applies the same mapping before returning, so on its output this
# rename has nothing left to change and simply yields a renamed copy bound to df_2.
forecast_column_names = {
    '하늘상태':'Weather Condition',
    '풍속':'Wind Speed(m/s)',
    '습도':'Humidity(%)',
    '3시간기온':'Celsius(°C)',
    '풍향':'Wind Direction(16)',
    '강수형태':'Rain Condition',
    '강수확률':'Rainfall Probability',
    '6시간강수량':'Rainfall',
    '6시간적설':'Snowfall',
    '일최고기온':'Celsius(Highest)',
    '일최저기온':'Celsius(Lowest)',
}
df_2 = df.rename(columns=forecast_column_names)

## 풍향/풍속 인코딩

In [9]:
# Encode the wind features with the project helpers; `df` is rebound to the result.
# NOTE(review): the meaning of the constant 340 passed to new_wind_speed_direction is not visible
# in this notebook (presumably a reference direction in degrees) — confirm in functions.feature_engineering.
df = wind_cos_sin(df_2)
df = new_wind_speed_direction(df,340)

## kpx 발전량 Moving Average

각 시간에 대해 전년도 발전량의 moving average와 모든 연도 발전량 moving average의 평균을 변수로 추가

In [10]:
# Moving-average window sizes handed to make_moving_average_df (presumably in hours — confirm).
hours = [12,24,48]
# NOTE(review): both calls below receive identical arguments (df_power, hours, 0), so unless
# make_moving_average_df is non-deterministic the "2017" and "2018" frames hold the same data.
# Confirm whether each call was meant to select a different year (e.g. via the third argument).
df_new_power_2017 = make_moving_average_df(df_power,hours,0)
df_new_power_2018 = make_moving_average_df(df_power,hours,0)

기간: 2017/7/1 ~ 2018/6/30  
1. 1년치 데이터의 각 시간에 대해 2017년, 2018년 각각의 발전량 moving average를 계산
2. 2017년과 2018년 moving average의 평균을 계산
3. 연도는 다르고 월, 일, 시간이 같은 데이터를 찾아서 넣으려 함

#### 발전량 데이터 만들기

In [11]:
# Run both moving-average frames through add_time_feature.
# The merge in the following cell joins on 'month', 'day' and 'hour'; presumably those
# columns are created here — confirm in functions.feature_engineering.
df_new_power_2017 = add_time_feature(df_new_power_2017)
df_new_power_2018 = add_time_feature(df_new_power_2018)

In [12]:
# Start from a copy so the 2017 frame itself is not modified by later steps.
df_new_power_kpx_ma_mean = df_new_power_2017.copy()
# Pair rows that share month/day/hour. Columns present in both frames get pandas'
# default suffixes: _x for the left (2017) frame and _y for the right (2018) frame.
new_df_kpx_ma = pd.merge(df_new_power_kpx_ma_mean, df_new_power_2018, how='inner',on=['month','day','hour'])

In [13]:
# Replace the merge suffixes with explicit year labels (_x -> 2017, _y -> 2018),
# keep the left-hand timestamp as 'datetime' and discard the right-hand one.
ma_column_names = {
    'ma12_x':'ma12_2017', 'ma24_x':'ma24_2017', 'ma48_x':'ma48_2017',
    'ma12_y':'ma12_2018', 'ma24_y':'ma24_2018', 'ma48_y':'ma48_2018',
    'datetime_x':'datetime',
}
new_df_kpx_ma = new_df_kpx_ma.rename(columns=ma_column_names).drop(columns=['datetime_y'])

In [14]:
# For every window size, average the 2017 and 2018 moving-average columns.
for window in ('12', '24', '48'):
    new_df_kpx_ma['ma_' + window + '_mean'] = (
        new_df_kpx_ma['ma' + window + '_2017'] + new_df_kpx_ma['ma' + window + '_2018']
    ) / 2

In [15]:
# Independent copy of the merged moving-average table; later edits to new_df_kpx_ma do not affect it.
df_ma_new = new_df_kpx_ma.copy()

#### 발전량 feature 추가

In [16]:
import datetime
# Alias, not a copy: df_ma and df_ma_new refer to the same DataFrame object.
df_ma = df_ma_new

In [92]:
hours = [12,24,48]

# For every window size, add a 'ma<hour>_pre_year' column holding the value that
# load_power_ma_forecast returns for each row's forecast target time.
for hour in hours:
    name = 'ma'+str(hour)+'_pre_year'
    # Build the whole column and assign it once. The previous element-wise form
    # `df[name][i] = ...` is chained assignment: it writes through an intermediate
    # Series and is not guaranteed to update `df` (it is a no-op under pandas
    # Copy-on-Write). Iterating the column also removes the reliance on the index
    # labels being exactly 0..len(df)-1.
    df[name] = [
        load_power_ma_forecast(forecast_time, df_ma, hour)
        for forecast_time in df['datetime(forecast)']
    ]

In [None]:
hours = [12,24,48]

# For every window size, add a 'ma<hour>_mean' column holding the value that
# load_power_ma_forecast_mean returns for each row's forecast target time.
for hour in hours:
    name = 'ma'+str(hour)+'_mean'
    # Build the whole column and assign it once. The previous element-wise form
    # `df[name][i] = ...` is chained assignment: it writes through an intermediate
    # Series and is not guaranteed to update `df` (it is a no-op under pandas
    # Copy-on-Write). Iterating the column also removes the reliance on the index
    # labels being exactly 0..len(df)-1.
    df[name] = [
        load_power_ma_forecast_mean(forecast_time, df_ma, hour)
        for forecast_time in df['datetime(forecast)']
    ]

## feature windowing

In [17]:
hour = [12,24]
# Two windowing passes: hour[0] is applied to df first, then hour[1] on top of that result.
# Note that this rebinds df_2, replacing the frame created by the earlier rename cell.
df_2 = fe_add_timestep(fe_add_timestep(df, hour[0]), hour[1])

In [32]:
# Show the last rows of the windowed feature frame as a sanity check.
df_2.tail()            

Unnamed: 0,date,date(forecast),datetime,datetime(forecast),location,Weather Condition,Wind Speed(m/s),Humidity(%),Celsius(°C),Wind Direction(16),...,Wind Direction(16) (previous 24),Rain Condition (previous 24),Rainfall Probability (previous 24),Rainfall (previous 24),Snowfall (previous 24),Celsius(Highest) (previous 24),Celsius(Lowest) (previous 24),wind_dir_cos (previous 24),wind_dir_sin (previous 24),new_wind_speed (previous 24)
112827,2019-07-31,2019-08-03,2019-07-31 20:00:00,2019-08-03 12:00:00,"Korea, Jeju-do, Seogwipo-si, Pyoseon-myeon",4.0,3.4,90.0,27.0,82.0,...,72.0,0.0,20.0,0.0,0.0,31.0,24.0,0.309017,0.951057,-0.122148
112828,2019-07-31,2019-08-03,2019-07-31 23:00:00,2019-08-03 12:00:00,"Korea, Jeju-do, Seogwipo-si, Pyoseon-myeon",4.0,3.4,90.0,27.0,82.0,...,72.0,0.0,20.0,0.0,0.0,31.0,24.0,0.309017,0.951057,-0.122148
112829,2019-07-31,2019-08-03,2019-07-31 20:00:00,2019-08-03 15:00:00,"Korea, Jeju-do, Seogwipo-si, Pyoseon-myeon",4.0,3.6,90.0,26.0,82.0,...,90.0,0.0,20.0,0.0,0.0,31.0,24.0,6.123234000000001e-17,1.0,-1.231273
112830,2019-07-31,2019-08-03,2019-07-31 23:00:00,2019-08-03 15:00:00,"Korea, Jeju-do, Seogwipo-si, Pyoseon-myeon",4.0,3.6,90.0,26.0,82.0,...,90.0,0.0,20.0,0.0,0.0,31.0,24.0,6.123234000000001e-17,1.0,-1.231273
112831,2019-08-11,2019-08-11,2019-08-11 20:00:00,2019-08-11 00:00:00,"Korea, Jeju-do, Seogwipo-si, Pyoseon-myeon",3.0,6.0,75.0,28.0,135.0,...,90.0,0.0,20.0,0.0,0.0,31.0,24.0,6.123234000000001e-17,1.0,-1.197071


In [None]:
# Calendar parts of the forecast target time, written onto df.
# NOTE(review): df_2 was derived from df in the "feature windowing" cell before this point and
# df_2 is the frame pickled in the next cell, so these columns may never reach the saved
# file — confirm whether they should be added to df_2 instead.
forecast_time = df['datetime(forecast)'].dt
for part in ['year', 'month', 'day', 'hour', 'dayofyear']:
    df[part] = getattr(forecast_time, part)

In [21]:
# Persist the engineered training features next to the other project data.
fe_output_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'data', 'df_forecast_kpx_fe.pkl'))
df_2.to_pickle(fe_output_path)

# make test data

In [25]:
# Load the KPX forecast table from disk again; this rebinds df_final_forecast_kpx to a fresh frame.
df_final_forecast_kpx = pd.read_pickle(os.path.abspath(os.path.join(os.getcwd(),'..','data','data_to_final','final_kpx_forecast.pkl')))

In [27]:
# Work on a copy so the freshly loaded forecast table stays unmodified.
df_test = df_final_forecast_kpx.copy()

In [28]:
# Keep only the rows whose 'datetime' equals 2019-08-12 14:00, then order them by forecast target time.
selected_datetime = datetime.datetime(2019,8,12,14,0)
df_test_x = df_test.sort_values(by=['datetime'])
matches_selected = df_test_x['datetime'] == selected_datetime
df_test_x_2 = df_test_x[matches_selected].reset_index(drop=True)
df = df_test_x_2.sort_values(by=['datetime(forecast)']).copy().reset_index(drop=True)

In [29]:
# Overwrite the forecast target time of the rows labelled 0 and 4.
# NOTE(review): the reason these two timestamps are replaced is not visible in this notebook — confirm.
# A single .loc[row, column] call writes into df itself; the chained form
# `df[col].loc[row] = ...` assigns through an intermediate Series and is not
# guaranteed to update df (it is a no-op under pandas Copy-on-Write).
df.loc[0, 'datetime(forecast)'] = datetime.datetime(2019,8,13,0,0)
df.loc[4, 'datetime(forecast)'] = datetime.datetime(2019,8,14,0,0)

In [30]:
# Re-sort by forecast target time (the previous cell changed two timestamps) and renumber the index.
df = df.sort_values(by=['datetime(forecast)']).reset_index(drop=True)

In [31]:
# NOTE(review): this passes df_2 (the training feature frame built earlier), not the test frame
# `df` assembled in the cells above, so the test frame is discarded when `df` is rebound here.
# This looks like a copy-paste from the training section — confirm whether wind_cos_sin(df) was
# intended (the test frame may first need the same column renaming as the training data).
df = wind_cos_sin(df_2)
# NOTE(review): the meaning of the constant 340 is not visible in this notebook — confirm.
df = new_wind_speed_direction(df,340)