# IT3030 Deep Learning

## set working path

In [88]:
import sys
sys.path.insert(0, '/home/jupyter/DeepLearning/Time Series prediction')

## import libraries

In [89]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# import utility functions
from utilities import *

## read data

In [191]:
# Load the raw training and validation sets from the data/ folder located under sys.path[0].
# NOTE(review): this relies on sys.path[0] being the project directory inserted in the
# "set working path" cell -- confirm that cell has been run first.
train = pd.read_csv(sys.path[0] + '/data/no1_train.csv')
validation = pd.read_csv(sys.path[0] + '/data/no1_validation.csv')

## pre-processing

In [194]:
def _add_time_features(frame):
    """Add calendar columns and boolean season / weekend / time-of-day flags to ``frame`` in place.

    Expects ``frame['start_time']`` to already have a datetime dtype.
    """
    start = frame['start_time'].dt
    frame['time_of_day'] = start.hour
    frame['time_of_week'] = start.day_name()
    frame['time_of_year'] = start.month_name()

    # Season flags. Summer has no column of its own: it is the case where all
    # three flags are False.
    month = frame['time_of_year']
    frame['is_fall'] = month.isin(['September', 'October', 'November'])
    frame['is_winter'] = month.isin(['December', 'January', 'February'])
    frame['is_spring'] = month.isin(['March', 'April', 'May'])

    # Weekdays are the case where is_weekend is False.
    frame['is_weekend'] = frame['time_of_week'].isin(['Saturday', 'Sunday'])

    # Time-of-day flags. Night (hours 0-5) is the case where all three are False.
    # Both bounds are taken from this frame's own hour column.
    hour = frame['time_of_day']
    frame['is_morning'] = (hour >= 6) & (hour < 12)
    frame['is_midday'] = (hour >= 12) & (hour < 18)
    frame['is_evening'] = (hour >= 18) & (hour <= 23)


def _add_history_features(frame, periods_per_day=288):
    """Add ``previous_y`` and ``lag_24_hours_y`` columns to ``frame`` in place.

    ``periods_per_day`` is the number of rows that make up 24 hours
    (288 periods of 5 minutes).
    """
    # Target value of the previous row. The first row has no predecessor, so
    # it receives the same value as the second row (i.e. the first y).
    previous = frame['y'].shift(1)
    if len(previous) > 1:
        previous.iloc[0] = previous.iloc[1]
    frame['previous_y'] = previous

    # NOTE(review): diff() yields y[t] - y[t - periods_per_day], which contains
    # the current target value. If a pure 24-hour lag was intended,
    # shift(periods_per_day) would be the call -- confirm before training on it.
    day_change = frame['y'].diff(periods=periods_per_day)
    # The first day has no value 24 hours earlier; use 0 instead of NaN.
    day_change.iloc[:periods_per_day] = 0
    frame['lag_24_hours_y'] = day_change


def preprocess(df, df_val):
    """Clean, scale and add features to the training and validation frames in place.

    Steps, in order:
      1. Parse ``start_time`` into datetimes.
      2. Clip ``y`` to the 0.5 % / 99.5 % quantiles of the training frame.
      3. Min-max scale the planned production columns and standard-scale
         ``sys_reg``, ``flow`` and ``y``. Both scalers are fitted on ``df``
         only and then applied to ``df_val``.
      4. Add time features, ``previous_y`` and ``lag_24_hours_y``.

    Args:
        df: training dataframe with the columns ``start_time``, ``hydro``,
            ``micro``, ``thermal``, ``wind``, ``river``, ``total``,
            ``sys_reg``, ``flow`` and ``y``.
        df_val: validation dataframe with the same columns.

    Returns:
        None. Both dataframes are modified in place.
    """
    frames = (df, df_val)

    # convert start_time to datetime
    for frame in frames:
        frame['start_time'] = pd.to_datetime(frame['start_time'])

    # Clamp the target variable y; the bounds come from the training frame only.
    lower, upper = df['y'].quantile(0.005), df['y'].quantile(0.995)
    for frame in frames:
        # Assign the clipped column back instead of clip(..., inplace=True) on
        # the df['y'] selection, which does not update the frame when pandas
        # Copy-on-Write is active.
        frame['y'] = frame['y'].clip(lower, upper)

    # Normalize with a min max scaler for the planned power production
    min_max_var = ['hydro', 'micro', 'thermal', 'wind', 'river', 'total']
    min_max_scaler = MinMaxScaler(feature_range=(0, 1))
    df[min_max_var] = min_max_scaler.fit_transform(df[min_max_var])
    df_val[min_max_var] = min_max_scaler.transform(df_val[min_max_var])

    # Normalize with a standard scaler for the system regulation, planned flow
    # and imbalance predictions
    standard_var = ['sys_reg', 'flow', 'y']
    standard_scaler = StandardScaler()
    df[standard_var] = standard_scaler.fit_transform(df[standard_var])
    df_val[standard_var] = standard_scaler.transform(df_val[standard_var])

    # Add the time features and the features derived from past values of y.
    for frame in frames:
        _add_time_features(frame)
        _add_history_features(frame)
    
    

In [192]:
# preprocess() has no return value and modifies both frames in place;
# afterwards list the column dtypes of the training frame.
preprocess(train, validation)
train.dtypes

start_time        datetime64[ns]
hydro                    float64
micro                    float64
thermal                  float64
wind                     float64
river                    float64
total                    float64
y                        float64
sys_reg                  float64
flow                     float64
time_of_day                int64
time_of_week              object
time_of_year              object
is_fall                     bool
is_winter                   bool
is_spring                   bool
is_weekend                  bool
is_morning                  bool
is_midday                   bool
is_evening                  bool
previous_y               float64
lag_24_hours_y           float64
dtype: object

In [164]:
# Inspect the previous_y column of the training frame.
train.previous_y

0              NaN
1         1.079343
2         1.043034
3         1.022842
4         0.934014
            ...   
225083    0.430890
225084    0.432809
225085    0.392099
225086    0.281588
225087    0.306456
Name: previous_y, Length: 225088, dtype: float64

In [None]:
def create_n1_seq(df, n_seq, inputs, outputs):
    '''
    Stub: not implemented yet. The body holds only this docstring, so a call
    currently returns None.

    INPUTS:
    df (pandas dataframe): contains the inputs and outputs for each time step
    n_seq: presumably the sequence length -- TODO confirm once implemented
    inputs: presumably the input column names in df -- TODO confirm
    outputs: presumably the output column names in df -- TODO confirm
    
    OUTPUTS:
    x (numpy ndarray): (n_seq) -- planned output; nothing is returned yet
    '''
    
    

In [119]:
# Rebind `train` to a fresh copy read from disk and run it through the
# step-by-step helper functions.
# NOTE(review): prep_dtypes, clamp_data, normalize_data, add_time_features and
# add_lag_features are not defined in this notebook; presumably they come from
# `from utilities import *` -- confirm their behavior there.
train = pd.read_csv(sys.path[0] + '/data/no1_train.csv')
prep_dtypes(train)
clamp_data(train, 'y', 0.005, 0.995)
normalize_data(train, variable_list = ['hydro', 'micro', 'thermal', 'wind', 'river', 'total'])

add_time_features(train)
add_lag_features(train)


In [137]:
# Sanity check: a MinMaxScaler fitted on one range maps values outside that
# range to values outside [0, 1].
x = np.arange(10).reshape((-1,1))  # column vector 0..9
scaler = MinMaxScaler(feature_range=(0,1))
x = scaler.fit_transform(x)
print(x)
y = np.arange(12).reshape((-1,1))-1  # column vector -1..10: one step beyond the fitted range at each end
y = scaler.transform(y)  # reuses the min/max learned from x
print(y)

[[0.        ]
 [0.11111111]
 [0.22222222]
 [0.33333333]
 [0.44444444]
 [0.55555556]
 [0.66666667]
 [0.77777778]
 [0.88888889]
 [1.        ]]
MinMaxScaler()
[[-0.11111111]
 [ 0.        ]
 [ 0.11111111]
 [ 0.22222222]
 [ 0.33333333]
 [ 0.44444444]
 [ 0.55555556]
 [ 0.66666667]
 [ 0.77777778]
 [ 0.88888889]
 [ 1.        ]
 [ 1.11111111]]


In [85]:
## Preprocessing

# start_time : The timestamp of each datum
# hydro      : The planned reservoir hydropower production at the time step.
# micro      : The planned small-scale production at the time step.
# river      : The planned run-of-river hydropower production at the time step.
# thermal    : The planned thermal power plant production at the time step.
# wind       : The planned wind power plant production at the time step.
# total      : The total planned production at the time step.
# sys_reg    : The planned system regulation at the time step.
# flow       : The planned total power flow in or out of the current area.

# y          : The estimated open loop power grid imbalance at the time step.


## TO DO
# 1. Define previous_y, the estimated power grid imbalance at the previous step. (DONE)
# 2. Clamp the values of the target series "y" to exclude noisy spikes in magnitude. (DONE)
# 3. Write code for normalizing / standardizing the input data. ()
# 4. Implement date time features. -> time_of_day, time_of_week, time_of_year (DONE)
# 5. Implement at least two lag features of power imbalance. -> 24 hour lag? 1 week lag? 48 hours lag? (DONE)