# Regression LSTM with best parameters
    Find the best prediction window to apply, with learning rate lr = 0.0001.

# Import libraries

In [21]:
!pip install pandas
!pip install sklearn
!pip install matplotlib

import pandas as pd;
import numpy as np;
import sklearn
import matplotlib.pyplot as plt
import time

# Candidate start dates: the training loop keeps only the rows whose index is
# greater than the selected value.
# NOTE(review): the integer encoding of the index is not visible here - confirm.
date = [2018010000, 2018030000, 2018050000,
        2018070000, 2018090000, 2018110000]

# parameters
steps = 96          # timesteps in every input window fed to the LSTM
n_hidden = 2        # 1 -> single LSTM layer, anything else -> two stacked layers
units = 180         # units per LSTM layer
batch_size = 336    # fixed batch size required by the stateful LSTM
epochs = 180        # training epochs
features_num = 14   # input features per timestep

# lists to append results (one entry per evaluated start date)
mae_gen, mae_nor, mae_spi = [], [], []
rmse_gen, rmse_nor, rmse_spi = [], [], []
y_pred_list = []
time_count = []



# Import keras libraries, packages and data:

In [22]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import LeakyReLU
from keras import initializers
from keras import optimizers

# import data: the first CSV column becomes the DataFrame index, which the
# training loop later compares against the start dates
data_path = 'Data_set_1_smaller_(1).csv'
data_full = pd.read_csv(data_path, index_col = 0)

# Create loop for different dates:

In [26]:
# function to split data into correct shape for RNN
def split_data(X, y, steps):
    """Build sliding windows of `steps` rows and pair each with the next target.

    For every position `end` in `range(steps, len(y))` the sample is the block
    of rows `X[end - steps : end, :]` and its label is `y[end]`.

    Parameters
    ----------
    X : 2-D array supporting `X[a:b, :]` slicing (rows are timesteps).
    y : 1-D sequence of targets indexed like the rows of X.
    steps : int, number of rows in every window.

    Returns
    -------
    (windows, targets) as numpy arrays; both are empty when `len(y) <= steps`.
    """
    target_positions = range(steps, len(y))
    windows = [X[end - steps : end, :] for end in target_positions]
    targets = [y[end] for end in target_positions]
    return np.array(windows), np.array(targets)

# function to cut data set so it can be divisible by the batch_size
def cut_data(data, batch_size):
    """Drop trailing rows so the first dimension is a multiple of `batch_size`.

    Parameters
    ----------
    data : array exposing `.shape` and slicing along the first axis.
    batch_size : int, required divisor of the number of rows.

    Returns
    -------
    `data` itself when its length is already divisible, otherwise a slice
    without the last `len % batch_size` rows.
    """
    leftover = data.shape[0] % batch_size
    # nothing to trim: hand back the very same object
    if not leftover:
        return data
    return data[: -leftover]

# design the LSTM
def regressor_tunning(kernel_initializer = 'he_uniform',
                      bias_initializer = initializers.Ones()):
    """Build and compile the stateful LSTM regressor.

    The architecture is driven by the module-level settings `n_hidden`,
    `units`, `batch_size`, `steps` and `features_num`: one LSTM layer when
    `n_hidden == 1`, otherwise two stacked LSTM layers. Every LSTM layer is
    followed by LeakyReLU(alpha = 0.2) and Dropout(0.2); a single linear Dense
    unit produces the output.

    Parameters
    ----------
    kernel_initializer : initializer passed to every LSTM layer.
    bias_initializer : initializer passed to every LSTM layer.

    Returns
    -------
    A compiled Sequential model (loss 'mse', metrics 'mse' and 'mae',
    RMSprop optimizer created with its default arguments).
    """
    # keyword arguments shared by every LSTM layer
    lstm_settings = dict(units = units,
                         batch_input_shape = (batch_size, steps, features_num),
                         stateful = True,
                         kernel_initializer = kernel_initializer,
                         bias_initializer = bias_initializer)
    stacked = n_hidden != 1

    model = Sequential()

    # first layer returns the full sequence only when another LSTM follows
    model.add(LSTM(return_sequences = stacked, **lstm_settings))
    model.add(LeakyReLU(alpha = 0.2))
    model.add(Dropout(0.2))

    if stacked:
        model.add(LSTM(**lstm_settings))
        model.add(LeakyReLU(alpha = 0.2))
        model.add(Dropout(0.2))

    model.add(Dense(1, activation='linear'))
    model.compile(loss = 'mse',
                  metrics = ['mse', 'mae'],
                  optimizer = optimizers.RMSprop())
    return model

# for final one: restrict this run to the most recent start date only
date = [2018110000]

# imports used by the loop, resolved once instead of on every iteration
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


def split_data_shade(shade_test, steps):
    """Return the spike columns aligned with the targets built by split_data.

    split_data emits its first target at row `steps`, so the first `steps`
    rows of the spike table are skipped here as well. Rows are taken by
    position, so the caller does not need a freshly reset index.

    Parameters
    ----------
    shade_test : DataFrame with the columns 'spike_occurance',
        'spike_upperlim' and 'spike_lowerlim'.
    steps : int, number of leading rows to skip.

    Returns
    -------
    Three 1-D arrays: (spike_occurance, spike_upperlim, spike_lowerlim).
    """
    return (shade_test['spike_occurance'].values[steps:],
            shade_test['spike_upperlim'].values[steps:],
            shade_test['spike_lowerlim'].values[steps:])


def cut_data_shade(data, batch_size):
    """Trim `data` to a multiple of `batch_size`; same rule as cut_data."""
    return cut_data(data, batch_size)


# The spike labels do not depend on the start date: read them once, up front,
# so a missing file is reported before any training time is spent.
spike_full = pd.read_csv('Spike_binary_1std.csv', index_col = 0)

# LOOP STARTS
for start_date in date:
    start_time = time.time()

    # Keep only the rows after the start date. reset_index returns a new frame,
    # so nothing is written into a slice of data_full (this is what triggered
    # the SettingWithCopyWarning with the in-place calls).
    data = data_full.loc[data_full.index > start_date, :].reset_index(drop = True)

    # fill nan values in the whole data set with the column means
    data = data.fillna(data.mean())

    # chronological split into train and test (no shuffling)
    data_train, data_test = train_test_split(
             data, test_size = 0.15, shuffle = False)

    # data scaling (including offer (y)); the scaler is fitted on the training rows only
    sc_X = MinMaxScaler()
    data_train = sc_X.fit_transform(data_train)
    data_test = sc_X.transform(data_test)

    # The target is the last column. The same index is reused for the inverse
    # scaling below so the two can never disagree.
    target_col = data_train.shape[1] - 1

    # divide features and labels; the feature count must match the model input
    X_train = data_train[:, 0:features_num]
    y_train = data_train[:, target_col]
    X_test = data_test[:, 0:features_num]
    y_test = data_test[:, target_col]

    # Hold out the last 10 % of the training rows. They are not passed to fit()
    # at the moment, but the split defines which rows the model is trained on.
    X_train, X_val, y_train, y_val = train_test_split(
             X_train, y_train, test_size = 0.10, shuffle = False)

    # put data into correct shape
    X_train, y_train = split_data(X_train, y_train, steps)
    X_test, y_test = split_data(X_test, y_test, steps)
    X_val, y_val = split_data(X_val, y_val, steps)

    # stateful LSTM: every array length must be a multiple of batch_size
    X_train = cut_data(X_train, batch_size)
    y_train = cut_data(y_train, batch_size)
    X_test = cut_data(X_test, batch_size)
    y_test = cut_data(y_test, batch_size)
    X_val = cut_data(X_val, batch_size)
    y_val = cut_data(y_val, batch_size)

    # =============================================================================
    # Spike labels, processed the same way as the features
    # (prepared before training so that misaligned data fails fast)
    # =============================================================================
    spike_window = spike_full.loc[spike_full.index > start_date, :].reset_index(drop = True)

    shade_train, shade_test = train_test_split(
             spike_window, test_size = 0.15, shuffle = False)
    shade_test = shade_test.reset_index(drop = True)

    y_spike_occ, spike_upperlim, spike_lowerlim = split_data_shade(shade_test, steps)
    y_spike_occ = cut_data_shade(y_spike_occ, batch_size)

    assert len(y_spike_occ) == len(y_test), \
        'spike labels and test targets have different lengths'
    assert ((y_spike_occ == 0) | (y_spike_occ == 1)).all(), \
        'spike_occurance is expected to be a 0/1 flag'

    # Boolean masks select samples by label, not by "value != 0", so an
    # exactly-zero price or prediction can no longer desynchronise the arrays.
    spike_mask = y_spike_occ == 1
    normal_mask = ~spike_mask
    y_normal_occ = (y_spike_occ - 1) * (-1)   # complementary 0/1 flag

    # =============================================================================
    # Training and prediction
    # =============================================================================
    model = regressor_tunning()

    # fitting the LSTM to the training set
    history = model.fit(X_train,
                        y_train,
                        batch_size = batch_size,
                        epochs = epochs,
                        shuffle = False)
                        #validation_data = (X_val, y_val))

    # clear the state carried over from training before predicting
    model.reset_states()

    # make new predictions with test set
    y_pred = model.predict(X_test, batch_size = batch_size)

    # Undo the min-max scaling of the target column by hand (the scaler was
    # fitted on all columns, so inverse_transform cannot be applied to y alone).
    target_range = sc_X.data_range_[target_col]
    target_min = sc_X.data_min_[target_col]
    y_pred = (y_pred * target_range) + target_min
    y_test = (y_test * target_range) + target_min

    # Small adjustment kept from the original analysis: exact zeros in the
    # observed series are replaced by 0.0001.
    # NOTE(review): the masks above no longer need this; it is kept so that
    # previously reported metrics stay comparable.
    y_test = pd.Series(y_test).replace(0, 0.0001)

    y_pred_list.append(y_pred)

    # flat views used for every metric below
    y_true = y_test.values
    y_hat = y_pred.ravel()

    # RMSE is computed as sqrt(MSE): the `squared = False` argument is
    # deprecated in recent scikit-learn releases
    rmse_gen.append(np.sqrt(mse(y_true, y_hat)))
    mae_gen.append(mae(y_true, y_hat))

    # =============================================================================
    # Metrics evaluation on spike regions
    # =============================================================================
    y_test_spike = y_true[spike_mask]
    y_pred_spike = y_hat[spike_mask]

    rmse_spi.append(np.sqrt(mse(y_test_spike, y_pred_spike)))
    mae_spi.append(mae(y_test_spike, y_pred_spike))

    # =============================================================================
    # Metric evaluation on normal regions
    # =============================================================================
    y_test_normal = y_true[normal_mask]
    y_pred_normal = y_hat[normal_mask]

    rmse_nor.append(np.sqrt(mse(y_test_normal, y_pred_normal)))
    mae_nor.append(mae(y_test_normal, y_pred_normal))

    # wall-clock seconds spent on this start date
    elapsed_time = time.time() - start_time

    time_count.append(elapsed_time)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


Epoch 1/180
Epoch 2/180
Epoch 3/180
Epoch 4/180
Epoch 5/180
Epoch 6/180
Epoch 7/180
Epoch 8/180
Epoch 9/180
Epoch 10/180
Epoch 11/180
Epoch 12/180
Epoch 13/180
Epoch 14/180
Epoch 15/180
Epoch 16/180
Epoch 17/180
Epoch 18/180
Epoch 19/180
Epoch 20/180
Epoch 21/180
Epoch 22/180
Epoch 23/180
Epoch 24/180
Epoch 25/180
Epoch 26/180
Epoch 27/180
Epoch 28/180
Epoch 29/180
Epoch 30/180
Epoch 31/180
Epoch 32/180
Epoch 33/180
Epoch 34/180
Epoch 35/180
Epoch 36/180
Epoch 37/180
Epoch 38/180
Epoch 39/180
Epoch 40/180
Epoch 41/180
Epoch 42/180
Epoch 43/180
Epoch 44/180
Epoch 45/180
Epoch 46/180
Epoch 47/180
Epoch 48/180
Epoch 49/180
Epoch 50/180
Epoch 51/180
Epoch 52/180
Epoch 53/180
Epoch 54/180
Epoch 55/180
Epoch 56/180
Epoch 57/180
Epoch 58/180
Epoch 59/180
Epoch 60/180
Epoch 61/180
Epoch 62/180
Epoch 63/180
Epoch 64/180
Epoch 65/180
Epoch 66/180
Epoch 67/180
Epoch 68/180
Epoch 69/180
Epoch 70/180
Epoch 71/180
Epoch 72/180
Epoch 73/180
Epoch 74/180
Epoch 75/180
Epoch 76/180
Epoch 77/180


Epoch 78/180
Epoch 79/180
Epoch 80/180
Epoch 81/180
Epoch 82/180
Epoch 83/180
Epoch 84/180
Epoch 85/180
Epoch 86/180
Epoch 87/180
Epoch 88/180
Epoch 89/180
Epoch 90/180
Epoch 91/180
Epoch 92/180
Epoch 93/180
Epoch 94/180
Epoch 95/180
Epoch 96/180
Epoch 97/180
Epoch 98/180
Epoch 99/180
Epoch 100/180
Epoch 101/180
Epoch 102/180
Epoch 103/180
Epoch 104/180
Epoch 105/180
Epoch 106/180
Epoch 107/180
Epoch 108/180
Epoch 109/180
Epoch 110/180
Epoch 111/180
Epoch 112/180
Epoch 113/180
Epoch 114/180
Epoch 115/180
Epoch 116/180
Epoch 117/180
Epoch 118/180
Epoch 119/180
Epoch 120/180
Epoch 121/180
Epoch 122/180
Epoch 123/180
Epoch 124/180
Epoch 125/180
Epoch 126/180
Epoch 127/180
Epoch 128/180
Epoch 129/180
Epoch 130/180
Epoch 131/180
Epoch 132/180
Epoch 133/180
Epoch 134/180
Epoch 135/180
Epoch 136/180
Epoch 137/180
Epoch 138/180
Epoch 139/180
Epoch 140/180
Epoch 141/180
Epoch 142/180
Epoch 143/180
Epoch 144/180
Epoch 145/180
Epoch 146/180
Epoch 147/180
Epoch 148/180
Epoch 149/180
Epoch 150/180


Epoch 154/180
Epoch 155/180
Epoch 156/180
Epoch 157/180
Epoch 158/180
Epoch 159/180
Epoch 160/180
Epoch 161/180
Epoch 162/180
Epoch 163/180
Epoch 164/180
Epoch 165/180
Epoch 166/180
Epoch 167/180
Epoch 168/180
Epoch 169/180
Epoch 170/180
Epoch 171/180
Epoch 172/180
Epoch 173/180
Epoch 174/180
Epoch 175/180
Epoch 176/180
Epoch 177/180
Epoch 178/180
Epoch 179/180
Epoch 180/180


In [31]:
# one row per evaluated start date, one column per collected metric
# (the 'time' column holds the elapsed seconds of each run)
metric_columns = {
    'rmse_general': rmse_gen,
    'mae_general': mae_gen,
    'rmse_spike': rmse_spi,
    'mae_spike': mae_spi,
    'rmse_normal': rmse_nor,
    'mae_normal': mae_nor,
    'time': time_count,
}
results = pd.DataFrame(metric_columns)

#y_pred = pd.DataFrame({'dates': date,
#                       'Predicitons': y_pred_list})

#y_pred.to_csv('Pedictions_LSTM_5_prediction_window.csv')

# persist the table, then show it as the cell output
results.to_csv('Results_LSTM_Prediction_window.csv')
results

Unnamed: 0,rmse_general,mae_general,rmse_spike,mae_spike,rmse_normal,mae_normal,time
0,59.206006,32.266572,75.095039,40.107748,55.613483,30.75085,11222.361281
1,36.35653,19.49179,56.834863,28.62913,30.859672,17.722749,8934.704042
2,27.214883,17.25915,30.628762,21.652058,26.625771,16.553554,8004.332963
3,28.155884,16.697367,32.37713,21.613011,27.451361,15.94371,9490.615029
4,29.814576,18.606296,40.733153,24.234102,27.74515,17.736017,6434.632526
5,29.35544,22.055076,41.345949,31.517682,26.099331,20.039576,2193.240561


In [28]:
def highlight_min(s):
    '''
    Highlight the minimum of a Series in yellow.

    Returns one CSS string per element of `s`: 'background-color: yellow'
    for every entry equal to `s.min()` and an empty string otherwise.
    '''
    lowest = s.min()
    styles = []
    for value in s:
        styles.append('background-color: yellow' if value == lowest else '')
    return styles

# render the table with the lowest value of every column highlighted
# (Styler.apply calls highlight_min once per column by default)
results.style.apply(highlight_min)

Unnamed: 0,rmse_general,mae_general,rmse_spike,mae_spike,rmse_normal,mae_normal,time
0,59.206006,32.266572,75.095039,40.107748,55.613483,30.75085,11222.361281
1,36.35653,19.49179,56.834863,28.62913,30.859672,17.722749,8934.704042
2,27.214883,17.25915,30.628762,21.652058,26.625771,16.553554,8004.332963
3,28.155884,16.697367,32.37713,21.613011,27.451361,15.94371,9490.615029
4,29.814576,18.606296,40.733153,24.234102,27.74515,17.736017,6434.632526
5,29.35544,22.055076,41.345949,31.517682,26.099331,20.039576,2193.240561


In [24]:
# show the results table as the cell output
results

Unnamed: 0,rmse_general,mae_general,rmse_spike,mae_spike,rmse_normal,mae_normal,time
0,36.753872,24.360023,49.547439,31.386293,33.725673,23.001824,4877.699733
1,38.962476,19.308989,61.775859,30.429582,32.758714,17.15598,3691.841471
2,27.244339,17.124567,33.328991,22.994592,26.135302,16.181714,3212.422024
3,28.740984,18.11934,32.508364,22.292849,28.118785,17.479466,2486.948337
4,34.610033,30.461997,40.24653,35.016924,33.654237,29.757627,2265.764607


In [29]:
!pip install matplotlib

import matplotlib.pyplot as plt



In [30]:
%matplotlib notebook

dates_labels = ['12 ',
                '10 ',
                '8 ',
                '6 ',
                '4 ',
                '2 ']

plt.figure(figsize=(10,4))
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth='0.5')
plt.grid(which='minor', linestyle=':', linewidth='0.5')
plt.title('LSTM: Averaged RMSE for different\n predictive windows')
plt.plot(rmse_gen, label = 'Overall error')
plt.plot(rmse_spi, label = 'Spike regions')
plt.plot(rmse_nor, label = 'Normal regions')
plt.legend()
plt.ylabel('RMSE (£/MWh)')
plt.xlabel('Predictive window (in months)')
plt.xticks([0,1,2,3,4,5], dates_labels)
plt.tight_layout()
plt.savefig('RMSE_predictive_window.png')

plt.figure(figsize=(10,4))
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth='0.5')
plt.grid(which='minor', linestyle=':', linewidth='0.5')
plt.title('LSTM: Averaged MAE for different\n predictive windows')
plt.plot(mae_gen, label = 'Overall error')
plt.plot(mae_spi, label = 'Spike regions')
plt.plot(mae_nor, label = 'Normal regions')
plt.legend()
plt.ylabel('MAE (£/MWh)')
plt.xlabel('Predictive window (in months)')
plt.xticks([0,1,2,3,4,5], dates_labels)
plt.tight_layout()
plt.savefig('MAE_predictive_window.png')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>