# Regression LSTM with best parameters
    find the best prediction window to apply with a rolling Nested CV technique

# Import libraries

In [1]:
!pip install pandas
!pip install sklearn
!pip install matplotlib

import matplotlib.pyplot as plt
import pandas as pd;
import numpy as np;
import sklearn
import time

dates = [2018010000, 
         2018030000, 
         2018050000,
         2018070000, 
         2018090000, 
         2018110000]

Collecting pandas
  Downloading pandas-1.1.0-cp37-cp37m-manylinux1_x86_64.whl (10.5 MB)
[K     |████████████████████████████████| 10.5 MB 10.8 MB/s eta 0:00:01
[?25hCollecting pytz>=2017.2
  Downloading pytz-2020.1-py2.py3-none-any.whl (510 kB)
[K     |████████████████████████████████| 510 kB 67.5 MB/s eta 0:00:01
Installing collected packages: pytz, pandas
Successfully installed pandas-1.1.0 pytz-2020.1
Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting scikit-learn
  Downloading scikit_learn-0.23.1-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 16.0 MB/s eta 0:00:01
Collecting joblib>=0.11
  Downloading joblib-0.16.0-py3-none-any.whl (300 kB)
[K     |████████████████████████████████| 300 kB 62.1 MB/s eta 0:00:01
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.1.0-py3-none-any.whl (12 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25ldone
[?25h

# Import keras libraries, packages and data:

In [2]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import LeakyReLU
from keras import initializers
from keras import optimizers
from keras.callbacks import EarlyStopping

mae_gen = []
rmse_gen = []
mae_nor = []
mae_spi = []
rmse_nor = []
rmse_spi = []
hist_list = []
y_pred_list = []
prediction_list = []
time_count = []

# import data
data_full = pd.read_csv('Data_set_1_smaller.csv', index_col = 0)

# parameters
features_num = 15
steps = 96
n_hidden = 1
units = 150
batch_size = 96
epochs = 180

Using TensorFlow backend.


# Create loop for different dates:

In [None]:
# function to split data into correct shape for RNN
def split_data(X, y, steps):
    X_, y_ = list(), list()
    for i in range(steps, len(y)):
        X_.append(X[i - steps : i, :])
        y_.append(y[i]) 
    return np.array(X_), np.array(y_)

# function to cut data set so it can be divisible by the batch_size
def cut_data(data, batch_size):
     # see if it is divisivel
    condition = data.shape[0] % batch_size
    if condition == 0:
        return data
    else:
        return data[: -condition]

# design the LSTM
def regressor_tunning(bias_initializer = initializers.Ones() , kernel_initializer = 'he_normal'):
    model = Sequential()
    if n_hidden == 0:
        model.add(LSTM(units = units,                    
                       batch_input_shape = (batch_size, steps, features_num), 
                       stateful = True,
                       kernel_initializer = kernel_initializer,
                       bias_initializer = bias_initializer))
        model.add(LeakyReLU(alpha = 0.2))
        model.add(Dropout(0.2))
    else:
        model.add(LSTM(units = units,                    
                       batch_input_shape = (batch_size, steps, features_num), 
                       stateful = True,
                       return_sequences = True,
                       kernel_initializer = kernel_initializer,
                       bias_initializer = bias_initializer))
        model.add(LeakyReLU(alpha = 0.2))
        model.add(Dropout(0.2))
        model.add(LSTM(units = units, 
                       batch_input_shape = (batch_size, steps, features_num), 
                       stateful = True,
                       kernel_initializer = kernel_initializer,
                       bias_initializer = bias_initializer))
        model.add(LeakyReLU(alpha = 0.2))
        model.add(Dropout(0.2))
    model.add(Dense(1, activation='linear'))
    optimizer = optimizers.RMSprop(lr = i)
    model.compile(loss = 'mse', metrics = ['mse', 'mae'], optimizer = optimizer)
    return model

  
# LOOP STARTS
for i in dates:
    start_time = time.time()
    # data
    data = data_full.loc[data_full.index > 2018070000, :]

    # reset index
    data.reset_index(inplace = True)
    data.drop('index', axis = 1, inplace = True)

    # fill nan values
    data.fillna(method = 'ffill', inplace = True)

    from sklearn.model_selection import train_test_split

    # divide data into train and test 
    data_train, data_test = train_test_split(
             data, test_size = 0.15, shuffle=False)    
    
    from sklearn.preprocessing import MinMaxScaler

    # data scaling  (including offer (y))
    sc_X = MinMaxScaler()
    data_train = sc_X.fit_transform(data_train)
    data_test = sc_X.transform(data_test)
    
    # divide features and labels
    X_train = data_train[:, 0:15] 
    y_train = data_train[:, -1]
    X_test = data_test[:, 0:15] 
    y_test = data_test[:, -1] 

    # divide data into train and test 
    X_train, X_val, y_train, y_val = train_test_split(
             X_train, y_train, test_size = 0.15, shuffle=False)

    # put data into correct shape
    X_train, y_train = split_data(X_train, y_train, steps)
    X_test, y_test = split_data(X_test, y_test, steps)
    X_val, y_val = split_data(X_val, y_val, steps)

    X_train = cut_data(X_train, batch_size)
    y_train = cut_data(y_train, batch_size)
    X_test = cut_data(X_test, batch_size)
    y_test = cut_data(y_test, batch_size)
    X_val = cut_data(X_val, batch_size)
    y_val = cut_data(y_val, batch_size)

    model = regressor_tunning()
    
    # fitting the LSTM to the training set
    history = model.fit(X_train,
                        y_train, 
                        batch_size = batch_size, 
                        epochs = epochs,
                        shuffle = False, 
                        validation_data = (X_val, y_val))
    
    model.reset_states()
    
    # make new predicitons with test set
    y_pred = model.predict(X_test, batch_size = batch_size)
    
    # prices col = 15 (inverso should not be used as scalling was made with the whole data set)
    y_pred = (y_pred * sc_X.data_range_[15]) + (sc_X.data_min_[15])
    y_test = (y_test * sc_X.data_range_[15]) + (sc_X.data_min_[15])

    # smal adjustment
    y_test = pd.Series(y_test)
    y_test.replace(0, 0.0001,inplace = True)
    
    y_pred_list.append(y_pred)
    
    from sklearn.metrics import mean_squared_error as mse
    from sklearn.metrics import mean_absolute_error as mae

    rmse_error = mse(y_test, y_pred, squared = False)
    mae_error = mae(y_test, y_pred)
    
    rmse_gen.append(rmse_error)
    mae_gen.append(mae_error)
    
    # =============================================================================
    # Metrics evaluation on spike regions
    # =============================================================================
    
    y_spike_occ = pd.read_csv('Spike_binary_1std.csv', usecols = [6])
    
    # create array same size as y_test
    y_spike_occ = y_spike_occ.iloc[- len(y_test):]
    y_spike_occ = pd.Series(y_spike_occ.iloc[:,0]).values

    # select y_pred and y_test only for regions with spikes
    y_test_spike = (y_test.T * y_spike_occ).T
    y_pred_spike = (y_pred.T * y_spike_occ).T
    y_test_spike = y_test_spike[y_test_spike != 0]
    y_pred_spike = y_pred_spike[y_pred_spike != 0]
    
    # calculate metric
    rmse_spike = mse(y_test_spike, y_pred_spike, squared = False)
    mae_spike = mae(y_test_spike, y_pred_spike)
    
    rmse_spi.append(rmse_spike)
    mae_spi.append(mae_spike)
    
    # =============================================================================
    # Metric evaluation on normal regions
    # =============================================================================
    
    # inverse y_spike_occ so the only normal occurences are chosen
    y_normal_occ = (y_spike_occ - 1) * (-1)
    
    # sanity check
    y_normal_occ.sum() + y_spike_occ.sum() # gives the correct total 
    
    # select y_pred and y_test only for normal regions
    y_test_normal = (y_test.T * y_normal_occ).T
    y_pred_normal = (y_pred.T * y_normal_occ).T
    y_test_normal = y_test_normal[y_test_normal != 0.00]
    y_pred_normal = y_pred_normal[y_pred_normal != 0.00]
    
    # calculate metric
    rmse_normal = mse(y_test_normal, y_pred_normal, squared = False)
    mae_normal = mae(y_test_normal, y_pred_normal)
    
    rmse_nor.append(rmse_normal)
    mae_nor.append(mae_normal)
    
    elapsed_time = time.time() - start_time

    time_count.append(elapsed_time)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


Train on 6240 samples, validate on 960 samples
Epoch 1/180

In [None]:
results = pd.DataFrame({                        
                        'rmse_general': rmse_gen, 
                 
                        'mae_general': mae_gen,
                        
                        'rmse_spike': rmse_spi,
                 
                        'mae_spike': mae_spi,
                        
                        'rmse_normal': rmse_nor,
                    
                        'mae_normal': mae_nor,
    
                        'time': time_count}, index = dates_labels)

In [None]:
def highlight_min(s):
    '''
    highlight the maximum in a Series yellow.
    '''
    is_max = s == s.min()
    return ['background-color: yellow' if v else '' for v in is_max]

results.style.apply(highlight_min)

In [None]:
results.min()

In [None]:
!pip install matplotlib

%matplotlib notebook

import matplotlib.pyplot as plt

In [None]:
dates_labels = ['12 ',
                '10 ',
                '8 ',
                '6 ',
                '4 ',
                '2 ']

plt.figure(figsize=(10,4))
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth='0.5')
plt.grid(which='minor', linestyle=':', linewidth='0.5')
plt.title('LSTM: Averaged RMSE for different\n predictive windows')
plt.plot(rmse_gen, label = 'Overall error')
plt.plot(rmse_spi, label = 'Spike regions')
plt.plot(rmse_nor, label = 'Normal regions')
plt.legend()
plt.ylabel('RMSE (£/MWh)')
plt.xlabel('Predictive window (in months)')
plt.xticks([0,1,2,3,4,5], dates_labels)
plt.tight_layout()
plt.savefig('RMSE_predictive_window.png')

plt.figure(figsize=(10,4))
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth='0.5')
plt.grid(which='minor', linestyle=':', linewidth='0.5')
plt.title('LSTM: Averaged MAE for different\n predictive windows')
plt.plot(mae_gen, label = 'Overall error')
plt.plot(mae_spi, label = 'Spike regions')
plt.plot(mae_nor, label = 'Normal regions')
plt.legend()
plt.ylabel('MAE (£/MWh)')
plt.xlabel('Predictive window (in months)')
plt.xticks([0,1,2,3,4,5], dates_labels)
plt.tight_layout()
plt.savefig('MAE_predictive_window.png')