In [1]:
%matplotlib inline

import os
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import signal
from scipy import stats

import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.arima_process import ArmaProcess
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima import auto_arima
from sklearn.metrics import mean_squared_error
from statsmodels.tools.eval_measures import rmse
from sklearn.preprocessing import MinMaxScaler

plt.style.use('ggplot')

import warnings
warnings.filterwarnings('ignore')

  _py_version.major, _py_version.minor, _py_version.micro,


Ingest Data File and Make Necessary Adjustments

In [2]:
data = pd.read_csv('cap3_data.csv')

In [3]:
data = data.drop(['Unnamed: 0'], axis=1)
data['UTC Time at End of Hour'] = pd.to_datetime(data['UTC Time at End of Hour'], errors='raise')
data = data.set_index('UTC Time at End of Hour')

Calculate Demand Mean and Fill NaNs in Column with Mean/Zeros

In [6]:
og_mean = data['Demand (MW)'].mean()
og_mean

343.39225844920753

In [29]:
zero_mean = data['Demand (MW)'].mean()
zero_mean

85.02272118128093

In [4]:
data['Demand (MW)'] = data['Demand (MW)'].fillna(0)
#data['Demand (MW)'] = data['Demand (MW)'].fillna(og_mean)

In [12]:
data.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3156438 entries, 2015-07-01 06:00:00 to 2021-01-01 05:00:00
Data columns (total 13 columns):
 #   Column                                                  Non-Null Count    Dtype  
---  ------                                                  --------------    -----  
 0   Balancing Authority                                     3156438 non-null  object 
 1   Demand Forecast (MW)                                    745868 non-null   float64
 2   Demand (MW)                                             3156438 non-null  float64
 3   Net Generation (MW)                                     1278726 non-null  float64
 4   Total Interchange (MW)                                  2342764 non-null  float64
 5   Net Generation (MW) from Coal                           2198539 non-null  float64
 6   Net Generation (MW) from Natural Gas                    2382768 non-null  float64
 7   Net Generation (MW) from Nuclear                        1808818 no

In [9]:
data.head()

Unnamed: 0_level_0,Balancing Authority,Demand Forecast (MW),Demand (MW),Net Generation (MW),Total Interchange (MW),Net Generation (MW) from Coal,Net Generation (MW) from Natural Gas,Net Generation (MW) from Nuclear,Net Generation (MW) from All Petroleum Products,Net Generation (MW) from Hydropower and Pumped Storage,Net Generation (MW) from Solar,Net Generation (MW) from Wind,Net Generation (MW) from Other Fuel Sources
UTC Time at End of Hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2015-07-01 06:00:00,AEC,882.0,422.0,670.0,248.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-07-01 07:00:00,AEC,819.0,395.0,620.0,225.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-07-01 08:00:00,AEC,782.0,382.0,637.0,255.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-07-01 09:00:00,AEC,763.0,370.0,619.0,249.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-07-01 10:00:00,AEC,774.0,383.0,633.0,250.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
demand = data['Demand (MW)']

In [40]:
demand.head()

UTC Time at End of Hour
2015-07-01 06:00:00    422.0
2015-07-01 07:00:00    395.0
2015-07-01 08:00:00    382.0
2015-07-01 09:00:00    370.0
2015-07-01 10:00:00    383.0
Name: Demand (MW), dtype: float64

In [None]:
#auto_arima(data['Demand (MW)'], 
#           seasonal=True, 
#           m=12, 
#           max_p=7, 
#           max_d=5, 
#           max_q=7, 
#           max_P=4, 
#           max_D=4, 
#           max_Q=4).summary()

Defining the Size of Training and Testing Data

In [10]:
size = int(len(demand) * (2/3))
train_data = demand[:size]
test_data = demand[size:]

In [None]:
# walk-forward validation
for t in range(len(test)):
	model = ARIMA(history, order=(5,1,0))
	model_fit = model.fit()
	output = model_fit.forecast()
	yhat = output[0]
	predictions.append(yhat)
	obs = test[t]
	history.append(obs)
	print('predicted=%f, expected=%f' % (yhat, obs))

Creating the ARIMA Model

In [32]:
arima_model = SARIMAX(train_data)#, order = (2,1,1), seasonal_order = (4,0,3,12))
arima_result = arima_model.fit()
arima_result.summary()

0,1,2,3
Dep. Variable:,Demand (MW),No. Observations:,2104292.0
Model:,"SARIMAX(1, 0, 0)",Log Likelihood,-11764008.929
Date:,"Thu, 14 Jan 2021",AIC,23528021.858
Time:,09:10:18,BIC,23528046.977
Sample:,0,HQIC,23528028.571
,- 2104292,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ar.L1,0.9548,8.94e-05,1.07e+04,0.000,0.955,0.955
sigma2,4201.0372,0.541,7758.545,0.000,4199.976,4202.099

0,1,2,3
Ljung-Box (L1) (Q):,11995.57,Jarque-Bera (JB):,3482848817.97
Prob(Q):,0.0,Prob(JB):,0.0
Heteroskedasticity (H):,1.05,Skew:,0.89
Prob(H) (two-sided):,0.0,Kurtosis:,202.3


Predictions with Zeroes for NaNs

In [33]:
arima_zero_pred = arima_result.predict(
    start = len(train_data), 
    end = (len(train_data) + len(test_data)-1), typ="levels").rename("ARIMA Zero Predictions")

In [34]:
arima_zero_pred

2104292   -5.434722e-323
2104293   -5.434722e-323
2104294   -5.434722e-323
2104295   -5.434722e-323
2104296   -5.434722e-323
               ...      
3156433   -5.434722e-323
3156434   -5.434722e-323
3156435   -5.434722e-323
3156436   -5.434722e-323
3156437   -5.434722e-323
Name: ARIMA Zero Predictions, Length: 1052146, dtype: float64

Predictions with Demand Mean

In [20]:
arima_mean_pred = arima_result.predict(
    start = len(train_data), 
    end = (len(train_data) + len(test_data)-1), typ="levels").rename("ARIMA Mean Predictions")

In [21]:
arima_mean_pred

2104292     3.409387e+02
2104293     3.385028e+02
2104294     3.360842e+02
2104295     3.336829e+02
2104296     3.312988e+02
               ...      
3156433    3.409053e-322
3156434    3.409053e-322
3156435    3.409053e-322
3156436    3.409053e-322
3156437    3.409053e-322
Name: ARIMA Predictions, Length: 1052146, dtype: float64

Measuring Performance of the ARIMA Model

Performance when the NaNs are Filled with Zeroes

In [48]:
arima_rmse_error = rmse(test_data, arima_pred)
arima_mse_error = arima_rmse_error**2
mean_value = data['Demand (MW)'].mean()

print(f'MSE Error: {arima_mse_error}\nRMSE Error: {arima_rmse_error}\nMean: {mean_value}')

MSE Error: 48550.670196955056
RMSE Error: 220.34216618013687
Mean: 85.02272118128093


Performance when NaNs are Filled with the Mean Demand

In [22]:
arima_rmse_error = rmse(test_data, arima_mean_pred)
arima_mse_error = arima_rmse_error**2
mean_value = data['Demand (MW)'].mean()

print(f'MSE Error: {arima_mse_error}\nRMSE Error: {arima_rmse_error}\nMean: {mean_value}')

MSE Error: 137802.64675154808
RMSE Error: 371.2177888403896
Mean: 343.3922584492074


LSTM Model

In [5]:
import tensorflow as tf

In [6]:
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM

Instantiating the Scaler and Scaling the Data

In [7]:
scaler = MinMaxScaler()

In [13]:
train_lstm = train_data.values.reshape(-1, 1)
test_lstm = test_data.values.reshape(-1, 1)

In [None]:
x_train, x_test, y_train, y_test

In [14]:
scaler.fit(train_lstm)
scaled_train = scaler.transform(train_lstm)
scaled_test = scaler.transform(test_lstm)

Instantiating the LSTM Model

In [17]:
lstm_model = Sequential()
lstm_model.add(LSTM(200, activation='relu', input_shape=(n_input, n_features)))
lstm_model.add(Dense(num_pred, activation='linear'))
lstm_model.compile(optimizer='adam', loss='mse')

lstm_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
unified_lstm (UnifiedLSTM)   (None, 200)               161600    
_________________________________________________________________
dense (Dense)                (None, 1)                 201       
Total params: 161,801
Trainable params: 161,801
Non-trainable params: 0
_________________________________________________________________


In [20]:
n_input = 12
n_features = 1
num_pred = 1
generator = TimeseriesGenerator(scaled_train, 
                                train_lstm, 
                                length=n_input, batch_size=1)

Verifying Scaled/Train Data

In [23]:
batch_0 = generator[0]
x, y = batch_0
x

array([[[0.70990447],
        [0.69632981],
        [0.68979387],
        [0.68376068],
        [0.69029663],
        [0.69733534],
        [0.70889894],
        [0.72649573],
        [0.7456008 ],
        [0.77626948],
        [0.80442433],
        [0.83257919]]])

In [24]:
y

array([[715.]])

In [None]:
lstm_model.fit_generator(generator, epochs=2, verbose=1)

Epoch 1/2
 216364/2104280 [==>...........................] - ETA: 22:30:23 - loss: 14412.6503

In [None]:
lstm_predictions_scaled = list()

batch = scaled_train_data[-n_input:]
current_batch = batch.reshape((1, n_input, n_features))

In [None]:
for i in range(len(test_data)):
    lstm_pred = lstm_model.predict(current_batch)[0]
    lstm_predictions_scaled.append(lstm_pred)
    current_batch = np.append(current_batch[:,1:,:],[[lstm_pred]], axis=1)

In [None]:
lstm_predictions_scaled

In [None]:
lstm_predictions = scaler.inverse_transform(lstm_predictions_scaled)
lstm_predictions

In [None]:
test_data['LSTM_Predictions'] = lstm_predictions
test_data

In [None]:
test_data.plot(figsize=(16,5), legend=True)
test_data['LSTM_Predictions'].plot(legend=True)

In [None]:
lstm_rmse_error = rmse(test_data['Demand (MW)'],
                      test_data['LSTM_Predictions'])
lstm_mst_error = lstm_rmse_error**2
mean_value = data['Demand (MW)'].mean()

print(f'MSE Error: {lstm_mse_error}\n
RMSE Error:{lstm_rmse_error}\
nMean: {mean_value})

In [8]:
def windowize_data(data, n_prev):
    n_predictions = len(data) - n_prev
    y = data[n_prev:]
    # this might be too clever
    indices = np.arange(n_prev) + np.arange(n_predictions)[:, None]
    x = data[indices, None]
    return x, y

In [9]:
def split_and_windowize(data, n_prev, fraction_test=0.3):
    n_predictions = len(data) - 2*n_prev
    
    n_test  = int(fraction_test * n_predictions)
    n_train = n_predictions - n_test   
    
    x_train, y_train = windowize_data(data[:n_train], n_prev)
    x_test, y_test = windowize_data(data[n_train:], n_prev)
    return x_train, x_test, y_train, y_test

In [10]:
n_prev = 50

In [12]:
x_train, x_test, y_train, y_test = split_and_windowize(demand, n_prev=50)

In [17]:
#x_train, x_test, y_train, y_test = split_and_windowize(sin_t_noisy, n_prev)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((2209387, 50, 1), (946951, 50, 1), (2209387,), (946951,))

In [None]:
model = keras.Sequential()
model.add(keras.layers.SimpleRNN(32, input_shape=(n_prev, 1), return_sequences=True))
model.add(keras.layers.SimpleRNN(32, return_sequences=False))
model.add(keras.layers.Dense(1, activation='linear'))
model.compile(optimizer='rmsprop',
              loss='mse')

In [None]:
model.summary()

In [None]:
model.fit(x_train, y_train, batch_size=32, epochs=2)

In [None]:
y_pred = model.predict(x_test)