#### Roughly following guide in:
###### https://pangkh98.medium.com/multi-step-multivariate-time-series-forecasting-using-lstm-92c6d22cd9c2

In [64]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Columns pulled from the CSV: two predictors plus the forecast target.
features = [
    "Springfield Plateau Aquifer Depth to Water Level (ft)",
    "Fire 120 Hour Rainfall Aggregate",
    "SWTP Total Influent Flow",
]
dataset = pd.read_csv("Imputed Data.csv", usecols=features)

# Reorder the columns: y (the variable to predict) MUST BE THE LAST FEATURE,
# because the windowing step takes the last column as the target.
dataset = dataset[
    [
        "Fire 120 Hour Rainfall Aggregate",
        "Springfield Plateau Aquifer Depth to Water Level (ft)",
        "SWTP Total Influent Flow",
    ]
]
values = dataset.values

# Min-max scaling: linear map of each column from [min, max] onto [0, 1].
# NOTE(review): the scaler is fitted on the whole series, i.e. before any
# train/test split is made further down — confirm that this is intended.
scaler = MinMaxScaler()
scaler.fit(values)
scaled = scaler.transform(values)

In [65]:
# split a multivariate sequence into samples
def split_sequences(sequences, n_steps_in, n_steps_out):
    """Split a multivariate sequence into supervised-learning samples.

    Every sample pairs a window of ``n_steps_in`` consecutive rows of the
    input columns (all columns except the last) with ``n_steps_out``
    consecutive values of the target (the last column).  The target window
    starts at the final row of the input window, so the two overlap by one
    time step.

    Args:
        sequences: 2-D array, one row per time step, target in the last column.
        n_steps_in: number of time steps in each input window.
        n_steps_out: number of target values in each output window.

    Returns:
        Tuple ``(X, y)`` of numpy arrays.  When at least one window fits,
        ``X`` has shape (samples, n_steps_in, n_columns - 1) and ``y`` has
        shape (samples, n_steps_out); otherwise both are empty.
    """
    X, y = [], []
    n_rows = len(sequences)
    for start in range(n_rows):
        # find the end of this pattern
        end_ix = start + n_steps_in
        out_end_ix = end_ix + n_steps_out - 1
        # stop as soon as the output window would run past the data
        if out_end_ix > n_rows:
            break
        # gather input and output parts of the pattern (inside the loop, so
        # that one sample is collected per valid start position)
        X.append(sequences[start:end_ix, :-1])
        y.append(sequences[end_ix - 1:out_end_ix, -1])
    return np.array(X), np.array(y)


# Window sizes: number of past time steps fed to the model, and number of
# target values it has to forecast.
n_steps_in = 60
n_steps_out = 24

# Convert the scaled series into input/output samples.
X, y = split_sequences(scaled, n_steps_in, n_steps_out)
print("X.shape", X.shape)  # [rows, time lags backward, features]
print("y.shape", y.shape)  # [rows, future time values]

# Train/test split: the first 4 * 365 * 24 samples ("first four years",
# which presumes hourly rows) are for training, the remainder for testing.
split_point = 4 * 365 * 24
train_X, test_X = X[:split_point], X[split_point:]
train_y, test_y = y[:split_point], y[split_point:]
print("\ntrain_X.shape", train_X.shape)
print("train_y.shape", train_y.shape)
print("test_X.shape", test_X.shape)
print("test_y.shape", test_y.shape)

X.shape (44173, 60, 2)
y.shape (44173, 30)

train_X.shape (35040, 60, 2)
train_y.shape (35040, 30)
test_X.shape (9133, 60, 2)
test_y.shape (9133, 30)


In [78]:
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense, LSTM, Activation

# Building the model: two stacked LSTM layers followed by a dense layer that
# emits one value per forecast step.
model = Sequential()
# The first LSTM returns its full output sequence so that the second LSTM can
# consume it; the -1 removes the target variable from the feature count.
model.add(LSTM(50, return_sequences=True, input_shape=(n_steps_in, len(features) - 1)))
model.add(LSTM(50))
model.add(Dense(n_steps_out))
model.add(Activation('linear'))
# `metrics` is passed as a list: Keras documents it as a list of metrics, and
# recent releases reject a bare string.
model.compile(loss='mae', optimizer='adam', metrics=['mse'])

In [79]:
# Fit the network (this will take a while).  The test split is passed as the
# validation data, and shuffling is switched off.
# NOTE(review): steps_per_epoch=25 caps each epoch at 25 batches — confirm
# that this covers as much of train_X as intended.
history = model.fit(
    x=train_X,
    y=train_y,
    epochs=20,
    steps_per_epoch=25,
    shuffle=False,
    validation_data=(test_X, test_y),
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [99]:
def invNormalize(arr, minimum, maximum):
    """Undo min-max scaling: map ``arr`` from [0, 1] back onto [minimum, maximum].

    Args:
        arr: scaled value(s); a scalar or anything supporting ``*`` and ``+``
            with a number (e.g. a numpy array).
        minimum: lower bound of the original value range.
        maximum: upper bound of the original value range.

    Returns:
        ``arr`` rescaled to the original range.
    """
    span = maximum - minimum
    return span * arr + minimum

def predict(model, test_X, y_true=None):
    """Forecast with ``model`` and return predictions and targets in original units.

    The minimum and maximum of the "SWTP Total Influent Flow" column are
    re-read from "Imputed Data.csv" and used to undo the [0, 1] scaling.

    Args:
        model: fitted model exposing ``predict``.
        test_X: scaled input windows to forecast from.
        y_true: scaled target windows matching ``test_X``.  Defaults to the
            notebook-level ``test_y``, which is what this function always
            used before the parameter existed.

    Returns:
        Tuple ``(y_pred_inv, test_y_inv)``: the rescaled predictions and the
        rescaled true targets.
    """
    if y_true is None:
        y_true = test_y

    # to be able to inverse scale predictions
    df = pd.read_csv("Imputed Data.csv", usecols=["SWTP Total Influent Flow"])
    arr = np.array(df["SWTP Total Influent Flow"])
    maximum = np.max(arr)
    minimum = np.min(arr)

    # predictions and rescaling to [min, max]; invNormalize is element-wise,
    # so whole arrays are rescaled in a single call
    y_pred = model.predict(test_X)
    y_pred_inv = invNormalize(np.asarray(y_pred), minimum, maximum)
    test_y_inv = invNormalize(np.asarray(y_true), minimum, maximum)
    print("y_pred_inv:", y_pred_inv.shape)
    print("test_y_inv:", test_y_inv.shape)  # previously reported y_pred_inv's shape

    return y_pred_inv, test_y_inv

def mseForecast(y, y_pred):
    """Print and return the mean of the per-window mean squared errors.

    Args:
        y: array of true target windows, one window per row (``y.shape[0]``
            is used as the number of windows).
        y_pred: predicted windows, indexable row by row like ``y``.

    Returns:
        The average over all rows of ``mean_squared_error(y[i], y_pred[i])``.
        A zero-row ``y`` makes the final division fail.
    """
    # TODO: change so that this only covers a range, i.e. the first 72 days of
    # the test set.  MSE is going to scale really badly the further out it
    # goes.  Must fix: MSE cannot be calculated from a full year; it must use a
    # rolling window that forecasts 3 days in advance, has the new next hour
    # put into it, then makes the next 3-day forecast one hour later.
    n_windows = y.shape[0]
    # one MSE evaluation per window (it used to be computed twice per row)
    totalMSE = sum(mean_squared_error(y[i], y_pred[i]) for i in range(n_windows))
    average = totalMSE / n_windows
    print("MSE:", average)
    return average

y_pred_inv, test_y_inv = predict(model, test_X)
# mseForecast(test_y_inv, y_pred_inv)

# Inspect the first test window only: its MSE, then each true value next to
# the corresponding forecast.
print("MSE", mean_squared_error(test_y_inv[0], y_pred_inv[0]))
# The loop variables are deliberately not called `y`/`yhat`: at notebook level
# a loop variable named `y` would overwrite the target array `y` built by the
# windowing cell.
for actual, forecast in zip(test_y_inv[0], y_pred_inv[0]):
    print(actual, forecast)

y_pred_inv: (9133, 30)
test_y_inv: (9133, 30)
MSE 10.630680189464575
36.83265021999999 44.198612
36.24014786 43.74248
37.88015705 44.65155
40.94266323 44.92485
42.39016823 44.684105
46.00768234 44.468773
46.44018364 44.274197
46.81518364 44.08175
46.787688059999994 44.70101
46.58518219 44.191216
46.497684019999994 43.930122
43.96267318999999 44.04759
46.520184099999994 44.307735
44.46767715 43.84969
44.33017448 43.823193
44.350175629999995 43.80226
41.4501667 43.98481
40.33016109 42.824524
40.33016109 43.732445
40.33016109 43.379227
40.33016109 43.89429
40.33016109 43.25763
40.33016109 42.5062
40.33016109 42.63833
40.33016109 42.4592
40.33016109 42.634827
40.33016109 42.687115
40.33016109 43.375675
40.33016109 42.72474
40.33016109 43.34558
