#### Roughly following the guide in:
###### https://pangkh98.medium.com/multi-step-multivariate-time-series-forecasting-using-lstm-92c6d22cd9c2

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Columns read from the CSV. 'SWTP Total Influent Flow' is both an input
# feature and (via the "Target" copy added below) the quantity being forecast.
features = ['SWTP Total Influent Flow', 'SWTP Plant 1 Influent Flow', 'SWTP Plant 2 Influent Flow',
            'Wilsons Gauge Height (ft)', 'James Gauge Height (ft)', 
            'Fire 120 Hour Rainfall Aggregate', 'Bingham 120 Hour Rainfall Aggregate', 'Field 120 Hour Rainfall Aggregate', 
            'Springfield Plateau Aquifer Depth to Water Level (ft)', 'Ozark Aquifer Depth to Water Level (ft)']

# features = ["Springfield Plateau Aquifer Depth to Water Level (ft)", "Fire 120 Hour Rainfall Aggregate", "SWTP Total Influent Flow"]
dataset = pd.read_csv("Train and Test Data.csv", usecols=features)
# dataset = pd.read_csv("Imputed Data.csv", usecols=features)
arr = np.array(dataset["SWTP Total Influent Flow"])
# Append a copy of the influent flow as the LAST column: split_sequences()
# uses every column but the last as input and the last column as the label,
# so past flow values can be used to predict future flow values.
dataset["Target"] = arr
values = dataset.values

# Linear transformation of each column from [min, max] to [0, 1].
# NOTE(review): the scaler is fit on every row of the file, i.e. before any
# train/test split is made further down -- confirm this is intended.
scaler = MinMaxScaler()
scaled = scaler.fit_transform(values)

In [None]:
from tensorflow import keras
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense, LSTM, Activation
from keras_tuner.tuners import BayesianOptimization
import os

# path = "C:\\Users\\natha\\Desktop\\Undergrad\\Spring2022\\MTH 596 PIC Math\\Project - Group 2\\Project\\Forecasting\\keras_tuner_attempt4\\model"

# split a multivariate sequence into samples
def split_sequences(sequences, n_steps_in, n_steps_out):
    """Cut a 2-D series into overlapping (input window, target window) samples.

    Every column except the last is treated as an input feature; the last
    column holds the value to forecast.  For a sample starting at row ``i``:

    * the input is rows ``i`` to ``i + n_steps_in - 1`` of the feature columns;
    * the target is the last column of rows ``i + n_steps_in - 1`` to
      ``i + n_steps_in + n_steps_out - 2`` (so the first target row is the
      same row as the last input row).

    Args:
        sequences: 2-D array, one row per time step.
        n_steps_in: number of rows in each input window.
        n_steps_out: number of values in each target window.

    Returns:
        ``(X, y)`` as NumPy arrays built from the collected windows; both are
        empty 1-D arrays when the series is too short for a single sample.
    """
    n_rows = len(sequences)
    # Rows covered by one sample; input and target share one row, hence the -1.
    span = n_steps_in + n_steps_out - 1
    # Whether a sample fits only gets worse as the start index grows, so
    # filtering the start indices is the same as stopping at the first misfit.
    starts = [i for i in range(n_rows) if i + span <= n_rows]

    inputs = [sequences[i:i + n_steps_in, :-1] for i in starts]
    targets = [sequences[i + n_steps_in - 1:i + span, -1] for i in starts]
    return np.array(inputs), np.array(targets)

# for testing on different parts of the total year testing set
# use slide = 15 for 8 different windows will cover 24 days of 72 hour forecasts
def sliding_window(X, y, n_test, slide):
    """Split samples, in order, into a leading training part and a trailing test part.

    The boundary is placed ``n_test`` samples before the end of ``X`` and then
    moved ``slide`` samples later, so a larger ``slide`` grows the training
    part and shrinks the test part.

    Args:
        X: input samples, indexed along the first axis.
        y: target samples, aligned with ``X`` along the first axis.
        n_test: number of trailing samples reserved for testing when ``slide`` is 0.
        slide: how far to push the boundary towards the end.

    Returns:
        ``(train_X, train_y, test_X, test_y)``.
    """
    boundary = X.shape[0] - n_test + slide
    before = slice(None, boundary)
    after = slice(boundary, None)
    return X[before, :], y[before, :], X[after, :], y[after, :]

def build_model(hp):
    """Build and compile the two-layer LSTM forecaster for one Keras Tuner trial.

    Reads the module-level names ``n_steps_in`` (input window length) and
    ``features`` (input column names) to size the input layer, so both must be
    defined before the tuner calls this function.

    Args:
        hp: Keras Tuner hyperparameter object; it supplies the LSTM width
            ('units') and the Adam learning rate ('learning_rate').

    Returns:
        A compiled ``Sequential`` model with 24 linear outputs.
    """
    model = Sequential()
    # NOTE(review): both LSTM layers request the hyperparameter named 'units';
    # presumably the tuner treats that as one shared value -- confirm if the
    # layers were meant to be tuned independently.
    model.add(LSTM(units=hp.Int('units', min_value=40, max_value=200, step=5),
                   activation='tanh', return_sequences=True,
                   input_shape=(n_steps_in, len(features))))
    model.add(LSTM(units=hp.Int('units', min_value=40, max_value=200, step=5)))
    model.add(Dense(24))   # for predicting 24 hours -- if desire more, change
    model.add(Activation('linear'))
    # `metrics` is documented as a list; a bare string is rejected by newer
    # Keras releases, so it is wrapped here.
    model.compile(
        loss='mse',
        metrics=['mse'],
        optimizer=keras.optimizers.Adam(
            hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4, 1e-5])))
    return model

def invNormalize(arr, minimum, maximum):
    """Undo min-max scaling: map values from [0, 1] back onto [minimum, maximum].

    Works on scalars and, element-wise, on NumPy arrays.
    """
    span = maximum - minimum
    return span * arr + minimum

def predict(model, test_X, test_y, fullData = False):
    """Run the model on ``test_X`` and return predictions and targets in original units.

    The minimum and maximum of the 'SWTP Total Influent Flow' column are read
    back from the CSV and used to invert the [0, 1] scaling.
    NOTE(review): this assumes ``test_y`` was scaled with the min/max of that
    same column in that same file -- confirm against the caller.

    Args:
        model: object exposing ``predict(test_X)``.
        test_X: scaled input samples passed straight to ``model.predict``.
        test_y: scaled target samples matching ``test_X``.
        fullData: when True the range is taken from "Imputed Data.csv",
            otherwise from "Train and Test Data.csv".

    Returns:
        ``(y_pred_inv, test_y_inv)`` -- predictions and targets rescaled to
        [min, max] of the flow column.
    """
    # Read only the file that is actually needed for the requested data set.
    source_csv = "Imputed Data.csv" if fullData else "Train and Test Data.csv"
    df = pd.read_csv(source_csv, usecols=["SWTP Total Influent Flow"])
    arr = np.array(df["SWTP Total Influent Flow"])
    maximum = np.max(arr)
    minimum = np.min(arr)

    # Predictions and rescaling to [min, max]; invNormalize is element-wise,
    # so whole arrays can be converted in one call.
    y_pred = model.predict(test_X)
    y_pred_inv = invNormalize(np.asarray(y_pred), minimum, maximum)
    test_y_inv = invNormalize(np.asarray(test_y), minimum, maximum)
    print("y_pred_inv:", y_pred_inv.shape)
    print("test_y_inv:", test_y_inv.shape)

    return y_pred_inv, test_y_inv

def mseForecast(y, y_pred):
    """Print and return the mean of the per-sample MSEs between ``y`` and ``y_pred``.

    One MSE is computed for each index along the first axis of ``y``; the
    results are then averaged over all of those samples.

    Args:
        y: true values, one forecast window per row.
        y_pred: predicted values, aligned with ``y`` row by row.

    Returns:
        The average of the per-row mean squared errors.
    """
    # TODO (from the original author): restrict this to a range, e.g. the
    # first 72 days of the test set.  The MSE degrades the further out the
    # forecast goes, so it should not be computed over a full year; use a
    # rolling window instead: forecast 3 days ahead, feed in the next observed
    # hour, then issue the next 3-day forecast one hour later.
    n_samples = y.shape[0]
    running_total = 0
    for row in range(n_samples):
        running_total += mean_squared_error(y[row], y_pred[row])
    avgMSE = running_total / n_samples
    print("Total Avg MSE:", avgMSE)
    return avgMSE

def saveResults(path, firstMSE, avgMSE, n_epochs, hours=36):
    """Write a short plain-text summary of one run to ``results.txt`` under ``path``.

    Args:
        path: folder prefix; the file name is appended with a backslash separator.
        firstMSE: value reported as "First 10 Avg MSE" (rounded to 4 decimals).
        avgMSE: value reported as "Total Avg MSE" (rounded to 4 decimals).
        n_epochs: number of epochs to record.
        hours: input window length recorded as ``n_steps_in``.
    """
    report = "".join([
        "n_steps_in = ", str(hours),
        "\nepochs = ", str(n_epochs),
        "\nFirst 10 Avg MSE: ", str(round(firstMSE, 4)),
        "\nTotal Avg MSE: ", str(round(avgMSE, 4)),
        "\n\nForm:\nLSTM\nLSTM\nDense(24)\nActivation('linear')",
    ])
    target = path + "\\results.txt"
    with open(target, 'w') as out:
        out.write(report)

def getValidationData(features, n_steps_in):
    """Build the validation samples from "Imputed Data.csv".

    The file is loaded, a copy of 'SWTP Total Influent Flow' is appended as
    the last ("Target") column, every column is min-max scaled with a scaler
    fitted on this file, and the result is cut into windows with a 24-step
    target.  The trailing 8760 samples are returned.

    Args:
        features: column names to read from the CSV.
        n_steps_in: number of time steps in each input window.

    Returns:
        ``(validate_X, validate_y)`` -- the held-out input and target samples.
    """
    n_steps_out = 24

    # Load the data and append the target column (a copy of the flow column)
    # so that past flow values can be used to predict future flow values.
    frame = pd.read_csv("Imputed Data.csv", usecols=features)
    frame["Target"] = np.array(frame["SWTP Total Influent Flow"])
    scaled_rows = MinMaxScaler().fit_transform(frame.values)

    windows_X, windows_y = split_sequences(scaled_rows, n_steps_in, n_steps_out)
    # 8760 means the last year of data, kept for validation; the training
    # part of the split is not needed here.
    _, _, validate_X, validate_y = sliding_window(windows_X, windows_y, 8760, 0)
    return validate_X, validate_y

def saveValidationResults(path, testMSE, validationMSE, n_epochs, hours=36):
    """Write test and validation MSEs to ``validation results.txt`` under ``path``.

    Args:
        path: folder prefix; the file name is appended with a backslash separator.
        testMSE: value reported as "Avg Test MSE" (rounded to 4 decimals).
        validationMSE: value reported as "Avg Validation MSE" (rounded to 4 decimals).
        n_epochs: number of epochs to record.
        hours: input window length recorded as ``n_steps_in``.
    """
    report = "".join([
        "n_steps_in = ", str(hours),
        "\nepochs = ", str(n_epochs),
        "\nAvg Test MSE: ", str(round(testMSE, 4)),
        "\nAvg Validation MSE: ", str(round(validationMSE, 4)),
        "\n\nForm:\nLSTM\nLSTM\nDense(24)\nActivation('linear')",
    ])
    target = path + "\\validation results.txt"
    with open(target, 'w') as out:
        out.write(report)


In [None]:
# Hour-tuning cell: for every input window length in `hourList`, load the
# model previously saved in the matching "keras_tuner_attempt<index>" folder,
# score it on the test split and on the validation data, and save both MSEs.

# choose a number of time steps
# n_steps_in = 36
# n_steps_in = 48
n_steps_out = 24

# splitting into training and testing
n_tests = [8760, 6552, 4344, 2280]                      # for different validation windows throughout the year
# epochList = [3, 5, 8, 10, 12, 15, 20, 25, 30]
# epochList = [4, 6, 7, 9, 11, 13, 14]
# epochList = [10, 5, 15, 30, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 20, 25]   # for 36 hour input
epochList = [3, 5, 8, 10, 12, 15, 20, 25, 30, 4, 6, 7, 9, 11, 13, 14]   # for 48 hour input (only referenced by commented-out code in this cell)
hourList = [48, 72, 36, 60, 42, 54, 24, 12, 18, 30, 45]     # input window lengths to evaluate
indicies = [2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13]            # folder suffix paired with each entry of hourList
# epochList = [20, 25]

# for i in n_tests:               # number of testing points per year
#     for s in range(7):          # number of sliding windows per testing point
lst = []                          # one (n_epochs, test MSE, validation MSE) tuple per evaluated model
# for i in range(len(epochList)):
for n_steps_in, index in zip(hourList, indicies):
    # convert into input/output
    X, y = split_sequences(scaled, n_steps_in, n_steps_out)
    print ("X.shape" , X.shape)                             # [rows, time lags backward, features]
    print ("y.shape" , y.shape)                             # [rows, future time values]

    # Hard-coded, machine-specific output folder for this window length.
    path = "C:\\Users\\natha\\Desktop\\Undergrad\\Spring2022\\MTH 596 PIC Math\\Project - Group 2\\Project\\Forecasting\\hour tuning\\keras_tuner_attempt"
    path += str(index)
    modelPath = path + "\\model"
    project_title = "keras_tuner_attempt" + str(index)
    n_epochs = 20
    print("Number epochs:", n_epochs)
    for n in n_tests[:1]:               # only the first test size (8760) is used
        for s in range(1):              # only the un-shifted window (slide = 0) is used
            train_X, train_y, test_X, test_y = sliding_window(X, y, n, 15 * s)
            # print("\ntrain_X.shape", train_X.shape)
            # print("train_y.shape", train_y.shape)
            # print("test_X.shape", test_X.shape)
            # print("test_y.shape", test_y.shape)

            # tuning model with keras tuner (disabled: the saved model is loaded below instead)
            # bayesian_opt_tuner = BayesianOptimization(
            #     build_model,
            #     objective='mse',
            #     max_trials=3,
            #     executions_per_trial=1,
            #     directory=os.path.normpath('C:/Users/natha/Desktop/Undergrad/Spring2022/MTH 596 PIC Math/Project - Group 2/Project/Forecasting'),
            #     project_name=project_title,
            #     overwrite=True)
            # bayesian_opt_tuner.search(train_X, train_y, epochs=n_epochs,
            #     validation_data=(test_X, test_y),
            #     validation_split=0.2, verbose=1)
            # bayes_opt_model_best_model = bayesian_opt_tuner.get_best_models(num_models=1)
            # model = bayes_opt_model_best_model[0]
            # model.save(modelPath)

            # string = "Number epochs = " + str(n_epochs)
            # with open(path + "\\note.txt", 'w') as f:
            #     f.write(string)

            # Load the model saved by an earlier tuning run for this folder.
            model = keras.models.load_model(modelPath)

            # predicting on the test split and scoring in original units
            pred_y_inv, test_y_inv = predict(model, test_X, test_y)
            totalAvgMSE = mseForecast(test_y_inv, pred_y_inv)

            # same again on the validation data built from "Imputed Data.csv"
            validate_X, validate_y = getValidationData(features, n_steps_in)
            y_pred_inv, y_validate_inv = predict(model, validate_X, validate_y, True)
            validationAvgMSE = mseForecast(y_validate_inv, y_pred_inv)

            saveValidationResults(path, totalAvgMSE, validationAvgMSE, n_epochs, n_steps_in)

            # NOTE(review): the first tuple element is n_epochs, which is the
            # constant 20 in this loop; the plotting cell's `bestHours` tuples
            # start with the hour count, so n_steps_in was presumably intended.
            lst.append((n_epochs, round(totalAvgMSE, 4), round(validationAvgMSE, 4)))
print(lst)
    

In [7]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib qt

# This cell displays results of models created using above.
# The tables below are hard-coded copies of earlier results; each tuple is
# (setting, test MSE, validation MSE) and only the validation MSE is plotted.

bestHours = [(48, 73.4145, 68.161), (72, 74.0058, 64.4983), (36, 85.1316, 66.0576), (60, 82.8813, 69.0653), 
             (42, 110.3751, 80.0528), (54, 121.7316, 81.2496), (24, 99.3262, 72.6822), (12, 81.604, 68.1418), 
             (18, 94.3174, 72.4802), (30, 92.6211, 76.5908), (45, 87.9457, 77.9525)]    # hour, testMSE, validateMSE
bestHours.sort(key = lambda x:x[0])     # order by hour so the line plot runs left to right

bestEpochs36 = [(10, 67.2062, 62.3418), (5, 74.1717, 66.4878), (15, 77.8016, 64.5557), (30, 148.8478, 99.5409), 
                (3, 108.7818, 99.7956), (4, 67.6975, 61.946), (6, 66.4358, 61.5656), (7, 70.8634, 59.2995), (8, 71.261, 64.6968), 
                (9, 73.3233, 63.315), (11, 77.3612, 67.0248), (12, 80.6395, 78.4578), (13, 74.4024, 70.7845), 
                (14, 85.5936, 71.134), (20, 87.708, 72.1959), (25, 86.8419, 82.2874)]   # epoch, testMSE, validateMSE
bestEpochs36.sort(key = lambda x:x[0])  # order by epoch count

bestEpochs48 = [(3, 66.25, 62.099), (5, 70.0581, 59.1067), (8, 68.3578, 59.3903), (10, 74.559, 69.4547), 
                (12, 79.9994, 69.8428), (15, 86.8468, 68.2357), (20, 106.8292, 71.1984), (25, 93.9291, 79.7969), 
                (30, 94.9882, 82.1036), (4, 69.0089, 61.9953), (6, 73.2355, 63.6687), (7, 69.012, 61.0523), (9, 79.0518, 65.1099), 
                (11, 75.885, 65.9066), (13, 73.6373, 63.8709), (14, 102.5217, 84.5292)]   # epoch, testMSE, validateMSE
bestEpochs48.sort(key = lambda x:x[0])  # order by epoch count

# graphing hours: validation MSE against the number of past hours fed to the model
fig1, ax1 = plt.subplots()
x = [tup[0] for tup in bestHours]       # past hours of data
y = [tup[2] for tup in bestHours]       # validation mse
ax1.plot(x, y)
ax1.scatter(x, y)
ax1.grid()
ax1.set_title("Past Hours of Data and the MSE of an LSTM model with 20 Epochs", fontsize=20)
ax1.set_xlabel("Past Hours of Data", fontsize=16)
ax1.set_ylabel("Validation MSE", fontsize=16)

# graphing epochs: one line per input window length (36 h and 48 h)
fig2, ax2 = plt.subplots()
for results, label, color in zip([bestEpochs36, bestEpochs48], ["36 Hour", "48 Hour"], ["red", "blue"]):
    # NOTE(review): `color` is unpacked but never passed to plot/scatter, so
    # the default colour cycle is used -- confirm whether that is intended.
    x = [tup[0] for tup in results]     # number of epochs
    y = [tup[2] for tup in results]     # validation mse
    ax2.plot(x, y, label=label)
    ax2.scatter(x, y)
ax2.legend(loc="upper right")
ax2.grid()
ax2.set_title("Epoch tuning and MSE on Validation Set", fontsize=20)
ax2.set_xlabel("Epochs", fontsize=16)
ax2.set_ylabel("Validation MSE", fontsize=16)

plt.show()

In [None]:
import json

# Epoch-validation cell: for the selected epoch counts of the 36-hour model,
# reload the saved model, train it further on each of four train/test splits
# (each at two window offsets), and store the resulting MSEs as JSON.

# choose a number of time steps
n_steps_in = 36
# n_steps_in = 48
n_steps_out = 24

# convert into input/output
X, y = split_sequences(scaled, n_steps_in, n_steps_out)
print ("X.shape" , X.shape)                             # [rows, time lags backward, features]
print ("y.shape" , y.shape)                             # [rows, future time values]


n_tests = [8760, 6552, 4344, 2280]                      # for different validation windows throughout the year
epochList36 = [10, 5, 15, 30, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 20, 25]
epochList48 = [3, 5, 8, 10, 12, 15, 20, 25, 30, 4, 6, 7, 9, 11, 13, 14]
epoch48ValidationsIndicies = [13, 4]    # positions in epochList48 (11 and 12 epochs); not used by the active code below
epoch36ValidationsIndicies = [6, 0]     # positions in epochList36 (6 and 10 epochs)
# 48 - 11, 12 epochs best
# 36 - 6, 10 epochs best

epochsToUse = [epochList36[i] for i in epoch36ValidationsIndicies]
for n_epochs, folderIndex in zip(epochsToUse, epoch36ValidationsIndicies):
    # Hard-coded, machine-specific folder; the folder suffix is the list position + 1.
    path = "C:\\Users\\natha\\Desktop\\Undergrad\\Spring2022\\MTH 596 PIC Math\\Project - Group 2\\Project\\Forecasting\\36 hour epoch tuning\\keras_tuner_attempt"
    path += str(folderIndex+1)
    modelPath = path + "\\model"
    print(path)
    print("Number epochs:", n_epochs)

    testMSE = [[0, 0, 0], [1, 0, 0], [2, 0, 0], [3, 0, 0]]      # n_test period, first 96 hour prediction mse, shifted 480 hours forward 96 hour pred
    for n in range(len(n_tests)):               
        for s in range(2):
            # getting data for validation test (s = 1 moves the split 480 samples later)
            train_X, train_y, test_X, test_y = sliding_window(X, y, n_tests[n], 480 * s)

            # fitting model and predicting
            # The model is reloaded from disk on every iteration, so each fit
            # starts from the saved state rather than from the previous fit.
            model = keras.models.load_model(modelPath)
            # NOTE(review): both validation_data and validation_split are
            # passed; per the Keras fit() documentation validation_data takes
            # precedence -- confirm which one was intended.
            model.fit(train_X, train_y, epochs=n_epochs, verbose=1, validation_data=(test_X, test_y), validation_split=0.2)
            pred_y_inv, test_y_inv = predict(model, test_X, test_y)
            # Average the per-sample MSE over the first 96 test samples only.
            totalMSE = 0
            numHoursForward = 96
            for k in range(numHoursForward):
                totalMSE += mean_squared_error(pred_y_inv[k], test_y_inv[k])
            # print("First {n} Avg MSE: {mse}".format(n=numHoursForward, mse=totalMSE/numHoursForward))

            # column 1 holds the un-shifted result, column 2 the shifted one
            testMSE[n][s+1] = totalMSE/numHoursForward

    # saving validation results
    with open(path + "\\validation results.json", "w") as f:
        json.dump(testMSE, f)


In [None]:
def combineValidationResults(results):
    """Print the mean of every MSE value stored in a validation-results table.

    Args:
        results: iterable of rows shaped ``[period_index, mse, mse, ...]``.
            The first entry of each row is a label and is skipped; every
            remaining entry is included in the average.

    Raises:
        ValueError: if ``results`` contains no MSE values at all.
    """
    # Count the values actually present instead of assuming 4 rows x 2 values,
    # and avoid shadowing the builtin `sum`.
    total = 0
    count = 0
    for row in results:
        for mse in row[1:]:
            total += mse
            count += 1
    if count == 0:
        raise ValueError("results contains no MSE values to average")
    print("MSE = ", total / count)

# Hard-coded validation results, one table per (epoch count, input hours)
# model.  Each row is [n_test period index, MSE of the first 96 samples,
# MSE of the first 96 samples with the split shifted 480 samples later].
epoch12Hour48Results = [[0, 13.265085957764384, 5.664549392861499], [1, 322.14254964814444, 89.00012063018733], 
                        [2, 30.94971330178527, 15.442300796366476], [3, 23.369238580467254, 21.40317229597237]]
epoch11Hour48Results = [[0, 18.972753015342423, 5.010667427249979], [1, 275.55779577332555, 133.88350945889738], 
                        [2, 33.828710449148744, 12.727969694411149], [3, 15.03309853350242, 24.127845879349817]]
epoch10Hour36Results = [[0, 15.57648872532529, 7.567939042658903], [1, 292.8620620735798, 49.715468543092896], 
                        [2, 37.71236929791596, 13.61932757675911], [3, 36.1392324544318, 35.98589133229877]]
epoch6Hour36Results = [[0, 13.016783481550883, 6.828164035876674], [1, 314.32799463859845, 23.358751010248636], 
                        [2, 31.608841918622772, 12.704988196358912], [3, 17.563685872196917, 21.91560674564487]]
# Print the overall mean MSE of each table for comparison.
combineValidationResults(epoch11Hour48Results)
combineValidationResults(epoch12Hour48Results)
combineValidationResults(epoch6Hour36Results)
combineValidationResults(epoch10Hour36Results)