In [None]:
from tensorflow import keras
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense, LSTM, Activation
from keras_tuner.tuners import BayesianOptimization
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from copy import deepcopy
import json

# Window sizes, in rows of the time series, used when slicing the data into samples.
n_steps_in = 36     # look-back rows fed to the model per sample
n_steps_out = 24    # target rows produced per sample

# Columns read from the CSV files; the first one is also copied into the "Target" column.
features = [
    'SWTP Total Influent Flow',
    'Fire Rainfall (in)',
    'Bingham Rainfall (in)',
    "Ozark Aquifer Depth to Water Level (ft)",
    "James Gauge Height (ft)",
    'HourlyStationPressure',
]

In [None]:
def getShiftedRainfallData(t):
    """Load "Imputed Data.csv" and lag its rainfall columns by ``t`` rows.

    Rainfall columns lose their last ``t`` rows and get a " (t-<t>)" suffix;
    every other column loses its first ``t`` rows. Row ``i`` of the result
    therefore pairs the non-rainfall values of original row ``i + t`` with the
    rainfall values of original row ``i``.

    Parameters
    ----------
    t : int
        Number of rows to lag the rainfall columns by. ``t == 0`` returns the
        data unshifted (rainfall columns are still renamed with " (t-0)").

    Returns
    -------
    pandas.DataFrame
        Frame with the same column order as the CSV and ``len(csv) - t`` rows.
    """
    df = pd.read_csv("Imputed Data.csv")
    newFeatures = []
    array2D = []
    for feat in df.columns:
        arr = np.array(df[feat])
        # NOTE(review): `find(...) > 1` only matches when "Rain" starts at
        # index 2 or later, so a column named "Rain..." would NOT be shifted.
        # Kept as-is because the notebook builds the shifted column names with
        # the same test elsewhere.
        if feat.find("Rain") > 1:
            # remove last t rows; `arr[:-0]` would be empty, so t == 0 keeps all rows
            arr = arr[:-t] if t != 0 else arr
            newFeatures.append(feat + " (t-" + str(t) + ")")
        else:
            # remove first t rows
            arr = arr[t:]
            newFeatures.append(feat)
        array2D.append(arr)
    # All columns now have equal length, so they stack into a 2-D array
    # (columns as rows) that is transposed back into row-major samples.
    newDf = pd.DataFrame(np.array(array2D).T, columns=newFeatures)
    return newDf

def split_sequences(sequences, n_steps_in, n_steps_out):
    """Cut a 2-D array into overlapping (input window, target window) samples.

    For every start row, the input is ``n_steps_in`` consecutive rows of all
    columns except the last, and the target is ``n_steps_out`` consecutive
    values of the last column. The target window begins on the *last* input
    row (index ``start + n_steps_in - 1``), so the first target value is
    time-aligned with the final input row.

    Returns ``(X, y)`` as NumPy arrays; both are empty when ``sequences`` is
    too short to hold a single sample.
    """
    total_rows = len(sequences)
    # Offset from a window's start to the exclusive end of its target slice.
    span = n_steps_in + n_steps_out - 1
    inputs, targets = [], []
    # A start row is valid while start + span <= total_rows; never more starts than rows.
    for start in range(min(total_rows, total_rows - span + 1)):
        in_stop = start + n_steps_in
        inputs.append(sequences[start:in_stop, :-1])
        targets.append(sequences[in_stop - 1:start + span, -1])
    return np.array(inputs), np.array(targets)

def sliding_window(X, y, n_test, slide):
    """Split samples into a leading train part and a trailing test part.

    The boundary sits ``n_test - slide`` samples before the end of ``X``; the
    same boundary is applied to ``y``. Returns
    ``(train_X, train_y, test_X, test_y)``.
    """
    boundary = X.shape[0] + slide - n_test
    # Index tuples select rows before / from the boundary and every entry along axis 1.
    head = (slice(None, boundary), slice(None))
    tail = (slice(boundary, None), slice(None))
    return X[head], y[head], X[tail], y[tail]

def build_model(hp):
    """Model-building callback for the Keras Tuner search.

    Builds a two-layer stacked LSTM followed by a linear dense head and
    compiles it with MSE loss and an Adam optimizer.

    Reads three module-level globals: ``n_steps_in`` and ``features`` fix the
    input shape, ``n_steps_out`` fixes the width of the output layer.

    Parameters
    ----------
    hp : keras_tuner HyperParameters
        Source of the tunable values ``units`` (20..180, step 5) and
        ``learning_rate`` (one of 1e-2, 1e-3, 1e-4, 1e-5).

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    model = Sequential()
    # Both LSTM layers request the hyperparameter under the same name 'units'.
    # NOTE(review): this presumably ties the two layer widths together; use
    # distinct names if they should be tuned independently.
    model.add(LSTM(units=hp.Int('units', min_value=20, max_value=180, step=5),
                   activation='tanh', return_sequences=True,
                   input_shape=(n_steps_in, len(features))))
    model.add(LSTM(units=hp.Int('units', min_value=20, max_value=180, step=5)))
    # One output per forecast step, matching the targets built with n_steps_out.
    model.add(Dense(n_steps_out))
    model.add(Activation('linear'))
    # `metrics` is passed as a list (the documented form) rather than a bare string.
    model.compile(loss='mse', metrics=['mse'], optimizer=keras.optimizers.Adam(
        hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4, 1e-5])))
    return model

def invNormalize(arr, minimum, maximum):
    """Map values scaled to [0, 1] back onto the [minimum, maximum] range."""
    span = maximum - minimum
    return span * arr + minimum

def predict(model, test_X, test_y, fullData = False):
    """Run the model on ``test_X`` and return predictions and targets in original units.

    The min and max of the "SWTP Total Influent Flow" column are read from
    "Imputed Data.csv" when ``fullData`` is true, otherwise from
    "Train and Test Data.csv", and used to undo the [0, 1] scaling of both the
    predictions and ``test_y``.

    Parameters
    ----------
    model : object exposing ``predict(test_X)``
    test_X : scaled model inputs
    test_y : scaled targets matching ``test_X``
    fullData : bool
        Selects which CSV supplies the min/max used for rescaling.

    Returns
    -------
    (y_pred_inv, test_y_inv) : tuple of numpy arrays
    """
    # Only read the file whose range is actually used for the inverse scaling.
    # NOTE(review): this assumes the flow column in that file has the same
    # min/max as the data the caller's scaler was fit on - confirm.
    source_csv = "Imputed Data.csv" if fullData else "Train and Test Data.csv"
    df = pd.read_csv(source_csv, usecols=["SWTP Total Influent Flow"])
    arr = np.array(df["SWTP Total Influent Flow"])
    maximum = np.max(arr)
    minimum = np.min(arr)

    # predictions and rescaling to [min, max]; the arithmetic broadcasts over whole arrays
    y_pred = model.predict(test_X)
    y_pred_inv = invNormalize(np.asarray(y_pred), minimum, maximum)
    test_y_inv = invNormalize(np.asarray(test_y), minimum, maximum)
    print("y_pred_inv:", y_pred_inv.shape)
    print("test_y_inv:", test_y_inv.shape)

    return y_pred_inv, test_y_inv

def mseForecast(y, y_pred):
    """Print and return the mean of the per-row MSE between ``y`` and ``y_pred``.

    Each row ``i`` is scored with ``mean_squared_error(y[i], y_pred[i])`` and
    the scores are averaged over ``y.shape[0]`` rows.
    """
    # TODO: restrict to a range, e.g. the first 72 days of the test set.
    # MSE scales badly the further out the forecast goes.
    # Must fix: MSE cannot be calculated over a full year; use a rolling window
    # that forecasts 3 days ahead, receives the next hour, then forecasts the
    # next 3 days one hour later.
    n_rows = y.shape[0]
    row_scores = (mean_squared_error(y[i], y_pred[i]) for i in range(n_rows))
    avgMSE = sum(row_scores) / n_rows
    print("Total Avg MSE:", avgMSE)
    return avgMSE

def getValidationData(filename, features, n_steps_in, n_steps_out=24, n_validate=8760):
    """Build the scaled validation samples from the tail of a CSV file.

    Parameters
    ----------
    filename : str
        CSV file to read.
    features : list of str
        Columns to load; must include "SWTP Total Influent Flow".
    n_steps_in : int
        Input window length passed to ``split_sequences``.
    n_steps_out : int, optional
        Target window length passed to ``split_sequences`` (default 24).
    n_validate : int, optional
        Number of trailing samples returned (default 8760, i.e. the last year
        of hourly data).

    Returns
    -------
    (validate_X, validate_y) : the last ``n_validate`` samples.
    """
    dataset = pd.read_csv(filename, usecols=features)
    # Duplicate the flow column as the last column so that past flow values
    # stay among the inputs while the copy serves as the prediction target.
    dataset["Target"] = np.array(dataset["SWTP Total Influent Flow"])

    # NOTE(review): the scaler is fit on this file alone, so its min/max may
    # differ from the scaler used on the training data - confirm this is intended.
    scaled = MinMaxScaler().fit_transform(dataset.values)

    X, y = split_sequences(scaled, n_steps_in, n_steps_out)
    # Only the trailing part is needed; the leading part is discarded.
    _, _, validate_X, validate_y = sliding_window(X, y, n_validate, 0)
    return validate_X, validate_y

def saveValidationResults(path, testMSE, validationMSE, n_epochs, hours=36):
    """Write a short text summary of one run to ``<path>/validation results.txt``.

    Parameters
    ----------
    path : str
        Existing directory that receives the file.
    testMSE, validationMSE : float
        Average MSE values; written rounded to 4 decimal places.
    n_epochs : int
        Number of training epochs, recorded as ``epochs``.
    hours : int, optional
        Input window length, recorded as ``n_steps_in`` (default 36).
    """
    txt = "n_steps_in = " + str(hours)
    txt += "\nepochs = " + str(n_epochs)
    txt += "\nAvg Test MSE: " + str(round(testMSE, 4))
    txt += "\nAvg Validation MSE: " + str(round(validationMSE, 4))
    txt += "\n\nForm:\nLSTM\nLSTM\nDense(24)\nActivation('linear')"
    # os.path.join picks the platform's separator; a hard-coded "\\" only
    # produces a path inside `path` on Windows.
    with open(os.path.join(path, "validation results.txt"), 'w') as f:
        f.write(txt)


In [None]:
# Experiment: tune one LSTM per rainfall lag and record its test / validation MSE.
shiftList = [36, 42]        # rainfall lags (rows) to evaluate
attemptList = [5, 6]        # attempt number used in output directory names, paired with shiftList
n_epochs = 5
n_steps_in = 36             # also read as a global by build_model for the input shape
results = []

# Root directory for tuner projects; each attempt lives in <root>/keras_tuner_attempt<N>.
tuner_root = os.path.normpath('C:/Users/natha/Desktop/Undergrad/Spring2022/MTH 596 PIC Math/Project - Group 2/Project/Forecasting/Shifted Rainfall')
# Relative output folder for the shifted CSVs and the results list.
os.makedirs("Shifted Rainfall", exist_ok=True)

for shift, attempt in zip(shiftList, attemptList):
    # getting shifted rainfall data and saving it for the validation step below
    dataset = getShiftedRainfallData(shift)
    filename = os.path.join("Shifted Rainfall", "Imputed Data Shifted " + str(shift) + " Hours.csv")
    dataset.to_csv(filename, index=False)

    # keep only the selected features (rainfall columns carry the " (t-<shift>)" suffix)
    newFeatures = [feat + " (t-" + str(shift) + ")" if feat.find("Rain") > 1 else feat for feat in features]
    dropFeatures = [feat for feat in dataset.columns if feat not in newFeatures]
    dataset = dataset.drop(columns=dropFeatures)
    arr = np.array(dataset["SWTP Total Influent Flow"])
    dataset["Target"] = arr
    # Hold back the tail of the data from tuning.
    # NOTE(review): 364*25 (= 9100 rows) looks like it may have been meant as
    # 365*24 (= 8760); left unchanged - confirm the intended hold-out size.
    dataset = dataset.drop(dataset.tail(364*25).index)

    # making train and test datasets
    values = dataset.values
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(values)
    X, y = split_sequences(scaled, n_steps_in, n_steps_out)
    train_X, train_y, test_X, test_y = sliding_window(X, y, 365*24, 0)

    project_title = "keras_tuner_attempt" + str(attempt)
    path = os.path.join(tuner_root, project_title)      # same directory the tuner writes to
    modelPath = os.path.join(path, "model")

    # tuning model with keras tuner
    # NOTE(review): objective='mse' is the training metric, and the test set is
    # used as validation data during the search; consider 'val_mse' and a
    # separate tuning split.
    bayesian_opt_tuner = BayesianOptimization(
        build_model,
        objective='mse',
        max_trials=3,
        executions_per_trial=1,
        directory=tuner_root,
        project_name=project_title,
        overwrite=True)
    # validation_split is not passed: Keras ignores it when validation_data is given.
    bayesian_opt_tuner.search(train_X, train_y, epochs=n_epochs,
        validation_data=(test_X, test_y),
        verbose=1)
    bayes_opt_model_best_model = bayesian_opt_tuner.get_best_models(num_models=1)
    model = bayes_opt_model_best_model[0]
    model.save(modelPath)
    # model = keras.models.load_model(modelPath)

    # getting predictions on test and validation set from created model
    pred_y_inv, test_y_inv = predict(model, test_X, test_y)
    totalAvgMSE = mseForecast(test_y_inv, pred_y_inv)

    validate_X, validate_y = getValidationData(filename, newFeatures, n_steps_in)
    y_pred_inv, y_validate_inv = predict(model, validate_X, validate_y, True)
    validationAvgMSE = mseForecast(y_validate_inv, y_pred_inv)

    saveValidationResults(path, totalAvgMSE, validationAvgMSE, n_epochs, n_steps_in)

    results.append((shift, n_epochs, round(totalAvgMSE, 4), round(validationAvgMSE, 4)))


print(results)
# json.dump writes to the file; json.dumps only returns a string, which left the file empty.
with open(os.path.join("Shifted Rainfall", "results list.json"), 'w') as f:
    json.dump(results, f)

In [None]:
# Recorded outcomes of earlier shifted-rainfall runs.
# Presumably each tuple is (shift, epochs, avg test MSE, avg validation MSE),
# matching what the experiment loop appends to `results` - confirm.
shiftResults = [
    (20, 5, 91.8418, 82.3813),
    (24, 5, 73.2098, 67.1881),
    (28, 5, 85.0717, 80.1233),
    (32, 5, 72.9596, 63.533),
    (36, 5, 71.0614, 63.7691),
    (42, 5, 75.4345, 67.1984),
]

In [None]:
# Experiment: for each input window length, tune one LSTM per epoch count on the
# (unshifted) "Train and Test Data.csv" and record test / validation MSE.
hourList = [72]
epochList = [5, 6, 7, 8]
# One attempt number per epoch count, starting at 2. The previous
# range(2, len(epochList)+1) was one element short, so zip() silently dropped
# the last epoch count.
attemptList = list(range(2, 2 + len(epochList)))
results = []

# Root directory for tuner projects; each run lives in its own sub-directory.
tuner_root = os.path.normpath('C:/Users/natha/Desktop/Undergrad/Spring2022/MTH 596 PIC Math/Project - Group 2/Project/Forecasting/Shifted Rainfall')

# NOTE: the loop variable rebinds the global n_steps_in, which build_model reads
# to size the model's input layer.
for n_steps_in in hourList:
    # Data preparation depends only on the window length, so it is done once
    # per n_steps_in instead of once per epoch count.
    dataset = pd.read_csv("Train and Test Data.csv", usecols=features)
    arr = np.array(dataset["SWTP Total Influent Flow"])
    dataset["Target"] = arr         # adding another influent flow feature so that past values can be used to predict future values
    values = dataset.values
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(values)
    X, y = split_sequences(scaled, n_steps_in, n_steps_out)
    train_X, train_y, test_X, test_y = sliding_window(X, y, 365*24, 0)

    for attempt, n_epochs in zip(attemptList, epochList):
        project_title = "keras_tuner_attempt" + str(attempt) + " " + str(n_steps_in) + " hours"
        path = os.path.join(tuner_root, project_title)      # same directory the tuner writes to
        modelPath = os.path.join(path, "model")

        # tuning model with keras tuner
        # NOTE(review): objective='mse' is the training metric, and the test set
        # is used as validation data during the search; consider 'val_mse' and a
        # separate tuning split.
        bayesian_opt_tuner = BayesianOptimization(
            build_model,
            objective='mse',
            max_trials=3,
            executions_per_trial=1,
            directory=tuner_root,
            project_name=project_title,
            overwrite=True)
        # validation_split is not passed: Keras ignores it when validation_data is given.
        bayesian_opt_tuner.search(train_X, train_y, epochs=n_epochs,
            validation_data=(test_X, test_y),
            verbose=1)
        bayes_opt_model_best_model = bayesian_opt_tuner.get_best_models(num_models=1)
        model = bayes_opt_model_best_model[0]
        model.save(modelPath)
        # model = keras.models.load_model(modelPath)

        # predictions on the test set and on the held-out validation year
        pred_y_inv, test_y_inv = predict(model, test_X, test_y)
        totalAvgMSE = mseForecast(test_y_inv, pred_y_inv)

        validate_X, validate_y = getValidationData("Imputed Data.csv", features, n_steps_in)
        y_pred_inv, y_validate_inv = predict(model, validate_X, validate_y, True)
        validationAvgMSE = mseForecast(y_validate_inv, y_pred_inv)

        saveValidationResults(path, totalAvgMSE, validationAvgMSE, n_epochs, n_steps_in)

        results.append((n_steps_in, n_epochs, round(totalAvgMSE, 4), round(validationAvgMSE, 4)))

print(results)
# json.dump writes to the file; json.dumps only returns a string, which left the file empty.
# NOTE(review): this is the same file name the shifted-rainfall cell writes to,
# so running both overwrites the earlier results.
with open(os.path.join("Shifted Rainfall", "results list.json"), 'w') as f:
    json.dump(results, f)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib qt


# Recorded runs per input window length. Index 1 of each tuple is the epoch
# count and index 3 the validation MSE (the two fields plotted below).
newFeaturesResults36 = [
    (36, 4, 117.4312, 122.5767),
    (36, 5, 56.0574, 52.9312),
    (36, 6, 58.1944, 56.2634),
    (36, 7, 58.5114, 55.2089),
    (36, 8, 55.7228, 55.9546),
]
newFeaturesResults48 = [
    (48, 4, 63.5259, 60.7421),
    (48, 5, 57.8375, 56.3667),
    (48, 6, 66.1357, 66.4636),
    (48, 7, 56.5258, 56.9665),
    (48, 8, 80.874, 82.2395),
    (48, 9, 59.1129, 62.2371),
    (48, 10, 59.7826, 56.803),
    (48, 12, 61.6223, 64.5385),
    (48, 15, 62.6599, 64.2141),
    (48, 18, 89.9026, 92.5337),
]
newFeaturesResults72 = [
    (72, 4, 61.5909, 56.5136),
    (72, 5, 55.7488, 55.6569),
    (72, 6, 59.8857, 56.9455),
    (72, 7, 63.0854, 62.7871),
]

# graphing epochs: one line (plus markers) per window length
fig, ax = plt.subplots()
series_by_label = {
    "36 Hours": newFeaturesResults36,
    # the 48-hour series is truncated to as many runs as the 36-hour series
    "48 Hours": newFeaturesResults48[:len(newFeaturesResults36)],
    "72 Hours": newFeaturesResults72,
}
for series_label, runs in series_by_label.items():
    epochs = [run[1] for run in runs]       # number of epochs
    val_mse = [run[3] for run in runs]      # validation mse
    ax.plot(epochs, val_mse, label=series_label)
    ax.scatter(epochs, val_mse)

ax.grid()
ax.legend(loc="upper right")
ax.set_title("LSTM Average MSE on Validation Set", fontsize=20)
ax.set_xlabel("Epochs", fontsize=16)
ax.set_ylabel("Average Validation MSE", fontsize=16)

plt.show()

In [None]:
# Reload a previously saved model from the feature-tuning runs and print its layer summary.
path = (
    "C:\\Users\\natha\\Desktop\\Undergrad\\Spring2022\\MTH 596 PIC Math\\Project - Group 2\\Project\\Forecasting"
    "\\feature tuning\\subset 5\\keras_tuner_attempt1\\model"
)
model = keras.models.load_model(path)
model.summary()