In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
buildings_list = ['Energy_Innovation_Center_Data.csv']

in_path = './clean_data/'

# Columns selected from every building frame. present_co2_tons is selected
# together with the others but is never modelled (see skip_columns).
y_columns = ["present_elec_kwh", "present_htwt_mmbtu", "present_wtr_usgal", "present_chll_tonhr", "present_co2_tons"]
header = ["ts"] + y_columns
skip_columns = {"present_co2_tons"}

# Experiment settings.
epochs = [5, 10, 15, 50, 100]  # epoch budgets; one model is trained per budget
window_size = 7                # number of consecutive past samples fed to the model
train_fraction = 0.8           # leading share of each series used for training
# Minimum number of non-null values a target needs before it is modelled
# (presumably one year of hourly readings - TODO confirm the sampling rate).
min_observations = 365 * 24
run_eagerly = True             # debugging aid; set to False for prod


def create_dataset(dataset, window_size):
    """Turn a single-column series into sliding-window samples.

    Parameters
    ----------
    dataset : np.ndarray
        2-D array whose first column holds the series values.
    window_size : int
        Number of consecutive values used as the input of one sample.

    Returns
    -------
    (X, y) : tuple of np.ndarray
        X has shape (len(dataset) - window_size, window_size); y holds the
        value that immediately follows each window. Both are empty when the
        series is not longer than the window.
    """
    X, y = [], []
    for i in range(window_size, len(dataset)):
        X.append(dataset[i - window_size:i, 0])
        y.append(dataset[i, 0])
    # The reshape keeps X two-dimensional even when no window fits.
    return np.array(X).reshape(-1, window_size), np.array(y)


def build_lstm_model(window_size):
    """Build and compile the forecasting network.

    Three stacked 50-unit LSTM layers, each followed by 20% dropout, feed a
    single-unit dense output. The model is compiled with the Adam optimizer
    and mean-squared-error loss.
    """
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=(window_size, 1)))
    model.add(Dropout(0.2))
    model.add(LSTM(units=50, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(units=50))
    model.add(Dropout(0.2))
    model.add(Dense(units=1))
    model.compile(optimizer='adam', loss='mean_squared_error', run_eagerly=run_eagerly)
    return model


# Results of every run, keyed by (building name, target column, rmse) and
# holding (y_test, y_train, predictions, rmse, epoch). Created once, before the
# file loop, so results from every file in buildings_list are kept.
models = {}

for building in buildings_list:
    df = pd.read_csv(in_path + building)

    # Parse timestamps, keep one row per (building, timestamp), order by time.
    df["ts"] = pd.to_datetime(df["ts"])
    df = df.drop_duplicates(subset=["bldgname", "ts"])
    df = df.sort_values(["bldgname", "ts"])

    print(building)

    for bldgname, group in df.groupby("bldgname"):
        group_data = group[header]

        for y in y_columns:
            if y in skip_columns or group_data[y].count() < min_observations:
                continue

            # Build a fresh two-column frame for this target. Renaming the
            # shared frame in place would leave several columns called "y"
            # once a second target is processed, mixing different meters
            # into one series.
            model_data = (
                group_data[["ts", y]]
                .rename(columns={"ts": "ds", y: "y"})
                .sort_values("ds")
            )

            # Fill gaps linearly, including leading and trailing ones.
            model_data["y"] = model_data["y"].interpolate(method='linear', limit_direction='both')

            # Chronological split: the first train_fraction of the rows train
            # the model, the remainder is held out.
            values = model_data["y"].to_numpy().reshape(-1, 1)
            train_size = int(len(values) * train_fraction)

            # Fit the scaler on the training rows only so the held-out range
            # does not leak into the normalisation; held-out values may
            # therefore fall outside [0, 1].
            scaler = MinMaxScaler(feature_range=(0, 1))
            train_data = scaler.fit_transform(values[:train_size])
            test_data = scaler.transform(values[train_size:])

            X_train, y_train_scaled = create_dataset(train_data, window_size)
            X_test, y_test_scaled = create_dataset(test_data, window_size)

            # The LSTM input is (samples, timesteps, features); add the single
            # feature axis explicitly to both splits.
            X_train = X_train.reshape(X_train.shape[0], window_size, 1)
            X_test = X_test.reshape(X_test.shape[0], window_size, 1)

            # Targets in original units, for the RMSE and for plotting.
            y_train = scaler.inverse_transform(y_train_scaled.reshape(-1, 1))
            y_test = scaler.inverse_transform(y_test_scaled.reshape(-1, 1))

            # Everything above is independent of the epoch budget, so only
            # model construction, training and evaluation are repeated.
            for epoch in epochs:
                model = build_lstm_model(window_size)

                # Stopping watches the training loss, so the held-out split
                # passed as validation_data only adds per-epoch val_loss
                # logging and does not influence when training stops.
                early_stop = EarlyStopping(monitor='loss', patience=10, verbose=1, mode='min')

                model.fit(
                    X_train,
                    y_train_scaled,
                    epochs=epoch,
                    batch_size=32,
                    validation_data=(X_test, y_test_scaled),
                    callbacks=[early_stop],
                )

                # Predict the held-out windows and map back to original units.
                predictions = scaler.inverse_transform(model.predict(X_test))

                rmse = np.sqrt(mean_squared_error(y_test, predictions))

                models[(bldgname, y, rmse)] = (y_test, y_train, predictions, rmse, epoch)

In [None]:
# Plot the held-out actual values against the predictions of every stored run.
# Each key of `models` is (building name, target column, rmse); each value is
# (y_test, y_train, predictions, rmse, epoch). The rmse in the key duplicates
# the one in the value, so only the latter is used here.
for (bldgname, y, _key_rmse), (y_test, y_train, predictions, rmse, epoch) in models.items():
    print(f'Root Mean Squared Error: {rmse}, Epoch: {epoch}')
    fig, ax = plt.subplots()

    # Both series are drawn against their position in the held-out split.
    ax.plot(y_test, label='Actual Values')
    ax.plot(predictions, label='Predicted Values')

    ax.set_title(bldgname + '_' + y + ' Consumption')
    ax.set_xlabel('Test sample index')
    ax.set_ylabel(y)
    ax.legend()

    plt.show()
    # Close this figure explicitly so figures do not accumulate across runs.
    plt.close(fig)