# LSTM Forecast Test

This notebook generates LSTM models and applies them to forecast future events.

## Initialize Environment

In [None]:
import os
import sys
# Put the ml_lstm app's Python sources on the module search path so that the
# padogrid.bundle imports that follow can be resolved.
PADOGRID_WORKSPACE = os.environ["PADOGRID_WORKSPACE"]
sys.path.append(f"{PADOGRID_WORKSPACE}/apps/ml_lstm/src/main/python")

from padogrid.bundle.hazelcast.dna.hazelcast_lstm_dna import HazelcastLstmDna
from padogrid.bundle.hazelcast.data.PortableFactoryImpl import PortableFactoryImpl
import hazelcast
from matplotlib import pyplot
import pandas as pd
from sklearn.metrics import r2_score

def plot_forecasts(train_time_list, train_data_list, time_list, test_data_list, forecasts, time_type="date", time_delta="1 day"):
    '''
    Draws the train data, the test data and every forecast on a single figure.

    The train data is drawn in blue, the test data in black and each forecast
    in red. The first forecast starts at the last train point; every following
    forecast starts at the test point that precedes it. The figure title is
    built from the module-level `feature` variable.

    Args:
        train_time_list: Time values (x-axis) of the train data.
        train_data_list: Train data values (y-axis). The last element is
            prepended to `test_data_list` with list concatenation.
        time_list: Time values of the test data.
        test_data_list: Test data values.
        forecasts: Sequence of forecasts. Each forecast is concatenated to a
            one-element list, so it is presumably a list of values.
        time_type: "date" advances forecast times by `time_delta`; any other
            value advances them to the first day of the next calendar month.
        time_delta: Step handed to pd.Timedelta when `time_type` is "date".
    '''

    def advance(current):
        # One forecast step forward in time.
        if time_type == "date":
            return current + pd.Timedelta(time_delta)
        # Month 12 rolls over to January of the following year.
        if current.month % 12 == 0:
            return pd.Timestamp(current.year + 1, 1, 1)
        return pd.Timestamp(current.year, current.month + 1, 1)

    # Figure, title and axis labels
    figure, axes = pyplot.subplots(1, figsize=(14, 8))
    figure.suptitle(feature + " LSTM Test Data Validation")
    axes.set(xlabel='Time', ylabel='Value')

    # Train data: blue
    axes.plot(train_time_list, train_data_list, color='blue', label="Train")

    # Test data: black. The last train point is prepended so that the test
    # line starts where the train line ends.
    last_train_time = pd.to_datetime(train_time_list[-1:])
    axes.plot(last_train_time.append(time_list),
              train_data_list[-1:] + test_data_list,
              color='black', label="Test")

    # Forecasts: red. The first one is anchored at the last train point.
    start_time, start_value = train_time_list[-1], train_data_list[-1]
    for index, forecast in enumerate(forecasts):
        # x values: the anchor followed by one future time per forecast value
        forecast_times = [start_time]
        for _ in range(len(forecast)):
            forecast_times.append(advance(forecast_times[-1]))
        # y values: the anchor followed by the forecasted values
        axes.plot(forecast_times, [start_value] + forecast, color='red')
        # The next forecast is anchored at the current test point.
        start_time, start_value = time_list[index], test_data_list[index]

    axes.legend(["Train", "Test", "Forecast"], loc="upper left")
    figure.canvas.draw()
    figure.canvas.flush_events()

## User Inputs

Place your inputs here.

In [None]:
# grid_path
#   Hazelcast map holding the data to train and test with.
grid_path = "stocks"

# feature
#   Numerical attribute to forecast. One of: stock1-jitter, stock1-no-jitter,
#   stock1-jitter-large, stock2-jitter, stock2-no-jitter
feature = "stock1-jitter"

# is_generate
#   True  - generate a new model
#   False - use the existing model
is_generate = False

# test_data_percentage
#   Share of the dataset set aside as test data. The Forecast cell rejects
#   values that are not strictly between 0 and 1.
test_data_percentage = 0.2

# Try comparing jitter vs no-jitter. With no-jitter, the simulator generates
# unaltered equation values which should result in high accuracy of predicted
# values.
# ------
# stock1
# ------
#feature="stock1-jitter"
# R^2=0.92
#feature="stock1-jitter-large"
#feature="stock1-no-jitter"

# ------
# stock2
# ------
#feature="stock2-jitter"
#feature="stock2-no-jitter"

## Forecast

In [None]:
# Validate the test split before doing any work. An exception is raised
# instead of calling exit(): exit() is the interactive helper installed by the
# site module and is not meant for programs, whereas an exception simply stops
# the cell and matches the error handling used for the working directory below.
if test_data_percentage <= 0 or test_data_percentage >= 1:
    raise ValueError(f'ERROR: Invalid test_data_percentage {test_data_percentage}')

# A saved model is reused unless the user asked for a new one to be generated.
use_saved_model = not is_generate

# LSTM network parameters
epochs = 100
neurons = 1
batch_size = 1

is_verbose = False

# Echo the effective settings so that each run is self-describing.
print()
print("---------------------------------------")
print(f"                 map: {grid_path}")
print(f"             feature: {feature}")
print(f"            generate: {is_generate}")
print(f"              epochs: {epochs}")
print(f"             neurons: {neurons}")
print(f"          batch_size: {batch_size}")
print(f"test_data_percentage: {test_data_percentage}")
print("---------------------------------------")
print()

# Working directory where the model is saved. By default, it assumes you are running
# this program in PadoGrid.
# .get() is used so that an unset variable reaches the explicit error below
# instead of surfacing as a bare KeyError.
workspace_dir = os.environ.get('PADOGRID_WORKSPACE', "")
if workspace_dir == "":
    raise Exception("ERROR: PADOGRID_WORKSPACE environment var is not set. Aborted.")
working_dir = workspace_dir + "/apps/ml_lstm"
if not os.path.isdir(working_dir):
    raise Exception("ERROR: working_dir (" + working_dir + ") does not exist. Aborted.")

# -------------------------------
# End of User Inputs
# -------------------------------

# Open a Hazelcast client connection to the "ml_jet" cluster using two local
# member addresses.
cluster_name = "ml_jet"
client = hazelcast.HazelcastClient(
    cluster_name=cluster_name,
    cluster_members=["localhost:5701", "localhost:5702"],
    # Print every client lifecycle state change.
    lifecycle_listeners=[lambda state: print("Hazelcast Lifecycle: ", state)],
    portable_factories=PortableFactoryImpl.factories(),
)

# The Hazelcast map read by HazelcastLstmDna is expected to hold JSON objects
# that carry the specified numerical feature (attribute).
dna = HazelcastLstmDna(feature, client, working_dir=working_dir, verbose=is_verbose)

# --------------------------------------------------------------------------
# Execute locally
# --------------------------------------------------------------------------
# The model name is derived from the feature name.
model_name = f"model_{feature}"

# No where clause is supplied for this run.
where_clause = None
jresult = dna.run_lstm_local(
    grid_path,
    where_clause,
    time_attribute="time",
    use_saved_model=use_saved_model,
    model_name=model_name,
    return_train_data=True,
    time_type='date',
    value_key=feature,
    epochs=epochs,
    neurons=neurons,
    batch_size=batch_size,
    test_data_percentage=test_data_percentage,
)
# print(jresult)
# --------------------------------------------------------------------------

# Plot and report only when the run produced a result. An identity test is
# used because "!= None" relies on the result type's own comparison operator.
if jresult is not None:
    # Test-side series: expected values, predictions and their time values.
    expected_list = jresult['Expected']
    predicted_list = jresult['Predicted']
    time_list = pd.to_datetime(jresult['Time'])

    # Train-side series, present because return_train_data=True was requested.
    train_data_list = jresult['TrainData']
    train_time_list = pd.to_datetime(jresult['TrainTime'])
    test_data_list = jresult['TestData']

    # time_delta matches the time_type='date' used for the run.
    plot_forecasts(train_time_list, train_data_list, time_list, test_data_list, predicted_list, time_delta="1 day")

    if is_verbose:
        print()
        print("-------------")
        print("test_data_list")
        print("-------------")
        print(test_data_list)
        print()
        print("--------------")
        print("predicted_list")
        print("--------------")
        print(predicted_list)

    # The train RMSE is optional in the result; the test RMSE values are not.
    if 'TrainRmse' in jresult:
        train_rmse = jresult['TrainRmse']
        print('RMSE(train)=%f' % train_rmse)
    rmse = jresult['Rmse']
    normalized_rmse = jresult['NormalizedRmse']
    print()
    print(f'RMSE: {rmse}, Normalized RMSE: {normalized_rmse}')
    # Coefficient of determination or variance score: 1 is perfect prediction
    r2 = r2_score(expected_list, predicted_list)
    print('Coefficient of Determination - R^2 score: %.2f' % r2)
    print()

    ## Display and block
    pyplot.gcf().canvas.manager.set_window_title("PadoGrid LSTM Model Validation")
    pyplot.show()

#client.shutdown()
