<a href="https://colab.research.google.com/github/oimartin/SP_500_index_RNN/blob/main/sp500_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ingest

## Load Libraries

In [1]:
!pip install yfinance
!pip install -U kaleido

Collecting yfinance
  Downloading yfinance-0.1.70-py2.py3-none-any.whl (26 kB)
Collecting lxml>=4.5.1
  Downloading lxml-4.8.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (6.4 MB)
[K     |████████████████████████████████| 6.4 MB 6.8 MB/s 
[?25hCollecting requests>=2.26
  Downloading requests-2.27.1-py2.py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 1.5 MB/s 
Installing collected packages: requests, lxml, yfinance
  Attempting uninstall: requests
    Found existing installation: requests 2.23.0
    Uninstalling requests-2.23.0:
      Successfully uninstalled requests-2.23.0
  Attempting uninstall: lxml
    Found existing installation: lxml 4.2.6
    Uninstalling lxml-4.2.6:
      Successfully uninstalled lxml-4.2.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires req

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.io as pio
import kaleido
import os

if not os.path.exists("images"):
  os.mkdir('images') # save plotly images

import matplotlib.pyplot as plt
import yfinance as yf
import datetime as dt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, max_error, mean_squared_log_error

from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

from time import time
from keras.models import Sequential
from keras import layers
from keras.losses import BinaryCrossentropy, Poisson, SparseCategoricalCrossentropy, mean_squared_error
from keras.metrics import BinaryAccuracy
from keras.callbacks import EarlyStopping

# fix random seed for reproducibility
np.random.seed(7)

## Load Data

In [4]:
data = yf.Ticker('^GSPC').history(start=dt.datetime(2015,1,1),
                           end=dt.datetime(2020,1,1)).reset_index()
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2015-01-02,2058.899902,2072.360107,2046.040039,2058.199951,2708700000,0,0
1,2015-01-05,2054.439941,2054.439941,2017.339966,2020.579956,3799120000,0,0
2,2015-01-06,2022.150024,2030.25,1992.439941,2002.609985,4460110000,0,0
3,2015-01-07,2005.550049,2029.609985,2005.550049,2025.900024,3805480000,0,0
4,2015-01-08,2030.609985,2064.080078,2030.609985,2062.139893,3934010000,0,0


# EDA

## First View

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1258 entries, 0 to 1257
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Date          1258 non-null   datetime64[ns]
 1   Open          1258 non-null   float64       
 2   High          1258 non-null   float64       
 3   Low           1258 non-null   float64       
 4   Close         1258 non-null   float64       
 5   Volume        1258 non-null   int64         
 6   Dividends     1258 non-null   int64         
 7   Stock Splits  1258 non-null   int64         
dtypes: datetime64[ns](1), float64(4), int64(3)
memory usage: 78.8 KB


In [6]:
data.describe()

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Stock Splits
count,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0
mean,2452.403505,2462.925358,2440.702211,2452.643027,3625464000.0,0.0,0.0
std,357.405072,357.889358,357.025302,357.451845,671811400.0,0.0,0.0
min,1833.400024,1847.0,1810.099976,1829.079956,1296540000.0,0.0,0.0
25%,2101.687439,2108.959961,2092.134888,2102.08252,3232422000.0,0.0,0.0
50%,2434.209961,2441.555054,2427.97998,2434.14502,3520885000.0,0.0,0.0
75%,2773.082458,2783.702515,2758.289978,2771.179993,3900050000.0,0.0,0.0
max,3247.22998,3247.929932,3234.370117,3240.02002,7609010000.0,0.0,0.0


In [7]:
display(data['Dividends'].value_counts(),
        data['Stock Splits'].value_counts())

0    1258
Name: Dividends, dtype: int64

0    1258
Name: Stock Splits, dtype: int64

In [8]:
data = data.drop(columns=['Dividends','Stock Splits'])
data.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume'], dtype='object')

In [9]:
data['Year'] = data['Date'].dt.strftime('%Y')
data['Month'] = data['Date'].dt.strftime('%m')
data['Day'] = data['Date'].dt.strftime('%d')
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Year,Month,Day
0,2015-01-02,2058.899902,2072.360107,2046.040039,2058.199951,2708700000,2015,1,2
1,2015-01-05,2054.439941,2054.439941,2017.339966,2020.579956,3799120000,2015,1,5
2,2015-01-06,2022.150024,2030.25,1992.439941,2002.609985,4460110000,2015,1,6
3,2015-01-07,2005.550049,2029.609985,2005.550049,2025.900024,3805480000,2015,1,7
4,2015-01-08,2030.609985,2064.080078,2030.609985,2062.139893,3934010000,2015,1,8


## Plots

In [10]:
fig = go.Figure()
fig.add_trace(go.Candlestick(x=data['Date'], close=data['Close'], open=data['Open'], 
                             low=data['Low'], high=data['High']))
fig.update_layout(title_text="2015-2019: Overview of S&P 500",
                  yaxis_title='Price', xaxis_title='Year')
fig.show()
fig.write_image('images/canldestick_allyears.png')

In [11]:
fig = px.box(data, y=["Open", 'High', 'Low', 'Close'], color='Year', points="all")
fig.update_layout(height=600, width=1000, yaxis_title='Price', xaxis_title='',
                  title_text='2015-2019: S&P 500 Open, High, Low, Close Prices')
fig.show()
fig.write_image('images/boxplot_years_openHighlowClose.png')

In [12]:
fig = px.box(data, y=["Open", 'High', 'Low', 'Close'], points="all")
fig.update_layout(height=600, width=1000, yaxis_title='Price', xaxis_title='')
fig.show()
fig.write_image('images/boxplot_openHighlowClose.png')

In [13]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=data['Volume']))

# Overlay both histograms
fig.update_layout(bargap=0.1, title_text="2015-2019: S&P 500 Volume",
                  yaxis_title='Count', xaxis_title='Volume')

# Reduce opacity to see both histograms
fig.show()
fig.write_image('images/combined_volume.png')

In [14]:
fig = px.scatter(data, x='Date', y='Volume', color='Year', trendline='lowess',
                 trendline_color_override='black')
fig.update_layout(title_text='2015-2019: S&P 500 Volume')
fig.show()
fig.write_image('images/scatter_year_volume_all.png')


pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.



In [15]:
fig = px.scatter(data, x='Date', y='Close', color='Year', trendline='lowess',
                 trendline_color_override='black')
fig.update_layout(title_text='2015-2019: S&P 500 Volume')
fig.show()
fig.write_image('images/scatter_year_close_all.png')

In [16]:
fig = px.scatter(data.iloc[:1006, :], x='Date', y='Close', color='Year', trendline='lowess',
                 trendline_color_override='black')
fig.update_layout(title_text='2015-2019: S&P 500 Volume')
fig.show()
fig.write_image('images/scatter_year_close_80.png')

In [17]:
fig = px.line(data.iloc[:1006, :], x='Date', y='Close', color='Year')
fig.update_layout(title_text='2015-2019: S&P 500 Volume')
fig.show()
fig.write_image('images/line_year_close_80.png')

In [18]:
fig = px.scatter(data.iloc[1006:, :], x='Date', y='Close', color='Year', trendline='lowess',
                 trendline_color_override='black', color_discrete_sequence=['#ff7f0e'])
fig.update_layout(title_text='2015-2019: S&P 500 Volume')
fig.show()
fig.write_image('images/scatter_year_close_20.png')

In [19]:
fig = px.line(data.iloc[1006:, :], x='Date', y='Close', color='Year', color_discrete_sequence=['#ff7f0e'])
fig.update_layout(title_text='2015-2019: S&P 500 Volume')
fig.show()
fig.write_image('images/line_year_close_20.png')

In [20]:
min = data['Open'].min()
max = data['Open'].max()

def hist_year(df, col, year):
  fig = go.Histogram(x=df[df['Year']== year][col],
                      xbins=dict(
                      start=min,
                      end= max,
                      size=50),
                      autobinx=False,
                     name=year)
  return fig

In [21]:
fig = make_subplots(rows=5, cols=1)
trace0 = hist_year(data, 'Open', '2015')
trace1 = hist_year(data, 'Open', '2016')
trace2 = hist_year(data, 'Open', '2017')
trace3 = hist_year(data, 'Open', '2018')
trace4 = hist_year(data, 'Open', '2019')

# adjust plots
fig.update_layout(bargap=0.1)
fig.update_xaxes(range=[min,max])
fig.update_layout(title_text="2015-2019: S&P 500 Open Price Comparison",
                  height=700,width= 900)

# add trace to subplot
fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 2, 1)
fig.append_trace(trace2, 3, 1)
fig.append_trace(trace3, 4, 1)
fig.append_trace(trace4, 5, 1)
fig.show()
fig.write_image('images/compare_open_all.png')

In [22]:
fig = px.scatter_3d(data, x='Open', y='Close', z='High',
                    color='Year')
camera = dict(
    up=dict(x=1, y=0, z=1),
    center=dict(x=-1, y=0, z=0),
    eye=dict(x=2.15, y=0.1, z=0.1)
)
fig.update_layout(title_text='2015-2019: 3D View of S&P 500\nby Open, High, and Close Prices',
                  margin=dict(l=0, r=0, b=0, t=0), scene_camera=camera,
                  height=800, width=900)
fig.show()
fig.write_image('images/3d_view_open_high_close_all.png')

In [23]:
corr = data.iloc[:, 1:6].corr()
corr_no_vol = data.iloc[:, 1:5].corr()
corr

Unnamed: 0,Open,High,Low,Close,Volume
Open,1.0,0.999541,0.99922,0.998774,-0.213469
High,0.999541,1.0,0.999025,0.999316,-0.206946
Low,0.99922,0.999025,1.0,0.999468,-0.228355
Close,0.998774,0.999316,0.999468,1.0,-0.219588
Volume,-0.213469,-0.206946,-0.228355,-0.219588,1.0


In [24]:
fig = go.Figure()
fig.add_trace(go.Heatmap(
    z=corr,
    x=corr.columns,
    y=corr.columns,
    colorscale=px.colors.diverging.RdBu
))
fig.update_layout(height=400, width=400)
fig.show()
fig.write_image('images/heatmap_corr.png')

In [25]:
fig = go.Figure()
fig.add_trace(go.Heatmap(
    z=corr_no_vol,
    x=corr_no_vol.columns,
    y=corr_no_vol.columns,
    colorscale=px.colors.diverging.RdBu
))
fig.update_layout(height=400, width=400)
fig.show()
fig.write_image('images/heatmap_corr_no_vol.png')

# Preprocessing Data

In [26]:
minmax = MinMaxScaler(feature_range=(0,1))

def select_transform(df, features, y_output):
  split_80 = int((data.shape[0]*0.8))
  print(f"Split at {split_80}")

  x_train = df.loc[:split_80-1, features].copy()
  y_train = df.loc[:split_80-1, [y_output]].copy()

  x_test = df.loc[split_80:, features].copy()
  y_test = df.loc[split_80:, [y_output]].copy()

  x_train_sc = minmax.fit_transform(x_train)
  y_train_sc = minmax.fit_transform(y_train)
  x_test_sc = minmax.fit_transform(x_test)
  y_test_sc = minmax.fit_transform(y_test)

  print(f'''X_train shape {x_train_sc.shape}
y_train shape {len(y_train_sc)}
X_test shape {x_test_sc.shape}
y_test {len(y_test_sc)}
''')

  return x_train_sc, y_train_sc, x_test_sc, y_test_sc

In [27]:
X_train, y_train, X_test, y_test = select_transform(data, ['Open', 'High','Low'], 'Close')

Split at 1006
X_train shape (1006, 3)
y_train shape 1006
X_test shape (252, 3)
y_test 252



In [28]:
# Inspired by https://machinelearningmastery.com/how-to-develop-lstm-models-for-time-series-forecasting/
# and https://towardsdatascience.com/how-to-reshape-data-and-do-regression-for-time-series-using-lstm-133dad96cd00

def lstm_data_transform(x_data, y_data, num_steps=5):
    """ Changes data to the format for LSTM training 
for sliding window approach """
    # Prepare the list for the transformed data
    X, y = list(), list()
    # Loop of the entire data set
    for i in range(x_data.shape[0]):
        # compute a new (sliding window) index
        end_ix = i + num_steps
        # if index is larger than the size of the dataset, we stop
        if end_ix >= x_data.shape[0]:
            break
        # Get a sequence of data for x
        seq_X = x_data[i:end_ix]
        # Get only the last element of the sequency for y
        seq_y = y_data[end_ix]
        # Append the list with sequencies
        X.append(seq_X)
        y.append(seq_y)
    # Make final arrays
    x_array = np.array(X)
    y_array = np.array(y)

    print(f"X_array shape {x_array.shape}\nY_array shape {y_array.shape}")
    return x_array, y_array

In [29]:
x_train_lstm, y_train_lstm = lstm_data_transform(X_train, y_train, 90)
print("-----\nTest")
x_test_lstm, y_test_lstm = lstm_data_transform(X_test, y_test, 90)

X_array shape (916, 90, 3)
Y_array shape (916, 1)
-----
Test
X_array shape (162, 90, 3)
Y_array shape (162, 1)


In [30]:
X_train, X_val, y_train, y_val = train_test_split(x_train_lstm,
                                                  y_train_lstm,
                                                  test_size=0.2,
                                                  random_state=22)
print(f'''X_train shape {X_train.shape}
y_train shape {len(y_train)}
X_val shape {X_val.shape}
y_val {len(y_val)}
''')

X_train shape (732, 90, 3)
y_train shape 732
X_val shape (184, 90, 3)
y_val 184



# RNN Models

## Model Functions

In [31]:
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Year,Month,Day
0,2015-01-02,2058.899902,2072.360107,2046.040039,2058.199951,2708700000,2015,1,2
1,2015-01-05,2054.439941,2054.439941,2017.339966,2020.579956,3799120000,2015,1,5
2,2015-01-06,2022.150024,2030.25,1992.439941,2002.609985,4460110000,2015,1,6
3,2015-01-07,2005.550049,2029.609985,2005.550049,2025.900024,3805480000,2015,1,7
4,2015-01-08,2030.609985,2064.080078,2030.609985,2062.139893,3934010000,2015,1,8


In [32]:
def run_model (model, model_name):
  callback = EarlyStopping(monitor='loss', patience=2)
  start = time()
  fitted_model = model.fit(X_train,
                           y_train,
                           callbacks=[callback],
                           epochs=75,
                           validation_data=(X_val, y_val))
  end = time()
  runtime = round((end-start), 4)
  print(f"{model_name} took {runtime} seconds or {round((runtime/60),4)} minutes to run.")
  return runtime

def compare_pred_actual(model, x, y, test_train, train=True):
  pred = minmax.inverse_transform(model.predict(x))
  actual = minmax.inverse_transform(y)

  pred_lst = [round(pred[row][0],3) for row in range(0,len(pred))]
  actual_lst = [round(actual[row][0],3) for row in range(0,len(actual))]
  diff = [(actual_lst[row] - pred_lst[row]) for row in range(0, len(pred))]

  compare = pd.DataFrame(list(zip(actual_lst, pred_lst, diff)),
                              columns=[f"Actual_{test_train}", f"Pred_{test_train}", 'Diff'])
  
  if train == True:
    compare['Date'] = data['Date'][:len(x)].copy().values
  elif train == False:
    compare['Date'] = data['Date'][1006:(1006 + len(y))].copy().values

  return pred_lst, actual_lst, compare

def graph_pred(compare, test_train, model_name):
  fig = px.line(compare, x='Date', y=[f"Actual_{test_train}",f"Pred_{test_train}"])
  fig.update_layout(title_text=f"Compare {model_name} {test_train} S&P 500 Close Price Predictions",
                    height=500, width=900)
  fig.show()
  fig.write_image(f"images/line_{model_name}_{test_train}_pred_comparison.png")

  return

def evaluating_model (y_true, y_pred, model_name, train_test, runtime):
  df=pd.DataFrame({ 'model_name': [model_name, model_name, 
                                   model_name, model_name,
                                   model_name, model_name,
                                   model_name, model_name],
                   'type': [train_test, train_test,
                            train_test, train_test,
                            train_test, train_test,
                            train_test, train_test],
                   'metric_name': ["MSE","RMSE",'MAE',
                                   'R_SQR', 'EXV', 'ME',
                                   'RMSLE', 'runtime'],
                   'metric' : [round(mean_squared_error(y_true, y_pred).numpy(),4),
                               round(mean_squared_error(y_true, y_pred).numpy()**0.5,4),
                               round(mean_absolute_error(y_true, y_pred), 4),
                               round(r2_score(y_true, y_pred), 4),
                               round(explained_variance_score(y_true, y_pred), 4),
                               round(max_error(y_true, y_pred), 4),
                               round(mean_squared_log_error(y_true, y_pred)**0.5, 4),
                               round((runtime/60),4)]})
  return df


## Model 1

### Run

In [33]:
rnn1 = Sequential([
    layers.SimpleRNN(20, return_sequences=True, input_shape=[90, 3]),
    layers.SimpleRNN(20, return_sequences=True),
    layers.SimpleRNN(1)
])

rnn1.compile(optimizer="adam", loss="mse", metrics=['mean_squared_error'])

In [34]:
rnn1_rt = run_model(rnn1, 'rnn1')

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
rnn1 took 10.787 seconds or 0.1798 minutes to run.


### Expectations of input/output

In [35]:
[print(i.shape, i.dtype) for i in rnn1.inputs]
[print(o.shape, o.dtype) for o in rnn1.outputs]
[print(l.name, l.input_shape, l.dtype) for l in rnn1.layers]

(None, 90, 3) <dtype: 'float32'>
(None, 1) <dtype: 'float32'>
simple_rnn (None, 90, 3) float32
simple_rnn_1 (None, 90, 20) float32
simple_rnn_2 (None, 90, 20) float32


[None, None, None]

### Evaluation of Train Predictions

In [36]:
rnn1_pred_train, rnn1_actual_train, rnn1_compare_train = compare_pred_actual(rnn1, X_train, y_train, 'Train', True)
rnn1_compare_train

Unnamed: 0,Actual_Train,Pred_Train,Diff,Date
0,3108.409,3068.996094,39.412906,2015-01-02
1,3054.425,3039.805908,14.619092,2015-01-05
2,3085.832,3087.399902,-1.567902,2015-01-06
3,2966.919,2974.920898,-8.001898,2015-01-07
4,3090.685,3091.562012,-0.877012,2015-01-08
...,...,...,...,...
727,2670.860,2698.666992,-27.806992,2017-11-20
728,2688.462,2704.165039,-15.703039,2017-11-21
729,3165.579,3116.197998,49.381002,2017-11-22
730,2630.882,2640.268066,-9.386066,2017-11-24


In [37]:
rnn1_compare_train.describe()

Unnamed: 0,Actual_Train,Pred_Train,Diff
count,732.0,732.0,732.0
mean,2835.12727,2839.991211,-4.865268
std,215.281792,198.099976,35.432154
min,2447.89,2449.256104,-159.240074
25%,2640.37725,2661.798767,-22.090227
50%,2828.7045,2846.958008,-9.302494
75%,3040.1005,3051.582092,9.757707
max,3239.243,3146.028076,97.239094


In [38]:
graph_pred(rnn1_compare_train, 'Train', 'rnn1')

In [39]:
rnn1_train = evaluating_model(rnn1_actual_train, rnn1_pred_train, 'rnn1', 'train', rnn1_rt)
rnn1_train

Unnamed: 0,model_name,type,metric_name,metric
0,rnn1,train,MSE,1277.393066
1,rnn1,train,RMSE,35.7406
2,rnn1,train,MAE,26.4538
3,rnn1,train,R_SQR,0.9724
4,rnn1,train,EXV,0.9729
5,rnn1,train,ME,159.2401
6,rnn1,train,RMSLE,0.0125
7,rnn1,train,runtime,0.1798


### Evaluate Test Predictions

In [40]:
rnn1_pred_test, rnn1_actual_test, rnn1_compare_test = compare_pred_actual(rnn1,x_test_lstm, y_test_lstm, 'Test', train=False)
rnn1_compare_test.head()

Unnamed: 0,Actual_Test,Pred_Test,Diff,Date
0,2811.87,2897.879883,-86.009883,2019-01-02
1,2834.41,2902.88501,-68.47501,2019-01-03
2,2850.96,2860.110107,-9.150107,2019-01-04
3,2876.32,2872.335938,3.984063,2019-01-07
4,2859.53,2880.413086,-20.883086,2019-01-08


In [41]:
rnn1_test = evaluating_model(rnn1_actual_test, rnn1_pred_test, 'rnn1', 'test', rnn1_rt)
rnn1_test

Unnamed: 0,model_name,type,metric_name,metric
0,rnn1,test,MSE,2020.026245
1,rnn1,test,RMSE,44.9447
2,rnn1,test,MAE,35.3208
3,rnn1,test,R_SQR,0.835
4,rnn1,test,EXV,0.8398
5,rnn1,test,ME,147.06
6,rnn1,test,RMSLE,0.015
7,rnn1,test,runtime,0.1798


In [42]:
graph_pred(rnn1_compare_test, 'Test', 'rnn1')

## Model 2

### Run

In [43]:
rnn2 = Sequential([
    layers.LSTM(3, return_sequences=True, input_shape=[90, 3]),
    layers.LSTM(3, return_sequences=True),
    layers.LSTM(3, return_sequences=True),
    layers.LSTM(3, return_sequences=True),
    layers.SimpleRNN(1)
])

rnn2.compile(optimizer="adam", loss="mse", metrics=['mean_squared_error'])

In [44]:
rnn2_rt = run_model(rnn2, 'rnn2')

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
rnn2 took 80.4488 seconds or 1.3408 minutes to run.


### Expectations of input/output

In [45]:
[print(i.shape, i.dtype) for i in rnn2.inputs]
[print(o.shape, o.dtype) for o in rnn2.outputs]
[print(l.name, l.input_shape, l.dtype) for l in rnn2.layers]

(None, 90, 3) <dtype: 'float32'>
(None, 1) <dtype: 'float32'>
lstm (None, 90, 3) float32
lstm_1 (None, 90, 3) float32
lstm_2 (None, 90, 3) float32
lstm_3 (None, 90, 3) float32
simple_rnn_3 (None, 90, 3) float32


[None, None, None, None, None]

### Evaluation of Train Predictions

In [46]:
rnn2_pred_train, rnn2_actual_train, rnn2_compare_train = compare_pred_actual(rnn2, X_train, y_train, 'Train')
rnn2_compare_train.head()

Unnamed: 0,Actual_Train,Pred_Train,Diff,Date
0,3108.409,3062.322021,46.086979,2015-01-02
1,3054.425,3071.916016,-17.491016,2015-01-05
2,3085.832,3104.041992,-18.209992,2015-01-06
3,2966.919,2950.228027,16.690973,2015-01-07
4,3090.685,3102.254883,-11.569883,2015-01-08


In [47]:
rnn2_compare_train.describe()

Unnamed: 0,Actual_Train,Pred_Train,Diff
count,732.0,732.0,732.0
mean,2835.12727,2819.145752,15.980494
std,215.281792,220.383896,46.922829
min,2447.89,2493.666016,-264.676109
25%,2640.37725,2613.28656,-7.786305
50%,2828.7045,2828.170532,18.932487
75%,3040.1005,3072.979553,46.155293
max,3239.243,3139.495117,112.496066


In [48]:
rnn2_train = evaluating_model(rnn2_actual_train, rnn2_pred_train, 'rnn2', 'train', rnn2_rt)
rnn2_train

Unnamed: 0,model_name,type,metric_name,metric
0,rnn2,train,MSE,2454.119873
1,rnn2,train,RMSE,49.5391
2,rnn2,train,MAE,38.638
3,rnn2,train,R_SQR,0.947
4,rnn2,train,EXV,0.9525
5,rnn2,train,ME,264.6761
6,rnn2,train,RMSLE,0.0177
7,rnn2,train,runtime,1.3408


In [49]:
graph_pred(rnn2_compare_train, 'Train', 'rnn2')

### Evaluate Test Predictions

In [50]:
rnn2_pred_test, rnn2_actual_test, rnn2_compare_test = compare_pred_actual(rnn2, x_test_lstm, y_test_lstm, 'Test', False)
rnn2_compare_test.head()

Unnamed: 0,Actual_Test,Pred_Test,Diff,Date
0,2811.87,2910.909912,-99.039912,2019-01-02
1,2834.41,2912.697998,-78.287998,2019-01-03
2,2850.96,2913.562012,-62.602012,2019-01-04
3,2876.32,2913.461914,-37.141914,2019-01-07
4,2859.53,2912.447021,-52.917021,2019-01-08


In [51]:
rnn2_test = evaluating_model(rnn2_actual_test, rnn2_pred_test, 'rnn2', 'test', rnn2_rt)
rnn2_test

Unnamed: 0,model_name,type,metric_name,metric
0,rnn2,test,MSE,5548.334961
1,rnn2,test,RMSE,74.4871
2,rnn2,test,MAE,66.6802
3,rnn2,test,R_SQR,0.5468
4,rnn2,test,EXV,0.5803
5,rnn2,test,ME,155.6271
6,rnn2,test,RMSLE,0.0251
7,rnn2,test,runtime,1.3408


In [52]:
graph_pred(rnn2_compare_test, 'Test', 'rnn2')

## Model 3

### Run

In [53]:
rnn3 = Sequential([
    layers.LSTM(50, return_sequences=True, activation='relu', input_shape=(90, 3)),
    layers.LSTM(50, activation='relu'),
    layers.Dense(1),
])

rnn3.compile(optimizer="adam", loss="mse", metrics=['mean_squared_error'])

In [54]:
rnn3_rt = run_model(rnn3, 'rnn3')

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
rnn3 took 31.7751 seconds or 0.5296 minutes to run.


### Expectations of input/output

In [55]:
[print(i.shape, i.dtype) for i in rnn3.inputs]
[print(o.shape, o.dtype) for o in rnn3.outputs]
[print(l.name, l.input_shape, l.dtype) for l in rnn3.layers]

(None, 90, 3) <dtype: 'float32'>
(None, 1) <dtype: 'float32'>
lstm_4 (None, 90, 3) float32
lstm_5 (None, 90, 50) float32
dense (None, 50) float32


[None, None, None]

### Evaluation of Train Predictions

In [56]:
rnn3_pred_train, rnn3_actual_train, rnn3_compare_train = compare_pred_actual(rnn3, X_train, y_train, 'Train')
rnn3_compare_train.head()

Unnamed: 0,Actual_Train,Pred_Train,Diff,Date
0,3108.409,3081.445068,26.963932,2015-01-02
1,3054.425,3051.543945,2.881055,2015-01-05
2,3085.832,3122.049072,-36.217072,2015-01-06
3,2966.919,2969.454102,-2.535102,2015-01-07
4,3090.685,3130.662109,-39.977109,2015-01-08


In [57]:
rnn3_compare_train.describe()

Unnamed: 0,Actual_Train,Pred_Train,Diff
count,732.0,732.0,732.0
mean,2835.12727,2841.356689,-6.229657
std,215.281792,217.212845,28.63417
min,2447.89,2489.610107,-173.878014
25%,2640.37725,2641.453796,-15.792892
50%,2828.7045,2839.834961,-3.564604
75%,3040.1005,3061.255981,10.674807
max,3239.243,3240.893066,70.36607


In [58]:
rnn3_train = evaluating_model(rnn3_actual_train, rnn3_pred_train, 'rnn3', 'train', rnn3_rt)
rnn3_train

Unnamed: 0,model_name,type,metric_name,metric
0,rnn3,train,MSE,857.604126
1,rnn3,train,RMSE,29.2849
2,rnn3,train,MAE,19.8208
3,rnn3,train,R_SQR,0.9815
4,rnn3,train,EXV,0.9823
5,rnn3,train,ME,173.878
6,rnn3,train,RMSLE,0.0103
7,rnn3,train,runtime,0.5296


In [59]:
graph_pred(rnn3_compare_train, 'Train', 'rnn3')

### Evaluate Test Predictions

In [60]:
rnn3_pred_test, rnn3_actual_test, rnn3_compare_test = compare_pred_actual(rnn3, x_test_lstm, y_test_lstm, 'Test', False)
rnn3_compare_test.head()

Unnamed: 0,Actual_Test,Pred_Test,Diff,Date
0,2811.87,2911.831055,-99.961055,2019-01-02
1,2834.41,2900.125977,-65.715977,2019-01-03
2,2850.96,2887.955078,-36.995078,2019-01-04
3,2876.32,2876.285889,0.034111,2019-01-07
4,2859.53,2868.486084,-8.956084,2019-01-08


In [61]:
rnn3_test = evaluating_model(rnn3_actual_test, rnn3_pred_test, 'rnn3', 'test', rnn3_rt)
rnn3_test

Unnamed: 0,model_name,type,metric_name,metric
0,rnn3,test,MSE,1851.622559
1,rnn3,test,RMSE,43.0305
2,rnn3,test,MAE,34.8836
3,rnn3,test,R_SQR,0.8488
4,rnn3,test,EXV,0.8553
5,rnn3,test,ME,159.7761
6,rnn3,test,RMSLE,0.0147
7,rnn3,test,runtime,0.5296


In [62]:
graph_pred(rnn3_compare_test, 'Test', 'rnn3')

# Model Comparisons

In [63]:
model_compare = pd.concat([rnn1_train, rnn1_test,
                           rnn2_train, rnn2_test,
                           rnn3_train, rnn3_test])

In [64]:
model_compare.head(8)

Unnamed: 0,model_name,type,metric_name,metric
0,rnn1,train,MSE,1277.393066
1,rnn1,train,RMSE,35.7406
2,rnn1,train,MAE,26.4538
3,rnn1,train,R_SQR,0.9724
4,rnn1,train,EXV,0.9729
5,rnn1,train,ME,159.2401
6,rnn1,train,RMSLE,0.0125
7,rnn1,train,runtime,0.1798


In [65]:
def model_comparisons (metric):
  grouped_df = model_compare[model_compare['metric_name']== metric]

  fig = px.bar(grouped_df, x='model_name', color='type',  barmode='group',
              y='metric', text_auto='.2s')
  fig.update_layout(height=600, width=1000, yaxis_title='', xaxis_title='',
                    title_text=f"RNN Models by {metric}")
  fig.show()
  fig.write_image(f"images/rnn_models_{metric}.png")

  return

In [66]:
model_comparisons('RMSE')

In [67]:
model_comparisons('MAE') 

In [68]:
model_comparisons('runtime') 

In [69]:
def min_max(compare_1, compare_2, compare_3):
  min_close = list(compare_1.min()[:2]) + list(compare_2.min()[:2]) +list(compare_3.min()[:2])
  min_close.sort(reverse=True)
  min = min_close[0]
  
  max_close = list(compare_1.max()[:2]) + list(compare_2.max()[:2]) +list(compare_3.max()[:2])
  max_close.sort(reverse=True)
  max = max_close[0]

  return min, max

def line_test(df, pred=True):
  if pred==True:
    fig = go.Scatter(x=df['Date'], y='Pred_Test')
  elif pred==False:
    fig = go.Scatter(x=df['Date'], y='Actual_Test')
  return fig

In [70]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=rnn3_compare_test['Date'],
                         y=rnn3_compare_test['Actual_Test'],
                         mode='lines+markers', name='Actual Close',
                         line_color='#000000'))
fig.add_trace(go.Scatter(x=rnn1_compare_test['Date'],
                         y=rnn1_compare_test['Pred_Test'],
                         mode='lines+markers', name='rnn1'))
fig.add_trace(go.Scatter(x=rnn2_compare_test['Date'],
                         y=rnn2_compare_test['Pred_Test'],
                         mode='lines+markers', name='rnn2'))
fig.add_trace(go.Scatter(x=rnn3_compare_test['Date'],
                         y=rnn3_compare_test['Pred_Test'],
                         mode='lines+markers', name='rnn3'))


fig.update_layout(title_text="Compare RNN Model Test Predictions",
                  yaxis_title='Close Price', xaxis_title='Year',
                  height = 500, width = 800)
fig.show()
fig.write_image('images/compare_model_test_predictions.png')