<a href="https://colab.research.google.com/github/oimartin/SP_500_index_RNN/blob/main/sp500_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ingest

## Load Libraries

In [1]:
!pip install yfinance
!pip install -U kaleido

Collecting yfinance
  Downloading yfinance-0.1.70-py2.py3-none-any.whl (26 kB)
Collecting lxml>=4.5.1
  Downloading lxml-4.8.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (6.4 MB)
[K     |████████████████████████████████| 6.4 MB 4.2 MB/s 
Collecting requests>=2.26
  Downloading requests-2.27.1-py2.py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 484 kB/s 
Installing collected packages: requests, lxml, yfinance
  Attempting uninstall: requests
    Found existing installation: requests 2.23.0
    Uninstalling requests-2.23.0:
      Successfully uninstalled requests-2.23.0
  Attempting uninstall: lxml
    Found existing installation: lxml 4.2.6
    Uninstalling lxml-4.2.6:
      Successfully uninstalled lxml-4.2.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests~

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.io as pio
import kaleido
import os

if not os.path.exists("images"):
  os.mkdir('images') # save plotly images

import matplotlib.pyplot as plt
import yfinance as yf
import datetime as dt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, max_error, mean_squared_log_error

from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

from time import time
from keras.models import Sequential
from keras import layers
from keras.losses import BinaryCrossentropy, Poisson, SparseCategoricalCrossentropy, mean_squared_error
from keras.metrics import BinaryAccuracy
from keras.callbacks import EarlyStopping

## Load Data

In [3]:
data = yf.Ticker('^GSPC').history(start=dt.datetime(2015,1,1),
                           end=dt.datetime(2020,1,1)).reset_index()
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2015-01-02,2058.899902,2072.360107,2046.040039,2058.199951,2708700000,0,0
1,2015-01-05,2054.439941,2054.439941,2017.339966,2020.579956,3799120000,0,0
2,2015-01-06,2022.150024,2030.25,1992.439941,2002.609985,4460110000,0,0
3,2015-01-07,2005.550049,2029.609985,2005.550049,2025.900024,3805480000,0,0
4,2015-01-08,2030.609985,2064.080078,2030.609985,2062.139893,3934010000,0,0


# EDA

## First View

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1258 entries, 0 to 1257
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Date          1258 non-null   datetime64[ns]
 1   Open          1258 non-null   float64       
 2   High          1258 non-null   float64       
 3   Low           1258 non-null   float64       
 4   Close         1258 non-null   float64       
 5   Volume        1258 non-null   int64         
 6   Dividends     1258 non-null   int64         
 7   Stock Splits  1258 non-null   int64         
dtypes: datetime64[ns](1), float64(4), int64(3)
memory usage: 78.8 KB


In [5]:
data.describe()

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Stock Splits
count,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0
mean,2452.403505,2462.925358,2440.702211,2452.643027,3625464000.0,0.0,0.0
std,357.405072,357.889358,357.025302,357.451845,671811400.0,0.0,0.0
min,1833.400024,1847.0,1810.099976,1829.079956,1296540000.0,0.0,0.0
25%,2101.687439,2108.959961,2092.134888,2102.08252,3232422000.0,0.0,0.0
50%,2434.209961,2441.555054,2427.97998,2434.14502,3520885000.0,0.0,0.0
75%,2773.082458,2783.702515,2758.289978,2771.179993,3900050000.0,0.0,0.0
max,3247.22998,3247.929932,3234.370117,3240.02002,7609010000.0,0.0,0.0


In [6]:
display(data['Dividends'].value_counts(),
        data['Stock Splits'].value_counts())

0    1258
Name: Dividends, dtype: int64

0    1258
Name: Stock Splits, dtype: int64

In [7]:
data = data.drop(columns=['Dividends','Stock Splits'])
data.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume'], dtype='object')

In [8]:
data['Year'] = data['Date'].dt.strftime('%Y')
data['Month'] = data['Date'].dt.strftime('%m')
data['Day'] = data['Date'].dt.strftime('%d')
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Year,Month,Day
0,2015-01-02,2058.899902,2072.360107,2046.040039,2058.199951,2708700000,2015,1,2
1,2015-01-05,2054.439941,2054.439941,2017.339966,2020.579956,3799120000,2015,1,5
2,2015-01-06,2022.150024,2030.25,1992.439941,2002.609985,4460110000,2015,1,6
3,2015-01-07,2005.550049,2029.609985,2005.550049,2025.900024,3805480000,2015,1,7
4,2015-01-08,2030.609985,2064.080078,2030.609985,2062.139893,3934010000,2015,1,8


## Plots

In [82]:
fig = go.Figure()
fig.add_trace(go.Candlestick(x=data['Date'], close=data['Close'], open=data['Open'], 
                             low=data['Low'], high=data['High']))
fig.update_layout(title_text="2015-2019: Overview of S&P 500",
                  yaxis_title='Price', xaxis_title='Year')
fig.show()
fig.write_image('images/canldestick_allyears.png')

In [83]:
fig = px.box(data, y=["Open", 'High', 'Low', 'Close'], color='Year', points="all")
fig.update_layout(height=600, width=1000, yaxis_title='Price', xaxis_title='',
                  title_text='2015-2019: S&P 500 Open, High, Low, Close Prices')
fig.show()
fig.write_image('images/boxplot_years_openHighlowClose.png')

In [84]:
fig = px.box(data, y=["Open", 'High', 'Low', 'Close'], points="all")
fig.update_layout(height=600, width=1000, yaxis_title='Price', xaxis_title='')
fig.show()
fig.write_image('images/boxplot_openHighlowClose.png')

In [85]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=data['Volume']))

# Overlay both histograms
fig.update_layout(bargap=0.1, title_text="2015-2019: S&P 500 Volume",
                  yaxis_title='Count', xaxis_title='Volume')

# Reduce opacity to see both histograms
fig.show()
fig.write_image('images/combined_volume.png')

In [86]:
fig = px.scatter(data, x='Date', y='Volume', color='Year', trendline='lowess',
                 trendline_color_override='black')
fig.update_layout(title_text='2015-2019: S&P 500 Volume')
fig.show()
fig.write_image('images/scatter_year_volume_all.png')

In [87]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=data['Open'], name='Open'))
fig.add_trace(go.Histogram(x=data['Close'], name='Close'))

# Overlay both histograms
fig.update_layout(barmode='overlay', bargap=0.1,
                  title_text="2015-2019: S&P 500 Open Price",
                  yaxis_title='Count', xaxis_title='Open Price')

# Reduce opacity to see both histograms
fig.update_traces(opacity=0.6)
fig.show()
fig.write_image('images/combined_open_close_overlay.png')

In [88]:
fig = px.scatter(data, x='Date', y='Close', color='Year', trendline='lowess',
                 trendline_color_override='black')
fig.update_layout(title_text='2015-2019: S&P 500 Volume')
fig.show()
fig.write_image('images/scatter_year_close_all.png')

In [91]:
fig = px.scatter(data.iloc[:1006, :], x='Date', y='Close', color='Year', trendline='lowess',
                 trendline_color_override='black')
fig.update_layout(title_text='2015-2019: S&P 500 Volume')
fig.show()
fig.write_image('images/scatter_year_close_80.png')

In [90]:
fig = px.line(data.iloc[:1006, :], x='Date', y='Close', color='Year')
fig.update_layout(title_text='2015-2019: S&P 500 Volume')
fig.show()
fig.write_image('images/line_year_close_80.png')

In [92]:
fig = px.scatter(data.iloc[1006:, :], x='Date', y='Close', color='Year', trendline='lowess',
                 trendline_color_override='black', color_discrete_sequence=['#ff7f0e'])
fig.update_layout(title_text='2015-2019: S&P 500 Volume')
fig.show()
fig.write_image('images/scatter_year_close_20.png')

In [93]:
fig = px.line(data.iloc[1006:, :], x='Date', y='Close', color='Year', color_discrete_sequence=['#ff7f0e'])
fig.update_layout(title_text='2015-2019: S&P 500 Volume')
fig.show()
fig.write_image('images/line_year_close_20.png')

In [94]:
min = data['Open'].min()
max = data['Open'].max()

def hist_year(df, col, year):
  fig = go.Histogram(x=df[df['Year']== year][col],
                      xbins=dict(
                      start=min,
                      end= max,
                      size=50),
                      autobinx=False,
                     name=year)
  return fig

In [95]:
fig = make_subplots(rows=5, cols=1)
trace0 = hist_year(data, 'Open', '2015')
trace1 = hist_year(data, 'Open', '2016')
trace2 = hist_year(data, 'Open', '2017')
trace3 = hist_year(data, 'Open', '2018')
trace4 = hist_year(data, 'Open', '2019')

# adjust plots
fig.update_layout(bargap=0.1)
fig.update_xaxes(range=[min,max])
fig.update_layout(title_text="2015-2019: S&P 500 Open Price Comparison", height=700)

# add trace to subplot
fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 2, 1)
fig.append_trace(trace2, 3, 1)
fig.append_trace(trace3, 4, 1)
fig.append_trace(trace4, 5, 1)
fig.show()
fig.write_image('images/compare_open_all.png')

In [96]:
fig = px.scatter_3d(data, x='Open', y='Close', z='High',
                    color='Year')
camera = dict(
    up=dict(x=1, y=0, z=1),
    center=dict(x=-1, y=0, z=0),
    eye=dict(x=2.15, y=0.1, z=0.1)
)
fig.update_layout(height=900, title_text='2015-2019: 3D View of S&P 500\nby Open, High, and Close Prices',
                  margin=dict(l=0, r=0, b=0, t=0), scene_camera=camera)
fig.show()
fig.write_image('images/3d_view_open_high_close_all.png')

In [97]:
corr = data.iloc[:, 1:6].corr()
corr_no_vol = data.iloc[:, 1:5].corr()
corr

Unnamed: 0,Open,High,Low,Close,Volume
Open,1.0,0.999541,0.99922,0.998774,-0.213469
High,0.999541,1.0,0.999025,0.999316,-0.206946
Low,0.99922,0.999025,1.0,0.999468,-0.228355
Close,0.998774,0.999316,0.999468,1.0,-0.219588
Volume,-0.213469,-0.206946,-0.228355,-0.219588,1.0


In [98]:
fig = go.Figure()
fig.add_trace(go.Heatmap(
    z=corr,
    x=corr.columns,
    y=corr.columns,
    colorscale=px.colors.diverging.RdBu
))
fig.update_layout(height=400, width=400)
fig.show()
fig.write_image('images/heatmap_corr.png')

In [99]:
fig = go.Figure()
fig.add_trace(go.Heatmap(
    z=corr_no_vol,
    x=corr_no_vol.columns,
    y=corr_no_vol.columns,
    colorscale=px.colors.diverging.RdBu
))
fig.update_layout(height=400, width=400)
fig.show()
fig.write_image('images/heatmap_corr_no_vol.png')

# Preprocessing Data

In [22]:
minmax = MinMaxScaler(feature_range=(0,1))

def select_transform(df, features, y_output):
  split_80 = int((data.shape[0]*0.8))
  print(f"Split at {split_80}")

  x_train = df.loc[:split_80-1, features].copy()
  y_train = df.loc[:split_80-1, [y_output]].copy()

  x_test = df.loc[split_80:, features].copy()
  y_test = df.loc[split_80:, [y_output]].copy()

  x_train_sc = minmax.fit_transform(x_train)
  y_train_sc = minmax.fit_transform(y_train)
  x_test_sc = minmax.fit_transform(x_test)
  y_test_sc = minmax.fit_transform(y_test)

  print(f'''X_train shape {x_train_sc.shape}
y_train shape {len(y_train_sc)}
X_test shape {x_test_sc.shape}
y_test {len(y_test_sc)}
''')

  return x_train_sc, y_train_sc, x_test_sc, y_test_sc

In [23]:
X_train, y_train, X_test, y_test = select_transform(data, ['Open', 'High','Low'], 'Close')

Split at 1006
X_train shape (1006, 3)
y_train shape 1006
X_test shape (252, 3)
y_test 252



In [24]:
# Inspired by https://machinelearningmastery.com/how-to-develop-lstm-models-for-time-series-forecasting/
# and https://towardsdatascience.com/how-to-reshape-data-and-do-regression-for-time-series-using-lstm-133dad96cd00

def lstm_data_transform(x_data, y_data, num_steps=5):
    """ Changes data to the format for LSTM training 
for sliding window approach """
    # Prepare the list for the transformed data
    X, y = list(), list()
    # Loop of the entire data set
    for i in range(x_data.shape[0]):
        # compute a new (sliding window) index
        end_ix = i + num_steps
        # if index is larger than the size of the dataset, we stop
        if end_ix >= x_data.shape[0]:
            break
        # Get a sequence of data for x
        seq_X = x_data[i:end_ix]
        # Get only the last element of the sequency for y
        seq_y = y_data[end_ix]
        # Append the list with sequencies
        X.append(seq_X)
        y.append(seq_y)
    # Make final arrays
    x_array = np.array(X)
    y_array = np.array(y)

    print(f"X_array shape {x_array.shape}\nY_array shape {y_array.shape}")
    return x_array, y_array

In [25]:
x_train_lstm, y_train_lstm = lstm_data_transform(X_train, y_train, 90)
print("-----\nTest")
x_test_lstm, y_test_lstm = lstm_data_transform(X_test, y_test, 90)

X_array shape (916, 90, 3)
Y_array shape (916, 1)
-----
Test
X_array shape (162, 90, 3)
Y_array shape (162, 1)


In [26]:
X_train, X_val, y_train, y_val = train_test_split(x_train_lstm,
                                                  y_train_lstm,
                                                  test_size=0.2,
                                                  random_state=22)
print(f'''X_train shape {X_train.shape}
y_train shape {len(y_train)}
X_val shape {X_val.shape}
y_val {len(y_val)}
''')

X_train shape (732, 90, 3)
y_train shape 732
X_val shape (184, 90, 3)
y_val 184



# RNN Models

## Model Functions

In [211]:
def run_model (model, model_name):
  callback = EarlyStopping(monitor='loss', patience=2)
  start = time()
  fitted_model = model.fit(X_train,
                           y_train,
                           callbacks=[callback],
                           epochs=75,
                           validation_data=(X_val, y_val))
  end = time()
  runtime = round((end-start), 4)
  print(f"{model_name} took {runtime} seconds or {round((runtime/60),4)} minutes to run.")
  return runtime

def compare_pred_actual(model, x, y, test_train, train=True):
  pred = minmax.inverse_transform(model.predict(x))
  actual = minmax.inverse_transform(y)

  pred_lst = [round(pred[row][0],3) for row in range(0,len(pred))]
  actual_lst = [round(actual[row][0],3) for row in range(0,len(actual))]
  diff = [(actual_lst[row] - pred_lst[row]) for row in range(0, len(pred))]

  compare = pd.DataFrame(list(zip(actual_lst, pred_lst, diff)),
                              columns=[f"Actual_{test_train}", f"Pred_{test_train}", 'Diff'])
  
  if train == True:
    compare['Date'] = data.iloc[:1006, 0]
  elif train == False:
    compare['Date'] = data.iloc[1006:, 0]

  return pred_lst, actual_lst, compare

def graph_pred(compare, test_train, model_name):
  fig = px.line(compare, x='Date', y=[f"Actual_{test_train}",f"Pred_{test_train}"])
  fig.update_layout(title_text=f"Compare {model_name} {test_train} S&P 500 Close Price Predictions")
  fig.show()
  fig.write_image(f"images/line_{model_name}_{test_train}_pred_comparison.png")

  return

def evaluating_model (y_true, y_pred, model_name, train_test, runtime):
  df=pd.DataFrame({ 'model_name': [model_name, model_name, 
                                   model_name, model_name,
                                   model_name, model_name,
                                   model_name, model_name],
                   'type': [train_test, train_test,
                            train_test, train_test,
                            train_test, train_test,
                            train_test, train_test],
                   'metric_name': ["MSE","RMSE",'MAE',
                                   'R_SQR', 'EXV', 'ME',
                                   'RMSLE', 'runtime'],
                   'metric' : [round(mean_squared_error(y_true, y_pred).numpy(),4),
                               round(mean_squared_error(y_true, y_pred).numpy()**0.5,4),
                               round(mean_absolute_error(y_true, y_pred), 4),
                               round(r2_score(y_true, y_pred), 4),
                               round(explained_variance_score(y_true, y_pred), 4),
                               round(max_error(y_true, y_pred), 4),
                               round(mean_squared_log_error(y_true, y_pred)**0.5, 4),
                               round((runtime/60),4)]})
  return df


In [213]:
data.iloc[1006:, 0]

1006   2019-01-02
1007   2019-01-03
1008   2019-01-04
1009   2019-01-07
1010   2019-01-08
          ...    
1253   2019-12-24
1254   2019-12-26
1255   2019-12-27
1256   2019-12-30
1257   2019-12-31
Name: Date, Length: 252, dtype: datetime64[ns]

## Model 1

### Run

In [28]:
rnn1 = Sequential([
    layers.SimpleRNN(20, return_sequences=True, input_shape=[90, 3]),
    layers.SimpleRNN(20, return_sequences=True),
    layers.SimpleRNN(1)
])

rnn1.compile(optimizer="adam", loss="mse", metrics=['mean_squared_error'])

In [29]:
rnn1_rt = run_model(rnn1, 'rnn1')

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
rnn1 took 37.8645 seconds or 0.6311 minutes to run.


### Expectations of input/output

In [30]:
[print(i.shape, i.dtype) for i in rnn1.inputs]
[print(o.shape, o.dtype) for o in rnn1.outputs]
[print(l.name, l.input_shape, l.dtype) for l in rnn1.layers]

(None, 90, 3) <dtype: 'float32'>
(None, 1) <dtype: 'float32'>
simple_rnn (None, 90, 3) float32
simple_rnn_1 (None, 90, 20) float32
simple_rnn_2 (None, 90, 20) float32


[None, None, None]

### Evaluation of Train Predictions

In [153]:
rnn1_pred_train, rnn1_actual_train, rnn1_compare_train = compare_pred_actual(rnn1, X_train, y_train, 'Train', True)
rnn1_compare_train.head()

Unnamed: 0,Actual_Train,Pred_Train,Diff,Date
0,3108.409,3070.206055,38.202945,2015-01-02
1,3054.425,3046.758057,7.666943,2015-01-05
2,3085.832,3098.850098,-13.018098,2015-01-06
3,2966.919,2945.729004,21.189996,2015-01-07
4,3090.685,3105.702881,-15.017881,2015-01-08


In [109]:
rnn1_compare_train.describe()

Unnamed: 0,Actual_Train,Pred_Train,Diff
count,732.0,732.0,732.0
mean,2835.12727,2826.814697,8.313604
std,215.281792,212.22821,32.444375
min,2447.89,2394.327881,-197.161949
25%,2640.37725,2641.735291,-6.932905
50%,2828.7045,2823.157959,9.541006
75%,3040.1005,3059.307556,24.595808
max,3239.243,3161.62793,151.643957


In [160]:
graph_pred(rnn1_compare_train, 'Train', 'rnn1')

In [110]:
rnn1_train = evaluating_model(rnn1_actual_train, rnn1_pred_train, 'rnn1', 'train', rnn1_rt)
rnn1_train

Unnamed: 0,model_name,type,metric_name,metric
0,rnn1,train,MSE,1120.315186
1,rnn1,train,RMSE,33.4711
2,rnn1,train,MAE,24.3301
3,rnn1,train,R_SQR,0.9758
4,rnn1,train,EXV,0.9773
5,rnn1,train,ME,197.1619
6,rnn1,train,RMSLE,0.0118
7,rnn1,train,runtime,0.6311


### Evaluate Test Predictions

In [212]:
rnn1_pred_test, rnn1_actual_test, rnn1_compare_test = compare_pred_actual(rnn1,x_test_lstm, y_test_lstm, 'Test', train=False)
rnn1_compare_test.head()

Unnamed: 0,Actual_Test,Pred_Test,Diff,Date
0,2811.87,2850.980957,-39.110957,NaT
1,2834.41,2849.333008,-14.923008,NaT
2,2850.96,2848.059082,2.900918,NaT
3,2876.32,2859.033936,17.286064,NaT
4,2859.53,2886.772949,-27.242949,NaT


In [112]:
rnn1_test = evaluating_model(rnn1_actual_test, rnn1_pred_test, 'rnn1', 'test', rnn1_rt)
rnn1_test

Unnamed: 0,model_name,type,metric_name,metric
0,rnn1,test,MSE,2541.25
1,rnn1,test,RMSE,50.4108
2,rnn1,test,MAE,42.6998
3,rnn1,test,R_SQR,0.7924
4,rnn1,test,EXV,0.8517
5,rnn1,test,ME,120.173
6,rnn1,test,RMSLE,0.0169
7,rnn1,test,runtime,0.6311


In [161]:
graph_pred(rnn1_compare_test, 'Test', 'rnn1' )

## Model 2

### Run

In [36]:
rnn2 = Sequential([
    layers.LSTM(3, return_sequences=True, input_shape=[90, 3]),
    layers.LSTM(3, return_sequences=True),
    layers.LSTM(3, return_sequences=True),
    layers.LSTM(3, return_sequences=True),
    layers.SimpleRNN(1)
])

rnn2.compile(optimizer="adam", loss="mse", metrics=['mean_squared_error'])

In [37]:
rnn2_rt = run_model(rnn2, 'rnn2')

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 43/75
Epoch 44/75
Epoch 45/75
Epoch 46/75
Epoch 47/75
Epoch 48/75
Epoch 49/75
Epoch 50/75
Epoch 51/75
Epoch 52/75
Epoch 53/75
rnn2 took 164.921 seconds or 2.7487 minutes to run.


### Expectations of input/output

In [38]:
[print(i.shape, i.dtype) for i in rnn2.inputs]
[print(o.shape, o.dtype) for o in rnn2.outputs]
[print(l.name, l.input_shape, l.dtype) for l in rnn2.layers]

(None, 90, 3) <dtype: 'float32'>
(None, 1) <dtype: 'float32'>
lstm (None, 90, 3) float32
lstm_1 (None, 90, 3) float32
lstm_2 (None, 90, 3) float32
lstm_3 (None, 90, 3) float32
simple_rnn_3 (None, 90, 3) float32


[None, None, None, None, None]

### Evaluation of Train Predictions

In [113]:
rnn2_pred_train, rnn2_actual_train, rnn2_compare_train = compare_pred_actual(rnn2, X_train, y_train, 'Train')
rnn2_compare_train.head()

Unnamed: 0,Actual_Train,Pred_Train,Diff,Date
0,3108.409,3070.238037,38.170963,2015-01-02
1,3054.425,3065.794922,-11.369922,2015-01-05
2,3085.832,3108.899902,-23.067902,2015-01-06
3,2966.919,2966.545898,0.373102,2015-01-07
4,3090.685,3108.426025,-17.741025,2015-01-08


In [114]:
rnn2_compare_train.describe()

Unnamed: 0,Actual_Train,Pred_Train,Diff
count,732.0,732.0,732.0
mean,2835.12727,2836.487305,-1.35951
std,215.281792,207.9207,39.342473
min,2447.89,2541.977051,-243.834068
25%,2640.37725,2635.447754,-17.480494
50%,2828.7045,2839.543945,1.592091
75%,3040.1005,3067.276184,19.421293
max,3239.243,3151.003906,93.194904


In [115]:

rnn2_train = evaluating_model(rnn2_actual_train, rnn2_pred_train, 'rnn2', 'train', rnn2_rt)
rnn2_train

Unnamed: 0,model_name,type,metric_name,metric
0,rnn2,train,MSE,1547.563721
1,rnn2,train,RMSE,39.3391
2,rnn2,train,MAE,27.8003
3,rnn2,train,R_SQR,0.9666
4,rnn2,train,EXV,0.9666
5,rnn2,train,ME,243.8341
6,rnn2,train,RMSLE,0.0137
7,rnn2,train,runtime,2.7487


In [162]:
graph_pred(rnn2_compare_train, 'Train', 'rnn2')

### Evaluate Test Predictions

In [116]:
rnn2_pred_test, rnn2_actual_test, rnn2_compare_test = compare_pred_actual(rnn2, x_test_lstm, y_test_lstm, 'Test')
rnn2_compare_test.head()

Unnamed: 0,Actual_Test,Pred_Test,Diff,Date
0,2811.87,2932.270996,-120.400996,2015-01-02
1,2834.41,2929.353027,-94.943027,2015-01-05
2,2850.96,2925.337891,-74.377891,2015-01-06
3,2876.32,2920.337891,-44.017891,2015-01-07
4,2859.53,2915.344971,-55.814971,2015-01-08


In [117]:
rnn2_test = evaluating_model(rnn2_actual_test, rnn2_pred_test, 'rnn2', 'test', rnn2_rt)
rnn2_test

Unnamed: 0,model_name,type,metric_name,metric
0,rnn2,test,MSE,3992.386475
1,rnn2,test,RMSE,63.1853
2,rnn2,test,MAE,54.1114
3,rnn2,test,R_SQR,0.6739
4,rnn2,test,EXV,0.6809
5,rnn2,test,ME,170.842
6,rnn2,test,RMSLE,0.0213
7,rnn2,test,runtime,2.7487


In [163]:
graph_pred(rnn2_compare_test, 'Test', 'rnn2')

## Model 3

### Run

In [118]:
rnn3 = Sequential([
    layers.LSTM(50, return_sequences=True, activation='relu', input_shape=(90, 3)),
    layers.LSTM(50, activation='relu'),
    layers.Dense(1),
])

rnn3.compile(optimizer="adam", loss="mse", metrics=['mean_squared_error'])

In [45]:
rnn3_rt = run_model(rnn3, 'rnn3')

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
rnn3 took 21.67 seconds or 0.3612 minutes to run.


### Expectations of input/output

In [46]:
[print(i.shape, i.dtype) for i in rnn3.inputs]
[print(o.shape, o.dtype) for o in rnn3.outputs]
[print(l.name, l.input_shape, l.dtype) for l in rnn3.layers]

(None, 90, 3) <dtype: 'float32'>
(None, 1) <dtype: 'float32'>
lstm_4 (None, 90, 3) float32
lstm_5 (None, 90, 50) float32
dense (None, 50) float32


[None, None, None]

### Evaluation of Train Predictions

In [119]:
rnn3_pred_train, rnn3_actual_train, rnn3_compare_train = compare_pred_actual(rnn3, X_train, y_train, 'Train')
rnn3_compare_train.head()

Unnamed: 0,Actual_Train,Pred_Train,Diff,Date
0,3108.409,2413.014893,695.394107,2015-01-02
1,3054.425,2413.849121,640.575879,2015-01-05
2,3085.832,2409.913086,675.918914,2015-01-06
3,2966.919,2419.474121,547.444879,2015-01-07
4,3090.685,2409.833008,680.851992,2015-01-08


In [48]:
rnn3_compare_train.describe()

Unnamed: 0,Actual_Train,Pred_Train,Diff
count,732.0,732.0,732.0
mean,2835.12727,2839.538086,-4.410394
std,215.281792,214.912506,29.004648
min,2447.89,2498.180908,-175.369957
25%,2640.37725,2639.981445,-14.62127
50%,2828.7045,2839.509521,-1.321586
75%,3040.1005,3053.964539,12.635673
max,3239.243,3237.14209,84.567975


In [120]:
rnn3_train = evaluating_model(rnn3_actual_train, rnn3_pred_train, 'rnn3', 'train', rnn3_rt)
rnn3_train

Unnamed: 0,model_name,type,metric_name,metric
0,rnn3,train,MSE,218982.140625
1,rnn3,train,RMSE,467.9553
2,rnn3,train,MAE,409.1267
3,rnn3,train,R_SQR,-3.7314
4,rnn3,train,EXV,-0.1148
5,rnn3,train,ME,834.5521
6,rnn3,train,RMSLE,0.1728
7,rnn3,train,runtime,0.3612


In [165]:
graph_pred(rnn3_compare_train, 'Train', 'rnn3')

### Evaluate Test Predictions

In [121]:
rnn3_pred_test, rnn3_actual_test, rnn3_compare_test = compare_pred_actual(rnn3, x_test_lstm, y_test_lstm, 'Test')
rnn3_compare_test.head()

Unnamed: 0,Actual_Test,Pred_Test,Diff,Date
0,2811.87,2421.971924,389.898076,2015-01-02
1,2834.41,2422.24292,412.16708,2015-01-05
2,2850.96,2422.596924,428.363076,2015-01-06
3,2876.32,2422.971924,453.348076,2015-01-07
4,2859.53,2423.279053,436.250947,2015-01-08


In [122]:
rnn3_test = evaluating_model(rnn3_actual_test, rnn3_pred_test, 'rnn3', 'test', rnn3_rt)
rnn3_test

Unnamed: 0,model_name,type,metric_name,metric
0,rnn3,test,MSE,336712.8125
1,rnn3,test,RMSE,580.2696
2,rnn3,test,MAE,568.7361
3,rnn3,test,R_SQR,-26.5013
4,rnn3,test,EXV,-0.0824
5,rnn3,test,ME,832.7241
6,rnn3,test,RMSLE,0.214
7,rnn3,test,runtime,0.3612


In [164]:
graph_pred(rnn3_compare_test, 'Test', 'rnn3')

# Model Comparisons

In [52]:
model_compare = pd.concat([rnn1_train, rnn1_test,
                           rnn2_train, rnn2_test,
                           rnn3_train, rnn3_test])

In [53]:
model_compare.head(8)

Unnamed: 0,model_name,type,metric_name,metric
0,rnn1,train,MSE,1120.315186
1,rnn1,train,RMSE,33.4711
2,rnn1,train,MAE,24.3301
3,rnn1,train,R_SQR,0.9758
4,rnn1,train,EXV,0.9773
5,rnn1,train,ME,197.1619
6,rnn1,train,RMSLE,0.0118
7,rnn1,train,runtime,0.6311


In [54]:
def model_comparisons (metric):
  grouped_df = model_compare[model_compare['metric_name']== metric]

  fig = px.bar(grouped_df, x='model_name', color='type',  barmode='group',
              y='metric', text_auto='.2s')
  fig.update_layout(height=600, width=1000, yaxis_title='', xaxis_title='',
                    title_text=f"RNN Models by {metric}")
  fig.show()
  fig.write_image(f"images/rnn_models_{metric}.png")

  return

In [126]:
rnn3_compare_test

Unnamed: 0,Actual_Test,Pred_Test,Diff,Date
0,2811.87,2421.971924,389.898076,2015-01-02
1,2834.41,2422.242920,412.167080,2015-01-05
2,2850.96,2422.596924,428.363076,2015-01-06
3,2876.32,2422.971924,453.348076,2015-01-07
4,2859.53,2423.279053,436.250947,2015-01-08
...,...,...,...,...
157,3223.38,2408.031006,815.348994,2015-08-18
158,3239.91,2407.664062,832.245937,2015-08-19
159,3240.02,2407.295898,832.724102,2015-08-20
160,3221.29,2406.897949,814.392051,2015-08-21


In [59]:
model_comparisons('MAE')

In [58]:
model_comparisons('MSE') 

In [57]:
model_comparisons('runtime') 

In [191]:
max_close = list(rnn1_compare_test.max()[:2]) + list(rnn2_compare_test.max()[:2]) +list(rnn3_compare_test.max()[:2])
max_close.sort(reverse=True)
max_close[0]

3240.02

In [192]:
min_close = list(rnn1_compare_test.min()[:2]) + list(rnn2_compare_test.min()[:2]) +list(rnn3_compare_test.min()[:2])
min_close.sort(reverse=True)
min_close[0]

2822.101

In [196]:
def min_max(compare_1, compare_2, compare_3):
  min_close = list(compare_1.min()[:2]) + list(compare_2.min()[:2]) +list(compare_3.min()[:2])
  min_close.sort(reverse=True)
  min = min_close[0]
  
  max_close = list(compare_1.max()[:2]) + list(compare_2.max()[:2]) +list(compare_3.max()[:2])
  max_close.sort(reverse=True)
  max = max_close[0]

  return min, max

def line_test(df, pred=True):
  if pred==True:
    fig = go.Scatter(x=df['Date'], y='Pred_Test')
  elif pred==False:
    fig = go.Scatter(x=df['Date'], y='Actual_Test')
  return fig

In [208]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=rnn3_compare_test['Date'],
                         y=rnn3_compare_test['Actual_Test'],
                         mode='lines+markers', name='Actual Close'))
fig.add_trace(go.Scatter(x=rnn1_compare_test['Date'],
                         y=rnn1_compare_test['Pred_Test'],
                         mode='lines+markers', name='rnn1'))
fig.add_trace(go.Scatter(x=rnn2_compare_test['Date'],
                         y=rnn2_compare_test['Pred_Test'],
                         mode='lines+markers', name='rnn2'))
fig.add_trace(go.Scatter(x=rnn3_compare_test['Date'],
                         y=rnn3_compare_test['Pred_Test'],
                         mode='lines+markers', name='rnn3'))


fig.update_layout(title_text="2015-2019: Overview of S&P 500",
                  yaxis_title='Price', xaxis_title='Year')
fig.show()
fig.write_image('images/compare_model_test_predictions.png')