<a href="https://colab.research.google.com/github/oimartin/SP_500_index_RNN/blob/main/sp500_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ingest

## Load Libraries

In [81]:
# !pip install yfinance
!pip install -U kaleido



In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import kaleido
import os
import matplotlib.pyplot as plt
import yfinance as yf
import datetime as dt
%matplotlib inline

from sklearn.model_selection import train_test_split
from time import time
from keras.models import Sequential
from keras import layers
from keras.losses import BinaryCrossentropy, Poisson, SparseCategoricalCrossentropy
from keras.metrics import BinaryAccuracy
from keras.callbacks import EarlyStopping

## Load Data

In [2]:
# Request the ^GSPC (S&P 500 index) price history for the 2015-01-01 to
# 2020-01-01 window, then move the date index into a regular 'Date' column.
data = (
    yf.Ticker('^GSPC')
    .history(start=dt.datetime(2015, 1, 1), end=dt.datetime(2020, 1, 1))
    .reset_index()
)

In [3]:
# Preview the first rows of the downloaded price history.
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2015-01-02,2058.899902,2072.360107,2046.040039,2058.199951,2708700000,0,0
1,2015-01-05,2054.439941,2054.439941,2017.339966,2020.579956,3799120000,0,0
2,2015-01-06,2022.150024,2030.25,1992.439941,2002.609985,4460110000,0,0
3,2015-01-07,2005.550049,2029.609985,2005.550049,2025.900024,3805480000,0,0
4,2015-01-08,2030.609985,2064.080078,2030.609985,2062.139893,3934010000,0,0


# EDA

## Pre-processing

In [4]:
# Column dtypes and non-null counts, to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1258 entries, 0 to 1257
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Date          1258 non-null   datetime64[ns]
 1   Open          1258 non-null   float64       
 2   High          1258 non-null   float64       
 3   Low           1258 non-null   float64       
 4   Close         1258 non-null   float64       
 5   Volume        1258 non-null   int64         
 6   Dividends     1258 non-null   int64         
 7   Stock Splits  1258 non-null   int64         
dtypes: datetime64[ns](1), float64(4), int64(3)
memory usage: 78.8 KB


## First View

In [5]:
# Make sure the folder that receives the exported figures exists.
# exist_ok=True makes this a no-op on re-runs and removes the gap between the
# existence check and the directory creation.
os.makedirs('images', exist_ok=True)

In [6]:
# Candlestick chart of the daily open/high/low/close prices by date.
fig = go.Figure(
    data=[
        go.Candlestick(
            x=data['Date'],
            open=data['Open'],
            high=data['High'],
            low=data['Low'],
            close=data['Close'],
        )
    ]
)

fig.show()

In [7]:
# Summary statistics for the numeric columns.
data.describe()

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Stock Splits
count,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0
mean,2452.403505,2462.925358,2440.702211,2452.643027,3625464000.0,0.0,0.0
std,357.405072,357.889358,357.025302,357.451845,671811400.0,0.0,0.0
min,1833.400024,1847.0,1810.099976,1829.079956,1296540000.0,0.0,0.0
25%,2101.687439,2108.959961,2092.134888,2102.08252,3232422000.0,0.0,0.0
50%,2434.209961,2441.555054,2427.97998,2434.14502,3520885000.0,0.0,0.0
75%,2773.082458,2783.702515,2758.289978,2771.179993,3900050000.0,0.0,0.0
max,3247.22998,3247.929932,3234.370117,3240.02002,7609010000.0,0.0,0.0


In [14]:
# Histogram of daily trading volume (a single trace).
fig = go.Figure(data=[go.Histogram(x=data['Volume'])])

# Leave a small gap between neighbouring bars and label the chart.
fig.update_layout(title_text="Combined Volume: 2015-2019", bargap=0.1)

fig.show()
# Also save a static PNG copy under images/.
fig.write_image('images/combined_volume.png')

In [15]:
# Compare the distributions of the daily Open and Close prices in one chart.
fig = go.Figure()
fig.add_trace(go.Histogram(x=data['Open'], name='Open'))
fig.add_trace(go.Histogram(x=data['Close'], name='Close'))

# Draw both histograms on top of each other, and give the figure a title so
# the exported PNG is self-describing like the other saved figures.
fig.update_layout(barmode='overlay', bargap=0.1,
                  title_text="Combined Open and Close Prices: 2015-2019")

# Reduce opacity so the overlapping bars of both traces stay visible.
fig.update_traces(opacity=0.6)
fig.show()
fig.write_image('images/combined_open_close_overlay.png')

In [10]:
# Split the trading date into Year / Month / Day columns. strftime yields
# strings, so the new columns hold text (e.g. '2015'), not integers.
for part_name, part_format in (('Year', '%Y'), ('Month', '%m'), ('Day', '%d')):
  data[part_name] = data['Date'].dt.strftime(part_format)
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Year,Month,Day
0,2015-01-02,2058.899902,2072.360107,2046.040039,2058.199951,2708700000,0,0,2015,1,2
1,2015-01-05,2054.439941,2054.439941,2017.339966,2020.579956,3799120000,0,0,2015,1,5
2,2015-01-06,2022.150024,2030.25,1992.439941,2002.609985,4460110000,0,0,2015,1,6
3,2015-01-07,2005.550049,2029.609985,2005.550049,2025.900024,3805480000,0,0,2015,1,7
4,2015-01-08,2030.609985,2064.080078,2030.609985,2062.139893,3934010000,0,0,2015,1,8


In [11]:
# Smallest and largest Open price over the whole data set.
# NOTE(review): these names shadow the built-in min() and max() for the rest of
# the session; they are kept because other cells read them.
min, max = data['Open'].min(), data['Open'].max()

def hist_year(df, col, year, start=None, end=None, size=50):
  """Build a histogram trace of ``col`` for the rows of a single year.

  Args:
    df: DataFrame with a 'Year' column and the column named by ``col``.
    col: Name of the column to bin.
    year: Year to select; compared as a string against ``df['Year']``, so
      both ``'2015'`` and ``2015`` are accepted.
    start: Lower edge of the first bin. Defaults to the minimum of ``col``
      over all rows of ``df`` so that every year shares the same bins.
    end: Upper limit of the bins. Defaults to the maximum of ``col`` over
      all rows of ``df``.
    size: Width of each bin, in the units of ``col``.

  Returns:
    A ``go.Histogram`` trace whose name is the year.
  """
  year = str(year)
  # Take the bin range from the column being plotted rather than from
  # notebook-level globals, which only ever described the 'Open' column.
  if start is None:
    start = df[col].min()
  if end is None:
    end = df[col].max()
  values = df.loc[df['Year'].astype(str) == year, col]
  return go.Histogram(x=values,
                      xbins=dict(start=start, end=end, size=size),
                      autobinx=False,
                      name=year)

In [12]:
# One stacked histogram panel per year so the yearly Open-price distributions
# can be compared against the same x-axis.
years = ['2015', '2016', '2017', '2018', '2019']
fig = make_subplots(rows=len(years), cols=1)

# add_trace(row=, col=) is the supported replacement for the deprecated
# append_trace; looping over the years replaces five copy-pasted calls.
for row, year in enumerate(years, start=1):
  fig.add_trace(hist_year(data, 'Open', year), row=row, col=1)

# Apply the full-period Open range to every panel. It is computed here instead
# of being read from the notebook-level `min`/`max` names, which shadow the
# built-in functions.
open_range = [data['Open'].min(), data['Open'].max()]
fig.update_layout(bargap=0.1, height=700,
                  title_text="Comparing Open Prices: 2015-2019")
fig.update_xaxes(range=open_range)

fig.show()
fig.write_image('images/compare_open_all.png')

In [47]:
# 3-D scatter of Open vs. Close vs. High, coloured by the 'Year' column.
fig = px.scatter_3d(data, x='Open', y='Close', z='High', color='Year')

# Explicit camera placement for the 3-D scene.
camera = {
    'up': {'x': 1, 'y': 0, 'z': 1},
    'center': {'x': 0, 'y': 0, 'z': 0},
    'eye': {'x': 5, 'y': 2, 'z': 0.1},
}
fig.update_layout(
    scene_camera=camera,
    title_text='Open, Close, and High Price by Year',
    height=700,
    margin={'l': 0, 'r': 0, 'b': 0, 't': 0},
)
fig.show()