<a href="https://colab.research.google.com/github/oimartin/SP_500_index_RNN/blob/main/sp500_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ingest

## Load Libraries

In [21]:
# !pip install yfinance

Collecting yfinance
  Downloading yfinance-0.1.70-py2.py3-none-any.whl (26 kB)
Collecting requests>=2.26
  Downloading requests-2.27.1-py2.py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 1.3 MB/s 
[?25hCollecting lxml>=4.5.1
  Downloading lxml-4.8.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (6.4 MB)
[K     |████████████████████████████████| 6.4 MB 8.9 MB/s 
Installing collected packages: requests, lxml, yfinance
  Attempting uninstall: requests
    Found existing installation: requests 2.23.0
    Uninstalling requests-2.23.0:
      Successfully uninstalled requests-2.23.0
  Attempting uninstall: lxml
    Found existing installation: lxml 4.2.6
    Uninstalling lxml-4.2.6:
      Successfully uninstalled lxml-4.2.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires req

In [11]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
import yfinance as yf
import datetime as dt
%matplotlib inline

from sklearn.model_selection import train_test_split
from time import time
from keras.models import Sequential
from keras import layers
from keras.losses import BinaryCrossentropy, Poisson, SparseCategoricalCrossentropy
from keras.metrics import BinaryAccuracy
from keras.callbacks import EarlyStopping

## Load Data

In [23]:
data = yf.Ticker('^GSPC').history(start=dt.datetime(2015,1,1),
                           end=dt.datetime(2020,1,1)).reset_index()

In [24]:
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2015-01-02,2058.899902,2072.360107,2046.040039,2058.199951,2708700000,0,0
1,2015-01-05,2054.439941,2054.439941,2017.339966,2020.579956,3799120000,0,0
2,2015-01-06,2022.150024,2030.25,1992.439941,2002.609985,4460110000,0,0
3,2015-01-07,2005.550049,2029.609985,2005.550049,2025.900024,3805480000,0,0
4,2015-01-08,2030.609985,2064.080078,2030.609985,2062.139893,3934010000,0,0


# EDA

## Pre-processing

In [25]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1258 entries, 0 to 1257
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Date          1258 non-null   datetime64[ns]
 1   Open          1258 non-null   float64       
 2   High          1258 non-null   float64       
 3   Low           1258 non-null   float64       
 4   Close         1258 non-null   float64       
 5   Volume        1258 non-null   int64         
 6   Dividends     1258 non-null   int64         
 7   Stock Splits  1258 non-null   int64         
dtypes: datetime64[ns](1), float64(4), int64(3)
memory usage: 78.8 KB


## First View

In [26]:
fig = go.Figure(data=[go.Candlestick(x=data['Date'], close=data['Close'], open=data['Open'], 
                             low=data['Low'], high=data['High'])])

fig.show()

In [27]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=data['Volume']))

# Overlay both histograms
fig.update_layout(bargap=0.1)
# Reduce opacity to see both histograms
fig.show()

In [28]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=data['Open']))
fig.add_trace(go.Histogram(x=data['Close']))

# Overlay both histograms
fig.update_layout(barmode='overlay', bargap=0.1)
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.6)
fig.show()

In [29]:
data['Year'] = data['Date'].dt.strftime('%Y')
data['Month'] = data['Date'].dt.strftime('%m')
data['Day'] = data['Date'].dt.strftime('%d')
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Year,Month,Day
0,2015-01-02,2058.899902,2072.360107,2046.040039,2058.199951,2708700000,0,0,2015,1,2
1,2015-01-05,2054.439941,2054.439941,2017.339966,2020.579956,3799120000,0,0,2015,1,5
2,2015-01-06,2022.150024,2030.25,1992.439941,2002.609985,4460110000,0,0,2015,1,6
3,2015-01-07,2005.550049,2029.609985,2005.550049,2025.900024,3805480000,0,0,2015,1,7
4,2015-01-08,2030.609985,2064.080078,2030.609985,2062.139893,3934010000,0,0,2015,1,8


In [30]:
m_y_data = data.groupby(['Month', 'Year']).median().reset_index()

def by_year (df):
  m_y_data.loc[m_y_data['Year']== '2022'].sort_values(by='Month', ascending=True).Open
  return


In [31]:
m_y_data.loc[m_y_data['Year']== '2018'].sort_values(by='Month', ascending=True)

Unnamed: 0,Month,Year,Open,High,Low,Close,Volume,Dividends,Stock Splits
3,1,2018,2798.959961,2807.040039,2778.379883,2798.030029,3576350000.0,0.0,0.0
8,2,2018,2715.800049,2737.600098,2697.77002,2703.959961,3938450000.0,0.0,0.0
13,3,2018,2715.050049,2730.889893,2701.73999,2716.939941,3500330000.0,0.0,0.0
18,4,2018,2657.360107,2676.47998,2647.159912,2656.870117,3349370000.0,0.0,0.0
23,5,2018,2713.300049,2724.305054,2703.484985,2716.549927,3349680000.0,0.0,0.0
28,6,2018,2760.790039,2769.280029,2748.459961,2762.590088,3555090000.0,0.0,0.0
33,7,2018,2797.360107,2808.610107,2793.389893,2801.830078,3063850000.0,0.0,0.0
38,8,2018,2855.919922,2862.439941,2851.97998,2856.97998,2976970000.0,0.0,0.0
43,9,2018,2903.830078,2908.300049,2895.77002,2904.310059,3241250000.0,0.0,0.0
48,10,2018,2775.659912,2797.77002,2755.179932,2767.780029,3598710000.0,0.0,0.0


In [33]:
years = ['2019', '2018', '2017', '2016', '2015']
colors = ['slategray', 'magenta', 'red', 'green']
fig = ff.create_distplot([yr2021, yr2020, yr2019, yr2018], years, curve_type='normal', colors=colors)
fig.show()

PlotlyError: ignored

In [None]:
px.density_contour(data, x=['Open', 'Close', 'High', 'Low'])