In [73]:
import os
from pandas_datareader import data as pdr
import pandas as pd
import yfinance as yf

In [88]:
START_DATE = "2003-08-01"
END_DATE = "2015-01-01"
tickers = ['MMM', "AOS", "GOOGL", "AMZN", "AEP", "AXP", "AIG", "AMP", "ADI", "AAPL", ]


In [94]:


def get_sp500_tickers():
    # Download S&P 500 data from Yahoo Finance
    tickers = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
    return list(tickers['Symbol'])

In [95]:
def build_stock_dataset(start=START_DATE, end=END_DATE):
    """
    Creates the dataset containing all stock prices
    :returns: stock_prices.csv
    """

    ticker_list = get_sp500_tickers()
    data = yf.download(ticker_list,START_DATE,END_DATE,auto_adjust=True)[['Close', 'Volume']]
    data_adj_close = data['Close'].reset_index()
    data_volume = data['Volume'].reset_index()
    data_path = os.path.join(os.getcwd(), "stock_data")
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    
    data_adj_close.to_csv(os.path.join(data_path, "stock_all_sp500_08_2003_to_01_2015_prices.csv"))
    data_volume.to_csv(os.path.join(data_path, "stock_volumes.csv"))


def build_sp500_dataset(start=START_DATE, end=END_DATE):
    """
    Creates the dataset containing S&P500 prices
    :returns: sp500_index.csv
    """
    index_data = yf.download("SPY", start=START_DATE, end=END_DATE, auto_adjust=True)
    
    index_data.to_csv("stock_data/sp500_08_2003_to_01_2015_index.csv")
    return index_data


In [96]:
build_stock_dataset()
build_sp500_dataset()

[*********************100%***********************]  503 of 503 completed

33 Failed downloads:
['DOW', 'CARR', 'CEG', 'GEHC', 'LW', 'CRWD', 'GDDY', 'VST', 'VLTO', 'FTV', 'CTVA', 'ABNB', 'HPE', 'UBER', 'KVUE', 'SW', 'DAY', 'PLTR', 'HWM', 'SOLV', 'MRNA', 'KHC', 'OTIS', 'IR', 'INVH', 'GEV', 'FOXA', 'PYPL', 'VICI', 'FOX', 'DELL']: YFPricesMissingError('$%ticker%: possibly delisted; no price data found  (1d 2003-08-01 -> 2015-01-01) (Yahoo error = "Data doesn\'t exist for startDate = 1059710400, endDate = 1420088400")')
['BF.B']: YFPricesMissingError('$%ticker%: possibly delisted; no price data found  (1d 2003-08-01 -> 2015-01-01)')
['BRK.B']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')
[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2003-08-01,66.228538,66.455551,65.594225,65.774506,49321000
2003-08-04,65.640964,66.101674,64.766287,65.774506,55214100
2003-08-05,65.707750,65.941442,64.325619,64.379036,61415600
2003-08-06,64.559313,65.474050,64.379033,64.752945,50096900
2003-08-07,64.879788,65.480714,64.606037,65.433975,43427400
...,...,...,...,...,...
2014-12-24,174.768892,175.037734,174.516843,174.558853,42963400
2014-12-26,175.012526,175.466216,174.962119,175.121750,57326700
2014-12-29,174.936918,175.567033,174.869704,175.356995,79643900
2014-12-30,174.928510,175.062925,174.340392,174.416016,73540800


In [72]:
data

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-08-01,253.554763,254.068075,252.267004,252.924377,53853300
2018-08-02,251.600730,254.473414,251.393597,254.302338,63426400
2018-08-03,254.428312,255.445921,254.248194,255.391891,53935400
2018-08-06,255.427956,256.643657,255.031718,256.328491,39400900
2018-08-07,257.003911,257.562239,256.868809,257.174988,43196600
...,...,...,...,...,...
2024-12-24,596.059998,601.340027,595.469971,601.299988,33160100
2024-12-26,599.500000,602.479980,598.080017,601.340027,41219100
2024-12-27,597.539978,597.780029,590.760010,595.010010,64969300
2024-12-30,587.890015,591.739990,584.409973,588.219971,56578800


In [86]:
sp500 = build_sp500_dataset()

[*********************100%***********************]  1 of 1 completed


In [87]:
sp500

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-08-01,253.554763,254.068075,252.267004,252.924377,53853300
2018-08-02,251.600730,254.473414,251.393597,254.302338,63426400
2018-08-03,254.428312,255.445921,254.248194,255.391891,53935400
2018-08-06,255.427956,256.643657,255.031718,256.328491,39400900
2018-08-07,257.003911,257.562239,256.868809,257.174988,43196600
...,...,...,...,...,...
2024-12-24,596.059998,601.340027,595.469971,601.299988,33160100
2024-12-26,599.500000,602.479980,598.080017,601.340027,41219100
2024-12-27,597.539978,597.780029,590.760010,595.010010,64969300
2024-12-30,587.890015,591.739990,584.409973,588.219971,56578800
