### Import Packages and Set Directories

In [1]:
import bs4 as bs
import requests
import yfinance as yf
import json
import os
import sys
import warnings
import inspect
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.patches import Patch
from pypfopt import EfficientFrontier
from pypfopt import risk_models
from pypfopt import expected_returns
from scipy.interpolate import make_interp_spline


# Set directories
# Every path is derived from the kernel's current working directory; the
# project root is taken to be its parent (assumes the kernel was started in
# the notebook's own folder -- TODO confirm for other launch setups).
notebook_dir = os.getcwd()
base_dir = os.path.join(notebook_dir, '..')
data_dir = os.path.join(base_dir, 'data')
stock_dir = os.path.join(base_dir, 'data', 'stocks')
src_dir = os.path.join(base_dir, 'src')
graph_dir = os.path.join(base_dir, 'results', 'graphs')

# Make modules under src/ importable. Guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

### Stock Data

In [2]:
def get_sp_tickers(num):
    """
    Scrape the ticker symbols listed on a Wikipedia 'List of S&P <num> companies' page.

    num: index size used to build the page URL. E.g. 500, 400, 600.

    Returns a list of ticker strings: the text of the first cell of every data
    row in the first 'wikitable sortable' table, with newlines removed.

    Raises requests.HTTPError if the page request returns an error status,
    a requests timeout error if the server does not answer within 30 seconds,
    and ValueError if no matching table is present in the page.
    """
    wiki_url = 'https://en.wikipedia.org/wiki/List_of_S%26P_' + str(num) + '_companies'
    # Fail fast on a hung connection or an error page instead of parsing garbage.
    resp = requests.get(wiki_url, timeout=30)
    resp.raise_for_status()
    soup = bs.BeautifulSoup(resp.text, 'lxml')
    table = soup.find('table', {'class': 'wikitable sortable'})
    if table is None:
        raise ValueError('No constituents table found at ' + wiki_url)
    tickers = []
    # The first <tr> is the header row, so it is skipped.
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if not cells:
            # Rows made only of <th> cells carry no ticker.
            continue
        tickers.append(cells[0].text.replace('\n', ''))
    return tickers

def get_stock_data(tickers, category, start_date, end_date):
    """
    Download prices with yfinance and return one price column per ticker.

    tickers: list of tickers
    category: price category to keep. E.g. 'Close', 'Adj Close'.
    start_date, end_date: start and end date passed to yf.download. E.g. 1996-03-01

    Returns a dataframe with one column per ticker holding the `category`
    values. Any ticker column containing a missing value is dropped.
    """
    raw_df = yf.download(tickers, start=start_date, end=end_date, group_by='Ticker')

    # Move the outer column level into the row index and label the two index
    # levels, then turn the ticker level back into an ordinary column.
    long_df = raw_df.stack(level=0)
    long_df = long_df.rename_axis(['Date', 'Ticker'])
    long_df = long_df.reset_index(level=1)

    # Spread the requested price category out to one column per ticker.
    wide_df = long_df.pivot(columns='Ticker', values=category)
    return wide_df.dropna(axis=1)

# Download one calendar year of adjusted close prices for the given tickers
def create_stock_df(year, tickers_list):
    """
    year (integer): calendar year to download
    tickers_list: list of tickers, passed through to get_stock_data

    Returns the 'Adj Close' dataframe produced by get_stock_data for that
    year, rounded to 4 decimals.
    """
    start_date = str(year) + '-01-01'  # start date
    # yfinance treats `end` as exclusive, so ending on Dec 31 would drop that
    # day's prices; end on Jan 1 of the following year instead.
    end_date = str(int(year) + 1) + '-01-01'  # end date (exclusive)

    # Return dataframe
    stock_df = get_stock_data(tickers_list, 'Adj Close', start_date, end_date)
    stock_df = stock_df.round(4)
    return stock_df


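Load stock tickers (the Wikipedia scrape is kept in the cell but disabled; the cached `sp1500_tickers.json` is read instead)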

In [4]:
# One-off scrape of the S&P 500 / 400 / 600 ticker lists, kept for reference.
# It is wrapped in a string literal so it does not execute; remove the triple
# quotes to rebuild sp1500_tickers.json in data_dir.
"""
sp500_tickers = get_sp_tickers(500)  # S&P 500 Tickers
sp400_tickers = get_sp_tickers(400)  # S&P MidCap 400 Tickers
sp600_tickers = get_sp_tickers(600)  # S&P SmallCap 600 Tickers

# Combine all tickers into one list
sp1500_tickers = sp500_tickers + sp400_tickers + sp600_tickers

# Convert the combined tickers to a JSON string
all_tickers = {'SP1500': sp1500_tickers}
json_data = json.dumps(all_tickers, indent=4)

# Save to file
with open(os.path.join(data_dir, 'sp1500_tickers.json'), 'w') as file:
    file.write(json_data)
"""

# Load the cached tickers from data_dir. Going by the dump code above, the
# JSON is expected to be a dict holding the combined list under the 'SP1500' key.
with open(os.path.join(data_dir, 'sp1500_tickers.json'), 'r') as file:
    tickers_data = file.read()
sp1500_tickers = json.loads(tickers_data)

Download stock data using tickers

In [13]:
# Download the stock data for all tickers in SP1500, one calendar year at a time
start_year = 1994
end_year = 2023

# Collect the yearly frames and concatenate them once at the end: calling
# pd.concat inside the loop re-copies the accumulated frame on every pass.
yearly_frames = []
for year in range(start_year, end_year+1):
    stock_year_df = create_stock_df(year, sp1500_tickers['SP1500'])
    yearly_frames.append(stock_year_df)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# year range is empty.
stock_df = pd.concat(yearly_frames, axis=0, sort=False) if yearly_frames else pd.DataFrame()


[*********************100%%**********************]  1506 of 1506 completed

798 Failed downloads:
['GEF', 'NRG', 'SAIC', 'CTKB', 'OMCL', 'LOPE', 'ORA', 'PECO', 'ADEA', 'EXLS', 'RCM', 'DXPE', 'ENPH', 'RILY', 'NSA', 'LESL', 'CALM', 'SLP', 'SHAK', 'VSTO', 'MD', 'CPRI', 'BHF', 'IBP', 'ACLS', 'HUBG', 'PANW', 'AAP', 'NEO', 'MUSA', 'RXO', 'SRCL', 'HPP', 'ILMN', 'ECPG', 'WLK', 'DAL', 'YETI', 'DBX', 'HCC', 'ULTA', 'WTFC', 'CALX', 'TRIP', 'ANSS', 'NFLX', 'GNRC', 'CRL', 'INN', 'OGS', 'IRWD', 'NPO', 'STEL', 'TRGP', 'GNW', 'CUBI', 'SGH', 'ZI', 'NFBK', 'RSG', 'QNST', 'ALGN', 'TWO', 'APLE', 'DRQ', 'BANC', 'WH', 'CROX', 'NOVT', 'IRM', 'WAB', 'KDP', 'VVI', 'DVA', 'EIG', 'POST', 'AMKR', 'CIVI', 'GEV', 'FDP', 'ACM', 'ALV', 'ZD', 'VREX', 'AMN', 'ARMK', 'NXPI', 'CARG', 'SPNT', 'GPRE', 'GME', 'EXEL', 'GS', 'CCI', 'MKSI', 'SONO', 'DOW', 'EL', 'RGA', 'AZTA', 'MRNA', 'FLR', 'GM', 'RWT', 'IPGP', 'ALEX', 'MSM', 'KN', 'DEA', 'EPAM', 'IR', 'WEX', 'EQIX', 'UPS', 'SUPN', 'CHUY', 'AMR', 'ALGT', 'COR', 'PCRX', 'CCRN',

In [18]:
# Write the combined daily price table to stock_daily.csv inside data_dir.
stock_df.to_csv(
    path_or_buf=os.path.join(data_dir, 'stock_daily.csv'),
)