In [None]:
#Install modules as needed
%pip install yfinance #If you are using an ide use the python magic function "%pip install" to always get the latest version
import yfinance as yf
import pandas as pd
import h5py
from pathlib import Path
import json

print("Importing Complete") 

Importing Complete


In [None]:
#Load the S&P 500 constituents we need to download from Yahoo Finance. This is the file produced in the Part 1 Tutorial.
sp500_constituents_filepath = '/p2data/S&P500 Consitutents 20070101-20220116.json' #It has been placed in p2data for easy access

with open(sp500_constituents_filepath, 'r') as f:
  sp500_constituents = json.load(f) #Must be a JSON array, because it is concatenated with a list below

additional_tickers = ['SPY','DIA','QQQ','^VIX'] #You may add any additional tickers here that are not in the S&P 500

tickers = sp500_constituents + additional_tickers
print('We need to download {} tickers'.format(len(tickers))) #We see that there are 848 tickers we need to download

We need to download 848 tickers


In [None]:
#848 tickers is a lot of data, so slice the tickers list and download about 100 of them at a time.
#Start with [:100], then [100:200], then [200:300], or use whatever batch size you are comfortable downloading at once.
#Choose a slice here, then run all the cells below to save that batch in the file format of your choice.
#Then come back to this cell and select the next slice to download.

sliced_tickers = tickers[:100]
print(sliced_tickers) #Print the tickers to double-check that they differ from the batch you just downloaded

['A', 'AABA', 'AAL', 'AAP', 'AAPL', 'ABBV', 'ABC', 'ABI', 'ABKFQ', 'ABMD', 'ABT', 'ACAS', 'ACN', 'ACS', 'ADBE', 'ADCT', 'ADI', 'ADM', 'ADP', 'ADS', 'ADSK', 'ADT', 'AEE', 'AEP', 'AES', 'AET', 'AFL', 'AGN', 'AIG', 'AIV', 'AIZ', 'AJG', 'AKAM', 'AKS', 'ALB', 'ALGN', 'ALK', 'ALL', 'ALLE', 'ALTR', 'ALXN', 'AMAT', 'AMCR', 'AMD', 'AME', 'AMG', 'AMGN', 'AMP', 'AMT', 'AMZN', 'AN', 'ANDV', 'ANET', 'ANF', 'ANRZQ', 'ANSS', 'ANTM', 'AON', 'AOS', 'APA', 'APC', 'APCC', 'APD', 'APH', 'APOL', 'APTV', 'ARE', 'ARG', 'ARNC', 'ASH', 'ASN', 'AT', 'ATGE', 'ATI', 'ATO', 'ATVI', 'AV', 'AVB', 'AVGO', 'AVP', 'AVY', 'AW', 'AWK', 'AXP', 'AYE', 'AYI', 'AZO', 'BA', 'BAC', 'BAX', 'BBBY', 'BBT', 'BBWI', 'BBY', 'BC', 'BCR', 'BDK', 'BDX', 'BEAM', 'BEN']


In [None]:
#With our ticker slice we can now download the historicals. Note that the yfinance module is not a REST API; it works more like a web scraper.
#We will also build two lists recording which tickers were available on YF and which were not.
#We will address the unavailable tickers in the Part 3 Tutorial

def download_yf_tickers(tickers, period='15y'):
  '''
  Description:
    - Requests the price history of every ticker from Yahoo Finance, one ticker at a time
    - Histories are requested with auto_adjust=True to get adjusted OHLC
    - A ticker whose history comes back as an empty DataFrame is recorded as not available
    - period is passed straight to the history request, so longer periods than 15y can be requested
  Returns:
    - Tuple of (dict of history DataFrames keyed by ticker,
                list of tickers that returned data,
                list of tickers that returned nothing)
  '''
  found = []
  not_found = []
  frames_by_ticker = {}

  for symbol in tickers:
    frame = yf.Ticker(symbol).history(period=period, auto_adjust=True)
    if frame.empty: #Nothing came back for this symbol
      not_found.append(symbol)
      continue
    frames_by_ticker[symbol] = frame
    found.append(symbol)

  return (frames_by_ticker, found, not_found)

#Download this batch; keep both ticker lists so they can be logged in the next cell
historicals, tickers_avaliable_on_yf, tickers_not_avaliable_on_yf = download_yf_tickers(sliced_tickers)

- AABA: No data found, symbol may be delisted
- ABI: No data found for this date range, symbol may be delisted
- ABKFQ: No data found, symbol may be delisted
- ACAS: No data found for this date range, symbol may be delisted
- AGN: No data found, symbol may be delisted
- AKS: No data found, symbol may be delisted
- ALXN: No data found, symbol may be delisted
- ANRZQ: No data found for this date range, symbol may be delisted
- APC: No data found, symbol may be delisted
- APCC: No data found for this date range, symbol may be delisted
- APOL: No data found for this date range, symbol may be delisted
- ARG: No data found for this date range, symbol may be delisted
- AT: No data found, symbol may be delisted
- AV: No data found for this date range, symbol may be delisted
- AVP: No data found, symbol may be delisted
- AW: No data found for this date range, symbol may be delisted
- BBT: No data found, symbol may be delisted
- BCR: No data found for this date range, symbol may be delisted


In [None]:
#Keep track of the tickers that were available on YF and those that weren't. The lists are saved in the logs folder inside p2outputs.
def record_attendance_of_tickers_to_json(tickers_to_sort, 
                                        filepath,
                                        status):
  '''
  Description:
    - Adds tickers_to_sort to the JSON list stored at "<filepath>/<status>_yf_tickers.json"
    - Creates the folder and the file when they do not exist yet
    - Tickers already present in the log are not added again, so re-running a slice
      does not produce duplicate entries; first-seen order is kept
  Parameters:
    - tickers_to_sort: list of ticker strings to log
    - filepath: folder that holds the log files
    - status: label used in the file name and in the printed message
  Returns:
    - None
  '''
  log_file = Path(filepath) / f'{status}_yf_tickers.json'
  log_file.parent.mkdir(parents=True, exist_ok=True)

  if log_file.is_file():
    with open(log_file, 'r', encoding='utf-8') as f:
      logged_tickers = json.load(f)
  else: #If the file does not exist, start from an empty log
    logged_tickers = []

  #dict.fromkeys removes duplicates while preserving insertion order
  updated_tickers_lst = list(dict.fromkeys([*logged_tickers, *tickers_to_sort]))

  #Rewrite the whole file in 'w' mode so no bytes from a longer previous version are left behind
  with open(log_file, 'w', encoding='utf-8') as f:
    json.dump(updated_tickers_lst, f, ensure_ascii=False, indent=4)
  print('{0} tickers have been logged to {0} tickers list'.format(status))
  return

logs_filepath = '/p2outputs/logs' #Folder that holds the two ticker log files
#Each call adds this batch to "<status>_yf_tickers.json" inside logs_filepath
record_attendance_of_tickers_to_json(tickers_avaliable_on_yf,
                                     logs_filepath,
                                     status='avaliable')
record_attendance_of_tickers_to_json(tickers_not_avaliable_on_yf,
                                     logs_filepath,
                                     status='missing')

avaliable tickers have been logged to avaliable tickers list
missing tickers have been logged to missing tickers list


In [None]:
#Here you can choose which format to save your historicals in
#You can use pd.HDFStore to keep your historicals in a pandas format, but it requires different formatting.
#Currently the HDF5 files are saved so that they can be loaded as NumPy arrays

def format_historicals_to_csv(historicals):
  '''
  Description:
    - Remove dividends and stock splits, as adjusted OHLC will already consider them
      (frames that do not have those columns are accepted as they are)
    - Move the index into a regular column with reset_index
    - The input dict and its DataFrames are left untouched, so the cell can be re-run
      and the same download can also be passed to the other formatter
  Returns:
    - New dict of formatted historicals for a CSV file, keyed by ticker
  '''
  unwanted_columns = ['Dividends', 'Stock Splits']
  formatted = {
      ticker: history.drop(columns=unwanted_columns, errors='ignore').reset_index()
      for ticker, history in historicals.items()
  }
  print('Finished formatting historicals as csv format')
  return formatted

def format_historicals_to_hdf5(historicals):
  '''
  Description:
    - Remove dividends and stock splits, as adjusted OHLC will already consider them
      (frames that do not have those columns are accepted as they are)
    - Move the index into a regular 'Date' column with reset_index
    - Change datetime to timestamps for HDF5 as HDF5 does not accept datetimes
    - The input dict and its DataFrames are left untouched, so the cell can be re-run
      and the same download can also be passed to the other formatter
  Returns:
    - New dict of formatted historicals for an HDF5 file, keyed by ticker
  '''
  unwanted_columns = ['Dividends', 'Stock Splits']
  formatted = {}
  for ticker, history in historicals.items():
    #drop and reset_index both return new frames, so the assignment below never touches the input
    frame = history.drop(columns=unwanted_columns, errors='ignore').reset_index()
    frame['Date'] = frame['Date'].apply(lambda x: x.timestamp())
    formatted[ticker] = frame
  print('Finished formatting historicals as hdf5 format')
  return formatted

#Format the downloaded batch for HDF5; call format_historicals_to_csv instead if you are saving CSV files
hdf5_historicals = format_historicals_to_hdf5(historicals)

Finished formatting historicals as hdf5 format


In [None]:
#Save your historicals to a filepath of your choice
def save_historicals_to_csv(historicals, filepath):
  '''
  Description:
    - Writes one "<ticker>.csv" file per ticker inside filepath
    - A positional RangeIndex (what reset_index leaves behind) is not written, so the
      file does not start with an unnamed counter column
    - Any other index (for example a Date index) is written as before
  Parameters:
    - historicals: dict of DataFrames keyed by ticker
    - filepath: existing folder that receives the CSV files
  '''
  for ticker, history in historicals.items():
    csv_filepath = f'{filepath}/{ticker}.csv'
    #A RangeIndex only numbers the rows; writing it would add a junk first column to the file
    keep_index = not isinstance(history.index, pd.RangeIndex)
    history.to_csv(csv_filepath, index=keep_index)
    print('Ticker {} Saved as CSV'.format(ticker))
  print('All Tickers Have Been Saved')

def save_historicals_to_hdf5(historicals, filepath):
  '''
  Description:
    - Writes one "<ticker>.hdf5" file per ticker inside filepath (opened in 'w' mode)
    - Each file gets a group named "historicals" holding a gzip-compressed dataset named "15Y"
  Parameters:
    - historicals: dict of per-ticker data keyed by ticker
    - filepath: folder that receives the HDF5 files
  '''
  for symbol, table in historicals.items():
    target = f'{filepath}/{symbol}.hdf5'
    with h5py.File(target, 'w') as hdf5_file:
      hdf5_file.create_group('historicals').create_dataset(name='15Y',
                                                           data=table,
                                                           compression='gzip')
    print('Saved {} as HDF5'.format(symbol))

historicals_filepath = '/p2outputs' #Folder that receives one file per ticker
save_historicals_to_hdf5(hdf5_historicals, historicals_filepath) #Use save_historicals_to_csv here if you formatted for CSV

Saved A as HDF5
Saved AAL as HDF5
Saved AAP as HDF5
Saved AAPL as HDF5
Saved ABBV as HDF5
Saved ABC as HDF5
Saved ABMD as HDF5
Saved ABT as HDF5
Saved ACN as HDF5
Saved ACS as HDF5
Saved ADBE as HDF5
Saved ADCT as HDF5
Saved ADI as HDF5
Saved ADM as HDF5
Saved ADP as HDF5
Saved ADS as HDF5
Saved ADSK as HDF5
Saved ADT as HDF5
Saved AEE as HDF5
Saved AEP as HDF5
Saved AES as HDF5
Saved AET as HDF5
Saved AFL as HDF5
Saved AIG as HDF5
Saved AIV as HDF5
Saved AIZ as HDF5
Saved AJG as HDF5
Saved AKAM as HDF5
Saved ALB as HDF5
Saved ALGN as HDF5
Saved ALK as HDF5
Saved ALL as HDF5
Saved ALLE as HDF5
Saved ALTR as HDF5
Saved AMAT as HDF5
Saved AMCR as HDF5
Saved AMD as HDF5
Saved AME as HDF5
Saved AMG as HDF5
Saved AMGN as HDF5
Saved AMP as HDF5
Saved AMT as HDF5
Saved AMZN as HDF5
Saved AN as HDF5
Saved ANDV as HDF5
Saved ANET as HDF5
Saved ANF as HDF5
Saved ANSS as HDF5
Saved ANTM as HDF5
Saved AON as HDF5
Saved AOS as HDF5
Saved APA as HDF5
Saved APD as HDF5
Saved APH as HDF5
Saved APTV as

In [None]:
#Check that every ticker was saved successfully in the file format of your choice; specify the format with the 'save_type' keyword
def check_if_tickers_were_saved_successfully(tickers_avaliable_on_yf,
                                            filepath,
                                            save_type='hdf5'):
  '''
  Description:
    - Looks for "<filepath>/<ticker>.<save_type>" for every ticker given
    - Prints a line for each ticker whose file is not there
  Parameters:
    - tickers_avaliable_on_yf: list of tickers that should have a saved file
    - filepath: folder the files were saved to
    - save_type: 'csv' or 'hdf5' (anything else fails the assertion)
  Returns:
    - List of the tickers whose file was not found, in input order
  '''
  assert save_type in ['csv', 'hdf5'], 'Save type must be "csv" or "hdf5"'

  absent = []
  for symbol in tickers_avaliable_on_yf:
    if not Path(f'{filepath}/{symbol}.{save_type}').is_file():
      print("{} is missing".format(symbol))
      absent.append(symbol)
  return absent

#Report any ticker from this batch whose file is missing from historicals_filepath
#(the checker is named check_if_tickers_were_saved_successfully; the shorter name used before is not defined)
tickers_not_saved = check_if_tickers_were_saved_successfully(tickers_avaliable_on_yf,
                                                             historicals_filepath,
                                                             save_type='hdf5') #Change to csv or hdf5 depending on the format you used
print('Tickers NOT saved successfully were {}'.format(tickers_not_saved))

Tickers NOT saved successfully were []


## For Reference: How to load your database to memory

In [None]:
#For reference on how to load your database as a pandas dataframe, you will need your sp500_constituents list
#There is no need to run this if you are still downloading all your historicals from Yahoo Finance

def load_csv_tickers_as_pd_historicals(tickers, filepath):
  '''
    Description:
      - Takes tickers as a list ['A', 'AAPL', 'AMZN']
      - Reads "<filepath>/<ticker>.csv" for each ticker with pandas "pd.read_csv",
        using the 'Date' column as the index
      - No date parsing is requested from read_csv, so the index holds the values as stored in the file
    Returns:
      - Historicals dict() containing formatted pandas DataFrames with tickers as keys
  '''
  #The description above has to be the first statement of the function to count as its docstring
  historicals = dict()
  for ticker in tickers:
    csv_filepath = f'{filepath}/{ticker}.csv'
    historicals[ticker] = pd.read_csv(csv_filepath, index_col='Date')
  return historicals

def load_hdf5_tickers_as_pd_historicals(tickers, filepath):
  '''
    Description:
      - Takes tickers as a list ['A', 'AAPL', 'AMZN']
      - Reads the "15Y" dataset from the "historicals" group of each "<filepath>/<ticker>.hdf5" file
      - Labels the columns Date, Open, High, Low, Close, Volume
      - Converts Date from seconds to datetimes and uses it as the index
    Returns:
      - Historicals dict() containing pandas DataFrames with tickers as keys
  '''
  column_names = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume']
  frames_by_ticker = {}

  for symbol in tickers:
    with h5py.File(f'{filepath}/{symbol}.hdf5', 'r') as hdf5_file:
      raw_values = hdf5_file['historicals']['15Y'][()] #Pull the values out before the file is closed

    frame = pd.DataFrame(data=raw_values, columns=column_names)
    frames_by_ticker[symbol] = (
        frame
        .assign(Date=pd.to_datetime(frame['Date'], unit='s'))
        .set_index('Date')
    )
  return frames_by_ticker

#sp500_constituents also contains tickers that returned no data from Yahoo Finance and so were never saved.
#Only load the tickers whose HDF5 file actually exists, otherwise opening the first missing file raises an error.
saved_tickers = [ticker for ticker in sp500_constituents
                 if Path(f'{historicals_filepath}/{ticker}.hdf5').is_file()]
historicals = load_hdf5_tickers_as_pd_historicals(saved_tickers, historicals_filepath)