<a href="https://colab.research.google.com/github/muchcreative/15Yr-Free-Historical-Data-and-Cleaning/blob/main/YF_FullHistroicalsDownloader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
'''
  - Download all applicable tickers from Yahoo Finance (YF)
  - Compare with IEX and store the data
  - Storage formats considered: HDF5 / CSV / Pickle
  - Pickle is better suited to temporary storage
  - If you are regularly updating the historicals, I recommend CSV or HDF5
'''
from google.colab import drive
drive.mount('/content/drive')

%pip install yfinance #Install modules as needed, if you using an ide use the python magic function "%pip install" to always get the latest version
import yfinance as yf
import pandas as pd
import h5py
from pathlib import Path
import json
import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks/April')

from pipelines import loaders, downloaders, url_generator
print("Imported Libraries") 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Imported Libraries


In [2]:
#JSON files holding the two constituent lists that make up the download universe
iex_sp500_constituents_filepath = '/content/drive/MyDrive/Colab Notebooks/April/data/iexS&P500constituents.json'
missing_iex_constituents_filepath = '/content/drive/MyDrive/Colab Notebooks/April/data/missingiexconstituents.json'

with open(iex_sp500_constituents_filepath, 'r') as f:
  iex_sp500_constituents = json.loads(f.read())

with open(missing_iex_constituents_filepath, 'r') as f:
  missing_iex_constituents = json.loads(f.read())

#Extra market-level symbols downloaded alongside the constituents
market_tickers = ['SPY', 'DIA', 'QQQ', '^VIX']

#Full list of tickers to download: both constituent lists followed by the market symbols
tickers = iex_sp500_constituents + missing_iex_constituents
tickers += market_tickers
start_date = '2017-01-01'

In [58]:
#Report how many tickers are queued for download
print(f'We need to download {len(tickers)} tickers')

#Work on one small batch at a time so downloads and file uploads can be done in parts
sliced_tickers = tickers[0:10]
print(sliced_tickers)

We need to download 848 tickers
['A', 'AAL', 'AAP', 'AAPL', 'ABBV', 'ABC', 'ABMD', 'ABT', 'ACN', 'ADBE']


In [50]:
def download_yf_tickers(tickers, period='15y'):
  '''
  Description:
    - Requests the price history of every ticker from yfinance with "Ticker.history"
    - period is passed straight to "Ticker.history" (default '15y')
    - auto_adjust=True is used so the OHLC values come back adjusted
    - A ticker whose history comes back as an empty DataFrame is counted as not available
  Returns:
    - Tuple of (historicals dict keyed by ticker,
                list of tickers with a history,
                list of tickers without a history)
  '''
  historicals = {}
  found_on_yf, absent_from_yf = [], []

  for symbol in tickers:
    history = yf.Ticker(symbol).history(period=period, auto_adjust=True)
    if not history.empty:
      historicals[symbol] = history
      found_on_yf.append(symbol)
      continue
    absent_from_yf.append(symbol)
  return (historicals, found_on_yf, absent_from_yf)

#Download the current batch and split the tickers by whether YF returned a history
(historicals,
 tickers_avaliable_on_yf,
 tickers_not_avaliable_on_yf) = download_yf_tickers(tickers=sliced_tickers)

In [51]:
#Keep track of which tickers were available on YF and which were not
def record_attendance_of_tickers_on_yf(tickers_avaliable_on_yf,
                                       tickers_not_avaliable_on_yf,
                                       logs_filepath):
  '''
  Description:
    - Logs the tickers that were available on YF and the ones that were not
    - Each list is handed to its own recorder together with logs_filepath
  Returns:
    - None
  '''
  #One recorder per list; they are kept as separate functions for readability
  record_avaliable_tickers_to_json(tickers_avaliable_on_yf=tickers_avaliable_on_yf,
                                   filepath=logs_filepath)
  record_missing_tickers_to_json(tickers_not_avaliable_on_yf=tickers_not_avaliable_on_yf,
                                 filepath=logs_filepath)
  print('Attendance Taken and Recorded to Jsons Respectively')

def record_avaliable_tickers_to_json(tickers_avaliable_on_yf, filepath):
  '''
  Description:
    - Appends tickers_avaliable_on_yf to the list stored in
      '<filepath>/avaliable_yf_tickers.json', creating the file if it does not exist
    - The file is read first and then rewritten from scratch, so no bytes of the
      previous content can be left behind after the new JSON
  Returns:
    - None
  '''
  avaliable_tickers_log_filepath = f'{filepath}/avaliable_yf_tickers.json'

  if Path(avaliable_tickers_log_filepath).is_file(): #Start from the tickers that were logged earlier
    with open(avaliable_tickers_log_filepath, 'r', encoding='utf-8') as f:
      avaliable_tickers = json.load(f)
  else: #Nothing logged yet
    avaliable_tickers = []
  avaliable_tickers.extend(tickers_avaliable_on_yf)

  #'w' truncates the file before writing, unlike seeking to 0 in 'r+' mode
  with open(avaliable_tickers_log_filepath, 'w', encoding='utf-8') as f:
    json.dump(avaliable_tickers, f, ensure_ascii=False, indent=4)
  print('Avaliable tickers have been logged to avaliable tickers list')
  return

def record_missing_tickers_to_json(tickers_not_avaliable_on_yf, filepath):
  '''
  Description:
    - Appends tickers_not_avaliable_on_yf to the list stored in
      '<filepath>/missing_yf_tickers.json', creating the file if it does not exist
    - The file is read first and then rewritten from scratch, so no bytes of the
      previous content can be left behind after the new JSON
  Returns:
    - None
  '''
  missing_tickers_log_filepath = f'{filepath}/missing_yf_tickers.json'

  if Path(missing_tickers_log_filepath).is_file(): #Start from the tickers that were logged earlier
    with open(missing_tickers_log_filepath, 'r', encoding='utf-8') as f:
      missing_tickers = json.load(f)
  else: #Nothing logged yet
    missing_tickers = []
  missing_tickers.extend(tickers_not_avaliable_on_yf)

  #'w' truncates the file before writing, unlike seeking to 0 in 'r+' mode
  with open(missing_tickers_log_filepath, 'w', encoding='utf-8') as f:
    json.dump(missing_tickers, f, ensure_ascii=False, indent=4)
  print('Missing tickers have been logged to missing tickers list')
  return

#Folder that holds the availability logs written by the recorders
logs_filepath = '/content/drive/MyDrive/Colab Notebooks/April/data/yf/logs'
record_attendance_of_tickers_on_yf(tickers_avaliable_on_yf=tickers_avaliable_on_yf,
                                   tickers_not_avaliable_on_yf=tickers_not_avaliable_on_yf,
                                   logs_filepath=logs_filepath)

Avaliable tickers have been logged to avaliable tickers list
Missing tickers have been logged to missing tickers list
Attendance Taken and Recorded to Jsons Respectively


In [None]:
def format_historicals_to_csv(historicals):
  '''
  Description:
    - Remove dividends and stock splits, as adjusted OHLC will already consider them
    - Columns that are already gone are skipped, so the cell can be re-run on
      historicals that were formatted before
    - The passed dict is updated in place
  Returns:
    - Formatted historicals for a CSV file (the same dict object)
  '''
  for ticker in historicals:
    #errors='ignore' keeps a second pass (or a frame without these columns) from raising KeyError
    historicals[ticker] = historicals[ticker].drop(['Dividends', 'Stock Splits'],
                                                   axis='columns',
                                                   errors='ignore')
  return historicals

def format_historicals_to_hdf5(historicals):
  '''
  Description:
    - Remove dividends and stock splits, as adjusted OHLC will already consider them
    - Move the 'Date' index into a column and change the datetimes to timestamps
      (float seconds), as HDF5 does not accept datetimes
    - Frames that were already formatted are left as they are, so the cell can be re-run
    - The passed dict is updated in place
  Returns:
    - Formatted historicals for an HDF5 file (the same dict object)
  '''
  for ticker in historicals:
    #errors='ignore' keeps a second pass (or a frame without these columns) from raising KeyError
    frame = historicals[ticker].drop(['Dividends', 'Stock Splits'],
                                     axis='columns',
                                     errors='ignore')
    if 'Date' not in frame.columns: #Dates are still in the index on the first pass
      frame = frame.reset_index()
    if pd.api.types.is_datetime64_any_dtype(frame['Date']): #Skip dates that are already timestamps
      frame['Date'] = frame['Date'].apply(lambda x: x.timestamp())
    historicals[ticker] = frame
  return historicals

#Apply the formatter that matches the file format you are going to save to
#historicals = format_historicals_to_csv(historicals)
historicals = format_historicals_to_hdf5(historicals=historicals)

In [55]:
#Folder the per-ticker historical files (CSV or HDF5) are saved to and loaded from
historicals_filepath = '/content/drive/MyDrive/Colab Notebooks/April/data/yf'

#If you prefer CSV files
def save_historicals_to_csv(historicals, filepath):
  '''
  Description:
    - Writes every DataFrame in historicals to '<filepath>/<ticker>.csv' with "to_csv"
    - Prints a line per saved ticker and a final confirmation
  Returns:
    - None
  '''
  for ticker, history in historicals.items():
    history.to_csv(f'{filepath}/{ticker}.csv')
    print(f'Ticker {ticker} Saved to CSV')
  print('All Tickers Have Been Saved')

#If you prefer HDF5 files, you will need to run the HDF5 formatter on the historicals first
#Alternatively you can use pd.HDFStore, but I prefer to have the data saved as a numpy array
def save_historicals_to_hdf5(historicals, filepath):
  '''
  Description:
    - Writes every entry of historicals to its own file '<filepath>/<ticker>.hdf5'
    - The file is opened in 'w' mode and the data is stored gzip-compressed
      as dataset '15Y' inside the group 'historicals'
  Returns:
    - None
  '''
  for ticker, history in historicals.items():
    with h5py.File(f'{filepath}/{ticker}.hdf5', 'w') as hdf5_file:
      hdf5_file.create_group('historicals').create_dataset(name='15Y',
                                                           data=history,
                                                           compression='gzip')
    print(f'Saved {ticker} as HDF5')

#Call the writer that matches the formatter you applied to the historicals
#save_historicals_to_csv(historicals, historicals_filepath)
save_historicals_to_hdf5(historicals=historicals, filepath=historicals_filepath)

Saved ^VIX as HDF5


In [56]:
def check_if_tickers_were_saved_successfully(tickers_avaliable_on_yf, 
                                        filepath,
                                        save_type='csv'):
  '''
  Description:
    - Checks that '<filepath>/<ticker>.<save_type>' exists for every given ticker
    - Prints a line for each ticker whose file is missing
    - save_type must be "csv" or "hdf5", otherwise the assert fails
  Returns:
    - List of the tickers that have no saved file
  '''
  assert save_type in ['csv', 'hdf5'], 'Save type must be "csv" or "hdf5"'

  tickers_not_saved = []
  for ticker in tickers_avaliable_on_yf:
    if not Path(f'{filepath}/{ticker}.{save_type}').is_file():
      print(f'{ticker} is missing')
      tickers_not_saved.append(ticker)
  return tickers_not_saved

#Verify that a file exists for every downloaded ticker
#Change save_type to 'csv' or 'hdf5' depending on the format you used
tickers_not_saved = check_if_tickers_were_saved_successfully(tickers_avaliable_on_yf,
                                                             historicals_filepath,
                                                             save_type='hdf5')
print('Tickers NOT saved successfully were {}'.format(tickers_not_saved))

Tickers NOT saved successfully were []


In [57]:
#For reference on loading the database
def load_csv_tickers_as_pd_historicals(tickers, filepath):
  '''
    Description:
      - Takes tickers as a list ['A', 'AAPL', 'AMZN']
      - Reads '<filepath>/<ticker>.csv' for each ticker with pandas "pd.read_csv",
        using the 'Date' column as the index
    Returns:
      - Historicals dict() containing formatted pandas DataFrames with tickers as keys
  '''
  #The docstring has to be the first statement of the function to be picked up as its __doc__
  historicals = dict()
  for ticker in tickers:
    csv_filepath = f'{filepath}/{ticker}.csv'
    dataset = pd.read_csv(csv_filepath, index_col='Date')
    historicals[ticker] = dataset
  return historicals

def load_hdf5_tickers_as_pd_historicals(tickers, filepath):
  '''
    Description:
      - Takes tickers as a list ['A', 'AAPL', 'AMZN']
      - Reads dataset '15Y' of group 'historicals' from '<filepath>/<ticker>.hdf5'
      - Labels the columns Date / Open / High / Low / Close / Volume, converts 'Date'
        from timestamps in seconds back to datetimes and uses it as the index
    Returns:
      - Historicals dict() containing pandas DataFrames with tickers as keys
  '''
  column_names = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume']
  historicals = {}

  for ticker in tickers:
    with h5py.File(f'{filepath}/{ticker}.hdf5', 'r') as hdf5_file:
      raw_values = hdf5_file['historicals']['15Y'][()] #[()] reads the whole dataset while the file is open

    historicals[ticker] = (
        pd.DataFrame(data=raw_values, columns=column_names)
        .assign(Date=lambda frame: pd.to_datetime(frame['Date'], unit='s'))
        .set_index('Date')
    )
  return historicals

#Load one ticker back with the loader that matches the saved file format
#historicals = load_csv_tickers_as_pd_historicals(['A'], historicals_filepath)
historicals = load_hdf5_tickers_as_pd_historicals(tickers=['A'], filepath=historicals_filepath)