In [None]:
# We saw in Part 3A that we had a lot of missing data. In this section we will try to remedy that with another dataset.
# This dataset comes from IEX Cloud. For 15 years of data you will need a subscription, which is around 10 USD per month.
# It comes with realtime data, other financial/valuation metrics, and some level of customer support.

import numpy as np
import pandas as pd
import datetime as dt
from itertools import chain

import json
import requests

from p3Binputs.apitokens import IEX_TOKEN  # Import your IEX_TOKEN.
                                           # If you want to try this for free you can use IEX Sandbox Mode and a Sandbox IEX Token.
print("Imported Libraries") 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Imported Libraries


In [None]:
# Two options: build two full S&P 500 databases (YF and IEX) and average or merge them for the most accurate OHLC,
# or download from IEX only the historicals that are missing. This notebook downloads only the missing historicals.
# If storage allows, three databases are suggested: one for YF historicals, one for IEX, and one for the merged/averaged result.

filepath = 'p3Binputs/full_missing_tickers_and_dates'

# The file is parsed as JSON into a dict keyed by ticker.
# Presumably each value is that ticker's list of missing dates as timestamps — confirm against Part 3A.
with open(filepath, 'r') as f:
  yf_missing_tickers_and_dates = json.load(f)

tickers = list(yf_missing_tickers_and_dates.keys())
print(tickers[:5])  # Check first 5 of the tickers we will request from IEX Cloud

['AABA', 'ABI', 'ABKFQ', 'ACAS', 'ACS']


In [None]:
# You can make any changes to the tickers list here.
# For example, the Sears ticker was misspelled in Part 3A: it should be $SHLDQ, not $SHLD.
# The rename is guarded so this cell can be re-run safely: once 'SHLD' has been
# replaced, running the cell again is a no-op instead of raising ValueError/KeyError.

if 'SHLD' in yf_missing_tickers_and_dates:
  yf_missing_tickers_and_dates['SHLDQ'] = yf_missing_tickers_and_dates.pop('SHLD')
if 'SHLD' in tickers:
  tickers.remove('SHLD')
  if 'SHLDQ' not in tickers:
    tickers.append('SHLDQ')  # Appended at the end, so batch composition matches the original cell.

In [None]:
# Generate Batch Request Urls, batch urls are more efficient and IEX Cloud will charge you less credits compared to single url requests

def generate_iex_historical_batch_urls(tickers, date_length, partition_size=50, IEX_TOKEN=IEX_TOKEN):
  '''Builds the IEX Cloud batch urls used to request historical chart data.

  The tickers are split into groups of partition_size and one batch url is
  produced per group, so the two returned lists line up index by index.

  Args:
    tickers: list of ticker symbols as strings.
    date_length: string placed in the url's range parameter, i.e. how much
                 history is requested (for example '15Y').
    partition_size: integer number of tickers per batch url. Defaults to 50.
                    The original author notes IEX Cloud allows at most 100.
    IEX_TOKEN: string of your IEX token. Defaults to the imported IEX token.

  Returns:
    historical_batch_urls: list of batch url strings, one per ticker group.
    ticker_batches: list of lists holding the tickers that went into each url.
                    Indices match the historical_batch_urls indices.
  '''

  # Keep the raw groups so callers can tell which tickers each url covers.
  ticker_batches = list(partition(tickers, partition_size))
  joined_symbols = [",".join(batch) for batch in ticker_batches]

  # When testing in sandbox mode, the host below has to be swapped for the sandbox url.
  historical_batch_urls = [
      (f"https://cloud.iexapis.com/stable/stock/market/batch?symbols="
       + f"{symbols}&types=chart&range={date_length}&token={IEX_TOKEN}")
      for symbols in joined_symbols]
  return historical_batch_urls, ticker_batches

def partition(tickers, partition_size):
  '''Splits tickers into consecutive slices of at most partition_size items.

  Returns a list of lists; the last slice holds whatever remains.
  '''
  slice_starts = range(0, len(tickers), partition_size)
  return [tickers[start:start + partition_size] for start in slice_starts]

date_length = '15Y'  # Range requested from IEX; per the author, 15 years is the maximum data length.
historical_batch_urls, ticker_batches = generate_iex_historical_batch_urls(tickers, date_length)

# The generated urls embed the IEX token, so only a placeholder is printed here.
# To inspect them use print(historical_batch_urls); opening a url costs credits if not in sandbox.
print('BATCH URL HERE')
print(f'We have {len(historical_batch_urls)} batch urls that need to be downloaded')

BATCH URL HERE
We have 5 batch urls that need to be downloaded


In [None]:
# Slice the batch urls into a window for easier consumption, and move the window forward on later runs.
# Similar to what we did when downloading our Yahoo Finance historicals.

sliced_historical_batch_urls = historical_batch_urls[:5]  # All 5 batches are taken, so everything downloads at once.
sliced_ticker_batches =  ticker_batches[:5]  # Needed for analysis later; slice it the same as above so tickers stay aligned with their urls.
print(sliced_historical_batch_urls[0])  # Prints the full url, token included. Opening it costs credits if you are not in Sandbox Mode.

https://cloud.iexapis.com/stable/stock/market/batch?symbols=AABA,ABI,ABKFQ,ACAS,ACS,AGN,AKS,ALXN,ANRZQ,APC,APCC,APOL,ARG,ASN,AT,AV,AVP,AW,AYE,BBT,BCR,BDK,BF.B,BHGE,BJS,BMC,BMET,BNI,BRCM,BRK.B,BSC,BTUUQ,BXLT,CAM,CBS,CBSS,CCE,CCTYQ,CEG,CELG,CEPH,CFC,CFN,CITGQ,CMCSK,CMVT,COG,COV,CPGX,CTL&types=chart&range=15Y&token=SOME_IEX_TOKEN


In [None]:
# Request IEX Historicals from IEX Cloud and prepare to save them later as hdf5.

def download_iex_historicals(batch_urls):
  '''Downloads IEX historicals by making API requests to IEX Cloud.

  Downloaded data is the adjusted Open, High, Low, Close and Volume. Each
  batch url is requested in turn; if the request fails (connection problem,
  HTTP error status, or any other requests.exceptions.RequestException) the
  failing url and the status code (when a response was received) are printed
  and the function stops by raising SystemExit. A key error is logged when an
  adjusted OHLCV field is missing for a day, and that day is skipped.

  Args:
    batch_urls: list of IEX batch urls.

  Returns:
    historicals: dict with tickers as keys and OHLC data as list of lists.
                 Each inner list is [timestamp, fOpen, fHigh, fLow, fClose, fVolume].
                 Tickers for which no complete day was collected are not added.
    key_error_log: list of lists of the key errors that occurred. Each inner list
                   is [ticker, date string, KeyError instance].

  Raises:
    SystemExit: raised from the underlying RequestException after printing
                "Stopped at batch url: {batch_url}" and "Status Code: {...}".
  '''

  historicals = dict()
  key_error_log = []

  for batch_url in batch_urls:
    # Reset per batch: if requests.get itself raises, there is no response object,
    # and the error report must not read an undefined or stale one.
    response = None
    try:
      response = requests.get(batch_url)
      response.raise_for_status()
      batch_data = response.json()
    except requests.exceptions.RequestException as e:
      # Compare against None explicitly; a Response is falsy for error statuses.
      status_code = response.status_code if response is not None else 'no response received'
      print(f'Stopped at batch url: {batch_url}')
      print(f'Status Code: {status_code}')
      raise SystemExit(e)
    for ticker in batch_data:
      ticker_hist = list()
      for day_data in batch_data[ticker]['chart']:
        current_date = day_data['date']
        # Change to timestamp to save as hdf5. The parsed datetime is naive, so
        # .timestamp() interprets it in the machine's local timezone.
        current_timestamp = dt.datetime.strptime(current_date,"%Y-%m-%d").timestamp()
        try:
          # As per IEX Cloud documentation the 'f' in front of the OHLCV names
          # specifies the adjusted OHLCV values.
          ticker_hist.append([current_timestamp,
                              day_data['fOpen'],
                              day_data['fHigh'],
                              day_data['fLow'],
                              day_data['fClose'],
                              day_data['fVolume']])
        except KeyError as e:
          print(f"Key Error with {current_date} at {ticker} for {e}")
          key_error_log.append([ticker, current_date, e])
      # Only tickers with at least one complete day are stored.
      if ticker_hist:
        historicals[ticker] = ticker_hist
      print(f'Finished downloading {ticker}')
  return historicals, key_error_log

# Runs the download for the selected batch urls; this issues real requests to IEX Cloud.
historicals, key_error_log = download_iex_historicals(sliced_historical_batch_urls)

In [None]:
# Check your key errors. One is logged whenever a data point is missing for a day;
# the entire OHLC row for that day is then not added to the historicals.

print(key_error_log)

{'VAR': ['2011-01-27', KeyError('fHigh')], 'EQ': ['2019-08-12', KeyError('fOpen')], 'PLL': ['2019-08-12', KeyError('fOpen')]}


In [None]:
# Filter out historicals that didn't have data from IEX to prevent saving empty hdf5 arrays.
# Two things to filter out here, the ticker is missing from the historicals dictionary or 
# the historicals dictionary has the ticker key but is paired with an empty array. 

def collect_tickers_not_found_on_iex(historicals, ticker_batches):
  '''Collects tickers that were not available on IEX Cloud.

  There are two ways a ticker can count as not found on IEX. The first is
  that the ticker is missing from the historicals dict. The second is that
  the historicals dict has the ticker key but it is paired with an empty
  array. Both are checked for and returned separately.

  Args:
    historicals: dict with tickers as keys and OHLC data as a list of lists.
    ticker_batches: list of lists with each list denoting the tickers in each
                    generated batch url.

  Returns:
    missing_iex_tickers: list of requested tickers that are missing from the
                         historicals dict, in request order.
    empty_historical_tickers: list of tickers that are present in the historicals
                              dict but are paired with empty arrays.
  '''

  # Flatten the list of lists and test membership against the dict itself,
  # which is a hash lookup rather than a scan of a freshly built key list.
  missing_iex_tickers = [ticker
                         for ticker in chain.from_iterable(ticker_batches)
                         if ticker not in historicals]

  # Check if the ticker keys are paired with empty arrays. The loop variable has
  # its own name so it does not shadow the historicals argument.
  empty_historical_tickers = [ticker
                              for ticker, ticker_hist in historicals.items()
                              if not ticker_hist]
  return missing_iex_tickers, empty_historical_tickers

# The reported count adds both kinds of "not found": tickers absent from the
# historicals dict and tickers present with an empty array.
missing_iex_tickers, empty_historical_tickers = collect_tickers_not_found_on_iex(historicals, sliced_ticker_batches)
print(f'We have {len(missing_iex_tickers) + len(empty_historical_tickers)} missing tickers in this batch')
print(missing_iex_tickers[:10])  # Peek at the first 10 tickers that are absent from the historicals dict.

We have 160 missing tickers in this batch
['AABA', 'ABI', 'ABKFQ', 'ACAS', 'ACS', 'AGN', 'AKS', 'ANRZQ', 'APC', 'APCC']


In [None]:
# Let's add the missing_iex_tickers as empty lists to the historicals for data comparison later.

def add_missing_iex_tickers_to_historicals(historicals, missing_iex_tickers):
  '''Registers every missing IEX ticker in historicals with an empty list.

  The historicals dict is updated in place and the same dict is returned.
  Each ticker receives its own new list.
  '''
  historicals.update({ticker: [] for ticker in missing_iex_tickers})
  return historicals

# After this call every ticker in missing_iex_tickers has a key in historicals, paired with an empty list.
historicals = add_missing_iex_tickers_to_historicals(historicals, missing_iex_tickers)

In [None]:
# Let's see what data we are still missing after both our Yahoo Finance and IEX Cloud data downloads.

def collect_data_that_is_still_missing(historicals, yf_missing_tickers_and_dates):
  '''Collects the tickers and dates that are still missing after the IEX download and YF download.

  For each ticker in historicals, the dates listed as missing in
  yf_missing_tickers_and_dates are compared against the dates that were
  actually downloaded. Only the first element of each historical row (the
  timestamp) takes part in the comparison; the OHLCV values are ignored so a
  price or volume that happens to equal a timestamp cannot hide a missing date.

  Args:
    historicals: dict with tickers as keys and OHLC data as a list of lists,
                 each inner list starting with the date as a timestamp.
    yf_missing_tickers_and_dates: dict with tickers as keys and their missing
                                  dates as values before the IEX data addition.

  Returns:
    data_still_missing: dict with tickers as keys and, as values, a sorted
                        numpy array of the unique dates that are still missing.

  Raises:
    KeyError: if a ticker in historicals has no entry in yf_missing_tickers_and_dates.
  '''
  data_still_missing = {}
  for ticker, ticker_hist in historicals.items():
    downloaded_dates = [row[0] for row in ticker_hist]
    data_still_missing[ticker] = np.setdiff1d(yf_missing_tickers_and_dates[ticker], downloaded_dates)
  return data_still_missing

def convert_timestamps_to_datetimes(data_still_missing):
  '''Converts each ticker's missing timestamps to dates.

  Every timestamp is passed through dt.date.fromtimestamp, so the returned
  dict maps each ticker to a list of date objects in the original order.
  '''
  converted = {}
  for ticker, missing_dates in data_still_missing.items():
    converted[ticker] = list(map(dt.date.fromtimestamp, missing_dates))
  return converted

# One entry per ticker in historicals, holding the dates that are still missing.
data_still_missing = collect_data_that_is_still_missing(historicals, yf_missing_tickers_and_dates)
# converted_data_still_missing = convert_timestamps_to_datetimes(data_still_missing)  # Optional: convert timestamps to dates to easily see the difference

In [None]:
# Compute difference and see how much of the data we filled.

def calculate_metrics_of_added_data_from_iex(historicals, data_still_missing, yf_missing_tickers_and_dates):
  '''Summarises how much missing data remains and how much the IEX data filled.

  Args:
    historicals: dict with tickers as keys and OHLC data as a list of lists.
                 Only its keys are used, to decide which tickers are counted.
    data_still_missing: dict with tickers as keys and their current missing
                        dates as values after the IEX data addition.
    yf_missing_tickers_and_dates: dict with tickers as keys and their missing
                                  dates as values before the IEX data addition.

  Returns:
    tickers_and_amount_missing: dict mapping each ticker to the number of
                                dates still missing for it.
    total_dates_missing: integer sum of all dates still missing.
    tickers_and_amount_reduced: dict mapping each ticker to the number of
                                missing dates removed by adding the IEX data.
    total_reductions: integer sum of all removed missing dates.
  '''

  tickers_and_amount_missing = {}
  tickers_and_amount_reduced = {}

  for ticker in historicals:
    still_missing_count = len(data_still_missing[ticker])
    initially_missing_count = len(yf_missing_tickers_and_dates[ticker])
    tickers_and_amount_missing[ticker] = still_missing_count
    tickers_and_amount_reduced[ticker] = initially_missing_count - still_missing_count

  total_dates_missing = sum(tickers_and_amount_missing.values())
  total_reductions = sum(tickers_and_amount_reduced.values())
  return (tickers_and_amount_missing, total_dates_missing, tickers_and_amount_reduced, total_reductions)

(tickers_and_amount_missing, total_dates_missing, 
 tickers_and_amount_reduced, total_reductions) = calculate_metrics_of_added_data_from_iex(historicals, data_still_missing, yf_missing_tickers_and_dates)
# Still-missing plus filled gives the amount that was missing before the IEX download.
total_inital_missing_data = total_dates_missing + total_reductions

print(f"Amount of data still missing {total_dates_missing}")
print(f"Amount of missing data we filled {total_reductions}")
# The division below assumes there was at least one initially missing data point.
print(f"Of the {total_inital_missing_data} total inital missing data for these batches, we filled {(total_reductions/total_inital_missing_data):.2%} for these batches")

Amount of data still missing 290871
Amount of missing data we filled 36913
Of the 327784 total inital missing data for these batches, we filled 11.26% for these batches


In [None]:
# Check how many tickers have been filled by downloading data from IEX Cloud

def check_remaining_missing_tickers(data_still_missing):
  '''Splits tickers into those still missing data and those fully filled.

  A ticker counts as filled when its array of missing dates has size 0.

  Args:
    data_still_missing: dict with tickers as keys and their current missing
                        dates as values after the IEX data addition. The
                        values must expose a .size attribute (numpy arrays).

  Returns:
    remaining_missing_tickers: list of tickers that still have missing dates.
    filled_tickers: list of tickers with no missing dates left.
  '''

  filled_tickers = [ticker
                    for ticker, dates in data_still_missing.items()
                    if dates.size == 0]
  remaining_missing_tickers = [ticker
                               for ticker, dates in data_still_missing.items()
                               if not dates.size == 0]
  return remaining_missing_tickers, filled_tickers

# filled_tickers are the tickers whose missing-date array is now empty; all others remain missing.
remaining_missing_tickers, filled_tickers = check_remaining_missing_tickers(data_still_missing)
print(f"Amount of remaining missing tickers: {len(remaining_missing_tickers)}")
print(f"Amount of missing tickers filled by IEX Cloud: {len(filled_tickers)}")

Amount of remaining missing tickers: 228
Amount of missing tickers filled by IEX Cloud: 5


In [None]:
# In summary, we got rid of around 10% of the missing data and fully filled 5 tickers from our missing data.
# This is not bad, considering you will most likely have to use manual research to complete the rest of the missing tickers.
# From here I suggest checking that the data matches the YF data and creating pipelines to be able to quickly compare them.
# Additionally, we can see that at this point manual research will benefit us a lot more than just pulling data from different online datasets.

## Possible Next Steps

In [None]:
# Leave the filled tickers (nothing missing anymore) out of the formatted output,
# then save the remaining data_still_missing to a logs folder as a json.

def format_data_still_missing_for_json(data_still_missing):
  '''Formats data that is still missing into a JSON-serializable dict.

  Tickers whose collection of missing dates is empty are left out, since
  they are not missing anymore. This is decided from the data itself rather
  than from a notebook-level variable, and the input dict is not modified,
  so the cell can be re-run without raising KeyError.

  Args:
    data_still_missing: dict with tickers as keys and their still-missing
                        dates as values (numpy arrays or plain sequences).

  Returns:
    formatted_data_still_missing: dict with tickers as keys and plain lists
                                  of missing dates as values.
  '''
  formatted_data_still_missing = dict()

  for ticker, missing_dates in data_still_missing.items():
    if len(missing_dates) == 0:
      continue  # Fully filled ticker, nothing to record.
    # json cannot serialize an ndarray; tolist() also converts numpy scalars to Python ones.
    if hasattr(missing_dates, 'tolist'):
      formatted_data_still_missing[ticker] = missing_dates.tolist()
    else:
      formatted_data_still_missing[ticker] = list(missing_dates)
  return formatted_data_still_missing

# NOTE(review): the leading '/' makes this an absolute path at the filesystem root, whereas the
# input file is read from a relative path ('p3Binputs/...') — confirm this is intended.
# The target directory must already exist; open() does not create it.
save_filepath = '/p3outputs/data_still_missing_after_iex.json'
formatted_data_still_missing = format_data_still_missing_for_json(data_still_missing)

# ensure_ascii=False writes non-ASCII characters as-is; indent=4 pretty-prints the file.
with open(save_filepath, 'w', encoding='utf-8') as f:
  json.dump(formatted_data_still_missing, f, ensure_ascii=False, indent=4)

print('Data still missing has been saved as a json')

Data still missing has been saved as a json


In [None]:
# You will also need to merge the historicals from IEX into the YF historicals at some point.
# I suggest saving three separate databases: one for YF, one for IEX, and a combined one of YF and IEX.
# This avoids a lot of re-downloading of historicals if something goes wrong during the merge.
# The functions I use to merge the datasets are given below for reference.
# They do not take into account data that is wrong in the historicals; it is a naive merge.
# You will need to use a pipeline to check that the data is consistent with your other databases.

def merge_historicals(yf_historicals, iex_historicals):
  '''Merges given historicals.

  For every ticker found in either input, the available dataframes are
  concatenated (YF first, then IEX) and rows sharing the same index value
  are collapsed into one. Within such a group the first non-null value of
  each column is kept, so YF values take precedence where both sources
  have data. The merged dataframe is sorted by ascending index, i.e. from
  past to present. A ticker present in only one source is carried over
  from that source alone.

  Args:
    yf_historicals: dict with tickers as keys and OHLC data as values.
                    Each OHLC data is given as a pandas dataframe.
    iex_historicals: dict with tickers as keys and OHLC data as values.
                     Each OHLC data is given as a pandas dataframe.

  Returns:
    merged_historicals: dict with tickers as keys and OHLC data as a pandas dataframe.
  '''

  merged_historicals = dict()

  # Union of the tickers from both sources. dict.fromkeys de-duplicates while
  # keeping a deterministic order (YF tickers first, then IEX-only tickers).
  all_tickers = dict.fromkeys([*yf_historicals, *iex_historicals])
  for ticker in all_tickers:
    frames = [source[ticker]
              for source in (yf_historicals, iex_historicals)
              if ticker in source]
    combined = pd.concat(frames)
    merged_historicals[ticker] = combined.groupby(combined.index).first().sort_index()
  return merged_historicals

def test_for_ordinance(historicals):
  '''Tests that historicals are all in chronological order.

  Args:
    historicals: dict with tickers as keys and OHLC data as values.
                 Each OHLC data is given as a pandas dataframe. 

  Returns:
    ordinance: bool. Will be True if all the data is in chronological order
               or False if the data is not in chronological order
  '''

  ordinance = True
  for ticker in historicals:
    current_date = dt.date.min
    for date in historicals[ticker].index:
      if current_date > date or current_date == date:
        print(f'Error with ticker {ticker} on {date}')
        ordinance = False
        return ordinance
      else:
        current_date = date
  return ordinance

In [None]:
# From here you are free to save these downloaded historicals.
# But again, I suggest creating a complete IEX database and then merging the two in a separate database.
# This way you can check that the data matches up in the pipeline and complete any safety checks without overwriting the separate databases.