In [3]:
#We saw that we had a lot of missing data in Part 3A; in this section we will try to remedy that with another dataset
#This dataset comes from IEX Cloud. For 15Yr data you will need a subscription, which is around 10USD per month
#It comes with realtime data, other financial/valuation metrics, and some level of customer support

import numpy as np
import datetime as dt
from itertools import chain

import json
import requests

from p3Binputs.apitokens import IEX_TOKEN #Import your IEX_TOKEN
                                          #If you want to try this for free you can use IEX Sandbox Mode and a Sandbox Token
print("Imported Libraries") 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Imported Libraries


In [4]:
#You can decide if you want to create two full SP500 databases and average or merge your results for the most accurate OHLC
#Or you can download only the historicals you are missing; here I will download only the historicals we are missing
#But I suggest trying three databases, if you can fit them: one for YF historicals, one for IEX, and one for the merged or averaged results

filepath = 'p3Binputs/full_missing_tickers_and_dates'

#Read the whole file and parse it as JSON; the top-level keys are used as the ticker list below
with open(filepath, 'r') as missing_data_file:
  full_missing_tickers_and_dates = json.loads(missing_data_file.read())

tickers = [ticker for ticker in full_missing_tickers_and_dates]
print(tickers[:5]) #Check first 5 of the tickers we will request from IEX Cloud

['AABA', 'ABI', 'ABKFQ', 'ACAS', 'ACS']


In [5]:
#You can make any changes to the tickers list here
#For example, I know the Sears ticker was misspelled in Part 3A; it should be $SHLDQ, not SHLD

#Guarded so this cell can be re-run (or run on data without SHLD) without raising ValueError/KeyError
if 'SHLD' in tickers:
  tickers.remove('SHLD')
  tickers.append('SHLDQ')
if 'SHLD' in full_missing_tickers_and_dates:
  #Move the missing dates recorded under the misspelled key to the corrected ticker
  full_missing_tickers_and_dates['SHLDQ'] = full_missing_tickers_and_dates.pop('SHLD')

In [56]:
#Generate batch request urls; batch urls are more efficient and IEX Cloud will charge you fewer credits compared to single url requests
def get_historical_batch_urls(tickers, date_length, IEX_TOKEN=IEX_TOKEN):
  '''Build one IEX Cloud batch chart url per partition of tickers.

  Args:
    tickers: list of ticker symbols, split into batches by _partition
             (50 tickers per batch with its default partition size).
    date_length: value placed in the url's `range` query parameter.
    IEX_TOKEN: API token placed in the url's `token` query parameter.

  Returns:
    (historical_batch_urls, ticker_batches): the urls, and the list of
    tickers that went into each url, in the same order.
  '''
  #Keep a record of which tickers ended up in which batch
  ticker_batches = list(_partition(tickers))

  #Batch Url should be changed to the respective sandbox mode url if you are testing if it works, you need to use a sandbox url
  historical_batch_urls = [
      (f"https://cloud.iexapis.com/stable/stock/market/batch?symbols="
       + f"{ticker_partition}&types=chart&range={date_length}&token={IEX_TOKEN}")
      for ticker_partition in map(",".join, ticker_batches)
  ]
  return historical_batch_urls, ticker_batches

def _partition(tickers, partition_size=50):
  partitioned_tickers = []
  for i in range(0, len(tickers), partition_size):
    partitioned_tickers.append(tickers[i:i+partition_size])
  return partitioned_tickers

date_length = '15Y' #Max Data Length for IEX is 15 years
historical_batch_urls, ticker_batches = get_historical_batch_urls(tickers, date_length)
batch_url_count = len(historical_batch_urls)

print('BATCH URL HERE')  #print(historical_batch_urls) #Can click the printed url batches to check, but will cost credits if not in sandbox
print('We have {} batch urls that need to be downloaded'.format(batch_url_count))

BATCH URL HERE
We have 5 batch urls that need to be downloaded


In [57]:
#Segregate Batch URLS for easier consumption and slowly move the sliding window
#Similar to what we did when downloading our Yahoo Finance Historicals
#Reminder to delete API Key

#One shared window keeps the urls and their ticker batches sliced identically for tracking purposes
batch_window = slice(0, 5) #I am just going to do it all; move this window to download in stages
sliced_historical_batch_urls = historical_batch_urls[batch_window]
sliced_ticker_batches = ticker_batches[batch_window] #Need this for analysis later

#Print the url with the token replaced by a placeholder so the saved notebook output never contains the API key
#The token is the last query parameter of the batch url, so everything after '&token=' is dropped
redacted_url = sliced_historical_batch_urls[0].split('&token=')[0] + '&token=SOME_IEX_TOKEN'
print(redacted_url) #Check the url shape; substitute your own token if you want to open it (costs credits if not in Sandbox Mode)

https://cloud.iexapis.com/stable/stock/market/batch?symbols=AABA,ABI,ABKFQ,ACAS,ACS,AGN,AKS,ALXN,ANRZQ,APC,APCC,APOL,ARG,ASN,AT,AV,AVP,AW,AYE,BBT,BCR,BDK,BF.B,BHGE,BJS,BMC,BMET,BNI,BRCM,BRK.B,BSC,BTUUQ,BXLT,CAM,CBS,CBSS,CCE,CCTYQ,CEG,CELG,CEPH,CFC,CFN,CITGQ,CMCSK,CMVT,COG,COV,CPGX,CTL&types=chart&range=15Y&token=SOME_IEX_TOKEN


In [None]:
#Request IEX Historicals from IEX Cloud and prepare to save them later as hdf5
def get_iex_historicals(hist_batch_urls):
  '''Download adjusted OHLCV rows for every ticker in each IEX batch url.

  Args:
    hist_batch_urls: iterable of batch chart urls to request, in order.

  Returns:
    (historicals, key_error_log):
      historicals maps ticker -> list of
        [timestamp, fOpen, fHigh, fLow, fClose, fVolume] rows. A ticker for
        which no complete day could be read is left out of the dict.
      key_error_log is a list of [ticker, date, KeyError] entries, one per
        day that was skipped because an adjusted field was missing.

  Raises:
    SystemExit: if a request fails or returns an error status; the batch url
      and the status code (when a response was received) are printed first.
  '''
  #The 'f' in front of the OHLC names stands for the fully adjusted prices
  adjusted_fields = ('fOpen', 'fHigh', 'fLow', 'fClose', 'fVolume')
  historicals = dict()
  key_error_log = []
  for hist_batch_url in hist_batch_urls:
    #Stays None when requests.get itself fails (no response to read a status from)
    response = None
    try:
      response = requests.get(hist_batch_url)
      response.raise_for_status()
      hist_response = response.json()
    except requests.exceptions.RequestException as e:
      print('Stopped at batch url: {}'.format(hist_batch_url))
      #Compare with None explicitly: a Response with an error status is falsy
      status_code = response.status_code if response is not None else None
      print('Status Code: {}'.format(status_code))
      raise SystemExit(e)
    for ticker, ticker_payload in hist_response.items():
      ticker_hist = list()
      #A ticker without a 'chart' entry is treated as having no data
      for day in ticker_payload.get('chart') or []:
        current_date = day['date']
        #Change to timestamp to save as hdf5; a naive datetime is interpreted in local time
        current_timestamp = dt.datetime.strptime(current_date, "%Y-%m-%d").timestamp()
        try:
          #Some days are missing adjusted fields (e.g. fHigh); such a day is skipped entirely
          ticker_hist.append([current_timestamp] + [day[field] for field in adjusted_fields])
        except KeyError as e:
          print("Key Error with {} at {} for {}".format(ticker, current_date, e))
          key_error_log.append([ticker, current_date, e])
      #Only record tickers that produced at least one complete row
      if ticker_hist:
        historicals[ticker] = ticker_hist
      print('Finished downloading {}'.format(ticker))
  return historicals, key_error_log

#Runs the requests for the selected batch urls; per the notes above this costs credits when not in sandbox mode
historicals, key_error_log = get_iex_historicals(sliced_historical_batch_urls)

In [22]:
#Check your key errors; these occur when a data point is missing, and the entire OHLC row for that day will not get added
#Each entry is [ticker, date, KeyError], as appended by get_iex_historicals
print(key_error_log)

{'VAR': ['2011-01-27', KeyError('fHigh')], 'EQ': ['2019-08-12', KeyError('fOpen')], 'PLL': ['2019-08-12', KeyError('fOpen')]}


In [24]:
#Filter out historicals that didn't have data from IEX, to prevent saving empty hdf5 arrays
#Two cases to filter out here: the historicals dict has the ticker key but an empty array, or the ticker is missing from the historicals dict entirely

def filter_out_historicals_missing_from_iex(historicals, sliced_ticker_batches):
  '''Find requested tickers for which IEX gave us no usable data.

  Args:
    historicals: dict mapping ticker -> list of downloaded rows.
    sliced_ticker_batches: list of ticker lists, one per requested batch.

  Returns:
    (missing_iex_tickers, empty_historical_tickers):
      missing_iex_tickers are requested tickers that are not a key of
        historicals, in request order.
      empty_historical_tickers are keys of historicals whose row list is
        empty, in dict order.
  '''
  requested_tickers = chain.from_iterable(sliced_ticker_batches) #Flatten it for a for loop

  #Dict membership is a hash lookup; no need to rebuild a key list for every ticker
  missing_iex_tickers = [ticker
                         for ticker in requested_tickers
                         if ticker not in historicals]

  empty_historical_tickers = [ticker
                              for ticker, ticker_hist in historicals.items()
                              if not ticker_hist]
  return missing_iex_tickers, empty_historical_tickers

missing_iex_tickers, empty_historical_tickers = filter_out_historicals_missing_from_iex(historicals, sliced_ticker_batches)
#Both kinds of gaps (absent from the response, or present but empty) count as missing
total_missing_in_batch = len(missing_iex_tickers) + len(empty_historical_tickers)
print('We have {} missing tickers in this batch'.format(total_missing_in_batch))
print(missing_iex_tickers[:10]) #We can take a peek out of our missing tickers here

We have 160 missing tickers in this batch
['AABA', 'ABI', 'ABKFQ', 'ACAS', 'ACS', 'AGN', 'AKS', 'ANRZQ', 'APC', 'APCC']


In [25]:
#Let's add the missing_iex_tickers as empty lists to the historicals for data comparison
def add_missing_iex_tickers_to_historicals(historicals, missing_iex_tickers):
  '''Give every missing ticker an empty row list in historicals.

  The dict is modified in place (an existing entry for a listed ticker is
  overwritten) and the same dict object is returned.
  '''
  #Each ticker gets its own fresh list
  historicals.update((absent_ticker, []) for absent_ticker in missing_iex_tickers)
  return historicals

#The function updates historicals in place and returns the same dict, so this rebinds the name to the same object
historicals = add_missing_iex_tickers_to_historicals(historicals, missing_iex_tickers)

In [26]:
#Let's see what data is still missing by comparing our missing dates against the data we actually downloaded
#missing_iex_tickers were added to historicals as empty lists in the previous cell, so they are covered here - TODO: double check this

def check_data_that_is_still_missing_from_batches(historicals, 
                                                  missing_iex_tickers,
                                                  full_missing_tickers_and_dates):
  '''For each ticker, list the missing dates that IEX did not fill.

  Args:
    historicals: dict mapping ticker -> list of rows whose first element is
      the row's timestamp (the layout built by get_iex_historicals).
    missing_iex_tickers: unused; kept so existing callers keep working.
    full_missing_tickers_and_dates: dict mapping ticker -> the dates we were
      missing. Assumed to be timestamps comparable to the row timestamps -
      TODO confirm against the Part 3A output.

  Returns:
    dict mapping ticker -> sorted ndarray of the dates from
    full_missing_tickers_and_dates[ticker] that have no downloaded row.

  Raises:
    KeyError: if a ticker in historicals has no entry in
      full_missing_tickers_and_dates.
  '''
  data_still_missing = dict()
  for ticker, ticker_hist in historicals.items():
    #Compare against the timestamp column only; np.setdiff1d flattens its inputs,
    #so passing whole rows would let prices/volumes be mistaken for dates
    iex_timestamps = [row[0] for row in ticker_hist]
    data_still_missing[ticker] = np.setdiff1d(full_missing_tickers_and_dates[ticker],
                                              iex_timestamps)
  return data_still_missing

def convert_timestamps_to_datetimes(data_still_missing):
    '''Return a new dict with each ticker's timestamps turned into dates.

    Every value is converted with dt.date.fromtimestamp, so the results are
    datetime.date objects in local time. The input dict is left untouched.
    '''
    converted = dict()
    for ticker, missing_dates in data_still_missing.items():
        converted[ticker] = list(map(dt.date.fromtimestamp, missing_dates))
    return converted

#Result maps each ticker in historicals to the array returned by np.setdiff1d for that ticker
data_still_missing = check_data_that_is_still_missing_from_batches(historicals, 
                                                                   missing_iex_tickers,
                                                                   full_missing_tickers_and_dates)
#Optional, left disabled: uncomment to get human-readable dates instead of timestamps
#converted_data_still_missing = convert_timestamps_to_datetimes(data_still_missing) #Can convert timestamps to dates to easily see the difference

In [28]:
#Compute difference and see how much of the data we filled
def calculate_data_reductions_from_iex(historicals,
                                      data_still_missing,
                                      full_missing_tickers_and_dates):
  '''Count, per ticker and in total, what is still missing and what got filled.

  Only tickers that are keys of historicals are counted.

  Returns:
    (amount_still_missing, total_still_missing,
     amount_data_reductions, total_reductions):
      amount_still_missing maps ticker -> number of dates still missing.
      amount_data_reductions maps ticker -> number of initially missing
        dates minus the number still missing.
      The totals are the sums of the respective dict values.
  '''
  amount_still_missing = dict()
  amount_data_reductions = dict()
  for ticker in historicals:
    still_missing_count = len(data_still_missing[ticker])
    initially_missing_count = len(full_missing_tickers_and_dates[ticker])
    amount_still_missing[ticker] = still_missing_count
    amount_data_reductions[ticker] = initially_missing_count - still_missing_count

  total_still_missing = sum(amount_still_missing.values())
  total_reductions = sum(amount_data_reductions.values())
  return (amount_still_missing, total_still_missing, amount_data_reductions, total_reductions)

(amount_still_missing, total_still_missing, 
 amount_data_reductions, total_reductions) = calculate_data_reductions_from_iex(historicals, 
                                                                                data_still_missing, 
                                                                                full_missing_tickers_and_dates)
total_inital_missing_data = total_still_missing + total_reductions
#Guard the ratio so a batch with nothing missing to begin with reports 0% instead of raising ZeroDivisionError
filled_ratio = total_reductions / total_inital_missing_data if total_inital_missing_data else 0.0

print("Amount of data still missing {}".format(total_still_missing))
print("Amount of missing data we filled {}".format(total_reductions))
print("Of the {} total inital missing data for these batches, we filled {:.2%} for these batches".format(total_inital_missing_data,
                                                                                                         filled_ratio))

Amount of data still missing 290871
Amount of missing data we filled 36913
Of the 327784 total inital missing data for these batches, we filled 11.26% for these batches


In [45]:
def check_remaining_missing_tickers(data_still_missing):
  '''Split tickers by whether any of their dates are still missing.

  Args:
    data_still_missing: dict mapping ticker -> array of still-missing dates
      (each value must expose a .size attribute).

  Returns:
    (remaining_missing_tickers, filled_tickers): tickers whose array is
    non-empty, and tickers whose array is empty, each in dict order.
  '''
  filled_tickers = [ticker
                    for ticker, dates in data_still_missing.items()
                    if dates.size == 0]
  remaining_missing_tickers = [ticker
                               for ticker, dates in data_still_missing.items()
                               if dates.size != 0]
  return remaining_missing_tickers, filled_tickers

remaining_missing_tickers, filled_tickers = check_remaining_missing_tickers(data_still_missing)
remaining_ticker_count = len(remaining_missing_tickers)
filled_ticker_count = len(filled_tickers)
print("Amount of remaining missing tickers: {}".format(remaining_ticker_count))
print("Amount of missing tickers filled by IEX Cloud: {}".format(filled_ticker_count))

Amount of remaining missing tickers: 228
Amount of missing tickers filled by IEX Cloud: 5


In [None]:
#In summary, we got rid of around 10% of the missing data and 5 full tickers from our missing data
#This is not bad, considering you will most likely have to use manual research to complete the rest of the missing tickers
#From here I suggest looking at the data to see whether it matches the YF data, and creating pipelines to be able to quickly compare them
#Additionally, we can see that at this point manual research will benefit us a lot more than just pulling data from different online datasets

## Possible Next Steps

In [65]:
#Drop the filled_tickers from the data_still_missing dict
#Then save the data_still_missing to a logs folder as a json

def format_data_still_missing_for_json(data_still_missing):
  '''Build a JSON-serialisable copy of the still-missing dates.

  Tickers whose array of missing dates is empty (fully filled, not missing
  anymore) are left out. The input dict is not modified, so the cell can be
  re-run safely, and no notebook-level globals are consulted.

  Args:
    data_still_missing: dict mapping ticker -> ndarray of missing dates.

  Returns:
    dict mapping ticker -> plain list of missing dates, for tickers that
    still have at least one missing date.
  '''
  formatted_data_still_missing = dict()

  for ticker, missing_dates in data_still_missing.items():
    #Skip filled tickers as they are empty here (not missing anymore)
    if len(missing_dates) == 0:
      continue
    #Need ndarray as list for json
    formatted_data_still_missing[ticker] = missing_dates.tolist()
  return formatted_data_still_missing

#NOTE(review): this is an absolute path at the filesystem root, while the input file above is read from the relative 'p3Binputs/' folder - confirm '/p3outputs' is the intended location
save_filepath = '/p3outputs/data_still_missing_after_iex.json'
formatted_data_still_missing = format_data_still_missing_for_json(data_still_missing)

#open(..., 'w') overwrites any existing file at save_filepath; ensure_ascii=False keeps non-ASCII characters as-is in the utf-8 file
with open(save_filepath, 'w', encoding='utf-8') as f:
  json.dump(formatted_data_still_missing, f, ensure_ascii=False, indent=4)

print('Data still missing has been saved as a json')

Data still missing has been saved as a json


In [66]:
#You will also need to one day merge the historicals from IEX into YF
#I suggest saving three separate databases: one for YF, one for IEX, and a combined one of YF and IEX
#This will avoid a lot of re-downloading of historicals if something goes wrong during the merge
#The functions I use to merge the datasets are as follows, for reference
#This does not take into account data that is wrong in the historicals; it is a naive merge
#You will need to use a pipeline to check that the data is consistent with your other databases

import pandas as pd
import datetime as dt

def merge_historicals(yf_historicals, iex_historicals):
  '''Naively merge two ticker -> DataFrame mappings into one.

  For every ticker found in either source, the available frames are
  concatenated (YF first, then IEX) and rows sharing an index value are
  collapsed with groupby(...).first(), which keeps the first non-null value
  per column, so YF values win and IEX fills the gaps. The result is sorted
  by index. No check is made that the two sources agree with each other.

  Args:
    yf_historicals: dict mapping ticker -> DataFrame indexed by date.
    iex_historicals: dict mapping ticker -> DataFrame indexed by date.

  Returns:
    dict mapping ticker -> merged DataFrame, covering the union of tickers.
  '''
  historicals = dict()
  #Sets are combined with |, not +; sorted for a reproducible merge order
  all_tickers = set(yf_historicals) | set(iex_historicals)
  for ticker in sorted(all_tickers):
    #A ticker may exist in only one of the two sources
    frames = [source[ticker]
              for source in (yf_historicals, iex_historicals)
              if ticker in source]
    combined = pd.concat(frames)
    historicals[ticker] = combined.groupby(combined.index).first().sort_index()
  return historicals

def test_for_ordinance(historicals):
  '''
  Description:
    Double check that the sort and merge worked

  Returns:
    - True if all the data is in chronological order
    - False if the data is not in chronological order
  '''
  ordinance = True
  for ticker in historicals:
    current_date = dt.date.min
    for date in historicals[ticker].index:
      if current_date > date or current_date == date:
        print('Error with ticker {} on {}'.format(ticker, date))
        ordinance = False
        return ordinance
      else:
        current_date = date
  return ordinance

In [None]:
#From here you are free to save these downloaded historicals
#But again I suggest creating a complete IEX database and then merging the two into a separate database
#This way you can check that the data matches up in the pipeline and complete any safety checks without overwriting the separate databases