In [1]:
import requests
import numpy as np
import pandas as pd
import itertools
import multiprocessing as mp
import datetime
import time
import os
import json
import gc

#register pandas' datetime converters/formatters with matplotlib
#NOTE(review): no plotting appears in the cells shown here - presumably used further down; confirm
pd.plotting.register_matplotlib_converters()

# 1 Get trade data from polygon.io

## 1.a Define the API key and URL endpoint

In [None]:
#the key is read from a local JSON file (field 'API-KEY') rather than written into the notebook;
#the context manager closes the file handle as soon as the JSON has been parsed
with open('Polygon/api-key.json') as _key_file:
    api_key = json.load(_key_file)['API-KEY']

#request template for the Polygon.io trades endpoint; the {placeholders} are filled in with str.format
url     = 'https://api.polygon.io/v2/ticks/stocks/trades/{symbol}/{date}?timestamp={timestamp}&limit={limit}&apiKey={api_key}'

## 1.b Define function to get trades for a single date

In [None]:
def get_trades(date):
    """
    Get all trades that occurred on a given date

    Requests SPY trades for `date` in pages of up to 50,000 and keeps
    requesting until a page is short, empty, or carries no 'results' field.

    Parameters
    ----------
    date : str
        Date substituted into the request URL, e.g. '2020-03-12'.

    Returns
    -------
    list of dict
        One dictionary per trade, in the order the API returned them.
        Empty when the API returns no trades for the date.
    """
    params = {
        'symbol': 'SPY',
        'date': date,
        'timestamp': '0',
        'limit': 50000,
        'api_key': api_key
    }
    trade_data = []

    #API limits response to 50k trades per request
    #this loop keeps requesting until it has exhausted all trades
    while True:
        response = requests.get(url.format(**params)).json()

        #a missing, null or empty 'results' field means there is nothing (more) to collect;
        #without this guard an empty page would raise IndexError on page[-1] below
        page = response['results'] if 'results' in response else None
        if not page:
            break

        trade_data += page

        #a page smaller than the limit is the final page
        if response['results_count'] < params['limit']:
            break

        #replace starting timestamp of request with last trade's timestamp
        #NOTE(review): if the API treats this offset as inclusive, the boundary trade is
        #returned twice - confirm against the Polygon.io pagination documentation
        params['timestamp'] = page[-1]['t']

    return trade_data

## 1.c Get trades for every day between January 1st 2003 and December 31st 2020

In [None]:
#make sure the output folder exists before spending time on the downloads
os.makedirs('Polygon/Raw', exist_ok=True)

#2003 through 2020 inclusive (range() excludes its end value), matching the
#years that the trimming step reads back from Polygon/Raw
for year in range(2003, 2021):
    #generate all dates in the year in format 2020-03-12 (year-month-day)
    dates = pd.date_range('{}-01-01'.format(year), '{}-12-31'.format(year), freq='D').strftime('%Y-%m-%d').tolist()

    #8 cores, so use multiprocess requests to Polygon.io
    with mp.Pool(8) as pool:
        #pool.map returns one list of trades per date (in date order); chain them into a
        #single list of dictionaries, each dictionary will become a row in the dataframe
        trade_data = list(itertools.chain.from_iterable(pool.map(get_trades, dates)))

    #combine data into a pandas dataframe
    spydf = pd.DataFrame(trade_data)

    #write to CSV file
    spydf.to_csv('Polygon/Raw/SPY_{}.csv'.format(year), index = False)

    #deallocate and free up memory before the next year is downloaded
    del trade_data
    del spydf
    gc.collect()

# 2 Trim data and export as binary file

## 2.a Define function for trimming data

In [6]:
def prime(year):
    """
    Read in SPY_{year}.csv
    - create datetime index
    - convert to US/Eastern timezone
    - trim to trading hours
    - remove outliers

    Parameters
    ----------
    year : int
        Year whose raw file 'Polygon/Raw/SPY_{year}.csv' is read.

    Returns
    -------
    pandas.DataFrame
        Columns 'p' and 's' (both float32), indexed by a US/Eastern
        DatetimeIndex built from the nanosecond column 't'.
    """
    #read in the CSV file of trades
    df = pd.read_csv('Polygon/Raw/SPY_{}.csv'.format(year), engine='c', index_col=['t'], usecols = ['t','p','s'], dtype={'t':np.int64, 'p':np.float32, 's':np.float32})

    #order columns
    df = df.loc[:,['p','s']]

    #convert index to pd.DatetimeIndex, timezone naive, daylight savings naive
    df.index = pd.to_datetime(df.index, unit='ns')

    #convert index to US-Eastern timezone, automatically takes care of daylight savings
    df.index = df.index.tz_localize('UTC').tz_convert('US/Eastern')

    #restrict data to trading hours (both end points are included)
    df = df.between_time('09:30:00', '16:00:00')

    #remove trades with zero shares
    df = df[df['s'] > 0]

    #remove outliers = absolute pct change to the next trade of the same day >= 1%
    #the absolute value is taken of the pct change itself, so moves in either direction
    #are removed (taking abs() of the comparison result only removed upward moves)
    price      = df['p'].to_numpy()
    next_price = df.groupby(df.index.normalize())['p'].shift(-1).to_numpy()

    #a zero price gives inf/nan here, which compares False below and is therefore dropped
    with np.errstate(divide='ignore', invalid='ignore'):
        keep = np.abs((next_price - price) / price) < 0.01

    #NOTE(review): the last trade of each day has no next trade (NaN), so it is dropped
    #as well - this matches the previous behaviour; confirm that it is intended
    df = df[keep]

    return df


## 2.b Define function for exporting binary file

In [7]:
def to_binary(df, year=None):
    """
    Take a SPY dataframe and export it as a binary file to be read in later
    np.fromfile takes 300 milliseconds
    pd.read_csv takes 5 seconds

    The file 'Polygon/Primed/SPY_{year}.binary' holds a row-major float64
    array with one row per trade: the index as integer nanoseconds followed
    by the dataframe's columns. The dataframe passed in is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a DatetimeIndex and numeric columns.
    year : int, optional
        Year used in the output file name. When omitted it is taken from
        the first timestamp in the index.

    Raises
    ------
    ValueError
        If `year` is omitted and `df` is empty, so no year can be inferred.
    AssertionError
        If the combined array is not float64.
    """
    #the file name used to depend on a notebook-level `year` variable;
    #take it as an argument, or read it from the data itself
    if year is None:
        if len(df.index) == 0:
            raise ValueError('cannot infer the year from an empty dataframe; pass year explicitly')
        year = df.index[0].year

    #convert datetime index to integer index with 64 bits (8 bytes)
    stamps = df.index.astype(np.int64)

    #build a new frame with the integer index as its first column (thus placing it in .values)
    #working on a new object keeps the caller's dataframe intact
    table  = df.set_axis(stamps, axis=0).reset_index()
    values = table.values

    #make sure that the data type of the numpy array is float with 64 bits (8 bytes)
    #np.fromfile will corrupt if this is false
    #NOTE: float64 cannot represent every integer above 2**53, so nanosecond
    #timestamps may be rounded slightly when they are stored this way
    assert values.dtype == np.float64

    #create the output folder on first use
    os.makedirs('Polygon/Primed', exist_ok=True)

    #export numpy array as binary file
    #NOT PLATFORM INDEPENDENT
    values.tofile('Polygon/Primed/SPY_{}.binary'.format(year))
    

## 2.c Trim data and export in binary format

In [None]:
#process every year from 2003 through 2020 inclusive (range() excludes its end value)
for year in range(2003, 2021):
    #progress indicator: show which year is being processed
    print(year)
    
    #clean and trim data: reads Polygon/Raw/SPY_{year}.csv and returns the trimmed dataframe
    df = prime(year)
    
    #export as binary file under Polygon/Primed/
    to_binary(df)
    