# TRACE
This script downloads intraday TRACE bond data from WRDS, reduces it to one summary row per bond per day (the last reported trade of the day), and builds a dataframe covering the chosen (mindate, maxdate) sample range.

In [None]:
import pandas as pd
import datetime
import numpy as np
import wrds

In [None]:
# initialize WRDS connection
# NOTE(review): called with no arguments, so credentials are resolved by the
# wrds library itself (presumably an interactive prompt or a stored login) -
# confirm for unattended runs.
db = wrds.Connection()

In [None]:
# exploration only: ask the connection for its libraries; the result is not stored
db.list_libraries();

In [None]:
# exploration only: ask for the tables inside the 'trace' library; the result is not stored
db.list_tables(library='trace');

In [None]:
# exploration only: describe the trace.trace table that the queries below read from
db.describe_table(library='trace', table='trace');

In [None]:
# sample query: pull a small slice of trace.trace to inspect the columns.
# The cutoff must be a real calendar date: June has only 30 days, so
# datetime.date(2002, 6, 31) raises ValueError before any query is sent.
testdate = datetime.date(2002, 6, 30)
query = """SELECT cusip_id, bsym, trd_exctn_dt, trd_exctn_tm, rptd_pr
        FROM trace.trace
        WHERE  trd_exctn_dt < '{!s}'
        LIMIT 5000 """.format(testdate)
# show the generated SQL (a bare expression in the middle of a cell displays nothing)
print(query)
dftemp = db.raw_sql(query)

# drop observations with missing cusip
dftemp = dftemp.dropna(subset=['cusip_id'])

dftemp.head()

In [None]:
# function that summarizes data for a given day
def get_dfTRACEdailysummary(db, date):
    """Return the last reported trade of each bond for one execution date.

    Parameters
    ----------
    db
        Object exposing ``raw_sql(query)`` that returns a pandas DataFrame
        (in this notebook, the WRDS connection).
    date
        Execution date to query; it is interpolated into the SQL with
        ``str()``, e.g. a ``datetime.date``.

    Returns
    -------
    pandas.DataFrame
        One row per ``cusip_id`` with the columns ``cusip_id``, ``bsym``,
        ``rptd_pr``, ``trd_exctn_dt`` and ``trd_exctn_tm``.
    """
    query = """SELECT cusip_id, bsym, trd_exctn_dt, trd_exctn_tm, rptd_pr
        FROM trace.trace
        WHERE  trd_exctn_dt = '{!s}' """.format(date)

    # download data
    dftemp = db.raw_sql(query)

    # drop observations with missing cusip
    dftemp = dftemp.dropna(subset=['cusip_id'])

    # The query has no ORDER BY, so the row order of the result is not
    # guaranteed. Sort by execution time explicitly so that "last" really is
    # the closing trade. The sort is stable (ties keep their download order),
    # and rows with a missing time go first so they cannot displace a
    # timestamped trade.
    dftemp = dftemp.sort_values('trd_exctn_tm', kind='mergesort',
                                na_position='first')

    # take last observation from each cusip (i.e. closing price)
    grouped = dftemp.groupby(['cusip_id'])
    dfout = grouped.agg({'bsym': 'last', 'rptd_pr': 'last',
                         'trd_exctn_dt': 'last',
                         'trd_exctn_tm': 'last'}).reset_index()

    return dfout

In [None]:
# sample window: first and last calendar date to download, plus the
# one-day step used to walk from one to the other
day_1 = datetime.timedelta(days=1)
mindate = datetime.date(year=2015, month=7, day=1)
maxdate = datetime.date(year=2018, month=6, day=30)

In [None]:
# test for a given day
# Smoke test: fetch the summary for the first sample date only and look at
# its (rows, columns) shape before starting the full download.
dfTRACEdsum = get_dfTRACEdailysummary(db, mindate)
dfTRACEdsum.shape

In [None]:
# loop over all days
# Walk every calendar date from mindate to maxdate (both inclusive), printing
# the date and the shape of each day's summary as progress output.
# The range is checked up front and the loop condition is tested before each
# download, so a one-day range (mindate == maxdate) fetches exactly one day
# instead of running past maxdate.
if mindate > maxdate:
    raise ValueError(
        'mindate {!s} is after maxdate {!s}'.format(mindate, maxdate))

# Collect the per-day frames and concatenate once at the end; concatenating
# inside the loop re-copies the whole accumulated frame on every iteration.
daily_frames = []
date = mindate
while date <= maxdate:
    print(date)

    # get data from that date
    dfTRACEdsum_more = get_dfTRACEdailysummary(db, date)
    print(dfTRACEdsum_more.shape)

    daily_frames.append(dfTRACEdsum_more)
    date = date + day_1

dfTRACEdsum = pd.concat(daily_frames)

In [None]:
# keep a single row per (bond, trade date); when several exist the last one wins
dedup_keys = ['cusip_id', 'trd_exctn_dt']
dfTRACEdsum_new = dfTRACEdsum.drop_duplicates(subset=dedup_keys, keep='last')

In [None]:
# save this sample range as its own chunk file; it is read back by the
# concatenate step below under the same file name
# NOTE(review): no index= argument is passed, so pandas' default of also
# writing the row index as a column applies - confirm that is wanted.
dfTRACEdsum_new.to_csv('TRACE-2015-2018.csv')

## concatenate
Holding the data for a long sample period in memory can take too much RAM. An easy workaround is to download only a few years at a time, save each chunk to its own CSV file, and then concatenate the saved files.

In [None]:
# chunk files written by earlier runs of the download cells, in chronological order
first_chunk = 'TRACE-2002-2006.csv'
later_chunks = [
    'TRACE-2007-2010.csv',
    'TRACE-2010-2012.csv',
    'TRACE-2012-2015.csv',
    'TRACE-2015-2018.csv',
]

# start from the earliest chunk and append the remaining ones one at a time
df_all = pd.read_csv(first_chunk, delimiter=',')
for chunk_path in later_chunks:
    df_add = pd.read_csv(chunk_path, delimiter=',')
    df_all = pd.concat([df_all, df_add])

In [None]:
# adjacent chunk files can cover the same bond-day (their year ranges share
# boundary years); keep one row per (bond, trade date), preferring the row
# from the later chunk, then write the combined file
dedup_keys = ['cusip_id', 'trd_exctn_dt']
df_all = df_all.drop_duplicates(subset=dedup_keys, keep='last')
df_all.to_csv('TRACE-all.csv')