In [1]:
# Requires the pyjanitor package (imported as `janitor`), which is not included in Anaconda; install it before running.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math as math
import janitor

In [2]:
# Run the shared project header script in this notebook's namespace
# (presumably this is where `Header` comes from -- confirm in header.py).
# The context manager closes the file handle as soon as the source is read,
# instead of leaving it open for the life of the kernel.
with open("../header.py") as _header_file:
    exec(_header_file.read())

# Purpose

The goal is to replicate, and then test, the results of Boone and White (2015).

# Import raw data

In [3]:
# Flag passed to Header; presumably switches between a sample and the full
# dataset -- TODO confirm in header.py.
sample = False
# `header` provides the raw_root()/clean_root() helpers used for file paths.
header = Header(sample)

In [4]:
# Load the raw CRSP extract, parsing `date` as datetimes.
# The CUSIP identifier columns are read explicitly as strings: letting pandas
# infer their dtype can turn digit-only identifiers into numbers, which drops
# leading zeros and yields mixed-type columns whose non-string entries become
# NaN under the `.str` accessor used later.
raw_crsp = pd.read_csv(
    header.raw_root("crsp_data.csv"),
    parse_dates=['date'],
    dtype={'NCUSIP': str, 'CUSIP': str},
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
# Report the span of dates covered by the raw data.
# Series.min()/max() are vectorised and skip NaT, unlike the builtin
# min()/max(), which iterate element by element in Python.
print(f"Date range: {raw_crsp.date.min()} - {raw_crsp.date.max()}")

Date range: 1994-01-03 00:00:00 - 2007-12-31 00:00:00


In [6]:
# Preview the first two rows of the raw data.
raw_crsp.head(2)

Unnamed: 0,PERMNO,date,SICCD,NCUSIP,TICKER,COMNAM,TSYMBOL,PERMCO,HEXCD,CUSIP,PRC,VOL,RET,BID,ASK,SHROUT,NUMTRD,ewretd
0,10001,1994-01-03,4920,29274A10,EWST,ENERGY WEST INC,EWST,7953,2,36720410,17.75,4791.0,-0.034014,17.75,18.0,1091.0,4.0,0.00351
1,10001,1994-01-04,4920,29274A10,EWST,ENERGY WEST INC,EWST,7953,2,36720410,17.5,2000.0,-0.014085,17.25,18.0,1091.0,2.0,0.006372


# Clean data

In [7]:
# Basic cleaning
# The window starts in May 1995, a year ahead of the 1996 period of interest,
#   so that a ranking can be established and carried forward until May 1996.
cleaned_crsp = (
    raw_crsp
    .clean_names()
    # Keep observations inside the date window (both endpoints included).
    .loc[lambda df: df.date.between('1995-05-01', '2006-12-31')]
    # Discard rows missing any field needed for the derived measures.
    .dropna(subset=['bid', 'ask', 'vol', 'prc', 'shrout'])
    .assign(
        # Work with the price magnitude (presumably a negative price is a
        # data-vendor flag rather than a real value -- confirm).
        prc=lambda df: df.prc.abs(),
        mktcap=lambda df: df.prc * df.shrout,
        year=lambda df: df.date.dt.year,
        month=lambda df: df.date.dt.month,
        # Year-month key encoded as an integer, e.g. 199505.
        yrmo=lambda df: df.year * 100 + df.month,
    )
)

In [8]:
# Variables of interest
cleaned_crsp = (
    cleaned_crsp
    .drop(columns=['siccd', 'permco'])
    .assign(
        # Quoted spread relative to the bid/ask midpoint.
        bid_ask_spread=lambda df: (df.ask - df.bid) / ((df.ask + df.bid) / 2),
        # NOTE(review): volume is scaled by market cap here, not by shares
        # outstanding -- confirm this is the intended turnover definition.
        turnover=lambda df: df.vol / df.mktcap,
        dollar_vol=lambda df: df.vol * df.prc,
    )
)

In [9]:
# Cusip cleanup
# Replace `cusip` with a cleaned copy of `ncusip`: strip any literal ".0"
# and keep the first 8 characters. Both string operations are chained on the
# same value; previously the truncation re-read the raw `ncusip`, silently
# discarding the ".0" removal.
cleaned_crsp = (
    cleaned_crsp
    .assign(
        cusip=lambda df: df.ncusip
        .str.replace('.0', '', regex=False)
        .str.slice(0, 8)
    )
    .drop(columns=['ncusip'])
)

In [10]:
# Rank market cap by year-month
# cleaned_crsp['rank'] = cleaned_crsp\
#     .groupby('yrmo', as_index = False)['mktcap']\
#     .transform(lambda x:x.rank(ascending = False))

In [11]:
# Preview the first five rows of the cleaned data.
cleaned_crsp.head(5)

Unnamed: 0,permno,date,ticker,comnam,tsymbol,hexcd,cusip,prc,vol,ret,...,shrout,numtrd,ewretd,mktcap,year,month,yrmo,bid_ask_spread,turnover,dollar_vol
334,10001,1995-05-01,EWST,ENERGY WEST INC,EWST,2,29274A10,8.25,400.0,0.1,...,2244.0,3.0,0.000588,18513.0,1995,5,199505,0.095238,0.021606,3300.0
335,10001,1995-05-02,EWST,ENERGY WEST INC,EWST,2,29274A10,7.875,0.0,-0.045455,...,2244.0,0.0,0.001771,17671.5,1995,5,199505,0.095238,0.0,0.0
336,10001,1995-05-03,EWST,ENERGY WEST INC,EWST,2,29274A10,8.25,218.0,0.047619,...,2244.0,1.0,0.003985,18513.0,1995,5,199505,0.095238,0.011776,1798.5
337,10001,1995-05-04,EWST,ENERGY WEST INC,EWST,2,29274A10,7.5,5800.0,-0.090909,...,2244.0,6.0,-0.003479,16830.0,1995,5,199505,0.095238,0.344623,43500.0
338,10001,1995-05-05,EWST,ENERGY WEST INC,EWST,2,29274A10,8.25,600.0,0.1,...,2244.0,1.0,0.001353,18513.0,1995,5,199505,0.095238,0.03241,4950.0


In [12]:
# List the columns that remain after cleaning.
cleaned_crsp.columns

Index(['permno', 'date', 'ticker', 'comnam', 'tsymbol', 'hexcd', 'cusip',
       'prc', 'vol', 'ret', 'bid', 'ask', 'shrout', 'numtrd', 'ewretd',
       'mktcap', 'year', 'month', 'yrmo', 'bid_ask_spread', 'turnover',
       'dollar_vol'],
      dtype='object')

# Data Integrity checks

In [13]:
# Compare the (rows, columns) shape before and after cleaning.
print(f"Old shape: {raw_crsp.shape}")
print(f"New shape: {cleaned_crsp.shape}")

Old shape: (27927544, 18)
New shape: (22198012, 22)


In [14]:
# (permno, date) is a unique identifier: at most one row per security per day.
# `duplicated` compares the key columns directly, so rows with a missing
# ticker are still checked; counting non-null `ticker` values per group would
# let duplicate keys with a NaN ticker slip through.
assert not cleaned_crsp.duplicated(subset=['permno', 'date']).any(), \
    "Duplicate (permno, date) rows found in cleaned_crsp"

In [15]:
# CUSIP-date is a unique identifier
# Count non-null permno values per (cusip, date) group and require that no
# group holds more than one. Rows with a missing cusip are not grouped.
assert (
    cleaned_crsp
    .groupby(['cusip', 'date'])['permno']
    .count()
    .le(1)
    .all()
)

# Save cleaned datasets

In [16]:
# Write the cleaned data to the path returned by header.clean_root, omitting
# the DataFrame index from the file.
cleaned_crsp.to_csv(header.clean_root("crsp.csv"), index = False)