In [1]:
# Have to install janitor package to run. Not included in anaconda.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math as math
import janitor

In [2]:
# Run the shared project header script in this notebook's namespace.
# Presumably this is where `Header` (used below) is defined — confirm in header.py.
# The context manager ensures the file handle is closed once the source is read.
with open("../header.py") as header_file:
    exec(header_file.read())

# Purpose

To replicate and then test out Boone and White (2015).

# Import raw data

In [3]:
# Flag handed to Header; presumably False selects the full data rather than a
# sample — TODO confirm against header.py.
sample = False
# Project helper object; its raw_root / clean_root methods are used below to
# locate the input and output files.
header = Header(sample)

In [4]:
# Load the raw Russell index membership file, parsing `Date` as datetimes.
# CUSIP is read explicitly as text: identifiers such as "000361105" carry leading
# zeros and others contain letters, so letting pandas infer a numeric type for any
# part of the column would corrupt them and break the later `.str` slicing.
raw_russ = pd.read_csv(
    header.raw_root("russell_index_inclusion.csv"),
    parse_dates=['Date'],
    dtype={'CUSIP': str},
)

In [5]:
# Preview the first five rows of the raw data.
raw_russ.head(5)

Unnamed: 0,Date,CUSIP,Ticker,MktValue,Shares,Russell1000,Russell2000,Name,R1000_WT,R2000_WT
0,1996-01-31,000361105,AIR,,15959,N,Y,AAR CORP,,0.0599
1,1996-01-31,000752105,ABCR,,7983,N,Y,ABC RAIL PRODUCTS CO,,0.0304
2,1996-01-31,00077R108,ABRX,,4831,N,Y,ABR INFORMATION SVCS,,0.0422
3,1996-01-31,000782102,ABTC,,10604,N,Y,ABT BUILDING PRODUCT,,0.0275
4,1996-01-31,000886101,ADCT,,62760,Y,N,ADC TELECOMMUNICATIO,0.0467,


# Clean data

In [6]:
# Standardise the raw Russell membership data:
#   * normalise column names to lowercase (pyjanitor `clean_names`)
#   * keep observations dated 1996-01-01 through 2006-12-31, inclusive
#   * recode the Y/N membership columns as 1/0 (-1 for anything else)
#   * treat missing index weights as 0
#   * truncate cusip to its first 8 characters to match crsp
#   * add year, month and yrmo (year*100 + month) columns

def _yn_to_flag(values):
    """Return 1 where `values` is 'Y', 0 where it is 'N', and -1 otherwise."""
    return np.select([values == 'Y', values == 'N'], [1, 0], default=-1)


cleaned_russ = (
    raw_russ
    .clean_names()
    .loc[lambda df: df['date'].between('1996-01-01', '2006-12-31')]
    .assign(
        russell1000=lambda df: _yn_to_flag(df['russell1000']),
        russell2000=lambda df: _yn_to_flag(df['russell2000']),
        r1000_wt=lambda df: df['r1000_wt'].fillna(0),
        r2000_wt=lambda df: df['r2000_wt'].fillna(0),
        cusip=lambda df: df['cusip'].str.slice(0, 8),
        year=lambda df: df['date'].dt.year,
        month=lambda df: df['date'].dt.month,
        # year and month are created earlier in this same assign call.
        yrmo=lambda df: df['year'] * 100 + df['month'],
    )
)

In [7]:
# Preview the first five rows after cleaning.
cleaned_russ.head(5)

Unnamed: 0,date,cusip,ticker,mktvalue,shares,russell1000,russell2000,name,r1000_wt,r2000_wt,year,month,yrmo
0,1996-01-31,00036110,AIR,,15959,0,1,AAR CORP,0.0,0.0599,1996,1,199601
1,1996-01-31,00075210,ABCR,,7983,0,1,ABC RAIL PRODUCTS CO,0.0,0.0304,1996,1,199601
2,1996-01-31,00077R10,ABRX,,4831,0,1,ABR INFORMATION SVCS,0.0,0.0422,1996,1,199601
3,1996-01-31,00078210,ABTC,,10604,0,1,ABT BUILDING PRODUCT,0.0,0.0275,1996,1,199601
4,1996-01-31,00088610,ADCT,,62760,1,0,ADC TELECOMMUNICATIO,0.0467,0.0,1996,1,199601


# Data integrity checks

In [8]:
# Each membership flag must take exactly the values 0 and 1.
# Comparing against the set {0, 1} (rather than only counting distinct values)
# also catches the -1 fallback code assigned to unrecognised raw values, which a
# bare "two unique values" check would let through (e.g. {1, -1}).
for flag_col in ('russell1000', 'russell2000'):
    flag_values = set(cleaned_russ[flag_col].unique().tolist())
    assert flag_values == {0, 1}, f"{flag_col} has unexpected values: {flag_values}"

In [9]:
# Date-cusip is unique identifier.
# Use group sizes (row counts) rather than a count of the `ticker` column:
# `count` skips missing values, so duplicated keys whose ticker is null would
# go unnoticed.
rows_per_key = cleaned_russ.groupby(['date', 'cusip']).size()
duplicate_keys = rows_per_key[rows_per_key > 1]
assert duplicate_keys.empty, f"{len(duplicate_keys)} duplicated (date, cusip) keys"

In [10]:
# Report (rows, columns) before and after cleaning.
print(f"Old shape: {raw_russ.shape}")
print(f"New shape: {cleaned_russ.shape}")

Old shape: (386154, 10)
New shape: (386154, 13)


# Save cleaned datasets

In [11]:
# Write the cleaned data to the location given by header.clean_root, omitting
# the DataFrame index from the CSV.
cleaned_russ.to_csv(header.clean_root("russell.csv"), index = False)