In [2]:
# Have to install janitor package to run. Not included in anaconda.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math as math
import janitor

In [3]:
# Run the shared project header inside this notebook's namespace.
# Presumably this is what defines `Header`, used in the next cell — confirm
# against ../header.py.
# NOTE(review): exec runs whatever code ../header.py contains; importing it as
# a module would be safer if the project layout allows it.
with open("../header.py") as header_file:
    # The context manager closes the file handle; compiling with the file
    # name makes tracebacks point at header.py instead of "<string>".
    exec(compile(header_file.read(), "../header.py", "exec"))

# Import raw data

In [4]:
# `sample` is handed to Header below; presumably it switches between a sample
# run and a full-data run — TODO confirm against header.py.
sample = False
# `header` is used by later cells to build file paths via header.clean_root(...).
header = Header(sample)

In [5]:
# Load the merged input file (crsp_russ.csv) with `date` parsed as datetime.
# - `cusip` is read as text: later cells compare it against string literals
#   (e.g. '92552R40'), and numeric inference would turn all-digit CUSIPs into
#   integers and drop leading zeros.
# - `low_memory=False` makes pandas infer each column's dtype from the whole
#   file instead of chunk by chunk, so a column cannot end up with mixed types.
#   NOTE(review): the recorded output of this cell looks like the tail of a
#   pandas DtypeWarning — confirm which columns were affected.
raw_merged_df = pd.read_csv(
    header.clean_root("crsp_russ.csv"),
    parse_dates=['date'],
    dtype={'cusip': str},
    low_memory=False,
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [6]:
# Show the column names of the raw input frame.
raw_merged_df.columns

Index(['permno', 'date', 'ticker_x', 'comnam', 'tsymbol', 'hexcd', 'cusip',
       'prc', 'vol', 'ret', 'bid', 'ask', 'shrout', 'numtrd', 'ewretd',
       'mktcap', 'year', 'month', 'yrmo', 'bid_ask_spread', 'turnover',
       'dollar_vol', 'ticker_y', 'russell1000', 'russell2000', 'name',
       'r1000_wt', 'r2000_wt'],
      dtype='object')

In [25]:
# Earliest observation date in the raw input.
raw_merged_df['date'].min()

Timestamp('1995-05-01 00:00:00')

# Create rank

In [7]:
# Build the ranking snapshot: for every (permno, yrmo) in May keep the row(s)
# on the last available date of that month.
# The per-group maximum date is computed with a vectorised groupby-transform
# rather than groupby.apply with Python's builtin max, which loops over every
# group in Python and is not NaT-aware.
ranks_df = (
    raw_merged_df
    .loc[lambda x: x.month == 5]
    .loc[lambda x: x.date == x.groupby(['permno', 'yrmo'])['date'].transform('max'),
         ['yrmo', 'year', 'permno', 'russell1000', 'mktcap']]
    # Keep rows ordered by group key, as the grouped result was before.
    .sort_values(['permno', 'yrmo'])
    .reset_index(drop=True)
    # Missing membership flag is treated as "not in the Russell 1000".
    # (`russell2000` is not among the selected columns, so it needs no fill.)
    .fillna({'russell1000': 0})
)

In [8]:
# Descending market-cap rank (largest mktcap gets rank 1) computed within each
# (yrmo, year, russell1000) group.
ranks_df['mkt_rank_desc'] = (
    ranks_df
    .groupby(['yrmo', 'year', 'russell1000'])['mktcap']
    .rank(ascending=False)
)

In [9]:
# Ascending market-cap rank (smallest mktcap gets rank 1) computed within each
# (yrmo, year, russell1000) group.
ranks_df['mkt_rank_asc'] = (
    ranks_df
    .groupby(['yrmo', 'year', 'russell1000'])['mktcap']
    .rank(ascending=True)
)

In [10]:
# The year of the May snapshot becomes the join key `russell_year`; keep only
# the key columns and the two rank columns.
ranks_df = (
    ranks_df
    .reset_index(drop=True)
    .rename(columns={'year': 'russell_year'})
    .loc[:, ['russell_year', 'permno', 'mkt_rank_desc', 'mkt_rank_asc']]
)

# Create russell year variable

In [11]:
# Assign each observation to a `russell_year`: months 1-5 belong to the
# previous calendar year, months 6-12 to the current one.
# Missing membership flags are then filled with 0.
merged_df = (
    raw_merged_df
    .assign(
        russell_year=lambda df: np.where(df['month'] <= 5, df['year'] - 1, df['year'])
    )
    .fillna({'russell1000': 0, 'russell2000': 0})
)

# Join on rank

In [12]:
# Left join: every observation is kept, and rows with no matching
# (permno, russell_year) snapshot get NaN in the rank columns.
merged_df = pd.merge(
    merged_df,
    ranks_df,
    how='left',
    on=['permno', 'russell_year'],
)

In [13]:
# Keep only observations flagged as a member of either index.
merged_df = merged_df[
    merged_df['russell1000'].eq(1) | merged_df['russell2000'].eq(1)
]

In [14]:
# Show the columns after the join (now includes russell_year and both ranks).
merged_df.columns

Index(['permno', 'date', 'ticker_x', 'comnam', 'tsymbol', 'hexcd', 'cusip',
       'prc', 'vol', 'ret', 'bid', 'ask', 'shrout', 'numtrd', 'ewretd',
       'mktcap', 'year', 'month', 'yrmo', 'bid_ask_spread', 'turnover',
       'dollar_vol', 'ticker_y', 'russell1000', 'russell2000', 'name',
       'r1000_wt', 'r2000_wt', 'russell_year', 'mkt_rank_desc',
       'mkt_rank_asc'],
      dtype='object')

# Data Integrity

## Example of contradictory market caps and reasoning

**Question**: Why is the larger company in the Russell 2000 while the smaller company is in the Russell 1000?

**Answer**: It's because the Russell 1000 company's market cap has dropped since the previous May and/or the Russell 2000 company's market cap has risen since the previous May. Russell 1000/2000 membership is mostly decided at month-end in May, so a company keeps its index membership until the next May even if its market cap changes in the meantime. The smaller market-cap company then drops to the Russell 2000 for the following year.

In [15]:
# Example 1: CUSIP 88160910 from 2005-05 onwards.
merged_df.loc[
    merged_df['cusip'].eq('88160910') & merged_df['yrmo'].ge(200505),
    ['yrmo', 'cusip', 'mktcap', 'russell1000', 'russell2000', 'prc', 'shrout',
     'ticker_x', 'ticker_y', 'comnam', 'name']
]

Unnamed: 0,yrmo,cusip,mktcap,russell1000,russell2000,prc,shrout,ticker_x,ticker_y,comnam,name
1789175,200505,88160910,2620927.4,0.0,1.0,39.13,66980.0,TSO,TSO,TESORO CORP,TESORO CORP
1789176,200505,88160910,2479599.6,0.0,1.0,37.02,66980.0,TSO,TSO,TESORO CORP,TESORO CORP
1789177,200505,88160910,2607531.4,0.0,1.0,38.93,66980.0,TSO,TSO,TESORO CORP,TESORO CORP
1789178,200505,88160910,2686567.8,0.0,1.0,40.11,66980.0,TSO,TSO,TESORO CORP,TESORO CORP
1789179,200505,88160910,2659106.0,0.0,1.0,39.70,66980.0,TSO,TSO,TESORO CORP,TESORO CORP
...,...,...,...,...,...,...,...,...,...,...,...
1789591,200612,88160910,4404112.0,1.0,0.0,65.44,67300.0,TSO,TSO,TESORO CORP,TESORO CORP
1789592,200612,88160910,4398728.0,1.0,0.0,65.36,67300.0,TSO,TSO,TESORO CORP,TESORO CORP
1789593,200612,88160910,4447857.0,1.0,0.0,66.09,67300.0,TSO,TSO,TESORO CORP,TESORO CORP
1789594,200612,88160910,4455260.0,1.0,0.0,66.20,67300.0,TSO,TSO,TESORO CORP,TESORO CORP


In [16]:
# Example 2: CUSIP 92552R40 from 2005-04 onwards.
merged_df.loc[
    merged_df['cusip'].eq('92552R40') & merged_df['yrmo'].ge(200504),
    ['yrmo', 'cusip', 'mktcap', 'russell1000', 'russell2000', 'prc', 'shrout',
     'ticker_x', 'ticker_y', 'comnam', 'name']
]

Unnamed: 0,yrmo,cusip,mktcap,russell1000,russell2000,prc,shrout,ticker_x,ticker_y,comnam,name
981936,200504,92552R40,597238.70,1.0,0.0,26.77,22310.0,VVI,VVI,VIAD CORP,VIAD CORP
981937,200504,92552R40,601700.70,1.0,0.0,26.97,22310.0,VVI,VVI,VIAD CORP,VIAD CORP
981938,200504,92552R40,596569.40,1.0,0.0,26.74,22310.0,VVI,VVI,VIAD CORP,VIAD CORP
981939,200504,92552R40,593892.20,1.0,0.0,26.62,22310.0,VVI,VVI,VIAD CORP,VIAD CORP
981940,200504,92552R40,593669.10,1.0,0.0,26.61,22310.0,VVI,VVI,VIAD CORP,VIAD CORP
...,...,...,...,...,...,...,...,...,...,...,...
982373,200612,92552R40,881341.00,0.0,1.0,40.75,21628.0,VVI,VVI,VIAD CORP,VIAD CORP
982374,200612,92552R40,893020.12,0.0,1.0,41.29,21628.0,VVI,VVI,VIAD CORP,VIAD CORP
982375,200612,92552R40,893668.96,0.0,1.0,41.32,21628.0,VVI,VVI,VIAD CORP,VIAD CORP
982376,200612,92552R40,885666.60,0.0,1.0,40.95,21628.0,VVI,VVI,VIAD CORP,VIAD CORP


## Other checks

In [17]:
# Share of rows that have no market-cap rank (mean of the boolean NaN mask).
merged_df['mkt_rank_asc'].isna().mean()

0.011270489804257235

In [18]:
# Calendar-year distribution of the rows without a rank.
# Expected sources: the start of the sample (1996) and firms that IPO after
# May and join the Russell 1000/2000 straight away.
# NOTE(review): the recorded output shows 2005 and 2006 with the largest
# counts — confirm that the explanation above still holds.
merged_df.loc[merged_df['mkt_rank_asc'].isna(), 'year'].value_counts()

2005    18084
2006    13721
2000     8927
1996     8562
2004     7618
1999     6221
1998     5024
1997     4977
2001     3263
2002     2323
2003     2153
Name: year, dtype: int64

In [19]:
# Calendar-month distribution of the rows without a rank.
merged_df.loc[merged_df['mkt_rank_asc'].isna(), 'month'].value_counts()

5     11783
3     11070
4     10679
1      8908
12     8703
2      8337
11     5859
10     5285
9      4977
8      2442
7      1823
6      1007
Name: month, dtype: int64

In [20]:
# Every row must be flagged for exactly one of the two indexes, never both
# and never neither: the two flags have to sum to 1.
assert (merged_df['russell1000'] + merged_df['russell2000']).eq(1).all()

In [21]:
# (permno, date) must uniquely identify a row.
# `duplicated` compares the key columns themselves, so a repeated key is
# caught even when `cusip` is missing; counting non-null `cusip` values per
# group would let such duplicates slip through.
assert not merged_df.duplicated(subset=['permno', 'date']).any()

In [22]:
# Sanity check: the average market cap of Russell 1000 rows should exceed
# that of the remaining (russell1000 == 0) rows.
t = (
    merged_df
    .groupby('russell1000')['mktcap']
    .mean()
    .rename('mean_mkt_cap')
    .reset_index()
)

r1000 = t[t['russell1000'] == 1]['mean_mkt_cap'].iloc[0]
r2000 = t[t['russell1000'] == 0]['mean_mkt_cap'].iloc[0]

assert r1000 > r2000

# Final filters

In [27]:
# Show the columns available before the final filters.
merged_df.columns

Index(['permno', 'date', 'ticker_x', 'comnam', 'tsymbol', 'hexcd', 'cusip',
       'prc', 'vol', 'ret', 'bid', 'ask', 'shrout', 'numtrd', 'ewretd',
       'mktcap', 'year', 'month', 'yrmo', 'bid_ask_spread', 'turnover',
       'dollar_vol', 'ticker_y', 'russell1000', 'russell2000', 'name',
       'r1000_wt', 'r2000_wt', 'russell_year', 'mkt_rank_desc',
       'mkt_rank_asc'],
      dtype='object')

In [28]:
# Keep only rows that have both market-cap ranks.
# NOTE(review): there is no explicit date filter here; observations outside
# the ranked period are presumably removed only because they have no rank —
# confirm that this is the intended "time period" restriction.
merged_df = merged_df.dropna(subset=['mkt_rank_asc', 'mkt_rank_desc'])

# Save cleaned datasets

In [23]:
# Write the ranked panel to crsp_russ_ranks.csv without the row index.
merged_df.to_csv(
    header.clean_root("crsp_russ_ranks.csv"),
    index=False,
)