In [1]:
# Have to install janitor package to run. Not included in anaconda.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math as math
import janitor

In [2]:
# Execute the shared project header in this notebook's namespace.
# The `with` block guarantees the file handle is closed after reading
# (the bare `open(...).read()` form left it to the garbage collector).
# NOTE(review): header.py presumably defines `Header`, which is used further down — confirm.
with open("../header.py") as header_file:
    exec(header_file.read())

# Import raw data

In [3]:
# `sample` is passed straight to Header.
# NOTE(review): presumably True selects a sample dataset rather than the full one — confirm in header.py.
sample = True
header = Header(sample)

In [4]:
# Load crsp_russ_ranks.csv from the path returned by header.clean_root, parsing 'date' as datetimes.
raw_df = pd.read_csv(header.clean_root("crsp_russ_ranks.csv"), parse_dates = ['date'])

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
# Show the first two rows as a quick sanity check of the load.
raw_df.head(2)

Unnamed: 0,permno,date,ticker_x,comnam,tsymbol,hexcd,cusip,prc,vol,ret,...,dollar_vol,ticker_y,russell1000,russell2000,name,r1000_wt,r2000_wt,russell_year,mkt_rank_desc,mkt_rank_asc
0,76272,1996-05-02,VPI,VINTAGE PETROLEUM INC,,1,92746010,25.0,96000.0,0.020408,...,2400000.0,VPI,0.0,1.0,VINTAGE PETROLEUM IN,0.0,0.0556,1995,1183.0,1962.0
1,75510,1996-12-02,ADBE,ADOBE SYSTEMS INC,ADBE,3,00724F10,43.375,2332847.0,0.098101,...,101187200.0,ADBE,1.0,0.0,ADOBE SYSTEMS INC,0.04,0.0,1996,375.0,450.0


In [6]:
# List every column available in the raw data.
raw_df.columns

Index(['permno', 'date', 'ticker_x', 'comnam', 'tsymbol', 'hexcd', 'cusip',
       'prc', 'vol', 'ret', 'bid', 'ask', 'shrout', 'numtrd', 'ewretd',
       'mktcap', 'year', 'month', 'yrmo', 'bid_ask_spread', 'turnover',
       'dollar_vol', 'ticker_y', 'russell1000', 'russell2000', 'name',
       'r1000_wt', 'r2000_wt', 'russell_year', 'mkt_rank_desc',
       'mkt_rank_asc'],
      dtype='object')

# Variables of interest

In [7]:
# Raw outcome columns of interest.
y_vars = ['bid_ask_spread', 'turnover', 'dollar_vol']
# Treatment indicator column.
treatment_x_vars = ['russell2000']
# Running variables; these columns do not exist in raw_df and are built later in the notebook.
# NOTE(review): none of these three lists is referenced elsewhere in the visible notebook.
running_x_vars = ['weight_rank', 'may_rank']

# Create running $x$ vars

In [8]:
df = raw_df.copy()

# Rank index weights within each (yrmo, year, russell1000) group.
# The built-in GroupBy.rank is used instead of a Python lambda per group:
# same ranks (average ties, NaN kept as NaN), but computed in vectorized code.
weight_groups = df.groupby(['yrmo', 'year', 'russell1000'])

# Ascending: the smallest r1000 weight in a group gets rank 1.
df['weight_rank_r1000'] = weight_groups['r1000_wt'].rank(ascending = True)

# Descending: the largest r2000 weight in a group gets rank 1.
df['weight_rank_r2000'] = weight_groups['r2000_wt'].rank(ascending = False)

# Relies on the fact that every stock is in either r1000 or r2000 after data cleaning.
# Russell 1000 members get positive ranks, everything else gets negated r2000 ranks,
# so the running variable changes sign at the index boundary.
df['weight_rank'] = np.where(df['russell1000'] == 1, df['weight_rank_r1000'], df['weight_rank_r2000'] * -1)

In [9]:
# Signed market-cap rank: mkt_rank_asc for Russell 1000 rows, negated mkt_rank_desc for all other rows.
df['may_rank'] = np.where(df['russell1000'] == 1, df['mkt_rank_asc'], df['mkt_rank_desc'] * -1)

# Create $y$ vars

In [10]:
# y1: bid-ask spread scaled by the bid/ask midpoint.
df['y1_spread'] = df['bid_ask_spread'] / ((df['bid'] + df['ask']) / 2)

# y2: mean of y1 per stock (permno) and Russell year, broadcast back to every row.
# The built-in 'mean' transform avoids calling a Python lambda once per group.
df['y2_spread'] = df\
    .groupby(['permno', 'russell_year'])['y1_spread']\
    .transform('mean')

# y3: the spread column exactly as loaded.
df['y3_spread'] = df['bid_ask_spread']

In [11]:
# Fix bad formula: overwrite the loaded 'turnover' column with vol / shrout.
df['turnover'] = df['vol'] / df['shrout']

# y1: mean turnover per stock (permno) and Russell year, broadcast back to every row.
# The built-in 'mean' transform avoids calling a Python lambda once per group.
df['y1_turnover'] = df\
    .groupby(['permno', 'russell_year'])['turnover']\
    .transform('mean')

# y2: the recomputed row-level turnover.
df['y2_turnover'] = df['turnover']

In [12]:
# y1: mean dollar volume per stock (permno) and Russell year, broadcast back to every row.
# The built-in 'mean' transform avoids calling a Python lambda once per group.
df['y1_dollar_vol'] = df\
    .groupby(['permno', 'russell_year'])['dollar_vol']\
    .transform('mean')

# y2: the row-level dollar volume exactly as loaded.
df['y2_dollar_vol'] = df['dollar_vol']

# Difference-in-Mean Results

## Function

In [13]:
import scipy.stats

In [14]:
def diff_mean(df, y, running_x, treatment_x, bw_start, bw_end):
    """Compare the mean of ``y`` between treatment groups inside a bandwidth.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    y : str
        Outcome column.
    running_x : str
        Running-variable column. Rows are kept when its absolute value lies
        in ``[bw_start, bw_end]`` (both ends inclusive), i.e. a symmetric
        band around zero with an optional hole of half-width ``bw_start``.
    treatment_x : str
        Treatment indicator column; groups are matched on the values 1 and 0.
    bw_start, bw_end : number
        Inner and outer edges of the band.

    Returns
    -------
    (diff_means, (t_stat, p_val))
        ``diff_means`` has one row per treatment value with columns
        ``treatment``, ``<y>_mean``, ``<y>_sd`` and ``count``.
        ``t_stat`` is the unequal-variance statistic for
        mean(treatment == 1) - mean(treatment == 0), returned as a NumPy
        array (length 1 when both groups are present, empty otherwise).
        ``p_val`` is the upper-tail standard-normal probability of
        ``|t_stat|``; this is one-sided, so double it for a two-sided test.
    """
    # Restrict df to the bandwidth (NaN running values are dropped).
    in_band = df[running_x].abs().between(bw_start, bw_end)
    df_subset = df.loc[in_band]

    # Show difference in features across treatment.
    # The grouping column is renamed via `treatment_x` rather than a
    # hard-coded 'russell2000', so any treatment column name works.
    diff_means = df_subset\
        .groupby(treatment_x)\
        .agg(mean = (y, 'mean'),
             sd = (y, 'std'),
             count = (y, 'count'))\
        .reset_index()\
        .rename(columns = {'mean': y + '_mean',
                           'sd': y + '_sd',
                           treatment_x: 'treatment'})

    def _group_stat(flag, column):
        # Values of `column` for the rows whose treatment equals `flag`.
        return diff_means.loc[diff_means['treatment'] == flag, column].values

    # Calculate statistical significance of difference
    mu_1, mu_2 = _group_stat(1, y + '_mean'), _group_stat(0, y + '_mean')
    sd_1, sd_2 = _group_stat(1, y + '_sd'), _group_stat(0, y + '_sd')
    n_1, n_2 = _group_stat(1, 'count'), _group_stat(0, 'count')

    t_stat = (mu_1 - mu_2)/np.sqrt((sd_1**2/n_1) + (sd_2**2/n_2))
    p_val = scipy.stats.norm.sf(np.abs(t_stat))

    return diff_means, (t_stat, p_val)

## Results

In [18]:
def print_results(y_var):
    """Print the difference-in-means t-statistic of ``y_var`` for each running variable.

    Uses the notebook-level ``df``, treatment ``russell2000`` and the band
    0..50, first with ``weight_rank`` and then with ``may_rank``.
    """
    for running_var in ('weight_rank', 'may_rank'):
        _, (t_stat, _) = diff_mean(
            df,
            y = y_var,
            running_x = running_var,
            treatment_x = 'russell2000',
            bw_start = 0,
            bw_end = 50,
        )
        print(t_stat)

In [19]:
# Difference-in-means t-stats for y1_spread: weight_rank first, then may_rank.
print_results('y1_spread')

[-8.76543978]
[-0.11369557]


In [20]:
# Difference-in-means t-stats for y1_turnover: weight_rank first, then may_rank.
print_results('y1_turnover')

[13.49792168]
[0.4692759]


In [21]:
# Difference-in-means t-stats for y1_dollar_vol: weight_rank first, then may_rank.
print_results('y1_dollar_vol')

[22.98708791]
[2.22395551]


# RDD Results

## Function

In [22]:
import statsmodels.formula.api as smf

In [23]:
def rdd(df, y, running_x, treatment_x, bw_start, bw_end, polynomials = 3):
    """Fit a polynomial regression-discontinuity OLS inside a bandwidth.

    The model regresses ``y`` on the running variable, the treatment
    indicator, their interaction, and, for each power p in
    2..``polynomials``, ``running_x**p`` and its interaction with treatment.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    y, running_x, treatment_x : str
        Outcome, running-variable and treatment column names.
    bw_start, bw_end : number
        Rows are kept when the running variable lies in
        ``[-bw_end, bw_end]`` and outside ``(-bw_start, bw_start)``.
    polynomials : int, default 3
        Highest polynomial order of the running variable.

    Returns
    -------
    pandas.DataFrame
        One row per regressor (including the intercept) with columns
        ``x`` (term name), ``coef`` and ``t_stat``.

    Side effect: prints the right-hand side of the formula.
    """
    # Restrict df[x] to bandwidth. Filter first and copy afterwards so only
    # the retained rows are duplicated, and the columns added below land on
    # an independent frame instead of a slice of the caller's data.
    running = df[running_x]
    inside_outer = (running <= bw_end) & (running >= -1*bw_end)
    outside_inner = (running >= bw_start) | (running <= -1*bw_start)
    df_subset = df.loc[inside_outer & outside_inner].copy()

    # Add on the polynomial terms on both sides of the treatment
    ind_vars = [running_x, treatment_x, 'interact']
    df_subset['interact'] = df_subset[running_x] * df_subset[treatment_x]

    for p in range(2, polynomials + 1):
        poly_col = f'{running_x}_p{p}'
        interact_col = f'interact_p{p}'
        df_subset[poly_col] = df_subset[running_x]**p
        df_subset[interact_col] = df_subset[poly_col] * df_subset[treatment_x]
        ind_vars.extend([poly_col, interact_col])

    ind_vars = '+'.join(ind_vars)
    print(ind_vars)

    # Named `fit` so the local does not shadow this function's own name.
    fit = smf.ols(f'{y} ~ {ind_vars}', data = df_subset).fit()

    output_df = pd.DataFrame({'coef': fit.params, 't_stat': fit.tvalues})\
        .reset_index()\
        .rename({'index': 'x'}, axis = 1)

    return output_df

## Results

In [24]:
def print_rdd_results(y_var):
    """Print the RDD t-statistic on ``russell2000`` for ``y_var``.

    Runs ``rdd`` on the notebook-level ``df`` twice: ``weight_rank`` with
    band 0..50, then ``may_rank`` with band 0..100.
    """
    specs = (('weight_rank', 50), ('may_rank', 100))
    for running_var, outer_edge in specs:
        coefs = rdd(
            df,
            y = y_var,
            running_x = running_var,
            treatment_x = 'russell2000',
            bw_start = 0,
            bw_end = outer_edge,
        )
        is_treatment = coefs['x'] == 'russell2000'
        print(coefs.loc[is_treatment, 't_stat'].values[0])

In [25]:
# RDD t-stats on russell2000 for y1_spread; each t-stat is preceded by the printed formula terms.
print_rdd_results('y1_spread')

weight_rank+russell2000+interact+weight_rank_p2+interact_p2+weight_rank_p3+interact_p3
-9.188609948137392
may_rank+russell2000+interact+may_rank_p2+interact_p2+may_rank_p3+interact_p3
-2.828228135131576


In [26]:
# RDD t-stats on russell2000 for y1_turnover; each t-stat is preceded by the printed formula terms.
print_rdd_results('y1_turnover')

weight_rank+russell2000+interact+weight_rank_p2+interact_p2+weight_rank_p3+interact_p3
10.046479662227373
may_rank+russell2000+interact+may_rank_p2+interact_p2+may_rank_p3+interact_p3
-0.6428110066383971


In [28]:
# RDD t-stats on russell2000 for y1_dollar_vol; each t-stat is preceded by the printed formula terms.
print_rdd_results('y1_dollar_vol')

weight_rank+russell2000+interact+weight_rank_p2+interact_p2+weight_rank_p3+interact_p3
24.489537500269357
may_rank+russell2000+interact+may_rank_p2+interact_p2+may_rank_p3+interact_p3
13.526154946496842


# RDD Variation
Include a control for firm size (market capitalization, `mktcap`) in the regression. Though that should have the opposite effect.

In [115]:
import statsmodels.formula.api as smf

In [119]:
def rdd2(df, y, running_x, treatment_x, bw_start, bw_end, polynomials = 3):
    """Fit the polynomial regression-discontinuity OLS with a ``mktcap`` control.

    The model regresses ``y`` on the running variable, the treatment
    indicator, their interaction, ``mktcap``, and, for each power p in
    2..``polynomials``, ``running_x**p`` and its interaction with treatment.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must contain a ``mktcap`` column. It is not modified.
    y, running_x, treatment_x : str
        Outcome, running-variable and treatment column names.
    bw_start, bw_end : number
        Rows are kept when the running variable lies in
        ``[-bw_end, bw_end]`` and outside ``(-bw_start, bw_start)``.
    polynomials : int, default 3
        Highest polynomial order of the running variable.

    Returns
    -------
    pandas.DataFrame
        One row per regressor (including the intercept) with columns
        ``x`` (term name), ``coef`` and ``t_stat``.

    Side effect: prints the right-hand side of the formula.
    """
    # Restrict df[x] to bandwidth. Filter first and copy afterwards so only
    # the retained rows are duplicated, and the columns added below land on
    # an independent frame instead of a slice of the caller's data.
    running = df[running_x]
    inside_outer = (running <= bw_end) & (running >= -1*bw_end)
    outside_inner = (running >= bw_start) | (running <= -1*bw_start)
    df_subset = df.loc[inside_outer & outside_inner].copy()

    # Add on the polynomial terms on both sides of the treatment;
    # 'mktcap' enters linearly as the size control.
    ind_vars = [running_x, treatment_x, 'interact', 'mktcap']
    df_subset['interact'] = df_subset[running_x] * df_subset[treatment_x]

    for p in range(2, polynomials + 1):
        poly_col = f'{running_x}_p{p}'
        interact_col = f'interact_p{p}'
        df_subset[poly_col] = df_subset[running_x]**p
        df_subset[interact_col] = df_subset[poly_col] * df_subset[treatment_x]
        ind_vars.extend([poly_col, interact_col])

    ind_vars = '+'.join(ind_vars)
    print(ind_vars)

    # Named `fit` so the local does not reuse the name of the sibling `rdd` function.
    fit = smf.ols(f'{y} ~ {ind_vars}', data = df_subset).fit()

    output_df = pd.DataFrame({'coef': fit.params, 't_stat': fit.tvalues})\
        .reset_index()\
        .rename({'index': 'x'}, axis = 1)

    return output_df

## Results

In [120]:
def print_rdd2_results(y_var):
    """Print the size-controlled RDD t-statistic on ``russell2000`` for ``y_var``.

    Runs ``rdd2`` on the notebook-level ``df`` with ``weight_rank`` as the
    running variable over four bands, in order: 0..50, 0..100, 50..100
    and 100..200.
    """
    bands = ((0, 50), (0, 100), (50, 100), (100, 200))
    for inner_edge, outer_edge in bands:
        coefs = rdd2(
            df,
            y = y_var,
            running_x = 'weight_rank',
            treatment_x = 'russell2000',
            bw_start = inner_edge,
            bw_end = outer_edge,
        )
        is_treatment = coefs['x'] == 'russell2000'
        print(coefs.loc[is_treatment, 't_stat'].values[0])

In [122]:
# Size-controlled RDD t-stats on russell2000 for y1_spread, one per weight_rank band.
print_rdd2_results('y1_spread')

weight_rank+russell2000+interact+mktcap+weight_rank_p2+interact_p2+weight_rank_p3+interact_p3
-7.161538734832559
weight_rank+russell2000+interact+mktcap+weight_rank_p2+interact_p2+weight_rank_p3+interact_p3
-11.887699584542132
weight_rank+russell2000+interact+mktcap+weight_rank_p2+interact_p2+weight_rank_p3+interact_p3
-1.63809436734944
weight_rank+russell2000+interact+mktcap+weight_rank_p2+interact_p2+weight_rank_p3+interact_p3
-0.12965197488197974


In [123]:
# Size-controlled RDD t-stats on russell2000 for y1_turnover, one per weight_rank band.
print_rdd2_results('y1_turnover')

weight_rank+russell2000+interact+mktcap+weight_rank_p2+interact_p2+weight_rank_p3+interact_p3
10.2431264202977
weight_rank+russell2000+interact+mktcap+weight_rank_p2+interact_p2+weight_rank_p3+interact_p3
13.61932106364578
weight_rank+russell2000+interact+mktcap+weight_rank_p2+interact_p2+weight_rank_p3+interact_p3
0.023202006394033563
weight_rank+russell2000+interact+mktcap+weight_rank_p2+interact_p2+weight_rank_p3+interact_p3
-3.436526720382295


In [124]:
# Size-controlled RDD t-stats on russell2000 for y1_dollar_vol, one per weight_rank band.
print_rdd2_results('y1_dollar_vol')

weight_rank+russell2000+interact+mktcap+weight_rank_p2+interact_p2+weight_rank_p3+interact_p3
16.05762491480111
weight_rank+russell2000+interact+mktcap+weight_rank_p2+interact_p2+weight_rank_p3+interact_p3
20.248612779737243
weight_rank+russell2000+interact+mktcap+weight_rank_p2+interact_p2+weight_rank_p3+interact_p3
0.4774299369298698
weight_rank+russell2000+interact+mktcap+weight_rank_p2+interact_p2+weight_rank_p3+interact_p3
-1.0898906616420136


# Save results