In [1]:
import wrds
import time
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from datetime import datetime
from linearmodels.iv import IV2SLS
from linearmodels.iv import IVGMM

### 1. Load Data and Merge IV Data

In [2]:
df_regression = pd.read_csv("df_main_reg.csv")
df_rank_iv = pd.read_csv("../Ressell_Rank/RR_data/df_rank_iv.csv") 
# '../' means move up one directory

#### -1) Merge the instrument to each quarter
- the logic of the Russell 2000 instrument applies from Q3 of the reconstitution year through Q2 of the following year.
- e.g. 2020.5.31 affects: 2020Q3, 2020Q4, 2021Q1, 2021Q2

In [3]:
# df_rank_iv: 5.31 of each year; df_regression: March, June, Sep, Dec of each year

# Step 1: Convert dates to datetime
df_regression['date'] = pd.to_datetime(df_regression['date'])
df_rank_iv['date'] = pd.to_datetime(df_rank_iv['date'])

In [4]:
# Step 2: Create effective year and quarter range for the instrument
# Shift date in the iv data forward by one month (to July 1st)
df_rank_iv['effective_start'] = df_rank_iv['date'] + pd.DateOffset(days=30)
df_rank_iv['effective_end'] = df_rank_iv['effective_start'] + pd.DateOffset(months=12)

In [5]:
df_rank_iv.head(3)

Unnamed: 0,permno,permco,cusip,date,mrkcap_crsp,tot_mrkcap_crsp,cusip_co,gvkey,prccm,cshom,...,ajexm,cshoq,ajexq,tot_mktcap_COMP_ALL,tot_mktcap_r3,mkt_value,Rank,assigned_to_R2000,effective_start,effective_end
0,10002,7954,05978R10,1999-05-31,103.737672,103.737672,05978R,19049.0,13.421875,7.7311,...,1.0,7.729,1.0,103.765858,103.765858,103.765858,4703.0,1,1999-06-30,2000-06-30
1,10002,7954,05978R10,2001-05-31,97.29307,97.29307,05978R,19049.0,11.41,8.559727,...,1.0,8.527,1.0,97.666488,97.666488,97.666488,4452.0,1,2001-06-30,2002-06-30
2,10002,7954,05978R10,2002-05-31,100.734335,100.734335,05978R,19049.0,11.720045,8.368,...,1.0,8.367,1.0,98.07334,100.734335,100.734335,4302.0,1,2002-06-30,2003-06-30


##### clean data before merge
- remove redundant columns from df_rank_iv
- kept only cusips of rank<=5000

In [6]:
# Keep only CUSIPs from df_rank_iv
valid_cusips = df_rank_iv['cusip'].unique()
df_regression_clean = df_regression[df_regression['cusip'].isin(valid_cusips)].copy()

In [7]:
# Remove redundant columns
df_rank_iv_clean = df_rank_iv[['cusip', 'effective_start', 'effective_end', 'assigned_to_R2000']].copy()

##### Exapnd the iv data to quarter frequency for merge

In [8]:
# Vectorized expansion using cross join with quarterly dates
# Create quarterly dates covering the full range
min_date = df_rank_iv_clean['effective_start'].min()
max_date = df_rank_iv_clean['effective_end'].max()
quarter_ends = pd.date_range(start=min_date, end=max_date, freq='Q')  # Quarter ends: Mar 31, Jun 30, Sep 30, Dec 31

  quarter_ends = pd.date_range(start=min_date, end=max_date, freq='Q')  # Quarter ends: Mar 31, Jun 30, Sep 30, Dec 31


In [9]:
#Create DataFrame of all quarters
quarter_df = pd.DataFrame({'date': quarter_ends})
quarter_df['key'] = 1

#Add a key to df_rank_iv_clean for cross join
df_rank_iv_clean['key'] = 1

#Cross join to assign each firm all quarters
expanded = df_rank_iv_clean.merge(quarter_df, on='key').drop('key', axis=1)

#Keep only quarters within [effective_start, effective_end]
expanded = expanded[
    (expanded['date'] >= expanded['effective_start']) &
    (expanded['date'] <= expanded['effective_end'])
].copy()

#Keep only the useful columns
df_iv_quarters = expanded[['cusip', 'date', 'assigned_to_R2000']]

In [10]:
df_iv_quarters.head(3)

Unnamed: 0,cusip,date,assigned_to_R2000
0,05978R10,1999-06-30,1
1,05978R10,1999-09-30,1
2,05978R10,1999-12-31,1


##### merge iv data to df_regression

In [12]:
# Perform left merge so df_regression keeps all rows
df_regression_iv = df_regression_clean.merge(df_iv_quarters, on=['cusip', 'date'], how='left')

# Fill missing assigned_to_R2000 with 0 (not assigned)
df_regression_iv['assigned_to_R2000'] = df_regression_iv['assigned_to_R2000'].fillna(0).astype(int)

In [13]:
df_regression_iv.head(3)

Unnamed: 0,cusip,date,bas,tno,mktcap,price_ind,volume,illiq,volatility,synch,ind_own,act_own,ins_own,auto_lag1,auto_lag2,auto_lag3,auto_lag4,assigned_to_R2000
0,30710,2014-12-31,0.084839,332.131683,178.587706,0.039206,7.001668,1.135563,0.277043,-3.648,0.012341,0.084358,0.234046,0.041,0.169,0.203,0.078,0
1,30710,2015-03-31,0.051639,443.806176,274.286735,0.03411,9.355878,1.095236,0.317542,-1.841,0.012512,0.131629,0.329042,0.223,0.12,0.025,0.305,0
2,30710,2015-06-30,0.04619,581.424544,465.762113,0.02719,12.664008,0.313499,0.249738,-2.326,0.017502,0.114851,0.349641,0.106,0.062,0.202,0.156,1


In [14]:
df_regression_iv.shape

(290302, 18)

In [16]:
file_path_1 = 'df_regression_iv.csv' 
df_regression_iv.to_csv(file_path_1, index=False)

### 2. OLS Regression with IV

In [17]:
# Define independent variables
market_quality_vars = ['bas', 'tno', 'illiq', 'volatility', 'synch', 'auto_lag1', 'auto_lag2', 'auto_lag3', 'auto_lag4']
Y = df_regression_iv[market_quality_vars]

In [18]:
# Define X vars
index_var = ['ind_own']
control_vars = ['mktcap', 'price_ind', 'volume']

X_endog = df_regression_iv[index_var]
X_exog = sm.add_constant(df_regression_iv[control_vars]) #control vars are exogenous and add constant

In [19]:
# Define Instruments
instr_vars = ['assigned_to_R2000']
Z_instr = df_regression_iv[instr_vars]

#### Testing 2SLS Regression

In [29]:
# Example data (replace with your actual data)
data = pd.DataFrame({'y': [1, 2, 3, 4, 5], 'endog': [2, 3, 4, 5, 6], 'exog': [1, 2, 1, 2, 1], 'instruments': [3, 4, 5, 6, 7]})

# Define the model
formula = 'y ~ exog + [endog ~ instruments]'
mod = IV2SLS.from_formula(formula, data)
res = mod.fit()

# Access first-stage results
first_stage_results = res.first_stage
print(first_stage_results.diagnostics)

       rsquared  partial.rsquared  shea.rsquared       f.stat  f.pval   f.dist
endog  0.997436          0.987677       0.987677  2477.854767     0.0  chi2(1)


#### IV Results

In [21]:
# Collect results
iv_results = {}

for yvar in Y.columns:
    y = df_regression_iv[yvar]

    model = IV2SLS(
        dependent=y,
        exog=X_exog,
        endog=X_endog,
        instruments=Z_instr
    ).fit(cov_type='robust')

    iv_results[yvar] = model

In [22]:
# Collect summary statistics in a table
summary_table = pd.DataFrame({
    var: {
        'coef_ind_own': model.params.get('ind_own', float('nan')),
        'pval_ind_own': model.pvalues.get('ind_own', float('nan')),
        'r2': model.rsquared,
        'first_stage_F': model.first_stage.diagnostics.get('f.stat', float('nan')).iloc[0]
    }
    for var, model in iv_results.items()
}).T


print(summary_table)

            coef_ind_own  pval_ind_own        r2  first_stage_F
bas            -1.763263      0.000000 -0.017424    6127.655981
tno          7597.325190      0.000000 -0.079721    6127.655981
illiq       -1518.313973      0.000000 -0.026057    6127.655981
volatility      0.837717      0.000000  0.034907    6127.655981
synch          12.496932      0.000000 -0.096696    6127.655981
auto_lag1      -0.178987      0.000000  0.001134    6127.655981
auto_lag2      -0.174687      0.000000 -0.010134    6127.655981
auto_lag3       0.002546      0.877229  0.000061    6127.655981
auto_lag4       0.015122      0.361680 -0.000172    6127.655981


In [80]:
model.first_stage.diagnostics

Unnamed: 0,rsquared,partial.rsquared,shea.rsquared,f.stat,f.pval,f.dist
ind_own,0.028917,0.001656,0.001656,647.760303,0.0,chi2(3)


In [None]:
### 2. GMM(Generalized Method of Moments) Regression

In [23]:
gmm_results = {}

for yvar in Y.columns:
    y = df_regression_iv[yvar]
    
    model = IVGMM(
        dependent=y,
        exog=X_exog,
        endog=X_endog,
        instruments=Z_instr
    )
    results = model.fit(cov_type='robust')
    gmm_results[yvar] = results
    
    print(f"Results for {yvar}")
    print(results.summary)

Results for bas
                          IV-GMM Estimation Summary                           
Dep. Variable:                    bas   R-squared:                     -0.0174
Estimator:                     IV-GMM   Adj. R-squared:                -0.0174
No. Observations:              290302   F-statistic:                    807.63
Date:                Wed, Jun 18 2025   P-value (F-stat)                0.0000
Time:                        04:12:52   Distribution:                  chi2(4)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const          0.2133     0.0056     38.183     0.0000      0.2024      0.2243
mktcap      6.313e-07  6.035e-08    

In [24]:
summary_data = []

for yvar, model in gmm_results.items():
    summary_data.append({
        'yvar': yvar,
        'coef_ind_own': model.params.get('ind_own', float('nan')),
        'pval_ind_own': model.pvalues.get('ind_own', float('nan')),
        'r2': model.rsquared,
        'J-stat': model.j_stat.stat,
        'J-pval': model.j_stat.pval
    })

summary_table = pd.DataFrame(summary_data).set_index('yvar')
print(summary_table)

            coef_ind_own  pval_ind_own        r2        J-stat  J-pval
yvar                                                                  
bas            -1.763263      0.000000 -0.017424  1.535018e-21     NaN
tno          7597.325190      0.000000 -0.079721  1.054682e-21     NaN
illiq       -1518.313973      0.000000 -0.026057  8.916299e-23     NaN
volatility      0.837717      0.000000  0.034907  1.487093e-21     NaN
synch          12.496932      0.000000 -0.096696  7.099225e-23     NaN
auto_lag1      -0.178987      0.000000  0.001134  2.870644e-21     NaN
auto_lag2      -0.174687      0.000000 -0.010134  4.900727e-21     NaN
auto_lag3       0.002546      0.877229  0.000061  9.371964e-21     NaN
auto_lag4       0.015122      0.361680 -0.000172  8.144827e-21     NaN
