In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
import statsmodels.api as sm
from linearmodels import PanelOLS

In [2]:
# Read the merged Stata file from `url` and order the rows by firm (cusip)
# and month (yyyymm).
url = 'https://www.dropbox.com/s/uso1u9asqam7rp1/merged.dta?dl=1'
data = (
    pd.read_stata(url)
    # store yyyymm as an integer
    .assign(yyyymm=lambda frame: frame['yyyymm'].astype(int))
    # ignore_index=True renumbers the rows 0..n-1 after sorting
    .sort_values(['cusip', 'yyyymm'], ignore_index=True)
)

In [3]:
# Build the firm-year regression sample: `ret` and the previous year's log
# market equity (`lag_lnme`) for every cusip-year that has both.
data['ret'] = pd.to_numeric(data['ret'], errors='coerce')  # unparseable returns become NaN
data['year'] = (data['yyyymm'] // 100).astype(int)  # yyyymm -> yyyy
# One row per (cusip, year): drop_duplicates keeps the first row in the
# current row order.
data = data.drop_duplicates(['cusip', 'year']).copy()
# Non-positive market equity has no finite log; mask it to NaN so the row is
# removed by the dropna below instead of carrying -inf into the regressions.
data['lnme'] = np.log(data['me'].where(data['me'] > 0))
data = data.sort_values(['cusip', 'year'], ignore_index=True)
by_firm = data.groupby('cusip')
# Use the previous row's lnme only when that row is exactly one year earlier;
# after a gap in a firm's history the lag is left missing.
year_gap = data['year'] - by_firm['year'].shift(1)
data['lag_lnme'] = by_firm['lnme'].shift(1).where(year_gap == 1)
data = data.dropna(subset=['lag_lnme', 'ret'], how='any')
# .copy() makes the result an independent frame, so adding columns to it
# later does not trigger SettingWithCopyWarning.
data = data[['cusip', 'year', 'ret', 'lag_lnme']].copy()

In [4]:
# Pooled OLS of ret on a constant and lagged log market equity.
# The model is built once and fitted twice: with the default covariance
# (kept in `est`, which later cells use) and with HC0 robust standard errors.
ols_model = sm.OLS(data['ret'], sm.add_constant(data['lag_lnme']))
est = ols_model.fit()
# Only the last expression of a cell is displayed, so a bare `est.summary()`
# here would be computed and thrown away; print it explicitly.
print(est.summary())

# robust standard error
ols_model.fit(cov_type='hc0', use_t=True).summary()

0,1,2,3
Dep. Variable:,ret,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,45.41
Date:,"Wed, 16 Mar 2022",Prob (F-statistic):,1.61e-11
Time:,22:02:17,Log-Likelihood:,29162.0
No. Observations:,60822,AIC:,-58320.0
Df Residuals:,60820,BIC:,-58300.0
Df Model:,1,,
Covariance Type:,hc0,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0224,0.002,9.586,0.000,0.018,0.027
lag_lnme,-0.0022,0.000,-6.739,0.000,-0.003,-0.002

0,1,2,3
Omnibus:,51007.444,Durbin-Watson:,1.971
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7467699.262
Skew:,3.364,Prob(JB):,0.0
Kurtosis:,56.865,Cond. No.,20.4


In [5]:
# Recompute the OLS fitted values and residuals by hand (`*_calc`) and store
# them next to the values reported by statsmodels (`*_est`).
# Coefficients are looked up by label: integer keys such as params[0] on a
# label-indexed Series are a deprecated positional fallback in pandas.
data['a'] = est.params['const']      # intercept, broadcast to every row
data['b'] = est.params['lag_lnme']   # slope, broadcast to every row

data['p_calc'] = data['a'] + data['b']*data['lag_lnme']
# predict() with no arguments returns in-sample predictions as an array, so
# this assignment is positional.
data['p_est'] = est.predict()

data['e_calc'] = data['ret'] - data['p_calc']
data['e_est'] = est.resid

In [6]:
# Regression of ret on lagged log market equity with entity (cusip) effects,
# using the default covariance estimator.
# The frame is indexed by (cusip, year) before it is handed to PanelOLS.
data1 = data.set_index(['cusip', 'year'])
panel_est = PanelOLS(
    dependent=data1['ret'],
    exog=sm.add_constant(data1['lag_lnme']),
    entity_effects=True,
).fit()
panel_est.summary

0,1,2,3
Dep. Variable:,ret,R-squared:,0.0178
Estimator:,PanelOLS,R-squared (Between):,-0.2650
No. Observations:,60822,R-squared (Within):,0.0178
Date:,"Wed, Mar 16 2022",R-squared (Overall):,-0.1220
Time:,22:02:17,Log-likelihood,3.549e+04
Cov. Estimator:,Unadjusted,,
,,F-statistic:,966.69
Entities:,7407,P-value,0.0000
Avg Obs:,8.2114,Distribution:,"F(1,53414)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,0.1768,0.0054,32.602,0.0000,0.1662,0.1874
lag_lnme,-0.0276,0.0009,-31.092,0.0000,-0.0293,-0.0258


In [7]:
# Same entity-effects specification, refitted with cov_type='robust'.
# Overwrites `panel_est`.
panel_est = PanelOLS(
    dependent=data1['ret'],
    exog=sm.add_constant(data1['lag_lnme']),
    entity_effects=True,
).fit(cov_type='robust')
panel_est.summary

0,1,2,3
Dep. Variable:,ret,R-squared:,0.0178
Estimator:,PanelOLS,R-squared (Between):,-0.2650
No. Observations:,60822,R-squared (Within):,0.0178
Date:,"Wed, Mar 16 2022",R-squared (Overall):,-0.1220
Time:,22:02:18,Log-likelihood,3.549e+04
Cov. Estimator:,Robust,,
,,F-statistic:,966.69
Entities:,7407,P-value,0.0000
Avg Obs:,8.2114,Distribution:,"F(1,53414)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,0.1768,0.0073,24.053,0.0000,0.1624,0.1912
lag_lnme,-0.0276,0.0012,-23.270,0.0000,-0.0299,-0.0253


In [8]:
# Same entity-effects specification, refitted with standard errors clustered
# by entity (cusip). Overwrites `panel_est`.
panel_est = PanelOLS(
    dependent=data1['ret'],
    exog=sm.add_constant(data1['lag_lnme']),
    entity_effects=True,
).fit(cov_type='clustered', cluster_entity=True)
panel_est.summary

0,1,2,3
Dep. Variable:,ret,R-squared:,0.0178
Estimator:,PanelOLS,R-squared (Between):,-0.2650
No. Observations:,60822,R-squared (Within):,0.0178
Date:,"Wed, Mar 16 2022",R-squared (Overall):,-0.1220
Time:,22:02:18,Log-likelihood,3.549e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,966.69
Entities:,7407,P-value,0.0000
Avg Obs:,8.2114,Distribution:,"F(1,53414)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,0.1768,0.0074,23.806,0.0000,0.1622,0.1913
lag_lnme,-0.0276,0.0012,-22.571,0.0000,-0.0300,-0.0252
