In [1]:
import numpy as np
import pandas as pd
import statsmodels as sms
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection

import seaborn as sns # for data visualization
sns.set_style("whitegrid")

from dateutil.relativedelta import *
from pandas.tseries.offsets import *

pd.set_option('display.max_columns', None)

In [2]:
import os 

LOOKUP_FOLDER = '~/misp/lookup_tables'

def clean_ccm(ccm):
    ccm['permno'] = ccm['permno'].astype(int).astype(str)
    ccm['gvkey'] = ccm['gvkey'].astype(str)
    ccm['linkdt'] = pd.to_datetime(ccm['linkdt'])
    ccm['linkenddt'] = pd.to_datetime(ccm['linkenddt'])
    isnull = ccm['linkenddt'].isnull()
    ccm.loc[isnull, 'linkenddt'] = pd.Timestamp(year=2047, month=7, day=1)
    return ccm

ccm = pd.read_pickle(os.path.join(LOOKUP_FOLDER, 'ccm.pkl'))
ccm_raw = pd.read_pickle(os.path.join(LOOKUP_FOLDER, 'ccm_raw.pkl'))
ccm, ccm_raw = clean_ccm(ccm), clean_ccm(ccm_raw)

In [3]:
syy_a = pd.read_csv('~/misp_data/syy/annualfactor.csv', index_col=0)
syy_m = pd.read_csv('~/misp_data/syy/monthlyfactor.csv', index_col=0)
syy_q = pd.read_csv('~/misp_data/syy/quarterlyfactor.csv', index_col=0)

ccm['gvkey'] = ccm['gvkey'].astype(int)
syy_q = syy_q.merge(ccm, on=['gvkey'])
syy_q = syy_q[['permno', 'year', 'month', 'retonat11']]
syy_a['year']=syy_a.index
syy_a = syy_a.merge(ccm, on=['gvkey'])
syy_a = syy_a[['permno', 'year', 'month', 'netstkis1' ,'accruals3', 'netopat4','atgr5','invtoat6','o8','grpf10']]
syy_q['date'] = pd.to_datetime(syy_q[['year', 'month']].assign(DAY=1))
syy_a['date'] = pd.to_datetime(syy_a[['year', 'month']].assign(DAY=1))
syy_a = syy_a.sort_values(['date'], ascending=True).set_index('date')
syy_q = syy_q.sort_values(['date'], ascending=True).set_index('date')
syy_a = syy_a.drop(columns=['year', 'month'])
syy_q = syy_q.drop(columns=['year', 'month'])
syy_m['date'] = pd.to_datetime(syy_m[['year', 'month']].assign(DAY=1))
syy_m = syy_m.drop(columns=['year', 'month'])
syy_m['permno'] = syy_m.index.astype(int)
syy_m = syy_m.sort_values(['date'], ascending=True).set_index('date')

  mask |= (ar1 == a)


In [4]:
syy_aq = pd.merge_asof(syy_q, syy_a, on='date', direction='backward', by='permno')
syy_aq['permno'] = syy_aq['permno'].astype(int)
syy_aqm = pd.merge_asof(syy_m, syy_aq, left_index=True, right_on='date', direction='backward', by='permno')

In [8]:
syy = syy_aqm.dropna(thresh=10) 
syy['date']=syy['date']+MonthEnd(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [9]:
syy.tail()

Unnamed: 0,compeqis2,mmt9,permno,date,retonat11,netstkis1,accruals3,netopat4,atgr5,invtoat6,o8,grpf10
881608,-0.054699,4.0,34817,2019-12-31,0.015173,1.0,0.15786,0.75771,0.040578,0.100876,-4.940793,0.208094
881254,-0.048032,7.0,11891,2019-12-31,-0.001305,1.0,-0.010576,0.678001,0.034993,0.067333,-3.10335,0.158026
880777,-0.071108,4.0,30681,2019-12-31,0.011588,1.0,-0.036759,0.174238,0.076226,0.002033,-4.333738,0.11198
881755,0.012172,2.0,13825,2019-12-31,0.019411,,-0.055376,0.281669,0.161264,0.007114,-6.724171,0.081898
882609,0.772942,3.0,14505,2019-12-31,0.001135,7.0,0.009321,0.55345,0.025487,0.073171,-3.405293,0.07884


In [7]:
%%time
ret_decomp_dcf3 = pd.read_csv('~/misp_data/decomp_dcf3y_hist_1960_2016.csv', index_col=0)
ret_decomp_dcf5 = pd.read_csv('~/misp_data/decomp_dcf5y_hist_1960_2014.csv', index_col=0)
ret_decomp_dcf10 = pd.read_csv('~/misp_data/decomp_dcf10y_hist_1960_2004.csv', index_col=0)

CPU times: user 35.2 s, sys: 3.02 s, total: 38.3 s
Wall time: 39.6 s


In [14]:
ret_decomp_dcf5['jdate_crsp'] = pd.to_datetime(ret_decomp_dcf5.jdate_crsp)

In [15]:
mdf5 = syy.merge(ret_decomp_dcf5, left_on=['date', 'permno'], right_on=['jdate_crsp', 'permno'], how='inner')

In [17]:
mdf5 = mdf5.sort_values(['permno', 'date'], ascending=True).set_index('date')

In [25]:
import mkl
mkl.set_num_threads(5)

def ols_coef(x,formula):
    return smf.ols(formula,data=x).fit().params

def fama_macbeth_summary(p):
    s = p.describe().T
    s['std_error'] = s['std']/np.sqrt(s['count'])
    s['tstat'] = s['mean']/s['std_error']
    return s[['mean','std_error','tstat', 'count', 'std']]

def fama_macbeth_reg_panel(regdf, xname='misp', yname='ret', 
                     csname='permno', tsname='jdate_crsp'): 
    # Panel factor setting FM reg
    gamma_cs = (regdf.groupby(tsname).apply(ols_coef,f'{yname} ~ {xname}'))
    gamma_cs = gamma_cs.rename(columns={"Intercept": "alpha_i_t", f"beta_{xname}": "lamba_t"})
    return fama_macbeth_summary(gamma_cs)

In [30]:
for var in ['compeqis2', 'mmt9','retonat11', 'netstkis1', 'accruals3',
              'netopat4', 'atgr5', 'invtoat6', 'o8', 'grpf10']:
    regdf = mdf5[['jdate_crsp', 'permno', 'ticker', var, 'misp']].replace([np.inf, -np.inf], np.nan).dropna()
    print(f'Misp ~ {var}, 5Yr-DCF')
    print(fama_macbeth_reg_panel(regdf, xname=var, yname='misp'))
    print('')

Misp ~ compeqis2, 5Yr-DCF
               mean  std_error      tstat  count       std
alpha_i_t  1.239963   0.016060  77.208349  519.0  0.365871
compeqis2  0.337379   0.014509  23.253590  519.0  0.330531

Misp ~ mmt9, 5Yr-DCF
               mean  std_error      tstat  count       std
alpha_i_t  1.280092   0.018581  68.892285  519.0  0.423306
mmt9      -0.008076   0.001675  -4.821499  519.0  0.038157

Misp ~ retonat11, 5Yr-DCF
               mean  std_error      tstat  count       std
alpha_i_t  1.222612   0.018707  65.356512  518.0  0.425760
retonat11  0.438731   0.196049   2.237865  518.0  4.462002

Misp ~ netstkis1, 5Yr-DCF
               mean  std_error      tstat  count       std
alpha_i_t  1.183200   0.016128  73.363583  519.0  0.367419
netstkis1 -0.000438   0.000726  -0.603672  519.0  0.016539

Misp ~ accruals3, 5Yr-DCF
               mean  std_error      tstat  count       std
alpha_i_t  1.244900   0.016524  75.337149  519.0  0.376452
accruals3  0.038604   0.029043   1.329210  51

In [31]:
misp_scroe = pd.read_csv('~/misp_data/Misp_Score.csv', index_col=0)
misp_scroe['date'] = pd.to_datetime(misp_scroe['yyyymm'], format='%Y%m') + MonthEnd(0)

In [49]:
mdf5_with_score = mdf5.merge(misp_scroe, left_on=['jdate_crsp', 'permno'], right_on=['date', 'permno'], how='inner')
mdf5_with_score['ret'] = np.exp(mdf5_with_score['r_t+60'])-1
mdf5_with_score['ret_ct'] = np.exp(mdf5_with_score['r*_t+60']-1)
mdf5_with_score['ret_chg'] = np.exp(mdf5_with_score['r_chg+60'])
mdf5_with_score['avg_score'] = mdf5_with_score['avg_score']/mdf5_with_score['avg_score'].mean()

In [51]:
for var in ['misp', 'ret', 'ret_ct', 'ret_chg']:
    regdf = mdf5_with_score[['jdate_crsp', 'permno', 'ticker', var, 'avg_score']].replace([np.inf, -np.inf], np.nan).dropna()
    print(f'{var} ~ Misp Score, 5Yr-DCF')
    print(fama_macbeth_reg_panel(regdf, xname='avg_score', yname=var))
    print('')

misp ~ Misp Score, 5Yr-DCF
               mean  std_error      tstat  count       std
alpha_i_t  1.138278   0.016510  68.945170  519.0  0.376122
avg_score  0.094649   0.008144  11.622263  519.0  0.185528

ret ~ Misp Score, 5Yr-DCF
               mean  std_error      tstat  count       std
alpha_i_t  2.172194   0.106655  20.366602  519.0  2.429762
avg_score -0.292415   0.091624  -3.191453  519.0  2.087349

ret_ct ~ Misp Score, 5Yr-DCF
               mean  std_error      tstat  count       std
alpha_i_t  0.124456   0.040125   3.101707  483.0  0.881841
avg_score  0.832374   0.052210  15.942827  483.0  1.147431

ret_chg ~ Misp Score, 5Yr-DCF
               mean  std_error     tstat  count       std
alpha_i_t  1.699340   0.191915  8.854663  483.0  4.217758
avg_score  1.192378   0.208746  5.712113  483.0  4.587655

