In [1]:
import numpy as np
import pandas as pd
import statsmodels as sms
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection

import seaborn as sns # for data visualization
sns.set_style("whitegrid")

from dateutil.relativedelta import *
from pandas.tseries.offsets import *

pd.set_option('display.max_columns', None)

In [2]:
import os 

LOOKUP_FOLDER = '~/misp/lookup_tables'

def clean_ccm(ccm):
    ccm['permno'] = ccm['permno'].astype(int).astype(str)
    ccm['gvkey'] = ccm['gvkey'].astype(str)
    ccm['linkdt'] = pd.to_datetime(ccm['linkdt'])
    ccm['linkenddt'] = pd.to_datetime(ccm['linkenddt'])
    isnull = ccm['linkenddt'].isnull()
    ccm.loc[isnull, 'linkenddt'] = pd.Timestamp(year=2047, month=7, day=1)
    return ccm

ccm = pd.read_pickle(os.path.join(LOOKUP_FOLDER, 'ccm.pkl'))
ccm_raw = pd.read_pickle(os.path.join(LOOKUP_FOLDER, 'ccm_raw.pkl'))
ccm, ccm_raw = clean_ccm(ccm), clean_ccm(ccm_raw)

In [3]:
syy_a = pd.read_csv('~/misp_data/syy/annualfactor.csv', index_col=0)
syy_m = pd.read_csv('~/misp_data/syy/monthlyfactor.csv', index_col=0)
syy_q = pd.read_csv('~/misp_data/syy/quarterlyfactor.csv', index_col=0)

ccm['gvkey'] = ccm['gvkey'].astype(int)
syy_q = syy_q.merge(ccm, on=['gvkey'])
syy_q = syy_q[['permno', 'year', 'month', 'retonat11']]
syy_a['year']=syy_a.index
syy_a = syy_a.merge(ccm, on=['gvkey'])
syy_a = syy_a[['permno', 'year', 'month', 'netstkis1' ,'accruals3', 'netopat4','atgr5','invtoat6','o8','grpf10']]
syy_q['date'] = pd.to_datetime(syy_q[['year', 'month']].assign(DAY=1))
syy_a['date'] = pd.to_datetime(syy_a[['year', 'month']].assign(DAY=1))
syy_a = syy_a.sort_values(['date'], ascending=True).set_index('date')
syy_q = syy_q.sort_values(['date'], ascending=True).set_index('date')
syy_a = syy_a.drop(columns=['year', 'month'])
syy_q = syy_q.drop(columns=['year', 'month'])
syy_m['date'] = pd.to_datetime(syy_m[['year', 'month']].assign(DAY=1))
syy_m = syy_m.drop(columns=['year', 'month'])
syy_m['permno'] = syy_m.index.astype(int)
syy_m = syy_m.sort_values(['date'], ascending=True).set_index('date')

  mask |= (ar1 == a)


In [4]:
syy_aq = pd.merge_asof(syy_q, syy_a, on='date', direction='backward', by='permno')
syy_aq['permno'] = syy_aq['permno'].astype(int)
syy_aqm = pd.merge_asof(syy_m, syy_aq, left_index=True, right_on='date', direction='backward', by='permno')

In [5]:
syy = syy_aqm.dropna(thresh=10) 
syy['date']=syy['date']+MonthEnd(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [6]:
syy.tail()

Unnamed: 0,compeqis2,mmt9,permno,date,retonat11,netstkis1,accruals3,netopat4,atgr5,invtoat6,o8,grpf10
881608,-0.054699,4.0,34817,2019-12-31,0.015173,1.0,0.15786,0.75771,0.040578,0.100876,-4.940793,0.208094
881254,-0.048032,7.0,11891,2019-12-31,-0.001305,1.0,-0.010576,0.678001,0.034993,0.067333,-3.10335,0.158026
880777,-0.071108,4.0,30681,2019-12-31,0.011588,1.0,-0.036759,0.174238,0.076226,0.002033,-4.333738,0.11198
881755,0.012172,2.0,13825,2019-12-31,0.019411,,-0.055376,0.281669,0.161264,0.007114,-6.724171,0.081898
882609,0.772942,3.0,14505,2019-12-31,0.001135,7.0,0.009321,0.55345,0.025487,0.073171,-3.405293,0.07884


In [8]:
%%time
ret_decomp_dcf3 = pd.read_csv('~/misp_data/decomp_dcf3y_RF_1970_2019.csv', index_col=0)
ret_decomp_dcf5 = pd.read_csv('~/misp_data/decomp_dcf5y_RF_1970_2019.csv', index_col=0)
ret_decomp_dcf10 = pd.read_csv('~/misp_data/decomp_dcf10y_RF_1970_2019.csv', index_col=0)

CPU times: user 1min 16s, sys: 5.16 s, total: 1min 22s
Wall time: 1min 22s


In [9]:
ret_decomp_dcf5['jdate_crsp'] = pd.to_datetime(ret_decomp_dcf5.jdate_crsp)

In [10]:
mdf5 = syy.merge(ret_decomp_dcf5, left_on=['date', 'permno'], right_on=['jdate_crsp', 'permno'], how='inner')

In [11]:
mdf5 = mdf5.sort_values(['permno', 'date'], ascending=True).set_index('date')

In [12]:
# import mkl
# mkl.set_num_threads(5)

def ols_coef(x,formula):
    return smf.ols(formula,data=x).fit().params

def fama_macbeth_summary(p):
    s = p.describe().T
    s['std_error'] = s['std']/np.sqrt(s['count'])
    s['tstat'] = s['mean']/s['std_error']
    return s[['mean','std_error','tstat', 'count', 'std']]

def fama_macbeth_reg_panel(regdf, xname='misp', yname='ret', 
                     csname='permno', tsname='jdate_crsp'): 
    # Panel factor setting FM reg
    gamma_cs = (regdf.groupby(tsname).apply(ols_coef,f'{yname} ~ {xname}'))
    gamma_cs = gamma_cs.rename(columns={"Intercept": "alpha_i_t", f"beta_{xname}": "lamba_t"})
    return fama_macbeth_summary(gamma_cs)

In [13]:
for var in ['compeqis2', 'mmt9','retonat11', 'netstkis1', 'accruals3',
              'netopat4', 'atgr5', 'invtoat6', 'o8', 'grpf10']:
    regdf = mdf5[['jdate_crsp', 'permno', 'ticker', var, 'misp']].replace([np.inf, -np.inf], np.nan).dropna()
    print(f'Misp ~ {var}, 5Yr-DCF')
    print(fama_macbeth_reg_panel(regdf, xname=var, yname='misp'))
    print('')

Misp ~ compeqis2, 5Yr-DCF
               mean  std_error      tstat  count       std
alpha_i_t  1.556183   0.016683  93.278351  580.0  0.401785
compeqis2  0.648042   0.016964  38.200977  580.0  0.408548

Misp ~ mmt9, 5Yr-DCF
               mean  std_error      tstat  count       std
alpha_i_t  1.206108   0.017336  69.573040  580.0  0.417503
mmt9       0.048464   0.001714  28.281627  580.0  0.041269

Misp ~ retonat11, 5Yr-DCF
               mean  std_error      tstat  count       std
alpha_i_t  1.403988   0.023083  60.824526  578.0  0.554943
retonat11  3.835223   0.357494  10.728086  578.0  8.594730

Misp ~ netstkis1, 5Yr-DCF
               mean  std_error      tstat  count       std
alpha_i_t  1.219704   0.019956  61.120103  580.0  0.480601
netstkis1  0.049924   0.000971  51.414285  580.0  0.023385

Misp ~ accruals3, 5Yr-DCF
               mean  std_error      tstat  count       std
alpha_i_t  1.481999   0.019083  77.662648  580.0  0.459568
accruals3 -0.021249   0.010927  -1.944594  58

In [14]:
misp_scroe = pd.read_csv('~/misp_data/Misp_Score.csv', index_col=0)
misp_scroe['date'] = pd.to_datetime(misp_scroe['yyyymm'], format='%Y%m') + MonthEnd(0)

In [15]:
mdf5_with_score = mdf5.merge(misp_scroe, left_on=['jdate_crsp', 'permno'], right_on=['date', 'permno'], how='inner')
mdf5_with_score['ret'] = np.exp(mdf5_with_score['r_t+60'])-1
mdf5_with_score['ret_ct'] = np.exp(mdf5_with_score['r*_t+60']-1)
mdf5_with_score['ret_chg'] = np.exp(mdf5_with_score['r_chg+60'])
mdf5_with_score['avg_score'] = mdf5_with_score['avg_score']/mdf5_with_score['avg_score'].mean()

In [16]:
for var in ['misp', 'ret', 'ret_ct', 'ret_chg']:
    regdf = mdf5_with_score[['jdate_crsp', 'permno', 'ticker', var, 'avg_score']].replace([np.inf, -np.inf], np.nan).dropna()
    print(f'{var} ~ Misp Score, 5Yr-DCF')
    print(fama_macbeth_reg_panel(regdf, xname='avg_score', yname=var))
    print('')

misp ~ Misp Score, 5Yr-DCF
               mean  std_error      tstat  count       std
alpha_i_t  1.461768   0.017742  82.390707  544.0  0.413809
avg_score -0.015439   0.011059  -1.396022  544.0  0.257945

ret ~ Misp Score, 5Yr-DCF
                mean  std_error     tstat  count         std
alpha_i_t  30.327883   4.452253  6.811807  544.0  103.843481
avg_score -19.995692   3.146183 -6.355539  544.0   73.380974

ret_ct ~ Misp Score, 5Yr-DCF
               mean  std_error     tstat  count       std
alpha_i_t  0.667854   0.042344  15.77215  532.0  0.976666
avg_score  0.077856   0.037062   2.10066  532.0  0.854850

ret_chg ~ Misp Score, 5Yr-DCF
               mean  std_error      tstat  count       std
alpha_i_t  1.342733   0.038711  34.685699  532.0  0.892884
avg_score  0.274553   0.035297   7.778418  532.0  0.814125

