## Class07

In [1]:
import os

import numpy as np
import pandas as pd
import statsmodels.formula.api as sm
from linearmodels import PanelOLS

### Merge CRSP and Compustat
The unique identifiers in CRSP and Compustat are **PERMNO** and **GVKEY**, respectively. To merge the two databases, **CUSIP** is used. Both CRSP and Compustat use the most recent CUSIP.

#### Read CRSP

In [2]:
# Directory that holds the course data files. Set the CLASS07_DATA_DIR
# environment variable to point the notebook at another location; when it is
# unset, the original location is used unchanged. Joining with '' guarantees a
# trailing path separator, so the `file_path + <file name>` concatenations used
# throughout the notebook keep working even if the override omits it.
file_path = os.path.join(
    os.environ.get('CLASS07_DATA_DIR', '/Users/ml/Google Drive/af/teaching/database/data/'),
    '',
)

# Read the tab-separated CRSP monthly file. low_memory=False makes pandas infer
# each column's dtype from the whole file instead of chunk by chunk.
msf_raw = pd.read_csv(file_path + 'msf_1992_2017.txt', sep='\t', low_memory=False)

# Lower-case the column names so later cells can refer to them uniformly.
msf_raw.columns = msf_raw.columns.str.lower()

In [3]:
# Restrict the sample to share codes 10/11 and exchange codes 1/2/3
# (presumably ordinary common shares on the major US exchanges under CRSP's
# coding -- TODO confirm against the CRSP documentation).
keep_share_code = msf_raw['shrcd'].isin([10, 11])
keep_exchange = msf_raw['exchcd'].isin([1, 2, 3])

# Boolean .loc selection yields a new frame; assign() then replaces `ret` with
# its numeric version, turning any non-numeric entry into NaN.
msf = msf_raw.loc[keep_share_code & keep_exchange].assign(
    ret=lambda frame: pd.to_numeric(frame['ret'], errors='coerce')
)

In [4]:
# Order by security and date, keep the first row of every (permno, date) pair,
# and renumber the index. Dropping duplicates preserves row order, so the frame
# is still sorted by (permno, date) afterwards.
msf = (
    msf.sort_values(['permno', 'date'])
    .drop_duplicates(['permno', 'date'])
    .reset_index(drop=True)
)

In [5]:
# `date` is a YYYYMMDD number: dividing by 100 and truncating leaves YYYYMM.
year_month = msf['date'].div(100).astype(int)

# Split YYYYMM into the calendar year (leading digits) and the month (last two).
msf = msf.assign(
    yyyymm=year_month,
    calyr=year_month.div(100).astype(int),
    month=year_month.mod(100).astype(int),
)

Match CRSP return data from July of year $t$ to June of year $t+1$ with accounting data from year $t-1$.

In [6]:
# Months 7-12 keep their own calendar year as the merge year; all other months
# are assigned the previous calendar year.
in_jul_to_dec = msf['month'].between(7, 12)
msf['mergeyr'] = np.where(in_jul_to_dec, msf['calyr'], msf['calyr'] - 1)

#### Read Compustat

In [7]:
# Load the Compustat extract from the Stata file `roa.dta` in the data
# directory. Later cells read its `datadate`, `cusip`, `at` and `at_l1` columns.
ag = pd.read_stata(file_path+'roa.dta')

In [8]:
# `datadate` is a YYYYMMDD number; dividing by 10000 and truncating gives the year.
data_year = ag['datadate'].div(10000).astype(int)

ag = ag.assign(
    calyr=data_year,
    # Accounting data dated in year y is matched to merge year y + 1.
    mergeyr=data_year + 1,
    # Keep only the first 8 characters of the CUSIP.
    cusip=ag['cusip'].str[:8],
    # Asset growth: `at` relative to `at_l1`, minus one.
    ag=ag['at'].div(ag['at_l1']).sub(1),
)

# Trim the sample to asset growth between its 1st and 99th percentiles
# (bounds inclusive); rows with missing asset growth fail the test and drop out.
p1, p99 = ag['ag'].quantile([0.01, 0.99])
ag = ag[ag['ag'].between(p1, p99)]

#### Merge return with asset growth

In [9]:
# Attach asset growth to each monthly return row; the inner join keeps only
# rows whose (cusip, mergeyr) pair appears in both tables.
asset_growth = ag.loc[:, ['cusip', 'mergeyr', 'ag']]
msf_1 = pd.merge(msf, asset_growth, how='inner', on=['cusip', 'mergeyr'])

In [10]:
# Pooled OLS of monthly returns on asset growth; the summary table is the
# cell's displayed output.
(
    sm.ols(formula='ret~ag', data=msf_1)
    .fit()
    .summary()
)

0,1,2,3
Dep. Variable:,ret,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,681.4
Date:,"Thu, 16 Aug 2018",Prob (F-statistic):,3.65e-150
Time:,15:52:36,Log-Likelihood:,343770.0
No. Observations:,1263251,AIC:,-687500.0
Df Residuals:,1263249,BIC:,-687500.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0137,0.000,80.166,0.000,0.013,0.014
ag,-0.0046,0.000,-26.103,0.000,-0.005,-0.004

0,1,2,3
Omnibus:,1539007.208,Durbin-Watson:,2.027
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2176338402.768
Skew:,5.722,Prob(JB):,0.0
Kurtosis:,206.018,Cond. No.,1.35


### Merge CRSP and Thomson Reuters 13F
The CUSIP in Thomson Reuters 13F is the historical CUSIP, which corresponds to NCUSIP in CRSP. To merge CRSP and 13F, we therefore need to match NCUSIP in CRSP with CUSIP in 13F.

#### Read 13F

In [11]:
# Read the tab-separated 13F file and lower-case its column names in one chain.
tr13f = pd.read_csv(
    file_path + 'tr13f.txt',
    sep='\t',
    low_memory=False,
).rename(columns=str.lower)

In [12]:
# Keep one row per (cusip, rdate), rename the 13F `cusip` column to `ncusip`
# so it lines up with CRSP's column of the same name, then sort and reindex.
tr13f = (
    tr13f.drop_duplicates(['cusip', 'rdate'])
    .rename(columns={'cusip': 'ncusip'})
    .sort_values(['ncusip', 'rdate'])
    .reset_index(drop=True)
)

Match CRSP return data from July of year $t$ to June of year $t+1$ with 13F data from June of year $t$.

In [13]:
# `rdate` is a YYYYMMDD number: the year is the leading four digits and the
# month is the last two digits of YYYYMM.
report_date = tr13f['rdate']
tr13f = tr13f.assign(
    mergeyr=report_date.div(10000).astype(int),
    month=report_date.div(100).astype(int).mod(100),
)

# Keep only the June reports and the columns needed for the merge.
tr13f = tr13f.loc[
    tr13f['month'].eq(6),
    ['ncusip', 'mergeyr', 'numinstowners', 'instown_perc'],
]

In [14]:
# Attach the June 13F holdings to each monthly return row; the inner join keeps
# only rows whose (ncusip, mergeyr) pair appears in both tables.
msf_2 = pd.merge(msf, tr13f, how='inner', on=['ncusip', 'mergeyr'])

# Preview the first ten merged rows.
msf_2.head(n=10)

Unnamed: 0,permno,date,shrcd,exchcd,ncusip,cusip,ret,yyyymm,calyr,month,mergeyr,numinstowners,instown_perc
0,10001,19920731,11.0,3.0,39040610,36720410,0.06383,199207,1992,7,1992,3,0.009185
1,10001,19920831,11.0,3.0,39040610,36720410,0.04,199208,1992,8,1992,3,0.009185
2,10001,19920930,11.0,3.0,39040610,36720410,0.165962,199209,1992,9,1992,3,0.009185
3,10001,19921030,11.0,3.0,39040610,36720410,-0.025,199210,1992,10,1992,3,0.009185
4,10001,19921130,11.0,3.0,39040610,36720410,-0.017094,199211,1992,11,1992,3,0.009185
5,10001,19921231,11.0,3.0,39040610,36720410,-0.01513,199212,1992,12,1992,3,0.009185
6,10001,19930129,11.0,3.0,39040610,36720410,0.0,199301,1993,1,1992,3,0.009185
7,10001,19930226,11.0,3.0,39040610,36720410,0.017857,199302,1993,2,1992,3,0.009185
8,10001,19930331,11.0,3.0,39040610,36720410,0.011053,199303,1993,3,1992,3,0.009185
9,10001,19930430,11.0,3.0,39040610,36720410,0.070175,199304,1993,4,1992,3,0.009185
