In [2]:
"""Codes that pull and link CRSP and CUMPUSTATS Data
"""

import os
import pandas as pd
import numpy as np
import datetime as dt
import wrds
import psycopg2 
import matplotlib.pyplot as plt
from dateutil.relativedelta import *
from pandas.tseries.offsets import *
from scipy import stats

pd.set_option('display.max_columns', None)

LOOKUP_FOLDER = '~/misp/lookup_tables'

def clean_ccm(ccm):
    ccm['permno'] = ccm['permno'].astype(int).astype(str)
    ccm['gvkey'] = ccm['gvkey'].astype(str)
    ccm['linkdt'] = pd.to_datetime(ccm['linkdt'])
    ccm['linkenddt'] = pd.to_datetime(ccm['linkenddt'])
    isnull = ccm['linkenddt'].isnull()
    ccm.loc[isnull, 'linkenddt'] = pd.Timestamp(year=2047, month=7, day=1)
    return ccm

ccm = pd.read_pickle(os.path.join(LOOKUP_FOLDER, 'ccm.pkl'))
ccm_raw = pd.read_pickle(os.path.join(LOOKUP_FOLDER, 'ccm_raw.pkl'))
ccm, ccm_raw = clean_ccm(ccm), clean_ccm(ccm_raw)

In [3]:
conn=wrds.Connection()

Enter your WRDS username [mma3]:mingweima
Enter your password:········
WRDS recommends setting up a .pgpass file.
You can find more info here:
https://www.postgresql.org/docs/9.5/static/libpq-pgpass.html.
Loading library list...
Done


In [4]:
sdate = '1959-01-01'
syyyy = sdate[0:4]
syear = int(syyyy)
smm = sdate[5:7]
sdd = sdate[8:10]

forward=61

backward=0

comp = conn.raw_sql(f"""select 
                        f.gvkey, datadate, at, pstkl, txditc,
                        pstkrv, seq, pstk, 
                        ni, epspi, revt,
                        capx,
                        ajex,
                        fyear, 
                        apdedate, pdate, fdate, c.gvkey, f.cusip as cnum,
                        c.cik, sic as sic2, sic, naics, sale, cogs, xsga, xrd, xad, ib, ebitda, ebit, nopi, 
                        spi, pi, txp, txfed, txfo, txt, xint, capx, oancf, dvt, ob, gdwlia, gdwlip, gwo, rect, act, 
                        che, ppegt, invt, at, aco, intan, ao, ppent, gdwl, fatb, fatl, lct, dlc, dltt, lt, dm, dcvt, 
                        cshrc, dcpstk, ap, lco, lo, drc, drlt, txdi, ceq, scstkc, emp, csho, /*addition*/
                        abs(prcc_f) as prcc_f, csho*prcc_f as mve_f, /*HXZ*/
                        am, txdb, dvc, dvp, dp, dvpsx_f, mib, ivao, ivst, sstk, prstkc,
                        dv, dltis, dltr, dlcch, oibdp, dvpa, tstkp, oiadp, xpp, xacc, re, ppenb,
                        ppenls, capxv, fopt, wcap
                        from comp.names as c, comp.funda as f
                        where f.gvkey=c.gvkey /*get consolidated, standardized, industrial format statements*/
                        and f.indfmt='INDL'
                        and f.datafmt='STD'
                        and f.popsrc='D'
                        and f.consol='C'
                        and fyear>='{syear-backward}'
                        and fyear<='{syear+forward}'
                        """)

In [5]:
# create preferrerd stock
comp['ps']=np.where(comp['pstkrv'].isnull(), comp['pstkl'], comp['pstkrv'])
comp['ps']=np.where(comp['ps'].isnull(),comp['pstk'], comp['ps'])
comp['ps']=np.where(comp['ps'].isnull(),0,comp['ps'])

comp['txditc']=comp['txditc'].fillna(0)

# create book equity
comp['be']=comp['seq']+comp['txditc']-comp['ps']
comp['be']=np.where(comp['be']>0, comp['be'], np.nan)

comp['datadate']=pd.to_datetime(comp['datadate']) # convert datadate to date fmt
comp['year']=comp['datadate'].dt.year

# line up date to be end of month
comp['date']=pd.to_datetime(comp['datadate'])
comp['jdate']=comp['date']+MonthEnd(0)

comp = comp.loc[:,~comp.columns.duplicated()]
comp.shape

(515899, 108)

In [6]:
%%time

def gvkey_to_permno(gvkey):
    gvkey = str(gvkey)
    ccm_match_sub = ccm.loc[ccm['gvkey'] == gvkey]
    permno = list(ccm_match_sub['permno'])
    if len(permno) == 1:
        return permno[0]
    return np.nan

# convert gvkeys to permnos
comp['permno'] = comp['gvkey']
comp['permno'] = comp['permno'].apply(gvkey_to_permno)
comp = comp[comp['permno'].notna()]
comp = comp.astype({"permno": int})

CPU times: user 17min 21s, sys: 4.15 s, total: 17min 25s
Wall time: 17min 25s


In [14]:
comp.to_csv('~/misp_data/comp_1959-2019.csv')

In [19]:
%%time
crsp_m = conn.raw_sql(f"""
                      select a.permno, a.date, b.ticker, b.ncusip, b.namedt, b.nameendt,
                      b.shrcd, b.exchcd, b.siccd,
                      a.ret, a.retx, a.shrout, abs(a.prc) as prc, a.cfacpr, a.cfacshr
                      from crsp.msf as a
                      left join crsp.msenames as b
                      on a.permno=b.permno
                      and b.namedt<=a.date
                      and a.date<=b.nameendt
                      where a.date between '01/01/{1959}' and '12/31/{2019}'
                      and b.exchcd between 1 and 3
                      """) 

CPU times: user 29 s, sys: 5.95 s, total: 34.9 s
Wall time: 1min 2s


In [20]:
# change variable format to int
crsp_m[['permno','shrcd','exchcd']]=crsp_m[['permno','shrcd','exchcd']].astype(int)

# Line up date to be end of month
crsp_m['date']=pd.to_datetime(crsp_m['date'])
crsp_m['jdate']=crsp_m['date']+MonthEnd(0)

# add delisting return
dlret = conn.raw_sql("""
                     select permno, dlret, dlstdt 
                     from crsp.msedelist
                     """)
dlret.permno=dlret.permno.astype(int)
dlret['dlstdt']=pd.to_datetime(dlret['dlstdt'])
dlret['jdate']=dlret['dlstdt']+MonthEnd(0)

crsp = pd.merge(crsp_m, dlret, how='left',on=['permno','jdate'])
crsp['dlret']=crsp['dlret'].fillna(0)
crsp['ret']=crsp['ret'].fillna(0)
crsp['retadj']=(1+crsp['ret'])*(1+crsp['dlret'])-1
crsp['me']=crsp['prc'].abs()*crsp['shrout'] # calculate market equity
crsp=crsp.drop(['dlret','dlstdt','prc','shrout'], axis=1)

In [21]:
_tmp_crsp = crsp.sort_values(['permno','date'], ascending=False)\
    .set_index('date')

umds = []

for m in [3, 6, 12, 24, 36, 48, 60]:
    _tmp_crsp[f'logret_{m}']=np.log(1+_tmp_crsp['retadj'])
    umd_m = _tmp_crsp.groupby(['permno'])[f'logret_{m}'].rolling(m, min_periods=m).sum()
    umd_m = umd_m.reset_index()
    umd_m[f'cumret_{m}']=np.exp(umd_m[f'logret_{m}'])-1
    umds += [umd_m]

dfs = [df.set_index(['permno', 'date']) for df in umds]
umd = pd.concat(dfs, axis=1).reset_index()

umd['jdate']=umd['date']+MonthEnd(0)
crsp = crsp.merge(umd, how='left',on=['permno','jdate'])
crsp = crsp.drop(columns=['date_x', 'date_y'])

  import sys


In [23]:
crsp['jdate_crsp'] = crsp['jdate']
comp['jdate_comp'] = comp['jdate']
crsp['year']=crsp['jdate'].dt.year

In [39]:
pd.set_option('display.max_columns', None)
crsp.head(20)

Unnamed: 0,permno,ticker,ncusip,namedt,nameendt,shrcd,exchcd,siccd,ret,retx,cfacpr,cfacshr,jdate,retadj,me,logret_3,cumret_3,logret_6,cumret_6,logret_12,cumret_12,logret_24,cumret_24,logret_36,cumret_36,logret_48,cumret_48,logret_60,cumret_60,jdate_crsp,year,logret,termret,termlogret,termcumret
0,10000,OMFGA,68391610,1986-01-07,1986-12-03,10,3,3990.0,0.0,,1.0,1.0,1986-01-31,0.0,16100.0,0.014185,0.014286,-0.346523,-0.292857,-2.138282,-0.882143,,,,,,,,,1986-01-31,1986,0.0,-2.995732,-2.995732,-0.95
1,10000,OMFGA,68391610,1986-01-07,1986-12-03,10,3,3990.0,-0.257143,-0.257143,1.0,1.0,1986-02-28,-0.257143,11960.0,-0.089612,-0.085714,-0.430783,-0.35,-2.376693,-0.907143,,,,,,,,,1986-02-28,1986,-0.297252,-2.995732,-2.995732,-0.95
2,10000,OMFGA,68391610,1986-01-07,1986-12-03,10,3,3990.0,0.365385,0.365385,1.0,1.0,1986-03-31,0.365385,16330.0,-0.044233,-0.043269,-1.089043,-0.663462,-2.079442,-0.875,,,,,,,,,1986-03-31,1986,0.311436,-2.995732,-2.995732,-0.95
3,10000,OMFGA,68391610,1986-01-07,1986-12-03,10,3,3990.0,-0.098592,-0.098592,1.0,1.0,1986-04-30,-0.098592,15172.0,-0.360707,-0.302817,-1.45932,-0.767606,-2.876386,-0.943662,,,,,,,,,1986-04-30,1986,-0.103797,-2.995732,-2.995732,-0.95
4,10000,OMFGA,68391610,1986-01-07,1986-12-03,10,3,3990.0,-0.222656,-0.222656,1.0,1.0,1986-05-31,-0.222656,11793.859375,-0.341171,-0.289062,-1.633155,-0.804688,-2.837127,-0.941406,,,,,,,,,1986-05-31,1986,-0.251873,-2.995732,-2.995732,-0.95
5,10000,OMFGA,68391610,1986-01-07,1986-12-03,10,3,3990.0,-0.005025,-0.005025,1.0,1.0,1986-06-30,-0.005025,11734.59375,-1.04481,-0.648241,-1.323013,-0.733668,-2.654248,-0.929648,,,,,,,,,1986-06-30,1986,-0.005038,-2.995732,-2.995732,-0.95
6,10000,OMFGA,68391610,1986-01-07,1986-12-03,10,3,3990.0,-0.080808,-0.080808,1.0,1.0,1986-07-31,-0.080808,10786.34375,-1.098612,-0.666667,-1.79176,-0.833333,,,,,,,,,,,1986-07-31,1986,-0.08426,-2.995732,-2.995732,-0.95
7,10000,OMFGA,68391610,1986-01-07,1986-12-03,10,3,3990.0,-0.615385,-0.615385,1.0,1.0,1986-08-31,-0.615385,4148.59375,-1.291984,-0.725275,-1.94591,-0.857143,,,,,,,,,,,1986-08-31,1986,-0.955512,-2.995732,-2.995732,-0.95
8,10000,OMFGA,68391610,1986-01-07,1986-12-03,10,3,3990.0,-0.057143,-0.057143,1.0,1.0,1986-09-30,-0.057143,3911.53125,-0.278203,-0.242857,-0.990399,-0.628571,,,,,,,,,,,1986-09-30,1986,-0.058841,-2.995732,-2.995732,-0.95
9,10000,OMFGA,68391610,1986-01-07,1986-12-03,10,3,3990.0,-0.242424,-0.242424,1.0,1.0,1986-10-31,-0.242424,3002.34375,-0.693147,-0.5,-1.417066,-0.757576,,,,,,,,,,,1986-10-31,1986,-0.277632,-2.995732,-2.995732,-0.95


In [38]:
crsp['logret'] = np.log(1+crsp['retadj'])
crsp['termlogret'] = crsp.groupby('permno')['logret'].transform('sum') 
crsp['termcumret'] = np.exp(crsp['termlogret'])-1

  """Entry point for launching an IPython kernel.


In [40]:
crsp.to_csv('~/misp_data/crsp_1959-2019.csv')