In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# Sample CRSP data, stored as a tab-separated text file
url = 'https://www.dropbox.com/s/6mk86g97uji2f80/crsp_month_raw.txt?dl=1'
crsp_raw = pd.read_csv(
    url,
    sep='\t',           # columns are tab-separated
    low_memory=False,   # parse the file in one piece rather than in chunks
)

# Preview: first 5 rows, restricted to the first 5 columns
crsp_raw.head(5).iloc[:, :5]

Unnamed: 0,PERMNO,date,SHRCD,EXCHCD,SICCD
0,10001,20110131,11.0,2.0,4925
1,10001,20110228,11.0,2.0,4925
2,10001,20110331,11.0,2.0,4925
3,10001,20110429,11.0,2.0,4925
4,10001,20110531,11.0,2.0,4925


In [3]:
# Number of observations = number of entries in the row index
len(crsp_raw.index)

870692

In [4]:
# Dimensions of the raw data as a (rows, columns) tuple;
# each column is one variable
crsp_raw.shape

(870692, 12)

In [5]:
# Names of all variables (columns) in the dataset
crsp_raw.columns

Index(['PERMNO', 'date', 'SHRCD', 'EXCHCD', 'SICCD', 'NCUSIP', 'PERMCO',
       'CUSIP', 'PRC', 'RET', 'SHROUT', 'CFACPR'],
      dtype='object')

In [6]:
# Data type and non-null count of every variable, plus the memory footprint.
# Columns reported as 'object' hold non-numeric (e.g. text) values.
crsp_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 870692 entries, 0 to 870691
Data columns (total 12 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   PERMNO  870692 non-null  int64  
 1   date    870692 non-null  int64  
 2   SHRCD   865287 non-null  float64
 3   EXCHCD  865287 non-null  float64
 4   SICCD   865287 non-null  object 
 5   NCUSIP  865287 non-null  object 
 6   PERMCO  870692 non-null  int64  
 7   CUSIP   870692 non-null  object 
 8   PRC     855391 non-null  float64
 9   RET     860572 non-null  object 
 10  SHROUT  864704 non-null  float64
 11  CFACPR  864704 non-null  float64
dtypes: float64(5), int64(3), object(4)
memory usage: 79.7+ MB


In [7]:
# Tidy the raw frame in one chained expression:
#  - column names -> lowercase (easier to type than uppercase)
#  - date  : integer YYYYMMDD -> datetime64, so date functions can be applied
#  - ret   : text -> number; returns must be numeric to do arithmetic on them
#            (as strings '1' + '2' = '12', as numbers 1 + 2 = 3)
#  - siccd : text -> number
# errors='coerce' turns every entry that cannot be parsed as a number into
# NaN, which is why the non-null counts of ret and siccd drop below.
crsp_raw = crsp_raw.rename(columns=str.lower).assign(
    date=lambda df: pd.to_datetime(df['date'], format='%Y%m%d'),
    ret=lambda df: pd.to_numeric(df['ret'], errors='coerce'),
    siccd=lambda df: pd.to_numeric(df['siccd'], errors='coerce'),
)

crsp_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 870692 entries, 0 to 870691
Data columns (total 12 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   permno  870692 non-null  int64         
 1   date    870692 non-null  datetime64[ns]
 2   shrcd   865287 non-null  float64       
 3   exchcd  865287 non-null  float64       
 4   siccd   865153 non-null  float64       
 5   ncusip  865287 non-null  object        
 6   permco  870692 non-null  int64         
 7   cusip   870692 non-null  object        
 8   prc     855391 non-null  float64       
 9   ret     849541 non-null  float64       
 10  shrout  864704 non-null  float64       
 11  cfacpr  864704 non-null  float64       
dtypes: datetime64[ns](1), float64(7), int64(2), object(2)
memory usage: 79.7+ MB


In [8]:
# Build the analysis sample from crsp_raw in a single expression.
# .loc[...] and .astype(...) each return a new frame, so crsp_raw is left
# untouched and no column is ever assigned on a filtered slice (assigning on
# a slice is what triggers pandas' SettingWithCopyWarning).
on_major_exchange = crsp_raw['exchcd'].isin([1, 2, 3])  # Keep NYSE/AMEX/NASDAQ
is_common_share = crsp_raw['shrcd'].isin([10, 11])      # Keep common shares

crsp = (
    crsp_raw.loc[on_major_exchange & is_common_share]
    # Convert shrcd and exchcd to int: they are stored as floats (e.g. 11.0)
    # in the raw file. Rows with a missing code fail the isin() tests above,
    # so no NaN is left to break the integer cast.
    .astype({'exchcd': int, 'shrcd': int})
)

# Number of observations left in the sample
len(crsp)

444358

In [9]:
# A stock-month should appear only once: keep the first row of every
# (permno, date) pair and discard any repeats
crsp = crsp.drop_duplicates(subset=['permno', 'date'], keep='first')

# Row count after de-duplication
crsp.shape[0]

444358

In [10]:
# Convert shrcd and exchcd to category on a separate frame, so crsp itself
# keeps its integer codes. astype() with a column -> dtype mapping returns a
# new frame in which only the listed columns are converted.
temp = crsp.astype({'exchcd': 'category', 'shrcd': 'category'})
temp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 444358 entries, 0 to 870691
Data columns (total 12 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   permno  444358 non-null  int64         
 1   date    444358 non-null  datetime64[ns]
 2   shrcd   444358 non-null  category      
 3   exchcd  444358 non-null  category      
 4   siccd   444256 non-null  float64       
 5   ncusip  444358 non-null  object        
 6   permco  444358 non-null  int64         
 7   cusip   444358 non-null  object        
 8   prc     441959 non-null  float64       
 9   ret     439721 non-null  float64       
 10  shrout  444036 non-null  float64       
 11  cfacpr  444036 non-null  float64       
dtypes: category(2), datetime64[ns](1), float64(5), int64(2), object(2)
memory usage: 38.1+ MB


In [11]:
# To read a local copy instead, point file_path at the file on your machine:
# file_path = Path('/Users/ml/Dropbox/teaching/data/crsp_month.txt')
# crsp = pd.read_csv(file_path, sep='\t', parse_dates=['date'])

# Import from url.
# Note: this rebinds crsp, so the frame built from crsp_raw in the cells
# above is replaced by the contents of this file from here on.
url = 'https://www.dropbox.com/s/0nuxwo3cf7vfcy3/crsp_month.txt?dl=1'
crsp = pd.read_csv(
    url,
    sep='\t',
    parse_dates=['date'],   # read the date column as datetime right away
)

In [12]:
# This cell can be re-run safely: every derived column is (re)assigned from
# its inputs, and no merge is used that would duplicate columns on a re-run.

# Market value
# prc can be negative in the raw data, so the absolute value is used as price
# (presumably CRSP's marker for bid/ask-average prices - confirm).
crsp['price'] = crsp['prc'].abs()
# NOTE(review): dividing by 1000 presumably expresses me in $ millions
# (shrout in thousands of shares) - confirm against the CRSP documentation.
crsp['me'] = (crsp['price']*crsp['shrout']) / 1000
crsp['lnme'] = np.log(crsp['me'])

# Adjusted price
# Only computed where the adjustment factor is positive; missing elsewhere
crsp.loc[crsp['cfacpr']>0, 'adjprc'] = crsp['price'] / crsp['cfacpr']

# Year-month key, e.g. 201107 for July 2011
crsp['yyyymm'] = crsp['date'].dt.year*100 + crsp['date'].dt.month

# Monthly index
# From 1 to n
# For example, month index will be from 1 to 120 if we have 120 months data
month_idx = crsp.drop_duplicates('yyyymm')[['yyyymm']].copy()
month_idx = month_idx.sort_values('yyyymm', ignore_index=True)
month_idx['midx'] = month_idx.index + 1

# Look midx up by yyyymm. Unlike a merge, this overwrites an existing midx
# column instead of producing midx_x / midx_y when the cell is run again.
crsp['midx'] = crsp['yyyymm'].map(month_idx.set_index('yyyymm')['midx'])

# Holding period returns: past 6-month returns
# Sum log returns over a 6-row window within each stock, then convert back
# to a simple return. min_periods=6 leaves the first 5 rows of each stock
# (and any window containing a missing return) as NaN.
crsp['logret'] = np.log(crsp['ret']+1)
crsp = crsp.sort_values(['permno', 'yyyymm'], ignore_index=True)
rolling_logret = (crsp.groupby('permno')['logret']
  .rolling(window=6, min_periods=6).sum())
# The rolling result is indexed by (permno, row label); dropping the permno
# level lets the assignment align on row labels rather than on position.
crsp['hpr'] = np.exp(rolling_logret.reset_index(level=0, drop=True)) - 1

# A 6-row window spans 6 consecutive calendar months only if the row 5
# positions back is exactly 5 months earlier, i.e. gap == 5.
crsp['midx_lag'] = crsp.groupby('permno')['midx'].shift(5)
crsp['gap'] = crsp['midx'] - crsp['midx_lag']

# Snapshot before the gap check (for comparison with temp2 below)
temp1 = crsp.query('permno==10028 & 201107<=yyyymm<=201212').copy()

# Replace it by missing if there is month gap.
# Rows with fewer than 5 earlier observations have gap = NaN; they are
# matched too, and their hpr is already NaN.
crsp.loc[crsp['gap']!=5, 'hpr'] = np.nan

# Snapshot after the gap check
temp2 = crsp.query('permno==10028 & 201107<=yyyymm<=201212').copy()

In [13]:
# Before the gap check: the last row (201211) follows 201203 directly, so
# its 6-row window reaches back 12 months (gap = 12), yet hpr is still filled
temp1.loc[:, [
    'permno', 'yyyymm',
    'midx', 'midx_lag', 'gap',
    'ret', 'hpr',
]]

Unnamed: 0,permno,yyyymm,midx,midx_lag,gap,ret,hpr
305,10028,201107,7,2.0,5.0,0.247887,1.004524
306,10028,201108,8,3.0,5.0,-0.032731,1.056241
307,10028,201109,9,4.0,5.0,-0.054842,0.588236
308,10028,201110,10,5.0,5.0,0.02963,0.437932
309,10028,201111,11,6.0,5.0,-0.070743,0.347828
310,10028,201112,12,7.0,5.0,-0.036129,0.052114
311,10028,201201,13,8.0,5.0,-0.026774,-0.179457
312,10028,201202,14,9.0,5.0,0.081155,-0.082847
313,10028,201203,15,10.0,5.0,-0.048346,-0.076543
314,10028,201211,23,11.0,12.0,-0.246524,-0.324221


In [14]:
# After the gap check: same rows, but hpr is now missing wherever gap != 5
# (here the 201211 row)
temp2.loc[:, [
    'permno', 'yyyymm',
    'midx', 'midx_lag', 'gap',
    'ret', 'hpr',
]]

Unnamed: 0,permno,yyyymm,midx,midx_lag,gap,ret,hpr
305,10028,201107,7,2.0,5.0,0.247887,1.004524
306,10028,201108,8,3.0,5.0,-0.032731,1.056241
307,10028,201109,9,4.0,5.0,-0.054842,0.588236
308,10028,201110,10,5.0,5.0,0.02963,0.437932
309,10028,201111,11,6.0,5.0,-0.070743,0.347828
310,10028,201112,12,7.0,5.0,-0.036129,0.052114
311,10028,201201,13,8.0,5.0,-0.026774,-0.179457
312,10028,201202,14,9.0,5.0,0.081155,-0.082847
313,10028,201203,15,10.0,5.0,-0.048346,-0.076543
314,10028,201211,23,11.0,12.0,-0.246524,


In [15]:
# Default summary statistics (count, mean, std, min, quartiles, max)
# for monthly return and log market value
crsp.loc[:, ['ret', 'lnme']].describe()

Unnamed: 0,ret,lnme
count,439721.0,441959.0
mean,0.011124,6.421212
std,0.177839,2.155586
min,-0.9936,-2.426619
25%,-0.058187,4.866619
50%,0.004907,6.397188
75%,0.066725,7.890045
max,19.883589,14.62909


In [16]:
# Percentiles
# Ask for the 10th and 90th percentiles instead of the default quartiles;
# the median (50%) is reported in addition to the requested ones
crsp.loc[:, ['ret', 'lnme']].describe(percentiles=[0.1, 0.9])

Unnamed: 0,ret,lnme
count,439721.0,441959.0
mean,0.011124,6.421212
std,0.177839,2.155586
min,-0.9936,-2.426619
10%,-0.140673,3.625798
50%,0.004907,6.397188
90%,0.152479,9.241648
max,19.883589,14.62909


In [17]:
# Summary statistics by year
crsp['year'] = crsp['date'].dt.year

# describe() on the grouped data gives two-level columns (variable, statistic).
# Keep the mean, median (50%) and standard deviation of every variable and
# round to 4 decimals.
by_year = crsp.groupby('year')[['ret', 'lnme']].describe()
by_year.loc[:, pd.IndexSlice[:, ['mean', '50%', 'std']]].round(4)

Unnamed: 0_level_0,ret,lnme,ret,lnme,ret,lnme
Unnamed: 0_level_1,mean,mean,50%,50%,std,std
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2011,-0.0064,6.0642,-0.0089,6.019,0.1486,2.0849
2012,0.0161,6.1141,0.0102,6.0783,0.1459,2.0934
2013,0.0325,6.3779,0.0228,6.3643,0.1331,2.0869
2014,0.0041,6.5273,0.0026,6.4729,0.1388,2.0568
2015,-0.0048,6.4665,-0.0062,6.4348,0.1783,2.1173
2016,0.0155,6.3939,0.0091,6.3736,0.1734,2.1637
2017,0.0134,6.5546,0.0066,6.5825,0.1531,2.184
2018,-0.012,6.632,-0.0089,6.6748,0.1615,2.1934
2019,0.0197,6.5616,0.0137,6.5667,0.2078,2.2513
2020,0.0346,6.5497,0.0095,6.4663,0.2837,2.2452


In [18]:
# Summary statistics by stock exchange
# describe() on the grouped data gives two-level columns (variable, statistic).
# Keep the mean, median (50%) and standard deviation of every variable and
# round to 4 decimals.
by_exchange = crsp.groupby('exchcd')[['ret', 'lnme']].describe()
by_exchange.loc[:, pd.IndexSlice[:, ['mean', '50%', 'std']]].round(4)

Unnamed: 0_level_0,ret,lnme,ret,lnme,ret,lnme
Unnamed: 0_level_1,mean,mean,50%,50%,std,std
exchcd,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,0.0106,7.8429,0.0095,7.8152,0.1306,1.741
2,0.0061,4.0257,-0.0104,3.8878,0.2336,1.4255
3,0.0119,5.7896,0.003,5.6982,0.1956,1.9301


In [19]:
# Summary statistics in subsamples
# The sample is split into year <= 2015 and year >= 2016.
# NOTE(review): the first subsample includes 2015 itself, so its
# 'Before 2015' heading really means "2015 and earlier".
for heading, condition in [
    ('Before 2015', 'year<=2015'),
    ('\nAfter 2015', 'year>=2016'),
]:
    print(heading)
    print(crsp.query(condition)[['ret', 'lnme']].describe())

Before 2015
                 ret           lnme
count  223628.000000  224712.000000
mean        0.008081       6.308403
std         0.150606       2.096421
min        -0.935356      -0.896832
25%        -0.054922       4.771121
50%         0.004207       6.272747
75%         0.061369       7.744562
max        15.984456      13.528774

After 2015
                 ret           lnme
count  216093.000000  217247.000000
mean        0.014274       6.537897
std         0.202149       2.209074
min        -0.993600      -2.426619
25%        -0.062021       4.975455
50%         0.005594       6.533329
75%         0.072987       8.032318
max        19.883589      14.629090
