In [6]:
import pandas as pd

import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn

# Table of Content
1. [COMPUSTAT](#COMPUSTAT-Data)
2. [CRSP](#CRSP-Data)

**Note:** Kindly run only one of the database

## COMPUSTAT Data

In [2]:
df_compustat = pd.read_csv('data/COMPUSTAT_20220423.csv')
df_compustat

Unnamed: 0,gvkey,datadate,fyear,indfmt,consol,popsrc,datafmt,tic,cusip,conm,...,priusa,sic,spcindcd,spcseccd,spcsrc,state,stko,weburl,dldte,ipodate
0,1001,1981/12/31,1981.0,INDL,C,D,STD,AMFD.,000165100,A & M FOOD SERVICES INC,...,1.0,5812.0,420.0,978.0,,OK,0.0,,1986/07/31,
1,1001,1982/12/31,1982.0,INDL,C,D,STD,AMFD.,000165100,A & M FOOD SERVICES INC,...,1.0,5812.0,420.0,978.0,,OK,0.0,,1986/07/31,
2,1001,1983/12/31,1983.0,INDL,C,D,STD,AMFD.,000165100,A & M FOOD SERVICES INC,...,1.0,5812.0,420.0,978.0,,OK,0.0,,1986/07/31,
3,1001,1984/12/31,1984.0,INDL,C,D,STD,AMFD.,000165100,A & M FOOD SERVICES INC,...,1.0,5812.0,420.0,978.0,,OK,0.0,,1986/07/31,
4,1001,1985/12/31,1985.0,INDL,C,D,STD,AMFD.,000165100,A & M FOOD SERVICES INC,...,1.0,5812.0,420.0,978.0,,OK,0.0,,1986/07/31,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
480263,347085,2021/02/28,2020.0,INDL,C,D,STD,KARO,Y4600W108,KAROOOOO LTD,...,1.0,7370.0,,,,,0.0,www.karooooo.com,,2021/04/01
480264,351491,2019/12/31,2019.0,INDL,C,D,STD,IVCGF,N47017103,IVECO GROUP N V,...,1.0,3711.0,,,,,3.0,www.ivecogroup.com,,2022/01/03
480265,351491,2020/12/31,2020.0,INDL,C,D,STD,IVCGF,N47017103,IVECO GROUP N V,...,1.0,3711.0,,,,,3.0,www.ivecogroup.com,,2022/01/03
480266,351590,2019/12/31,2019.0,INDL,C,D,STD,DTRUY,23384L101,DAIMLER TRUCK HOLDING AG,...,90.0,3713.0,,,,,3.0,www.daimlertruck.com,,


### Data Wrangling - COMPUSTAT

In [3]:
'''
The default web query has boxes checked for Industry Formats (INDFMT) “INDL” and “FS.”
FS: Financial Services (includes banks, insurance companies, broker/dealers, real estate and other financial services)
INDL: Industrial (includes companies reporting manufacturing, retail, construction and other commercial operations other than financial services)

The default setting on the web query pulling both of these is the most notable source for duplicate observations.

Pulling both of these formats for a non-financial services firm for a given GVKEY and DATADATE results in:
- The one GKVEY DATADATE observation with INDFMT = INDL for the actual 10-K numbers.
- The second GVKEY DATADATE observation with INDFMT= FS is essentially the same 10-K converted to a “financial services” format.
Source: https://robsonglasscock.wordpress.com/2018/04/12/gvkey-and-datadate-or-fyear-duplicates-in-compustat/

For the remaining conditions, please see the following link:
Link: http://kaichen.work/?p=387
'''

# Removing duplicate entries based on Industry formats
df_compustat = df_compustat[(df_compustat['indfmt']=='INDL') & (df_compustat['datafmt']=='STD') & (df_compustat['popsrc']=='D') & (df_compustat['consol']=='C') & (df_compustat['scf']<4) | (df_compustat['scf']==7)]
df_compustat

Unnamed: 0,gvkey,datadate,fyear,indfmt,consol,popsrc,datafmt,tic,cusip,conm,...,priusa,sic,spcindcd,spcseccd,spcsrc,state,stko,weburl,dldte,ipodate
2,1001,1983/12/31,1983.0,INDL,C,D,STD,AMFD.,000165100,A & M FOOD SERVICES INC,...,1.0,5812.0,420.0,978.0,,OK,0.0,,1986/07/31,
3,1001,1984/12/31,1984.0,INDL,C,D,STD,AMFD.,000165100,A & M FOOD SERVICES INC,...,1.0,5812.0,420.0,978.0,,OK,0.0,,1986/07/31,
4,1001,1985/12/31,1985.0,INDL,C,D,STD,AMFD.,000165100,A & M FOOD SERVICES INC,...,1.0,5812.0,420.0,978.0,,OK,0.0,,1986/07/31,
6,1003,1982/12/31,1982.0,INDL,C,D,STD,ANTQ,000354100,A.A. IMPORTING CO INC,...,1.0,5712.0,449.0,976.0,,MO,3.0,www.aaimporting.com,1992/04/30,
7,1003,1983/12/31,1983.0,INDL,C,D,STD,ANTQ,000354100,A.A. IMPORTING CO INC,...,1.0,5712.0,449.0,976.0,,MO,3.0,www.aaimporting.com,1992/04/30,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
480263,347085,2021/02/28,2020.0,INDL,C,D,STD,KARO,Y4600W108,KAROOOOO LTD,...,1.0,7370.0,,,,,0.0,www.karooooo.com,,2021/04/01
480264,351491,2019/12/31,2019.0,INDL,C,D,STD,IVCGF,N47017103,IVECO GROUP N V,...,1.0,3711.0,,,,,3.0,www.ivecogroup.com,,2022/01/03
480265,351491,2020/12/31,2020.0,INDL,C,D,STD,IVCGF,N47017103,IVECO GROUP N V,...,1.0,3711.0,,,,,3.0,www.ivecogroup.com,,2022/01/03
480266,351590,2019/12/31,2019.0,INDL,C,D,STD,DTRUY,23384L101,DAIMLER TRUCK HOLDING AG,...,90.0,3713.0,,,,,3.0,www.daimlertruck.com,,


In [4]:
# Extracting/Selecting required columns only
cols_required = ['cusip', 'conm', 'gsector', 'ggroup', 'gind', 'gsubind', 'fyear', 'epspx', 'bkvlps',]

df_compustat = df_compustat[cols_required]
df_compustat

Unnamed: 0,cusip,conm,gsector,ggroup,gind,gsubind,fyear,epspx,bkvlps
2,000165100,A & M FOOD SERVICES INC,25.0,2530.0,253010.0,25301040.0,1983.0,0.40,2.1925
3,000165100,A & M FOOD SERVICES INC,25.0,2530.0,253010.0,25301040.0,1984.0,0.32,2.5118
4,000165100,A & M FOOD SERVICES INC,25.0,2530.0,253010.0,25301040.0,1985.0,0.68,3.2633
6,000354100,A.A. IMPORTING CO INC,25.0,2550.0,255040.0,25504040.0,1982.0,0.44,0.9443
7,000354100,A.A. IMPORTING CO INC,25.0,2550.0,255040.0,25504040.0,1983.0,0.49,2.2717
...,...,...,...,...,...,...,...,...,...
480263,Y4600W108,KAROOOOO LTD,45.0,4510.0,451030.0,45103010.0,2020.0,0.98,2.6356
480264,N47017103,IVECO GROUP N V,20.0,2010.0,201060.0,20106010.0,2019.0,3.77,
480265,N47017103,IVECO GROUP N V,20.0,2010.0,201060.0,20106010.0,2020.0,-19.68,
480266,23384L101,DAIMLER TRUCK HOLDING AG,20.0,2010.0,201060.0,20106010.0,2019.0,1.18,


In [5]:
#Renaming the CUSIPs - Taking the first 6 characters

df_compustat['cusip'] = df_compustat.cusip.str[:6]
df_compustat['cusip'] = df_compustat['cusip'].astype(str)
df_compustat

Unnamed: 0,cusip,conm,gsector,ggroup,gind,gsubind,fyear,epspx,bkvlps
2,000165,A & M FOOD SERVICES INC,25.0,2530.0,253010.0,25301040.0,1983.0,0.40,2.1925
3,000165,A & M FOOD SERVICES INC,25.0,2530.0,253010.0,25301040.0,1984.0,0.32,2.5118
4,000165,A & M FOOD SERVICES INC,25.0,2530.0,253010.0,25301040.0,1985.0,0.68,3.2633
6,000354,A.A. IMPORTING CO INC,25.0,2550.0,255040.0,25504040.0,1982.0,0.44,0.9443
7,000354,A.A. IMPORTING CO INC,25.0,2550.0,255040.0,25504040.0,1983.0,0.49,2.2717
...,...,...,...,...,...,...,...,...,...
480263,Y4600W,KAROOOOO LTD,45.0,4510.0,451030.0,45103010.0,2020.0,0.98,2.6356
480264,N47017,IVECO GROUP N V,20.0,2010.0,201060.0,20106010.0,2019.0,3.77,
480265,N47017,IVECO GROUP N V,20.0,2010.0,201060.0,20106010.0,2020.0,-19.68,
480266,23384L,DAIMLER TRUCK HOLDING AG,20.0,2010.0,201060.0,20106010.0,2019.0,1.18,


In [6]:
# Analyzing the number of NAs in the database

df_compustat.isna().sum()

cusip          0
conm           0
gsector     4924
ggroup      4924
gind        4924
gsubind     4924
fyear          1
epspx      15963
bkvlps     11042
dtype: int64

In [7]:
# Analyzing the number of NAs after temporary dropping the NAs from the columns. This step is done by incrementally adding the columns to dropna function.

df_compustat.dropna(subset=['cusip', 'fyear', 'gsector', 'ggroup', 'gind', 'gsubind', 'epspx', 'bkvlps']).isna().sum()

cusip      0
conm       0
gsector    0
ggroup     0
gind       0
gsubind    0
fyear      0
epspx      0
bkvlps     0
dtype: int64

In [8]:
#Dropping all the rows with NAs
df_compustat.dropna(subset=['cusip', 'fyear', 'gsector', 'ggroup', 'gind', 'gsubind', 'epspx', 'bkvlps'], inplace=True)
df_compustat

Unnamed: 0,cusip,conm,gsector,ggroup,gind,gsubind,fyear,epspx,bkvlps
2,000165,A & M FOOD SERVICES INC,25.0,2530.0,253010.0,25301040.0,1983.0,0.40,2.1925
3,000165,A & M FOOD SERVICES INC,25.0,2530.0,253010.0,25301040.0,1984.0,0.32,2.5118
4,000165,A & M FOOD SERVICES INC,25.0,2530.0,253010.0,25301040.0,1985.0,0.68,3.2633
6,000354,A.A. IMPORTING CO INC,25.0,2550.0,255040.0,25504040.0,1982.0,0.44,0.9443
7,000354,A.A. IMPORTING CO INC,25.0,2550.0,255040.0,25504040.0,1983.0,0.49,2.2717
...,...,...,...,...,...,...,...,...,...
480257,21077C,CONTEXTLOGIC INC,25.0,2550.0,255020.0,25502020.0,2021.0,-0.57,1.2432
480260,45256X,IMMUNITYBIO INC,35.0,3520.0,352010.0,35201010.0,2021.0,-0.89,-0.6087
480261,Y4600W,KAROOOOO LTD,45.0,4510.0,451030.0,45103010.0,2018.0,0.79,43983.0000
480262,Y4600W,KAROOOOO LTD,45.0,4510.0,451030.0,45103010.0,2019.0,0.86,55947.0000


In [9]:
# Converting the fyear to Integer type
df_compustat['fyear'] = df_compustat['fyear'].astype(int)
df_compustat.dtypes

cusip       object
conm        object
gsector    float64
ggroup     float64
gind       float64
gsubind    float64
fyear        int32
epspx      float64
bkvlps     float64
dtype: object

In [10]:
#Lagging the variables
df_compustat['epspx'] = df_compustat.sort_values(by=['cusip','fyear'],ascending=False).groupby(['cusip'])['epspx'].shift(-1)
df_compustat['bkvlps'] = df_compustat.sort_values(by=['cusip','fyear'],ascending=False).groupby(['cusip'])['bkvlps'].shift(-1)
df_compustat.dropna(subset=['epspx', 'bkvlps'], inplace=True)
df_compustat.reset_index(drop=True, inplace=True)
df_compustat

Unnamed: 0,cusip,conm,gsector,ggroup,gind,gsubind,fyear,epspx,bkvlps
0,000165,A & M FOOD SERVICES INC,25.0,2530.0,253010.0,25301040.0,1984,0.40,2.1925
1,000165,A & M FOOD SERVICES INC,25.0,2530.0,253010.0,25301040.0,1985,0.32,2.5118
2,000354,A.A. IMPORTING CO INC,25.0,2550.0,255040.0,25504040.0,1983,0.44,0.9443
3,000354,A.A. IMPORTING CO INC,25.0,2550.0,255040.0,25504040.0,1984,0.49,2.2717
4,000354,A.A. IMPORTING CO INC,25.0,2550.0,255040.0,25504040.0,1985,0.14,2.4160
...,...,...,...,...,...,...,...,...,...
290704,21077C,CONTEXTLOGIC INC,25.0,2550.0,255020.0,25502020.0,2019,-0.35,-12.3599
290705,21077C,CONTEXTLOGIC INC,25.0,2550.0,255020.0,25502020.0,2020,-0.22,-14.6777
290706,21077C,CONTEXTLOGIC INC,25.0,2550.0,255020.0,25502020.0,2021,-1.27,1.7496
290707,Y4600W,KAROOOOO LTD,45.0,4510.0,451030.0,45103010.0,2019,0.79,43983.0000


In [11]:
# Exporting the data

df_compustat.to_csv('data/Cleaned_Compustat.csv', index=False)

In [12]:
del(df_compustat)

In [13]:
from IPython.display import display_html
def restartkernel() :
    display_html("<script>Jupyter.notebook.kernel.restart()</script>",raw=True)

## CRSP Data

In [7]:
# Extracting/Selecting required columns only
required_cols = ['DATE', 'CUSIP', 'PERMNO', 'SHRCD', 'PRC', 'SHROUT', 'RET']

# Reading the dataset
df_crsp = pd.read_csv('data/msf_new2.csv', usecols=required_cols).rename(columns={'DATE':'date'})
df_crsp

Unnamed: 0,PERMNO,date,CUSIP,SHRCD,PRC,RET,SHROUT
0,10000,19851231,68391610,,,,
1,10000,19860131,68391610,10.0,-4.37500,C,3680.0
2,10000,19860228,68391610,10.0,-3.25000,-0.257143,3680.0
3,10000,19860331,68391610,10.0,-4.43750,0.365385,3680.0
4,10000,19860430,68391610,10.0,-4.00000,-0.098592,3793.0
...,...,...,...,...,...,...,...
4701549,93436,20200831,88160R10,11.0,498.32001,0.741452,931809.0
4701550,93436,20200930,88160R10,11.0,429.01001,-0.139087,948000.0
4701551,93436,20201030,88160R10,11.0,388.04001,-0.095499,947901.0
4701552,93436,20201130,88160R10,11.0,567.59998,0.462736,947901.0


### Data Wrangling - CRSP
- SHRCD means Share Code
    - First Digit = 1 means 'Ordinary Common Shares'
    - Second Digit = 0 means 'No special status found' & Second Digit = 1 means 'No special status ncessary'

In [8]:
#Filtering for common shares & dropping the column
df_crsp = df_crsp[(df_crsp['SHRCD']==10.0) | (df_crsp['SHRCD']==11.0)]
df_crsp.drop(columns=['SHRCD'], inplace=True)
df_crsp

Unnamed: 0,PERMNO,date,CUSIP,PRC,RET,SHROUT
1,10000,19860131,68391610,-4.37500,C,3680.0
2,10000,19860228,68391610,-3.25000,-0.257143,3680.0
3,10000,19860331,68391610,-4.43750,0.365385,3680.0
4,10000,19860430,68391610,-4.00000,-0.098592,3793.0
5,10000,19860530,68391610,-3.10938,-0.222656,3793.0
...,...,...,...,...,...,...
4701549,93436,20200831,88160R10,498.32001,0.741452,931809.0
4701550,93436,20200930,88160R10,429.01001,-0.139087,948000.0
4701551,93436,20201030,88160R10,388.04001,-0.095499,947901.0
4701552,93436,20201130,88160R10,567.59998,0.462736,947901.0


In [9]:
# Checking the number of NAs in the dataset
df_crsp.isna().sum()

PERMNO        0
date          0
CUSIP         0
PRC       98067
RET       42393
SHROUT     3717
dtype: int64

In [10]:
# Convert column "PRC" of a DataFrame to numeric

df_crsp['PRC'] = pd.to_numeric(df_crsp['PRC'])
df_crsp.dtypes

PERMNO      int64
date        int64
CUSIP      object
PRC       float64
RET        object
SHROUT    float64
dtype: object

In [11]:
# Dropping all the rows with PRC as NA
df_crsp.dropna(subset=['PRC'], inplace=True)
df_crsp

Unnamed: 0,PERMNO,date,CUSIP,PRC,RET,SHROUT
1,10000,19860131,68391610,-4.37500,C,3680.0
2,10000,19860228,68391610,-3.25000,-0.257143,3680.0
3,10000,19860331,68391610,-4.43750,0.365385,3680.0
4,10000,19860430,68391610,-4.00000,-0.098592,3793.0
5,10000,19860530,68391610,-3.10938,-0.222656,3793.0
...,...,...,...,...,...,...
4701549,93436,20200831,88160R10,498.32001,0.741452,931809.0
4701550,93436,20200930,88160R10,429.01001,-0.139087,948000.0
4701551,93436,20201030,88160R10,388.04001,-0.095499,947901.0
4701552,93436,20201130,88160R10,567.59998,0.462736,947901.0


In [12]:
'''
Q. Why is the stock price in CRSP preceded by a minus sign?
A.
- Sometimes you see negative stock prices in CRSP. This means that there was no closing price available for that period.
- Instead, the bid/ask average was used.
- To distinguish the bid/ask averages from actual closing prices, CRSP puts a leading dash in front of the price when the bid/ask average was used.
- If neither price nor bid/ask average is available, Price or Bid/Ask Average is set to zero.
'''

# Converting the PRC to positive value
df_crsp['PRC'] = df_crsp['PRC'].abs()

# Creating a column named 'fyear' - it will be helpful in merging the COMPUSTAT and CRSP dataframes later on
df_crsp['fyear'] = (df_crsp['date']/10000).astype('int')

#Converting the date to Datetime
df_crsp['date'] = pd.to_datetime(df_crsp['date'], format='%Y%m%d')

# Creating a column named 'mktcap' - to calculate the Market Capitalization of the stock at the point in time
df_crsp['mktcap'] = abs(df_crsp['PRC'])*df_crsp['SHROUT']

'''
The first 6 digits of a CUSIP identify the company, digits 7-8 describe the security and the 9th is check digit.
'''

#Renaming the CUSIPs - Taking the first 6 characters
df_crsp['CUSIP'] = df_crsp.CUSIP.str[:6]

# Renaming the columns for consistency with COMPUSTAT
df_crsp.rename(columns={'CUSIP':'cusip', 'PRC':'prc', 'SHROUT':'shrout', 'RET':'ret', 'PERMNO':'permno'}, inplace=True)

#Rearranging the columns
df_crsp = df_crsp[['date', 'fyear', 'cusip', 'permno', 'prc', 'shrout', 'mktcap', 'ret']]

df_crsp

Unnamed: 0,date,fyear,cusip,permno,prc,shrout,mktcap,ret
1,1986-01-31,1986,683916,10000,4.37500,3680.0,1.610000e+04,C
2,1986-02-28,1986,683916,10000,3.25000,3680.0,1.196000e+04,-0.257143
3,1986-03-31,1986,683916,10000,4.43750,3680.0,1.633000e+04,0.365385
4,1986-04-30,1986,683916,10000,4.00000,3793.0,1.517200e+04,-0.098592
5,1986-05-30,1986,683916,10000,3.10938,3793.0,1.179388e+04,-0.222656
...,...,...,...,...,...,...,...,...
4701549,2020-08-31,2020,88160R,93436,498.32001,931809.0,4.643391e+08,0.741452
4701550,2020-09-30,2020,88160R,93436,429.01001,948000.0,4.067015e+08,-0.139087
4701551,2020-10-30,2020,88160R,93436,388.04001,947901.0,3.678235e+08,-0.095499
4701552,2020-11-30,2020,88160R,93436,567.59998,947901.0,5.380286e+08,0.462736


In [13]:
#Lagging the variables
df_crsp['prc'] = df_crsp.sort_values(by=['cusip','date'],ascending=False).groupby(['cusip'])['prc'].shift(-1)
df_crsp['shrout'] = df_crsp.sort_values(by=['cusip','date'],ascending=False).groupby(['cusip'])['shrout'].shift(-1)
df_crsp['mktcap'] = df_crsp.sort_values(by=['cusip','date'],ascending=False).groupby(['cusip'])['mktcap'].shift(-1)
df_crsp['ret'] = df_crsp.sort_values(by=['cusip','date'],ascending=False).groupby(['cusip'])['ret'].shift(-1)

df_crsp.dropna(subset=['prc', 'shrout', 'mktcap', 'ret'], inplace=True)
df_crsp.reset_index(drop=True, inplace=True)
df_crsp = df_crsp[df_crsp['ret']!='C']
df_crsp['ret'] = df_crsp['ret'].astype('float64')
df_crsp

Unnamed: 0,date,fyear,cusip,permno,prc,shrout,mktcap,ret
1,1986-03-31,1986,683916,10000,3.25000,3680.0,1.196000e+04,-0.257143
2,1986-04-30,1986,683916,10000,4.43750,3680.0,1.633000e+04,0.365385
3,1986-05-30,1986,683916,10000,4.00000,3793.0,1.517200e+04,-0.098592
4,1986-06-30,1986,683916,10000,3.10938,3793.0,1.179388e+04,-0.222656
5,1986-07-31,1986,683916,10000,3.09375,3793.0,1.173459e+04,-0.005025
...,...,...,...,...,...,...,...,...
3565662,2020-08-31,2020,88160R,93436,1430.76001,186362.0,2.666393e+08,0.325011
3565663,2020-09-30,2020,88160R,93436,498.32001,931809.0,4.643391e+08,0.741452
3565664,2020-10-30,2020,88160R,93436,429.01001,948000.0,4.067015e+08,-0.139087
3565665,2020-11-30,2020,88160R,93436,388.04001,947901.0,3.678235e+08,-0.095499


In [14]:
df_crsp.to_csv('data/Cleaned_Crsp.csv', index=False)

In [15]:
from IPython.display import display_html
def restartkernel() :
    display_html("<script>Jupyter.notebook.kernel.restart()</script>",raw=True)