In [1]:
import pandas as pd
import numpy as np
import dask #used here to scale computing capabilities of pandas through parallelism
import dask.dataframe as dd
import pickle #since openning the data was time consuming let's just save it in a pickle file

In [2]:
crsp_pickle=open ("crsp.pickle","rb")
ibes_pickle=open ("ibes.pickle","rb")
comp_pickle=open ("comp.pickle","rb")

In [3]:
crsp=pickle.load(crsp_pickle)
crsp.info

<bound method DataFrame.info of Dask DataFrame Structure:
               Unnamed: 0 PERMNO    date    SHRCD   EXCHCD  NCUSIP  TICKER   CUSIP   DIVAMT    FACPR   NWPERM    BIDLO    ASKHI      PRC      VOL      RET      BID      ASK   SHROUT   SPREAD     RETX
npartitions=12                                                                                                                                                                                         
                    int64  int64  object  float64  float64  object  object  object  float64  float64  float64  float64  float64  float64  float64  float64  float64  float64  float64  float64  float64
                      ...    ...     ...      ...      ...     ...     ...     ...      ...      ...      ...      ...      ...      ...      ...      ...      ...      ...      ...      ...      ...
...                   ...    ...     ...      ...      ...     ...     ...     ...      ...      ...      ...      ...      ...      ...      

In [4]:
ibes=pickle.load(ibes_pickle)
ibes.info

<bound method DataFrame.info of Dask DataFrame Structure:
               Unnamed: 0  TICKER   CUSIP ESTIMATOR ANALYS    FPI    VALUE FPEDATS REVDATS REVTIMS ANNDATS ANNTIMS   ACTUAL ANNDATS_ACT ANNTIMS_ACT
npartitions=17                                                                                                                                     
                    int64  object  object     int64  int64  int64  float64  object  object  object  object  object  float64      object      object
                      ...     ...     ...       ...    ...    ...      ...     ...     ...     ...     ...     ...      ...         ...         ...
...                   ...     ...     ...       ...    ...    ...      ...     ...     ...     ...     ...     ...      ...         ...         ...
                      ...     ...     ...       ...    ...    ...      ...     ...     ...     ...     ...     ...      ...         ...         ...
                      ...     ...     ...       ...   

In [5]:
comp = pickle.load(comp_pickle)
comp.columns

Index(['gvkey', 'datadate', 'fyearq', 'fqtr', 'fyr', 'indfmt', 'consol',
       'popsrc', 'datafmt', 'tic', 'cusip', 'curcdq', 'datacqtr', 'datafqtr',
       'actq', 'atq', 'ceqq', 'cshoq', 'dlcq', 'dlttq', 'ibq', 'revtq',
       'exchg', 'costat', 'prccq', 'naics', 'sic'],
      dtype='object')

In [6]:
drp_crsp=['DIVAMT','EXCHCD','FACPR', 'NWPERM', 'BIDLO', 'ASKHI','TICKER','BID','ASK','SPREAD','RETX','SHRCD','Unnamed: 0']
crsp=crsp.drop(columns=drp_crsp)

In [7]:
crsp

Unnamed: 0_level_0,PERMNO,date,NCUSIP,CUSIP,PRC,VOL,RET,SHROUT
npartitions=12,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
,int64,object,object,object,float64,float64,float64,float64
,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...


In [8]:
crsp['PRC']=crsp['PRC'].abs()

#### Questions:
- What exactly are the variables we need from IBES (in terms of dates)
- Merging requires too much RAM and I can't just use a part of the dataset, otherwise it won't find matches
- Maybe find the most relevant 2/3 firms for each industry first? Or try replicating Hartzmark and Shue(2018)?
- How to control for T's own earning surprise
- When do we use characteristic-adjusted returns (or an easier version of that)
- When to use Kenneth French's data (market excess return, risk-free rate, SMB, HML, UMD, and short term reversal portfolios as well as size cutoffs)
- At some point we also have to adjust for mechanical relation: 
    - We remove from the characteristic-matched portfolio a stock's own return and the return of firms included in the calculation of surpriset-1.

##### To measure return on day t:
"Our measure of return on day t is a stock's raw return on day t minus the day t return of this characteristic-matched portfolio."

##### To measure earnings surprise:
we take each analyst's most recent forecast, thereby limiting the sample to only one forecast per analyst, and then take the median of this number within a certain time window for each firm's earnings announcement. In our base specification, we take all analyst forecasts made between two and fifteen days prior to the announcement of earnings.

Calculation:
\begin{equation*}
surprise\it = \frac{actrualEarnings - medianEstimate_i[t-15,t-2]}{price_i,t-3}
\end{equation*}

In [9]:
drop_comp=['indfmt', 'consol','curcdq','popsrc','datafmt','tic','datacqtr', 'datafqtr', 'actq','ceqq','dlcq', 'dlttq','exchg', 'costat']
comp=comp.drop(columns=drop_comp)

In [10]:
comp

Unnamed: 0,gvkey,datadate,fyearq,fqtr,fyr,cusip,atq,cshoq,ibq,revtq,prccq,naics,sic
0,001004,1971-08-31,1971.0,1.0,5.0,000361105,,0.780,0.150,5.185,14.999987,423860,5080
1,001004,1971-11-30,1971.0,2.0,5.0,000361105,,0.780,0.185,5.515,19.249980,423860,5080
2,001004,1972-02-29,1971.0,3.0,5.0,000361105,,1.009,0.204,6.160,25.624976,423860,5080
3,001004,1972-05-31,1971.0,4.0,5.0,000361105,16.501,1.034,0.315,8.219,32.999938,423860,5080
4,001004,1972-08-31,1972.0,1.0,5.0,000361105,,1.044,0.351,7.555,21.749983,423860,5080
...,...,...,...,...,...,...,...,...,...,...,...,...,...
653590,345980,2020-03-31,2020.0,1.0,12.0,21077C107,,,-66.000,440.000,,454110,5961
653591,345980,2020-06-30,2020.0,2.0,12.0,21077C107,,,-11.000,701.000,,454110,5961
653592,345980,2020-09-30,2020.0,3.0,12.0,21077C107,1342.000,586.982,-99.000,606.000,,454110,5961
653593,345980,2020-12-31,2020.0,4.0,12.0,21077C107,2397.000,587.000,-569.000,794.000,18.240000,454110,5961


In [11]:
#We have to drop he last digit in CUSIP for Compustat to be equal to CRSP
comp['cusip'] = comp['cusip'].str[:-1]
comp

Unnamed: 0,gvkey,datadate,fyearq,fqtr,fyr,cusip,atq,cshoq,ibq,revtq,prccq,naics,sic
0,001004,1971-08-31,1971.0,1.0,5.0,00036110,,0.780,0.150,5.185,14.999987,423860,5080
1,001004,1971-11-30,1971.0,2.0,5.0,00036110,,0.780,0.185,5.515,19.249980,423860,5080
2,001004,1972-02-29,1971.0,3.0,5.0,00036110,,1.009,0.204,6.160,25.624976,423860,5080
3,001004,1972-05-31,1971.0,4.0,5.0,00036110,16.501,1.034,0.315,8.219,32.999938,423860,5080
4,001004,1972-08-31,1972.0,1.0,5.0,00036110,,1.044,0.351,7.555,21.749983,423860,5080
...,...,...,...,...,...,...,...,...,...,...,...,...,...
653590,345980,2020-03-31,2020.0,1.0,12.0,21077C10,,,-66.000,440.000,,454110,5961
653591,345980,2020-06-30,2020.0,2.0,12.0,21077C10,,,-11.000,701.000,,454110,5961
653592,345980,2020-09-30,2020.0,3.0,12.0,21077C10,1342.000,586.982,-99.000,606.000,,454110,5961
653593,345980,2020-12-31,2020.0,4.0,12.0,21077C10,2397.000,587.000,-569.000,794.000,18.240000,454110,5961


- if there are duplicates just remove them (make sure there are no duplicates)
- 

##### Next steps for CRSP
- drop the rest of the variables that are unnecessary


##### Next steps for IBES
- annadats_act -> the day of the announcement
- remove estimator, fpi, revdats, revtims
- t -> anndats - anndats_act >= -15
- filter the analysts that didn't follow this
- by ticker fpedats analys: sort anndats and use value at the most recent anndats 
- take the median/mean of the leftover forecasts
- consensus forecast = mean or median across 6 analysts for a given ticker at a given fpedats 
- by ticker fpedats: unique obs -> make it for firm level

##### Next steps for Compustat
- ibes and compustat are quarterly and crsp is monthly
- fyearq (year) fqtr(qtr)
- cusip in compustat is the same as the cusip in crsp
- in compusat we must drop the last digit in cusip 
- 'prccq' is the book value per share
- 'cshoq' number of shares outstanding
- ibq -> NI
- revtq -> total revenue (can be used for firm size)
- 'atq' -> total assets

In [12]:
drop_ibes=['Unnamed: 0','ESTIMATOR','FPI','REVDATS','REVTIMS']
ibes=ibes.drop(columns=drop_ibes)
ibes

Unnamed: 0_level_0,Unnamed: 0,TICKER,CUSIP,ANALYS,VALUE,FPEDATS,ANNDATS,ANNTIMS,ACTUAL,ANNDATS_ACT,ANNTIMS_ACT
npartitions=17,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
,int64,object,object,int64,float64,object,object,object,float64,object,object
,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...


In [13]:
#Celaning out the date that comes as a data reading error when we read a stata file
ibes['ANNTIMS']=ibes['ANNTIMS'].str[10:]
ibes['ANNTIMS_ACT']=ibes['ANNTIMS_ACT'].str[10:]
ibes

Unnamed: 0_level_0,Unnamed: 0,TICKER,CUSIP,ANALYS,VALUE,FPEDATS,ANNDATS,ANNTIMS,ACTUAL,ANNDATS_ACT,ANNTIMS_ACT
npartitions=17,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
,int64,object,object,int64,float64,object,object,object,float64,object,object
,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...


In [14]:
# We are only taking a slice of the ibes data to work with it. Once we figure out we can transfer to use the whole data
ibes=ibes.head(200)

##### Following Hartzmark and Shue (2018), the earnings forecasts cannot be stale, so they filter out all the forecasts that were made more than 15 days before the day of the announcement.

In [15]:
#transform both columns into datetime.date objects
ibes['ANNDATS']=pd.to_datetime(ibes['ANNDATS'])
ibes['ANNDATS_ACT']=pd.to_datetime(ibes['ANNDATS_ACT'])
ibes['FPEDATS']=pd.to_datetime(ibes['FPEDATS'])
#the date.days attribute will return the days in int format
ibes_fltr = ibes[(ibes['ANNDATS']-ibes['ANNDATS_ACT']).dt.days >= -15]

In [16]:
ibes_fltr

Unnamed: 0.1,Unnamed: 0,TICKER,CUSIP,ANALYS,VALUE,FPEDATS,ANNDATS,ANNTIMS,ACTUAL,ANNDATS_ACT,ANNTIMS_ACT
11,12,0,87482X10,80474,0.1,2014-03-31,2014-05-07,19:32:00,0.12,2014-05-06,10:45:00
14,15,0,87482X10,80474,0.19,2014-06-30,2014-08-07,23:49:00,0.27,2014-08-06,17:05:00
108,109,1,26878510,104605,0.804,2013-06-30,2014-02-26,06:36:00,,2014-01-23,17:05:00
109,110,1,26878510,104605,-0.448,2013-09-30,2014-02-26,06:36:00,,2014-01-23,17:05:00
110,111,1,26878510,107698,0.07,2013-12-31,2014-02-26,00:16:00,0.2,2014-02-27,23:13:00
111,112,1,26878510,78506,0.23,2013-12-31,2014-02-26,03:30:00,0.2,2014-02-27,23:13:00
112,113,1,26878510,147283,0.34,2013-12-31,2014-02-26,07:02:00,0.2,2014-02-27,23:13:00
113,114,1,26878510,42769,0.2,2013-12-31,2014-02-26,00:07:00,0.2,2014-02-27,23:13:00
114,115,1,26878510,83355,0.34,2013-12-31,2014-02-26,09:37:00,0.2,2014-02-27,23:13:00
115,116,1,26878510,137332,0.67,2013-12-31,2014-02-26,05:37:00,0.2,2014-02-27,23:13:00


##### The next step following Hartzmark and Shue (2018):
- sort the IBES: we take each N
- this will limit the sample to only one forecast per analyst per quarter
- take the median of this number to get the quarterly consensus
- by ticker, fpedats, analys: sort anndats and use value at the most recent anndats

In [39]:
ibes_fltr_grp=ibes_fltr.groupby(['TICKER','FPEDATS','ANALYS'])
#ibes_fltr_grp.get_group('0000')
ibes_fin_fltr=ibes_fltr_grp['ANNDATS'].max()
ibes_fin_fltr

TICKER  FPEDATS     ANALYS
0000    2014-03-31  80474    2014-05-07
        2014-06-30  80474    2014-08-07
0001    2013-06-30  104605   2014-02-26
        2013-09-30  104605   2014-02-26
        2013-12-31  42769    2014-02-26
                    78506    2014-02-26
                    83355    2014-02-26
                    107698   2014-02-26
                    132083   2014-02-28
                    137332   2014-02-26
                    147283   2014-02-26
        2014-03-31  18677    2014-05-05
                    22290    2014-04-24
                    109408   2014-05-05
                    122739   2014-05-05
                    136072   2014-05-05
        2014-06-30  42769    2014-07-31
                    71894    2014-08-01
                    83355    2014-08-01
                    136072   2014-08-01
                    137332   2014-08-01
                    139608   2014-08-01
                    147283   2014-08-01
Name: ANNDATS, dtype: datetime64[ns]

##### 

In [48]:
ibes_fltr_grp['ANNDATS'].max()
ibes_fltr_grp

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ANNDATS,VALUE
TICKER,FPEDATS,ANALYS,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2014-03-31,80474,2014-05-07,0.1
0,2014-06-30,80474,2014-08-07,0.19
1,2013-06-30,104605,2014-02-26,0.804
1,2013-09-30,104605,2014-02-26,-0.448
1,2013-12-31,42769,2014-02-26,0.2
1,2013-12-31,78506,2014-02-26,0.23
1,2013-12-31,83355,2014-02-26,0.34
1,2013-12-31,107698,2014-02-26,0.07
1,2013-12-31,132083,2014-02-28,0.22
1,2013-12-31,137332,2014-02-26,0.67


In [90]:
ibes_fnl=ibes_fltr.groupby(['TICKER','FPEDATS']).agg({'ANNDATS':'max','VALUE':'median'})
ibes_fnl

Unnamed: 0_level_0,Unnamed: 1_level_0,ANNDATS,VALUE
TICKER,FPEDATS,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2014-03-31,2014-05-07,0.1
0,2014-06-30,2014-08-07,0.19
1,2013-06-30,2014-02-26,0.804
1,2013-09-30,2014-02-26,-0.448
1,2013-12-31,2014-02-28,0.23
1,2014-03-31,2014-05-05,0.17
1,2014-06-30,2014-08-01,0.205


In [73]:
#This kinda works, but it accounts for all the values in the median calculation. We need to figure out a way to drop the repeated values from the same analyst
pd.DataFrame(ibes_fltr_grp['ANNDATS']).reset_index()
groups = ibes_fltr_grp.apply(list)
  
df1 = groups.reset_index(name = 'listvalues')
# show the dataframe
df1

Unnamed: 0,TICKER,FPEDATS,ANALYS,listvalues
0,0,2014-03-31,80474,"[TICKER, CUSIP, ANALYS, VALUE, FPEDATS, ANNDAT..."
1,0,2014-06-30,80474,"[TICKER, CUSIP, ANALYS, VALUE, FPEDATS, ANNDAT..."
2,1,2013-06-30,104605,"[TICKER, CUSIP, ANALYS, VALUE, FPEDATS, ANNDAT..."
3,1,2013-09-30,104605,"[TICKER, CUSIP, ANALYS, VALUE, FPEDATS, ANNDAT..."
4,1,2013-12-31,42769,"[TICKER, CUSIP, ANALYS, VALUE, FPEDATS, ANNDAT..."
5,1,2013-12-31,78506,"[TICKER, CUSIP, ANALYS, VALUE, FPEDATS, ANNDAT..."
6,1,2013-12-31,83355,"[TICKER, CUSIP, ANALYS, VALUE, FPEDATS, ANNDAT..."
7,1,2013-12-31,107698,"[TICKER, CUSIP, ANALYS, VALUE, FPEDATS, ANNDAT..."
8,1,2013-12-31,132083,"[TICKER, CUSIP, ANALYS, VALUE, FPEDATS, ANNDAT..."
9,1,2013-12-31,137332,"[TICKER, CUSIP, ANALYS, VALUE, FPEDATS, ANNDAT..."


In [74]:
df1.drop('listvalues', axis=1, inplace=True)
df1

Unnamed: 0,TICKER,FPEDATS,ANALYS
0,0,2014-03-31,80474
1,0,2014-06-30,80474
2,1,2013-06-30,104605
3,1,2013-09-30,104605
4,1,2013-12-31,42769
5,1,2013-12-31,78506
6,1,2013-12-31,83355
7,1,2013-12-31,107698
8,1,2013-12-31,132083
9,1,2013-12-31,137332


In [95]:

df3 = ibes_fnl.reset_index()
df3

Unnamed: 0,TICKER,FPEDATS,ANNDATS,VALUE
0,0,2014-03-31,2014-05-07,0.1
1,0,2014-06-30,2014-08-07,0.19
2,1,2013-06-30,2014-02-26,0.804
3,1,2013-09-30,2014-02-26,-0.448
4,1,2013-12-31,2014-02-28,0.23
5,1,2014-03-31,2014-05-05,0.17
6,1,2014-06-30,2014-08-01,0.205


In [98]:
ibes_test=ibes_fltr.groupby(['TICKER','FPEDATS']).agg({'ANNDATS':'max','VALUE':'median','ACTUAL':''})

AttributeError: 'SeriesGroupBy' object has no attribute ''