In [1]:
import pandas as pd

In [44]:
df = pd.read_parquet('../data/processed/forecasts_pmafe.parquet')
df.head()

Unnamed: 0,ibes_ticker_pk,official_ticker,cname,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,afe_analyst_i,afe_analyst_i_avg,afe_mean_firm_j,pmafe
0,0,TLMR,TALMER BANCORP,2500,72481,0.85,2015-12-31,2015-02-18,2015-02-18,1.02,2016-01-26,316 days,0.17,0.198,0.166786,0.187152
1,0,TLMR,TALMER BANCORP,149,119962,0.86,2015-12-31,2015-02-18,2015-02-18,1.02,2016-01-26,316 days,0.16,0.174286,0.166786,0.044968
2,0,TLMR,TALMER BANCORP,1267,71182,0.89,2015-12-31,2015-04-30,2015-02-18,1.02,2016-01-26,316 days,0.13,0.158,0.166786,-0.052677
3,0,TLMR,TALMER BANCORP,149,119962,0.89,2015-12-31,2015-04-30,2015-02-23,1.02,2016-01-26,311 days,0.13,0.174286,0.166786,0.044968
4,0,TLMR,TALMER BANCORP,873,79092,0.91,2015-12-31,2015-07-06,2015-04-07,1.02,2016-01-26,268 days,0.11,0.136667,0.166786,-0.180585


In [45]:
df

ibes_ticker_pk                   object
official_ticker                  object
cname                            object
estimator                         int64
analyst                           int64
estimated_eps                   float64
fiscal_period_ending     datetime64[ns]
revision_date            datetime64[ns]
announce_date            datetime64[ns]
actual_eps                      float64
announce_date_actual     datetime64[ns]
forecast_horizon        timedelta64[ns]
afe_analyst_i                   float64
afe_analyst_i_avg               float64
afe_mean_firm_j                 float64
pmafe                           float64
dtype: object

In [4]:
df_analyst = df[['analyst', 'announce_date']]
df_analyst['year'] = df_analyst['announce_date'].dt.year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_analyst['year'] = df_analyst['announce_date'].dt.year


In [5]:
df_analyst = df_analyst.drop(columns= ['announce_date'])
df_analyst = df_analyst.drop_duplicates().reset_index(drop=True)

In [6]:
df_analyst

Unnamed: 0,analyst,year
0,72481,2015
1,119962,2015
2,71182,2015
3,79092,2015
4,80474,2015
...,...,...
40200,191722,2022
40201,104860,2022
40202,188121,2022
40203,131108,2019


- Complexity: number of firms for which analyst i supplied at least one forecast during the first 11 months of year t OR number of two-digit SICs for which analyst i supplied at least one forecast during the first 11 months of year t minus the average number of two-digit SICs followed by an analyst following firm j at time t.

In [4]:
number_of_forecasts_by_analyst_pro_period = df.groupby(['fiscal_period_ending', 'analyst'])['official_ticker'].count().reset_index()\
    .sort_values(by=['fiscal_period_ending', 'official_ticker'], ascending=[True, False]).set_index(['fiscal_period_ending', 'analyst'])

In [6]:
number_of_forecasts_by_analyst_pro_period.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,official_ticker
fiscal_period_ending,analyst,Unnamed: 2_level_1
2015-01-31,106026,14
2015-01-31,118217,11
2015-01-31,127622,11
2015-01-31,108441,10
2015-01-31,114475,10


- Forecasting ability/experience: number of years for which analyst i supplied at least one forecast during the first 11 months of the year through year t

In [7]:
old_data = pd.read_parquet('../data/raw/1990-2014.parquet')
old_data.head()

Unnamed: 0,TICKER,OFTIC,CNAME,ACTDATS,ANALYS,FPI,MEASURE,FPEDATS,ANNDATS
0,0,TLMR,TALMER BANCORP,2014-03-11,119962,1,EPS,2014-12-31,2014-03-09
1,0,TLMR,TALMER BANCORP,2014-03-11,80474,1,EPS,2014-12-31,2014-03-10
2,0,TLMR,TALMER BANCORP,2014-03-11,50789,1,EPS,2014-12-31,2014-03-10
3,0,TLMR,TALMER BANCORP,2014-03-11,79092,1,EPS,2014-12-31,2014-03-10
4,0,TLMR,TALMER BANCORP,2014-03-17,71182,1,EPS,2014-12-31,2014-03-17


In [8]:
analysts_experience = old_data[['ANALYS', 'ANNDATS']].drop_duplicates().reset_index(drop=True)
analysts_experience

Unnamed: 0,ANALYS,ANNDATS
0,119962,2014-03-09
1,80474,2014-03-10
2,50789,2014-03-10
3,79092,2014-03-10
4,71182,2014-03-17
...,...,...
2138012,136140,2013-11-10
2138013,41097,2013-11-14
2138014,136140,2014-09-28
2138015,41097,2014-10-13


In [9]:
analysts_experience['ANNDATS'] = pd.to_datetime(analysts_experience['ANNDATS'])
analysts_experience['year'] = analysts_experience['ANNDATS'].dt.year
analysts_experience.head()

Unnamed: 0,ANALYS,ANNDATS,year
0,119962,2014-03-09,2014
1,80474,2014-03-10,2014
2,50789,2014-03-10,2014
3,79092,2014-03-10,2014
4,71182,2014-03-17,2014


In [10]:
analysts_experience.rename(columns={'ANALYS':'analyst'}, inplace=True)
analysts_experience.drop(columns='ANNDATS', inplace=True)
analysts_experience.head()

Unnamed: 0,analyst,year
0,119962,2014
1,80474,2014
2,50789,2014
3,79092,2014
4,71182,2014


In [11]:
all_years_analyst = pd.concat([analysts_experience, df_analyst])
all_years_analyst.drop_duplicates().reset_index(drop=True)
all_years_analyst.head()

Unnamed: 0,analyst,year
0,119962,2014
1,80474,2014
2,50789,2014
3,79092,2014
4,71182,2014


In [24]:
all_years_analyst['experience'] = all_years_analyst.groupby('analyst')['year'].transform(lambda x: x - x.min())

In [41]:
all_years_analyst = all_years_analyst.drop_duplicates()

In [42]:
all_years_analyst.to_parquet('../data/processed/analyst_experience.parquet')

In [43]:
all_years_analyst

Unnamed: 0,analyst,year,experience
0,119962,2014,7
1,80474,2014,8
2,50789,2014,19
3,79092,2014,13
4,71182,2014,20
...,...,...,...
40200,191722,2022,0
40201,104860,2022,1
40202,188121,2022,0
40203,131108,2019,6


### Sic codes join

In [66]:
df.head()

Unnamed: 0,ibes_ticker_pk,official_ticker,cname,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,afe_analyst_i,afe_analyst_i_avg,afe_mean_firm_j,pmafe
0,0,TLMR,TALMER BANCORP,2500,72481,0.85,2015-12-31,2015-02-18,2015-02-18,1.02,2016-01-26,316 days,0.17,0.198,0.166786,0.187152
1,0,TLMR,TALMER BANCORP,149,119962,0.86,2015-12-31,2015-02-18,2015-02-18,1.02,2016-01-26,316 days,0.16,0.174286,0.166786,0.044968
2,0,TLMR,TALMER BANCORP,1267,71182,0.89,2015-12-31,2015-04-30,2015-02-18,1.02,2016-01-26,316 days,0.13,0.158,0.166786,-0.052677
3,0,TLMR,TALMER BANCORP,149,119962,0.89,2015-12-31,2015-04-30,2015-02-23,1.02,2016-01-26,311 days,0.13,0.174286,0.166786,0.044968
4,0,TLMR,TALMER BANCORP,873,79092,0.91,2015-12-31,2015-07-06,2015-04-07,1.02,2016-01-26,268 days,0.11,0.136667,0.166786,-0.180585


In [67]:
link_table = pd.read_csv('../data/link_table_crisp_ibes.csv')
link_table.head()

Unnamed: 0,TICKER,PERMNO,NCUSIP,sdate,edate,SCORE
0,0000,14471.0,87482X10,2014-02-20,2016-08-31,1
1,0001,14392.0,26878510,2014-02-20,2019-05-22,1
2,0001,,,2019-06-20,2023-12-31,6
3,0004,14418.0,02504D10,2014-02-20,2018-08-24,1
4,000R,14378.0,14163310,2014-02-20,2020-02-10,1


In [73]:
link_table = link_table[link_table['SCORE'].isin([1,2,3])]
link_table.head()

Unnamed: 0,TICKER,PERMNO,NCUSIP,sdate,edate,SCORE
0,0000,14471.0,87482X10,2014-02-20,2016-08-31,1
1,0001,14392.0,26878510,2014-02-20,2019-05-22,1
3,0004,14418.0,02504D10,2014-02-20,2018-08-24,1
4,000R,14378.0,14163310,2014-02-20,2020-02-10,1
5,000V,14423.0,15117E10,2014-03-20,2016-03-22,1


In [91]:
ambigous_links = link_table.groupby('TICKER').count().query('PERMNO>1')[['PERMNO', 'NCUSIP']].reset_index()

In [96]:
df[df['ibes_ticker_pk'].isin(ambigous_links['TICKER'])]['ibes_ticker_pk'].nunique()

2345

In [101]:
clear_links = link_table[~link_table['TICKER'].isin(ambigous_links['TICKER'])]

In [102]:
clear_links

Unnamed: 0,TICKER,PERMNO,NCUSIP,sdate,edate,SCORE
0,0000,14471.0,87482X10,2014-02-20,2016-08-31,1
1,0001,14392.0,26878510,2014-02-20,2019-05-22,1
3,0004,14418.0,02504D10,2014-02-20,2018-08-24,1
4,000R,14378.0,14163310,2014-02-20,2020-02-10,1
7,000Y,14436.0,90400D10,2014-03-20,2023-12-29,1
...,...,...,...,...,...,...
36150,ZVSA,22781.0,98987D10,2023-09-14,2023-12-04,1
36153,ZVX,85520.0,98950E40,1997-12-18,2007-03-16,1
36155,ZXZX,12720.0,16951E10,2011-07-14,2017-05-18,1
36158,ZYNE,15646.0,98986X10,2015-09-17,2023-10-10,1


In [109]:
df = pd.merge(df, clear_links, how='left', left_on='ibes_ticker_pk', right_on='TICKER')

In [112]:
df['ibes_ticker_pk'].nunique()

7255

In [114]:
crisp_compstsat = pd.read_csv('../data/crisp-computsat-link.csv')
crisp_compstsat.head()

Unnamed: 0,gvkey,conm,tic,cusip,sic,LINKPRIM,LIID,LINKTYPE,LPERMNO,LPERMCO,LINKDT,LINKENDDT
0,1000,A & E PLASTIK PAK INC,AE.2,32102,3089,P,1,LU,25881,23369,1970-11-13,1978-06-30
1,1001,A & M FOOD SERVICES INC,AMFD.,165100,5812,P,1,LU,10015,6398,1983-09-20,1986-07-31
2,1002,AAI CORP,AAIC.1,352104,3825,C,1,LC,10023,22159,1972-12-14,1973-06-05
3,1003,A.A. IMPORTING CO INC,ANTQ,354100,5712,C,1,LU,10031,6672,1983-12-07,1989-08-16
4,1004,AAR CORP,AIR,361105,5080,P,1,LU,54594,20000,1972-04-24,E


In [119]:
sic_cs = crisp_compstsat[['LPERMNO', 'gvkey', 'sic']]

In [123]:
sic_cs[sic_cs['sic'].isna()]

Unnamed: 0,LPERMNO,gvkey,sic


In [121]:
df[df['PERMNO'].isin(sic_cs['LPERMNO'])]['ibes_ticker_pk'].nunique()

4202