In [2]:
import pandas as pd
import numpy as np
import polars as pl

# Imports

In [3]:
file_path_forecasts = '../data/dataset2014-2022-v3.zip'
file_path_company = '../data/dataset-company-industry-data-v1.csv'

df_forecasts = pd.read_csv(file_path_forecasts)
df_companies = pd.read_csv(file_path_company)

In [4]:
df_forecasts.head()

Unnamed: 0,TICKER,CUSIP,OFTIC,CNAME,ESTIMATOR,ANALYS,FPI,MEASURE,VALUE,FPEDATS,REVDATS,ANNDATS,ACTUAL,ANNDATS_ACT
0,0,87482X10,TLMR,TALMER BANCORP,2500,72481,1,EPS,1.2,2014-12-31,2014-12-03,2014-12-03,1.21,2015-01-30
1,0,87482X10,TLMR,TALMER BANCORP,2500,72481,1,EPS,0.85,2015-12-31,2015-02-18,2015-02-18,1.02,2016-01-26
2,0,87482X10,TLMR,TALMER BANCORP,149,119962,1,EPS,0.86,2015-12-31,2015-02-18,2015-02-18,1.02,2016-01-26
3,0,87482X10,TLMR,TALMER BANCORP,1267,71182,1,EPS,0.89,2015-12-31,2015-04-30,2015-02-18,1.02,2016-01-26
4,0,87482X10,TLMR,TALMER BANCORP,149,119962,1,EPS,0.89,2015-12-31,2015-04-30,2015-02-23,1.02,2016-01-26


In [5]:
df_companies.head()

Unnamed: 0,gvkey,datadate,fyear,indfmt,consol,popsrc,datafmt,tic,cusip,curcd,fyr,costat,sic,spcindcd
0,1004,2014-05-31,2013.0,INDL,C,D,STD,AIR,361105,USD,5.0,A,5080.0,110.0
1,1004,2015-05-31,2014.0,INDL,C,D,STD,AIR,361105,USD,5.0,A,5080.0,110.0
2,1004,2016-05-31,2015.0,INDL,C,D,STD,AIR,361105,USD,5.0,A,5080.0,110.0
3,1004,2017-05-31,2016.0,INDL,C,D,STD,AIR,361105,USD,5.0,A,5080.0,110.0
4,1004,2018-05-31,2017.0,INDL,C,D,STD,AIR,361105,USD,5.0,A,5080.0,110.0


# Merge SIC Codes 

In [7]:
sic_codes_w_ticker = df_companies[['tic', 'sic']].drop_duplicates().reset_index(drop=True)
sic_codes_w_cusip = df_companies[['cusip', 'sic']].drop_duplicates().reset_index(drop=True)

## Preprocessing

- Adjust datetime datatype columns
- Apply restrictions based on our assumptions
- Re-scaling depending on accuracy

In [9]:
# convert columns with dates to datetime
df_forecasts = df_forecasts.rename(columns={col: col.lower() for col in df_forecasts.columns})
def convert_to_datetime(df):
    columns_to_convert = ['fpedats','revdats', "anndats", 'anndats_act']
    for column in columns_to_convert:
        df[column] = pd.to_datetime(df[column])
    return df
df_forecasts = convert_to_datetime(df_forecasts)
# filter forecasts  based on time
df_forecasts['difference_date'] = df_forecasts['fpedats'] - df_forecasts['anndats']
df_forecasts = df_forecasts[(df_forecasts['difference_date']> pd.Timedelta(days=30))&(df_forecasts['difference_date']<pd.Timedelta(days=365))]
df_forecasts.head()

Unnamed: 0,ticker,cusip,oftic,cname,estimator,analys,fpi,measure,value,fpedats,revdats,anndats,actual,anndats_act,difference_date
1,0,87482X10,TLMR,TALMER BANCORP,2500,72481,1,EPS,0.85,2015-12-31,2015-02-18,2015-02-18,1.02,2016-01-26,316 days
2,0,87482X10,TLMR,TALMER BANCORP,149,119962,1,EPS,0.86,2015-12-31,2015-02-18,2015-02-18,1.02,2016-01-26,316 days
3,0,87482X10,TLMR,TALMER BANCORP,1267,71182,1,EPS,0.89,2015-12-31,2015-04-30,2015-02-18,1.02,2016-01-26,316 days
4,0,87482X10,TLMR,TALMER BANCORP,149,119962,1,EPS,0.89,2015-12-31,2015-04-30,2015-02-23,1.02,2016-01-26,311 days
5,0,87482X10,TLMR,TALMER BANCORP,873,79092,1,EPS,0.91,2015-12-31,2015-07-06,2015-04-07,1.02,2016-01-26,268 days


### Data set restrictions

Restrictions should be implemented with respective functions

- Sample horizon 10 - 20 years
- Frequency: annual
- Include only forecasts issued no earlier than 1 year ahead and no later than 30 days before fiscal year end
- (Exclude forecasts for companies which only one analyst provides a forecast)
- Exclude all analysts appearing in the initial 2-3 years of the dataset (migitating problem of experience)
- (Retain the last forecast an analyst issues in a particular year)
- Drop all unneccessary columns from df
- ...

## Merge df with mapping list

In [26]:
df_merged = pd.merge(left=df_forecasts, right=sic_codes_w_ticker, how='left', left_on='oftic', right_on='tic')
df_merged = df_merged.drop(columns=['tic'])

In [29]:
df_merged_w_cusip = pd.merge(left=df_forecasts, right=sic_codes_w_cusip, how='left', left_on='cusip', right_on='cusip')
df_merged_w_cusip.head()

Unnamed: 0,ticker,cusip,oftic,cname,estimator,analys,fpi,measure,value,fpedats,revdats,anndats,actual,anndats_act,difference_date,sic
0,0,87482X10,TLMR,TALMER BANCORP,2500,72481,1,EPS,0.85,2015-12-31,2015-02-18,2015-02-18,1.02,2016-01-26,316 days,
1,0,87482X10,TLMR,TALMER BANCORP,149,119962,1,EPS,0.86,2015-12-31,2015-02-18,2015-02-18,1.02,2016-01-26,316 days,
2,0,87482X10,TLMR,TALMER BANCORP,1267,71182,1,EPS,0.89,2015-12-31,2015-04-30,2015-02-18,1.02,2016-01-26,316 days,
3,0,87482X10,TLMR,TALMER BANCORP,149,119962,1,EPS,0.89,2015-12-31,2015-04-30,2015-02-23,1.02,2016-01-26,311 days,
4,0,87482X10,TLMR,TALMER BANCORP,873,79092,1,EPS,0.91,2015-12-31,2015-07-06,2015-04-07,1.02,2016-01-26,268 days,


In [32]:
df_merged['oftic'][df_merged.sic.isna()].drop_duplicates().count()

1958

In [33]:
df_merged_w_cusip['oftic'][df_merged_w_cusip.sic.isna()].drop_duplicates().count()

8412

In [46]:
df_forecasts[df_forecasts['cusip'].isna()]

Unnamed: 0,ticker,cusip,oftic,cname,estimator,analys,fpi,measure,value,fpedats,revdats,anndats,actual,anndats_act,difference_date
22653,00DB,,,,220,135799,1,EPS,0.66,2015-03-31,2014-12-02,2014-12-02,0.76,2015-05-28,119 days
22654,00DB,,,,282,108642,1,EPS,0.66,2015-03-31,2014-12-03,2014-12-02,0.76,2015-05-28,119 days
22655,00DB,,,,873,148731,1,EPS,0.66,2015-03-31,2014-12-03,2014-12-03,0.76,2015-05-28,118 days
22656,00DB,,,,930,114475,1,EPS,0.64,2015-03-31,2015-01-05,2014-12-03,0.76,2015-05-28,118 days
24483,00EH,,,,3643,155036,1,EPS,-16.00,2015-09-30,2014-12-17,2014-12-15,,2015-12-11,289 days
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1569564,ZUO,,,,192,150623,1,EPS,-0.66,2019-01-31,2018-06-01,2018-05-07,-0.55,2019-03-21,269 days
1570714,ZYNE,,,,220,55673,1,EPS,-1.95,2015-12-31,2015-08-31,2015-08-31,-2.82,2016-03-14,122 days
1570715,ZYNE,,,,930,113164,1,EPS,-1.38,2015-12-31,2015-09-01,2015-08-31,-2.82,2016-03-14,122 days
1570716,ZYNE,,,,98,122005,1,EPS,-2.35,2015-12-31,2015-09-09,2015-09-08,-2.82,2016-03-14,114 days


In [43]:
df_forecasts['cusip'].isna().count()

1415000

In [45]:
df_forecasts['oftic'].drop_duplicates()

1          TLMR
71          EPE
746        EPEG
750        ACSF
771        CRCM
           ... 
1569565     ZUO
1569672    ZVIA
1569727     TJX
1570718    ZYNE
1570907    ZYXI
Name: oftic, Length: 8413, dtype: object

In [23]:
df_merged['oftic'][df_merged.sic.isna()].drop_duplicates().count()

1958

# DV and Feature Generation

Feature list (Proxies for analyst characteristics):

- Forecasting ability/experience: number of years for which analyst i supplied at least one forecast during the first 11 months of the year through year t
- Broker size: a dummy variable set to 1 if analyst i is employed by a firm in the top size decile during year t, and set to 0 otherwise. Size deciles are calculated based on the number of analysts employed in year t.
- Complexity: number of firms for which analyst i supplied at least one forecast during the first 11 months of year t OR number of two-digit SICs for which analyst i supplied at least one forecast during the first 11 months of year t minus the average number of two-digit SICs followed by an analyst following firm j at time t.
- Forecast age: age (in days) of analyst i's forecast for firm j's earnings at time t
- 
