In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)

In [11]:
df = pd.read_parquet('../data/processed/ibes_forecasts_collapsed.parquet')

In [12]:
df.head()

Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j
54,0,TLMR,TALMER BANCORP,258,8675,0.89,2015-12-31,2015-12-23,2015-10-30,1.02,2016-01-26,62 days,112,0.873333,2015,2015,0.13,0.146667,0.166786,-0.120628,6
27,0,TLMR,TALMER BANCORP,952,50789,1.14,2014-12-31,2014-11-08,2014-11-07,1.21,2015-01-30,54 days,182,0.806,2014,2014,0.07,0.404,0.3575,0.13007,5
25,0,TLMR,TALMER BANCORP,1267,71182,1.21,2014-12-31,2014-11-06,2014-11-05,1.21,2015-01-30,56 days,182,1.0825,2014,2014,0.0,0.1425,0.3575,-0.601399,5
51,0,TLMR,TALMER BANCORP,1267,71182,0.9,2015-12-31,2015-11-10,2015-10-29,1.02,2016-01-26,63 days,181,0.862,2015,2015,0.12,0.158,0.166786,-0.052677,6
53,0,TLMR,TALMER BANCORP,2500,72481,0.86,2015-12-31,2015-10-30,2015-10-30,1.02,2016-01-26,62 days,173,0.822,2015,2015,0.16,0.198,0.166786,0.187152,6


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 521289 entries, 54 to 2164707
Data columns (total 21 columns):
 #   Column                      Non-Null Count   Dtype          
---  ------                      --------------   -----          
 0   ibes_ticker_pk              521289 non-null  object         
 1   official_ticker             519516 non-null  object         
 2   company_name                519530 non-null  object         
 3   estimator                   521289 non-null  int64          
 4   analyst                     521289 non-null  int64          
 5   estimated_eps               521289 non-null  float64        
 6   fiscal_period_ending        521289 non-null  datetime64[ns] 
 7   revision_date               521289 non-null  datetime64[ns] 
 8   announce_date               521289 non-null  datetime64[ns] 
 9   actual_eps                  521289 non-null  float64        
 10  announce_date_actual        521289 non-null  datetime64[ns] 
 11  forecast_horizon            5

In [14]:
# test
analyst_id = 113333
test = df[(df['ibes_ticker_pk'] == "AAPL") & (df['fiscal_year'] == 2018) & (df['analyst'] == analyst_id)]
test

Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j
97025,AAPL,AAPL,APPLE,171,113333,2.95,2018-09-30,2018-09-04,2018-07-31,2.9775,2018-11-01,61 days,196,2.9175,2018,2018,0.0275,0.06,0.130797,-0.541273,45


# Feature List
- Broker Size
- Top 10 Broker
- Coverage
- Surprise
- Lag Suprise
- Complexity

# Broker Size
Broker size as the number of unique analysts at a broker in year t

In [15]:
# print count of unique estimators per fiscal_year
df.groupby('fiscal_year')['estimator'].nunique()

fiscal_year
2010    346
2011    347
2012    332
2013    353
2014    475
2015    501
2016    481
2017    459
2018    437
2019    416
2020    407
2021    408
2022    381
Name: estimator, dtype: int64

In [16]:
# group by estimator and fiscal_year and calculate the count of unique analysts and add as column to df
df['broker_size'] = df.groupby(['fiscal_year', 'estimator'])['analyst'].transform('nunique')

In [17]:
df.head()

Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j,broker_size
54,0,TLMR,TALMER BANCORP,258,8675,0.89,2015-12-31,2015-12-23,2015-10-30,1.02,2016-01-26,62 days,112,0.873333,2015,2015,0.13,0.146667,0.166786,-0.120628,6,36
27,0,TLMR,TALMER BANCORP,952,50789,1.14,2014-12-31,2014-11-08,2014-11-07,1.21,2015-01-30,54 days,182,0.806,2014,2014,0.07,0.404,0.3575,0.13007,5,49
25,0,TLMR,TALMER BANCORP,1267,71182,1.21,2014-12-31,2014-11-06,2014-11-05,1.21,2015-01-30,56 days,182,1.0825,2014,2014,0.0,0.1425,0.3575,-0.601399,5,112
51,0,TLMR,TALMER BANCORP,1267,71182,0.9,2015-12-31,2015-11-10,2015-10-29,1.02,2016-01-26,63 days,181,0.862,2015,2015,0.12,0.158,0.166786,-0.052677,6,124
53,0,TLMR,TALMER BANCORP,2500,72481,0.86,2015-12-31,2015-10-30,2015-10-30,1.02,2016-01-26,62 days,173,0.822,2015,2015,0.16,0.198,0.166786,0.187152,6,7


In [18]:
# test broker size: All unique analysts in 2018 at broker 171 (correct answer = 50)
estimator_id = 171
test_bs = df[(df['fiscal_year'] == 2018) & (df['estimator'] == estimator_id)]
test_bs['analyst'].nunique()

50

# Top 10 Brockerage

Do analysts of the Top 10 % largest brokers (by unqiue analyst count) have a lower pmafe on average?

Dummy variable: set to 1 if analyst i is employed by a firm in the top 10 % during year t, and set to 0 otherwise

In [19]:
df["broker_size"].describe()

count    521289.000000
mean         65.789608
std          59.017977
min           1.000000
25%          19.000000
50%          46.000000
75%         104.000000
max         248.000000
Name: broker_size, dtype: float64

In [20]:
df["broker_size"].quantile(0.90)

146.0

In [21]:
# generate list of top 10 % broker_size thresholds per fiscal year
top_10_thresholds = df.groupby('fiscal_year')['broker_size'].quantile(0.90).reset_index()
top_10_thresholds

Unnamed: 0,fiscal_year,broker_size
0,2010,120.0
1,2011,129.0
2,2012,122.0
3,2013,120.0
4,2014,173.0
5,2015,174.0
6,2016,182.0
7,2017,202.0
8,2018,188.0
9,2019,151.0


In [22]:
# if value of broker_size in df is greater than the threshold in top_10_thresholds in a fiscal_year, set to 1, else 0
#df['top_10'] = np.where(df['broker_size'] > df['fiscal_year'].map(top_10_thresholds.set_index('fiscal_year')['broker_size']), 1, 0)


In [23]:
def top_10_brokerage(df):
    """
    Function to add a dummy for each analyst that is employed by a brockerage that belongs to the 
    Top 10 % of brokerages by analyst count in year t
    Set to 1 if analyst i is employed by a firm in the top quantile during year t (fpedats_year), and set to 0 otherwise
    broker = df["estimator"]
    analyst = df["analyst"]
    broker_size = df["broker_size"]
    Output: this function adds a new column to the dataframe called "top_10_brokerage" with the dummy variable
    """
    top_10_thresholds = df.groupby('fiscal_year')['broker_size'].quantile(0.90).reset_index()
    df['top_10'] = np.where(df['broker_size'] > df['fiscal_year'].map(top_10_thresholds.set_index('fiscal_year')['broker_size']), 1, 0)
    return df

In [24]:
top_10_brokerage(df)

Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j,broker_size,top_10
54,0000,TLMR,TALMER BANCORP,258,8675,0.8900,2015-12-31,2015-12-23,2015-10-30,1.0200,2016-01-26,62 days,112,0.873333,2015,2015,0.1300,0.146667,0.166786,-0.120628,6,36,0
27,0000,TLMR,TALMER BANCORP,952,50789,1.1400,2014-12-31,2014-11-08,2014-11-07,1.2100,2015-01-30,54 days,182,0.806000,2014,2014,0.0700,0.404000,0.357500,0.130070,5,49,0
25,0000,TLMR,TALMER BANCORP,1267,71182,1.2100,2014-12-31,2014-11-06,2014-11-05,1.2100,2015-01-30,56 days,182,1.082500,2014,2014,0.0000,0.142500,0.357500,-0.601399,5,112,0
51,0000,TLMR,TALMER BANCORP,1267,71182,0.9000,2015-12-31,2015-11-10,2015-10-29,1.0200,2016-01-26,63 days,181,0.862000,2015,2015,0.1200,0.158000,0.166786,-0.052677,6,124,0
53,0000,TLMR,TALMER BANCORP,2500,72481,0.8600,2015-12-31,2015-10-30,2015-10-30,1.0200,2016-01-26,62 days,173,0.822000,2015,2015,0.1600,0.198000,0.166786,0.187152,6,7,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2164644,ZYNX,ZYXI,ZYNEX INC,1549,191964,0.2182,2019-12-31,2019-10-22,2019-10-22,0.2545,2020-02-27,70 days,111,0.222750,2019,2019,0.0363,0.031750,0.027214,0.166667,4,41,0
2164671,ZYNX,ZYXI,ZYNEX INC,1549,191964,0.2364,2020-12-31,2021-02-22,2020-10-28,0.2364,2021-02-25,64 days,176,0.287029,2020,2020,0.0000,0.053229,0.067215,-0.208089,5,36,0
2164693,ZYNX,ZYXI,ZYNEX INC,1549,191964,0.3818,2021-12-31,2021-11-03,2021-11-03,0.4400,2022-02-24,58 days,169,0.332720,2021,2021,0.0582,0.107280,0.101700,0.054867,5,37,0
2164709,ZYNX,ZYXI,ZYNEX,1549,191964,0.4400,2022-12-31,2022-10-28,2022-10-28,0.4400,2023-03-13,64 days,197,0.432500,2022,2022,0.0000,0.007500,0.035867,-0.790892,5,36,0


In [25]:
# test top_10_brokerage: list of largest brokerages in 2018 by broker_size (treshold in 2018 is 188.0)
test_top_10 = df[df['fiscal_year'] == 2018]
test_top_10

Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j,broker_size,top_10
726,0001,EPE,EP ENGR CORP,3477,18677,-0.1300,2018-12-31,2018-11-23,2018-11-16,-0.2500,2019-03-14,45 days,156,-0.063333,2018,2018,0.1200,0.193333,0.160935,0.201315,16,30,0
727,0001,EPE,EP ENGR CORP,183,48368,-0.1600,2018-12-31,2018-11-21,2018-11-19,-0.2500,2019-03-14,42 days,144,-0.154286,2018,2018,0.0900,0.107143,0.160935,-0.334247,16,218,1
728,0001,EPE,EP ENGR CORP,118,78506,-0.2100,2018-12-31,2018-12-10,2018-11-20,-0.2500,2019-03-14,41 days,173,-0.106000,2018,2018,0.0400,0.144000,0.160935,-0.105228,16,111,0
724,0001,EPE,EP ENGR CORP,863,83355,-0.1800,2018-12-31,2018-11-14,2018-11-14,-0.2500,2019-03-14,47 days,164,-0.112727,2018,2018,0.0700,0.137273,0.160935,-0.147029,16,6,0
725,0001,EPE,EP ENGR CORP,873,104605,-0.1700,2018-12-31,2018-11-19,2018-11-19,-0.2500,2019-03-14,42 days,130,-0.010000,2018,2018,0.0800,0.240000,0.160935,0.491287,16,194,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2164505,ZYNE,ZYNE,ZYNERBA PHARMS,98,190428,-3.1600,2018-12-31,2018-05-08,2018-05-08,-2.6100,2019-03-11,237 days,237,-3.160000,2018,2018,0.5500,0.550000,0.387308,0.420060,9,40,0
2164635,ZYNX,ZYXI,ZYNEX INC,1549,51165,0.2364,2018-12-31,2018-11-05,2018-10-10,0.2545,2019-02-26,82 days,82,0.236400,2018,2018,0.0181,0.018100,0.015111,0.197794,4,42,0
2164638,ZYNX,ZYXI,ZYNEX INC,1593,72907,0.2545,2018-12-31,2018-11-13,2018-11-09,0.2545,2019-02-26,52 days,181,0.254525,2018,2018,0.0000,0.004525,0.015111,-0.700551,4,2,0
2164636,ZYNX,ZYXI,ZYNEX INC,2709,122730,0.2455,2018-12-31,2019-02-11,2018-11-06,0.2545,2019-02-26,55 days,130,0.221233,2018,2018,0.0090,0.033267,0.015111,1.201471,4,12,0


## Coverage

How many analysts within analyst i's broker cover company j in year t

In [26]:
# test
estimator_id = 228
coverage = df[(df['ibes_ticker_pk'] == "AAPL") & (df['fiscal_year'] == 2018) & (df['estimator'] == estimator_id)]
coverage

Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j,broker_size,top_10
96854,AAPL,AAPL,APPLE,228,72069,2.75,2018-09-30,2017-11-03,2017-11-03,2.9775,2018-11-01,331 days,331,2.75,2018,2017,0.2275,0.2275,0.130797,0.739341,45,69,0
97027,AAPL,AAPL,APPLE,228,81187,2.935,2018-09-30,2018-09-12,2018-07-31,2.9775,2018-11-01,61 days,156,2.83375,2018,2018,0.0425,0.14375,0.130797,0.099034,45,69,0


In [27]:
df['broker_coverage'] = df.groupby(['ibes_ticker_pk','fiscal_period_ending', 'estimator'])['analyst'].transform('count')

In [28]:
# which broker has the most analysts following a stock in a given year
print(df[df['fiscal_year'] == 2015].groupby('estimator')['broker_coverage'].max().sort_values(ascending=False).head(20))

estimator
11      4
228     4
192     4
3037    3
952     3
3039    3
249     3
258     3
3647    3
714     3
3602    3
930     3
1273    3
2980    3
183     3
3808    3
2129    3
2283    3
3322    3
2573    3
Name: broker_coverage, dtype: int64


In [29]:
def coverage(df):
    """_summary_
    Function to calculate the coverage of company j at the broker of analyst i. 
    As the count of the number of analysts following company j in year t at the same brokerage as analyst i
    """
    df['broker_coverage'] = df.groupby(['ibes_ticker_pk','fiscal_period_ending', 'estimator'])['analyst'].transform('count')
    return df

# Lagged Accuracy / "Surprise" Feature

By how much was analyst i surprised by the actual EPS of company j (as percentage)

mean_estimate_ijt = mean forecast value of EPS of analyst i for company j in year t
afe_analyst_ijt_mean = mean absolute forecast error of analyst i's forecast for company j in year t

In [30]:
# calculate surprise percentage for each forecast
df['surprise'] = (df['actual_eps'] - df['mean_estimate_ijt']) / df['mean_estimate_ijt']

In [31]:
# previous years surprise for company j for analyst i at time t (shifted by 1)
# first forecast year is always 0 (no surprise)
df['surprise_lag'] = df.sort_values(by=['analyst', 'ibes_ticker_pk', 'fiscal_year']).groupby(['analyst', 'ibes_ticker_pk'])['surprise'].shift(1)


In [32]:
# Identify the first forecast for each analyst-company pair and set surprise_lag to 0
df['rank'] = df.groupby(['analyst', 'ibes_ticker_pk'])['fiscal_year'].rank(method="min")
df.loc[df['rank'] == 1, 'surprise_lag'] = 0

In [33]:
# fiscal_year 2015 - 2020, 72069
analyst_id2 = 72069

df[(df['ibes_ticker_pk'] == "AAPL") & (df['analyst'] == analyst_id2)] #(df['fiscal_year'].between(2000, 2022))]


Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j,broker_size,top_10,broker_coverage,surprise,surprise_lag,rank
94748,AAPL,AAPL,APPLE INC,190,72069,0.5143,2010-09-30,2010-09-03,2010-07-21,0.5411,2010-10-18,71 days,148,0.4571,2010,2010,0.0268,0.084,0.086254,-0.026131,56,27,0,1,0.183767,0.0,1.0
95075,AAPL,AAPL,APPLE INC,190,72069,0.9664,2011-09-30,2011-08-25,2011-07-20,0.9886,2011-10-18,72 days,198,0.83605,2011,2011,0.0222,0.15255,0.157862,-0.03365,63,29,0,1,0.182465,0.183767,2.0
95418,AAPL,AAPL,APPLE INC,228,72069,1.5529,2012-09-30,2012-07-25,2012-07-25,1.5768,2012-10-25,67 days,201,1.451643,2012,2012,0.0239,0.149043,0.134011,0.112169,62,131,1,1,0.086218,0.182465,3.0
95785,AAPL,AAPL,APPLE INC,228,72069,1.4129,2013-09-30,2013-10-08,2013-07-24,1.4196,2013-10-28,68 days,198,1.524086,2013,2013,0.0067,0.1064,0.147664,-0.279444,60,114,0,1,-0.068556,0.086218,4.0
96094,AAPL,AAPL,APPLE INC,228,72069,1.57,2014-09-30,2014-10-20,2014-07-23,1.6125,2014-10-20,69 days,188,1.53975,2014,2014,0.0425,0.07275,0.056417,0.289502,58,103,0,1,0.047248,-0.068556,5.0
96330,AAPL,AAPL,APPLE,228,72069,2.275,2015-09-30,2015-08-28,2015-07-10,2.305,2015-10-27,82 days,199,2.1065,2015,2015,0.03,0.1985,0.161162,0.231677,51,101,0,1,0.094232,0.047248,6.0
96628,AAPL,AAPL,APPLE,228,72069,2.0675,2016-09-30,2016-10-19,2016-07-27,2.0775,2016-10-25,65 days,199,2.153437,2016,2016,0.01,0.107812,0.169175,-0.362718,54,74,0,1,-0.035263,0.094232,7.0
96835,AAPL,AAPL,APPLE,228,72069,2.25,2017-09-30,2017-09-26,2017-08-02,2.3025,2017-11-02,59 days,227,2.22875,2017,2017,0.0525,0.08,0.079169,0.010497,44,69,0,1,0.03309,-0.035263,8.0
96854,AAPL,AAPL,APPLE,228,72069,2.75,2018-09-30,2017-11-03,2017-11-03,2.9775,2018-11-01,331 days,331,2.75,2018,2017,0.2275,0.2275,0.130797,0.739341,45,69,0,2,0.082727,0.03309,9.0


In [34]:
df.drop(columns=['rank'], inplace=True)

# Forecast experience

In [3]:
old_df = pd.read_parquet('../data/raw/1990-2014.parquet')

In [6]:
old_df.head()

Unnamed: 0,TICKER,OFTIC,CNAME,ACTDATS,ANALYS,FPI,MEASURE,FPEDATS,ANNDATS
0,0,TLMR,TALMER BANCORP,2014-03-11,119962,1,EPS,2014-12-31,2014-03-09
1,0,TLMR,TALMER BANCORP,2014-03-11,80474,1,EPS,2014-12-31,2014-03-10
2,0,TLMR,TALMER BANCORP,2014-03-11,50789,1,EPS,2014-12-31,2014-03-10
3,0,TLMR,TALMER BANCORP,2014-03-11,79092,1,EPS,2014-12-31,2014-03-10
4,0,TLMR,TALMER BANCORP,2014-03-17,71182,1,EPS,2014-12-31,2014-03-17


In [4]:
analysts_experience = old_df[['ANALYS', 'ANNDATS']].drop_duplicates().reset_index(drop=True)
analysts_experience

Unnamed: 0,ANALYS,ANNDATS
0,119962,2014-03-09
1,80474,2014-03-10
2,50789,2014-03-10
3,79092,2014-03-10
4,71182,2014-03-17
...,...,...
2138012,136140,2013-11-10
2138013,41097,2013-11-14
2138014,136140,2014-09-28
2138015,41097,2014-10-13


In [5]:
analysts_experience['ANNDATS'] = pd.to_datetime(analysts_experience['ANNDATS'])
analysts_experience['year'] = analysts_experience['ANNDATS'].dt.year
analysts_experience.head()

Unnamed: 0,ANALYS,ANNDATS,year
0,119962,2014-03-09,2014
1,80474,2014-03-10,2014
2,50789,2014-03-10,2014
3,79092,2014-03-10,2014
4,71182,2014-03-17,2014


In [7]:
analysts_experience.rename(columns={'ANALYS':'analyst'}, inplace=True)
analysts_experience.drop(columns='ANNDATS', inplace=True)
analysts_experience.head()

Unnamed: 0,analyst,year
0,119962,2014
1,80474,2014
2,50789,2014
3,79092,2014
4,71182,2014


In [35]:
df['analyst_year'] = df.announce_date.dt.year
df.head()

Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j,broker_size,top_10,broker_coverage,surprise,surprise_lag,analyst_year
54,0,TLMR,TALMER BANCORP,258,8675,0.89,2015-12-31,2015-12-23,2015-10-30,1.02,2016-01-26,62 days,112,0.873333,2015,2015,0.13,0.146667,0.166786,-0.120628,6,36,0,1,0.167939,0.0,2015
27,0,TLMR,TALMER BANCORP,952,50789,1.14,2014-12-31,2014-11-08,2014-11-07,1.21,2015-01-30,54 days,182,0.806,2014,2014,0.07,0.404,0.3575,0.13007,5,49,0,1,0.501241,0.0,2014
25,0,TLMR,TALMER BANCORP,1267,71182,1.21,2014-12-31,2014-11-06,2014-11-05,1.21,2015-01-30,56 days,182,1.0825,2014,2014,0.0,0.1425,0.3575,-0.601399,5,112,0,1,0.117783,0.0,2014
51,0,TLMR,TALMER BANCORP,1267,71182,0.9,2015-12-31,2015-11-10,2015-10-29,1.02,2016-01-26,63 days,181,0.862,2015,2015,0.12,0.158,0.166786,-0.052677,6,124,0,1,0.183295,0.117783,2015
53,0,TLMR,TALMER BANCORP,2500,72481,0.86,2015-12-31,2015-10-30,2015-10-30,1.02,2016-01-26,62 days,173,0.822,2015,2015,0.16,0.198,0.166786,0.187152,6,7,0,1,0.240876,0.0,2015


In [36]:
df_analyst = df[['analyst', 'announce_date']]
df_analyst['year'] = df_analyst['announce_date'].dt.year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_analyst['year'] = df_analyst['announce_date'].dt.year


In [37]:
df_analyst = df_analyst.drop(columns= ['announce_date'])
df_analyst = df_analyst.drop_duplicates().reset_index(drop=True)

In [38]:
all_years_analyst = pd.concat([analysts_experience, df_analyst])
all_years_analyst.drop_duplicates().reset_index(drop=True)
all_years_analyst.head()

Unnamed: 0,analyst,year
0,119962,2014
1,80474,2014
2,50789,2014
3,79092,2014
4,71182,2014


In [39]:
all_years_analyst['experience'] = all_years_analyst.groupby('analyst')['year'].transform(lambda x: x - x.min())

In [40]:
all_years_analyst = all_years_analyst.drop_duplicates()
all_years_analyst.sort_values(by=['analyst', 'year'])

Unnamed: 0,analyst,year,experience
47833,0,1990,0
395904,0,1991,1
1885360,0,1992,2
1761523,0,1995,5
884535,0,1996,6
...,...,...,...
10726,199370,2022,0
52724,199400,2022,0
1437146,702031,1996,0
1437210,702031,1998,2


In [42]:
exp_analyst_72069 = all_years_analyst[all_years_analyst['analyst'] == 72069].sort_values(by='year')
exp_analyst_72069

Unnamed: 0,analyst,year,experience
685644,72069,2000,0
258821,72069,2001,1
54478,72069,2002,2
54491,72069,2003,3
69123,72069,2004,4
254101,72069,2005,5
275780,72069,2006,6
15682,72069,2007,7
15741,72069,2008,8
15973,72069,2009,9


In [46]:
df = pd.merge(left=df, right=all_years_analyst, how='left', left_on=['analyst','analyst_year'], right_on=['analyst', 'year'])
df.drop(columns=['analyst_year', 'year'], inplace=True)
df.head(2)

Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j,broker_size,top_10,broker_coverage,surprise,surprise_lag,experience
0,0,TLMR,TALMER BANCORP,258,8675,0.89,2015-12-31,2015-12-23,2015-10-30,1.02,2016-01-26,62 days,112,0.873333,2015,2015,0.13,0.146667,0.166786,-0.120628,6,36,0,1,0.167939,0.0,25
1,0,TLMR,TALMER BANCORP,952,50789,1.14,2014-12-31,2014-11-08,2014-11-07,1.21,2015-01-30,54 days,182,0.806,2014,2014,0.07,0.404,0.3575,0.13007,5,49,0,1,0.501241,0.0,19


In [53]:
df[(df['ibes_ticker_pk'] == "AAPL") & (df['analyst'] == analyst_id2)] #(df['fiscal_year'].between(2000, 2022))]


Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j,broker_size,top_10,broker_coverage,surprise,surprise_lag,experience,experience_log,analyst_portfolio_complexity
25470,AAPL,AAPL,APPLE INC,190,72069,0.5143,2010-09-30,2010-09-03,2010-07-21,0.5411,2010-10-18,71 days,148,0.4571,2010,2010,0.0268,0.084,0.086254,-0.026131,56,27,0,1,0.183767,0.0,10,2.302585,18
25471,AAPL,AAPL,APPLE INC,190,72069,0.9664,2011-09-30,2011-08-25,2011-07-20,0.9886,2011-10-18,72 days,198,0.83605,2011,2011,0.0222,0.15255,0.157862,-0.03365,63,29,0,1,0.182465,0.183767,11,2.397895,21
25472,AAPL,AAPL,APPLE INC,228,72069,1.5529,2012-09-30,2012-07-25,2012-07-25,1.5768,2012-10-25,67 days,201,1.451643,2012,2012,0.0239,0.149043,0.134011,0.112169,62,131,1,1,0.086218,0.182465,12,2.484907,22
25473,AAPL,AAPL,APPLE INC,228,72069,1.4129,2013-09-30,2013-10-08,2013-07-24,1.4196,2013-10-28,68 days,198,1.524086,2013,2013,0.0067,0.1064,0.147664,-0.279444,60,114,0,1,-0.068556,0.086218,13,2.564949,21
25474,AAPL,AAPL,APPLE INC,228,72069,1.57,2014-09-30,2014-10-20,2014-07-23,1.6125,2014-10-20,69 days,188,1.53975,2014,2014,0.0425,0.07275,0.056417,0.289502,58,103,0,1,0.047248,-0.068556,14,2.639057,22
25475,AAPL,AAPL,APPLE,228,72069,2.275,2015-09-30,2015-08-28,2015-07-10,2.305,2015-10-27,82 days,199,2.1065,2015,2015,0.03,0.1985,0.161162,0.231677,51,101,0,1,0.094232,0.047248,15,2.70805,20
25476,AAPL,AAPL,APPLE,228,72069,2.0675,2016-09-30,2016-10-19,2016-07-27,2.0775,2016-10-25,65 days,199,2.153437,2016,2016,0.01,0.107812,0.169175,-0.362718,54,74,0,1,-0.035263,0.094232,16,2.772589,19
25477,AAPL,AAPL,APPLE,228,72069,2.25,2017-09-30,2017-09-26,2017-08-02,2.3025,2017-11-02,59 days,227,2.22875,2017,2017,0.0525,0.08,0.079169,0.010497,44,69,0,1,0.03309,-0.035263,17,2.833213,17
25478,AAPL,AAPL,APPLE,228,72069,2.75,2018-09-30,2017-11-03,2017-11-03,2.9775,2018-11-01,331 days,331,2.75,2018,2017,0.2275,0.2275,0.130797,0.739341,45,69,0,2,0.082727,0.03309,17,2.833213,5


In [49]:
df['experience_log'] = df['experience'].apply(lambda x: np.log(x) if x != 0 else 0)


# Complexity
- Number of companies analyst j covers in fiscal year

In [52]:
# number of companies analyst i covers in year t
df['analyst_portfolio_complexity'] = df.groupby(['analyst', 'fiscal_year'])['ibes_ticker_pk'].transform('nunique')

In [54]:
# test: all unique ibes_ticker_pk for analyst 72069 followed in 2010
df[(df['analyst'] == 72069) & (df['fiscal_year'] == 2010)]['ibes_ticker_pk'].nunique()

18

In [55]:
# test: show all unique ibes_ticker_pk for analyst 72069 followed in 2010
df[(df['analyst'] == 72069) & (df['fiscal_year'] == 2010)]['ibes_ticker_pk'].unique()

array(['AAPL', 'DGII', 'GLUU', 'GRMN', 'INNN', 'JFAX', 'KNOL', 'LOGI',
       'MOT', 'NZKA', 'PLCM', 'PLT1', 'PTE1', 'QCOM', 'RNWK', 'RVSN',
       'TRMB', 'UNA'], dtype=object)

# Export for linear reg

In [56]:
# df export as parquet
df.to_parquet('../data/processed/ibes_forecasts_collapsed_features_v1.parquet')