In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)

In [3]:
df = pd.read_parquet('../data/processed/ibes_forecasts_collapsed.parquet')

In [4]:
df.head()

Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j
54,0,TLMR,TALMER BANCORP,258,8675,0.89,2015-12-31,2015-12-23,2015-10-30,1.02,2016-01-26,62 days,112,0.873333,2015,2015,0.13,0.146667,0.166786,-0.120628,6
27,0,TLMR,TALMER BANCORP,952,50789,1.14,2014-12-31,2014-11-08,2014-11-07,1.21,2015-01-30,54 days,182,0.806,2014,2014,0.07,0.404,0.3575,0.13007,5
25,0,TLMR,TALMER BANCORP,1267,71182,1.21,2014-12-31,2014-11-06,2014-11-05,1.21,2015-01-30,56 days,182,1.0825,2014,2014,0.0,0.1425,0.3575,-0.601399,5
51,0,TLMR,TALMER BANCORP,1267,71182,0.9,2015-12-31,2015-11-10,2015-10-29,1.02,2016-01-26,63 days,181,0.862,2015,2015,0.12,0.158,0.166786,-0.052677,6
53,0,TLMR,TALMER BANCORP,2500,72481,0.86,2015-12-31,2015-10-30,2015-10-30,1.02,2016-01-26,62 days,173,0.822,2015,2015,0.16,0.198,0.166786,0.187152,6


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 521289 entries, 54 to 2164707
Data columns (total 21 columns):
 #   Column                      Non-Null Count   Dtype          
---  ------                      --------------   -----          
 0   ibes_ticker_pk              521289 non-null  object         
 1   official_ticker             519516 non-null  object         
 2   company_name                519530 non-null  object         
 3   estimator                   521289 non-null  int64          
 4   analyst                     521289 non-null  int64          
 5   estimated_eps               521289 non-null  float64        
 6   fiscal_period_ending        521289 non-null  datetime64[ns] 
 7   revision_date               521289 non-null  datetime64[ns] 
 8   announce_date               521289 non-null  datetime64[ns] 
 9   actual_eps                  521289 non-null  float64        
 10  announce_date_actual        521289 non-null  datetime64[ns] 
 11  forecast_horizon            5

In [6]:
# test
analyst_id = 113333
test = df[(df['ibes_ticker_pk'] == "AAPL") & (df['fiscal_year'] == 2018) & (df['analyst'] == analyst_id)]
test

Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j
97025,AAPL,AAPL,APPLE,171,113333,2.95,2018-09-30,2018-09-04,2018-07-31,2.9775,2018-11-01,61 days,196,2.9175,2018,2018,0.0275,0.06,0.130797,-0.541273,45


# Feature List
- Broker Size
- Top 10 Broker
- Coverage
- Surprise
- Lag Suprise
- Complexity

# Broker Size
Broker size as the number of unique analysts at a broker in year t

In [7]:
# print count of unique estimators per fiscal_year
df.groupby('fiscal_year')['estimator'].nunique()

fiscal_year
2010    346
2011    347
2012    332
2013    353
2014    475
2015    501
2016    481
2017    459
2018    437
2019    416
2020    407
2021    408
2022    381
Name: estimator, dtype: int64

In [8]:
# group by estimator and fiscal_year and calculate the count of unique analysts and add as column to df
df['broker_size'] = df.groupby(['fiscal_year', 'estimator'])['analyst'].transform('nunique')

In [9]:
df.head()

Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j,broker_size
54,0,TLMR,TALMER BANCORP,258,8675,0.89,2015-12-31,2015-12-23,2015-10-30,1.02,2016-01-26,62 days,112,0.873333,2015,2015,0.13,0.146667,0.166786,-0.120628,6,36
27,0,TLMR,TALMER BANCORP,952,50789,1.14,2014-12-31,2014-11-08,2014-11-07,1.21,2015-01-30,54 days,182,0.806,2014,2014,0.07,0.404,0.3575,0.13007,5,49
25,0,TLMR,TALMER BANCORP,1267,71182,1.21,2014-12-31,2014-11-06,2014-11-05,1.21,2015-01-30,56 days,182,1.0825,2014,2014,0.0,0.1425,0.3575,-0.601399,5,112
51,0,TLMR,TALMER BANCORP,1267,71182,0.9,2015-12-31,2015-11-10,2015-10-29,1.02,2016-01-26,63 days,181,0.862,2015,2015,0.12,0.158,0.166786,-0.052677,6,124
53,0,TLMR,TALMER BANCORP,2500,72481,0.86,2015-12-31,2015-10-30,2015-10-30,1.02,2016-01-26,62 days,173,0.822,2015,2015,0.16,0.198,0.166786,0.187152,6,7


In [10]:
# test broker size: All unique analysts in 2018 at broker 171 (correct answer = 50)
estimator_id = 171
test_bs = df[(df['fiscal_year'] == 2018) & (df['estimator'] == estimator_id)]
test_bs['analyst'].nunique()

50

# Top 10 Brockerage

Do analysts of the Top 10 % largest brokers (by unqiue analyst count) have a lower pmafe on average?

Dummy variable: set to 1 if analyst i is employed by a firm in the top 10 % during year t, and set to 0 otherwise

In [11]:
df["broker_size"].describe()

count    521289.000000
mean         65.789608
std          59.017977
min           1.000000
25%          19.000000
50%          46.000000
75%         104.000000
max         248.000000
Name: broker_size, dtype: float64

In [12]:
df["broker_size"].quantile(0.90)

146.0

In [13]:
# generate list of top 10 % broker_size thresholds per fiscal year
top_10_thresholds = df.groupby('fiscal_year')['broker_size'].quantile(0.90).reset_index()
top_10_thresholds

Unnamed: 0,fiscal_year,broker_size
0,2010,120.0
1,2011,129.0
2,2012,122.0
3,2013,120.0
4,2014,173.0
5,2015,174.0
6,2016,182.0
7,2017,202.0
8,2018,188.0
9,2019,151.0


In [14]:
# if value of broker_size in df is greater than the threshold in top_10_thresholds in a fiscal_year, set to 1, else 0
#df['top_10'] = np.where(df['broker_size'] > df['fiscal_year'].map(top_10_thresholds.set_index('fiscal_year')['broker_size']), 1, 0)


In [15]:
def top_10_brokerage(df):
    """
    Function to add a dummy for each analyst that is employed by a brockerage that belongs to the 
    Top 10 % of brokerages by analyst count in year t
    Set to 1 if analyst i is employed by a firm in the top quantile during year t (fpedats_year), and set to 0 otherwise
    broker = df["estimator"]
    analyst = df["analyst"]
    broker_size = df["broker_size"]
    Output: this function adds a new column to the dataframe called "top_10_brokerage" with the dummy variable
    """
    top_10_thresholds = df.groupby('fiscal_year')['broker_size'].quantile(0.90).reset_index()
    df['top_10'] = np.where(df['broker_size'] > df['fiscal_year'].map(top_10_thresholds.set_index('fiscal_year')['broker_size']), 1, 0)
    return df

In [16]:
top_10_brokerage(df)

Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j,broker_size,top_10
54,0000,TLMR,TALMER BANCORP,258,8675,0.8900,2015-12-31,2015-12-23,2015-10-30,1.0200,2016-01-26,62 days,112,0.873333,2015,2015,0.1300,0.146667,0.166786,-0.120628,6,36,0
27,0000,TLMR,TALMER BANCORP,952,50789,1.1400,2014-12-31,2014-11-08,2014-11-07,1.2100,2015-01-30,54 days,182,0.806000,2014,2014,0.0700,0.404000,0.357500,0.130070,5,49,0
25,0000,TLMR,TALMER BANCORP,1267,71182,1.2100,2014-12-31,2014-11-06,2014-11-05,1.2100,2015-01-30,56 days,182,1.082500,2014,2014,0.0000,0.142500,0.357500,-0.601399,5,112,0
51,0000,TLMR,TALMER BANCORP,1267,71182,0.9000,2015-12-31,2015-11-10,2015-10-29,1.0200,2016-01-26,63 days,181,0.862000,2015,2015,0.1200,0.158000,0.166786,-0.052677,6,124,0
53,0000,TLMR,TALMER BANCORP,2500,72481,0.8600,2015-12-31,2015-10-30,2015-10-30,1.0200,2016-01-26,62 days,173,0.822000,2015,2015,0.1600,0.198000,0.166786,0.187152,6,7,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2164644,ZYNX,ZYXI,ZYNEX INC,1549,191964,0.2182,2019-12-31,2019-10-22,2019-10-22,0.2545,2020-02-27,70 days,111,0.222750,2019,2019,0.0363,0.031750,0.027214,0.166667,4,41,0
2164671,ZYNX,ZYXI,ZYNEX INC,1549,191964,0.2364,2020-12-31,2021-02-22,2020-10-28,0.2364,2021-02-25,64 days,176,0.287029,2020,2020,0.0000,0.053229,0.067215,-0.208089,5,36,0
2164693,ZYNX,ZYXI,ZYNEX INC,1549,191964,0.3818,2021-12-31,2021-11-03,2021-11-03,0.4400,2022-02-24,58 days,169,0.332720,2021,2021,0.0582,0.107280,0.101700,0.054867,5,37,0
2164709,ZYNX,ZYXI,ZYNEX,1549,191964,0.4400,2022-12-31,2022-10-28,2022-10-28,0.4400,2023-03-13,64 days,197,0.432500,2022,2022,0.0000,0.007500,0.035867,-0.790892,5,36,0


In [17]:
# test top_10_brokerage: list of largest brokerages in 2018 by broker_size (treshold in 2018 is 188.0)
test_top_10 = df[df['fiscal_year'] == 2018]
test_top_10

Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j,broker_size,top_10
726,0001,EPE,EP ENGR CORP,3477,18677,-0.1300,2018-12-31,2018-11-23,2018-11-16,-0.2500,2019-03-14,45 days,156,-0.063333,2018,2018,0.1200,0.193333,0.160935,0.201315,16,30,0
727,0001,EPE,EP ENGR CORP,183,48368,-0.1600,2018-12-31,2018-11-21,2018-11-19,-0.2500,2019-03-14,42 days,144,-0.154286,2018,2018,0.0900,0.107143,0.160935,-0.334247,16,218,1
728,0001,EPE,EP ENGR CORP,118,78506,-0.2100,2018-12-31,2018-12-10,2018-11-20,-0.2500,2019-03-14,41 days,173,-0.106000,2018,2018,0.0400,0.144000,0.160935,-0.105228,16,111,0
724,0001,EPE,EP ENGR CORP,863,83355,-0.1800,2018-12-31,2018-11-14,2018-11-14,-0.2500,2019-03-14,47 days,164,-0.112727,2018,2018,0.0700,0.137273,0.160935,-0.147029,16,6,0
725,0001,EPE,EP ENGR CORP,873,104605,-0.1700,2018-12-31,2018-11-19,2018-11-19,-0.2500,2019-03-14,42 days,130,-0.010000,2018,2018,0.0800,0.240000,0.160935,0.491287,16,194,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2164505,ZYNE,ZYNE,ZYNERBA PHARMS,98,190428,-3.1600,2018-12-31,2018-05-08,2018-05-08,-2.6100,2019-03-11,237 days,237,-3.160000,2018,2018,0.5500,0.550000,0.387308,0.420060,9,40,0
2164635,ZYNX,ZYXI,ZYNEX INC,1549,51165,0.2364,2018-12-31,2018-11-05,2018-10-10,0.2545,2019-02-26,82 days,82,0.236400,2018,2018,0.0181,0.018100,0.015111,0.197794,4,42,0
2164638,ZYNX,ZYXI,ZYNEX INC,1593,72907,0.2545,2018-12-31,2018-11-13,2018-11-09,0.2545,2019-02-26,52 days,181,0.254525,2018,2018,0.0000,0.004525,0.015111,-0.700551,4,2,0
2164636,ZYNX,ZYXI,ZYNEX INC,2709,122730,0.2455,2018-12-31,2019-02-11,2018-11-06,0.2545,2019-02-26,55 days,130,0.221233,2018,2018,0.0090,0.033267,0.015111,1.201471,4,12,0


## Coverage

How many analysts within analyst i's broker cover company j in year t

In [18]:
# test
estimator_id = 228
coverage = df[(df['ibes_ticker_pk'] == "AAPL") & (df['fiscal_year'] == 2018) & (df['estimator'] == estimator_id)]
coverage

Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j,broker_size,top_10
96854,AAPL,AAPL,APPLE,228,72069,2.75,2018-09-30,2017-11-03,2017-11-03,2.9775,2018-11-01,331 days,331,2.75,2018,2017,0.2275,0.2275,0.130797,0.739341,45,69,0
97027,AAPL,AAPL,APPLE,228,81187,2.935,2018-09-30,2018-09-12,2018-07-31,2.9775,2018-11-01,61 days,156,2.83375,2018,2018,0.0425,0.14375,0.130797,0.099034,45,69,0


In [19]:
df['broker_coverage'] = df.groupby(['ibes_ticker_pk','fiscal_period_ending', 'estimator'])['analyst'].transform('count')

In [20]:
# which broker has the most analysts following a stock in a given year
print(df[df['fiscal_year'] == 2015].groupby('estimator')['broker_coverage'].max().sort_values(ascending=False).head(20))

estimator
11      4
228     4
192     4
3037    3
952     3
3039    3
249     3
258     3
3647    3
714     3
3602    3
930     3
1273    3
2980    3
183     3
3808    3
2129    3
2283    3
3322    3
2573    3
Name: broker_coverage, dtype: int64


In [21]:
def coverage(df):
    """_summary_
    Function to calculate the coverage of company j at the broker of analyst i. 
    As the count of the number of analysts following company j in year t at the same brokerage as analyst i
    """
    df['broker_coverage'] = df.groupby(['ibes_ticker_pk','fiscal_period_ending', 'estimator'])['analyst'].transform('count')
    return df

# Lagged Accuracy / "Surprise" Feature

By how much was analyst i surprised by the actual EPS of company j (as percentage)

mean_estimate_ijt = mean forecast value of EPS of analyst i for company j in year t
afe_analyst_ijt_mean = mean absolute forecast error of analyst i's forecast for company j in year t

In [22]:
# print all rows with mean_estimate_ijt = 0
df[df['mean_estimate_ijt'] == 0]

Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j,broker_size,top_10,broker_coverage
11814,0055,TRUE,TRUECAR,3364,191423,0.16,2020-12-31,2020-11-11,2020-11-05,0.19,2021-02-24,56 days,181,0.0,2020,2020,0.03,0.190,0.202553,-0.061975,13,28,0,1
14143,0076,MTLS,MATERIALISE,154,138388,0.00,2018-12-31,2018-08-12,2018-05-07,0.06,2019-03-06,238 days,238,0.0,2018,2018,0.06,0.060,0.036667,0.636364,4,7,0,1
22550,00EG,LC,LENDINGCLUB,149,123387,-0.30,2016-12-31,2016-06-02,2016-05-10,-0.50,2017-02-14,235 days,276,0.0,2016,2016,0.20,0.500,0.847172,-0.409801,23,32,0,2
22908,00EI,CNV,CNOVA,873,92346,0.00,2015-12-31,2015-03-14,2015-01-29,-0.50,2016-02-24,336 days,336,0.0,2015,2015,0.50,0.500,0.400261,0.249184,12,196,1,2
23834,00I0,BOX,BOX,1951,18603,0.00,2020-01-31,2019-11-28,2019-11-26,0.03,2020-02-26,66 days,200,0.0,2020,2019,0.03,0.030,0.033556,-0.105960,20,45,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2158831,ZNGA,ZNGA,ZYNGA INC,118,72088,0.01,2014-12-31,2014-12-08,2014-11-07,-0.01,2015-02-12,54 days,196,0.0,2014,2014,0.02,0.015,0.019075,-0.213615,24,118,0,1
2158891,ZNGA,ZNGA,ZYNGA INC,118,72088,-0.03,2015-12-31,2015-12-14,2015-11-04,-0.01,2016-02-10,57 days,190,0.0,2015,2015,0.02,0.040,0.022953,0.742682,19,112,0,1
2158821,ZNGA,ZNGA,ZYNGA INC,3742,108299,-0.01,2014-12-31,2014-08-10,2014-08-08,-0.01,2015-02-12,145 days,239,0.0,2014,2014,0.00,0.010,0.019075,-0.475743,24,14,0,1
2158995,ZNGA,ZNGA,ZYNGA INC,3736,121040,0.03,2017-12-31,2017-11-08,2017-11-08,0.09,2018-02-07,53 days,144,0.0,2017,2017,0.06,0.090,0.030529,1.947977,15,2,0,1


In [23]:
# calculate surprise percentage for each forecast
df['surprise'] = (df['actual_eps'] - df['mean_estimate_ijt']) / df['mean_estimate_ijt']

In [24]:
# surprise nan
df[df['surprise'].isna()]

Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j,broker_size,top_10,broker_coverage,surprise
114235,ACEN,NGNM,NEOGENOMICS INC,1850,137410,0.0,2012-12-31,2012-12-14,2012-11-06,0.0,2013-02-14,55 days,55,0.0,2012,2012,0.0,0.0,0.014615,-1.0,6,87,0,1,
114232,ACEN,NGNM,NEOGENOMICS INC,2223,143682,0.0,2012-12-31,2012-11-02,2012-08-22,0.0,2013-02-14,131 days,131,0.0,2012,2012,0.0,0.0,0.014615,-1.0,6,24,0,1,
657639,DMAN,DMAN,DEMANDTEC INC,157,75922,-0.01,2011-02-28,2011-03-31,2011-01-07,0.0,2011-03-31,52 days,208,0.0,2011,2011,0.01,0.006667,0.013667,-0.512195,9,120,0,1,
657632,DMAN,DMAN,DEMANDTEC INC,192,85455,0.0,2011-02-28,2011-03-30,2010-10-14,0.0,2011-03-31,137 days,137,0.0,2011,2010,0.0,0.0,0.013667,-1.0,9,113,0,2,
1621935,RAD,RAD,RITE AID CP,873,71787,-0.4,2019-02-28,2019-01-21,2018-12-21,0.0,2019-04-11,69 days,182,0.0,2019,2018,0.4,0.4,0.57037,-0.298701,8,191,1,1,
1829166,STAA,STAA,STAAR SURGICAL C,559,76732,0.03,2010-12-31,2011-01-25,2010-11-03,0.0,2011-03-01,58 days,103,0.0,2010,2010,0.03,0.03,0.054643,-0.45098,6,37,0,1,
2076339,WDPT,WYY,WIDEPOINT CORP,4397,40458,0.0,2019-12-31,2020-03-11,2019-10-08,0.0,2020-03-24,84 days,84,0.0,2019,2019,0.0,0.0,0.04,-1.0,3,1,0,1,
2094464,WMGL,AWC,ALUMINA LIMITED,42,122030,0.01,2013-12-31,2013-10-04,2013-09-23,0.0,2014-02-19,99 days,106,0.0,2013,2013,0.01,0.01,0.051429,-0.805556,3,27,0,1,
2125857,XELB,XELB,XCEL BRANDS INC,35,9651,0.0,2014-12-31,2015-02-05,2014-09-18,0.0,2015-04-01,104 days,104,0.0,2014,2014,0.0,0.0,0.038,-1.0,3,20,0,1,
2125858,XELB,XELB,XCEL BRANDS INC,481,31033,0.0,2014-12-31,2014-11-17,2014-11-11,0.0,2015-04-01,50 days,50,0.0,2014,2014,0.0,0.0,0.038,-1.0,3,22,0,1,


In [25]:
# previous years surprise for company j for analyst i at time t (shifted by 1)
# first forecast year is always 0 (no surprise)
df['surprise_lag'] = df.sort_values(by=['analyst', 'ibes_ticker_pk', 'fiscal_year']).groupby(['analyst', 'ibes_ticker_pk'])['surprise'].shift(1)


In [26]:
# Identify the first forecast for each analyst-company pair and set surprise_lag to 0
df['rank'] = df.groupby(['analyst', 'ibes_ticker_pk'])['fiscal_year'].rank(method="min")
df.loc[df['rank'] == 1, 'surprise_lag'] = 0

In [27]:
# fiscal_year 2015 - 2020, 72069
analyst_id2 = 72069

df[(df['ibes_ticker_pk'] == "AAPL") & (df['analyst'] == analyst_id2)] #(df['fiscal_year'].between(2000, 2022))]


Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j,broker_size,top_10,broker_coverage,surprise,surprise_lag,rank
94748,AAPL,AAPL,APPLE INC,190,72069,0.5143,2010-09-30,2010-09-03,2010-07-21,0.5411,2010-10-18,71 days,148,0.4571,2010,2010,0.0268,0.084,0.086254,-0.026131,56,27,0,1,0.183767,0.0,1.0
95075,AAPL,AAPL,APPLE INC,190,72069,0.9664,2011-09-30,2011-08-25,2011-07-20,0.9886,2011-10-18,72 days,198,0.83605,2011,2011,0.0222,0.15255,0.157862,-0.03365,63,29,0,1,0.182465,0.183767,2.0
95418,AAPL,AAPL,APPLE INC,228,72069,1.5529,2012-09-30,2012-07-25,2012-07-25,1.5768,2012-10-25,67 days,201,1.451643,2012,2012,0.0239,0.149043,0.134011,0.112169,62,131,1,1,0.086218,0.182465,3.0
95785,AAPL,AAPL,APPLE INC,228,72069,1.4129,2013-09-30,2013-10-08,2013-07-24,1.4196,2013-10-28,68 days,198,1.524086,2013,2013,0.0067,0.1064,0.147664,-0.279444,60,114,0,1,-0.068556,0.086218,4.0
96094,AAPL,AAPL,APPLE INC,228,72069,1.57,2014-09-30,2014-10-20,2014-07-23,1.6125,2014-10-20,69 days,188,1.53975,2014,2014,0.0425,0.07275,0.056417,0.289502,58,103,0,1,0.047248,-0.068556,5.0
96330,AAPL,AAPL,APPLE,228,72069,2.275,2015-09-30,2015-08-28,2015-07-10,2.305,2015-10-27,82 days,199,2.1065,2015,2015,0.03,0.1985,0.161162,0.231677,51,101,0,1,0.094232,0.047248,6.0
96628,AAPL,AAPL,APPLE,228,72069,2.0675,2016-09-30,2016-10-19,2016-07-27,2.0775,2016-10-25,65 days,199,2.153437,2016,2016,0.01,0.107812,0.169175,-0.362718,54,74,0,1,-0.035263,0.094232,7.0
96835,AAPL,AAPL,APPLE,228,72069,2.25,2017-09-30,2017-09-26,2017-08-02,2.3025,2017-11-02,59 days,227,2.22875,2017,2017,0.0525,0.08,0.079169,0.010497,44,69,0,1,0.03309,-0.035263,8.0
96854,AAPL,AAPL,APPLE,228,72069,2.75,2018-09-30,2017-11-03,2017-11-03,2.9775,2018-11-01,331 days,331,2.75,2018,2017,0.2275,0.2275,0.130797,0.739341,45,69,0,2,0.082727,0.03309,9.0


In [28]:
df.drop(columns=['rank'], inplace=True)

In [29]:
# nan surprise_lag
df[df['surprise_lag'].isna()]

Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j,broker_size,top_10,broker_coverage,surprise,surprise_lag
114250,ACEN,NEO,NEOGENOMICS INC,2223,143682,0.04,2013-12-31,2013-11-18,2013-09-30,0.04,2014-02-19,92 days,122,0.03,2013,2013,0.0,0.01,0.010667,-0.0625,4,22,0,1,0.333333,
1621957,RAD,RAD,RITE AID CP,873,71787,0.34,2020-02-29,2020-01-29,2020-01-09,0.15,2020-04-16,51 days,193,0.205,2020,2020,0.19,0.07,0.194286,-0.639706,7,192,1,1,-0.268293,
1829182,STAA,STAA,STAAR SURGICAL C,559,76732,0.06,2011-12-31,2012-02-29,2011-10-24,0.04,2012-03-06,68 days,205,0.076667,2011,2011,0.02,0.036667,0.024375,0.504274,5,43,0,1,-0.478261,
2076345,WDPT,WYY,WIDEPOINT CORP,4397,40458,0.28,2020-12-31,2020-11-24,2020-11-17,1.2,2021-03-23,44 days,185,0.126667,2020,2020,0.92,1.073333,1.17,-0.082621,2,1,0,1,8.473684,
2125865,XELB,XELB,XCEL BRANDS INC,35,9651,0.35,2015-12-31,2015-11-27,2015-11-23,0.36,2016-03-15,38 days,125,0.306667,2015,2015,0.01,0.053333,0.112857,-0.527426,2,19,0,1,0.173913,


# Forecast experience

In [30]:
old_df = pd.read_parquet('../data/raw/1990-2014.parquet')

In [31]:
old_df.head()

Unnamed: 0,TICKER,OFTIC,CNAME,ACTDATS,ANALYS,FPI,MEASURE,FPEDATS,ANNDATS
0,0,TLMR,TALMER BANCORP,2014-03-11,119962,1,EPS,2014-12-31,2014-03-09
1,0,TLMR,TALMER BANCORP,2014-03-11,80474,1,EPS,2014-12-31,2014-03-10
2,0,TLMR,TALMER BANCORP,2014-03-11,50789,1,EPS,2014-12-31,2014-03-10
3,0,TLMR,TALMER BANCORP,2014-03-11,79092,1,EPS,2014-12-31,2014-03-10
4,0,TLMR,TALMER BANCORP,2014-03-17,71182,1,EPS,2014-12-31,2014-03-17


In [32]:
analysts_experience = old_df[['ANALYS', 'ANNDATS']].drop_duplicates().reset_index(drop=True)
analysts_experience

Unnamed: 0,ANALYS,ANNDATS
0,119962,2014-03-09
1,80474,2014-03-10
2,50789,2014-03-10
3,79092,2014-03-10
4,71182,2014-03-17
...,...,...
2138012,136140,2013-11-10
2138013,41097,2013-11-14
2138014,136140,2014-09-28
2138015,41097,2014-10-13


In [33]:
analysts_experience['ANNDATS'] = pd.to_datetime(analysts_experience['ANNDATS'])
analysts_experience['year'] = analysts_experience['ANNDATS'].dt.year
analysts_experience.head()

Unnamed: 0,ANALYS,ANNDATS,year
0,119962,2014-03-09,2014
1,80474,2014-03-10,2014
2,50789,2014-03-10,2014
3,79092,2014-03-10,2014
4,71182,2014-03-17,2014


In [34]:
analysts_experience.rename(columns={'ANALYS':'analyst'}, inplace=True)
analysts_experience.drop(columns='ANNDATS', inplace=True)
analysts_experience.head()

Unnamed: 0,analyst,year
0,119962,2014
1,80474,2014
2,50789,2014
3,79092,2014
4,71182,2014


In [35]:
df['analyst_year'] = df.announce_date.dt.year
df.head()

Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j,broker_size,top_10,broker_coverage,surprise,surprise_lag,analyst_year
54,0,TLMR,TALMER BANCORP,258,8675,0.89,2015-12-31,2015-12-23,2015-10-30,1.02,2016-01-26,62 days,112,0.873333,2015,2015,0.13,0.146667,0.166786,-0.120628,6,36,0,1,0.167939,0.0,2015
27,0,TLMR,TALMER BANCORP,952,50789,1.14,2014-12-31,2014-11-08,2014-11-07,1.21,2015-01-30,54 days,182,0.806,2014,2014,0.07,0.404,0.3575,0.13007,5,49,0,1,0.501241,0.0,2014
25,0,TLMR,TALMER BANCORP,1267,71182,1.21,2014-12-31,2014-11-06,2014-11-05,1.21,2015-01-30,56 days,182,1.0825,2014,2014,0.0,0.1425,0.3575,-0.601399,5,112,0,1,0.117783,0.0,2014
51,0,TLMR,TALMER BANCORP,1267,71182,0.9,2015-12-31,2015-11-10,2015-10-29,1.02,2016-01-26,63 days,181,0.862,2015,2015,0.12,0.158,0.166786,-0.052677,6,124,0,1,0.183295,0.117783,2015
53,0,TLMR,TALMER BANCORP,2500,72481,0.86,2015-12-31,2015-10-30,2015-10-30,1.02,2016-01-26,62 days,173,0.822,2015,2015,0.16,0.198,0.166786,0.187152,6,7,0,1,0.240876,0.0,2015


In [36]:
df_analyst = df[['analyst', 'announce_date']]
df_analyst['year'] = df_analyst['announce_date'].dt.year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_analyst['year'] = df_analyst['announce_date'].dt.year


In [37]:
df_analyst.head()

Unnamed: 0,analyst,announce_date,year
54,8675,2015-10-30,2015
27,50789,2014-11-07,2014
25,71182,2014-11-05,2014
51,71182,2015-10-29,2015
53,72481,2015-10-30,2015


In [38]:
df_analyst = df_analyst.drop(columns= ['announce_date'])
df_analyst = df_analyst.drop_duplicates().reset_index(drop=True)

In [39]:
all_years_analyst = pd.concat([analysts_experience, df_analyst])
all_years_analyst.drop_duplicates().reset_index(drop=True)
all_years_analyst.head()

Unnamed: 0,analyst,year
0,119962,2014
1,80474,2014
2,50789,2014
3,79092,2014
4,71182,2014


In [40]:
all_years_analyst['experience'] = all_years_analyst.groupby('analyst')['year'].transform(lambda x: x - x.min())

In [41]:
all_years_analyst = all_years_analyst.drop_duplicates()
all_years_analyst.sort_values(by=['analyst', 'year'])

Unnamed: 0,analyst,year,experience
47833,0,1990,0
395904,0,1991,1
1885360,0,1992,2
1761523,0,1995,5
884535,0,1996,6
...,...,...,...
10726,199370,2022,0
52724,199400,2022,0
1437146,702031,1996,0
1437210,702031,1998,2


In [42]:
exp_analyst_72069 = all_years_analyst[all_years_analyst['analyst'] == 72069].sort_values(by='year')
exp_analyst_72069

Unnamed: 0,analyst,year,experience
685644,72069,2000,0
258821,72069,2001,1
54478,72069,2002,2
54491,72069,2003,3
69123,72069,2004,4
254101,72069,2005,5
275780,72069,2006,6
15682,72069,2007,7
15741,72069,2008,8
15973,72069,2009,9


In [43]:
df = pd.merge(left=df, right=all_years_analyst, how='left', left_on=['analyst','analyst_year'], right_on=['analyst', 'year'])
df.drop(columns=['analyst_year', 'year'], inplace=True)
df.head(2)

Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j,broker_size,top_10,broker_coverage,surprise,surprise_lag,experience
0,0,TLMR,TALMER BANCORP,258,8675,0.89,2015-12-31,2015-12-23,2015-10-30,1.02,2016-01-26,62 days,112,0.873333,2015,2015,0.13,0.146667,0.166786,-0.120628,6,36,0,1,0.167939,0.0,25
1,0,TLMR,TALMER BANCORP,952,50789,1.14,2014-12-31,2014-11-08,2014-11-07,1.21,2015-01-30,54 days,182,0.806,2014,2014,0.07,0.404,0.3575,0.13007,5,49,0,1,0.501241,0.0,19


In [44]:
df[(df['ibes_ticker_pk'] == "AAPL") & (df['analyst'] == analyst_id2)] #(df['fiscal_year'].between(2000, 2022))]


Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j,broker_size,top_10,broker_coverage,surprise,surprise_lag,experience
25470,AAPL,AAPL,APPLE INC,190,72069,0.5143,2010-09-30,2010-09-03,2010-07-21,0.5411,2010-10-18,71 days,148,0.4571,2010,2010,0.0268,0.084,0.086254,-0.026131,56,27,0,1,0.183767,0.0,10
25471,AAPL,AAPL,APPLE INC,190,72069,0.9664,2011-09-30,2011-08-25,2011-07-20,0.9886,2011-10-18,72 days,198,0.83605,2011,2011,0.0222,0.15255,0.157862,-0.03365,63,29,0,1,0.182465,0.183767,11
25472,AAPL,AAPL,APPLE INC,228,72069,1.5529,2012-09-30,2012-07-25,2012-07-25,1.5768,2012-10-25,67 days,201,1.451643,2012,2012,0.0239,0.149043,0.134011,0.112169,62,131,1,1,0.086218,0.182465,12
25473,AAPL,AAPL,APPLE INC,228,72069,1.4129,2013-09-30,2013-10-08,2013-07-24,1.4196,2013-10-28,68 days,198,1.524086,2013,2013,0.0067,0.1064,0.147664,-0.279444,60,114,0,1,-0.068556,0.086218,13
25474,AAPL,AAPL,APPLE INC,228,72069,1.57,2014-09-30,2014-10-20,2014-07-23,1.6125,2014-10-20,69 days,188,1.53975,2014,2014,0.0425,0.07275,0.056417,0.289502,58,103,0,1,0.047248,-0.068556,14
25475,AAPL,AAPL,APPLE,228,72069,2.275,2015-09-30,2015-08-28,2015-07-10,2.305,2015-10-27,82 days,199,2.1065,2015,2015,0.03,0.1985,0.161162,0.231677,51,101,0,1,0.094232,0.047248,15
25476,AAPL,AAPL,APPLE,228,72069,2.0675,2016-09-30,2016-10-19,2016-07-27,2.0775,2016-10-25,65 days,199,2.153437,2016,2016,0.01,0.107812,0.169175,-0.362718,54,74,0,1,-0.035263,0.094232,16
25477,AAPL,AAPL,APPLE,228,72069,2.25,2017-09-30,2017-09-26,2017-08-02,2.3025,2017-11-02,59 days,227,2.22875,2017,2017,0.0525,0.08,0.079169,0.010497,44,69,0,1,0.03309,-0.035263,17
25478,AAPL,AAPL,APPLE,228,72069,2.75,2018-09-30,2017-11-03,2017-11-03,2.9775,2018-11-01,331 days,331,2.75,2018,2017,0.2275,0.2275,0.130797,0.739341,45,69,0,2,0.082727,0.03309,17


In [45]:
df['experience_log'] = df['experience'].apply(lambda x: np.log(x) if x != 0 else 0)


# Complexity
- Number of companies analyst j covers in fiscal year

In [46]:
# number of companies analyst i covers in year t
df['analyst_portfolio_complexity'] = df.groupby(['analyst', 'fiscal_year'])['ibes_ticker_pk'].transform('nunique')

In [47]:
# test: all unique ibes_ticker_pk for analyst 72069 followed in 2010
df[(df['analyst'] == 72069) & (df['fiscal_year'] == 2010)]['ibes_ticker_pk'].nunique()

18

In [48]:
# test: show all unique ibes_ticker_pk for analyst 72069 followed in 2010
df[(df['analyst'] == 72069) & (df['fiscal_year'] == 2010)]['ibes_ticker_pk'].unique()

array(['AAPL', 'DGII', 'GLUU', 'GRMN', 'INNN', 'JFAX', 'KNOL', 'LOGI',
       'MOT', 'NZKA', 'PLCM', 'PLT1', 'PTE1', 'QCOM', 'RNWK', 'RVSN',
       'TRMB', 'UNA'], dtype=object)

# SIC Codes

In [None]:
# get list of IBES tickers from df
ibes_tickers = df['ibes_ticker_pk'].unique()

In [79]:
import wrds
from fuzzywuzzy import fuzz



In [74]:
db = wrds.Connection()

WRDS recommends setting up a .pgpass file.
Created .pgpass file successfully.
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done


In [53]:
ibes1 = db.raw_sql("""
                      select ticker, cusip, cname, sdates from ibes.id
                      where usfirm=1 and cusip != ''
                      """, date_cols=['sdates'])

In [55]:
ibes1.shape

(93507, 4)

In [56]:
ibes1_date = ibes1.groupby(["ticker", "cusip"]).sdates.agg(["min", "max"]).reset_index().rename(columns={"min": "fdate", "max": "ldate"})

ibes2 = pd.merge(ibes1,ibes1_date, how = "left", on = ["ticker", "cusip"])
ibes2 = ibes2.sort_values(by = ["ticker", "cusip", "sdates"])

ibes2 = ibes2.loc[ibes2.sdates == ibes2.ldate].drop(["sdates"], axis=1)
ibes2.head()


Unnamed: 0,ticker,cusip,cname,fdate,ldate
1,0000,87482X10,TALMER BANCORP,2014-02-20,2014-03-20
6,0001,26878510,EP ENGR CORP,2014-02-20,2019-10-17
7,0004,02504D10,AMERICAN CAPITAL,2014-02-20,2014-02-20
8,000R,14163310,CARECOM,2014-02-20,2014-02-20
9,000V,15117E10,CELLADON,2014-03-20,2014-03-20


In [57]:
ibes2.shape

(39236, 5)

In [64]:
crsp1 = db.raw_sql("""
                      select permno, ncusip, comnam, namedt, nameenddt
                      from crsp.stocknames
                      where ncusip != ''
                      """, date_cols=['namedt', 'nameenddt'])

In [65]:
# first namedt
crsp1_fnamedt = crsp1.groupby(['permno','ncusip']).namedt.min().reset_index()

# last nameenddt
crsp1_lnameenddt = crsp1.groupby(['permno','ncusip']).nameenddt.max().reset_index()

# merge both 
crsp1_dtrange = pd.merge(crsp1_fnamedt, crsp1_lnameenddt, \
                          on = ['permno','ncusip'], how='inner')

# replace namedt and nameenddt with the version from the dtrange
crsp1 = crsp1.drop(['namedt'],axis=1).rename(columns={'nameenddt':'enddt'})
crsp2 = pd.merge(crsp1, crsp1_dtrange, on =['permno','ncusip'], how='inner')

# keep only most recent company name
crsp2 = crsp2.loc[crsp2.enddt == crsp2.nameenddt].drop(['enddt'], axis=1)

In [66]:
# 1.3 Create CUSIP Link Table

# Link by full cusip, company names and dates
link1_1 = pd.merge(ibes2, crsp2, how='inner', left_on='cusip', right_on='ncusip').sort_values(['ticker','permno','ldate'])


In [67]:
# Keep link with most recent company name
link1_1_tmp = link1_1.groupby(['ticker','permno']).ldate.max().reset_index()
link1_2 = pd.merge(link1_1, link1_1_tmp, how='inner', on =['ticker', 'permno', 'ldate'])

In [80]:
link1_2['name_ratio'] = link1_2.apply(lambda x: fuzz.token_set_ratio(x.comnam, x.cname), axis=1)

# 10% percentile of the company name distance
name_ratio_p10 = link1_2.name_ratio.quantile(0.10)
name_ratio_p10

67.0

In [81]:
# Function to assign score for companies matched by:
# full cusip and passing name_ratio
# or meeting date range requirement

def score1(row):
    if (row['fdate']<=row['nameenddt']) & (row['ldate']>=row['namedt']) & (row['name_ratio'] >= name_ratio_p10):
        score = 0
    elif (row['fdate']<=row['nameenddt']) & (row['ldate']>=row['namedt']):
        score = 1
    elif row['name_ratio'] >= name_ratio_p10:
        score = 2
    else:
        score = 3
    return score

# assign size portfolio
link1_2['score']=link1_2.apply(score1, axis=1)
link1_2 = link1_2[['ticker','permno','cname','comnam','name_ratio','score']]
link1_2 = link1_2.drop_duplicates()

In [82]:
link1_2.head()

Unnamed: 0,ticker,permno,cname,comnam,name_ratio,score
0,0000,14471,TALMER BANCORP,TALMER BANCORP INC,100,0
1,0001,14392,EP ENGR CORP,E P ENERGY CORP,67,0
2,0004,14418,AMERICAN CAPITAL,AMERICAN CAPITAL SR FLOATING LTD,100,0
3,000R,14378,CARECOM,CARE COM INC,74,0
4,000V,14423,EIGER,EIGER BIOPHARMACEUTICALS INC,100,0


In [83]:
link1_2.groupby(['score']).score.count()

score
0    18880
1     1800
2      679
3       81
Name: score, dtype: int64

In [72]:
##########################
# Step 2: Link by TICKER #
##########################

# Find links for the remaining unmatched cases using Exchange Ticker 

# Identify remaining unmatched cases 
nomatch1 = pd.merge(ibes2[['ticker']], link1_2[['permno','ticker']], on='ticker', how='left')
nomatch1 = nomatch1.loc[nomatch1.permno.isnull()].drop(['permno'], axis=1).drop_duplicates()

# Add IBES identifying information

ibesid = db.raw_sql(""" select ticker, cname, oftic, sdates, cusip from ibes.id """, date_cols=['sdates'])
ibesid = ibesid.loc[ibesid.oftic.notna()]

nomatch2 = pd.merge(nomatch1, ibesid, how='inner', on=['ticker'])
nomatch2.shape

(16919, 5)

In [73]:
# Create first and last 'start dates' for Exchange Tickers
# Label date range variables and keep only most recent company name

nomatch3 = nomatch2.groupby(['ticker', 'oftic']).sdates.agg(['min', 'max']).reset_index().rename(columns={'min':'fdate', 'max':'ldate'})

nomatch3 = pd.merge(nomatch2, nomatch3, how='left', on=['ticker','oftic'])

nomatch3 = nomatch3.loc[nomatch3.sdates == nomatch3.ldate]
nomatch3.shape

(8542, 7)

In [76]:
# Get entire list of CRSP stocks with Exchange Ticker information

crsp_n1 = db.raw_sql(""" select ticker, comnam, permno, ncusip, namedt, nameenddt
                            from crsp.stocknames """, date_cols=['namedt', 'nameenddt'])

crsp_n1 = crsp_n1.loc[crsp_n1.ticker.notna()].sort_values(by=['permno','ticker','namedt'])

In [78]:
# Arrange effective dates for link by Exchange Ticker

crsp_n1_namedt = crsp_n1.groupby(['permno','ticker']).namedt.min().reset_index().rename(columns={'min':'namedt'})
crsp_n1_nameenddt = crsp_n1.groupby(['permno','ticker']).nameenddt.max().reset_index().rename(columns={'max':'nameenddt'})

crsp_n1_dt = pd.merge(crsp_n1_namedt, crsp_n1_nameenddt, how = 'inner', on=['permno','ticker'])

crsp_n1 = crsp_n1.rename(columns={'namedt': 'namedt_ind', 'nameenddt':'nameenddt_ind'})

crsp_n2 = pd.merge(crsp_n1, crsp_n1_dt, how ='left', on = ['permno','ticker'])

crsp_n2 = crsp_n2.rename(columns={'ticker':'crsp_ticker'})
crsp_n2 = crsp_n2.loc[crsp_n2.nameenddt_ind == crsp_n2.nameenddt].drop(['namedt_ind', 'nameenddt_ind'], axis=1)

# Merge remaining unmatched cases using Exchange Ticker 
# Note: Use ticker date ranges as exchange tickers are reused overtime

link2_1 = pd.merge(nomatch3, crsp_n2, how='inner', left_on=['oftic'], right_on=['crsp_ticker'])
link2_1 = link2_1.loc[(link2_1.ldate>=link2_1.namedt) & (link2_1.fdate<=link2_1.nameenddt)]


In [84]:
# Score using company name using 6-digit CUSIP and company name spelling distance
link2_1['name_ratio'] = link2_1.apply(lambda x: fuzz.token_set_ratio(x.comnam, x.cname), axis=1)

link2_2 = link2_1
link2_2['cusip6'] = link2_2.apply(lambda x: x.cusip[:6], axis=1)
link2_2['ncusip6'] = link2_2.apply(lambda x: x.ncusip[:6], axis=1)


In [85]:
# Score using company name using 6-digit CUSIP and company name spelling distance

def score2(row):
    if (row['cusip6']==row['ncusip6']) & (row['name_ratio'] >= name_ratio_p10):
        score = 0
    elif (row['cusip6']==row['ncusip6']):
        score = 4
    elif row['name_ratio'] >= name_ratio_p10:
        score = 5
    else:
        score = 6
    return score

# assign size portfolio
link2_2['score']=link2_2.apply(score2, axis=1)

In [86]:
# Some companies may have more than one TICKER-PERMNO link
# so re-sort and keep the case (PERMNO & Company name from CRSP)
# that gives the lowest score for each IBES TICKER 

link2_2 = link2_2[['ticker','permno','cname','comnam', 'name_ratio', 'score']].sort_values(by=['ticker','score'])
link2_2_score = link2_2.groupby(['ticker']).score.min().reset_index()

link2_3 = pd.merge(link2_2, link2_2_score, how='inner', on=['ticker', 'score'])
link2_3 = link2_3[['ticker','permno','cname','comnam','score']].drop_duplicates()

In [89]:
ibes_crsp_link = pd.concat([link1_2, link2_3])


In [90]:
# Storing iclink for other program usage

ibes_crsp_link.to_csv('../data/processed/ibes_crsp_link.csv', index=False)

In [91]:
ibes_crsp_link = pd.read_csv('../data/processed/ibes_crsp_link.csv')
ibes_crsp_link.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21808 entries, 0 to 21807
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ticker      21807 non-null  object 
 1   permno      21808 non-null  int64  
 2   cname       21808 non-null  object 
 3   comnam      21808 non-null  object 
 4   name_ratio  21440 non-null  float64
 5   score       21808 non-null  int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 1022.4+ KB


In [92]:
ibes_crsp_link.head()

Unnamed: 0,ticker,permno,cname,comnam,name_ratio,score
0,0000,14471,TALMER BANCORP,TALMER BANCORP INC,100.0,0
1,0001,14392,EP ENGR CORP,E P ENERGY CORP,67.0,0
2,0004,14418,AMERICAN CAPITAL,AMERICAN CAPITAL SR FLOATING LTD,100.0,0
3,000R,14378,CARECOM,CARE COM INC,74.0,0
4,000V,14423,EIGER,EIGER BIOPHARMACEUTICALS INC,100.0,0


In [93]:
crisp_compustat_link = pd.read_parquet('../data/raw/crisp-computsat-link.parquet')

In [95]:
crisp_compustat_link.shape

(31952, 12)

In [94]:
crisp_compustat_link.head()

Unnamed: 0,gvkey,conm,tic,cusip,sic,LINKPRIM,LIID,LINKTYPE,LPERMNO,LPERMCO,LINKDT,LINKENDDT
0,1000,A & E PLASTIK PAK INC,AE.2,32102,3089,P,1,LU,25881,23369,1970-11-13,1978-06-30
1,1001,A & M FOOD SERVICES INC,AMFD.,165100,5812,P,1,LU,10015,6398,1983-09-20,1986-07-31
2,1002,AAI CORP,AAIC.1,352104,3825,C,1,LC,10023,22159,1972-12-14,1973-06-05
3,1003,A.A. IMPORTING CO INC,ANTQ,354100,5712,C,1,LU,10031,6672,1983-12-07,1989-08-16
4,1004,AAR CORP,AIR,361105,5080,P,1,LU,54594,20000,1972-04-24,E


In [100]:
# for how many permno from ibes_crsp_link do we have a link in crisp_compustat_link lpermno  and if not matched in LPERMCO
ibes_crsp_link['permno'].isin(crisp_compustat_link['LPERMNO']).value_counts()

permno
True     21130
False      678
Name: count, dtype: int64

In [101]:
# create linking table 
linking_table = pd.merge(ibes_crsp_link, crisp_compustat_link, how='left', left_on='permno', right_on='LPERMNO')
linking_table.head()

Unnamed: 0,ticker,permno,cname,comnam,name_ratio,score,gvkey,conm,tic,cusip,sic,LINKPRIM,LIID,LINKTYPE,LPERMNO,LPERMCO,LINKDT,LINKENDDT
0,0000,14471,TALMER BANCORP,TALMER BANCORP INC,100.0,0,192458.0,TALMER BANCORP INC,TLMR,87482X101,6020.0,P,1,LC,14471.0,54747.0,2014-02-12,2016-09-30
1,0001,14392,EP ENGR CORP,E P ENERGY CORP,67.0,0,18660.0,EP ENERGY CORP,EPEGQ,268785102,1311.0,P,1,LC,14392.0,54697.0,2014-01-17,2019-04-30
2,0004,14418,AMERICAN CAPITAL,AMERICAN CAPITAL SR FLOATING LTD,100.0,0,19545.0,AMERICAN CAPITAL SR FLTG LTD,ACSF,02504D108,6797.0,P,1,LC,14418.0,54710.0,2014-01-16,2018-09-28
3,000R,14378,CARECOM,CARE COM INC,74.0,0,19352.0,CARE.COM INC,CRCM,141633107,7370.0,P,1,LC,14378.0,54694.0,2014-01-24,2020-02-28
4,000V,14423,EIGER,EIGER BIOPHARMACEUTICALS INC,100.0,0,18929.0,CELLADON CORP,CLDN,15117E107,2836.0,P,1,LC,14423.0,54714.0,2014-01-30,2016-03-22


In [103]:
linking_table = linking_table[["ticker", "permno", "cname","score", "sic"]]

In [112]:
# nan in linking table
linking_table.isna().sum()

ticker      1
permno      0
cname       0
score       0
sic       678
dtype: int64

In [104]:
# join SIC code to df based on linking tables ticker and df ibes_ticker_pk
df = pd.merge(df, linking_table, how='left', left_on='ibes_ticker_pk', right_on='ticker')
df.head()


Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j,broker_size,top_10,broker_coverage,surprise,surprise_lag,experience,experience_log,analyst_portfolio_complexity,ticker,permno,cname,score,sic
0,0,TLMR,TALMER BANCORP,258,8675,0.89,2015-12-31,2015-12-23,2015-10-30,1.02,2016-01-26,62 days,112,0.873333,2015,2015,0.13,0.146667,0.166786,-0.120628,6,36,0,1,0.167939,0.0,25,3.218876,19,0,14471.0,TALMER BANCORP,0.0,6020.0
1,0,TLMR,TALMER BANCORP,952,50789,1.14,2014-12-31,2014-11-08,2014-11-07,1.21,2015-01-30,54 days,182,0.806,2014,2014,0.07,0.404,0.3575,0.13007,5,49,0,1,0.501241,0.0,19,2.944439,43,0,14471.0,TALMER BANCORP,0.0,6020.0
2,0,TLMR,TALMER BANCORP,1267,71182,1.21,2014-12-31,2014-11-06,2014-11-05,1.21,2015-01-30,56 days,182,1.0825,2014,2014,0.0,0.1425,0.3575,-0.601399,5,112,0,1,0.117783,0.0,20,2.995732,24,0,14471.0,TALMER BANCORP,0.0,6020.0
3,0,TLMR,TALMER BANCORP,1267,71182,0.9,2015-12-31,2015-11-10,2015-10-29,1.02,2016-01-26,63 days,181,0.862,2015,2015,0.12,0.158,0.166786,-0.052677,6,124,0,1,0.183295,0.117783,21,3.044522,25,0,14471.0,TALMER BANCORP,0.0,6020.0
4,0,TLMR,TALMER BANCORP,2500,72481,0.86,2015-12-31,2015-10-30,2015-10-30,1.02,2016-01-26,62 days,173,0.822,2015,2015,0.16,0.198,0.166786,0.187152,6,7,0,1,0.240876,0.0,16,2.772589,24,0,14471.0,TALMER BANCORP,0.0,6020.0


In [105]:
# sic empty
df['sic'].isna().sum()

34703

In [108]:
# for how many unique ibes_ticker_pk there is no sic
df['ibes_ticker_pk'][df['sic'].isna()].nunique()

844

In [109]:
df['ibes_ticker_pk'].nunique()

8722

In [107]:
df.shape

(636604, 34)

In [115]:
# drop all rows with nan in sic
df = df.dropna(subset=['sic'])

# drop merge columns
df.drop(columns=['ticker', 'permno', 'cname', 'score'], inplace=True)

# keep only highest level of sic (major group)
df.loc[df['sic'].isna(), 'sic'] = -1 
df['sic'] = df['sic'].astype(str).str[:2]
df['sic'] = df['sic'].astype('int')

In [117]:
# distinct sic
df['sic'].nunique()

66

In [118]:
# distribution of sic
df['sic'].value_counts()

sic
73    70476
28    64084
13    43448
60    34951
36    32407
      ...  
22      510
41      461
83      372
81       81
89       13
Name: count, Length: 66, dtype: int64

In [116]:
df.head()

Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j,broker_size,top_10,broker_coverage,surprise,surprise_lag,experience,experience_log,analyst_portfolio_complexity,sic
0,0,TLMR,TALMER BANCORP,258,8675,0.89,2015-12-31,2015-12-23,2015-10-30,1.02,2016-01-26,62 days,112,0.873333,2015,2015,0.13,0.146667,0.166786,-0.120628,6,36,0,1,0.167939,0.0,25,3.218876,19,60
1,0,TLMR,TALMER BANCORP,952,50789,1.14,2014-12-31,2014-11-08,2014-11-07,1.21,2015-01-30,54 days,182,0.806,2014,2014,0.07,0.404,0.3575,0.13007,5,49,0,1,0.501241,0.0,19,2.944439,43,60
2,0,TLMR,TALMER BANCORP,1267,71182,1.21,2014-12-31,2014-11-06,2014-11-05,1.21,2015-01-30,56 days,182,1.0825,2014,2014,0.0,0.1425,0.3575,-0.601399,5,112,0,1,0.117783,0.0,20,2.995732,24,60
3,0,TLMR,TALMER BANCORP,1267,71182,0.9,2015-12-31,2015-11-10,2015-10-29,1.02,2016-01-26,63 days,181,0.862,2015,2015,0.12,0.158,0.166786,-0.052677,6,124,0,1,0.183295,0.117783,21,3.044522,25,60
4,0,TLMR,TALMER BANCORP,2500,72481,0.86,2015-12-31,2015-10-30,2015-10-30,1.02,2016-01-26,62 days,173,0.822,2015,2015,0.16,0.198,0.166786,0.187152,6,7,0,1,0.240876,0.0,16,2.772589,24,60


# Industry Feature
Number of industries analyst i follows in fiscal year t, as the number of unique two-digit SICs followed

In [128]:
# all forecasts of analyst_id2 = 72069 in 2018
df[(df['analyst'] == 72069) & (df['fiscal_year'] == 2018)]

Unnamed: 0,ibes_ticker_pk,official_ticker,company_name,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,mean_forecast_horizon_days,mean_estimate_ijt,fiscal_year,actual_announce_year,afe_analyst_i,afe_analyst_ijt_mean,afe_firm_jt_mean,pmafe,analysts_following_j,broker_size,top_10,broker_coverage,surprise,surprise_lag,experience,experience_log,analyst_portfolio_complexity,sic,analyst_portfolio_complexity_it
27816,AAPL,AAPL,APPLE,228,72069,2.75,2018-09-30,2017-11-03,2017-11-03,2.9775,2018-11-01,331 days,331,2.75,2018,2017,0.2275,0.2275,0.130797,0.739341,45,69,0,2,0.082727,0.03309,17,2.833213,5,36,3
348804,LOGI,LOGI,LOGITECH INTL,228,72069,1.44,2018-03-31,2017-10-24,2017-07-25,1.6,2018-05-02,249 days,249,1.44,2018,2017,0.16,0.16,0.141728,0.128924,12,69,0,1,0.111111,0.170732,17,2.833213,5,35,3
348805,LOGI,LOGI,LOGITECH INTL,228,72069,1.44,2018-03-31,2017-10-24,2017-07-25,1.6,2018-05-02,249 days,249,1.44,2018,2017,0.16,0.16,0.141728,0.128924,12,69,0,1,0.111111,0.170732,17,2.833213,5,35,3
453296,PLT1,PLT,PLANTRONICS INC,228,72069,3.14,2018-03-31,2017-11-01,2017-11-01,3.55,2018-05-01,150 days,243,3.16,2018,2017,0.41,0.39,0.585217,-0.333581,6,69,0,1,0.123418,0.022581,17,2.833213,5,36,3
580476,UBN,UBNT,UBIQUITI NETWORK,228,72069,3.65,2018-06-30,2017-11-14,2017-11-10,3.69,2018-08-24,232 days,282,3.67,2018,2017,0.04,0.02,0.191036,-0.895308,9,69,0,1,0.00545,0.024431,17,2.833213,5,36,3
604412,VSNR,NUAN,NUANCE COMMNS,228,72069,1.1,2018-09-30,2017-11-29,2017-11-29,1.19,2018-11-19,305 days,305,1.1,2018,2017,0.09,0.09,0.053793,0.673077,8,69,0,1,0.081818,-0.125,17,2.833213,5,73,3


In [122]:
# count number of unique sic for analyst 72069
df[(df['analyst'] == 72069) & (df['fiscal_year'] == 2018)]['sic'].nunique()

3

In [123]:
df['analyst_portfolio_complexity_it'] = df.groupby(['analyst', 'fiscal_year'])['sic'].transform('nunique')

In [126]:
# distribution of analyst_portfolio_complexity_it
df['analyst_portfolio_complexity_it'].describe()

count    601901.000000
mean          3.987534
std           3.111240
min           1.000000
25%           2.000000
50%           3.000000
75%           5.000000
max          40.000000
Name: analyst_portfolio_complexity_it, dtype: float64

# Export for linear reg

In [None]:
# df export as parquet
df.to_parquet('../data/processed/ibes_forecasts_collapsed_features_v3.parquet')

In [None]:
df.to_csv('../data/processed/ibes_forecasts_collapsed_features_v2.csv')