In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [32]:
# Relative path to the processed forecast file that already contains the pmafe column
file_path = '../data/processed/forecasts_pmafe.csv'

# Import marius_eda data: read the CSV into the DataFrame used by the rest of the notebook
df = pd.read_csv(file_path)

In [33]:
# Preview the first rows of the loaded forecasts
df.head()

Unnamed: 0,ibes_ticker_pk,official_ticker,cname,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,afe_analyst_i,afe_analyst_i_avg,afe_mean_firm_j,pmafe
0,0,TLMR,TALMER BANCORP,2500,72481,0.85,2015-12-31,2015-02-18,2015-02-18,1.02,2016-01-26,316 days,0.17,0.198,0.166786,0.187152
1,0,TLMR,TALMER BANCORP,149,119962,0.86,2015-12-31,2015-02-18,2015-02-18,1.02,2016-01-26,316 days,0.16,0.174286,0.166786,0.044968
2,0,TLMR,TALMER BANCORP,1267,71182,0.89,2015-12-31,2015-04-30,2015-02-18,1.02,2016-01-26,316 days,0.13,0.158,0.166786,-0.052677
3,0,TLMR,TALMER BANCORP,149,119962,0.89,2015-12-31,2015-04-30,2015-02-23,1.02,2016-01-26,311 days,0.13,0.174286,0.166786,0.044968
4,0,TLMR,TALMER BANCORP,873,79092,0.91,2015-12-31,2015-07-06,2015-04-07,1.02,2016-01-26,268 days,0.11,0.136667,0.166786,-0.180585


# Resources / Broker Size
Do analysts of larger brokers (estimators) have a lower pmafe on average?
- Version 1: __Top 50: > 5,000 analysts__
- Version 2: Log broker size, where broker size is the number of analysts at broker i

Dummy variable: set to 1 if analyst i is employed by a firm in the top 50 during year t, and set to 0 otherwise

In [55]:
def top_brokerage(df, threshold=5000):
    """
    Flag forecasts whose analyst works at a large ("top") brokerage in the announce year.

    Broker size is computed per brokerage (``estimator``) and announce year as the
    number of rows with a non-null ``analyst`` value. Note that this counts forecast
    rows, so an analyst who issues several forecasts in a year is counted once per
    forecast, not once per person.

    The frame is modified in place and also returned. ``announce_date`` is converted
    to datetime and the following columns are added (or overwritten):

    - ``top_brokerage``: 1 if ``broker_size`` is strictly greater than ``threshold``,
      otherwise 0
    - ``broker_size``: row count of the (estimator, announce year) group
    - ``announce_year``: calendar year of ``announce_date``

    Args:
        df (pd.DataFrame): forecasts with the columns ``estimator``, ``analyst`` and
            ``announce_date``.
        threshold (int): size a brokerage has to exceed to be flagged. Defaults to
            5000, the cut-off this notebook uses as its proxy for a top-50 brokerage.

    Returns:
        pd.DataFrame: the same ``df`` object with the columns described above.
    """
    announce_dates = pd.to_datetime(df["announce_date"])
    announce_year = announce_dates.dt.year

    # Number of forecast rows per brokerage and announce year, broadcast back to every row
    broker_size = df["analyst"].groupby([df["estimator"], announce_year]).transform("count")

    # Assign in this order so new columns appear as top_brokerage, broker_size, announce_year.
    # Rows without a computable broker size compare as False and therefore get 0.
    df["top_brokerage"] = (broker_size > threshold).astype("int64")
    df["broker_size"] = broker_size
    df["announce_date"] = announce_dates
    df["announce_year"] = announce_year

    return df

In [51]:
# Work on a copy so the in-place feature functions leave the loaded df untouched
df2 = df.copy()

In [52]:
# top_brokerage adds its columns to df2 in place, so the return value is not captured
top_brokerage(df2)
df2.head()

Unnamed: 0,ibes_ticker_pk,official_ticker,cname,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,afe_analyst_i,afe_analyst_i_avg,afe_mean_firm_j,pmafe,top_brokerage,broker_size
0,0,TLMR,TALMER BANCORP,2500,72481,0.85,2015-12-31,2015-02-18,2015-02-18,1.02,2016-01-26,316 days,0.17,0.198,0.166786,0.187152,0,413
1,0,TLMR,TALMER BANCORP,149,119962,0.86,2015-12-31,2015-02-18,2015-02-18,1.02,2016-01-26,316 days,0.16,0.174286,0.166786,0.044968,0,2228
2,0,TLMR,TALMER BANCORP,1267,71182,0.89,2015-12-31,2015-04-30,2015-02-18,1.02,2016-01-26,316 days,0.13,0.158,0.166786,-0.052677,1,5398
3,0,TLMR,TALMER BANCORP,149,119962,0.89,2015-12-31,2015-04-30,2015-02-23,1.02,2016-01-26,311 days,0.13,0.174286,0.166786,0.044968,0,2228
4,0,TLMR,TALMER BANCORP,873,79092,0.91,2015-12-31,2015-07-06,2015-04-07,1.02,2016-01-26,268 days,0.11,0.136667,0.166786,-0.180585,1,6492


### Log Broker Size


In [53]:
def log_broker_size(df):
    """
    Add the natural logarithm of the broker size as a new column.

    Args:
        df (pd.DataFrame): frame that already contains a ``broker_size`` column.
            It is modified in place.

    Returns:
        pd.DataFrame: the same ``df`` object with the extra column ``log_broker_size``.
    """
    sizes = df["broker_size"]
    # np.log is applied element-wise and keeps the index of the input column
    df["log_broker_size"] = np.log(sizes)
    return df


In [54]:
# log_broker_size reads the broker_size column of df2 and adds log_broker_size in place
log_broker_size(df2)
df2.head()

Unnamed: 0,ibes_ticker_pk,official_ticker,cname,estimator,analyst,estimated_eps,fiscal_period_ending,revision_date,announce_date,actual_eps,announce_date_actual,forecast_horizon,afe_analyst_i,afe_analyst_i_avg,afe_mean_firm_j,pmafe,top_brokerage,broker_size,log_broker_size
0,0,TLMR,TALMER BANCORP,2500,72481,0.85,2015-12-31,2015-02-18,2015-02-18,1.02,2016-01-26,316 days,0.17,0.198,0.166786,0.187152,0,413,6.023448
1,0,TLMR,TALMER BANCORP,149,119962,0.86,2015-12-31,2015-02-18,2015-02-18,1.02,2016-01-26,316 days,0.16,0.174286,0.166786,0.044968,0,2228,7.70886
2,0,TLMR,TALMER BANCORP,1267,71182,0.89,2015-12-31,2015-04-30,2015-02-18,1.02,2016-01-26,316 days,0.13,0.158,0.166786,-0.052677,1,5398,8.593784
3,0,TLMR,TALMER BANCORP,149,119962,0.89,2015-12-31,2015-04-30,2015-02-23,1.02,2016-01-26,311 days,0.13,0.174286,0.166786,0.044968,0,2228,7.70886
4,0,TLMR,TALMER BANCORP,873,79092,0.91,2015-12-31,2015-07-06,2015-04-07,1.02,2016-01-26,268 days,0.11,0.136667,0.166786,-0.180585,1,6492,8.778326


## Coverage

Number of distinct analysts within broker i who cover company j in year t (log)

In [None]:
def broker_coverage(df):
    """
    Add the number of distinct analysts per company, brokerage and announce year.

    For every row the new column ``broker_coverage`` holds the count of unique
    ``analyst`` values within the row's (``ibes_ticker_pk``, ``estimator``,
    announce year) group. The raw count is stored; no logarithm is applied.

    The announce year is taken from the ``announce_year`` column when it exists.
    Otherwise it is derived from ``announce_date`` for grouping only, so the
    function also works on a frame that has not been through ``top_brokerage``;
    in that case no ``announce_year`` column is added.

    Args:
        df (pd.DataFrame): forecasts with the columns ``ibes_ticker_pk``,
            ``estimator``, ``analyst`` and either ``announce_year`` or
            ``announce_date``. It is modified in place.

    Returns:
        pd.DataFrame: the same ``df`` object with the extra column ``broker_coverage``.
    """
    if "announce_year" in df.columns:
        year_key = df["announce_year"]
    else:
        # Fall back to the date column instead of failing with a KeyError
        year_key = pd.to_datetime(df["announce_date"]).dt.year

    per_group = df["analyst"].groupby([df["ibes_ticker_pk"], df["estimator"], year_key])
    df["broker_coverage"] = per_group.transform("nunique")

    return df

# Lagged Forecast Accuracy Feature
- Prediction error of the previous period, in percent