In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Imports

In [None]:
# Path to the raw IBES forecast extract (relative path)
file_path_forecasts = '../data/raw/ibes-forecasts.parquet'

# Read the IBES forecasts from the parquet file into a DataFrame
df_forecasts = pd.read_parquet(file_path_forecasts)

In [None]:
# Preview the first rows of the raw forecast data
df_forecasts.head()

In [None]:
# Print a summary of the raw frame (columns, dtypes, non-null counts)
df_forecasts.info()

# Preprocessing
df_forecasts:
- Lower case all columns in df_forecasts
- Drop unnecessary columns
- Include only forecasts issued no earlier than 1 year ahead and no later than 30 days before fiscal year end
- Rename columns
- Delete rows with actual_eps = NAN or estimated_eps = NAN
- Exclude forecasts for companies for which only one analyst provides a forecast
- Exclude all analysts appearing in the initial 2-3 years of the dataset (mitigating the problem of unobserved prior experience)
- (Retain the last forecast an analyst issues in a particular year)
- ...

In [None]:
# Normalise every column label to lower case
df_forecasts = df_forecasts.rename(columns=str.lower)

# Drop unnecessary columns
columns_to_drop = ["fpi", "measure", "cusip"]
df_forecasts = df_forecasts.drop(columns=columns_to_drop)

In [None]:
# convert columns with dates to datetime
def convert_to_datetime(df):
    """
    Parse the date columns of the forecast frame with `pd.to_datetime`.

    The columns 'fpedats', 'revdats', 'anndats' and 'anndats_act' are overwritten
    in place on `df`; the same DataFrame object is returned.
    """
    for date_col in ('fpedats', 'revdats', 'anndats', 'anndats_act'):
        df[date_col] = pd.to_datetime(df[date_col])
    return df
df_forecasts = convert_to_datetime(df_forecasts)

# Forecast horizon: time between the forecast announcement and the fiscal period end
df_forecasts['forecast_horizon'] = df_forecasts['fpedats'] - df_forecasts['anndats']

# Keep forecasts issued no later than 30 days and no earlier than 365 days before the
# fiscal period end. `Series.between` is inclusive on both ends, so forecasts issued
# exactly 30 or exactly 365 days ahead are retained, matching the preprocessing rule
# ("no earlier than 1 year ahead and no later than 30 days before fiscal year end").
# `.copy()` detaches the result from the unfiltered frame so that the column assignment
# below does not write into a slice (SettingWithCopyWarning).
min_horizon = pd.Timedelta(days=30)
max_horizon = pd.Timedelta(days=365)
df_forecasts = df_forecasts[df_forecasts['forecast_horizon'].between(min_horizon, max_horizon)].copy()

# Mean forecast horizon of analyst i's forecasts for company j in fiscal period t,
# broadcast to every row of the group and reduced to whole days via `.dt.days`
df_forecasts['mean_forecast_horizon_days'] = df_forecasts.groupby(['analys','ticker','fpedats'])['forecast_horizon'].transform('mean')
df_forecasts['mean_forecast_horizon_days'] = df_forecasts['mean_forecast_horizon_days'].dt.days
df_forecasts.head()

mean estimated eps 

In [None]:
# Mean forecast value per (analyst, ticker, fiscal period end) group; `transform`
# broadcasts the group mean back onto every forecast row of that group
df_forecasts['mean_estimate_ijt'] = df_forecasts.groupby(['analys','ticker','fpedats'])['value'].transform('mean')

In [None]:
# Spot check: all forecasts of one analyst for ticker "AAPL" with fiscal period end in 2020
analyst_id = 113333
subset_analyst_i = df_forecasts[
    (df_forecasts['ticker'] == "AAPL")
    & (df_forecasts['fpedats'].dt.year == 2020)
    & (df_forecasts['analys'] == analyst_id)
]
subset_analyst_i

rename columns to natural language

In [None]:
# Rename the terse IBES column names to descriptive ones
df_forecasts = df_forecasts.rename(
    columns={
        "ticker": "ibes_ticker_pk",
        "oftic": "official_ticker",
        "cname": "company_name",
        "analys": "analyst",
        "value": "estimated_eps",
        "fpedats": "fiscal_period_ending",
        "revdats": "revision_date",
        "anndats": "announce_date",
        "actual": "actual_eps",
        "anndats_act": "announce_date_actual",
        # NOTE(review): no `difference_date` column is created in this notebook (the
        # horizon column is already named `forecast_horizon`); this entry only has an
        # effect if the raw data contains such a column - confirm or remove
        "difference_date": "forecast_horizon",
    }
)

In [None]:
# Preview the frame after renaming
df_forecasts.head()

remove empty rows and cut off time horizon

In [None]:
# Report how many rows have a missing actual_eps, then drop those rows
# (the count is taken before the drop, so it equals the number of rows removed)
print("Number of rows dropped: ", df_forecasts['actual_eps'].isna().sum())
df_forecasts = df_forecasts.dropna(subset=['actual_eps'])

In [None]:
# Report how many rows have a missing estimated_eps, then drop those rows
# (the count is taken before the drop, so it equals the number of rows removed)
print("Number of rows dropped: ", df_forecasts['estimated_eps'].isna().sum())
df_forecasts = df_forecasts.dropna(subset=['estimated_eps'])

In [None]:
# Derive the year columns used by the year filters and the yearly count plot further down
df_forecasts['fiscal_year'] = df_forecasts['fiscal_period_ending'].dt.year
# NOTE(review): despite its name, this year is taken from `announce_date` (the forecast
# announcement, renamed from `anndats`), not from `announce_date_actual` - confirm intended
df_forecasts['actual_announce_year'] = df_forecasts['announce_date'].dt.year

In [None]:
# Remove all rows where actual_announce_year == 2023 and print the number of rows dropped.
# The rows are counted with a boolean mask instead of `value_counts().loc[2023]`, which
# raises a KeyError when no such row exists (e.g. when this cell is run a second time).
is_announce_year_2023 = df_forecasts['actual_announce_year'] == 2023
print("Number of rows dropped: ", is_announce_year_2023.sum())
df_forecasts = df_forecasts[~is_announce_year_2023]


In [None]:
# Remove all rows where fiscal_year == 2023 and print the number of rows dropped.
# The rows are counted with a boolean mask instead of `value_counts().loc[2023]`, which
# raises a KeyError when no such row exists (e.g. when this cell is run a second time).
is_fiscal_year_2023 = df_forecasts['fiscal_year'] == 2023
print("Number of rows dropped: ", is_fiscal_year_2023.sum())
df_forecasts = df_forecasts[~is_fiscal_year_2023]

In [None]:
# Bar chart of the number of remaining forecast rows per fiscal year
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df_forecasts, x='fiscal_year', ax=ax)
ax.set_title('Forecast count by year')

# Accuracies

## Variant 1: PMAFE 
PMAFE is the proportional mean absolute forecast error

$$PMAFE = \frac{DAFE_{ijt}}{\overline{AFE_{jt}}} = \frac{AFE_{ijt} - \overline{AFE_{jt}}}{\overline{AFE_{jt}}} $$

where 
- $AFE_{ijt}$ is the absolute forecast error for analyst i's forecast of firm j for year t, and 
- $\overline{AFE_{jt}}$ is the mean absolute forecast error for firm j for year t

Negative PMAFE: better than average performance of analyst i (capped at -1.0)

Positive PMAFE: worse than average performance (theoretically unbounded above)

Source: Clement, M. B. (1999). Analyst forecast accuracy: Do ability, resources, and portfolio complexity matter? *Journal of Accounting and Economics*, 27(3), 285–303.

In [None]:
def calculate_pmafe(df):
    """
    Compute the proportional mean absolute forecast error (PMAFE) per forecast row.

    PMAFE = (AFE_ijt - mean AFE_jt) / mean AFE_jt, where i is the analyst, j the firm
    ('ibes_ticker_pk') and t the fiscal period ('fiscal_period_ending').

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'ibes_ticker_pk', 'analyst', 'fiscal_period_ending',
        'estimated_eps' and 'actual_eps'. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows in the same order, a fresh RangeIndex, and
        these columns appended:
        - afe_analyst_i: absolute forecast error of the individual forecast row
        - afe_analyst_ijt_mean: mean absolute forecast error of analyst i for firm j
          in period t (averaged over all of the analyst's forecasts in that group)
        - afe_firm_jt_mean: mean absolute forecast error over all forecast rows of
          firm j in period t (analysts with more forecasts in the period weigh more)
        - pmafe: (afe_analyst_ijt_mean - afe_firm_jt_mean) / afe_firm_jt_mean;
          NaN when afe_firm_jt_mean is 0 (0 / 0)

    The function can be applied repeatedly: derived columns left over from a previous
    call are dropped and recomputed instead of colliding.
    """
    analyst_keys = ['ibes_ticker_pk', 'analyst', 'fiscal_period_ending']
    firm_keys = ['ibes_ticker_pk', 'fiscal_period_ending']
    derived_columns = ['afe_analyst_i', 'afe_analyst_ijt_mean', 'afe_firm_jt_mean', 'pmafe']

    # Work on a new frame so the caller's DataFrame is left untouched; stale derived
    # columns are removed so a second call yields the same columns as the first.
    out = df.drop(columns=derived_columns, errors='ignore').reset_index(drop=True)

    # Step 1: absolute forecast error per forecast row and its mean per analyst/firm/period
    out['afe_analyst_i'] = (out['estimated_eps'] - out['actual_eps']).abs()
    out['afe_analyst_ijt_mean'] = out.groupby(analyst_keys)['afe_analyst_i'].transform('mean')

    # Step 2: mean absolute forecast error over all forecast rows of the firm/period
    out['afe_firm_jt_mean'] = out.groupby(firm_keys)['afe_analyst_i'].transform('mean')

    # Step 3: PMAFE relative to the firm/period mean
    out['pmafe'] = (out['afe_analyst_ijt_mean'] - out['afe_firm_jt_mean']) / out['afe_firm_jt_mean']

    return out

In [None]:
# Add the absolute-forecast-error and PMAFE columns to every forecast row
df_forecasts = calculate_pmafe(df_forecasts)
df_forecasts.head()

In [None]:
# Save the row-per-forecast frame (all retained forecasts, including revisions)
# for descriptive statistics
df_forecasts.to_parquet('../data/processed/ibes_forecasts_revisions.parquet')

## Collapse df

in the final df, one row should correspond to one analyst i's mean forecast of firm j in fiscal year t with accuracy measure pmafe

columns:
 - ibes_ticker_pk
 - company_name
 - gvkey
 - fiscal_period_ending
 - analyst
 - estimator
 - estimated_eps
 - mean_forecast_horizon_days
 - actual_eps
 - announce_date_actual
 - pmafe
 - afe_analyst_ijt_mean
 - afe_firm_jt_mean
 - feature x
 - ...

### Feature : Coverage

In [None]:
# Preview the frame before collapsing
df_forecasts.head()

In [None]:
# Spot check: all forecasts of one analyst for official ticker "AAPL" with fiscal period end in 2018
analyst_id = 113333
test = df_forecasts[
    (df_forecasts['official_ticker'] == "AAPL")
    & (df_forecasts['fiscal_period_ending'].dt.year == 2018)
    & (df_forecasts['analyst'] == analyst_id)
]
test

In [None]:
# For each (ibes_ticker_pk, analyst, fiscal_period_ending) group of the spot-check subset,
# get the index label of the row with the smallest 'forecast_horizon'
min_forecast = test.groupby(['ibes_ticker_pk', 'analyst', 'fiscal_period_ending'])['forecast_horizon'].idxmin()

# Select those rows by label: one row per group, the forecast closest to the fiscal period end
min_forecast_df = test.loc[min_forecast]

In [None]:
# Display the selected minimum-horizon rows of the spot-check subset
min_forecast_df

In [None]:
def collapse_processed_df(df):
    """
    Reduce the forecast frame to one row per analyst i, firm j and fiscal period t.

    Rows are grouped by 'ibes_ticker_pk', 'analyst' and 'fiscal_period_ending'; from
    each group the row with the smallest 'forecast_horizon' is kept, together with all
    of its columns (accuracy measure pmafe and any other features present on the row).
    Rows are looked up by index label, so the index of `df` is expected to be unique.
    """
    group_keys = ['ibes_ticker_pk', 'analyst', 'fiscal_period_ending']
    shortest_horizon_labels = df.groupby(group_keys)['forecast_horizon'].idxmin()
    return df.loc[shortest_horizon_labels]


In [None]:
# Keep one row per analyst / firm / fiscal period (the minimum-horizon forecast)
collapsed_df = collapse_processed_df(df_forecasts)
collapsed_df.head()

In [None]:
# (rows, columns) of the collapsed frame
collapsed_df.shape

In [None]:
# Spot check on the collapsed frame: rows of the analyst selected in the earlier test cell
# (`analyst_id`) for NFLX and AAPL with fiscal period end in 2018
filtered_df = collapsed_df[
    (collapsed_df['analyst'] == analyst_id)
    & (collapsed_df['fiscal_period_ending'].dt.year == 2018)
    & collapsed_df['official_ticker'].isin(["NFLX", "AAPL"])
]
filtered_df

In [None]:
# Kernel density estimate of PMAFE over the rows of the collapsed frame
plt.figure(figsize=(10, 6))
sns.kdeplot(collapsed_df['pmafe'], fill=True)
plt.title("PMAFE Distribution for all Forecasts")
plt.xlabel("PMAFE")
plt.ylabel("Density")
plt.show()

In [None]:
# Count the rows of the collapsed frame whose pmafe is NaN
print("Number of rows with pmafe NaN: ", collapsed_df['pmafe'].isna().sum())

In [None]:
# Drop all rows where pmafe is NaN (undefined accuracy measure)
collapsed_df = collapsed_df.dropna(subset=['pmafe'])

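Reason: pmafe is NaN when the mean absolute forecast error of firm j in year t is zero (0/0), i.e. when every forecast for that firm-year exactly equals the actual EPS. This typically happens when only one analyst covers the company in year t and that analyst's forecast is perfect (estimated == actual).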

In [None]:
# Number of non-null 'analyst' entries per (ibes_ticker_pk, fiscal_period_ending) group,
# broadcast to every row of the group; computed on the collapsed frame after the
# NaN-pmafe rows were dropped
collapsed_df['analysts_following_j'] = collapsed_df.groupby(['ibes_ticker_pk','fiscal_period_ending'])['analyst'].transform('count')
collapsed_df.head()

# Export df

In [None]:
# Save the collapsed frame (one row per analyst / firm / fiscal period) for descriptive statistics
collapsed_df.to_parquet('../data/processed/ibes_forecasts_collapsed.parquet')