In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from time import time

In [3]:
import warnings
warnings.filterwarnings('ignore')

### Organize data

In [4]:
# Read the daily price sheet and parse the 'Date' column into real timestamps.
df = pd.read_excel('raw_data_bbg.xlsx', sheet_name='px_data', header=0)
df = df.assign(Date=pd.to_datetime(df['Date']))

# Because of saving issues, 39 NA rows appeared at the tail; keep everything before them.
df = df.head(-39)

In [5]:
# Keep only stocks with a complete price history (88 remain) and index the
# price panel by date.
px = df.dropna(axis='columns').set_index('Date')

In [6]:
# The columns of the price panel are the tickers iterated over below.
stock_list = px.columns
stock_list[0:5]  # preview the first five tickers

Index(['AAPL UW Equity', 'ABT UN Equity', 'ACN UN Equity', 'ADBE UW Equity',
       'AIG UN Equity'],
      dtype='object')

In [7]:
stock_list.size  # number of tickers kept after dropping incomplete histories

88

In [8]:
# Rolling mean prices over 11, 50 and 200 rows; these will be part of the
# feature space.
ma11, ma50, ma200 = (px.rolling(window).mean() for window in (11, 50, 200))

In [9]:
# Raw earnings sheet; the per-ticker column groups are sliced out of it below.
df2 = pd.read_excel(
    'raw_data_bbg.xlsx',
    sheet_name='earn_data',
    header=0,
)

In [10]:

# Earnings announcement dates: every 8th column of the earnings sheet starting
# at position 2, relabelled with the ticker names from the price sheet.
earn_dates = df2.iloc[:, np.arange(2, 807, 8)]
earn_dates.columns = df.columns[1:]

In [11]:
# 'Comparable' EPS (i.e. not basic EPS): every 8th column of the earnings sheet
# starting at position 5, relabelled with the ticker names from the price sheet.
EPS = df2.iloc[:, np.arange(5, 807, 8)]
EPS.columns = df.columns[1:]

In [12]:
# Forecast consensus for the comparable EPS: every 8th column of the earnings
# sheet starting at position 6, relabelled with the ticker names from the price sheet.
est_EPS = df2.iloc[:, np.arange(6, 807, 8)]
est_EPS.columns = df.columns[1:]

### Implementation of CorrelNowcast algorithm from paper

In [13]:
def my_CorrelNowcast(s, W, model, X_all, y_all, Info, start_date, end_date):
    """Walk-forward EPS nowcast that refits the model at every earnings announcement.

    The model is first fitted on the ``W`` rows immediately preceding
    ``start_date``. Then, for every row from ``start_date`` through
    ``end_date`` (inclusive), it predicts from that row's features; the
    nowcast reported for the row is the mean of all predictions made since the
    last refit. When a row's date is found in ``Info.index`` (an earnings
    announcement), the rows seen since the last refit are appended to the
    training window, each labelled with the EPS just announced, the window is
    trimmed to its most recent ``W`` rows, and the model is refitted.

    Parameters
    ----------
    s : object
        Bloomberg ticker. Not used in the computation; kept so existing
        callers that pass it keep working.
    W : int
        Training window size, in rows.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_all : pandas.DataFrame
        Feature matrix whose index contains ``start_date`` and ``end_date``.
    y_all : pandas.DataFrame
        Targets aligned row-for-row with ``X_all``; column 0 (the EPS column)
        is used for the initial fit.
    Info : pandas.DataFrame
        Indexed by announcement date, with an ``'EPS'`` column.
    start_date, end_date
        Labels matched by equality against ``X_all.index``.

    Returns
    -------
    numpy.ndarray
        One nowcast per row from ``start_date`` to ``end_date`` inclusive.

    Raises
    ------
    IndexError
        If ``start_date`` or ``end_date`` is not present in ``X_all.index``.
    ValueError
        If fewer than ``W`` rows precede ``start_date``.
    """
    # Integer positions corresponding to the two dates.
    begin = np.where(X_all.index == start_date)[0][0]
    end = np.where(X_all.index == end_date)[0][0]

    if begin < W:
        # A negative slice start would silently select the wrong (or no) rows.
        raise ValueError(
            'need at least W={} rows before start_date, found {}'.format(W, begin)
        )

    train_X = X_all.iloc[begin - W:begin, :]
    train_y = y_all.iloc[begin - W:begin, 0].to_numpy()  # 0 for the EPS column

    model.fit(train_X, train_y)  # initialize

    output = np.zeros(end - begin + 1)
    pending_start = begin  # first row not yet folded into the training window
    pending_preds = []     # predictions made since the last refit (plain floats)
    for t in range(begin, end + 1):
        row = X_all.iloc[t, :].to_numpy().reshape(1, -1)
        # predict() returns a 1-element array; store a scalar so the running
        # mean is taken over a homogeneous list.
        pending_preds.append(float(np.ravel(model.predict(row))[0]))
        output[t - begin] = np.mean(pending_preds)

        today = X_all.index[t]
        if today in Info.index:  # we are at an earnings announcement date
            # Label every row seen since the last refit with the EPS just
            # announced and add them to the training window.
            train_X = pd.concat([train_X, X_all.iloc[pending_start:t + 1, :]])
            train_y = np.concatenate(
                [train_y, [Info.loc[today, 'EPS']] * len(pending_preds)]
            )
            pending_start = t + 1
            pending_preds = []

            if train_X.shape[0] > W:
                train_X = train_X.iloc[-W:, :]  # keep only the latest W rows
                train_y = train_y[-W:]

            model.fit(train_X, train_y)  # retrain

    return output

### Cross validation for nu and W (this code takes 6 hrs to run, results saved as text below) 

In [14]:
from sklearn.linear_model import LassoCV

In [38]:
%%time
from sklearn.metrics import mean_absolute_percentage_error # need sklearn 0.24

# Grids from the original (6 hr) search. `nu` is kept for reference only: it is
# not looped over below, and LassoCV chooses its own regularization strength.
nu=np.array([10, 100, 1000, 3000, 5000, 10000])
W=np.array([11, 50, 125, 200, 250, 350]) #not doing 500 and 700

W = [11] #restrict this run to a single window size

#cross validation period (identical for every window size and ticker)
start_date = '2013-02-08'
end_date = '2015-02-09'

cv_result=dict() #maps window size W -> mean MRE across all stocks

model = LassoCV()

for b in W:
    # Only W is reported: the old message also formatted a `nu` loop variable
    # that no longer exists, which raised NameError on a fresh kernel.
    print('Now in loop: W={:d}...'.format(b), end='')
    MRE = np.zeros(len(stock_list))
    for i, ticker in enumerate(stock_list):
        print(i, '', end='') #visualize progress

        #rolling correlation of this ticker's price with every stock's price
        cor11=px[ticker].rolling(11).corr(px)
        cor50=px[ticker].rolling(50).corr(px)
        cor200=px[ticker].rolling(200).corr(px)

        #features: raw prices plus correlation-weighted moving averages (11/50/200)
        temp1=pd.merge(px,cor11*ma11, left_index=True, right_index=True, suffixes=['_px', '_11'])
        temp2=pd.merge(temp1,cor50*ma50, left_index=True, right_index=True, suffixes=[None, None] )
        X=pd.merge(temp2,cor200*ma200, left_index=True, right_index=True, suffixes=['_50', '_200'])
        X=X.dropna() #some top rows have NaN due to rolling average calculations

        #announcement date, actual EPS and consensus estimate for this ticker
        Earnings_info=pd.DataFrame({'Announcement_dt':earn_dates[ticker].dropna(),
                              'EPS':EPS[ticker].dropna(),
                              'est_EPS':est_EPS[ticker].dropna()})
        Earnings_info=Earnings_info.set_index('Announcement_dt')

        #X and y_extra_info shall have the same number of rows
        y_extra_info=pd.merge(pd.DataFrame(index=X.index),Earnings_info, left_index=True, right_index=True, how='left')
        #fill NaN with next earnings data; some NaN remain at the end since those
        #earnings are not out yet. We won't be using those rows.
        y_extra_info=y_extra_info.bfill()

        params = {
            'start_date': start_date,
            'end_date': end_date,
            'Info': Earnings_info,
            'X_all': X,
            'y_all': y_extra_info,
            's': ticker,
            'W': b,
            'model': model
        }

        y_pred = my_CorrelNowcast(**params)

        MRE[i] = mean_absolute_percentage_error(y_extra_info.loc[start_date:end_date,'EPS'], y_pred)

    cv_result[b]=np.mean(MRE)

    print('')


cv_result

Now in loop: nu=10, W=11...0123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687
CPU times: user 55min 24s, sys: 161 ms, total: 55min 24s
Wall time: 55min 25s


{11: 0.21839646295076245}

 cv_result saved as text here:{(10, 11): 0.2187638191270434,
 (10, 50): 0.22906451942601178,
 (10, 125): 0.31202466937432827,
 (10, 200): 0.3825881964232682,
 (10, 250): 0.33047717422058687,
 (10, 350): 0.3705547594743095,
 (100, 11): 0.21851150849725237,
 (100, 50): 0.22558408007391287,
 (100, 125): 0.26659903572682564,
 (100, 200): 0.2640797603053457,
 (100, 250): 0.2703969553958662,
 (100, 350): 0.29776815242467924,
 (1000, 11): 0.21763164046085368,
 (1000, 50): 0.22069437118027563,
 (1000, 125): 0.23746854866082315,
 (1000, 200): 0.22573256964428604,
 (1000, 250): 0.23435294423273242,
 (1000, 350): 0.25400878483123424,
 (3000, 11): 0.21708701575300682,
 (3000, 50): 0.21948953352799538,
 (3000, 125): 0.22872920667907815,
 (3000, 200): 0.2225562088940296,
 (3000, 250): 0.2264492982777717,
 (3000, 350): 0.24119754914558456,
 (5000, 11): 0.21686743840521164,
 (5000, 50): 0.21898640741750486,
 (5000, 125): 0.22568012060609888,
 (5000, 200): 0.22240374315178094,
 (5000, 250): 0.2248758424106235,
 (5000, 350): 0.23697964201516844,
 (10000, 11): 0.21669252019829163,
 (10000, 50): 0.21852854901945928,
 (10000, 125): 0.2232050513633187,
 (10000, 200): 0.22266263741956352,
 (10000, 250): 0.2246121264495324,
 (10000, 350): 0.2330335124678897}

In [34]:
# Cross-validation results pasted back in from the saved text output of the
# full grid search, so they can be inspected without re-running it.
# Keys are (nu, W) pairs; values are the corresponding MRE scores.
table_text = {(10, 11): 0.2187638191270434,
 (10, 50): 0.22906451942601178,
 (10, 125): 0.31202466937432827,
 (10, 200): 0.3825881964232682,
 (10, 250): 0.33047717422058687,
 (10, 350): 0.3705547594743095,
 (100, 11): 0.21851150849725237,
 (100, 50): 0.22558408007391287,
 (100, 125): 0.26659903572682564,
 (100, 200): 0.2640797603053457,
 (100, 250): 0.2703969553958662,
 (100, 350): 0.29776815242467924,
 (1000, 11): 0.21763164046085368,
 (1000, 50): 0.22069437118027563,
 (1000, 125): 0.23746854866082315,
 (1000, 200): 0.22573256964428604,
 (1000, 250): 0.23435294423273242,
 (1000, 350): 0.25400878483123424,
 (3000, 11): 0.21708701575300682,
 (3000, 50): 0.21948953352799538,
 (3000, 125): 0.22872920667907815,
 (3000, 200): 0.2225562088940296,
 (3000, 250): 0.2264492982777717,
 (3000, 350): 0.24119754914558456,
 (5000, 11): 0.21686743840521164,
 (5000, 50): 0.21898640741750486,
 (5000, 125): 0.22568012060609888,
 (5000, 200): 0.22240374315178094,
 (5000, 250): 0.2248758424106235,
 (5000, 350): 0.23697964201516844,
 (10000, 11): 0.21669252019829163,
 (10000, 50): 0.21852854901945928,
 (10000, 125): 0.2232050513633187,
 (10000, 200): 0.22266263741956352,
 (10000, 250): 0.2246121264495324,
 (10000, 350): 0.2330335124678897}

In [36]:
# Show each saved parameter combination next to its MRE, rounded to 4 decimals.
for combo, mre in table_text.items():
    print(combo, round(mre, 4))

(10, 11) 0.2188
(10, 50) 0.2291
(10, 125) 0.312
(10, 200) 0.3826
(10, 250) 0.3305
(10, 350) 0.3706
(100, 11) 0.2185
(100, 50) 0.2256
(100, 125) 0.2666
(100, 200) 0.2641
(100, 250) 0.2704
(100, 350) 0.2978
(1000, 11) 0.2176
(1000, 50) 0.2207
(1000, 125) 0.2375
(1000, 200) 0.2257
(1000, 250) 0.2344
(1000, 350) 0.254
(3000, 11) 0.2171
(3000, 50) 0.2195
(3000, 125) 0.2287
(3000, 200) 0.2226
(3000, 250) 0.2264
(3000, 350) 0.2412
(5000, 11) 0.2169
(5000, 50) 0.219
(5000, 125) 0.2257
(5000, 200) 0.2224
(5000, 250) 0.2249
(5000, 350) 0.237
(10000, 11) 0.2167
(10000, 50) 0.2185
(10000, 125) 0.2232
(10000, 200) 0.2227
(10000, 250) 0.2246
(10000, 350) 0.233


In [39]:
# Key of cv_result with the smallest MRE (run this if you've run the cv).
min(cv_result, key=lambda key: cv_result[key])

11

Best parameter combo with lowest MRE in cv: nu=10000, W=11

### Run model on test set

In [19]:
# Window size W handed to my_CorrelNowcast in the test runs below; 11 is the
# value the cross-validation above selected.
b = 11

In [20]:
from sklearn.linear_model import RidgeCV

In [21]:
# Regressor used by the next test run: RidgeCV with its default settings.
model = RidgeCV()

In [22]:
%%time
from sklearn.metrics import mean_absolute_percentage_error # need sklearn 0.24

# Out-of-sample evaluation of the estimator currently bound to `model`, with
# window size `b`: for every ticker, build the feature matrix, run the
# walk-forward nowcast over the test period, and score both the model and the
# analyst consensus by mean absolute percentage error.

#test data period - 4years (identical for every ticker, so set once up front)
start_date='2015-03-09' #start date at least 11 trading days after cv period
end_date='2019-03-11'

MRE_model=np.zeros(len(stock_list))
MRE_bbg_est=np.zeros(len(stock_list))
print('Index running...')
for i, ticker in enumerate(stock_list):
    print(i,'',end='') #visualize progress

    #rolling correlation of this ticker's price with every stock's price
    cor11=px[ticker].rolling(11).corr(px)
    cor50=px[ticker].rolling(50).corr(px)
    cor200=px[ticker].rolling(200).corr(px)

    #features: raw prices plus correlation-weighted moving averages (11/50/200)
    temp1=pd.merge(px,cor11*ma11, left_index=True, right_index=True, suffixes=['_px', '_11'])
    temp2=pd.merge(temp1,cor50*ma50, left_index=True, right_index=True, suffixes=[None, None] )
    X=pd.merge(temp2,cor200*ma200, left_index=True, right_index=True , suffixes=['_50', '_200'])
    X=X.dropna() #some top rows have NaN due to rolling average calculations

    #announcement date, actual EPS and consensus estimate for this ticker
    Earnings_info=pd.DataFrame({'Announcement_dt':earn_dates[ticker].dropna(),
                          'EPS':EPS[ticker].dropna(),
                          'est_EPS':est_EPS[ticker].dropna()})
    Earnings_info=Earnings_info.set_index('Announcement_dt')

    #X and y_extra_info shall have the same number of rows
    y_extra_info=pd.merge(pd.DataFrame(index=X.index),Earnings_info, left_index=True, right_index=True, how='left')
    #fill NaN with next earnings data; some NaN remain at the end since those
    #earnings are not out yet. We won't be using those rows.
    y_extra_info=y_extra_info.bfill()

    params = {
            'start_date': start_date,
            'end_date': end_date,
            'Info': Earnings_info,
            'X_all': X,
            'y_all': y_extra_info,
            's': ticker,
            'W': b,
            'model': model
        }

    y_pred=my_CorrelNowcast(**params)
    actual_EPS=y_extra_info.loc[start_date:end_date,'EPS']
    MRE_model[i]=mean_absolute_percentage_error(actual_EPS, y_pred)
    MRE_bbg_est[i]=mean_absolute_percentage_error(actual_EPS,
                                                  y_extra_info.loc[start_date:end_date,'est_EPS'])


print('\nTest period: {} to {}'.format(start_date, end_date))
print('MRE from model:', np.mean(MRE_model))
print('MRE from analyst estimates:', np.mean(MRE_bbg_est))




Index running...
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 
Test period: 2015-03-09 to 2019-03-11
MRE from model: 0.23680006074868834
MRE from analyst estimates: 0.11550951150632237
CPU times: user 17min 6s, sys: 208 ms, total: 17min 6s
Wall time: 17min 6s


In [23]:
# Regressor used by the next test run: LassoCV with its default settings.
model = LassoCV()

In [24]:
%%time
from sklearn.metrics import mean_absolute_percentage_error # need sklearn 0.24

# Out-of-sample evaluation of the estimator currently bound to `model`, with
# window size `b`: for every ticker, build the feature matrix, run the
# walk-forward nowcast over the test period, and score both the model and the
# analyst consensus by mean absolute percentage error.

#test data period - 4years (identical for every ticker, so set once up front)
start_date='2015-03-09' #start date at least 11 trading days after cv period
end_date='2019-03-11'

MRE_model=np.zeros(len(stock_list))
MRE_bbg_est=np.zeros(len(stock_list))
print('Index running...')
for i, ticker in enumerate(stock_list):
    print(i,'',end='') #visualize progress

    #rolling correlation of this ticker's price with every stock's price
    cor11=px[ticker].rolling(11).corr(px)
    cor50=px[ticker].rolling(50).corr(px)
    cor200=px[ticker].rolling(200).corr(px)

    #features: raw prices plus correlation-weighted moving averages (11/50/200)
    temp1=pd.merge(px,cor11*ma11, left_index=True, right_index=True, suffixes=['_px', '_11'])
    temp2=pd.merge(temp1,cor50*ma50, left_index=True, right_index=True, suffixes=[None, None] )
    X=pd.merge(temp2,cor200*ma200, left_index=True, right_index=True , suffixes=['_50', '_200'])
    X=X.dropna() #some top rows have NaN due to rolling average calculations

    #announcement date, actual EPS and consensus estimate for this ticker
    Earnings_info=pd.DataFrame({'Announcement_dt':earn_dates[ticker].dropna(),
                          'EPS':EPS[ticker].dropna(),
                          'est_EPS':est_EPS[ticker].dropna()})
    Earnings_info=Earnings_info.set_index('Announcement_dt')

    #X and y_extra_info shall have the same number of rows
    y_extra_info=pd.merge(pd.DataFrame(index=X.index),Earnings_info, left_index=True, right_index=True, how='left')
    #fill NaN with next earnings data; some NaN remain at the end since those
    #earnings are not out yet. We won't be using those rows.
    y_extra_info=y_extra_info.bfill()

    params = {
            'start_date': start_date,
            'end_date': end_date,
            'Info': Earnings_info,
            'X_all': X,
            'y_all': y_extra_info,
            's': ticker,
            'W': b,
            'model': model
        }

    y_pred=my_CorrelNowcast(**params)
    actual_EPS=y_extra_info.loc[start_date:end_date,'EPS']
    MRE_model[i]=mean_absolute_percentage_error(actual_EPS, y_pred)
    MRE_bbg_est[i]=mean_absolute_percentage_error(actual_EPS,
                                                  y_extra_info.loc[start_date:end_date,'est_EPS'])


print('\nTest period: {} to {}'.format(start_date, end_date))
print('MRE from model:', np.mean(MRE_model))
print('MRE from analyst estimates:', np.mean(MRE_bbg_est))



Index running...
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 
Test period: 2015-03-09 to 2019-03-11
MRE from model: 0.2368778361217034
MRE from analyst estimates: 0.11550951150632237
CPU times: user 1h 49min, sys: 200 ms, total: 1h 49min
Wall time: 1h 49min 1s


In [25]:
from sklearn.ensemble import RandomForestRegressor

In [26]:
# Random forests are stochastic; fix the seed so the reported test MRE can be
# reproduced on a re-run.
model = RandomForestRegressor(random_state=0)
b = 11  # window size W passed to my_CorrelNowcast

In [27]:
%%time
from sklearn.metrics import mean_absolute_percentage_error # need sklearn 0.24

# Out-of-sample evaluation of the estimator currently bound to `model`, with
# window size `b`: for every ticker, build the feature matrix, run the
# walk-forward nowcast over the test period, and score both the model and the
# analyst consensus by mean absolute percentage error.

#test data period - 4years (identical for every ticker, so set once up front)
start_date='2015-03-09' #start date at least 11 trading days after cv period
end_date='2019-03-11'

MRE_model=np.zeros(len(stock_list))
MRE_bbg_est=np.zeros(len(stock_list))
print('Index running...')
for i, ticker in enumerate(stock_list):
    print(i,'',end='') #visualize progress

    #rolling correlation of this ticker's price with every stock's price
    cor11=px[ticker].rolling(11).corr(px)
    cor50=px[ticker].rolling(50).corr(px)
    cor200=px[ticker].rolling(200).corr(px)

    #features: raw prices plus correlation-weighted moving averages (11/50/200)
    temp1=pd.merge(px,cor11*ma11, left_index=True, right_index=True, suffixes=['_px', '_11'])
    temp2=pd.merge(temp1,cor50*ma50, left_index=True, right_index=True, suffixes=[None, None] )
    X=pd.merge(temp2,cor200*ma200, left_index=True, right_index=True , suffixes=['_50', '_200'])
    X=X.dropna() #some top rows have NaN due to rolling average calculations

    #announcement date, actual EPS and consensus estimate for this ticker
    Earnings_info=pd.DataFrame({'Announcement_dt':earn_dates[ticker].dropna(),
                          'EPS':EPS[ticker].dropna(),
                          'est_EPS':est_EPS[ticker].dropna()})
    Earnings_info=Earnings_info.set_index('Announcement_dt')

    #X and y_extra_info shall have the same number of rows
    y_extra_info=pd.merge(pd.DataFrame(index=X.index),Earnings_info, left_index=True, right_index=True, how='left')
    #fill NaN with next earnings data; some NaN remain at the end since those
    #earnings are not out yet. We won't be using those rows.
    y_extra_info=y_extra_info.bfill()

    params = {
            'start_date': start_date,
            'end_date': end_date,
            'Info': Earnings_info,
            'X_all': X,
            'y_all': y_extra_info,
            's': ticker,
            'W': b,
            'model': model
        }

    y_pred=my_CorrelNowcast(**params)
    actual_EPS=y_extra_info.loc[start_date:end_date,'EPS']
    MRE_model[i]=mean_absolute_percentage_error(actual_EPS, y_pred)
    MRE_bbg_est[i]=mean_absolute_percentage_error(actual_EPS,
                                                  y_extra_info.loc[start_date:end_date,'est_EPS'])


print('\nTest period: {} to {}'.format(start_date, end_date))
print('MRE from model:', np.mean(MRE_model))
print('MRE from analyst estimates:', np.mean(MRE_bbg_est))


Index running...
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 
Test period: 2015-03-09 to 2019-03-11
MRE from model: 0.23677090315990598
MRE from analyst estimates: 0.11550951150632237
CPU times: user 25min 53s, sys: 1.14 s, total: 25min 54s
Wall time: 25min 54s


In [28]:
from sklearn.ensemble import GradientBoostingRegressor

In [29]:
# Fix the seed so the reported test MRE can be reproduced on a re-run.
model = GradientBoostingRegressor(random_state=0)

In [30]:
%%time
# Out-of-sample evaluation of the estimator currently bound to `model`, with
# window size `b`: for every ticker, build the feature matrix, run the
# walk-forward nowcast over the test period, and score both the model and the
# analyst consensus by mean absolute percentage error.

#test data period - 4years (identical for every ticker, so set once up front)
start_date='2015-03-09' #start date at least 11 trading days after cv period
end_date='2019-03-11'

MRE_model=np.zeros(len(stock_list))
MRE_bbg_est=np.zeros(len(stock_list))
print('Index running...')
for i, ticker in enumerate(stock_list):
    print(i,'',end='') #visualize progress

    #rolling correlation of this ticker's price with every stock's price
    cor11=px[ticker].rolling(11).corr(px)
    cor50=px[ticker].rolling(50).corr(px)
    cor200=px[ticker].rolling(200).corr(px)

    #features: raw prices plus correlation-weighted moving averages (11/50/200)
    temp1=pd.merge(px,cor11*ma11, left_index=True, right_index=True, suffixes=['_px', '_11'])
    temp2=pd.merge(temp1,cor50*ma50, left_index=True, right_index=True, suffixes=[None, None] )
    X=pd.merge(temp2,cor200*ma200, left_index=True, right_index=True , suffixes=['_50', '_200'])
    X=X.dropna() #some top rows have NaN due to rolling average calculations

    #announcement date, actual EPS and consensus estimate for this ticker
    Earnings_info=pd.DataFrame({'Announcement_dt':earn_dates[ticker].dropna(),
                          'EPS':EPS[ticker].dropna(),
                          'est_EPS':est_EPS[ticker].dropna()})
    Earnings_info=Earnings_info.set_index('Announcement_dt')

    #X and y_extra_info shall have the same number of rows
    y_extra_info=pd.merge(pd.DataFrame(index=X.index),Earnings_info, left_index=True, right_index=True, how='left')
    #fill NaN with next earnings data; some NaN remain at the end since those
    #earnings are not out yet. We won't be using those rows.
    y_extra_info=y_extra_info.bfill()

    params = {
            'start_date': start_date,
            'end_date': end_date,
            'Info': Earnings_info,
            'X_all': X,
            'y_all': y_extra_info,
            's': ticker,
            'W': b,
            'model': model
        }

    y_pred=my_CorrelNowcast(**params)
    actual_EPS=y_extra_info.loc[start_date:end_date,'EPS']
    MRE_model[i]=mean_absolute_percentage_error(actual_EPS, y_pred)
    MRE_bbg_est[i]=mean_absolute_percentage_error(actual_EPS,
                                                  y_extra_info.loc[start_date:end_date,'est_EPS'])


print('\nTest period: {} to {}'.format(start_date, end_date))
print('MRE from model:', np.mean(MRE_model))
print('MRE from analyst estimates:', np.mean(MRE_bbg_est))



Index running...
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 
Test period: 2015-03-09 to 2019-03-11
MRE from model: 0.23672570355170766
MRE from analyst estimates: 0.11550951150632237
CPU times: user 17min 30s, sys: 204 ms, total: 17min 30s
Wall time: 17min 30s


In [31]:
from sklearn.linear_model import LinearRegression

In [32]:
# Regressor used by the next test run: ordinary least squares with default settings.
model = LinearRegression()

In [33]:
%%time
# Out-of-sample evaluation of the estimator currently bound to `model`, with
# window size `b`: for every ticker, build the feature matrix, run the
# walk-forward nowcast over the test period, and score both the model and the
# analyst consensus by mean absolute percentage error.

#test data period - 4years (identical for every ticker, so set once up front)
start_date='2015-03-09' #start date at least 11 trading days after cv period
end_date='2019-03-11'

MRE_model=np.zeros(len(stock_list))
MRE_bbg_est=np.zeros(len(stock_list))
print('Index running...')
for i, ticker in enumerate(stock_list):
    print(i,'',end='') #visualize progress

    #rolling correlation of this ticker's price with every stock's price
    cor11=px[ticker].rolling(11).corr(px)
    cor50=px[ticker].rolling(50).corr(px)
    cor200=px[ticker].rolling(200).corr(px)

    #features: raw prices plus correlation-weighted moving averages (11/50/200)
    temp1=pd.merge(px,cor11*ma11, left_index=True, right_index=True, suffixes=['_px', '_11'])
    temp2=pd.merge(temp1,cor50*ma50, left_index=True, right_index=True, suffixes=[None, None] )
    X=pd.merge(temp2,cor200*ma200, left_index=True, right_index=True , suffixes=['_50', '_200'])
    X=X.dropna() #some top rows have NaN due to rolling average calculations

    #announcement date, actual EPS and consensus estimate for this ticker
    Earnings_info=pd.DataFrame({'Announcement_dt':earn_dates[ticker].dropna(),
                          'EPS':EPS[ticker].dropna(),
                          'est_EPS':est_EPS[ticker].dropna()})
    Earnings_info=Earnings_info.set_index('Announcement_dt')

    #X and y_extra_info shall have the same number of rows
    y_extra_info=pd.merge(pd.DataFrame(index=X.index),Earnings_info, left_index=True, right_index=True, how='left')
    #fill NaN with next earnings data; some NaN remain at the end since those
    #earnings are not out yet. We won't be using those rows.
    y_extra_info=y_extra_info.bfill()

    params = {
            'start_date': start_date,
            'end_date': end_date,
            'Info': Earnings_info,
            'X_all': X,
            'y_all': y_extra_info,
            's': ticker,
            'W': b,
            'model': model
        }

    y_pred=my_CorrelNowcast(**params)
    actual_EPS=y_extra_info.loc[start_date:end_date,'EPS']
    MRE_model[i]=mean_absolute_percentage_error(actual_EPS, y_pred)
    MRE_bbg_est[i]=mean_absolute_percentage_error(actual_EPS,
                                                  y_extra_info.loc[start_date:end_date,'est_EPS'])


print('\nTest period: {} to {}'.format(start_date, end_date))
print('MRE from model:', np.mean(MRE_model))
print('MRE from analyst estimates:', np.mean(MRE_bbg_est))




Index running...
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 
Test period: 2015-03-09 to 2019-03-11
MRE from model: 0.2442538560345735
MRE from analyst estimates: 0.11550951150632237
CPU times: user 1h 8min 57s, sys: 54.2 s, total: 1h 9min 51s
Wall time: 17min 3s


The MRE from the model is actually higher than the MRE from analyst estimates. This does not support the paper's claim.