In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import pickle

### Read Data

In [2]:
# Load the train/test subsets (indexed below as subsets[year]['train'|'test']).
# The context manager closes the handle even if unpickling raises.
# NOTE(security): pickle.load can execute arbitrary code -- only use it on
# files produced by this project, never on untrusted input.
with open('../data/subsets.pkl', 'rb') as file:
    subsets = pickle.load(file)

In [3]:
# Load the fitted models (looked up below as models_by_year[year]).
# The context manager closes the handle even if unpickling raises.
# NOTE(security): pickle.load can execute arbitrary code -- only use it on
# files produced by this project, never on untrusted input.
with open('../model/models_by_year.pkl', 'rb') as file:
    models_by_year = pickle.load(file)

# Show which keys a model is available for.
print(models_by_year.keys())

dict_keys(['2013', '2014', '2015', '2016', '2017', '2018', '2019'])


### Test model

In [4]:
# Columns dropped from the data frames before they are turned into feature
# matrices (see the `.drop(non_features, axis=1)` calls in the evaluation loop).
non_features = [
    # row identifiers and dates
    'gvkey',
    'datacqtr',
    'cusip',
    'tic',
    'announcement_date',
    'analyst_date',
    # actual next-quarter EPS outcome (direction is used as the label)
    'nq_eps_actual_direction',
    'nq_eps_actual_change',
    # analyst mean forecast
    'nq_eps_predicted_mean_direction',
    'nq_eps_predicted_mean_change',
    # analyst median forecast
    'nq_eps_predicted_median_direction',
    'nq_eps_predicted_median_change',
    # metadata carried into the predictions table but not used as features
    'mkvaltq',
    'gsector',
]

In [5]:
# Column-wise accumulator for the predictions table: one empty list per output
# column, extended year by year in the evaluation loop. Key order is the column
# order of the DataFrame built from this dict later on.
predictions = {
    column: []
    for column in (
        'datacqtr', 'gvkey', 'cusip', 'tic', 'mkvaltq', 'gsector', 'atq',
        'announcement_date', 'analyst_date',
        'direction_pred', 'value_pred',
        'direction_actual', 'value_actual',
        'direction_analyst_mean', 'value_analyst_mean',
        'direction_analyst_median', 'value_analyst_median',
    )
}

# One metrics dict per evaluated year, in evaluation order.
results = []
# Per-year artefacts (model, feature matrices, ...) keyed by year.
splits = {}

In [6]:
# Evaluate each stored model on that year's test subset.
#
# For every year (in sorted key order) this:
#   * builds feature matrices by dropping `non_features`,
#   * scores the test rows with the model stored for that year,
#   * appends accuracy / macro-F1 / AUC to `results`,
#   * appends the year's rows to the global `predictions` accumulator,
#   * stores the year's artefacts in `splits[year]`.
#
# NOTE: `predictions` and `results` are appended to, so re-running this cell
# without re-running the initialisation cell duplicates rows.
for i, year in enumerate(sorted(subsets.keys())):

    # split into training and testing
    df_train = subsets[year]['train']
    df_test = subsets[year]['test']

    # Drop the non-feature columns once and derive both the test matrix and
    # the feature names from the same frame.
    test_frame = df_test.drop(non_features, axis = 1)

    # create train and test features
    train_features = np.array(df_train.drop(non_features, axis = 1))
    test_features = np.array(test_frame)

    # create train and test labels
    # (train_labels is not used by the evaluation below; it is kept so the
    # name stays defined for anything that runs after this cell)
    train_labels = np.array(df_train['nq_eps_actual_direction'])
    test_labels = np.array(df_test['nq_eps_actual_direction'])

    # create feature names
    feature_names = list(test_frame.columns)

    # load model
    model = models_by_year[year]

    # make predictions
    # NOTE(review): assumes model.predict returns positive-class scores in
    # [0, 1] (as a LightGBM Booster does) -- the 0.5 threshold and the AUC
    # below depend on that; confirm against the training notebook.
    cb_predict_proba = model.predict(test_features)
    cb_predict = (cb_predict_proba>0.5).astype(int)

    # results
    results.append({'year': year,
                    'accuracy': accuracy_score(test_labels, cb_predict),
                    'f1': f1_score(test_labels, cb_predict, average='macro'),
                    'auc': roc_auc_score(test_labels, cb_predict_proba)})

    # predictions for THIS year only, using the same keys (and key order) as
    # the global `predictions` accumulator
    year_predictions = {
        'datacqtr': list(df_test['datacqtr']),
        'gvkey': list(df_test['gvkey']),
        'cusip': list(df_test['cusip']),
        'tic': list(df_test['tic']),
        'mkvaltq': list(df_test['mkvaltq']),
        'gsector': list(df_test['gsector']),
        'atq': list(df_test['atq_current']),
        'announcement_date': list(df_test['announcement_date']),
        'analyst_date': list(df_test['analyst_date']),
        'direction_pred': list(cb_predict),
        'value_pred': list(cb_predict_proba),
        'direction_actual': list(test_labels),
        'value_actual': list(df_test['nq_eps_actual_change']),
        'direction_analyst_mean': list(df_test['nq_eps_predicted_mean_direction']),
        'value_analyst_mean': list(df_test['nq_eps_predicted_mean_change']),
        'direction_analyst_median': list(df_test['nq_eps_predicted_median_direction']),
        'value_analyst_median': list(df_test['nq_eps_predicted_median_change']),
    }

    # extend the all-years accumulator (elements are copied into the existing
    # lists, so the per-year lists stay independent of it)
    for column, values in year_predictions.items():
        predictions[column] += values

    #  splits data
    # Store the per-year predictions here. Previously every entry pointed at
    # the shared global `predictions` dict, so all years aliased one object
    # that ended up holding the rows of every year.
    split_data = {}
    split_data['model'] = model
    split_data['train_features'] = train_features
    split_data['test_features'] = test_features
    split_data['feature_names'] = feature_names
    split_data['predictions'] = year_predictions

    splits[year] = split_data

    print(i+1, results[-1])

1 {'year': '2013', 'accuracy': 0.7450826121164438, 'f1': 0.7437691331292466, 'auc': 0.82391674461856}
2 {'year': '2014', 'accuracy': 0.749507874015748, 'f1': 0.748631223654944, 'auc': 0.8353993781298868}
3 {'year': '2015', 'accuracy': 0.7519739897816999, 'f1': 0.7519705652977287, 'auc': 0.8370062984532708}
4 {'year': '2016', 'accuracy': 0.7544398158298619, 'f1': 0.7538619284683155, 'auc': 0.836826439582014}
5 {'year': '2017', 'accuracy': 0.745288879685235, 'f1': 0.7437600819184997, 'auc': 0.8273249687269688}
6 {'year': '2018', 'accuracy': 0.7307311274024173, 'f1': 0.7305888077079031, 'auc': 0.8219471607788794}
7 {'year': '2019', 'accuracy': 0.74285169653069, 'f1': 0.7427525291161472, 'auc': 0.8273055366600968}


### Save data

In [7]:
# Persist the per-year artefacts. The context manager flushes and closes the
# file deterministically; passing a bare open(...) to pickle.dump leaves the
# handle open until it happens to be garbage-collected.
with open('../data/splits_data.pkl', 'wb') as splits_file:
    pickle.dump(splits, splits_file)

In [8]:
# Turn the column-wise accumulator into a table: each key of `predictions`
# becomes a column, each list position a row.
predictions_df = pd.DataFrame.from_dict(predictions)

In [13]:
# Share of rows whose analyst-mean change lies outside [-1, 1], i.e. the rows
# that the clipping step in the next cell will alter. Missing values compare
# False on both sides, so they are not counted as out of range.
analyst_mean_change = predictions_df['value_analyst_mean']
below_range = analyst_mean_change < -1
above_range = analyst_mean_change > 1
(below_range | above_range).sum() / len(predictions_df)

0.13456348459172787

In [14]:
# Clip both analyst change columns to [-1, 1] and map them linearly onto
# [0, 1] (-1 -> 0, 0 -> 0.5, 1 -> 1). Missing values stay missing.
# NOTE: this overwrites the columns in place, so running the cell a second
# time rescales the already-rescaled values again.
for analyst_column in ('value_analyst_mean', 'value_analyst_median'):
    clipped = predictions_df[analyst_column].clip(lower=-1, upper=1)
    predictions_df[analyst_column] = (clipped + 1) / 2

# Row count of the final predictions table.
len(predictions_df)

31866

In [15]:
# Write the final predictions table (with the rescaled analyst columns) to disk.
predictions_df.to_pickle(path='../results/lgbm_predictions.pkl')