In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import lightgbm as lgb
import pickle
import copy

### Read Data

In [2]:
# Load the pickled `subsets` object. Later cells index it as
# subsets[year]['train'] / subsets[year]['test'].
# The context manager closes the file handle even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code during deserialization;
# only load this file when it comes from a trusted source.
with open('../data/subsets.pkl', 'rb') as file:
    subsets = pickle.load(file)

### Add Relative Earnings Change

In [3]:
def assign_ranks(group, col, num_ranks):
    """Sort ``group`` by ``col`` in descending order and label rows by bucket.

    The sorted rows are cut into ``num_ranks`` consecutive buckets numbered
    from 1. Buckets ``1 .. num_ranks - 1`` each receive
    ``len(group) // num_ranks`` rows; the last bucket receives every
    remaining row (so it may be larger than the others).

    Parameters
    ----------
    group : object supporting ``sort_values(col, ascending=False)`` and
        column assignment (called with DataFrame groups by ``df_rank``).
    col : str
        Name of the column to sort by; also the prefix of the new column.
    num_ranks : int
        Number of buckets.

    Returns
    -------
    The sorted result of ``group.sort_values`` with an extra column named
    ``col + '_rank'`` holding the bucket number of each row.
    """
    ordered = group.sort_values(col, ascending=False)
    total = len(ordered)
    per_bucket = total // num_ranks

    # Every bucket except the last one has exactly `per_bucket` rows.
    labels = [bucket for bucket in range(1, num_ranks) for _ in range(per_bucket)]

    # The last bucket absorbs whatever is left so that each row gets a label.
    if num_ranks >= 1:
        labels.extend([num_ranks] * (total - len(labels)))

    ordered[col + '_rank'] = labels
    return ordered

In [4]:
def df_rank(df, cols, num_ranks):
    """Add a zero-based ``*_direction`` bucket column for each column in ``cols``.

    For every ``col`` in ``cols`` the rows are grouped by ``'datacqtr'`` and
    each group is passed to ``assign_ranks``, which sorts it and adds a
    ``f'{col}_rank'`` column. The rank minus one is then stored under the name
    obtained by replacing the last ``_``-separated token of ``col`` with
    ``direction`` (e.g. ``nq_eps_actual_change`` -> ``nq_eps_actual_direction``).

    Groups are ranked one at a time and concatenated explicitly instead of
    going through ``groupby(...).apply(...)``. ``apply`` operating on the
    grouping column is deprecated in recent pandas releases; once the grouping
    column is excluded, ``'datacqtr'`` would be lost after resetting the index.
    Iterating the groups keeps ``'datacqtr'`` in every group frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain ``'datacqtr'`` and every column in ``cols``.
        It is not modified.
    cols : iterable of str
        Columns to bucket.
    num_ranks : int
        Number of buckets, forwarded to ``assign_ranks``.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh integer index. Rows are ordered by
        ``'datacqtr'`` group and, within a group, as returned by
        ``assign_ranks`` for the last column processed. Every column whose
        name contains ``'_rank'`` is removed before returning.
    """
    df_ranked = df
    for col in cols:
        # Groups come out in sorted key order, the same order apply() used.
        ranked_groups = [
            assign_ranks(group, col, num_ranks=num_ranks)
            for _, group in df_ranked.groupby('datacqtr')
        ]
        df_ranked = pd.concat(ranked_groups, ignore_index=True)

        # rpartition('_')[0] is everything before the last underscore.
        df_ranked[col.rpartition('_')[0] + '_direction'] = df_ranked[f'{col}_rank'] - 1

    # Substring match: removes the helper rank columns and would also remove
    # any input column whose name contains '_rank'.
    return df_ranked.drop(columns=[x for x in df_ranked.columns if '_rank' in x])

### Train LightGBM Classifier

In [5]:
# Columns handed to df_rank to be bucketed into classes.
columns_to_rank = [
    'nq_eps_actual_change',
    'nq_eps_predicted_mean_change',
    'nq_eps_predicted_median_change',
]

# Columns dropped from the train/test frames before building feature matrices.
non_features = [
    # key and date columns (judging by their names)
    'gvkey',
    'datacqtr',
    'cusip',
    'tic',
    'announcement_date',
    'analyst_date',
    # raw change columns and the direction columns derived from them
    'nq_eps_actual_direction',
    'nq_eps_actual_change',
    'nq_eps_predicted_mean_direction',
    'nq_eps_predicted_mean_change',
    'nq_eps_predicted_median_direction',
    'nq_eps_predicted_median_change',
]

In [6]:
# Empty template for the prediction table: one independent empty list per
# output column, in the column order used when the table is built.
predictions = {
    column: []
    for column in (
        'datacqtr', 'gvkey', 'cusip', 'tic',
        'announcement_date', 'analyst_date',
        'direction_pred', 'value_pred',
        'direction_actual', 'value_actual',
        'direction_analyst_mean', 'value_analyst_mean',
        'direction_analyst_median', 'value_analyst_median',
    )
}

In [7]:
# Output column -> df_test column copied through unchanged for every test row.
passthrough_columns = [
    ('value_actual', 'nq_eps_actual_change'),
    ('direction_analyst_mean', 'nq_eps_predicted_mean_direction'),
    ('value_analyst_mean', 'nq_eps_predicted_mean_change'),
    ('direction_analyst_median', 'nq_eps_predicted_median_direction'),
    ('value_analyst_median', 'nq_eps_predicted_median_change'),
    ('datacqtr', 'datacqtr'),
    ('gvkey', 'gvkey'),
    ('cusip', 'cusip'),
    ('tic', 'tic'),
    ('announcement_date', 'announcement_date'),
    ('analyst_date', 'analyst_date'),
]

# For each class count: fit one LightGBM multiclass model per subset key,
# record accuracy / macro-F1, and collect the per-row predictions, which are
# pickled once all subsets for that class count are done.
for num_classes in [3,6,9]:
    results = []
    # Fresh copy so the lists of the shared template are never filled.
    predictions_class = copy.deepcopy(predictions)

    for i, year in enumerate(sorted(subsets.keys())):
        # Bucket the target columns into `num_classes` classes, separately
        # for the train and the test split.
        df_train = df_rank(subsets[year]['train'], columns_to_rank, num_classes)
        df_test = df_rank(subsets[year]['test'], columns_to_rank, num_classes)

        # Feature matrices: everything that is not listed in non_features.
        # NOTE(review): column names are discarded here, so train and test are
        # assumed to share the same column order; confirm upstream.
        train_features = np.array(df_train.drop(columns=non_features))
        test_features = np.array(df_test.drop(columns=non_features))

        # Labels: the zero-based bucket of the actual change.
        train_labels = np.array(df_train['nq_eps_actual_direction'])
        test_labels = np.array(df_test['nq_eps_actual_direction'])

        # Fit the model.
        params = {
            'objective': 'multiclass',
            'num_class': num_classes,
            'metric': 'multi_logloss',
            'verbose': -1
        }
        train_data = lgb.Dataset(train_features, label=train_labels)
        model = lgb.train(params, train_data)

        # Predicted class = column with the highest score; only the class-0
        # column is kept as the stored prediction value.
        cb_predict_proba = model.predict(test_features)
        cb_predict = np.argmax(cb_predict_proba, axis=1)
        cb_predict_proba = cb_predict_proba[:,0]

        # Metrics for this subset.
        year_metrics = {'year': year,
                        'accuracy': accuracy_score(test_labels, cb_predict),
                        'f1': f1_score(test_labels, cb_predict, average='macro')}
        results.append(year_metrics)

        # Model output and true labels.
        predictions_class['direction_pred'].extend(cb_predict)
        predictions_class['value_pred'].extend(cb_predict_proba)
        predictions_class['direction_actual'].extend(test_labels)

        # Columns copied straight from the ranked test frame.
        for out_key, test_col in passthrough_columns:
            predictions_class[out_key].extend(df_test[test_col])

        print(i+1, year_metrics)

    predictions_df = pd.DataFrame(predictions_class)
    predictions_df.to_pickle(f'../results/xinyue_predictions_{num_classes}.pkl')

1 {'year': '2013', 'accuracy': 0.6304746918436926, 'f1': 0.6281976808084239}
2 {'year': '2014', 'accuracy': 0.6279527559055118, 'f1': 0.6266438782845873}
3 {'year': '2015', 'accuracy': 0.6274965164886205, 'f1': 0.6269269461772167}
4 {'year': '2016', 'accuracy': 0.6281517211137908, 'f1': 0.626931439073262}
5 {'year': '2017', 'accuracy': 0.6376061296334645, 'f1': 0.6356358953810233}
6 {'year': '2018', 'accuracy': 0.6336437487616405, 'f1': 0.6334094589145207}
7 {'year': '2019', 'accuracy': 0.6338162409454823, 'f1': 0.6329453498423747}
1 {'year': '2013', 'accuracy': 0.43797534749541045, 'f1': 0.43124199854345663}
2 {'year': '2014', 'accuracy': 0.42076771653543305, 'f1': 0.41260107716308836}
3 {'year': '2015', 'accuracy': 0.4335810496980957, 'f1': 0.42433050587599}
4 {'year': '2016', 'accuracy': 0.4170138127603596, 'f1': 0.40490887869467307}
5 {'year': '2017', 'accuracy': 0.43466556222820457, 'f1': 0.4238637294370989}
6 {'year': '2018', 'accuracy': 0.4374876164057856, 'f1': 0.42735330290559