In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import pickle

### Read Data

In [2]:
# Load the pre-built data splits. The loop further down reads this object as
# subsets[year]['train'] / subsets[year]['test'].
# NOTE(security): pickle.load can execute arbitrary code embedded in the file,
# so this path must only ever point at a trusted, locally produced artifact.
# The context manager closes the handle even if unpickling raises.
with open('../data/subsets.pkl', 'rb') as file:
    subsets = pickle.load(file)

### Train Logistic Regression

In [3]:
# Columns that are dropped from the frames before the feature matrix is built.
non_features = [
    # identifiers and dates
    'gvkey', 'datacqtr', 'cusip', 'tic',
    'announcement_date', 'analyst_date',
    # actual EPS columns
    'nq_eps_actual_direction', 'nq_eps_actual_change',
    # analyst mean columns
    'nq_eps_predicted_mean_direction', 'nq_eps_predicted_mean_change',
    # analyst median columns
    'nq_eps_predicted_median_direction', 'nq_eps_predicted_median_change',
]

In [4]:
# Column-oriented accumulator: one independent, initially empty list per output
# column. Key order is preserved, so it fixes the column order of any frame
# built from this dict.
predictions = {
    column: []
    for column in (
        'datacqtr', 'gvkey', 'cusip', 'tic',
        'announcement_date', 'analyst_date',
        'direction_pred', 'value_pred',
        'direction_actual', 'value_actual',
        'direction_analyst_mean', 'value_analyst_mean',
        'direction_analyst_median', 'value_analyst_median',
    )
}

# One metrics dict is appended here per evaluated year.
results = list()

In [5]:
# For every key of `subsets` (visited in sorted order): fit a fresh
# imputer -> scaler -> logistic regression on that key's 'train' frame, score it
# on the matching 'test' frame, and append both the metrics and the per-row
# predictions to the accumulators defined earlier.
for fold_no, year in enumerate(sorted(subsets.keys()), start=1):

    fold = subsets[year]
    df_train, df_test = fold['train'], fold['test']

    # feature matrices: drop the non-feature columns, replace missing values with 0
    # (the imputer is fitted on the training frame and reused on the test frame)
    imputer = SimpleImputer(strategy='constant', fill_value=0)
    X_train = imputer.fit_transform(df_train.drop(non_features, axis=1))
    X_test = imputer.transform(df_test.drop(non_features, axis=1))

    # standardize; the scaler is fitted on the training split only
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # labels
    y_train = np.array(df_train['nq_eps_actual_direction'])
    y_test = np.array(df_test['nq_eps_actual_direction'])

    # fit the classifier
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # hard labels plus column 1 of predict_proba as the score
    y_score = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)

    # per-year metrics
    fold_metrics = {
        'year': year,
        'accuracy': accuracy_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred, average='macro'),
        'auc': roc_auc_score(y_test, y_score),
    }
    results.append(fold_metrics)

    # model outputs and the true labels
    predictions['direction_pred'].extend(y_pred)
    predictions['value_pred'].extend(y_score)
    predictions['direction_actual'].extend(y_test)

    # columns copied straight from the test frame: output key -> source column
    passthrough = {
        'value_actual': 'nq_eps_actual_change',
        'direction_analyst_mean': 'nq_eps_predicted_mean_direction',
        'value_analyst_mean': 'nq_eps_predicted_mean_change',
        'direction_analyst_median': 'nq_eps_predicted_median_direction',
        'value_analyst_median': 'nq_eps_predicted_median_change',
        'datacqtr': 'datacqtr',
        'gvkey': 'gvkey',
        'cusip': 'cusip',
        'tic': 'tic',
        'announcement_date': 'announcement_date',
        'analyst_date': 'analyst_date',
    }
    for out_key, src_col in passthrough.items():
        predictions[out_key].extend(df_test[src_col])

    print(fold_no, results[-1])

1 {'year': '2013', 'accuracy': 0.6750590086546027, 'f1': 0.6749730738469741, 'auc': 0.7339150062223547}
2 {'year': '2014', 'accuracy': 0.6993110236220472, 'f1': 0.6982232009605895, 'auc': 0.7630745453107222}
3 {'year': '2015', 'accuracy': 0.7057594054807246, 'f1': 0.7056037893711011, 'auc': 0.7804733925676309}
4 {'year': '2016', 'accuracy': 0.7037930278447709, 'f1': 0.7037848260359689, 'auc': 0.7663873953840297}
5 {'year': '2017', 'accuracy': 0.6962103955270242, 'f1': 0.6944744508123069, 'auc': 0.7609348396956702}
6 {'year': '2018', 'accuracy': 0.6889241133346542, 'f1': 0.6883479802651865, 'auc': 0.7599006006424418}
7 {'year': '2019', 'accuracy': 0.6961494471978651, 'f1': 0.6961240068940525, 'auc': 0.7535334507758765}


In [6]:
# Turn the accumulated per-column lists into a single frame and pickle it to disk.
predictions_df = pd.DataFrame(data=predictions)
predictions_df.to_pickle(path='../results/lr_predictions.pkl')