In [64]:
import pandas as pd
from dateutil import relativedelta, parser
from pandas.tseries.offsets import BDay 
from datetime import datetime, date, time
import numpy as np
import pickle
import time
import re
import matplotlib.pyplot as plt
import pandas_datareader.data as reader
from sklearn.model_selection import train_test_split
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.metrics import classification_report

import os
from torch.utils.tensorboard import SummaryWriter

In [2]:
# Mount Google Drive at /content/drive so files stored there can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Root folder (on the mounted Google Drive) that holds the notebook's data files.
# A single assignment is used: the value is fixed, not derived from the working directory.
path_root = "/content/drive/MyDrive/Colab Notebooks"
print(path_root)

/content/drive/MyDrive/Colab Notebooks


In [None]:
# Load the prepared feature table from a pickle under path_root and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df = pd.read_pickle(f"{path_root}/with_all_ft_clean.pkl")
df.head()

In [5]:
def label1(row):
    """Return 1 when the row's "pct_change1" is greater than its "mkt_excess1", else 0."""
    exceeds = row["pct_change1"] > row["mkt_excess1"]
    return 1 if exceeds else 0

def label2(row):
    """Return 1 when the row's "pct_change2" is greater than its "mkt_excess2", else 0."""
    exceeds = row["pct_change2"] > row["mkt_excess2"]
    return 1 if exceeds else 0

In [6]:
# Build both binary targets with vectorised column comparisons: 1 where the
# pct_change column is greater than the matching mkt_excess column, else 0.
# This gives the same values as applying label1/label2 row by row, without the
# per-row Python call overhead of DataFrame.apply(axis=1).
df["label1"] = (df["pct_change1"] > df["mkt_excess1"]).astype(int)
df["label2"] = (df["pct_change2"] > df["mkt_excess2"]).astype(int)

In [7]:
# Hold out 20% of the rows as the test set, stratified on label2.
# Only index labels are split here; feature rows are looked up from df later.
X_train, X_test, y_train, y_test = train_test_split(
    df.index,
    df["label2"],
    test_size=0.2,
    stratify=df["label2"],
    random_state=20,
)

# Carve a validation set (20% of the remaining rows) out of the training portion.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=20,
)

In [8]:
# Collect the column names of each categorical feature group by name prefix.
# str.startswith states the intent directly; `not name.find(prefix)` only works
# because find() returns 0 (falsy) for a match at position 0 and -1 (truthy) otherwise.
item_list = [col for col in df.columns if col.startswith("item")]
ind_list = [col for col in df.columns if col.startswith("ind")]
shrcd_list = [col for col in df.columns if col.startswith("SHRCD")]
exc_list = [col for col in df.columns if col.startswith("EXCHCD")]

In [9]:
# Full feature list: the two numeric columns followed by every categorical group.
cols = ['VIX', "Mkt_Cap"] + item_list + ind_list + shrcd_list + exc_list

In [10]:
# Look up the feature columns for each partition by its index labels.
num_train, num_val, num_test = (
    df.loc[row_ids, cols] for row_ids in (X_train, X_val, X_test)
)

#get texts
#text_train = df.loc[X_train, 'texts']
#text_val = df.loc[X_val, 'texts']
#text_test = df.loc[X_test, 'texts']

# N = number of training rows, D = number of feature columns.
N, D = num_train.shape

print("num_train.shape", num_train.shape)
print("num_val.shape ", num_val.shape)
print("num_test.shape ", num_test.shape)

print("N:", N, "D:", D)

num_train.shape (13888, 70)
num_val.shape  (3473, 70)
num_test.shape  (4341, 70)
N: 13888 D: 70


In [11]:
from sklearn.preprocessing import StandardScaler

# Columns standardised to zero mean / unit variance (the volatility index and market cap).
scaled_cols = ["VIX", "Mkt_Cap"]
x_scaler = StandardScaler()

# Fit the scaler on the training rows only, then apply those same statistics to the
# validation and test rows, so the held-out sets do not influence the scaling.
# Both columns are fitted in one call so x_scaler keeps the statistics of each of them;
# StandardScaler works per column, so the scaled values are the same as fitting one
# column at a time.
num_train[scaled_cols] = x_scaler.fit_transform(num_train[scaled_cols])
num_val[scaled_cols] = x_scaler.transform(num_val[scaled_cols])
num_test[scaled_cols] = x_scaler.transform(num_test[scaled_cols])

In [12]:
# Sanity check: the number of training labels should match the number of training rows.
y_train.shape

(13888,)

In [17]:
def create_model(dropout_rate=0, input_dim=70):
    """Build and compile a one-hidden-layer binary classifier.

    Args:
        dropout_rate: Rate passed to a Dropout layer placed after the hidden
            layer. No Dropout layer is added when it is 0 or None.
        input_dim: Number of input features fed to the first Dense layer.
            Defaults to 70, the value that used to be hard-coded.

    Returns:
        A compiled Sequential model: Dense(60, relu) -> optional Dropout ->
        Dense(1, sigmoid), using the adam optimizer, binary cross-entropy
        loss and the accuracy metric.
    """
    model = Sequential()

    model.add(Dense(60, input_dim=input_dim, activation='relu'))

    # Any falsy rate (0, 0.0, None) means "no dropout"; this also keeps None
    # from being handed to Dropout as a rate.
    if dropout_rate:
        model.add(Dropout(rate=dropout_rate))

    model.add(Dense(1, activation='sigmoid'))

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

def display_cv_results(search_results):
    """Print the best score and parameters of a fitted search, then one line per candidate.

    Reads best_score_, best_params_ and the 'mean_test_score', 'std_test_score'
    and 'params' entries of cv_results_ from search_results.
    """
    print(f'Best score = {search_results.best_score_:.4f} using {search_results.best_params_}')
    cv = search_results.cv_results_
    candidates = zip(cv['mean_test_score'], cv['std_test_score'], cv['params'])
    for avg, spread, setting in candidates:
        print(f'mean test accuracy +/- std = {avg:.4f} +/- {spread:.4f} with: {setting}')
    

In [18]:
# Wrap the Keras build function so it can be used as a scikit-learn estimator (e.g. in GridSearchCV).
model = KerasClassifier(build_fn=create_model, verbose=1)

In [19]:
# Select TensorFlow 2.x via the notebook magic, then verify a GPU is visible before training.
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
# Stop early unless the reported device is the first GPU.
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [20]:
# Hyper-parameter values explored by the grid search (3 x 3 x 3 = 27 combinations).
param_grid = dict(
    batch_size=[16, 32, 64],
    epochs=[10, 20, 30],
    dropout_rate=[0.10, 0.20, 0.30],
)

In [23]:
# Exhaustive search over param_grid with 3-fold cross-validation; n_jobs=-1 requests all available workers.
# NOTE(review): parallel workers sharing a single GPU can be problematic with Keras models — confirm n_jobs=-1 is intended.
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)

In [30]:
# Run the grid search on the training features and labels inside a GPU device scope.
with tf.device('/device:GPU:0'):
  grid_result = grid.fit(num_train, y_train)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [35]:
# Print the best parameter combination followed by the mean/std test accuracy of every candidate.
display_cv_results(grid_result)

Best score = 0.6047 using {'batch_size': 32, 'dropout_rate': 0.2, 'epochs': 10}
mean test accuracy +/- std = 0.6017 +/- 0.0089 with: {'batch_size': 16, 'dropout_rate': 0.1, 'epochs': 10}
mean test accuracy +/- std = 0.5934 +/- 0.0068 with: {'batch_size': 16, 'dropout_rate': 0.1, 'epochs': 20}
mean test accuracy +/- std = 0.5926 +/- 0.0113 with: {'batch_size': 16, 'dropout_rate': 0.1, 'epochs': 30}
mean test accuracy +/- std = 0.5997 +/- 0.0045 with: {'batch_size': 16, 'dropout_rate': 0.2, 'epochs': 10}
mean test accuracy +/- std = 0.5986 +/- 0.0068 with: {'batch_size': 16, 'dropout_rate': 0.2, 'epochs': 20}
mean test accuracy +/- std = 0.6021 +/- 0.0051 with: {'batch_size': 16, 'dropout_rate': 0.2, 'epochs': 30}
mean test accuracy +/- std = 0.6025 +/- 0.0069 with: {'batch_size': 16, 'dropout_rate': 0.3, 'epochs': 10}
mean test accuracy +/- std = 0.5944 +/- 0.0061 with: {'batch_size': 16, 'dropout_rate': 0.3, 'epochs': 20}
mean test accuracy +/- std = 0.5982 +/- 0.0064 with: {'batch_siz

In [36]:
def create_model(dropout_rate=0.2, input_dim=70):
    """Build and compile the configuration selected by the grid search.

    This definition replaces the earlier create_model in the notebook
    namespace. Called without arguments it builds the same network as before
    (70 inputs, dropout 0.2). The optional parameters keep the signature
    compatible with the earlier definition, so a search that passes
    dropout_rate still works after this cell has run.

    Args:
        dropout_rate: Rate passed to the Dropout layer after the hidden layer.
            Defaults to 0.2; no Dropout layer is added when it is 0 or None.
        input_dim: Number of input features. Defaults to 70.

    Returns:
        A compiled Sequential model: Dense(60, relu) -> Dropout ->
        Dense(1, sigmoid), using the adam optimizer, binary cross-entropy
        loss and the accuracy metric.
    """
    model = Sequential()
    model.add(Dense(60, input_dim=input_dim, activation='relu'))
    if dropout_rate:
        model.add(Dropout(rate=dropout_rate))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [37]:
# Re-create the scikit-learn wrapper so it uses the create_model defined just above.
model = KerasClassifier(build_fn=create_model, verbose=1)

In [38]:
# Convert the validation features to a NumPy array (they are later passed as validation_data).
num_val = np.array(num_val)

In [39]:
# Convert the validation labels to a NumPy array (they are later passed as validation_data).
y_val = np.array(y_val)

In [40]:
# Train the selected configuration (10 epochs, batch size 32) and report
# validation metrics after every epoch.
with tf.device('/device:GPU:0'):
  hist = model.fit(
      num_train,
      y_train,
      epochs=10,
      batch_size=32,
      validation_data=(np.array(num_val), np.array(y_val)),
  )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [56]:
# Repeat the stratified 80/20 train/test split with the same seed. No validation
# subset is carved out this time, so every non-test row is used for training.
X_train, X_test, y_train, y_test = train_test_split(
    df.index,
    df["label2"],
    test_size=0.2,
    stratify=df["label2"],
    random_state=20,
)

In [57]:
# Rebuild the feature matrices for the new train/test partition.
num_train, num_test = df.loc[X_train, cols], df.loc[X_test, cols]

In [59]:
from sklearn.preprocessing import StandardScaler

# Columns standardised to zero mean / unit variance (the volatility index and market cap).
scaled_cols = ["VIX", "Mkt_Cap"]
x_scaler = StandardScaler()

# Fit the scaler on the training rows only and reuse those statistics for the test rows,
# so the held-out set does not influence the scaling. Both columns are fitted in one call
# so x_scaler keeps the statistics of each of them; StandardScaler works per column, so
# the scaled values are the same as fitting one column at a time.
num_train[scaled_cols] = x_scaler.fit_transform(num_train[scaled_cols])
num_test[scaled_cols] = x_scaler.transform(num_test[scaled_cols])

In [60]:
# Fit on the full training partition (10 epochs, batch size 32). The test set is
# passed as validation_data, so its loss/accuracy are reported after every epoch.
with tf.device('/device:GPU:0'):
  hist = model.fit(
      num_train,
      y_train,
      epochs=10,
      batch_size=32,
      validation_data=(np.array(num_test), np.array(y_test)),
  )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Predict on the held-out test features with the classifier fitted above.
preds = model.predict(num_test)

In [65]:
# Per-class precision, recall and F1 of the test-set predictions against the true labels.
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.51      0.25      0.34      1792
           1       0.61      0.83      0.71      2549

    accuracy                           0.59      4341
   macro avg       0.56      0.54      0.52      4341
weighted avg       0.57      0.59      0.55      4341

