# Testing and Ensemble

In [57]:
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from google.colab import output
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error

In [10]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall
 
def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision
 
def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))
 
def f1macro(y_true, y_pred):
    f_pos = f1_m(y_true, y_pred)
    # negative version of the data and prediction
    f_neg = f1_m(1-y_true, 1-K.clip(y_pred,0,1))
    return (f_pos + f_neg)/2

In [12]:
def datagen(data, seq_len, batch_size, targetcol, kind):
    #As a generator to produce samples for Keras model
    batch = []
    while True:
        # Pick one dataframe from the pool
        key = random.choice(list(data.keys()))
        df = data[key]
        input_cols = [c for c in df.columns if c != targetcol]
        index = df.index[df.index < TRAIN_TEST_CUTOFF]
        split = int(len(index) * TRAIN_VALID_RATIO)
        if kind == 'train':
            index = index[:split]   # range for the training set
        elif kind == 'valid':
            index = index[split:]   # range for the validation set
        # Pick one position, then clip a sequence length
        while True:
            t = random.choice(index)      # pick one time step
            n = (df.index == t).argmax()  # find its position in the dataframe
            if n-seq_len+1 < 0:
                continue # can't get enough data for one sequence length
            frame = df.iloc[n-seq_len+1:n+1]
            batch.append([frame[input_cols].values, df.loc[t, targetcol]])
            break
        # if we get enough for a batch, dispatch
        if len(batch) == batch_size:
            X, y = zip(*batch)
            X, y = np.expand_dims(np.array(X), 3), np.array(y)
            yield X, y
            batch = []

In [13]:
def testgen(data, seq_len, targetcol):
    #Return array of all test samples
    batch = []
    for key, df in data.items():
        input_cols = [c for c in df.columns if c != targetcol]
        # find the start of test sample
        t = df.index[df.index >= TRAIN_TEST_CUTOFF][0]
        n = (df.index == t).argmax()
        # extract sample using a sliding window
        for i in range(n+1, len(df)+1):
            frame = df.iloc[i-seq_len:i]
            batch.append([frame[input_cols].values, frame[targetcol][-1]])
    X, y = zip(*batch)
    return np.expand_dims(np.array(X),3), np.array(y)

In [14]:
dir={'DJI':'https://raw.githubusercontent.com/mandalnilabja/soc2022/main/data/Processed_DJI.csv',
     'NASDAQ':'https://raw.githubusercontent.com/mandalnilabja/soc2022/main/data/Processed_NASDAQ.csv',
     'NYSE':'https://raw.githubusercontent.com/mandalnilabja/soc2022/main/data/Processed_NYSE.csv',
     'RUSSELL':'https://raw.githubusercontent.com/mandalnilabja/soc2022/main/data/Processed_RUSSELL.csv',
     'S&P':'https://raw.githubusercontent.com/mandalnilabja/soc2022/main/data/Processed_S&P.csv'
     }


TRAIN_TEST_CUTOFF = '2016-04-21'
TRAIN_VALID_RATIO = 0.75


data = {}


for name, filepath in dir.items():
    X = pd.read_csv(filepath, index_col="Date", parse_dates=True)
    
    # basic preprocessing: get the name, the classification
    del X["Name"]
    cols = X.columns
    
    # Save the target variable as a column in dataframe and drop NaN values
    X["Target"] = (X["Close"].pct_change().shift(-1) > 0).astype(int)
    X.dropna(inplace=True)
   
    # Fit the standard scaler using the training dataset not whole data set
    index = X.index[X.index > TRAIN_TEST_CUTOFF]
    index = index[:int(len(index) * TRAIN_VALID_RATIO)]
    scaler = StandardScaler().fit(X.loc[index, cols])
    
    # Save scale transformed dataframe
    X[cols] = scaler.transform(X[cols])
    data[name] = X

# Testing

Testing the 64 2dCNNpred models generated on Accuaracy, Mean Absolute Error and Macro Averaged F1 Score. Hyperparameters that can be varied epochs, optimizer, batch size, droput rate, loss fuction.

In [29]:
param_list = [
 ['sgd', 'mae', 0.05], ['adam', 'mae', 0.1], ['adagrad', 'mae',  0.05], ['adamax', 'binary_focal_crossentropy', 0.1], 
 ['sgd', 'binary_focal_crossentropy', 0.1], ['adam', 'binary_focal_crossentropy', 0.15], ['adagrad', 'mae', 0.1], ['adamax', 'binary_focal_crossentropy', 0.15], 
 ['sgd', 'binary_crossentropy', 0.15], ['adam', 'binary_crossentropy', 0.2], ['adagrad', 'hinge', 0.2], ['adamax', 'binary_crossentropy', 0.15],
 ['sgd', 'hinge', 0.2], ['adam', 'hinge',  0.05], ['adagrad', 'hinge',  0.05], ['adamax', 'binary_crossentropy', 0.2]
 ]

id=2

df_eval=pd.DataFrame(columns=['id', 'optimizing_algo', 'loss_function','dropout_rate', 'batch_size', 'epochs', 'accuracy', 'MAE', 'F1'])


for batch_size in [64, 32]:
  for n_epochs in [20, 30]:
    for optim, lossfxn, dr in param_list:
   
      #load model for testing
      model = tf.keras.models.load_model('2dCNNpredm{}.h5'.format(id), compile=False)     
      model.compile(optimizer=optim, loss=lossfxn, metrics=["acc", f1macro])
      
      
      # Prepare test data
      test_data, test_target = testgen(data, 60, "Target")
      
      # Test the model
      test_out = model.predict(test_data)
      test_pred = (test_out > 0.5).astype(int)
      

      #measuring and saving performance
      df_eval.loc[len(df_eval)]=[id, optim, lossfxn, dr, batch_size, n_epochs, accuracy_score(test_pred, test_target), mean_absolute_error(test_pred, test_target), f1_score(test_pred, test_target)]

      output.clear()      
      print(id)

      #index of model
      id = id + 1
      

65


In [30]:
df_eval.to_csv('CNNpred_WallStreet_performances.csv')
df_eval.head()

Unnamed: 0,id,optimizing_algo,loss_function,dropout_rate,batch_size,epochs,accuracy,MAE,F1
0,2,sgd,mae,0.05,64,20,0.537561,0.462439,0.699239
1,3,adam,mae,0.1,64,20,0.507317,0.492683,0.566524
2,4,adagrad,mae,0.05,64,20,0.537561,0.462439,0.699239
3,5,adamax,binary_focal_crossentropy,0.1,64,20,0.514146,0.485854,0.511765
4,6,sgd,binary_focal_crossentropy,0.1,64,20,0.566829,0.433171,0.628141


# Ensemble - Bagging

Ensemble-Bagging based on Hard Voting

In [56]:
test_pred=np.zeros((1025,1))

# Prepare test data
test_data, test_target = testgen(data, 60, "Target")


for id in range (1,66):
  #load model for testing
  model = tf.keras.models.load_model('2dCNNpredm{}.h5'.format(id), compile=False)     
  model.compile(optimizer=optim, loss=lossfxn, metrics=["acc", f1macro])
      

  # Test the model
  test_out = model.predict(test_data)
  test_pred = np.add(test_pred, np.array((test_out > 0.5).astype(int)))
      

  #displaying progress
  output.clear() 
  print(id)

  #index of model
  id = id + 1

test_pred = (test_pred/id > 0.5).astype(int)

accuracy = accuracy_score(test_pred, test_target)
MAE = mean_absolute_error(test_pred, test_target)
F1 = f1_score(test_pred, test_target)

65


In [55]:
print(accuracy)
print(MAE)
print(F1)

0.5375609756097561
0.4624390243902439
0.6992385786802031
