A simplified implementation of the CNNpred paper has been done. Here 2d CNNpred version has been used on DJI, NASDAQ, NYSE, RUSSELL and S&P data. 

Reference:
https://machinelearningmastery.com/using-cnn-for-financial-time-series-prediction/


In [12]:
import random

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, Input
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error

In [13]:
#sample dataset
df = pd.read_csv('https://raw.githubusercontent.com/mandalnilabja/soc2022/main/data/Processed_DJI.csv', index_col="Date", parse_dates=True)
df.head()

Unnamed: 0_level_0,Close,Volume,mom,mom1,mom2,mom3,ROC_5,ROC_10,ROC_15,ROC_20,...,NZD,silver-F,RUSSELL-F,S&P-F,CHF,Dollar index-F,Dollar index,wheat-F,XAG,XAU
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-12-31,10428.049805,,,,,,,,,,...,0.03,0.26,-1.08,-1.0,-0.11,-0.08,-0.06,-0.48,0.3,0.39
2010-01-04,10583.959961,,0.014951,,,,,,,,...,1.52,3.26,1.61,1.62,-0.57,-0.59,-0.42,3.12,3.91,2.1
2010-01-05,10572.019531,,-0.001128,0.014951,,,,,,,...,-0.07,1.96,-0.2,0.31,0.43,0.03,0.12,-0.9,1.42,-0.12
2010-01-06,10573.679688,0.515598,0.000157,-0.001128,0.014951,,,,,,...,0.56,2.15,-0.02,0.07,-0.56,-0.24,-0.17,2.62,2.25,1.77
2010-01-07,10606.860352,9.776045,0.003138,0.000157,-0.001128,0.014951,,,,,...,-0.72,0.94,0.5,0.4,0.58,0.58,0.54,-1.85,0.22,-0.58


In [14]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall
 
def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision
 
def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))
 
def f1macro(y_true, y_pred):
    f_pos = f1_m(y_true, y_pred)
    # negative version of the data and prediction
    f_neg = f1_m(1-y_true, 1-K.clip(y_pred,0,1))
    return (f_pos + f_neg)/2

In [15]:
def cnnpred_2d(seq_len=60, n_features=82, n_filters=(8,8,8), droprate=0.1):
    #2D-CNNpred model according to the paper
    model = Sequential([
        Input(shape=(seq_len, n_features, 1)),
        Conv2D(n_filters[0], kernel_size=(1, n_features), activation="relu"),
        Conv2D(n_filters[1], kernel_size=(3,1), activation="relu"),
        MaxPool2D(pool_size=(2,1)),
        Conv2D(n_filters[2], kernel_size=(3,1), activation="relu"),
        MaxPool2D(pool_size=(2,1)),
        Flatten(),
        Dropout(droprate),
        Dense(1, activation="sigmoid")
    ])
    return model
 

In [16]:
def datagen(data, seq_len, batch_size, targetcol, kind):
    #As a generator to produce samples for Keras model
    batch = []
    while True:
        # Pick one dataframe from the pool
        key = random.choice(list(data.keys()))
        df = data[key]
        input_cols = [c for c in df.columns if c != targetcol]
        index = df.index[df.index < TRAIN_TEST_CUTOFF]
        split = int(len(index) * TRAIN_VALID_RATIO)
        if kind == 'train':
            index = index[:split]   # range for the training set
        elif kind == 'valid':
            index = index[split:]   # range for the validation set
        # Pick one position, then clip a sequence length
        while True:
            t = random.choice(index)      # pick one time step
            n = (df.index == t).argmax()  # find its position in the dataframe
            if n-seq_len+1 < 0:
                continue # can't get enough data for one sequence length
            frame = df.iloc[n-seq_len+1:n+1]
            batch.append([frame[input_cols].values, df.loc[t, targetcol]])
            break
        # if we get enough for a batch, dispatch
        if len(batch) == batch_size:
            X, y = zip(*batch)
            X, y = np.expand_dims(np.array(X), 3), np.array(y)
            yield X, y
            batch = []

In [17]:
def testgen(data, seq_len, targetcol):
    #Return array of all test samples
    batch = []
    for key, df in data.items():
        input_cols = [c for c in df.columns if c != targetcol]
        # find the start of test sample
        t = df.index[df.index >= TRAIN_TEST_CUTOFF][0]
        n = (df.index == t).argmax()
        # extract sample using a sliding window
        for i in range(n+1, len(df)+1):
            frame = df.iloc[i-seq_len:i]
            batch.append([frame[input_cols].values, frame[targetcol][-1]])
    X, y = zip(*batch)
    return np.expand_dims(np.array(X),3), np.array(y)

In [22]:
dir={'DJI':'https://raw.githubusercontent.com/mandalnilabja/soc2022/main/data/Processed_DJI.csv',
     'NASDAQ':'https://raw.githubusercontent.com/mandalnilabja/soc2022/main/data/Processed_NASDAQ.csv',
     'NYSE':'https://raw.githubusercontent.com/mandalnilabja/soc2022/main/data/Processed_NYSE.csv',
     'RUSSELL':'https://raw.githubusercontent.com/mandalnilabja/soc2022/main/data/Processed_RUSSELL.csv',
     'S&P':'https://raw.githubusercontent.com/mandalnilabja/soc2022/main/data/Processed_S&P.csv'
     }


TRAIN_TEST_CUTOFF = '2016-04-21'
TRAIN_VALID_RATIO = 0.75


data = {}


for name, filepath in dir.items():
    X = pd.read_csv(filepath, index_col="Date", parse_dates=True)
    
    # basic preprocessing: get the name, the classification
    del X["Name"]
    cols = X.columns
    
    # Save the target variable as a column in dataframe and drop NaN values
    X["Target"] = (X["Close"].pct_change().shift(-1) > 0).astype(int)
    X.dropna(inplace=True)
   
    # Fit the standard scaler using the training dataset not whole data set
    index = X.index[X.index > TRAIN_TEST_CUTOFF]
    index = index[:int(len(index) * TRAIN_VALID_RATIO)]
    scaler = StandardScaler().fit(X.loc[index, cols])
    
    # Save scale transformed dataframe
    X[cols] = scaler.transform(X[cols])
    data[name] = X

In [None]:
print(data)

In [26]:
#CNNpred paper parameters
seq_len = 60 # for 60 past days data
n_features = 82 # no of features engineered and used

#hyperparameters that can be varied
n_epochs = 20


accuracy=[]
MAE=[]
F1=[]


for batch_size in [128, 64, 32, 16]:
  
  # Produce CNNpred as a binary classification problem
  model = cnnpred_2d(seq_len, n_features)
  model.compile(optimizer="adam", loss="mae", metrics=["acc", f1macro])
  model.summary()  # print model structure to console
  
  # Set up callbacks and fit the model
  # We use custom validation score f1macro() and hence monitor for "val_f1macro"
  checkpoint_path = "./cp2d-{epoch}-{val_f1macro:.2f}.h5"
  callbacks = [
      ModelCheckpoint(checkpoint_path,
                      monitor='val_f1macro', mode="max",
                      verbose=0, save_best_only=True, save_weights_only=False, save_freq="epoch")
  ]
  #fitting the model
  model.fit(datagen(data, seq_len, batch_size, "Target", "train"),
            validation_data=datagen(data, seq_len, batch_size, "Target", "valid"),
            epochs=n_epochs, steps_per_epoch=400, validation_steps=10, verbose=1, callbacks=callbacks)
  

  # Prepare test data
  test_data, test_target = testgen(data, seq_len, "Target")
  
  # Test the model
  test_out = model.predict(test_data)
  test_pred = (test_out > 0.5).astype(int)
  
  accuracy.append(accuracy_score(test_pred, test_target))
  MAE.append(mean_absolute_error(test_pred, test_target))
  F1.append(f1_score(test_pred, test_target))

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_27 (Conv2D)          (None, 60, 1, 8)          664       
                                                                 
 conv2d_28 (Conv2D)          (None, 58, 1, 8)          200       
                                                                 
 max_pooling2d_18 (MaxPoolin  (None, 29, 1, 8)         0         
 g2D)                                                            
                                                                 
 conv2d_29 (Conv2D)          (None, 27, 1, 8)          200       
                                                                 
 max_pooling2d_19 (MaxPoolin  (None, 13, 1, 8)         0         
 g2D)                                                            
                                                                 
 flatten_9 (Flatten)         (None, 104)              

Batch size of 128

66s for one epoch with only CPU

64s for one epoch with GPU

Total: 17m for 20 epochs

For the whole execution batch size [128, 64, 32, 16] total 24m was taking

Reducing batch size significantly improves f1 macro score. Maximum f1 score is achieved for batch size of 32

In [27]:
print(accuracy)
print(MAE)
print(F1)

[0.48097560975609754, 0.5375609756097561, 0.504390243902439, 0.5482926829268293]
[0.5190243902439025, 0.4624390243902439, 0.49560975609756097, 0.45170731707317074]
[0.551433389544688, 0.6589928057553958, 0.6037441497659906, 0.6505660377358491]
