In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import tqdm as tqdm
import Utils

import tensorflow as tf

#from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras import layers
from tensorflow.keras.regularizers import l2
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import scipy


from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

# Build a TF1-compat session config with GPU `allow_growth` switched on and
# open an InteractiveSession that uses it.
# NOTE(review): per the TensorFlow docs, allow_growth makes GPU memory be
# allocated incrementally rather than all at once - confirm this is still the
# intended mechanism for the TF version in use.
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

In [52]:
# List the GPUs TensorFlow can see through the public device API.
# This avoids the private `_get_available_gpus` helper and, importantly, does
# not rebind the global `K` to the private `tensorflow.python.keras` backend.
tf.config.list_physical_devices('GPU')

2.7.0


In [7]:
def load_data(surface_path, target_path):
    """Read the surface and target CSVs and keep only the dates present in both.

    Parameters
    ----------
    surface_path : path or file-like
        CSV whose first two columns form a two-level row index; the first
        level is matched against the target dates.
    target_path : path or file-like
        CSV with two leading rows to skip, a date index in its first column
        and a ``Close`` column.

    Returns
    -------
    (surfaces, targets)
        The surface frame and a one-column (``Close``) target frame, both
        restricted to the shared dates. The shared dates are sorted in
        ascending order before being used for the selection.
    """
    close = pd.read_csv(target_path, skiprows=2, parse_dates=True, index_col=0)['Close']
    targets = pd.DataFrame(close)

    surfaces = pd.read_csv(surface_path, parse_dates=True, index_col=[0, 1])

    # Dates come from the outer level of the surface index.
    surface_days = surfaces.index.get_level_values(0).unique()
    shared = set(targets.index) & set(surface_days)
    shared_days = pd.Series(list(shared)).sort_values()

    targets = targets.loc[shared_days]
    surfaces = surfaces.loc[shared_days]
    return surfaces, targets

def target_volatility(surfaces, targets):
    """Replace the targets with a 60-row rolling volatility of their returns.

    The returns come from ``Utils.convert_to_daily_returns`` (defined outside
    this notebook). Their rolling standard deviation over 60 rows is scaled by
    ``sqrt(252)`` and rows without a full window are dropped.

    Returns
    -------
    (surfaces, volatility)
        ``surfaces`` selected with the index of the remaining volatility rows,
        and the volatility itself.
    """
    daily_returns = Utils.convert_to_daily_returns(targets)
    scaled_std = daily_returns.rolling(window=60).std() * np.sqrt(252)
    rolling_vol = scaled_std.dropna()
    return surfaces.loc[rolling_vol.index], rolling_vol


def stack_days(surfaces, targets, stacks):
    """Turn the daily surfaces into overlapping windows of ``stacks`` days.

    The frame values are reshaped to ``(days, rows_per_day, columns)``, so the
    frame must hold the same number of rows for every day, grouped by day.

    Parameters
    ----------
    surfaces : DataFrame
        Two-level index; level 0 identifies the day, level 1 the row within it.
    targets : Series or DataFrame
        Indexed by the same day labels as level 0 of ``surfaces``.
    stacks : int
        Number of consecutive days per window.

    Returns
    -------
    (stacked_surfaces, targets, stacked_dates)
        ``stacked_surfaces`` is a float array of shape
        ``(days - stacks, rows_per_day, columns, stacks)``; window ``i`` holds
        days ``i .. i + stacks - 1`` along its last axis. ``stacked_dates`` is
        the day list with its first ``stacks`` entries removed, so window ``i``
        is labelled with day ``i + stacks`` (the day after its last surface).
        ``targets`` is selected with those labels.
    """
    day_labels = surfaces.index.get_level_values(0).unique()
    n_days = len(day_labels)
    rows_per_day = len(surfaces.index.get_level_values(1).unique())
    n_columns = surfaces.shape[1]

    cube = surfaces.values.reshape((n_days, rows_per_day, n_columns))

    n_windows = len(cube) - stacks
    stacked_surfaces = np.zeros((n_windows, rows_per_day, n_columns, stacks))
    for start in range(n_windows):
        # Stack the window's days directly along a new trailing axis.
        stacked_surfaces[start] = np.stack(cube[start:start + stacks], axis=-1)

    stacked_dates = day_labels[stacks:]
    return stacked_surfaces, targets.loc[stacked_dates], stacked_dates


def lag_targets(surfaces, targets, lag, dates):
    """Pair each sample with the target found ``lag`` rows later.

    Parameters
    ----------
    surfaces : sequence or ndarray
        Samples, one per row of ``targets``.
    targets : Series or DataFrame
        Target values in the same row order as ``surfaces``.
    lag : int
        Number of rows to look ahead; must be zero or positive.
    dates : Index or sequence
        Row labels matching ``surfaces`` / ``targets``.

    Returns
    -------
    (surfaces, lagged_targets, dates)
        ``surfaces`` without its last ``lag`` rows, the shifted targets with
        missing rows dropped, and ``dates[lag:]``. The returned targets keep
        the index label of the sample row they were moved to, while the
        returned ``dates`` are the labels of the rows the values came from.

    Raises
    ------
    ValueError
        If ``lag`` is negative.

    Notes
    -----
    ``dropna`` also removes rows that were already missing before the shift;
    in that case the three outputs no longer have equal length.
    """
    if lag < 0:
        raise ValueError(f"lag must be non-negative, got {lag}")

    lagged = targets.shift(-lag).dropna()
    # Use an explicit stop index: ``surfaces[:-lag]`` is ``surfaces[:0]`` when
    # lag == 0 and would silently throw away every sample.
    n_keep = max(len(surfaces) - lag, 0)
    return surfaces[:n_keep], lagged, dates[lag:]


def train_test_split(test_start, test_end, dates, surfaces, targets):
    """Hold out one contiguous date range as the test set.

    Parameters
    ----------
    test_start, test_end : label
        Bounds of the test window, used to label-slice ``dates``.
    dates : Index
        One label per sample; must support ``get_loc`` returning a position.
    surfaces : ndarray
        Samples, sliced positionally.
    targets : Series
        Targets in the same row order; converted with ``to_numpy()``.

    Returns
    -------
    (surfaces_train, surfaces_test, targets_train, targets_test,
     train_dates, test_dates)
        The training pieces are everything before and after the test window,
        concatenated. ``test_dates`` is a Series slice, ``train_dates`` an
        Index.
    """
    date_series = pd.Series(dates, index=dates)
    test_dates = date_series[test_start:test_end]

    # Positional bounds of the held-out block (upper bound exclusive).
    lo = dates.get_loc(test_dates.iloc[0])
    hi = dates.get_loc(test_dates.iloc[-1]) + 1

    target_values = targets.to_numpy()

    surfaces_test = surfaces[lo:hi]
    targets_test = target_values[lo:hi]

    surfaces_train = np.concatenate((surfaces[:lo], surfaces[hi:]))
    targets_train = np.concatenate((target_values[:lo], target_values[hi:]))

    in_train = ~date_series.index.isin(test_dates)
    train_dates = date_series.index[in_train]

    return surfaces_train, surfaces_test, targets_train, targets_test, train_dates, test_dates
    
    
def r_square(y_true, y_pred):
    """Coefficient of determination written with Keras backend ops.

    Computes ``1 - SS_res / (SS_tot + epsilon)`` over whatever tensors it is
    handed, where ``epsilon`` is the backend fuzz factor guarding against a
    zero denominator.
    """
    # Imported locally so the function carries its own backend reference.
    from tensorflow.keras import backend as K
    residual_ss = K.sum(K.square(y_true - y_pred))
    centered = y_true - K.mean(y_true)
    total_ss = K.sum(K.square(centered))
    ratio = residual_ss / (total_ss + K.epsilon())
    return 1 - ratio

In [45]:
# Experiment settings, kept in one place so they are easy to find and change.
STACK_DAYS = 20                      # consecutive daily surfaces per sample
LAG_DAYS = 10                        # rows by which the targets are moved ahead
TEST_START, TEST_END = '2019', '2023'  # held-out date window

surfaces, targets = load_data("processed_data/SPX.csv", "data/VIX.csv")

# Uncomment to train on the rolling volatility produced by target_volatility
# instead of the raw Close values.
#surfaces, targets = target_volatility(surfaces, targets)

surfaces, targets, dates = stack_days(surfaces, targets, STACK_DAYS)

# stack_days yields (samples, rows, cols, days); move the day axis next to the
# sample axis and append a trailing axis of size 1, giving the 5-D layout the
# ConvLSTM2D model is fed.
surfaces = surfaces.transpose(0, 3, 1, 2)
surfaces = np.expand_dims(surfaces, axis=4)
targets = targets.squeeze()

surfaces, targets_lagged, dates = lag_targets(surfaces, targets, LAG_DAYS, dates)

(
    surfaces_train,
    surfaces_test,
    targets_train,
    targets_test,
    train_dates,
    test_dates,
) = train_test_split(TEST_START, TEST_END, dates, surfaces, targets_lagged)

input_shape = surfaces_train[0].shape

In [47]:
# Sanity check: shape of the training target array.
targets_train.shape

(5773,)

In [42]:
# One input sample has the shape of a training sample (everything after the
# leading sample axis of `surfaces_train`).
inp = layers.Input(shape=surfaces_train.shape[1:])

# Three stacked `ConvLSTM2D` layers with 64, 128 and 256 filters. Each one
# returns the full sequence; batch normalization between them is currently
# disabled. The last sequence output is flattened and passed through a small
# dense head that emits a single linear value.
x = inp
for n_filters in (64, 128, 256):
    x = layers.ConvLSTM2D(
        filters=n_filters,
        kernel_size=(3, 3),
        padding="same",
        kernel_regularizer=l2(0.0001),
        return_sequences=True,
        activation="relu",
    )(x)

x = layers.Flatten()(x)
x = layers.Dense(64, activation='relu')(x)
x = layers.Dense(1, activation='linear')(x)

# Assemble and compile: Huber loss for training, with R^2 and Huber reported
# as metrics.
model = tf.keras.models.Model(inp, x)
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.Huber(),
    metrics=[r_square, tf.keras.losses.Huber()],
)

In [53]:
# Sanity check: shape of the training input array.
surfaces_train.shape

(5773, 10, 34, 11, 1)

In [48]:
# Training hyper-parameters.
epochs, batch_size = 20, 32

# Train on the training split; the returned History object keeps the
# per-epoch values that the loss plot reads later.
history = model.fit(
    x=surfaces_train,
    y=targets_train,
    epochs=epochs,
    batch_size=batch_size,
)

Epoch 1/20
Epoch 2/20
 236/1155 [=====>........................] - ETA: 16:41 - loss: 6.1192 - r_square: -32430.9238 - huber_loss: 6.1192

KeyboardInterrupt: 

In [None]:
# Model predictions for the held-out test inputs.
new_prediction = model.predict(surfaces_test)

In [None]:
# Training-loss curve, one point per recorded epoch, drawn on the current axes.
ax = plt.gca()
ax.plot(history.history['loss'], label='Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend(loc='lower right')

# Loss and compiled metrics on the held-out split.
test_loss = model.evaluate(surfaces_test, targets_test, verbose=2)

In [None]:
def plot_train_fit(surfaces_train, targets_train, dates_train):
    """Print fit diagnostics for the notebook-level ``model`` and plot its predictions.

    Works for any split, not only the training one: pass the matching
    surfaces, targets and dates.

    Parameters
    ----------
    surfaces_train : array-like
        Model inputs, forwarded unchanged to ``model.predict``.
    targets_train : array-like
        Realized values, one per input sample.
    dates_train : index-like
        Dates used as the frame index and as the x-axis of the plot.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of targets.

    Notes
    -----
    Relies on a variable named ``model`` existing in the notebook namespace.
    """
    actual = np.asarray(targets_train).reshape(-1)

    # Flatten before doing any arithmetic: an (n, 1) prediction array minus an
    # (n,) target array broadcasts to (n, n), which made the absolute-error
    # statistics run over every prediction/target pair instead of matched ones.
    preds = np.asarray(model.predict(surfaces_train)).reshape(-1)
    if preds.shape != actual.shape:
        raise ValueError(
            f"got {preds.shape[0]} predictions for {actual.shape[0]} targets"
        )

    print('R2: ')
    print(r2_score(actual, preds))
    print()

    print('MSE: ')
    print(mean_squared_error(actual, preds))
    print()

    test = pd.DataFrame({'preds': preds, 'targets': actual}, index=dates_train)

    print('Corr: ')
    print(test.corr().iloc[0, 1])
    print()

    abs_error = np.abs(preds - actual)

    print('Mean Absolute Error: ')
    print(abs_error.mean())
    print()

    print('Stdev Absolute Error: ')
    print(abs_error.std())
    print()

    # Draw on the axes that were just created rather than on pyplot's
    # implicit "current" axes.
    fig, ax = plt.subplots()
    fig.set_size_inches(16, 10)
    ax.plot(dates_train, actual, label='Realized')
    ax.plot(dates_train, preds, label='Model')
    ax.legend()

In [None]:
# Diagnostics and plot for the training split.
plot_train_fit(surfaces_train, targets_train, train_dates)

In [None]:
# Same diagnostics and plot, this time fed the held-out test split.
plot_train_fit(surfaces_test, targets_test, test_dates)