# Imports

In [1]:
from functools import reduce

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Bidirectional, LSTM, Dropout, Dense

Using TensorFlow backend.


# Data Collection and Organization

In [9]:
# Read every raw series from disk. header=2 makes pandas take the column names
# from row index 2 (zero-based) of each file and ignore the rows above it.
VIX, DOW, GFD, GOLD, SPX = [
    pd.read_csv(csv_path, header=2)
    for csv_path in (
        'data/VIX.csv',
        'data/DOW.csv',
        'data/GFD_TBILL_DAILY.csv',
        'data/GOLD.csv',
        'data/SPX.csv',
    )
]
# US_10YR is deliberately left out: its file contains duplicate values for many dates
#US_10YR = pd.read_csv('data/US_10yr_yield.csv', header = 2)

In [7]:
# Define a helper function to process the data
def process_data(df, label, pct_change = True, end_date = '2021-10-21'):
    """Turn one raw price table into a newest-first (Date, <label>) frame.

    Steps, in order: parse 'Date', keep the rows dated on or before
    `end_date`, drop 'Ticker' and 'Open', order the rows oldest-first,
    forward-fill missing values, optionally replace 'Close' by its row-over-row
    percentage change, sort newest-first and rename 'Close' to `label`.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table with 'Date', 'Ticker', 'Open' and 'Close' columns.
        It is left unmodified.
    label : str
        Name given to the 'Close' column in the result.
    pct_change : bool, default True
        When truthy, 'Close' holds percentage changes instead of levels; the
        oldest row is then NaN because it has no predecessor.
    end_date : str or datetime-like, default '2021-10-21'
        Last date (inclusive) that is kept.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by 'Date' descending with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If 'Date', 'Ticker' or 'Open' is missing (or 'Close' when
        `pct_change` is set).
    """
    # Work on a copy so the caller's frame is left untouched
    processed = df.copy()

    # Convert the Date column from a string to datetime format
    processed['Date'] = pd.to_datetime(processed['Date'])

    # Keep only the dates up to the cutoff since we aren't predicting into the future
    processed = processed.loc[processed['Date'] <= pd.Timestamp(end_date)]

    # Drop the 'Ticker' and 'Open' columns
    processed = processed.drop(columns=['Ticker', 'Open'])

    # Forward-fill and pct_change both depend on row order, so make the order
    # explicitly oldest-first (mergesort is stable: same-date rows keep their
    # original relative order).
    processed = processed.sort_values(by='Date', kind='mergesort')

    # Carries forward old prices, so we aren't using future information
    processed = processed.ffill()

    # Take percentage changes
    if pct_change:
        processed['Close'] = processed['Close'].pct_change()

    # Reverse the data to go backward in time
    processed = processed.sort_values(by='Date', ascending=False).reset_index(drop=True)

    # Rename the 'Close' column with the passed label
    return processed.rename(columns={'Close': label})

# Process all of the data
# NOTE: each name is rebound to its processed frame, so this cell is not
# re-runnable on its own: process_data drops 'Ticker'/'Open', which are already
# gone on a second pass. Re-run the loading cell first.
VIX = process_data(VIX, 'VIX')
DOW = process_data(DOW, 'DOW')
GFD = process_data(GFD, 'GFD')
GOLD = process_data(GOLD, 'GOLD')
SPX = process_data(SPX, 'SPX')
#US_10YR = process_data(US_10YR, 'US_10YR')


# Merge all of the data into one large dataframe
# (`reduce` is imported from functools in the import cell at the top.)
# how='outer' keeps every date that appears in any series, so a series that has
# no row for a given date is NaN there.
# NOTE(review): those NaNs are not filled anywhere afterwards - decide how they
# should be handled before the data reaches the model.
dfs = [VIX, DOW, GFD, GOLD, SPX]
#dfs = [VIX, DOW, GFD, GOLD, SPX, US_10YR]
data = reduce(lambda left, right: pd.merge(left, right, on=['Date'], how='outer'), dfs)

# Sort the data by descending date
data = data.sort_values(by='Date', ascending=False).reset_index(drop=True)

data

Unnamed: 0,Date,VIX,DOW,GFD,GOLD,SPX
0,2021-10-21,-0.030988,-0.000176,0.000002,,0.003069
1,2021-10-20,-0.013376,0.004288,0.000001,-0.000871,0.003675
2,2021-10-19,-0.037400,0.005636,0.000001,0.006618,0.007408
3,2021-10-18,0.000613,-0.001024,0.000002,-0.002708,0.003375
4,2021-10-17,,,0.000002,,
...,...,...,...,...,...,...
84264,1791-02-05,,,0.000164,,
84265,1791-02-04,,,0.000164,,
84266,1791-02-03,,,0.000164,,
84267,1791-02-02,,,0.000164,,


# Data Visualization

In [None]:
# One panel per column of `data`, each drawn against 'Date'
data.plot(x='Date', subplots=True, figsize=(15, 20))
plt.show()

# Kernel-density estimate of the values held in the 'VIX' column
data['VIX'].plot.kde(figsize=(15, 4), title='VIX Distribution')
plt.show()

# Data Processing

In [None]:
def lstm_data_processing(data, start_date, end_date, n_time_steps = 90, include_vix = True):
    """Build fixed-length windows (X) and targets (y) for a range of rows.

    For every row i from the row dated `start_date` to the row dated
    `end_date` (inclusive), the sample is the `n_time_steps` rows located
    immediately before row i in `data` (rows i-n_time_steps .. i-1; row i
    itself is excluded). When `data` is sorted newest-first, as it is in this
    notebook, those rows carry the dates that follow row i's date.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array whose column 0 holds the dates, column 1 the target and the
        remaining columns the other features.
    start_date, end_date : str or datetime-like
        Dates of the first and last row of the range. `start_date` must be
        the row with the smaller index; otherwise empty arrays are returned.
    n_time_steps : int, default 90
        Number of rows in each window.
    include_vix : bool, default True
        If true the features start at column 1 (target included), otherwise
        at column 2.

    Returns
    -------
    X_data : numpy.ndarray, shape (n_samples, n_time_steps, n_features)
    y_data : numpy.ndarray of float32, shape (n_samples,)
        Column 1 of `data` for the selected rows.

    Raises
    ------
    ValueError
        If a date matches no row or several rows, or if fewer than
        `n_time_steps` rows exist before the first sample.
    """

    def _row_index(date):
        """Return the index of the single row whose date equals `date`."""
        hits = np.flatnonzero(data[:, 0] == pd.to_datetime(date))
        if hits.size != 1:
            raise ValueError(
                'expected exactly one row dated {!r}, found {}'.format(date, hits.size)
            )
        return int(hits[0])

    # Get the indices of the start date and the end date
    start_index = _row_index(start_date)
    end_index = _row_index(end_date)

    # A window must not run off the top of the array: a negative slice start
    # would silently wrap around instead of failing.
    if start_index <= end_index and start_index < n_time_steps:
        raise ValueError(
            'n_time_steps={} needs {} rows before row {}, but only {} exist'.format(
                n_time_steps, n_time_steps, start_index, start_index
            )
        )

    # Flag to set if VIX data is included in the training set
    first_feature_column = 1 if include_vix else 2
    features = data[:, first_feature_column:].astype('float32')

    # Allocate only the samples that are actually requested
    sample_rows = range(start_index, end_index + 1)
    X_data = np.zeros((len(sample_rows), n_time_steps, features.shape[-1]))
    for position, row in enumerate(sample_rows):
        X_data[position] = features[row - n_time_steps:row, :]

    # Get all of the output data (Y) for the desired date range
    y_data = data[start_index:end_index + 1, 1].astype('float32')

    return X_data, y_data

# Define a function to process the data for Bidirectional LSTM
def bidirectional_data_processing(data, start_date, end_date, n_time_steps = 30):
    """Build centred windows (X) and targets (y) for a range of rows.

    For every row i from the row dated `start_date` to the row dated
    `end_date` (inclusive), the sample is the 2*n_time_steps+1 rows
    i-n_time_steps .. i+n_time_steps of the feature columns (columns 2 and
    up, i.e. column 1 - the target - is never part of the input).

    Parameters
    ----------
    data : numpy.ndarray
        2-D array whose column 0 holds the dates, column 1 the target and the
        remaining columns the features.
    start_date, end_date : str or datetime-like
        Dates of the first and last row of the range. `start_date` must be
        the row with the smaller index; otherwise empty arrays are returned.
    n_time_steps : int, default 30
        Number of rows taken on each side of the centre row.

    Returns
    -------
    X_data : numpy.ndarray, shape (n_samples, 2*n_time_steps+1, n_features)
    y_data : numpy.ndarray of float32, shape (n_samples,)
        Column 1 of `data` for the selected rows.

    Raises
    ------
    ValueError
        If a date matches no row or several rows, or if a window would extend
        past either end of `data`.
    """

    def _row_index(date):
        """Return the index of the single row whose date equals `date`."""
        hits = np.flatnonzero(data[:, 0] == pd.to_datetime(date))
        if hits.size != 1:
            raise ValueError(
                'expected exactly one row dated {!r}, found {}'.format(date, hits.size)
            )
        return int(hits[0])

    # Get the indices of the start date and the end date
    start_index = _row_index(start_date)
    end_index = _row_index(end_date)

    # Get all of the input data (X)
    features = data[:, 2:].astype('float32')

    # Every window must lie fully inside the array: a negative slice start
    # would wrap around and a slice past the end would come back too short.
    if start_index <= end_index and (
        start_index < n_time_steps or end_index + n_time_steps >= features.shape[0]
    ):
        raise ValueError(
            'n_time_steps={} does not fit: rows {}..{} need {} rows on each side '
            'but the array has {} rows'.format(
                n_time_steps, start_index, end_index, n_time_steps, features.shape[0]
            )
        )

    # Allocate only the samples that are actually requested
    sample_rows = range(start_index, end_index + 1)
    X_data = np.zeros((len(sample_rows), 2 * n_time_steps + 1, features.shape[-1]))
    for position, row in enumerate(sample_rows):
        X_data[position] = features[row - n_time_steps:row + n_time_steps + 1, :]

    # Get all of the output data (Y) for the desired date range
    y_data = data[start_index:end_index + 1, 1].astype('float32')

    return X_data, y_data

# Convert the data from a pandas dataframe to a numpy array
np_data = data.to_numpy()

# Date range of each split, written as (start_date, end_date) for
# lstm_data_processing. The rows are sorted newest-first, so the start date is
# the later one.
split_dates = {
    'train': ('12/31/2020', '01/01/1991'),
    'dev': ('12/31/1990', '07/01/1988'),
    'test': ('06/30/1988', '01/01/1986'),
}

# normalize features
# The scaler is fitted on the training rows only, so the value ranges of the
# dev/test rows cannot leak into training. The fitted scaling is then applied
# to every row (values outside the training range may fall outside [0, 1]).
train_newest, train_oldest = [pd.to_datetime(d) for d in split_dates['train']]
is_train_row = data['Date'].between(train_oldest, train_newest).to_numpy()
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(np_data[is_train_row, 1:])
scaled_data = np_data.copy()
scaled_data[:,1:] = scaler.transform(np_data[:,1:])

# Number of rows in each look-back window handed to the LSTM
n_time_steps = 90

# Split data into train, dev, and test sets
X_train, y_train = lstm_data_processing(scaled_data, *split_dates['train'], n_time_steps = n_time_steps, include_vix = False)
X_dev, y_dev = lstm_data_processing(scaled_data, *split_dates['dev'], n_time_steps = n_time_steps, include_vix = False)
X_test, y_test = lstm_data_processing(scaled_data, *split_dates['test'], n_time_steps = n_time_steps, include_vix = False)

splits = (
    ('train', X_train, y_train),
    ('dev', X_dev, y_dev),
    ('test', X_test, y_test),
)

# Verify the shapes of the input and output data
for split_name, split_X, split_y in splits:
    print(split_X.shape)
    print(split_y.shape)

# Report missing values: the merged frame is built with an outer join, so a
# series is NaN on every date it has no row for, and NaNs in the inputs or
# targets propagate into the loss.
for split_name, split_X, split_y in splits:
    n_nan_inputs = int(np.isnan(split_X).sum())
    n_nan_targets = int(np.isnan(split_y).sum())
    if n_nan_inputs or n_nan_targets:
        print('WARNING: {} split contains {} NaN inputs and {} NaN targets'.format(
            split_name, n_nan_inputs, n_nan_targets))

# LSTM Model
## Model Definition

In [None]:
# Building the model: one plain (unidirectional) LSTM layer of 64 units with
# input dropout 0.5 that returns only its last output, followed by a
# single-unit Dense layer. Each sample has shape (time steps, features), taken
# from X_train.
lstm_model = Sequential([
    LSTM(
        64,
        return_sequences=False,
        dropout=0.5,
        input_shape=(X_train.shape[1], X_train.shape[-1]),
    ),
    Dense(1),
])

# Mean-squared-error loss, Adam optimizer
lstm_model.compile(optimizer='adam', loss='mse')

## Model Training

In [None]:
# Fit on the training split, passing the dev split as validation data.
# `lstm` holds the object returned by fit(); its .history dict is what the
# loss plot further down reads.
lstm = lstm_model.fit(
    X_train,
    y_train,
    validation_data=(X_dev, y_dev),
    epochs=50,
    batch_size=128,
)

In [None]:
# Show Keras' text summary of lstm_model
lstm_model.summary()

In [None]:
# plot training and dev losses over epoch
# (the model is compiled with loss='mse', so both curves are mean squared errors)
plt.plot(lstm.history['loss'], label='train')
plt.plot(lstm.history['val_loss'], label='dev')
# Title and axis labels so the figure can be read on its own
plt.title('Training vs. Dev Loss')
plt.xlabel('Epoch')
plt.ylabel('MSE loss')
plt.legend()
plt.show()

## Model Prediction and Visualization

In [None]:
# make predictions for the training and dev sets
y_train_hat = lstm_model.predict(X_train)
y_dev_hat = lstm_model.predict(X_dev)

# Showing the predicted vs. actual values: one 15x4 figure per split, with the
# prediction in red drawn first and the actual series in blue on top
for predicted, actual, split_title in (
    (y_train_hat, y_train, 'Training Set'),
    (y_dev_hat, y_dev, 'Dev Set'),
):
    fig, axs = plt.subplots(figsize=(15, 4))
    axs.plot(predicted, color='red', label='Predicted')
    axs.plot(actual, color='blue', label='Actual')
    axs.set_title(split_title)
    axs.set_xlabel('Timestamp')
    axs.set_ylabel('Scaled VIX')
    axs.legend(loc='upper left')
    plt.show()