# SPDR ETF Dimension Estimation

The innermost layer of an autoencoder is refashioned into singular value proxies (SVPs).  These SVPs are used to estimate the dimension of the dataset.

In [45]:
import numpy as np
import pandas as pd
import os
import sys
import csv
import json

from datetime import datetime

from tensorflow.keras.layers import Input, Dense, Lambda, Dropout
from tensorflow.keras.models import Model
from keras import regularizers
from keras.callbacks import Callback
from keras import backend as K
import tensorflow as tf
from keras.regularizers import Regularizer

import scipy.sparse
from keras.models import load_model

In [46]:
from scipy.spatial.distance import squareform, pdist

This IPython notebook is provided for convenience only.  

All Linux code, along with the scripts, is available at https://github.com/nitishbahadur/book_chapter. Our Linux code is based on TensorFlow 1.x.  The Python package requirements were exported and are available at https://github.com/nitishbahadur/book_chapter/blob/master/src/requirements.txt.

We run our production code on https://arc.wpi.edu/cluster-documentation/build/html/clusters.html for performance reasons.

In [47]:
# The accompanying production code is based on TensorFlow 1.x, so switch this
# kernel to the 1.x compatibility behaviour before any model is built.
# The return value of get_default_graph() is not used here.
tf.compat.v1.get_default_graph()
# NOTE(review): presumably required because the custom loss further down closes
# over a symbolic layer output, which needs graph (non-eager) mode - confirm.
tf.compat.v1.disable_v2_behavior()
tf.compat.v1.disable_eager_execution()

Load S&P 500 data from the data folder

In [48]:
def get_etf_data(etf_ticker, data_dir=r'../data/etf_de/input', train_fraction=0.80):
    """Load one ETF's returns file and split it chronologically into train/test.

    Reads ``<data_dir>/<etf_ticker>_returns.csv``. The file must contain a
    ``Date`` column formatted as ``YYYY-MM-DD``; all remaining columns are
    used as the data matrix.

    Parameters
    ----------
    etf_ticker : str
        Ticker used to build the file name, e.g. ``'XLK'``.
    data_dir : str, optional
        Folder that holds the ``*_returns.csv`` files.
    train_fraction : float, optional
        Fraction of the rows (taken from the start) assigned to training.

    Returns
    -------
    x_train : numpy.ndarray
        First ``split_index`` rows of the scaled float32 data matrix.
    x_test : numpy.ndarray
        Remaining rows of the scaled float32 data matrix.
    df_etf_ret : pandas.DataFrame
        The unscaled data indexed by ``Date`` (the ``Date`` column is removed).
    split_index : int
        Row position at which the data was split.

    Notes
    -----
    The scale factor is the largest absolute value of the *whole* matrix, so
    train and test share one scale. An all-zero matrix is returned unscaled
    instead of being turned into NaN by a 0/0 division.
    """
    csv_path = os.path.join(data_dir, '{}_returns.csv'.format(etf_ticker))
    df_etf_ret = pd.read_csv(csv_path)
    df_etf_ret['Date'] = pd.to_datetime(df_etf_ret['Date'], format='%Y-%m-%d')
    # Setting the index by column name also removes the column from the frame.
    df_etf_ret = df_etf_ret.set_index('Date')

    split_index = int(len(df_etf_ret) * train_fraction)

    X = df_etf_ret.values.astype('float32')
    scale = np.max(np.abs(X))
    # `!= 0` (rather than `> 0`) lets a NaN scale propagate exactly as before.
    if scale != 0:
        X = X / scale

    x_train = X[:split_index, :]
    x_test = X[split_index:, :]

    return x_train, x_test, df_etf_ret, split_index

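Build the autoencoder model.  The innermost (bottleneck) layer uses a ReLU activation and the output layer uses a sigmoid activation.  Note that this demonstration model does not include dropout layers.  We use a custom loss function: the mean squared reconstruction error plus an L1 penalty on the L2-normalized innermost-layer activations.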

In [49]:
def build_ae_model(l1_reg, input_dim, encoding_dim):
    """Assemble the encoder, decoder and full autoencoder, and compile the latter.

    Architecture: input -> Dense(50, relu) -> Dense(40, relu) -> row-wise L2
    normalization -> Dense(encoding_dim, relu) -> Dense(40, relu) ->
    Dense(50, relu) -> Dense(input_dim, sigmoid).

    Parameters
    ----------
    l1_reg : float
        Weight handed to ``mse_l1_loss`` for the L1 term of the loss.
    input_dim : int
        Number of input (and reconstructed) features.
    encoding_dim : int
        Width of the innermost layer.

    Returns
    -------
    tuple
        ``(encoder, decoder, autoencoder)``. The encoder outputs the raw
        (not normalized) innermost activations; the decoder reuses the last
        three layers of the autoencoder, so all three models share weights.
        Only the autoencoder is compiled (optimizer ``'adadelta'``).
    """
    def _unit_rows(tensor):
        # Rescale every sample (row) to unit L2 norm.
        return K.l2_normalize(tensor, axis=1)

    # --- encoder half ---
    inputs = Input(shape=(input_dim,))
    hidden = Dense(50, activation='relu')(inputs)
    hidden = Dense(40, activation='relu')(hidden)
    hidden = Lambda(_unit_rows)(hidden)
    bottleneck = Dense(encoding_dim, activation='relu')(hidden)
    # The normalized bottleneck feeds only the loss; it is not part of either model's output.
    bottleneck_unit = Lambda(_unit_rows)(bottleneck)

    encoder = Model(inputs, bottleneck)

    # --- decoder half, stacked on the raw bottleneck ---
    reconstruction = bottleneck
    for units, activation in ((40, 'relu'), (50, 'relu'), (input_dim, 'sigmoid')):
        reconstruction = Dense(units, activation=activation)(reconstruction)

    autoencoder = Model(inputs, reconstruction)

    # --- stand-alone decoder: replay the three decoder layers on a fresh input ---
    latent_input = Input(shape=(encoding_dim,))
    decoded = latent_input
    for layer in autoencoder.layers[-3:]:
        decoded = layer(decoded)
    decoder = Model(latent_input, decoded)

    autoencoder.compile(optimizer='adadelta', loss=mse_l1_loss(bottleneck_unit, l1_reg))
    return encoder, decoder, autoencoder

# the loss function
def mse_l1_loss(encoded_layer, lambda_):
    """Create a Keras loss: squared error plus an L1 term on ``encoded_layer``.

    Parameters
    ----------
    encoded_layer : tensor
        Layer output whose absolute values are summed into the penalty.
    lambda_ : float
        Weight of the penalty.

    Returns
    -------
    callable
        ``loss(y_true, y_pred)``. The penalty is a single scalar (sum over
        every element of ``encoded_layer``) that is added to each squared
        error before the mean is taken.
    """
    def loss(y_true, y_pred):
        squared_error = K.square(y_pred - y_true)
        l1_penalty = lambda_ * K.sum(K.abs(encoded_layer))
        return K.mean(squared_error + l1_penalty)

    return loss

In [50]:
def save_output(x, encoder, date_str, etf_ticker, encoding_dim, l1_reg):
    """Encode ``x``, print two 1% threshold counts and save the encoding.

    Parameters
    ----------
    x : array-like
        Input passed to ``encoder.predict``.
    encoder : model
        Object exposing ``predict``; its output is what gets saved.
    date_str, etf_ticker, encoding_dim, l1_reg
        Values interpolated into the output file name.

    The first count is computed on the column means of the row-sorted
    encoding, the second on the column means of the encoding as produced.
    The encoding is written with ``np.save`` under ``../data/etf_de/output/``.
    """
    threshold = 0.01
    codes = encoder.predict(x)

    # Count on the per-position means after sorting every row in descending order.
    sorted_profile = np.mean(sort_by_row(codes), axis=0)
    sorted_count = count_gt_threshold(sorted_profile, threshold)
    print("1% count_gt_threshold(z_row_sorted, threshold): {}".format(sorted_count))

    # Count on the plain per-unit means.
    unit_profile = np.mean(codes, axis=0)
    unit_count = count_gt_threshold(unit_profile, threshold)
    print("1% count_gt_threshold(z, threshold): {}".format(unit_count))

    target = r"../data/etf_de/output/x_{}_encoded_{}_{}_{}".format(
        date_str, encoding_dim, l1_reg, etf_ticker)
    np.save(target, codes)

In [51]:
def get_data_by_date(df_etf_ret, x_test, split_index, date_str, test_width=60):
    """Build the normalized pairwise-distance matrix for the window ending on a date.

    Rolling windows of ``test_width`` consecutive rows are taken from the test
    part of ``df_etf_ret`` (rows from ``split_index`` onwards). The first
    window whose last index entry formats to ``date_str`` is converted into a
    square Euclidean distance matrix between its rows.

    Parameters
    ----------
    df_etf_ret : pandas.DataFrame
        Data whose index entries support ``strftime`` (e.g. a DatetimeIndex).
    x_test : sequence
        Test partition; only its length is used, to bound the number of windows.
    split_index : int
        Row position in ``df_etf_ret`` where the test partition starts.
    date_str : str
        Window end date formatted as ``YYYYMMDD``.
    test_width : int, optional
        Number of rows per window.

    Returns
    -------
    numpy.ndarray or None
        A ``(rows, rows)`` float32 matrix divided by its maximum, or ``None``
        when no window ends on ``date_str``. If every distance is zero the
        matrix is returned as all zeros rather than NaN.
    """
    test_max_rows = len(x_test) - test_width + 1
    for offset in range(0, test_max_rows):
        start = offset + split_index
        stop = start + test_width
        # Slice only the index while searching; the window values are
        # materialized once, for the matching window.
        window_end = df_etf_ret.index[start:stop][-1]
        if window_end.strftime('%Y%m%d') != date_str:
            continue

        window = df_etf_ret.iloc[start:stop, :].values
        dist = squareform(pdist(window)).astype('float32')
        peak = np.max(dist)
        # Identical rows give an all-zero matrix; skip the 0/0 division.
        if peak != 0:
            dist = dist / peak
        return dist

    # No window ends on the requested date.
    return None

In [52]:
def count_gt_threshold(z, threshold):
    """Count the entries of ``z`` whose share of the total is at least ``threshold``.

    Each entry is divided by the sum of all entries and the result is compared
    with ``threshold`` using ``>=`` (despite the "gt" in the name).

    Parameters
    ----------
    z : array-like
        One-dimensional collection of numbers.
    threshold : float
        Minimum share of the total, e.g. ``0.01`` for 1%.

    Returns
    -------
    int
        Number of qualifying entries. ``0`` for an empty input or when the
        entries sum to zero, where the shares are undefined.
    """
    values = np.asarray(z, dtype=np.float64)
    if values.size == 0:
        return 0

    total = values.sum()
    # A zero total would otherwise mean a division by zero.
    if total == 0:
        return 0

    return int(np.count_nonzero(values / total >= threshold))

def sort_by_row(z):
    """Return a copy of ``z`` with every row sorted in descending order.

    Parameters
    ----------
    z : array-like
        Two-dimensional data, one sample per row.

    Returns
    -------
    numpy.ndarray
        Array with the same shape and dtype as ``z``. The result is always
        two-dimensional, including for a single-row or zero-row input, and
        ``z`` itself is not modified.
    """
    rows = np.asarray(z)
    # np.sort sorts ascending along each row; reverse the columns for
    # descending order and copy so the result is a plain contiguous array.
    return np.sort(rows, axis=1)[:, ::-1].copy()

We run our production code on https://arc.wpi.edu/cluster-documentation/build/html/clusters.html for scalability reasons.  To estimate a time series of dimensions, we need to run the same model for different ETFs on different days.  

For demonstration purposes, we show how to run this for 'XLK' (Technology Sector) on 20200302 (March 2nd, 2020).

In [53]:
# Demonstration target: the ETF ticker (used to build the input file name) and
# the end date of the rolling window, formatted as YYYYMMDD.
etf_ticker = 'XLK'
date_str = '20200302'

In [54]:
# Load the returns for the chosen ETF and split them into train/test partitions.
etf_train, etf_test, df_etf_ret, split_index = get_etf_data(etf_ticker)
# Model input for the requested date: the max-normalized pairwise-distance
# matrix of the test window that ends on date_str (None if no window matches).
x_train = get_data_by_date(df_etf_ret, etf_test, split_index, date_str)

In [55]:
# Weight of the L1 penalty in the custom loss.
l1_reg = 5e-5
# Width of the innermost (bottleneck) layer.
encoding_dim = 30
epochs = 100 # in production we run this for a large number (30000) of epochs
# Number of samples per gradient update.
batch_size = 1
# The network input width is the number of columns of the distance matrix.
input_dim = x_train.shape[1]

In [56]:
# Echo the run configuration, one setting per line, so the captured output is self-describing.
print(
    "Running RELU/SIGMOID AE with the following parameters : ",
    "etf_ticker : {} date_str: {}".format(etf_ticker, date_str),
    "x_train dimension : ({} x {})".format(x_train.shape[0], x_train.shape[1]),
    "encoding_dim dimension : {}".format(encoding_dim),
    "epochs : {} batch_size : {}".format(epochs, batch_size),
    "Running encoding_dim: {} l1_reg: {}".format(encoding_dim, l1_reg),
    sep="\n",
)

Running RELU/SIGMOID AE with the following parameters : 
etf_ticker : XLK date_str: 20200302
x_train dimension : (60 x 60)
encoding_dim dimension : 30
epochs : 100 batch_size : 1
Running encoding_dim: 30 l1_reg: 5e-05


Create the encoder, decoder, and autoencoder models.

In [57]:
# The three models share layers (and therefore weights); only the autoencoder is compiled for training.
encoder, decoder, autoencoder = build_ae_model(l1_reg, input_dim, encoding_dim) 

In [58]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
autoencoder.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 60)]              0         
_________________________________________________________________
dense_6 (Dense)              (None, 50)                3050      
_________________________________________________________________
dense_7 (Dense)              (None, 40)                2040      
_________________________________________________________________
lambda_2 (Lambda)            (None, 40)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 30)                1230      
_________________________________________________________________
dense_9 (Dense)              (None, 40)                1240      
_________________________________________________________________
dense_10 (Dense)             (None, 50)                2050

In [59]:
# Train the autoencoder to reproduce its own input (x_train is both input and target).
# shuffle=False keeps the sample order fixed across epochs; verbose=2 logs one line per epoch.
history = autoencoder.fit(x_train, x_train, epochs=epochs, batch_size=batch_size, verbose=2, shuffle=False)

Train on 60 samples
Epoch 1/100
60/60 - 0s - loss: 0.0679
Epoch 2/100
60/60 - 0s - loss: 0.0679
Epoch 3/100
60/60 - 0s - loss: 0.0679
Epoch 4/100
60/60 - 0s - loss: 0.0679
Epoch 5/100
60/60 - 0s - loss: 0.0678
Epoch 6/100
60/60 - 0s - loss: 0.0678
Epoch 7/100
60/60 - 0s - loss: 0.0678
Epoch 8/100
60/60 - 0s - loss: 0.0678
Epoch 9/100
60/60 - 0s - loss: 0.0678
Epoch 10/100
60/60 - 0s - loss: 0.0677
Epoch 11/100
60/60 - 0s - loss: 0.0677
Epoch 12/100
60/60 - 0s - loss: 0.0677
Epoch 13/100
60/60 - 0s - loss: 0.0677
Epoch 14/100
60/60 - 0s - loss: 0.0677
Epoch 15/100
60/60 - 0s - loss: 0.0676
Epoch 16/100
60/60 - 0s - loss: 0.0676
Epoch 17/100
60/60 - 0s - loss: 0.0676
Epoch 18/100
60/60 - 0s - loss: 0.0676
Epoch 19/100
60/60 - 0s - loss: 0.0675
Epoch 20/100
60/60 - 0s - loss: 0.0675
Epoch 21/100
60/60 - 0s - loss: 0.0675
Epoch 22/100
60/60 - 0s - loss: 0.0675
Epoch 23/100
60/60 - 0s - loss: 0.0675
Epoch 24/100
60/60 - 0s - loss: 0.0674
Epoch 25/100
60/60 - 0s - loss: 0.0674
Epoch 26/100
6

In [60]:
# Encode the input; each row of z holds the innermost-layer activations of one sample.
# NOTE(review): the trailing "use x_test" note looks like a reminder to evaluate on
# held-out data instead - confirm the intent.
z = encoder.predict(x_train) # use x_test
# Estimate 1: sort every row in descending order first, then average per position.
z_row_sorted = sort_by_row(z)
z_mu = np.mean(z_row_sorted, axis=0)
# Number of averaged positions that account for at least 1% of the total.
gte_sorted = count_gt_threshold(z_mu, 0.01)
        
# Estimate 2: average every innermost unit over the samples without per-row sorting.
z_mu_1 = sorted(np.mean(z, axis=0), reverse=True)
gte_dim = count_gt_threshold(z_mu_1, 0.01)
# Training loss recorded for the final epoch.
loss = history.history['loss'][-1]
                
# One CSV-style summary line: model tag, ticker, date, final loss and the two counts.
print("AE,{},{},{:.4f},{},{}".format(etf_ticker, date_str, loss, gte_sorted, gte_dim))

AE,XLK,20200302,0.0658,14,14


In [61]:
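# Optional: uncomment to print the 1% threshold counts and save the encoded
# activations under ../data/etf_de/output/.
# save_output(x_train, encoder, date_str, etf_ticker, encoding_dim, l1_reg)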