# S&P 500 Dimension Estimation

The innermost layer of an autoencoder is refashioned into singular value proxies (SVPs).  These SVPs are then used to estimate the dimension of the dataset.

In [29]:
import numpy as np
import pandas as pd
import os
import sys
import csv
import json

from datetime import datetime

from tensorflow.keras.layers import Input, Dense, Lambda, Dropout
from tensorflow.keras.models import Model
from keras import regularizers
from keras.callbacks import Callback
from keras import backend as K
import tensorflow as tf
from keras.regularizers import Regularizer

import scipy.sparse
from keras.models import load_model

The iPython notebook code is only being provided for convenience.  

All Linux code, along with the scripts, is available at https://github.com/nitishbahadur/book_chapter. Our Linux code is based on tensorflow 1.x.  The Python package requirements were exported and are available at https://github.com/nitishbahadur/book_chapter/blob/master/src/requirements.txt.

We run our production code on https://arc.wpi.edu/cluster-documentation/build/html/clusters.html for performance reasons.

In [30]:
# Switch TensorFlow to TF1-style behaviour (v2 behaviour and eager execution
# disabled) before any model is built in the cells below.
# NOTE(review): presumably required because the custom loss closes over a
# symbolic tensor of the model — confirm before removing.
# The return value of get_default_graph() is not used by this cell.
tf.compat.v1.get_default_graph()
tf.compat.v1.disable_v2_behavior()
tf.compat.v1.disable_eager_execution()

Load the S&P 500 data from the data folder

In [31]:
def get_etf_data(etf_ticker, data_dir=r'../data/etf_de/input', train_frac=0.80):
    """Load a returns CSV, scale it, and split it by row into train/test arrays.

    Reads ``{data_dir}/{etf_ticker}_returns.csv``. The file must contain a
    ``Date`` column formatted as ``%Y-%m-%d``; every remaining column is used
    as a numeric feature.

    Parameters
    ----------
    etf_ticker : str
        Ticker used to build the CSV file name.
    data_dir : str, optional
        Directory holding the CSV. Defaults to the location the notebook has
        always used, so existing calls are unaffected.
    train_frac : float, optional
        Fraction of rows, counted from the top of the file, assigned to the
        training split. Defaults to 0.80.

    Returns
    -------
    x_train : numpy.ndarray
        First ``split_index`` rows of the scaled float32 matrix.
    x_test : numpy.ndarray
        Remaining rows of the scaled float32 matrix.
    df_etf_ret : pandas.DataFrame
        The unscaled data, indexed by ``Date``.
    split_index : int
        Row position at which the train/test split was made.
    """
    csv_path = r'{}/{}_returns.csv'.format(data_dir, etf_ticker)
    df_etf_ret = pd.read_csv(csv_path)
    df_etf_ret['Date'] = pd.to_datetime(df_etf_ret['Date'], format='%Y-%m-%d')
    # set_index moves the column into the index, so no separate drop is needed.
    df_etf_ret = df_etf_ret.set_index('Date')

    split_index = int(len(df_etf_ret) * train_frac)

    X = df_etf_ret.values.astype('float32')
    # The scale factor is the largest absolute value over ALL rows, i.e. it is
    # computed before the split and therefore also sees the test rows.
    X = X / np.max(np.abs(X))

    x_train = X[:split_index, :]
    x_test = X[split_index:, :]

    return x_train, x_test, df_etf_ret, split_index

Build the autoencoder model where the innermost layer is using a sigmoid activation function.  The autoencoder also uses dropout layers to control for overfitting.  We use a custom loss function.

In [32]:
def build_l1_ae_model(l1_reg, input_dim, encoding_dim):
    """Build an encoder, a decoder and the autoencoder that ties them together.

    Architecture (in call order):
    input -> Dense(256, relu) -> Dropout(0.3) -> Dense(128, relu)
    -> Dropout(0.3) -> row-wise L2 normalisation
    -> Dense(encoding_dim, sigmoid)            [innermost layer]
    -> Dense(128, relu) -> Dense(256, relu) -> Dense(input_dim, tanh).

    The encoder model outputs the raw sigmoid activations of the innermost
    layer, while the L1 penalty in the loss is applied to a row-wise
    L2-normalised copy of those activations.

    Parameters
    ----------
    l1_reg : float
        Weight of the L1 penalty handed to ``mse_l1_loss``.
    input_dim : int
        Number of input (and reconstructed output) features.
    encoding_dim : int
        Width of the innermost layer.

    Returns
    -------
    (encoder, decoder, autoencoder)
        Three models sharing the same layer objects; only the autoencoder
        is compiled (optimizer ``adadelta``).
    """
    input_img = Input(shape=(input_dim,))

    # Encoder trunk.
    hidden = Dense(256, activation='relu')(input_img)
    hidden = Dropout(0.3)(hidden)
    hidden = Dense(128, activation='relu')(hidden)
    hidden = Dropout(0.3)(hidden)

    z_layer_input = Lambda(lambda x: K.l2_normalize(x, axis=1))(hidden)
    encoded = Dense(encoding_dim, activation='sigmoid')(z_layer_input)
    # Normalised copy used only inside the loss; it is not a model output.
    encoded_norm = Lambda(lambda x: K.l2_normalize(x, axis=1))(encoded)

    # create encoder model
    encoder = Model(input_img, encoded)

    # Keep explicit handles on the decoder layers so the stand-alone decoder
    # can reuse them directly instead of looking them up by position in
    # autoencoder.layers, which breaks silently if a layer is ever added.
    decoder_layers = [
        Dense(128, activation='relu'),
        Dense(256, activation='relu'),
        Dense(input_dim, activation='tanh'),
    ]

    decoded = encoded
    for layer in decoder_layers:
        decoded = layer(decoded)

    # create autoencoder model
    autoencoder = Model(input_img, decoded)

    # create decoder model (same layer objects, hence shared weights)
    encoded_input = Input(shape=(encoding_dim,))
    deco = encoded_input
    for layer in decoder_layers:
        deco = layer(deco)
    decoder = Model(encoded_input, deco)

    # NOTE(review): the loss closes over the symbolic tensor encoded_norm;
    # this presumably relies on eager execution being disabled — confirm.
    autoencoder.compile(optimizer='adadelta', loss=mse_l1_loss(encoded_norm, l1_reg))
    return encoder, decoder, autoencoder

# the loss function
def mse_l1_loss(encoded_layer, lambda_):
    """Return a Keras-style loss: squared error plus a weighted L1 term.

    Parameters
    ----------
    encoded_layer
        Tensor whose absolute values are summed (over all of its elements)
        to form the L1 term.
    lambda_
        Multiplier applied to that sum.

    Returns
    -------
    callable
        ``loss(y_true, y_pred)`` computing the mean of the element-wise
        squared error after the scalar penalty has been added to every
        element.
    """
    def loss(y_true, y_pred):
        squared_error = K.square(y_pred - y_true)
        l1_penalty = lambda_ * K.sum(K.abs(encoded_layer))
        # The scalar penalty broadcasts over the squared-error tensor.
        return K.mean(squared_error + l1_penalty)

    return loss

In [33]:
def save_model(encoding_dim, l1_reg, autoencoder, encoder, decoder, etf_ticker):
    """Save the three models as .h5 files under ../data/etf_de/output/.

    Each file name embeds ``encoding_dim``, ``l1_reg`` and ``etf_ticker``.
    The models are written in the order autoencoder, encoder, decoder, and a
    confirmation line is printed after each one.
    """
    name_parts = (encoding_dim, l1_reg, etf_ticker)

    # All three paths are formatted before anything is written.
    jobs = [
        (autoencoder,
         r"../data/etf_de/output/autoencoder_l1_reg_{}_{}_{}.h5".format(*name_parts),
         "autoencoder saved!!!"),
        (encoder,
         r"../data/etf_de/output/encoder_l1_reg_{}_{}_{}.h5".format(*name_parts),
         "encoder saved!!!"),
        (decoder,
         r"../data/etf_de/output/decoder_l1_reg_{}_{}_{}.h5".format(*name_parts),
         "decoder saved!!!"),
    ]

    for model, model_path, done_message in jobs:
        model.save(model_path)
        print(done_message)

def save_history(encoding_dim, l1_reg, history, etf_ticker):
    """Write ``history.history`` as JSON under ../data/etf_de/output/.

    Parameters
    ----------
    encoding_dim, l1_reg, etf_ticker
        Embedded in the output file name ``history_l1_{}_{}_{}``.
    history
        Object exposing a ``history`` attribute (a dict of per-epoch metric
        lists, as returned by ``model.fit``).
    """
    history_filename = r"../data/etf_de/output/history_l1_{}_{}_{}".format(encoding_dim, l1_reg, etf_ticker)
    with open(history_filename, 'w') as f:
        # Metric values can be numpy scalars (e.g. float32), which the json
        # module rejects; default=float converts them instead of raising.
        json.dump(history.history, f, default=float)

def save_output(x, autoencoder, encoder, decoder, input_type, etf_ticker,
                encoding_dim=None, l1_reg=None):
    """Print and save the input, its encoding and its reconstruction.

    Runs ``x`` through the autoencoder, and separately through encoder then
    decoder, prints all intermediate arrays, and stores ``x``, the encoded
    array and the autoencoder prediction with ``np.save`` under
    ../data/etf_de/output/.

    Parameters
    ----------
    x : numpy.ndarray
        Input rows.
    autoencoder, encoder, decoder
        Objects exposing ``predict``.
    input_type : str
        Label used in the printed headers and file names (e.g. 'train').
    etf_ticker : str
        Ticker embedded in the file names.
    encoding_dim, l1_reg : optional
        Values embedded in the file names. When omitted, the module-level
        variables of the same names are used, which is what this function
        previously did implicitly.

    Raises
    ------
    NameError
        If a value is neither passed in nor defined at module level.
    """
    # Resolve the file-name components first so a missing value fails before
    # any prediction is run or anything is printed.
    module_level = globals()
    try:
        if encoding_dim is None:
            encoding_dim = module_level['encoding_dim']
        if l1_reg is None:
            l1_reg = module_level['l1_reg']
    except KeyError as missing:
        raise NameError(
            "save_output needs {} as an argument or a module-level variable".format(missing)
        ) from None

    print("{} Original : ".format(input_type))
    print(x)

    print("{} Predicted : ".format(input_type))
    x_predicted = autoencoder.predict(x)
    print(x_predicted)

    print("{} Original->Encoded->Decoded(Reconsturcted) : ".format(input_type))
    x_encoded = encoder.predict(x)
    x_reconstructed = decoder.predict(x_encoded)
    print(x_reconstructed)

    print("{} Encoded : ".format(input_type))
    print(x_encoded)

    x_filename = r"../data/etf_de/output/x_{}_{}_{}_{}"
    np.save(x_filename.format(input_type, encoding_dim, l1_reg, etf_ticker), x)

    x_encoded_filename = r"../data/etf_de/output/x_{}_encoded_{}_{}_{}"
    np.save(x_encoded_filename.format(input_type, encoding_dim, l1_reg, etf_ticker), x_encoded)

    x_predicted_filename = r"../data/etf_de/output/x_{}_predicted_{}_{}_{}"
    np.save(x_predicted_filename.format(input_type, encoding_dim, l1_reg, etf_ticker), x_predicted)


In [34]:
def count_gt_threshold(z, threshold):
    """Count the entries of ``z`` whose share of ``sum(z)`` is at least ``threshold``.

    Parameters
    ----------
    z : sequence of numbers
        Values to compare against their own total.
    threshold : float
        Minimum fraction of the total an entry must reach to be counted.

    Returns
    -------
    int
        Number of entries ``v`` with ``v / sum(z) >= threshold``.
    """
    total = sum(z)
    # The count does not depend on ordering, so no sort is required.
    return sum(1 for value in z if value / total >= threshold)

def sort_by_row(z):
    """Return a copy of ``z`` with every row sorted in descending order.

    Parameters
    ----------
    z : numpy.ndarray
        Two-dimensional array; each row is sorted independently.

    Returns
    -------
    numpy.ndarray
        Array with the same shape and dtype as ``z``. The result is always
        two-dimensional, including for single-row and zero-row inputs
        (the previous loop returned a flat list and ``None`` respectively).
    """
    z = np.asarray(z)
    # Sort ascending along each row in one vectorised call, then reverse the
    # column order; copy() returns a plain contiguous array rather than a view.
    return np.sort(z, axis=1)[:, ::-1].copy()

We run our production code on https://arc.wpi.edu/cluster-documentation/build/html/clusters.html for performance reasons.  For sample code purposes we run it only for 100 epochs.

In [35]:
# Experiment configuration read by the cells below.
# Weight of the L1 penalty passed to build_l1_ae_model.
l1_reg = 5e-5
# Width of the autoencoder's innermost layer.
encoding_dim = 100
# Number of training epochs; kept small for this sample, production runs use more.
epochs = 100 #we need to run this for more epochs
# Mini-batch size passed to autoencoder.fit.
batch_size = 1
# Ticker selecting which "<ticker>_returns.csv" file get_etf_data reads.
etf_ticker = 'SPY'

In [36]:
# Load the returns for etf_ticker: scaled train/test arrays (first 80% of rows
# are train), the unscaled DataFrame, and the row position of the split.
x_train, x_test, df_etf_ret, split_index = get_etf_data(etf_ticker)

In [37]:
# Number of input features = number of columns in the training matrix.
input_dim = x_train.shape[1]

Create the encoder, decoder, and autoencoder model

In [38]:
# Build the three models; only the autoencoder is compiled (inside build_l1_ae_model).
encoder, decoder, autoencoder = build_l1_ae_model(l1_reg, input_dim, encoding_dim)

In [39]:
# Print the layer-by-layer output shapes and parameter counts of the autoencoder.
autoencoder.summary()

Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 494)]             0         
_________________________________________________________________
dense_12 (Dense)             (None, 256)               126720    
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 128)               32896     
_________________________________________________________________
dropout_5 (Dropout)          (None, 128)               0         
_________________________________________________________________
lambda_4 (Lambda)            (None, 128)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 100)               1290

In [40]:
# Train the autoencoder to reconstruct its own input (x_train is both input and
# target). The returned history object holds the per-epoch loss used later.
history = autoencoder.fit(x_train, x_train, epochs=epochs, batch_size=batch_size, verbose=2)

Train on 468 samples
Epoch 1/100
468/468 - 2s - loss: 0.0189
Epoch 2/100
468/468 - 1s - loss: 0.0161
Epoch 3/100
468/468 - 1s - loss: 0.0137
Epoch 4/100
468/468 - 1s - loss: 0.0119
Epoch 5/100
468/468 - 1s - loss: 0.0104
Epoch 6/100
468/468 - 1s - loss: 0.0092
Epoch 7/100
468/468 - 1s - loss: 0.0082
Epoch 8/100
468/468 - 1s - loss: 0.0074
Epoch 9/100
468/468 - 1s - loss: 0.0068
Epoch 10/100
468/468 - 1s - loss: 0.0062
Epoch 11/100
468/468 - 1s - loss: 0.0057
Epoch 12/100
468/468 - 1s - loss: 0.0053
Epoch 13/100
468/468 - 1s - loss: 0.0049
Epoch 14/100
468/468 - 1s - loss: 0.0046
Epoch 15/100
468/468 - 1s - loss: 0.0043
Epoch 16/100
468/468 - 1s - loss: 0.0041
Epoch 17/100
468/468 - 1s - loss: 0.0039
Epoch 18/100
468/468 - 1s - loss: 0.0037
Epoch 19/100
468/468 - 1s - loss: 0.0035
Epoch 20/100
468/468 - 1s - loss: 0.0034
Epoch 21/100
468/468 - 1s - loss: 0.0032
Epoch 22/100
468/468 - 1s - loss: 0.0031
Epoch 23/100
468/468 - 1s - loss: 0.0030
Epoch 24/100
468/468 - 1s - loss: 0.0029
Epoc

In [41]:
# Dimension estimate from the innermost-layer activations of the test rows.
z = encoder.predict(x_test) # use x_test
# Estimate 1: sort each row in descending order, then average column-wise, and
# count the averaged entries that make up at least 1% of their total.
z_row_sorted = sort_by_row(z)
z_mu = np.mean(z_row_sorted, axis=0)
gte_sorted = count_gt_threshold(z_mu, 0.01)
        
# Estimate 2: average each innermost unit over the rows first, then sort those
# means in descending order and apply the same 1% share threshold.
z_mu_1 = sorted(np.mean(z, axis=0), reverse=True)
gte_dim = count_gt_threshold(z_mu_1, 0.01)
# Training loss recorded for the final epoch.
loss = history.history['loss'][-1]
                
# One comma-separated line: model tag, final loss, estimate 1, estimate 2.
print("AE,{:.4f},{},{}".format(loss, gte_sorted, gte_dim))

AE,0.0013,49,45


In [42]:
# save_model(encoding_dim, l1_reg, autoencoder, encoder, decoder, etf_ticker)

# save_output(x_train, autoencoder, encoder, decoder, 'train', etf_ticker)
# save_output(x_test, autoencoder, encoder, decoder, 'test', etf_ticker)

# save_history(encoding_dim, l1_reg, history, etf_ticker)