# Synthetic Dataset Dimension Estimation

The innermost layer of an autoencoder is refashioned into singular value proxies (SVPs).  These SVPs are used to estimate the dimension of the dataset.

How did we create the synthetic dataset?

We created a random array $\mathbf{C} = \mathbf{A} \times \mathbf{B}$ 
 - $\mathbf{A} \in \mathbb{R}^{5000\times5}$
 - $\mathbf{B} \in \mathbb{R}^{5\times784}$
 
We apply nonlinear transformations to $\mathbf{C}$ to create $\mathbf{D} = g(\mathbf{C})$, where $\mathbf{D} \in \mathbb{R}^{5000\times784}$ but has a known dimension of $5$.

In [94]:
import numpy as np
import pandas as pd
import os
import sys
import csv
import json

from datetime import datetime

from tensorflow.keras.layers import Input, Dense, Lambda, Dropout
from tensorflow.keras.models import Model
from keras import regularizers
from keras.callbacks import Callback
from keras import backend as K
import tensorflow as tf
from keras.regularizers import Regularizer

import scipy.sparse
from keras.models import load_model

The iPython notebook code is only being provided for convenience.  

All Linux code, along with the scripts, is available at https://github.com/nitishbahadur/book_chapter. Our Linux code is based on tensorflow 1.x.  The Python package requirements were exported and are available at https://github.com/nitishbahadur/book_chapter/blob/master/src/requirements.txt.

We run our production code on https://arc.wpi.edu/cluster-documentation/build/html/clusters.html for performance reasons.

In [95]:
# Put TensorFlow into 1.x-compatible mode before any model is built
# (the accompanying Linux code is based on tensorflow 1.x).
# The return value of get_default_graph() is discarded here.
tf.compat.v1.get_default_graph()
# Turn off TensorFlow 2.x behaviors.
tf.compat.v1.disable_v2_behavior()
# Turn off eager execution.
# NOTE(review): presumably required because the custom loss closes over a
# symbolic tensor from the model graph -- confirm.
tf.compat.v1.disable_eager_execution()

The synthetic data is loaded from the data folder.  The large number in the file name is a timestamp, in milliseconds, of when this process was run, and $20$ is the linear dimension estimated using PCA.  We treat the first $3000$ instances as the training dataset and the remaining $2000$ instances as the test dataset.

In [96]:
def get_synthetic_data(path=None, n_train=3000):
    """Load the synthetic dataset, scale it and split it into train/test parts.

    Parameters
    ----------
    path : str, optional
        Location of the ``.npy`` file to load.  When omitted, the file
        ``D_1586038282841_dim_20.npy`` in ``../data/synthetic_data/input/``
        is used, exactly as before.
    n_train : int, optional
        Number of leading rows that form the training split (default 3000).
        All remaining rows form the test split.

    Returns
    -------
    (x_train, x_test) : tuple of numpy.ndarray
        Both are ``float32`` row slices of the scaled data.
    """
    if path is None:
        path = r'../data/synthetic_data/input/D_{}_dim_{}.npy'.format(1586038282841, 20)

    X = np.load(path, allow_pickle=False)
    X = X.astype('float32')

    # Scale by the overall value range.  The data is only divided by
    # (max - min); the minimum is NOT subtracted, so the result spans an
    # interval of width 1 but does not necessarily start at 0.
    X = X / (np.max(X) - np.min(X))

    x_train = X[0:n_train, :]
    x_test = X[n_train:, :]
    return x_train, x_test

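Build the autoencoder model, whose innermost layer uses a sigmoid activation function.  The autoencoder also uses dropout layers to control overfitting.  We use a custom loss function: the mean squared reconstruction error plus an L1 penalty (weighted by `l1_reg`) on the L2-normalized innermost-layer activations.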

In [97]:
def build_lite_ae_model(l1_reg, encoding_dim, layer1_dropout, layer2_dropout):
    """Build the encoder, decoder and autoencoder that share one set of layers.

    Architecture: 784 -> 392 -> 128 -> ``encoding_dim`` -> 128 -> 392 -> 784,
    with a dropout layer after each of the two encoder hidden layers, a
    sigmoid innermost layer and a tanh output layer.

    Parameters
    ----------
    l1_reg : weight of the L1 penalty passed to ``mse_regularized_loss``.
    encoding_dim : number of units in the innermost layer.
    layer1_dropout, layer2_dropout : dropout rates applied after the 392-unit
        and the 128-unit encoder layers respectively.

    Returns
    -------
    (encoder, decoder, autoencoder).  Only ``autoencoder`` is compiled.
    """
    # Encoder stack.
    inputs = Input(shape=(784,))
    hidden = Dense(392, activation='relu')(inputs)
    hidden = Dropout(layer1_dropout)(hidden)
    hidden = Dense(128, activation='relu')(hidden)
    hidden = Dropout(layer2_dropout)(hidden)

    # The innermost layer is fed L2-normalized activations.
    hidden_unit_norm = Lambda(lambda x: K.l2_normalize(x, axis=1))(hidden)
    latent = Dense(encoding_dim, activation='sigmoid')(hidden_unit_norm)
    # This normalized copy is used only by the loss penalty; the decoder and
    # the returned encoder both work with the un-normalized `latent`.
    latent_unit_norm = Lambda(lambda x: K.l2_normalize(x, axis=1))(latent)

    # Decoder stack.
    reconstruction = latent
    for units, activation in ((128, 'relu'), (392, 'relu'), (784, 'tanh')):
        reconstruction = Dense(units, activation=activation)(reconstruction)

    # Full model and encoder-only model.
    autoencoder = Model(inputs, reconstruction)
    encoder = Model(inputs, latent)

    # Stand-alone decoder: re-apply the autoencoder's last three layers
    # (the decoder Dense layers) to a fresh input, so weights are shared.
    latent_input = Input(shape=(encoding_dim,))
    decoded = latent_input
    for shared_layer in autoencoder.layers[-3:]:
        decoded = shared_layer(decoded)
    decoder = Model(latent_input, decoded)

    penalized_loss = mse_regularized_loss(latent_unit_norm, l1_reg)
    autoencoder.compile(optimizer='adadelta', loss=penalized_loss)
    return encoder, decoder, autoencoder

def mse_regularized_loss(encoded_layer, lambda_):
    """Return a Keras loss: squared error plus an L1 penalty on a layer.

    Parameters
    ----------
    encoded_layer : tensor whose absolute values are summed for the penalty.
    lambda_ : weight applied to that sum.

    Returns
    -------
    A function ``loss(y_true, y_pred)``.  The penalty term is a single scalar
    that is added to every squared-error element before the mean is taken.
    """
    def loss(y_true, y_pred):
        squared_error = K.square(y_pred - y_true)
        sparsity_penalty = lambda_ * K.sum(K.abs(encoded_layer))
        return K.mean(squared_error + sparsity_penalty)

    return loss

The utility functions provided below are equivalent to the Python code we use on the HPC cluster.  We provide them here for completeness.

In [98]:
def save_model(encoding_dim, l1_reg, autoencoder, encoder, decoder):
    """Save the three models as ``.h5`` files in the synthetic-data output folder.

    Each file name embeds ``encoding_dim`` and ``l1_reg``.  The models are
    saved in the order autoencoder, encoder, decoder, and a confirmation line
    is printed after each save.
    """
    save_plan = (
        (autoencoder,
         r"../data/synthetic_data/output/autoencoder_l1_reg_{}_{}.h5",
         "autoencoder saved!!!"),
        (encoder,
         r"../data/synthetic_data/output/encoder_l1_reg_{}_{}.h5",
         "encoder saved!!!"),
        (decoder,
         r"../data/synthetic_data/output/decoder_l1_reg_{}_{}.h5",
         "decoder saved!!!"),
    )
    for model, path_template, done_message in save_plan:
        model.save(path_template.format(encoding_dim, l1_reg))
        print(done_message)

def save_history(encoding_dim, l1_reg, history):
    """Write ``history.history`` as JSON to the synthetic-data output folder.

    The file name embeds ``encoding_dim`` and ``l1_reg`` and has no extension.
    """
    target_path = r"../data/synthetic_data/output/history_l1_{}_{}".format(encoding_dim, l1_reg)
    with open(target_path, 'w') as out_file:
        json.dump(history.history, fp=out_file)

def save_intermediate_training(x, encoder, decoder, epoch):
    """Save a snapshot of the encoded and reconstructed versions of ``x``.

    ``x`` is passed through ``encoder`` and then ``decoder``; both results are
    written with ``np.save``.  The file names embed the fixed tag ``'train'``,
    the given ``epoch`` and the module-level ``encoding_dim`` and ``l1_reg``
    (which therefore must be defined before this function is called).
    """
    split_tag = 'train'
    latent = encoder.predict(x)
    rebuilt = decoder.predict(latent)

    snapshots = (
        (r"../data/synthetic_data/output/x_{}_{}_encoded_{}_{}", latent),
        (r"../data/synthetic_data/output/x_{}_{}_reconstructed_{}_{}", rebuilt),
    )
    for path_template, array in snapshots:
        np.save(path_template.format(split_tag, epoch, encoding_dim, l1_reg), array)


def save_output(x, autoencoder, encoder, decoder, layer1_dropout, layer2_dropout, input_type):
    """Print and save the original, encoded and predicted arrays for ``x``.

    Printed, in order: ``x``, the autoencoder prediction, the
    encoder-then-decoder reconstruction, and the encoded representation.
    Saved with ``np.save``: ``x``, the encoded representation and the
    autoencoder prediction (the reconstruction is printed but not saved).

    File names embed ``input_type``, the module-level ``encoding_dim`` and
    ``l1_reg``, and the two dropout rates.
    """
    print("{} Original : ".format(input_type))
    print(x)

    print("{} Predicted : ".format(input_type))
    predicted = autoencoder.predict(x)
    print(predicted)

    print("{} Original->Encoded->Decoded(Reconsturcted) : ".format(input_type))
    latent = encoder.predict(x)
    print(decoder.predict(latent))

    print("{} Encoded : ".format(input_type))
    print(latent)

    # All three file names are filled with the same five values.
    run_tag = (input_type, encoding_dim, l1_reg, layer1_dropout, layer2_dropout)
    outputs = (
        (r"../data/synthetic_data/output/x_{}_{}_{}_{}_{}", x),
        (r"../data/synthetic_data/output/x_{}_encoded_{}_{}_{}_{}", latent),
        (r"../data/synthetic_data/output/x_{}_predicted_{}_{}_{}_{}", predicted),
    )
    for path_template, array in outputs:
        np.save(path_template.format(*run_tag), array)

class SaveIntermediateTrainingOutput(Callback):
    """Keras callback that snapshots encoded/reconstructed data during training.

    Whenever the epoch index passed to ``on_epoch_end`` is a multiple of 100,
    ``x`` is run through the encoder and decoder and both results are saved
    via ``save_intermediate_training``.

    Parameters
    ----------
    x : data to encode and reconstruct at each snapshot.
    encoder, decoder : models used to produce the snapshot.
    """

    def __init__(self, x, encoder, decoder):
        # super(Callback, self).__init__() would start the MRO lookup *after*
        # Callback and so skip Callback.__init__; call the real parent instead.
        super().__init__()
        self.x = x
        self.encoder = encoder
        self.decoder = decoder
        # Number of snapshots taken so far, plus one.
        self.counter = 1

    def on_epoch_end(self, epoch, logs=None):
        """Save a snapshot when ``epoch`` is a multiple of 100.

        ``logs`` is accepted for compatibility with the Keras callback
        interface and is not used.
        """
        if epoch % 100 == 0:
            # The file label is counter * (epoch + 1).
            # NOTE(review): if the same instance is reused across repeated
            # fit(epochs=100) calls, only epoch 0 presumably matches, making
            # the label 1, 2, 3, ... -- confirm against the caller.
            snapshot_label = self.counter * (epoch + 1)
            print("File counter: {}".format(snapshot_label))
            save_intermediate_training(self.x, self.encoder, self.decoder, snapshot_label)
            self.counter = self.counter + 1


Estimate the dimension by counting how many singular value proxies account for at least 1% of the sum of all proxies.

In [99]:
def count_gt_threshold(z, threshold):
    """Count the entries of ``z`` whose share of the total is >= ``threshold``.

    Parameters
    ----------
    z : sequence of numbers (e.g. mean singular value proxies).
    threshold : minimum fraction of ``sum(z)`` an entry must hold to be counted.

    Returns
    -------
    int
        Number of entries ``v`` with ``v / sum(z) >= threshold``.  Returns 0
        when ``z`` is empty or sums to zero, since no share is defined then.
    """
    total = sum(z)
    # Guard against division by zero for a non-empty input that sums to zero.
    if total == 0:
        return 0
    # Counting does not depend on order, so no sort is needed.
    return sum(1 for value in z if value / total >= threshold)

def sort_by_row(z):
    """Sort every row of a 2-D array in descending order.

    Parameters
    ----------
    z : 2-D array-like of shape (n_rows, n_cols).

    Returns
    -------
    numpy.ndarray
        Array of the same shape and dtype as ``z`` with each row sorted from
        largest to smallest.  The result is always 2-D, including for a single
        row or zero rows, so ``np.mean(result, axis=0)`` averages by rank.
    """
    # One vectorized sort replaces the per-row sorted() + repeated vstack,
    # which copied the accumulated result on every iteration.
    ascending = np.sort(np.asarray(z), axis=1)
    return ascending[:, ::-1]

For convenience, we provide the default values from the run_synthetic_de.py script.  The script is used to run the DE process on the High Performance Computing cluster at WPI.

In [100]:
# sparsity parameter: weight of the L1 penalty term in the custom loss
l1_reg = 1.1e-2

# number of nodes in innermost hidden layer
encoding_dim = 16

# number of 100-epoch training rounds to run (not the raw epoch count);
# the total number of epochs is num_epochs * 100.
# DE converges slowly.
num_epochs = 30

# the batch size
batch_size = 64

# dropout rate after the first encoder layer: 30% of nodes are dropped out
layer1_dropout = 0.3

# dropout rate after the second encoder layer: 30% of nodes are dropped out
layer2_dropout = 0.3

Load synthetic data

In [101]:
# x_train holds the first 3000 rows of the scaled data, x_test the remaining rows.
x_train, x_test = get_synthetic_data()

Create the encoder, decoder, and autoencoder model

In [102]:
# The three models share layers; only `autoencoder` is compiled for training.
encoder, decoder, autoencoder = build_lite_ae_model(l1_reg, encoding_dim, layer1_dropout, layer2_dropout)

In [103]:
# Per-snapshot results, keyed by the cumulative epoch count (100, 200, ...):
#   svp_dict_ -> comma-separated, descending mean singular value proxies
#   dim_dict_ -> dimension estimate (the rank-averaged count, gte_sorted)
svp_dict_ = {}
dim_dict_ = {}
for i in range(1, num_epochs+1):
        # Fit the same autoencoder object for another 100 epochs (input == target).
        history = autoencoder.fit(x_train, x_train, epochs=100, batch_size=batch_size, verbose=0)
        # Innermost-layer activations for the test set: one row per sample,
        # one column per innermost node.
        z = encoder.predict(x_test)
        # Estimate 1: sort each sample's activations in descending order,
        # average by rank across samples, then count the ranks that hold at
        # least 1% of the total.
        z_row_sorted = sort_by_row(z)
        z_mu = np.mean(z_row_sorted, axis=0)
        gte_sorted = count_gt_threshold(z_mu, 0.01)
        
        # Estimate 2: average each innermost node across samples first, then
        # sort those means in descending order and apply the same 1% count.
        z_mu_1 = sorted(np.mean(z, axis=0), reverse=True)
        gte_dim = count_gt_threshold(z_mu_1, 0.01)
        # Training loss reported for the last epoch of this 100-epoch round.
        loss = history.history['loss'][-1]
        # Columns: tag, cumulative epochs, loss, estimate 1, estimate 2.
        print("AE,{},{:.4f},{},{}".format(i*100, loss, gte_sorted, gte_dim))
        
        # Per-node means (estimate 2 ordering), rounded to 4 decimals.
        converted_list = [str(np.round(element, 4)) for element in z_mu_1]
        svps = ",".join(converted_list)    
        print(svps)
        print()
        
        # save it for plotting later
        svp_dict_[i*100] = svps
        dim_dict_[i*100] = gte_sorted

AE,100,2.7816,16,16
0.6259,0.6203,0.5744,0.5638,0.5514,0.5344,0.5181,0.4979,0.4895,0.4572,0.4375,0.4268,0.4252,0.3863,0.3641,0.3258

AE,200,2.7595,16,16
0.6646,0.6497,0.5931,0.5839,0.5625,0.5354,0.5306,0.493,0.4694,0.4308,0.4199,0.3925,0.3923,0.3693,0.3178,0.2994

AE,300,2.7303,16,16
0.6987,0.6775,0.6153,0.6074,0.5779,0.5449,0.5443,0.4863,0.4524,0.3995,0.3919,0.357,0.3553,0.3438,0.2778,0.2694

AE,400,2.6887,16,16
0.7331,0.7073,0.6431,0.6337,0.5951,0.5567,0.5555,0.4749,0.4275,0.3622,0.3571,0.3152,0.3132,0.3116,0.2385,0.2378

AE,500,2.6327,16,16
0.7662,0.7383,0.6718,0.6628,0.6137,0.5682,0.5655,0.4545,0.3931,0.3179,0.3158,0.2752,0.2691,0.2677,0.2074,0.2008

AE,600,2.5649,16,16
0.7964,0.7678,0.7013,0.6921,0.6344,0.5782,0.5739,0.4247,0.3486,0.2712,0.2702,0.2363,0.2248,0.2238,0.1779,0.1667

AE,700,2.4875,16,16
0.8221,0.7938,0.7293,0.7209,0.6559,0.5852,0.5794,0.3824,0.2974,0.2269,0.2237,0.1986,0.1845,0.1839,0.1513,0.1374

AE,800,2.4062,16,16
0.8441,0.8168,0.7548,0.7477,0.6761,0.587,0.5797,0.3

The smaller singular value proxies keep decreasing as training proceeds, while the largest ones grow.  We take a snapshot every 100 epochs.