In [2]:
import random
import pandas as pd
import numpy as np
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dropout, Dense
import keras as k
import keras.backend as K
from scipy.stats import zscore

In [48]:
def masked_mae(X_true, X_pred, mask):
    masked_diff = X_true[mask] - X_pred[mask]
    return np.mean(np.abs(masked_diff))


def fill(self, missing_mask):
  self.data[missing_mask] = -1


def create_missing_mask(X):
  if X.dtype != "f" and X.dtype != "d":
      X = X.astype(float)
  return np.isnan(X.data)


def bool_to_binary(matrix):
  """
  Converts a boolean matrix to a binary matrix

  :param matrix: a boolean matrix
  :return: a binary matrix
  """
  binary_matrix = []
  for row in matrix:
      binary_row = []
      for value in row:
          binary_row.append(1 if value else 0)
      binary_matrix.append(binary_row)
  return binary_matrix


def replace_nan(data, replacement):
  """
  Replace NaN values in a given array with a specific number.

  Args:
  data (array): The data to be processed.
  replacement (float or int): The number to replace NaN values with.

  Returns:
  The processed data with NaN values replaced by the specified number.
  """

  if not isinstance(data, np.ndarray):
      raise ValueError("Unsupported data type. Function supports numpy arrays only.")

  data[np.isnan(data)] = replacement

  return data


def loss_func(z_mean, z_log_var):
  
    def vae_reconstruction_loss(input_and_mask, y_pred):
        X_values = input_and_mask[:, :n_dims]

        missing_mask = input_and_mask[:, n_dims:]

        observed_mask = 1 - missing_mask

        X_values_observed = X_values * observed_mask

        pred_observed = y_pred * observed_mask

        squared_diff = K.square(pred_observed - X_values_observed)
        mse = K.sum(squared_diff, axis=-1)/K.sum(observed_mask, axis=-1)

        return mse

    def vae_kl_loss(z_mean, z_log_var):
        X_values = z_mean[:, :n_dims]

        missing_mask = z_mean[:, n_dims:]

        observed_mask = 1 - missing_mask

        z_mean = X_values * observed_mask
        
        kl_loss = -0.5 * k.backend.sum(1.0 + z_log_var - k.backend.square(z_mean) - k.backend.exp(z_log_var), axis=1)
        return kl_loss

    def vae_loss(y_true, y_predict):
        reconstruction_loss = vae_reconstruction_loss(y_true, y_predict)
        kl_loss = vae_kl_loss(y_true, y_predict)
        loss = reconstruction_loss + kl_loss
        return loss
    return vae_loss

# Define the sampling layer
def sampling(args):
    z_mean, z_log_var = args
    epsilon = k.backend.random_normal(shape=(k.backend.shape(z_mean)[0], latent_dim), mean=0., stddev=1.)
    return z_mean + k.backend.exp(0.5 * z_log_var) * epsilon

In [49]:
# Read the Data
x = pd.read_csv('../../data/MCAR/mcar10/mcar_10.csv', index_col = 0)
column_names = list(x.columns.values)
labels = x['group']
X = np.array(x.iloc[:, 0:999])  
n_dims = X.shape[1]

latent_dim = 2

In [50]:
#Define Variational Auto Encoder

encoder_input = k.Input(shape=(2*n_dims,))
x = k.layers.Dense(128, activation='relu')(encoder_input)
x = k.layers.Dropout(0.2)(x)
x = k.layers.Dense(32, activation='relu')(x)
x = k.layers.Dropout(0.2)(x)
z_mean = k.layers.Dense(latent_dim, name='z_mean')(x)
z_log_var = k.layers.Dense(latent_dim, name='z_log_var')(x)
z = k.layers.Lambda(sampling, name='z')([z_mean, z_log_var])

# Define decoder layers
decoder_input = k.layers.Input(shape=(latent_dim,))
x = k.layers.Dense(32, activation='relu')(decoder_input)
x = k.layers.Dense(128, activation='relu')(x)
x = k.layers.Dense(n_dims, activation='sigmoid')(x)
decoder_output = k.layers.Reshape((n_dims,))(x)

decoder = k.models.Model(decoder_input, decoder_output, name='decoder')

# Define the VAE model
encoder_output = [z_mean, z_log_var, z]
encoder = k.Model(encoder_input, encoder_output, name='encoder')
output = decoder(encoder_output[2])
vae = k.Model(encoder_input, output, name='vae')

# Define the VAE loss function
reconstruction_loss = loss_func(z_mean, z_log_var)
#kl_loss = -0.5 * k.backend.mean(1 + z_log_var - k.backend.square(z_mean) - k.backend.exp(z_log_var), axis=-1)
#vae_loss = k.backend.mean(reconstruction_loss + kl_loss)


vae.compile(optimizer='adam', loss = reconstruction_loss)

In [51]:
mask = np.array(bool_to_binary(create_missing_mask(X)))
X_no_na = replace_nan(X, replacement = 1)
input_with_mask = np.hstack([X_no_na, mask])

vae.fit(x=input_with_mask, y=input_with_mask, epochs=40, batch_size=16, verbose=1)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7f7e118a0fd0>

In [44]:
y = vae.predict(input_with_mask)



In [47]:
X

array([[4.41195763, 4.25944405, 1.        , ..., 4.41195763, 2.8888282 ,
        6.69358422],
       [4.86672759, 5.13119538, 8.51153343, ..., 5.35461077, 2.50996399,
        5.35461077],
       [5.3953902 , 5.09540605, 8.60061913, ..., 4.95545134, 1.36757646,
        6.67445372],
       ...,
       [5.50354872, 4.75708098, 8.58458809, ..., 4.75708098, 2.54161956,
        6.66258138],
       [5.51977628, 4.97810536, 8.06867908, ..., 4.70777289, 2.67480735,
        5.82401733],
       [5.22955271, 4.35592154, 8.0035662 , ..., 4.66368938, 2.55316663,
        7.49140028]])