In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
import numpy as np
from scipy.stats import norm

In [3]:
from keras.layers import Input, Dense, Activation, Lambda, Embedding, Reshape, RepeatVector
from keras.layers import merge
from keras.models import Model
from keras import regularizers
from keras.optimizers import SGD, RMSprop, Adam
from keras import backend as K
from keras import objectives

Using Theano backend.
Using gpu device 0: Tesla K80 (CNMeM is disabled)


In [42]:
#from keras.layers import stack

### References :
##### VAE with GMM : https://arxiv.org/pdf/1611.05148.pdf
##### Categorical VAE parameterization : http://blog.evjang.com/2016/11/tutorial-categorical-variational.html
##### Auto-Encoding Variational Bayes : https://arxiv.org/abs/1312.6114
##### Tutorial from Oliver Durr : https://home.zhaw.ch/~dueo/bbs/files/vae.pdf
##### Building autoencoders in keras : https://blog.keras.io/building-autoencoders-in-keras.html

In [4]:
from keras.datasets import mnist
import numpy as np

# Fetch the MNIST train/test split (the label arrays are loaded but not used
# in the cells visible here).
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Rescale the pixel values by 1/255 as float32, then flatten every sample
# into a single row vector (all non-batch axes are merged into one).
x_train = (x_train.astype('float32') / 255.).reshape(
    (x_train.shape[0], np.prod(x_train.shape[1:])))
x_test = (x_test.astype('float32') / 255.).reshape(
    (x_test.shape[0], np.prod(x_test.shape[1:])))

## Variational Auto-Encoder with Gaussian Mixture generator

In [5]:
# --- Hyper-parameters shared by the cells below ---

# Batching / input size. batch_size is also baked into the Input
# batch_shape and into the shape of the sampled noise tensors.
batch_size = 16
original_dim = 784  # length of one flattened input vector

# Latent space and mixture prior.
latent_dim = 2
NM = 10  # number of mixture clusters

# Width of the two hidden layers (used by both encoder and decoders).
intermediate_dim1 = intermediate_dim2 = 500

# Training / sampling settings.
nb_epoch = 25
epsilon_std = 1.0  # std of the Gaussian noise in the reparameterization trick

#### Reparameterization trick

In [54]:
# Approximate Multinomial distrib. with a Gumbel-Softmax distrib.
# see documentation at :
# http://blog.evjang.com/2016/11/tutorial-categorical-variational.html
def sampling_categorical(c_logits):
    """Draw a (batch_size, NM) tensor of Gumbel(0, 1) noise.

    NOTE: as written, ``c_logits`` is not used -- the function returns the
    noise alone. The two Gumbel-Softmax variants that do combine the noise
    with ``c_logits`` are kept below as disabled alternatives.
    """
    temperature = 0.1  # the lower it is, the sharper the function (only used by the disabled variants)
    tiny = 1e-20  # keeps both logarithms away from log(0)
    uniform_draw = K.random_uniform(shape=(batch_size, NM), low=0, high=1)
    neg_log_u = -K.log(uniform_draw + tiny)
    gumbel_noise = -K.log(neg_log_u + tiny)  # sample from Gumbel(0, 1)
    # Disabled variants:
    #   return K.softmax((c_logits + gumbel_noise) / temperature)
    #   return (c_logits + gumbel_noise) / temperature
    return gumbel_noise

def sampling_normal(args):
    """Reparameterized Gaussian sample: ``z_mean + sigma * noise``.

    ``args`` is the pair ``(z_mean, z_log_var)``; ``sigma`` is
    ``exp(z_log_var / 2)`` and ``noise`` is drawn from a normal with mean 0
    and std ``epsilon_std`` with shape ``(batch_size, latent_dim)``.
    """
    z_mean, z_log_var = args
    noise = K.random_normal(shape=(batch_size, latent_dim),
                            mean=0., std=epsilon_std)
    sigma = K.exp(z_log_var / 2)  # log-variance -> standard deviation
    return z_mean + sigma * noise

In [164]:
# Encoder: two softplus hidden layers feeding two linear heads, one for the
# mean and one for the log-variance used to sample the latent vector z.
# The batch size is fixed in the input shape because the sampling functions
# draw noise with a hard-coded (batch_size, ...) shape.
x = Input(batch_shape=(batch_size, original_dim))
h1 = Dense(intermediate_dim1, activation='softplus')(x)
h2 = Dense(intermediate_dim2, activation='softplus')(h1)
z_mean = Dense(latent_dim, activation=None)(h2)
z_log_var = Dense(latent_dim, activation=None)(h2)


# Extra one-column input. It is only consumed by the disabled `else` branch
# of the mixture loop below, but it is still passed as a model input when
# the debug model is built.
ONEINT = Input(batch_shape=(batch_size, 1))


# Reparameterized draw of z from (z_mean, z_log_var).
# note that "output_shape" isn't necessary with the TensorFlow backend
z = Lambda(sampling_normal, output_shape=(latent_dim,))([z_mean, z_log_var])


# One embedding row of size latent_dim per mixture component (NM rows):
# E_mu is looked up for the component mean, E_log_var for its log-variance.
E_mu = Embedding(NM, latent_dim, input_length=1) # WARNING : needs near-0 initialization
E_log_var = Embedding(NM, latent_dim, input_length=1) # WARNING : needs near-1 initialization

def log_pdf_normal(args):
    """Per-sample log-density of ``z`` under a diagonal Gaussian, up to a constant.

    Args:
        args: sequence ``(mu, log_var, z)`` of tensors that share a trailing
            feature axis. ``log_var`` is the log of the per-dimension variance.

    Returns:
        A tensor whose trailing axis has been reduced to size 1, holding
        ``-0.5 * sum((mu - z)**2 / exp(log_var)) - 0.5 * sum(log_var)``
        for each row.

    The additive constant ``-0.5 * d * log(2 * pi)`` of the true log-pdf is
    left out; it is identical for every component and therefore cancels when
    the values are fed to a softmax.
    """
    mu, log_var, z = args
    diff = mu - z
    # Reduce over the feature axis only. Summing without ``axis`` collapsed
    # the whole batch into a single scalar, whereas the Lambda wrapping this
    # function declares one value per sample (output_shape=(1,)).
    scaled_sq_dist = K.sum(diff * diff / K.exp(log_var), axis=-1, keepdims=True)
    log_det = K.sum(log_var, axis=-1, keepdims=True)
    return -0.5 * scaled_sq_dist - 0.5 * log_det  # + log(constant)

# For every mixture component i, look up its (mu, log_var) in the embeddings
# and evaluate log_pdf_normal of the sampled z under that component.
logpdfs = []
for i in range(NM):
    # batch with constant i value
    if True:
        # Frozen Dense layer with an all-zero kernel and bias i: whatever x
        # is, its output is the constant i, which is then used as the index
        # into the embeddings.
        dummy_d = Dense(1, weights=[np.zeros((original_dim, 1)),i*np.ones((1,))], input_dim=original_dim)
        dummy_d.trainable = False
        ohe_i = dummy_d(x) 
    else:
        # Unreachable while the condition above is the literal True.
        ohe_i = ONEINT
    # The embedding lookup has input_length=1; Reshape drops that axis.
    mu = Reshape((latent_dim,))( E_mu(ohe_i) )
    log_var = Reshape((latent_dim,))( E_log_var(ohe_i) )
    #logpdfs.append( mu )
    logpdfs.append( Reshape((1,))( Lambda(log_pdf_normal, output_shape=(1,))([mu, log_var, z]) ) )


# Concatenate the NM per-component values side by side, then normalise.
# NOTE(review): after Reshape((NM, 1)) the last axis has length 1 -- confirm
# which axis the softmax below normalises over; it looks like it is meant to
# run across the NM components.
logpdfmat = Reshape((NM,1))( merge(logpdfs, mode='concat', concat_axis=1) ) # .summary to check axis
c_logits = Activation('softmax')(logpdfmat)
# WARNING : need to compute c_logits from z
# NOTE: sampling_categorical currently returns only Gumbel noise, so `c` is a
# softmax over noise and does not yet depend on c_logits.
c = Activation('softmax')( Lambda(sampling_categorical, output_shape=(NM,))(c_logits) )


##############################

#inputs = Input(shape=(nn_input_dim,))
# Auto-encoder setting: the reconstruction target is the input itself.
inputs = x
#outputs = Input(shape=(8,)) # in AE model output=input=x
outputs = x

# Build NM independent decoder stacks (one per mixture component), all fed
# the same sampled z. The names decoder_h1 / decoder_h2 / decoder_proba are
# rebound on every pass, so after the loop they refer to the layers of the
# last component only.
predictions = []
for i in range(NM):
    decoder_h1 = Dense(intermediate_dim1, activation='softplus')
    decoder_h2 = Dense(intermediate_dim2, activation='softplus')
    decoder_proba = Dense(original_dim, activation='sigmoid')
    h1_decoded = decoder_h1(z)
    h2_decoded = decoder_h2(h1_decoded)
    x_decoded_proba = decoder_proba(h2_decoded) # a Bernoulli dist. has a single prob. parameter
    predictions.append( x_decoded_proba )


# Stack the NM reconstructions into one tensor with a component axis.
predmat = Reshape((NM,original_dim))( merge(predictions, mode='concat', concat_axis=1) ) #.summary to check axis

# Element-wise -(target * log(prediction)), with the target repeated NM times
# so it lines up with the component axis of predmat.
# NOTE(review): only the x*log(p) term is computed here; the (1-x)*log(1-p)
# term of a binary cross-entropy is absent -- confirm this is intended.
deltas = merge([RepeatVector(NM)(outputs), predmat], output_shape=(NM,original_dim), mode=lambda x: -(x[0] * K.log(x[1])))

# Sum over axis 2, leaving one value per (sample, component) pair.
deltasums = Lambda(lambda x: K.sum(x, axis=2), output_shape=lambda s: (s[0], s[1]))(deltas)# .summary to check axis

# Optionally map each summed error e to exp(-e).
hinton_trick = True # see "Adaptive Mixtures of Local Experts"
if hinton_trick:
    Hinton1 = Lambda(lambda x: K.exp(-x), output_shape=lambda s: s)
    deltasums = Hinton1(deltasums)
# NOTE: everything between the triple quotes below is disabled code kept as a
# bare string literal. None of it is executed, so `gate`, `errors`,
# `vae_loss` and `vae` are never defined by this cell.
'''
# WARNING : gate is just the sampled c vector
gate = c

errors = merge([gate, deltasums], mode='dot')
##############################



def vae_loss(x, x_decoded_proba):
    xent_loss = original_dim * objectives.binary_crossentropy(x, x_decoded_proba)
    kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    return xent_loss + kl_loss

vae = Model(x, x_decoded_proba)
vae.compile(optimizer=Adam(1e-3), loss=vae_loss)
'''



In [167]:
# Debug model: expose one intermediate tensor of the graph for inspection.
# Replace `c` with logpdfmat, c_logits, predmat, deltas or deltasums (or with
# a list such as [logpdfs[0], ..., logpdfs[4]]) to look at the other stages;
# mdl.summary() prints the layer shapes.
mdl = Model([x, ONEINT], c)

# Run the first batch of training rows through the model, together with a
# column of integer ones for the ONEINT input, and display the first three
# rows of the result. (mdl.predict(...).shape / x_train[0:batch_size,:].shape
# are handy for checking dimensions.)
onearr = np.ones((batch_size, 1), dtype=int)
np.array(mdl.predict([x_train[:batch_size, :], onearr]))[:3, :]

array([[ 0.12574185,  0.06108518,  0.21009727,  0.05462784,  0.04958955,
         0.05703476,  0.0801391 ,  0.07700752,  0.01824195,  0.26643494],
       [ 0.29096895,  0.17153522,  0.05922543,  0.02664155,  0.02271858,
         0.04022451,  0.06268827,  0.08329213,  0.1022932 ,  0.14041215],
       [ 0.02728114,  0.03183626,  0.03543318,  0.015043  ,  0.21386734,
         0.07227509,  0.09050404,  0.13416098,  0.03568605,  0.34391293]], dtype=float32)

In [168]:
# Train the auto-encoder with the inputs as their own targets.
# NOTE: `vae` is only created inside the triple-quoted (disabled) part of the
# model cell, so as the notebook stands this call raises NameError.
vae.fit(
    x_train, x_train,
    batch_size=batch_size,
    nb_epoch=1,  # may have to increase this value
    shuffle=True,
    validation_data=(x_test, x_test))

NameError: name 'vae' is not defined

In [170]:
# build a model to project inputs on the latent space
# (only the mean head is exposed; the std-dev and sampled-z variants are
# left disabled below)
encoder_mean = Model(x, z_mean)
#encoder_stdev = Model(x, K.exp(z_log_var / 2))

#encoder = Model(x, z)

# build a digit generator that can sample from the learned distribution
# NOTE: decoder_h1 / decoder_h2 / decoder_proba are rebound on every pass of
# the decoder loop, so here they are the layers of the last mixture component
# only -- the generator reuses that single decoder stack.
decoder_input = Input(shape=(latent_dim,))
_h1_decoded = decoder_h1(decoder_input)
_h2_decoded = decoder_h2(_h1_decoded)
_x_decoded_proba = decoder_proba(_h2_decoded)
generator = Model(decoder_input, _x_decoded_proba)