# Build a variational autoencoder

Based on the ability to generate UK car registration numbers, one can build a dataset of training and test data. With a suitable vectorizer, one can go ahead and make an autoencoder.

The decoder part of the variational encoder can then be used to generate new reg numbers.

### References

https://blog.keras.io/building-autoencoders-in-keras.html

https://towardsdatascience.com/understanding-variational-autoencoders-vaes-f70510919f73

In [1]:
import sys
import numpy as np
import keras
from keras import regularizers
from keras import layers
from keras import backend
from keras import optimizers
import random

if '..' not in sys.path:
    sys.path.append('..')
from car_reg_generator.car_reg_generator.uk_reg import UkRegGenerator
from car_reg_generator.car_reg_generator.uk_reg import UkRegDvlaVectorizer

### Data generation

In [2]:
n_train = 10000
n_val = 1000
n_test = 1000

random.seed(0)
g = UkRegGenerator()
v = UkRegDvlaVectorizer()

train_strs = [g.get_reg() for _ in range(n_train)]
train_vecs = np.array([v.vectorize(x) for x in train_strs])
val_strs = [g.get_reg() for _ in range(n_val)]
val_strs = [x for x in val_strs if x not in set(train_strs)]  # sanity check
val_vecs = np.array([v.vectorize(x) for x in val_strs])
test_strs = [g.get_reg() for _ in range(n_test)]
test_strs = [x for x in test_strs if x not in set(train_strs) | set(val_strs)]  # sanity check
test_vecs = np.array([v.vectorize(x) for x in test_strs])

vec_length = len(train_vecs[0])
print('input vector length = ' + str(vec_length))
print('actual number of validation samples = ' + str(len(val_strs)))
print('actual number of test samples = ' + str(len(test_strs)))

input vector length = 150
actual number of validation samples = 1000
actual number of test samples = 1000


### Variational autoencoder

Design the architecture

In [3]:
latent_dim = 20
original_dim = vec_length

### Define the encoder
inputs = keras.Input(shape=(original_dim,))
h = inputs
h = layers.Dense(70, activation='relu')(h)
h = layers.Dense(30, activation='relu')(h)
z_mean = layers.Dense(latent_dim)(h)
z_log_sigma = layers.Dense(latent_dim)(h)

def sampling(args):
    z_mean, z_log_sigma = args
    epsilon = backend.random_normal(shape=(backend.shape(z_mean)[0], latent_dim),
                                    mean=0., stddev=0.1)
    return z_mean + backend.exp(z_log_sigma) * epsilon

z = layers.Lambda(sampling)([z_mean, z_log_sigma])

encoder = keras.Model(inputs, [z_mean, z_log_sigma, z], name='encoder')

### Define the decoder
latent_inputs = keras.Input(shape=(latent_dim,), name='z_sampling')
x = latent_inputs
x = layers.Dense(30, activation='relu')(x)
x = layers.Dense(70, activation='relu')(x)
x = layers.Dense(vec_length, activation='sigmoid')(x)
outputs = x
decoder = keras.Model(latent_inputs, outputs, name='decoder')

outputs = decoder(encoder(inputs)[2])
vae = keras.Model(inputs, outputs, name='vae_mlp')

### Define the loss function
reconstruction_loss = keras.losses.binary_crossentropy(inputs, outputs)
reconstruction_loss *= original_dim
kl_loss = 1 + z_log_sigma - backend.square(z_mean) - backend.exp(z_log_sigma)
kl_loss = backend.sum(kl_loss, axis=-1)
kl_loss *= -0.5
vae_loss = backend.mean(reconstruction_loss + kl_loss)
vae.add_loss(vae_loss)

Do some training

In [4]:
opt = optimizers.Adamax(learning_rate=0.004)
vae.compile(optimizer=opt, loss='binary_crossentropy')
vae.fit(train_vecs, train_vecs,
        epochs=120,
        batch_size=8,
        shuffle=True,
        validation_data=(val_vecs, val_vecs))

Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120


Epoch 78/120
Epoch 79/120
Epoch 80/120
Epoch 81/120
Epoch 82/120
Epoch 83/120
Epoch 84/120
Epoch 85/120
Epoch 86/120
Epoch 87/120
Epoch 88/120
Epoch 89/120
Epoch 90/120
Epoch 91/120
Epoch 92/120
Epoch 93/120
Epoch 94/120
Epoch 95/120
Epoch 96/120
Epoch 97/120
Epoch 98/120
Epoch 99/120
Epoch 100/120
Epoch 101/120
Epoch 102/120
Epoch 103/120
Epoch 104/120
Epoch 105/120
Epoch 106/120
Epoch 107/120
Epoch 108/120
Epoch 109/120
Epoch 110/120
Epoch 111/120
Epoch 112/120
Epoch 113/120
Epoch 114/120
Epoch 115/120
Epoch 116/120
Epoch 117/120
Epoch 118/120
Epoch 119/120
Epoch 120/120


<tensorflow.python.keras.callbacks.History at 0x7f268c547110>

Evaluation

In [5]:
decoded_regs = vae.predict(test_vecs)
print(test_strs[:10])
recovered = [v.recover(x) for x in decoded_regs]
print(recovered[:10])
acc = np.sum([x == y for x, y in zip(recovered, test_strs)]) / len(test_strs)
print('overall accuracy = ' + str(acc))
print('by digit accuracy:')
for i_digit in range(len(test_strs[0])):
    acc = np.sum([x[i_digit] == y[i_digit] for x, y in zip(recovered, test_strs)]) / len(test_strs)
    print('    ' + str(acc))

['MY33WSZ', 'WE83GXF', 'LU19JLV', 'KX41GVE', 'WP91NCV', 'SG47LFF', 'HB17WIY', 'CA89AXN', 'GT57AKA', 'CD77SJT']
['MY33WSZ', 'WE83GXF', 'LU19JLV', 'KX11GVE', 'WP91NCV', 'SG47LSF', 'HB17WIY', 'CA89AXN', 'GT57ZKA', 'CD77SJT']
overall accuracy = 0.827
by digit accuracy:
    0.986
    0.984
    0.973
    0.988
    0.934
    0.978
    0.966


Use decoder as generator

In [6]:
generated_regs = decoder.predict(np.random.random((5, latent_dim)))
print([v.recover(x) for x in generated_regs])

['RA63MDZ', 'PT13HZX', 'PA43KGX', 'PA83VWZ', 'PA34VZX']


### Avoid generating any training data

There is a chance that a random generation of data might produce one of the actual registration numbers used in training. However, any input registrations are represented as a probability distribution in the latent space; so one could require that any sample vector used in the latent space to be 'sufficiently far' down the tails of all the recorded distributions.



In [7]:
# Get the distributions for each reg number in the data
all_strs = train_strs + val_strs + test_strs
all_vecs = np.array([v.vectorize(x) for x in all_strs])
means, log_sigmas, _ = encoder.predict(all_vecs)
distn_dict = {x: (y,z) for x, y, z in zip(all_strs, means, np.exp(log_sigmas))}  # dict<reg no: (mean, sigma)>
distn_dict['YK66BIQ']
# for a multivariate distribution ~ N(mean, Var) valued in the latent space, 
# mean is the first vector below, and Var is a diagonal matrix, the square root
# of the diagonal entries being given in the second vector.
# If the KL loss function has done its job, the distributions should be close to N(0,1)

(array([ 0.2781712 ,  0.3491726 , -0.102899  , -0.27254137, -0.54578245,
         0.02597244, -0.02890204, -0.01442936,  0.24338357, -0.5948266 ,
         0.26151133,  0.45709428, -0.7044419 ,  0.5233098 ,  1.0202402 ,
         0.23642889, -0.40227908,  0.02383937,  0.7365775 ,  0.29046974],
       dtype=float32),
 array([0.75283927, 0.6725014 , 0.66999155, 0.6595008 , 0.6006482 ,
        0.647655  , 0.9895292 , 0.99312407, 0.54756856, 0.63305753,
        0.77145696, 0.6659439 , 0.55902433, 0.60124886, 0.6191432 ,
        0.5863514 , 0.6290502 , 1.0134101 , 0.55971897, 0.6589626 ],
       dtype=float32))