# First part: training the model based on the real data
Getting the data would require spark, but let's skip that for now
Instead, let's start off by generating a really simple example dataset.
4 digit numbers, converted to string format; perhaps 1 million rows.

In [37]:
import numpy as np
from random import randint
from str_to_int import StrToInt
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda, LSTM, Embedding, Flatten, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K

In [16]:
test_data = [str(randint(1000, 9999)) for _ in range(1000000)] # 'collect' one million samples

Assuming we don't know the strings are numbers and can't just straight cast them to int, use ascii ord to map to int

In [26]:
encoded_data = [[ord(char) for char in row] for row in test_data]

In [33]:
decoded_data = [''.join(chr(num) for num in row) for row in encoded_data]

In [34]:
encoded_data = np.array(encoded_data)

In [35]:
encoded_data = tf.keras.preprocessing.sequence.pad_sequences(encoded_data, padding='post')

 Good, now we can use this to train our VAE Model

In [36]:
def sampling(args):
    """Reparameterization trick by sampling from an isotropic unit Gaussian."""
    z_mean, z_log_var = args
    batch = K.shape(z_mean)[0]
    dim = K.int_shape(z_mean)[1]
    epsilon = K.random_normal(shape=(batch, dim))
    return z_mean + K.exp(0.5 * z_log_var) * epsilon

In [38]:
# Network parameters
input_shape = (encoded_data.shape[1],)  # encoded_data.shape[1] is the sequence length after padding
intermediate_dim = 64
latent_dim = 2

In [40]:
# Build the encoder
inputs = Input(shape=input_shape, name='encoder_input')
x = Embedding(input_dim=256, output_dim=8, input_length=input_shape[0])(inputs)  # Embedding for ASCII range
x = Flatten()(x)  # Flatten the embedded output
x = Dense(intermediate_dim, activation='relu')(x)
z_mean = Dense(latent_dim, name='z_mean')(x)
z_log_var = Dense(latent_dim, name='z_log_var')(x)
z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])

encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')
encoder.summary()

Model: "encoder"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_input (InputLayer)  [(None, 4)]                  0         []                            
                                                                                                  
 embedding_1 (Embedding)     (None, 4, 8)                 2048      ['encoder_input[0][0]']       
                                                                                                  
 flatten_1 (Flatten)         (None, 32)                   0         ['embedding_1[0][0]']         
                                                                                                  
 dense_1 (Dense)             (None, 64)                   2112      ['flatten_1[0][0]']           
                                                                                            

In [41]:
# Build the decoder
latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
x = Dense(intermediate_dim, activation='relu')(latent_inputs)
x = Dense(input_shape[0] * 8, activation='relu')(x)  # Adjusted size for the decoder
x = Reshape((input_shape[0], 8))(x)
outputs = Dense(256, activation='softmax')(x)  # Output layer size of 256 to cover the full ASCII range

decoder = Model(latent_inputs, outputs, name='decoder')
decoder.summary()

Model: "decoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 z_sampling (InputLayer)     [(None, 2)]               0         
                                                                 
 dense_2 (Dense)             (None, 64)                192       
                                                                 
 dense_3 (Dense)             (None, 32)                2080      
                                                                 
 reshape (Reshape)           (None, 4, 8)              0         
                                                                 
 dense_4 (Dense)             (None, 4, 256)            2304      
                                                                 
Total params: 4576 (17.88 KB)
Trainable params: 4576 (17.88 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [42]:
# VAE model
outputs = decoder(encoder(inputs)[2])
vae = Model(inputs, outputs, name='vae')

In [43]:
# Loss function for VAE
reconstruction_loss = tf.keras.losses.sparse_categorical_crossentropy(inputs, outputs)
reconstruction_loss *= input_shape[0]
kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
kl_loss = K.sum(kl_loss, axis=-1)
kl_loss *= -0.5
vae_loss = K.mean(reconstruction_loss + kl_loss)
vae.add_loss(vae_loss)
vae.compile(optimizer='adam')

Train the VAE

In [44]:
vae.fit(encoded_data, epochs=50, batch_size=32)

Epoch 1/50


ValueError: in user code:

    File "/home/developer/.local/lib/python3.8/site-packages/keras/src/engine/training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "/home/developer/.local/lib/python3.8/site-packages/keras/src/engine/training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/developer/.local/lib/python3.8/site-packages/keras/src/engine/training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "/home/developer/.local/lib/python3.8/site-packages/keras/src/engine/training.py", line 1080, in train_step
        y_pred = self(x, training=True)
    File "/home/developer/.local/lib/python3.8/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None

    ValueError: Exception encountered when calling layer 'tf.__operators__.add_1' (type TFOpLambda).
    
    Dimensions must be equal, but are 4 and 32 for '{{node vae/tf.__operators__.add_1/AddV2}} = AddV2[T=DT_FLOAT](vae/tf.math.multiply/Mul, vae/tf.math.multiply_1/Mul)' with input shapes: [32,4], [32].
    
    Call arguments received by layer 'tf.__operators__.add_1' (type TFOpLambda):
      • x=tf.Tensor(shape=(32, 4), dtype=float32)
      • y=tf.Tensor(shape=(32,), dtype=float32)
      • name=None


# Second part: generating the synthetic data based on the model

In [None]:
# Generate new data
z_sample = np.random.normal(size=(1, latent_dim))
generated_sequences = decoder.predict(z_sample)

# Convert probabilities to discrete ASCII indices
generated_sequences = np.argmax(generated_sequences, axis=-1)

# Decode the generated ASCII indices back to characters
decoded_data = [''.join(chr(num) for num in row) for row in generated_sequences]
print(decoded_data)