The following notebook generates "startup names" by training a generative
adversarial network (GAN) on the company names in a dataset of public companies.

Sources:
- [TextKD-GAN: Text Generation using KnowledgeDistillation and Generative Adversarial Networks](https://arxiv.org/abs/1905.01976)
- [Improved Training of Wasserstein GANs](https://arxiv.org/abs/1704.00028v1)
- ["NASDAQ and NYSE stocks histories" on Kaggle](https://www.kaggle.com/qks1lver/nasdaq-and-nyse-stocks-histories?select=NASDAQ.txt)

In [None]:
!pip install pandas numpy tensorflow tensorflow-text

In [None]:
import pandas

# Load the tab-separated stock listing into a dataframe. Every column is read
# as a plain string, and the default NaN markers are switched off so that
# cells are kept exactly as they appear in the file.
df = pandas.read_csv(
    './stocks.txt',
    sep='\t',
    dtype=str,
    keep_default_na=False,
)

In [None]:
import time
import tensorflow as tf
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

# The location for the new vocabulary. The current Unix time (in whole
# seconds) is part of the file name, so runs started at different times
# write to different files.
vocab_location = 'vocab-' + str(int(time.time())) + '.txt'

# Reserved tokens to include in the vocabulary. The "[padding]" token is used
# later to pad short names to be the same width.
reserved_tokens = ['[padding]']

# The number of vocabulary tokens to generate. This is the `vocab_size`
# passed to `bert_vocab_from_dataset`.
# NOTE(review): presumably a target rather than an exact count, since the
# model cell later re-reads the size from len(vocab) — confirm.
vocab_size = 300

def get_vocab():
    # Learn a word-piece vocabulary from the 'Company' column of `df`.
    # The requested size and the reserved tokens come from the module-level
    # settings; the tokenizer used while learning keeps its default options.
    company_names = tf.data.Dataset.from_tensor_slices(df['Company'])
    learner_options = dict(
        vocab_size=vocab_size,
        reserved_tokens=reserved_tokens,
        bert_tokenizer_params={},
    )
    return bert_vocab.bert_vocab_from_dataset(company_names, **learner_options)

# Load the vocabulary and write it to a file, one token per line. The file is
# opened as UTF-8 explicitly so that tokens containing non-ASCII characters
# are written the same way on every platform, instead of depending on the
# locale's default encoding.
vocab = get_vocab()
with open(vocab_location, 'w', encoding='utf-8') as vocab_file:
    for token in vocab:
        print(token, file=vocab_file)

In [None]:
import tensorflow_text as text
from tensorflow import keras
from tensorflow.keras import layers, losses

# The number of tokens expected for each name. Longer names will be
# truncated, and shorter ones will be padded.
input_shape = 20

# The index of the padding token in the vocabulary.
# NOTE(review): assumes "[padding]" is the first entry of the vocabulary
# file — confirm that reserved tokens are always written first.
padding_token = 0

# Word-piece tokenizer backed by the vocabulary file at `vocab_location`. It
# is used both to tokenize real names and to detokenize generated token ids.
string_lookup = text.BertTokenizer(vocab_location)

def encode_name(name):
    # Convert a name into a fixed-size one-hot matrix of shape
    # (input_shape, len(vocab)).
    #
    # The name is tokenized into word pieces, the per-word pieces are merged
    # into one flat sequence of token ids, and that sequence is padded with
    # `padding_token` and then cut so it is exactly `input_shape` ids long.
    token_ids = string_lookup.tokenize(name).merge_dims(-2, -1)[0]

    # Pad on the right only; a sequence that is already long enough gets no
    # padding and is truncated by the slice below.
    missing = max(0, input_shape - len(token_ids))
    padded = tf.pad(token_ids, [[0, missing]], constant_values=padding_token)
    fixed_width = tf.slice(padded, [0], [input_shape])

    # The one-hot depth follows the vocabulary that was actually learned, which
    # is also what the models are sized with, rather than a hard-coded 300.
    return tf.one_hot(fixed_width, len(vocab))

def without_short_words(name):
    # Keep only the space-separated words that are at least four characters
    # long, and join the survivors back together with single spaces.
    kept = []
    for word in name.split(' '):
        if len(word) < 4:
            continue
        kept.append(word)
    return ' '.join(kept)

def get_names_encoded():
    # Encode every entry of the 'Company' column: short words are removed
    # first, then the remaining text is turned into a one-hot matrix. The
    # per-name matrices are combined into a single tensor.
    encoded = []
    for company in df['Company']:
        trimmed = without_short_words(company)
        encoded.append(encode_name(trimmed))
    return tf.convert_to_tensor(encoded)

# Load the names as one-hot vectors: one fixed-width one-hot matrix per entry
# of df['Company'], combined into a single tensor. This is the "real" data
# used for training.
one_hot_names = get_names_encoded()

In [None]:
# Size the models from the vocabulary that was actually generated, replacing
# the requested size that was set before the vocabulary was learned.
vocab_size = len(vocab)

# The number of filters in each convolutional layer; also the channel width
# that the residual blocks take in and put out.
conv_dim = 100

# The shape of a single noise vector fed to the generator.
generator_noise_shape = (100,)

def get_autoencoder():
    # Build the autoencoder that is meant to soften real inputs to the
    # discriminator. See "TextKD-GAN: Text Generation using
    # KnowledgeDistillation and Generative Adversarial Networks."
    #
    # The one-hot matrix is flattened, squeezed through a 4-unit bottleneck,
    # expanded back to its original size, and turned into a probability
    # distribution over the vocabulary at each token position.
    name_shape = (input_shape, vocab_size)
    flat_width = input_shape * vocab_size
    stack = [
        keras.Input(name_shape, name="autoencoder_input"),
        layers.Flatten(name="autoencoder_1"),
        layers.Dense(4, name="autoencoder_2"),
        layers.Dense(flat_width, name="autoencoder_3"),
        layers.Reshape(name_shape, name="autoencoder_4"),
        layers.Softmax(name="autoencoder_5"),
    ]
    return keras.Sequential(stack)

def res_block():
    # Build one residual block as a standalone model mapping
    # (input_shape, conv_dim) to (input_shape, conv_dim).
    #
    # Two width-5 convolutions are applied in sequence. Both use "same"
    # padding so the sequence length is preserved, which is what allows the
    # result to be added back onto the block's input at the end. The
    # convolution branch is scaled by 0.3 before the addition.
    block_input = keras.Input((input_shape, conv_dim))
    branch = block_input
    for _ in range(2):
        branch = layers.Conv1D(
            conv_dim, 5, activation='relu', padding='same'
        )(branch)
    block_output = block_input + branch * 0.3
    return keras.Model(inputs=block_input, outputs=block_output)

def get_discriminator():
    # Build the discriminator, which maps a name of size
    # (input_shape, vocab_size) to a single score of size (1).
    #
    # A width-1 convolution first projects the vocabulary dimension down to
    # conv_dim channels so the residual blocks can be applied. The final
    # dense layer has no activation, so the score is unbounded.
    stack = [
        keras.Input((input_shape, vocab_size), name="discriminator_input"),
        layers.Conv1D(conv_dim, 1, activation='relu'),
    ]
    stack.extend(res_block() for _ in range(5))
    stack.append(layers.Flatten())
    stack.append(layers.Dense(1, name="discriminator_4"))
    return keras.Sequential(stack)

def get_generator():
    # Build the generator, which maps a noise vector of size
    # (generator_noise_shape) to a name of size (input_shape, vocab_size).
    #
    # The noise is expanded by a dense layer and reshaped into a sequence of
    # conv_dim-wide positions, refined by the residual blocks, projected to
    # the vocabulary size by a width-1 convolution, and finally normalized
    # into a distribution over tokens at each position.
    stack = [
        keras.Input(generator_noise_shape, name="generator_input"),
        layers.Dense(input_shape * conv_dim),
        layers.Reshape((input_shape, conv_dim)),
    ]
    stack.extend(res_block() for _ in range(5))
    stack.append(layers.Conv1D(vocab_size, 1, activation='relu'))
    stack.append(layers.Softmax(name="generator_6"))
    return keras.Sequential(stack)

In [None]:
import statistics
import os
import random
from datetime import timedelta

# The number of outer training iterations to run. Each iteration trains the
# autoencoder first and then the discriminator and generator.
training_iterations = 5

# The number of epochs to run for the autoencoder
# within each iteration.
autoencoder_epochs = 10

# The number of epochs (full passes over the batched real names) to run for
# the discriminator and generator within each iteration.
discriminator_training_iterations = 500

# The batch size to use when training the discriminator
# and generator. The noise batch fed to the generator uses the same size.
discriminator_batch_size = 512

# The shuffle buffer size to use when training
# the discriminator and generator, which can be set
# to the number of real names to ensure a complete shuffle.
discriminator_shuffle_buffer_size = len(one_hot_names)

# The coefficient to use in the gradient penalty for
# enforcing the Lipschitz constraint. See
# "Improved Training of Wasserstein GANs."
lipschitz_lambda = 10

# Build fresh, untrained instances of the three models.
autoencoder = get_autoencoder()
discriminator = get_discriminator()
generator = get_generator()

# The real names as a dataset that is reshuffled on every pass and split into
# batches. The incomplete final batch is dropped so that every batch has
# exactly `discriminator_batch_size` rows, matching the fixed-size noise and
# interpolation tensors created in the training loop.
one_hot_named_batched_shuffled = tf.data.Dataset \
    .from_tensor_slices(one_hot_names) \
    .shuffle(discriminator_shuffle_buffer_size, reshuffle_each_iteration=True) \
    .batch(discriminator_batch_size, drop_remainder=True)

# NOTE(review): `cross_entropy` is not referenced anywhere in the visible
# code; the training loop uses the Wasserstein loss instead.
cross_entropy = tf.keras.losses.BinaryCrossentropy()

# Separate Adam optimizers for the generator and the discriminator, both with
# the same learning rate and beta values.
generator_optimizer = tf.keras.optimizers.Adam(
    learning_rate=1e-4, 
    beta_1=0.5, 
    beta_2=0.9
)
discriminator_optimizer = tf.keras.optimizers.Adam(
    1e-4, 
    beta_1=0.5, 
    beta_2=0.9
)

# The autoencoder is trained with Keras's built-in loop, minimizing the mean
# squared error between its input and its reconstruction.
autoencoder.compile(optimizer='adam', loss=losses.MeanSquaredError())

# Wall-clock start of training, used for the progress estimate printed after
# every discriminator/generator epoch.
start_time = time.time()

for i in range(training_iterations):
    print('Iteration {}/{}...'.format(i + 1, training_iterations))

    # Train the autoencoder using Keras's `fit` method, reconstructing the
    # one-hot names from themselves.
    # NOTE(review): the autoencoder is trained here, but its output is not
    # passed to the discriminator anywhere in this loop — confirm whether the
    # real batch was meant to be softened by it first.
    autoencoder_history = autoencoder.fit(
        one_hot_names,
        one_hot_names,
        epochs=autoencoder_epochs,
        verbose=0,
        shuffle=True,
        validation_split=0.2
    )

    # Summarize the per-epoch losses of this autoencoder run.
    print(
        '  Encode:\n    training loss: mean {}, stdev {}\n    validation loss: mean {}, stdev {}'.format(
            statistics.mean(autoencoder_history.history['loss']),
            statistics.stdev(autoencoder_history.history['loss']),
            statistics.mean(autoencoder_history.history['val_loss']),
            statistics.stdev(autoencoder_history.history['val_loss'])
        )
    )

    for j in range(discriminator_training_iterations):
        for discriminator_batch in one_hot_named_batched_shuffled:
            noise = tf.random.normal(
                [discriminator_batch_size] + list(generator_noise_shape)
            )

            # Compute gradients for the generator and discriminator for this
            # batch. Three tapes are needed: one per model, plus one for the
            # gradient of the discriminator with respect to its input, which
            # the gradient penalty is built from. That inner gradient is taken
            # inside the `with` block so `disc_tape` can differentiate
            # through it.
            with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape, tf.GradientTape() as lipschitz_tape:
                generated = generator(noise, training=True)

                real_discriminated = discriminator(discriminator_batch, training=True)
                generated_discriminated = discriminator(generated, training=True)

                # Compute the Wasserstein loss. See "Improved Training of
                # Wasserstein GANs."
                generator_loss = -tf.reduce_mean(generated_discriminated)
                discriminator_loss_initial = tf.reduce_mean(generated_discriminated) - tf.reduce_mean(real_discriminated)

                # Compute the gradient penalty to enforce the Lipschitz
                # constraint. See "Improved Training of Wasserstein GANs."
                # One random point is drawn per batch row on the straight
                # line BETWEEN the generated sample (alpha = 0) and the real
                # sample (alpha = 1).
                lipschitz_line_alpha = tf.random.uniform(
                    shape=[discriminator_batch_size, 1, 1],
                    minval=0,
                    maxval=1
                )
                lipschitz_line = generated + lipschitz_line_alpha * (discriminator_batch - generated)
                lipschitz_discriminated = discriminator(
                    lipschitz_line, training=True
                )
                gradients_of_lipschitz = lipschitz_tape.gradient(
                    lipschitz_discriminated, [lipschitz_line]
                )[0]

                # Per-sample L2 norm of the input gradient, penalized for
                # deviating from 1.
                lipschitz_slopes_by_batch = tf.math.sqrt(
                    tf.math.reduce_sum(
                        tf.math.square(gradients_of_lipschitz),
                        axis=[-2, -1]
                    )
                )
                lipschitz_mean = tf.math.reduce_mean(
                    tf.math.square(lipschitz_slopes_by_batch - 1)
                )

                discriminator_loss = discriminator_loss_initial + lipschitz_lambda * lipschitz_mean

            gradients_of_generator = gen_tape.gradient(
                generator_loss,
                generator.trainable_variables
            )
            gradients_of_discriminator = disc_tape.gradient(
                discriminator_loss,
                discriminator.trainable_variables
            )

            # Apply the gradients to the generator and discriminator.
            generator_optimizer.apply_gradients(zip(
                gradients_of_generator,
                generator.trainable_variables
            ))
            discriminator_optimizer.apply_gradients(zip(
                gradients_of_discriminator,
                discriminator.trainable_variables
            ))

        # Estimate the remaining time from the average duration of the
        # discriminator/generator epochs completed so far, across all outer
        # iterations.
        elapsed = time.time() - start_time
        estimated_total = elapsed / (j + 1 + i * discriminator_training_iterations) * discriminator_training_iterations * training_iterations
        estimated_remaining = estimated_total - elapsed

        # The losses printed are those of the last batch of the epoch.
        print(
            '  Discriminator: Epoch {}/{}. Generator loss {}. Discriminator loss {}. Total time {}, estimated {} remaining.'.format(
                j + 1,
                discriminator_training_iterations,
                generator_loss.numpy(),
                discriminator_loss.numpy(),
                str(timedelta(seconds=elapsed)).split('.')[0],
                str(timedelta(seconds=estimated_remaining)).split('.')[0]
            )
        )

def save_models():
    # Save the three models side by side in a new directory under "models",
    # named after the current Unix time in whole seconds.
    suffix = str(int(time.time()))
    os.makedirs('models/' + suffix)
    targets = (
        ('/autoencoder', autoencoder),
        ('/generator', generator),
        ('/discriminator', discriminator),
    )
    for leaf, model in targets:
        model.save('./models/' + suffix + leaf)

# Persist the autoencoder, generator and discriminator once all training
# iterations have finished.
save_models()

print('Done training.')

In [None]:
# The number of names to generate. One generated name and one randomly
# chosen real name are printed per iteration.
random_iterations = 5

# How many names are generated for each one
# shown. Setting this to a value higher than 1
# will lead to the best name (as scored by the
# discriminator) being shown from each group.
compared_generated_names = 1

def simplify_name(name):
    # Clean up a decoded name for display.
    #
    # First strip every reserved token (such as "[padding]") out of the
    # string.
    cleaned = name
    for reserved in reserved_tokens:
        cleaned = cleaned.replace(reserved, '')

    # Then drop the space-separated words with fewer than three characters
    # and re-join the rest with single spaces.
    long_enough = filter(lambda word: len(word) >= 3, cleaned.split(' '))
    return ' '.join(long_enough)

def decode_names(encoded_names):
    # Turn a batch of (soft) one-hot names back into display strings.
    #
    # The most likely token is picked at every position, the token ids are
    # detokenized into words, and the words of each name are joined with
    # spaces before being cleaned up by `simplify_name`.
    token_ids = tf.math.argmax(encoded_names, axis=-1)
    words = string_lookup.detokenize(token_ids)
    sentences = tf.strings.reduce_join(words, axis=-1, separator=' ')

    decoded = []
    for sentence in sentences:
        raw_text = sentence.numpy().decode('utf-8')
        decoded.append(simplify_name(raw_text))
    return decoded

def is_valid_name(encoded_name):
    # A decoded name that still contains "##" means that a suffix piece from
    # the BERT vocabulary ended up at the beginning of a word, so such names
    # are rejected.
    decoded = decode_names([encoded_name])[0]
    return decoded.find('##') == -1

for i in range(random_iterations):
    # Generate and discriminate new names.
    generated_options = generator(
        tf.random.normal([compared_generated_names] + list(generator_noise_shape))
    )
    discriminated_options = discriminator(generated_options)

    # Determine the best generated name, if more than one
    # were generated. The discriminator score is unbounded and may be
    # negative for every candidate, so the running best starts at negative
    # infinity rather than 0. If no candidate is valid, the first one is
    # shown as a fallback.
    best_generated_index = 0
    best_discriminated = float('-inf')
    for option_index, score in enumerate(discriminated_options):
        if score > best_discriminated and is_valid_name(generated_options[option_index]):
            best_discriminated = score
            best_generated_index = option_index

    generated_choice = [generated_options[best_generated_index]]
    discriminated_choice = [discriminated_options[best_generated_index]]
    inverted_join = decode_names(generated_choice)

    # Randomly choose a real name and score it
    # using the discriminator.
    element_real = [random.choice(one_hot_names)]
    discriminated_real = discriminator(tf.convert_to_tensor(element_real))
    inverted_join_real = decode_names(element_real)

    # Print the generated and real names along with their scores, as
    # tab-separated columns: generated name, its score, real name, its score.
    print('\n'.join([
        word + '\t' + str(confidence.numpy()) + '\t\t' + real_word + '\t' + str(real_confidence.numpy())
        for word, confidence, real_word, real_confidence
        in zip(
            inverted_join,
            discriminated_choice,
            inverted_join_real,
            discriminated_real
        )
    ]))