### Setup

In [1]:
# NOTE: import TF and other libraries

import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
import time
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import tensorflow as tf


In [2]:
# NOTE: download Shakespeare dataset

# Fetch the Shakespeare corpus; the returned value is the local path that the
# next cell opens for reading.
path_to_file = tf.keras.utils.get_file(
    "shakespeare.txt",
    "https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt",
)

In [3]:
# NOTE: read the data

# Read the corpus as raw bytes and decode explicitly as UTF-8. The context
# manager closes the file handle as soon as the read finishes instead of
# leaving it open until garbage collection.
with open(path_to_file, "rb") as corpus_file:
    text = corpus_file.read().decode(encoding="utf-8")
print(f"[DEBUG] length of text: {len(text)} characters")
print(f"{text[:250]=}")

# The vocabulary is the sorted list of distinct characters found in the corpus.
vocab = sorted(set(text))
print(f"{len(vocab)} unique characters")

[DEBUG] length of text: 1115394 characters
text[:250]='First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n'
65 unique characters


### Process the text

In [4]:
# NOTE: vectorize the text

example_texts = ["abcdefg", "xyz"]

# Split each string into its individual UTF-8 characters. The rows have
# different lengths, so the result prints as a ragged tensor (ordered, not a set).
chars = tf.strings.unicode_split(example_texts, input_encoding="UTF-8")
print(chars)

# Forward lookup: characters -> integer IDs, built from the corpus vocabulary.
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None
)
ids = ids_from_chars(chars)
print(ids)

# Inverse lookup: integer IDs -> characters. It reuses the forward layer's
# vocabulary so both directions agree on the index assignment.
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), 
    invert=True, 
    mask_token=None
)
# Round trip: `chars` is rebound to the characters recovered from `ids`.
chars = chars_from_ids(ids)
print(chars)

# token IDs back to a joined string (one string per sequence)
def text_from_ids(ids):
    """Look up the characters for `ids` and join them along the last axis."""
    looked_up_chars = chars_from_ids(ids)
    return tf.strings.reduce_join(looked_up_chars, axis=-1)

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>
<tf.RaggedTensor [[40, 41, 42, 43, 44, 45, 46], [63, 64, 65]]>
<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>


In [5]:
# NOTE: the prediction task:
# given a sequence of characters,
# what is the most probable next character?

# NOTE: create training examples and targets
# Convert the whole corpus into one long vector of token IDs.
all_ids = ids_from_chars(tf.strings.unicode_split(text, "UTF-8"))
print(f"{all_ids=}")

# Stream the IDs one at a time and preview the first ten as characters.
# NOTE(review): the loop variable `ids` rebinds the module-level `ids` from the
# vectorization cell.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
for ids in ids_dataset.take(10):
    print(chars_from_ids(ids).numpy().decode("utf-8"))

# batching
# Each chunk holds seq_length + 1 IDs so that it can later be split into an
# input and a target shifted by one position (see `split_input_target`).
seq_length = 100
# NOTE(review): `examples_per_epoch` is computed but not used in the visible cells.
examples_per_epoch = len(text) // (seq_length+1) # NOTE: training set; given sequence(seq_length), predict next character(+1)
# drop_remainder=True discards the final chunk if it is shorter than seq_length + 1.
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)
for seq in sequences.take(5):
    print(text_from_ids(seq).numpy())

all_ids=<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([19, 48, 57, ..., 46,  9,  1])>
F
i
r
s
t
 
C
i
t
i
b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
b'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
b"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
b"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
b'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


In [6]:
# NOTE: preparing dataset: (input, label)
# `input`: current character
# `label`: next character

def split_input_target(sequence):
    """Return `(input, target)` where target is the input shifted left by one.

    The input is every element except the last; the target is every element
    except the first, so `target[i]` is the element that follows `input[i]`.
    """
    return sequence[:-1], sequence[1:]
# NOTE(review): the return value of this demo call is discarded; it is neither
# printed nor the last expression of the cell, so it produces no visible output.
split_input_target(list("Tensorflow"))

# Apply the split to every (seq_length + 1)-long chunk, yielding (input, target) pairs.
dataset = sequences.map(split_input_target) # NOTE: transformation using input function
# Preview one pair: the target is the input shifted by one character.
for input_example, target_example in dataset.take(1):
    print(f"[DEBUG] Input:\t {text_from_ids(input_example).numpy()}")
    print(f"[DEBUG] Target:\t {text_from_ids(target_example).numpy()}")

[DEBUG] Input:	 b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
[DEBUG] Target:	 b'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [7]:
# NOTE: create training batches

BATCH_SIZE = 64
# NOTE: number of elements held in the shuffle buffer; output elements are drawn
# at random from this buffer rather than from the whole dataset at once.
BUFFER_SIZE = 10000

# NOTE: build the pipeline from `sequences` rather than from `dataset` itself so
# that re-running this cell does not batch already-batched data.
dataset = (
    sequences.map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)  # drop the final partial batch
    .prefetch(tf.data.AUTOTUNE)  # non-experimental name for the autotune constant
) # NOTE: pack shuffled data into batches

print(dataset)

<_PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>


In [8]:
# NOTE: build the model

# NOTE(review): this `vocab_size` counts only the corpus characters. The model
# below is constructed with len(ids_from_chars.get_vocabulary()) instead, so
# this variable is not the value the model actually uses.
vocab_size = len(vocab)
# Width of the character embedding vectors.
embedding_dim = 256
# Number of units in the GRU layer.
rnn_units = 1024

class MyModel(tf.keras.Model):
    """Character-level language model: Embedding -> GRU -> Dense.

    Args:
        vocab_size: number of token IDs; sizes both the embedding table and
            the per-position output logits.
        embedding_dim: width of the embedding vectors.
        rnn_units: number of units in the GRU layer.
    """

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        # NOTE: `super()` already binds the instance, so the base initializer
        # is called without arguments.
        super().__init__()

        # NOTE: layers
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            rnn_units,
            return_sequences=True,
            return_state=True,
        )
        # NOTE: no activation, so the model emits raw logits
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self,
             inputs,
             states=None,
             return_state=False,
             training=False):
        """Run the model on a batch of token IDs.

        Args:
            inputs: batch of token-ID sequences.
            states: GRU state to start from; when None the GRU's initial
                state is used.
            return_state: when True, also return the final GRU state.
            training: forwarded to every layer.

        Returns:
            The logits for every input position, or `(logits, states)` when
            `return_state` is True.
        """
        x = self.embedding(inputs, training=training)

        # NOTE: when the caller supplies no state, start from the GRU's initial
        # state; otherwise continue from the state that was passed in
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        return (x, states) if return_state else x
    
# Instantiate the model with the hyperparameters defined earlier in this cell.
model = MyModel(
    # NOTE: be sure the vocabulary size matches the `StringLookup` layers:
    # the size is taken from the lookup layer's vocabulary, not from len(vocab)
    vocab_size=len(ids_from_chars.get_vocabulary()), 
    embedding_dim=embedding_dim, 
    rnn_units=rnn_units
)

### Try the model

In [9]:
# NOTE: validate the model

# NOTE: 1. check the shape of the output:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(f"{example_batch_predictions.shape} # (batch_size, sequence_length, vocab_size)")
print()

# NOTE: 2. layer architecture
# `model.summary()` prints the table itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
model.summary()
print()

# NOTE: 3. sample the next-character IDs from the predicted distribution
# NOTE: always taking the argmax instead of sampling can easily leave the model stuck in a loop!
# Only the first sequence of the batch is sampled: one ID per input position.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# Drop the trailing num_samples axis so the result is a flat vector of IDs.
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()
print(f"[DEBUG] {sampled_indices=}")
print()
print(f"[DEBUG] input: \n{text_from_ids(input_example_batch[0]).numpy()}")
print(f"[DEBUG] next char predictions: \n{text_from_ids(sampled_indices).numpy()}")

(64, 100, 66) # (batch_size, sequence_length, vocab_size)

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  16896     
                                                                 
 gru (GRU)                   multiple                  3938304   
                                                                 
 dense (Dense)               multiple                  67650     
                                                                 
Total params: 4022850 (15.35 MB)
Trainable params: 4022850 (15.35 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None

[DEBUG] sampled_indices=array([36, 27, 43, 34, 37, 52, 39, 14, 61, 40, 47, 25, 37, 29, 20, 34,  0,
       42, 38,  7, 34, 61, 63, 52, 41, 36, 60,  9, 44, 64, 55,  3, 55,  2,
       17, 17, 40, 36, 36, 18,  1, 57,  6, 

### Train the model

In [10]:
# NOTE: attach an optimizer & loss function

# The model's final Dense layer has no activation, so the loss is told to
# expect raw logits.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print(f"[DEBUG] prediction shape: {example_batch_predictions.shape} # (batch_size, sequence_length, vocab_size)")
print(f"[DEBUG] mean loss: {example_batch_mean_loss}")

# A newly initialized model shouldn't be too sure of itself: the output logits
# should all have similar magnitudes. To confirm this, check that the
# exponential of the mean loss is approximately equal to the vocabulary size.
# A much higher loss means the model is sure of its wrong answers and is badly
# initialized.
print(f"{tf.exp(example_batch_mean_loss).numpy()}")

# NOTE: configure the training procedure
model.compile(optimizer="adam", loss=loss)

[DEBUG] prediction shape: (64, 100, 66) # (batch_size, sequence_length, vocab_size)
[DEBUG] mean loss: 4.191465377807617
66.1196060180664


In [11]:
# NOTE: configure checkpoints

# Directory that receives the checkpoint files.
checkpoint_dir = "./training_checkpoints"
# The `{epoch}` placeholder is left unformatted here; presumably the callback
# substitutes the epoch number when it writes each file (confirm in Keras docs).
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
# Save only the weights rather than the whole model.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix, 
    save_weights_only=True
)

In [14]:
# NOTE: execute the training

# Number of passes over the batched dataset.
EPOCHS = 30
# Train with the checkpoint callback attached; `history` keeps fit()'s return value.
history = model.fit(
    dataset, 
    epochs=EPOCHS, 
    callbacks=[checkpoint_callback]
)

Epoch 1/30


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [15]:
# NOTE: generate text

class OneStep(tf.keras.Model):
    """Wraps a trained model to generate one character per call.

    Holds the model, both lookup layers, a sampling temperature, and a mask
    that is added to the logits so that `[UNK]` is never sampled.
    """

    def __init__(self, 
                 model, 
                 chars_from_ids, 
                 ids_from_chars, 
                 temperature=1.0):
        """Store the collaborators and precompute the `[UNK]` mask.

        Args:
            model: model called with `inputs`, `states` and `return_state=True`.
            chars_from_ids: lookup layer mapping token IDs to characters.
            ids_from_chars: lookup layer mapping characters to token IDs.
            temperature: divisor applied to the logits before sampling.
                NOTE(review): used as a divisor in `generate_one_step`; a
                value of 0 is not guarded against.
        """
        
        super().__init__()
        
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars

        # NOTE: create a mask to prevent `[UNK]` from being generated
        # `[:, None]` adds a trailing axis so each ID becomes a one-element index row
        skip_ids = self.ids_from_chars(["[UNK]"])[:, None]
        sparse_mask = tf.SparseTensor(
            # NOTE: put -inf for each bad index
            values=[-float("inf")] * len(skip_ids), 
            indices=skip_ids, 
            # NOTE: match the shape to the vocabulary, though this is sparse and filtered
            dense_shape=[len(ids_from_chars.get_vocabulary())]
        )
        # Densify: -inf at the skipped IDs, 0 everywhere else
        self.prediction_mask = tf.sparse.to_dense(sparse_mask)

        return
    
    @tf.function
    def generate_one_step(self, inputs, states=None):
        """Sample the next character for each input string.

        Args:
            inputs: batch of strings to condition on.
            states: recurrent state from the previous call, or None.

        Returns:
            `(predicted_chars, states)`: one sampled character per input
            string and the state returned by the model.
        """

        # NOTE: convert strings to token IDs
        input_chars = tf.strings.unicode_split(inputs, "UTF-8")
        input_ids = self.ids_from_chars(input_chars).to_tensor()

        # NOTE: run the model
        predicted_logits, states = self.model(
            inputs=input_ids, 
            states=states, 
            return_state=True
        ) # `predicted_logits`: [batch, char, next_char_logits]

        # NOTE: only use the last prediction: the model emits logits for every
        # input position, and the last position's logits are the prediction
        # for the character that follows the whole input
        predicted_logits = predicted_logits[:, -1, :]
        predicted_logits = predicted_logits / self.temperature
        predicted_logits = predicted_logits + self.prediction_mask # this way, unwanted index becomes `-inf`

        # NOTE: sample the output logits, to generate token IDs
        predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
        # Drop the num_samples axis: one ID per batch row
        predicted_ids = tf.squeeze(predicted_ids, axis=-1)

        # NOTE: convert from token to chars
        predicted_chars = self.chars_from_ids(predicted_ids)

        return predicted_chars, states
# Wrap the trained model and both lookup layers; temperature keeps its default.
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

# NOTE: run this in a loop to generate some text
start = time.time()
# No recurrent state yet: the first call passes states=None to the model.
states = None
# The seed prompt is also the first piece of the output.
next_char = tf.constant(["ROMEO:"])
result = [next_char]

# Each step feeds the previously sampled character and the carried state back in.
for n in range(1000):
    next_char, states = one_step_model.generate_one_step(
        next_char, states=states
    )
    result.append(next_char)

# Concatenate the prompt and all sampled characters element-wise into one string.
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode("utf-8"), "\n\n" + "_" * 80)
print()
print(f"[DEBUG] run time: {end - start}")

ROMEO:O:
Why foolish being now might you ade-hard for frost?

DUKE VINCENTIO:
I'll vantage you have gold.

BIONDELLO:
O heavens! O heavens! O house! have made mine heart
Of what is well deserved to have light encounter;
Chather Geothest things of me.

GLOUCESTER:
Why, what take thou overth, Pruntium queen,
A centage of his father;
And whatsoever you will remember what with kings
And harsh had heards me to to pass.

POLIXENES:
I cannot thou mintless sky,
Far beheld the other of a woman:
And Bolingbroke forget to fawn upon thy
fault, nor never was care so far off
And rack many hours my tongue nor drinking fair.
Farewell: for 'tis worse!

SICINIUS:
O poison to submising land;
To silk, any methought I thank you farth.

KING EDWARD IV:
Now take the son, the fortune spirit is in thine own motion.
Where, for an ass come to keep the body that would not have?

KING HENRY VI:
Can I can holy heart, my son Edward not with hum.
To have her honour to the king, and who hooks
Mercilither me to lose hi

In [16]:
# NOTE: batched text generation

# NOTE(review): same loop as the single-prompt cell, with a batch of five
# identical prompts; a shared helper taking the prompts would remove the copy.
start = time.time()
states = None
next_char = tf.constant(["ROMEO:", "ROMEO:", "ROMEO:", "ROMEO:", "ROMEO:"])
result = [next_char]

# One call per step generates the next character for all five prompts at once.
for n in range(1000):
    next_char, states = one_step_model.generate_one_step(
        next_char, states=states
    )
    result.append(next_char)

result = tf.strings.join(result)
end = time.time()
# Only the first of the five generated texts is printed.
print(result[0].numpy().decode("utf-8"), "\n\n" + "_" * 80)
print()
print(f"[DEBUG] batched run time: {end - start}")

ROMEO:umbrace did
Against me of your bearding. Mine uncountry's face,
Whose mighty mowhat, even your honour,
Out of her wonderful. Whose lady and her's dost mother?
I'll keep me from this born, forget thee from force.
Nay, no more deep war, what he should we din,
And make a posses of that word, we must to thee?

COMINIUS:
Carbas thee well.

BIANCA:
Why, no; he, now to thine own.

CLAUDIO:
'Tis wrong to tender for what thou hast?

STANLEY:
Unfrain the war, who crown'd with knighting Bolingbroke
As Valpids that stabber happiness that you make
Your torgue with false foe: I am going
Altht home. But what?

BRUTUS:
O heavens!
O woe for valiant, great son and happy mothers!
Farewell the rumbless where no mortal compalase,
And by the king's shapen made a general lamentatio;
for left no pardly with thy womb hath hed us,
In fair proud quarrel, to quistisure grows,
And we allow their sovereign pland.
Dare-tell me, lack Warwick, I am too study
To say but harm look' to a king,
Kate, softend-harding

In [17]:
# NOTE: export the generator

# Export the generator to the "one_step" directory, then load it back from disk.
tf.saved_model.save(one_step_model, "one_step")
one_step_reloaded = tf.saved_model.load("one_step")

# validation
# Run the reloaded object through a shorter version of the generation loop to
# check that `generate_one_step` is still callable after the round trip.
states = None
next_char = tf.constant(["ROMEO:"])
result = [next_char]

for n in range(100):
    next_char, states = one_step_reloaded.generate_one_step(
        next_char, states=states
    )
    result.append(next_char)

print(tf.strings.join(result)[0].numpy().decode("utf-8"))

INFO:tensorflow:Assets written to: one_step/assets


INFO:tensorflow:Assets written to: one_step/assets


ROMEO:isteman.

CAPULET:
And Poase this tome; I will soon abused good.
I'll quench the party of your love.
