# Imports

In [2]:
import tensorflow as tf

import numpy as np
import os
import time

# Load data

In [3]:
# Read the corpus as raw bytes and decode it as UTF-8 (form kept for py2 compat).
# Binary mode leaves line endings untouched, and the context manager closes the
# file handle as soon as the read has finished.
with open('shakespeare_train.txt', 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

# The unique characters in the file
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')

68 unique characters


# Process the text
This section defines helpers that convert strings to a numerical representation and back. It also splits the text into fixed-length chunks that serve as training examples.

Create a layer that maps characters to IDs. The input to the layer is the output of `tf.strings.unicode_split`.

In [4]:
# Lookup layer that maps each character string to an integer ID.
# Its vocabulary ends up one entry larger than `vocab` (69 vs. 68 in this
# notebook's outputs) because it also contains an `[UNK]` token.
ids_from_chars = tf.keras.layers.StringLookup(vocabulary=list(vocab), mask_token=None)

Create the inverse layer, plus a helper function, to map IDs back to characters.

In [5]:
# Inverse lookup (ID -> character). It is built from the vocabulary reported by
# `ids_from_chars` so both layers use the same list of tokens.
chars_from_ids = tf.keras.layers.StringLookup(vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

In [6]:
def text_from_ids(ids):
    """Turn a tensor of character IDs back into text.

    The IDs are mapped to characters with `chars_from_ids`, and the characters
    are then concatenated along the last axis into a single string tensor.
    """
    chars = chars_from_ids(ids)
    return tf.strings.reduce_join(chars, axis=-1)

Divide training data into chunks.

In [7]:
# Split the whole corpus into single characters and map each one to its ID.
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
# Dataset that yields one character ID per element.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
# Sanity check: decode and print the first 10 characters of the corpus.
for ids in ids_dataset.take(10):
    print(chars_from_ids(ids).numpy().decode('utf-8'))

F
i
r
s
t
 
C
i
t
i


In [8]:
# Number of characters in each model input.
seq_length = 100
# Number of complete (seq_length + 1)-character chunks that fit in the corpus.
examples_per_epoch = len(text)//(seq_length+1)
# Chunks are one character longer than `seq_length` so that each can later be
# split into an input and a target that is offset by one position.
# drop_remainder=True discards the final, shorter chunk.
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

In [11]:
# Actual examples of chunks of text
# Show the dataset's repr, then decode the first five chunks back to bytes.
print(sequences)
for seq in sequences.take(5):
    print(text_from_ids(seq).numpy())

<BatchDataset shapes: (101,), types: tf.int64>
b'First Citizen:\r\nBefore we proceed any further, hear me speak.\r\n\r\nAll:\r\nSpeak, speak.\r\n\r\nFirst Citizen'
b':\r\nYou are all resolved rather to die than to famish?\r\n\r\nAll:\r\nResolved. resolved.\r\n\r\nFirst Citizen:\r'
b"\nFirst, you know Caius Marcius is chief enemy to the people.\r\n\r\nAll:\r\nWe know't, we know't.\r\n\r\nFirst "
b"Citizen:\r\nLet us kill him, and we'll have corn at our own price.\r\nIs't a verdict?\r\n\r\nAll:\r\nNo more ta"
b"lking on't; let it be done: away, away!\r\n\r\nSecond Citizen:\r\nOne word, good citizens.\r\n\r\nFirst Citizen"


### Create training pairs
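Define a function that turns each chunk into an (input, target) pair, where the target is the input shifted forward by one character, so every input character is paired with the character that follows it.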

In [12]:
def split_input_target(sequence):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is every element except the last, and the target is every
    element except the first, so `target[i]` is the element that follows
    `input[i]` in the original sequence.
    """
    return sequence[:-1], sequence[1:]

In [13]:
# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)
# Decode the first pair to confirm the target is the input shifted by one character.
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

Input : b'First Citizen:\r\nBefore we proceed any further, hear me speak.\r\n\r\nAll:\r\nSpeak, speak.\r\n\r\nFirst Citize'
Target: b'irst Citizen:\r\nBefore we proceed any further, hear me speak.\r\n\r\nAll:\r\nSpeak, speak.\r\n\r\nFirst Citizen'


### Create training batches

In [14]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the pipeline from `sequences` rather than re-assigning `dataset` from
# itself, so that re-running this cell does not shuffle and batch data that
# has already been batched.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    # `tf.data.AUTOTUNE` is the non-experimental name of the same constant.
    .prefetch(tf.data.AUTOTUNE))

dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

# Define model

In [15]:
# Length of the vocabulary used by the `StringLookup` layers.
# This is taken from the layer instead of `len(vocab)` because the layer's
# vocabulary also contains the `[UNK]` token, so it is one entry larger than
# the set of characters found in the text.
vocab_size = len(ids_from_chars.get_vocabulary())

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [16]:
class MyModel(tf.keras.Model):
    """Character-level next-character model: Embedding -> SimpleRNN -> Dense.

    The final Dense layer has no activation, so the model returns raw logits
    over the vocabulary for every position of the input.
    """

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        """Create the three layers.

        Args:
            vocab_size: Number of distinct token IDs; used as the embedding's
                input size and the number of output logits.
            embedding_dim: Size of each embedding vector.
            rnn_units: Number of units in the recurrent layer.
        """
        # `super()` already binds the instance, so the base initializer is
        # called with no positional arguments.
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # NOTE: the attribute is called `gru` but the layer is a SimpleRNN.
        # The name is kept so existing references to `model.gru` keep working.
        self.gru = tf.keras.layers.SimpleRNN(rnn_units,
                                             return_sequences=True,
                                             return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Compute next-character logits for every position of `inputs`.

        Args:
            inputs: Batch of token-ID sequences.
            states: Initial recurrent state. When None, the recurrent layer's
                own initial state is used.
            return_state: When True, also return the final recurrent state.
            training: Forwarded to every layer.

        Returns:
            The logits, or a `(logits, states)` tuple when `return_state` is
            True.
        """
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        # The recurrent layer returns the per-step outputs and its final state.
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        else:
            return x

In [17]:
# Instantiate the model with the hyperparameters defined above.
model = MyModel(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [18]:
# Run the untrained model on one batch to check the shape of its output.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 69) # (batch_size, sequence_length, vocab_size)


In [19]:
# Print the layers and their parameter counts.
model.summary()

Model: "my_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  17664     
_________________________________________________________________
simple_rnn (SimpleRNN)       multiple                  1311744   
_________________________________________________________________
dense (Dense)                multiple                  70725     
Total params: 1,400,133
Trainable params: 1,400,133
Non-trainable params: 0
_________________________________________________________________


In [20]:
# For the first example in the batch, sample one character ID per time step
# from the distribution defined by that step's logits.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# Drop the trailing num_samples axis and convert to a NumPy array.
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()
sampled_indices

array([60,  8, 14, 38, 67, 61, 67, 60, 13, 15, 21, 41, 33,  8, 45,  0, 15,
       62, 46, 26, 27, 27, 48, 38, 47,  3,  8,  6, 15, 29, 61, 28,  7, 64,
        8, 35, 57, 24, 58, 12, 62,  3, 55, 23, 13, 55, 14,  6, 23, 65, 33,
       51, 61, 43, 49, 10, 10, 39, 64, 37, 10, 67, 14, 33, 65, 49, 19, 58,
       35, 35, 13,  8, 20, 60, 67, 14, 12, 18, 49,  6, 42, 11, 30,  0, 18,
       27, 60, 11, 16, 25, 47, 16, 15, 20, 24,  2, 28, 29, 61, 47],
      dtype=int64)

In [21]:
# Decode the first input sequence and the IDs sampled from the untrained model.
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())

Input:
 b'e glorious gods sit in hourly synod about thy\r\nparticular prosperity, and love thee no worse than\r\nt'

Next Char Predictions:
 b"r,?Xysyr;AG[S,c[UNK]AtdLMMfXe ,&AOsN'v,UoJp:t mI;m?&IwSisag..YvW.y?SwgEpUU;,Fry?:Dg&]3P[UNK]DMr3BKeBAFJ\rNOse"


# Train model

In [22]:
# from_logits=True because the model's final Dense layer has no activation,
# so its outputs are raw logits rather than probabilities.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss)

In [23]:
# Number of passes over the training dataset.
EPOCHS = 20

In [None]:
# Train the model; `history` keeps the object returned by `fit`.
history = model.fit(dataset, epochs=EPOCHS)

Epoch 1/20
Epoch 2/20
Epoch 3/20

In [55]:
class OneStep(tf.keras.Model):
    """Wraps a trained model so it can sample one character per call."""

    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        """Store the model and lookup layers and build the `[UNK]` mask.

        Args:
            model: Model called with `inputs`, `states` and `return_state`.
            chars_from_ids: Layer mapping token IDs to characters.
            ids_from_chars: Layer mapping characters to token IDs.
            temperature: Divisor applied to the logits before sampling.
        """
        super().__init__()
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars

        # Additive mask over the vocabulary: -inf at the "[UNK]" position and
        # zero everywhere else, so "[UNK]" is never sampled.
        unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
        vocab_len = len(ids_from_chars.get_vocabulary())
        unk_mask = tf.SparseTensor(
            indices=unk_ids,
            values=[-float('inf')] * len(unk_ids),
            dense_shape=[vocab_len])
        self.prediction_mask = tf.sparse.to_dense(unk_mask)

    @tf.function
    def generate_one_step(self, inputs, states=None):
        """Sample the next character for each input string.

        Args:
            inputs: String tensor holding the text to continue.
            states: Recurrent state from the previous call, or None.

        Returns:
            A `(predicted_chars, states)` tuple: the sampled characters and
            the state returned by the wrapped model.
        """
        # Strings -> characters -> token IDs, as a dense tensor.
        token_ids = self.ids_from_chars(
            tf.strings.unicode_split(inputs, 'UTF-8')).to_tensor()

        # logits has shape [batch, char, next_char_logits].
        logits, states = self.model(
            inputs=token_ids, states=states, return_state=True)

        # Keep only the last position, apply the temperature, then add the
        # mask that rules out "[UNK]".
        last_logits = logits[:, -1, :]
        scaled_logits = last_logits / self.temperature
        masked_logits = scaled_logits + self.prediction_mask

        # Draw one token ID per batch entry and drop the sample axis.
        sampled_ids = tf.squeeze(
            tf.random.categorical(masked_logits, num_samples=1), axis=-1)

        # Token IDs -> characters, returned together with the model state.
        return self.chars_from_ids(sampled_ids), states

In [56]:
# Wrap the model and both lookup layers for step-by-step text generation
# (default temperature of 1.0).
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [57]:
# Generate text from a seed string and time how long generation takes.
start = time.time()
states = None  # no recurrent state before the first step
next_char = tf.constant(['ROMEO:'])  # seed text; a batch of one string
result = [next_char]

# Generate 1000 characters, feeding each sampled character and the returned
# state back into the next step. The loop variable `n` is not used.
for n in range(1000):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)

# Concatenate the seed and all generated characters into one string per batch entry.
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

ROMEO:
I know us out 'twixt his unclean in death.

TALBOT:
Go to, England! Tell me, befall it: the true
issuir: the true't master do the cruelty.

MIRANDA:
O, pray, give me what doctor, your son's daughter.

CLOTEN:
I love those bond.

TYBELLOTO:
To the king.

PANDARUS:
Mistress Ford! a girdle, chamorous enemy!

ARIEL:

GLOUCESTER:
Now he is vouch'd into Posthumus,
They two are twenty tooldersing bed.
Marry, sir?

CARDINAL WOLSEY:
What trust?

PANDARUS:
We are lady at the beasts. What were good choice of it
with you, and could they creph quick again, as it be,
That in this sele-a field but majesty
To mongrel? Let it all dream'd like a Glendic;
Who bear his boots within: I see you were
As doctor, would be shorted, lads of darkness,
Yet laugh'd in to answer, glide: where are you?

COUNTESS:
Nay, heaven can make condemn enough to work
Reputation? show deward and starve: stealth,
Intion! Say'st thou?

KING HENRY V:
Pray, sir, art thou not?

DIC