# Chapter 16: Natural Language Processing with RNNs and Attention

### Setup

In [14]:
import tensorflow as tf
from tensorflow import keras
from keras import (
    layers,
    optimizers,
    losses,
    Sequential,
    utils,
)
import keras_tuner as kt

In [4]:
# Notebook-wide settings, gathered here so they can be tuned in one place.
_AUTOTUNE = tf.data.AUTOTUNE  # passed to Dataset.prefetch() in to_dataset
_BATCH_SIZE = 64  # default batch size of to_dataset
_SEED = 1992  # seeds TensorFlow's global RNG and the training-set shuffle

### Generating Shakespearean Text Using a Character RNN

In [5]:
# Location of the Shakespeare text corpus that is downloaded in the next cell.
SHAKESPEARE_URL = "https://homl.info/shakespeare"

In [6]:
# Fetch the corpus; get_file returns the path of the local copy.
filepath = utils.get_file("shakespeare.txt", SHAKESPEARE_URL)

# Decode explicitly as UTF-8 so the text does not depend on the platform's
# default locale encoding (which open() would otherwise fall back to).
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

In [7]:
# Sanity check: show the first 80 characters of the downloaded text.
print(shakespeare_text[:80])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


In [8]:
# Character-level tokenizer: lower-case the text, then split it into single
# characters so that every character becomes one token id.
text_vec_layer = layers.TextVectorization(
    standardize="lower",
    split="character",
)

# Learn the character vocabulary from the corpus itself.
text_vec_layer.adapt([shakespeare_text])

# The layer works on batches; the corpus is a batch of one, so keep row 0.
encoded = text_vec_layer([shakespeare_text])[0]


Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB



In [9]:
# TextVectorization reserves id 0 for padding and id 1 for unknown tokens.
# Neither should occur here (the layer was adapted on this same text), so the
# ids are shifted down by 2 to start at 0.
#
# The ids are re-derived from the vectorizer instead of shifting `encoded` in
# place: that keeps this cell idempotent, so running it twice cannot subtract
# the offset twice.
encoded = text_vec_layer([shakespeare_text])[0] - 2
n_tokens = text_vec_layer.vocabulary_size() - 2  # drop the two reserved ids
dataset_size = len(encoded)

print(f"There are {n_tokens} different, and the dataset has {dataset_size:_} total characters.")

There are 39 different, and the dataset has 1_115_394 total characters.


In [11]:
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=_BATCH_SIZE):
    """Turn one long token sequence into batched (input, target) windows.

    The sequence is cut into overlapping windows of ``length + 1`` consecutive
    tokens, each window starting one token after the previous one. Within a
    window, the input is everything but the last token and the target is
    everything but the first, i.e. the input shifted one step ahead.

    Args:
        sequence: 1-D sequence of token ids to slice into windows.
        length: Number of tokens in each input (and each target) sequence.
        shuffle: Whether to shuffle the windows before batching.
        seed: Seed forwarded to the shuffle; only used when ``shuffle`` is true.
        batch_size: Number of windows per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, targets)`` batches, with
        prefetching enabled.
    """
    window_size = length + 1

    def split_inputs_and_targets(batch):
        # Inputs drop the last token, targets drop the first.
        return batch[:, :-1], batch[:, 1:]

    windows = (
        tf.data.Dataset.from_tensor_slices(sequence)
        .window(window_size, shift=1, drop_remainder=True)
        # window() yields nested datasets; batch each one into a single tensor.
        .flat_map(lambda window_ds: window_ds.batch(window_size))
    )
    if shuffle:
        windows = windows.shuffle(buffer_size=100_000, seed=seed)

    batched = windows.batch(batch_size)
    return batched.map(split_inputs_and_targets).prefetch(_AUTOTUNE)


In [13]:
length = 100  # tokens per input window
tf.random.set_seed(_SEED)

# Three contiguous, non-overlapping slices of the encoded corpus:
#   train: tokens [0, 1_000_000)           (shuffled)
#   valid: tokens [1_000_000, 1_060_000)
#   test:  tokens [1_060_000, end)
train_set = to_dataset(encoded[:1_000_000], length=length, shuffle=True, seed=_SEED)
valid_set = to_dataset(encoded[1_000_000:1_060_000], length=length)
# The test split must start where validation ends; slicing `[:1_060_000]`
# would instead re-use the training and validation data and leak it into the
# test metrics.
test_set = to_dataset(encoded[1_060_000:], length=length)

In [None]:
# Character-level model: embed each token id, run a GRU over the sequence and
# emit a probability distribution over the vocabulary at every time step
# (return_sequences=True keeps one output per input position).
char_rnn_model = Sequential()
char_rnn_model.add(layers.Embedding(input_dim=n_tokens, output_dim=16))
char_rnn_model.add(layers.GRU(128, return_sequences=True))
char_rnn_model.add(layers.Dense(n_tokens, activation="softmax"))

# Targets are integer token ids rather than one-hot vectors, hence the sparse
# variant of categorical cross-entropy.
loss = losses.sparse_categorical_crossentropy
optimizer = optimizers.Nadam()
char_rnn_model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

# Callbacks and training
model_checkpoint_cb = 