# Chapter 16: Natural Language Processing with RNNs and Attention

### Setup

In [4]:
import tensorflow as tf
from tensorflow import keras
from keras import (
    callbacks,
    layers,
    optimizers,
    losses,
    Sequential,
    utils,
)


NotFoundError: dlopen(/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/tensorflow-plugins/libmetal_plugin.dylib, 0x0006): symbol not found in flat namespace '__ZN10tensorflow8internal10LogMessage16VmoduleActivatedEPKci'

In [None]:
# Notebook-wide configuration shared by the cells below.
AUTOTUNE = tf.data.AUTOTUNE  # passed to map()/prefetch() in the dataset helpers
BATCH_SIZE = 64  # default batch size of to_dataset()
SEED = 1992  # used for tf.random.set_seed() and dataset shuffling
LOGS_DIR = "../../reports/logs/chapter_16/"  # root directory for TensorBoard logs
MODELS_PATH = "../../models/chapter_16/"  # root directory for model checkpoints

In [None]:
# Create the output directories up front. `makedirs` also creates any missing
# parent directories and succeeds when the directory already exists, whereas
# `mkdir` fails if a parent (e.g. ../../reports/logs) has not been created yet.
for directory in (LOGS_DIR, MODELS_PATH):
    tf.io.gfile.makedirs(directory)

### Generating Shakespearean Text Using a Character RNN

In [None]:
# URL the Shakespeare corpus is downloaded from.
SHAKESPEARE_URL = "https://homl.info/shakespeare"

In [None]:
# Download the corpus and get the path of the local copy.
filepath = utils.get_file("shakespeare.txt", SHAKESPEARE_URL)

# Read with an explicit encoding so the text is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

In [None]:
# Preview the first 80 characters of the corpus.
print(shakespeare_text[:80])

In [None]:
# Character-level tokenizer: split the text into single characters and lowercase it.
text_vec_layer = layers.TextVectorization(split="character", standardize="lower")
# Build the vocabulary from the whole corpus.
text_vec_layer.adapt([shakespeare_text])
# Encode the corpus; [0] selects the only sequence of the one-element batch.
encoded = text_vec_layer([shakespeare_text])[0]

In [None]:
# TextVectorization reserves token ids 0 (padding) and 1 (unknown). Neither is
# needed here, so shift the ids down by 2 and shrink the vocabulary accordingly.
# Re-encoding from the layer (instead of `encoded -= 2`) keeps this cell
# idempotent: re-running it can no longer subtract the offset a second time.
encoded = text_vec_layer([shakespeare_text])[0] - 2
n_tokens = text_vec_layer.vocabulary_size() - 2
dataset_size = len(encoded)

print(
    f"There are {n_tokens} distinct characters, and the dataset has "
    f"{dataset_size:_} total characters."
)

In [None]:
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=BATCH_SIZE):
    """Turn a token sequence into batches of (input, target) windows.

    Args:
        sequence: sequence of token ids to slice into windows.
        length: number of time steps in each input window.
        shuffle: whether to shuffle the windows before batching.
        seed: seed forwarded to the shuffle step.
        batch_size: number of windows per batch.

    Returns:
        A `tf.data.Dataset` yielding `(inputs, targets)` pairs, where `targets`
        is `inputs` shifted one step ahead.
    """
    # One extra token per window so the targets can be the inputs shifted by one
    window_size = length + 1

    def split_inputs_targets(batch):
        return batch[:, :-1], batch[:, 1:]

    windows = (
        tf.data.Dataset.from_tensor_slices(sequence)
        .cache()
        .window(window_size, shift=1, drop_remainder=True)
        .flat_map(lambda window_ds: window_ds.batch(window_size))
    )
    if shuffle:
        windows = windows.shuffle(buffer_size=100_000, seed=seed)
    batched = windows.batch(batch_size)
    pairs = batched.map(split_inputs_targets, num_parallel_calls=AUTOTUNE)
    return pairs.prefetch(AUTOTUNE)


In [None]:
length = 100
tf.random.set_seed(SEED)
# Three disjoint, contiguous splits of the encoded corpus:
#   train: the first 1,000,000 characters
#   valid: the next 60,000 characters
#   test:  everything after that
train_set = to_dataset(encoded[:1_000_000], length=length, shuffle=True, seed=SEED)
valid_set = to_dataset(encoded[1_000_000:1_060_000], length=length)
# The slice must START at 1_060_000; slicing `[:1_060_000]` would evaluate on
# the very characters used for training and validation.
test_set = to_dataset(encoded[1_060_000:], length=length)

In [None]:
# Build the model: embedding -> GRU -> per-time-step softmax over the characters
char_rnn_model = Sequential()
char_rnn_model.add(layers.Embedding(input_dim=n_tokens, output_dim=16))
char_rnn_model.add(layers.GRU(128, return_sequences=True))
char_rnn_model.add(layers.Dense(n_tokens, activation="softmax"))

# Compile the model
optimizer = optimizers.Nadam()
loss = losses.sparse_categorical_crossentropy
char_rnn_model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

# Checkpoint callback: keep only the weights with the best validation accuracy
model_filepath = tf.io.gfile.join(MODELS_PATH, "char_rnn")
model_checkpoint_cb = callbacks.ModelCheckpoint(
    filepath=model_filepath, monitor="val_accuracy", save_best_only=True
)

# TensorBoard callback: histograms every epoch, profiling from batch 1 onwards
log_dir = tf.io.gfile.join(LOGS_DIR, "char_rnn")
profile_batch = int(len(encoded) / BATCH_SIZE) * 2
tensorboard_cb = callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
    profile_batch=f"1, {profile_batch}",
)
callbacks_ = [model_checkpoint_cb, tensorboard_cb]

# Train
history = char_rnn_model.fit(
    train_set, epochs=2, validation_data=valid_set, callbacks=callbacks_
)

In [None]:
# Wrap the trained network so it accepts raw strings: vectorize the text, shift
# the token ids down by 2 (as was done for the training data), then run the RNN.
# The guard keeps this cell idempotent: re-running it must not wrap an
# already-wrapped model a second time, which would feed token ids to the
# vectorization layer.
if char_rnn_model.layers[0] is not text_vec_layer:
    char_rnn_model = Sequential([
        text_vec_layer,
        layers.Lambda(lambda X: X - 2),
        char_rnn_model,
    ])

In [None]:
# Predict on a one-string batch; [0, -1] keeps the output of the last time step
y_proba = char_rnn_model.predict(["To be or not to b"])[0, -1]
y_pred = tf.argmax(y_proba)  # index of the highest probability
# +2 undoes the id shift applied by the Lambda layer before the vocabulary lookup
text_vec_layer.get_vocabulary()[y_pred + 2]

#### Generating Fake Shakespearean Text

Let's use the `tf.random.categorical()` function to sample random class indices from a set of class log-probabilities:

In [None]:
# tf.random.categorical() expects 2-D logits of shape [batch_size, num_classes],
# so the probabilities are wrapped in an outer list (a batch of one).
log_probas = tf.math.log([[0.5, 0.4, 0.1]])  # Probas = 50%, 40%, 10%
tf.random.set_seed(SEED)
tf.random.categorical(log_probas, num_samples=8)  # draw 8 class indices

In [None]:
def next_char(text, temperature=1):
    """Sample the character that follows `text` from the model's predictions.

    Args:
        text: prompt string fed to `char_rnn_model`.
        temperature: divisor applied to the log-probabilities before sampling.

    Returns:
        The sampled character, looked up in the vectorizer's vocabulary.
    """
    # Keep only the last time step, as a one-row matrix for categorical()
    last_step_proba = char_rnn_model.predict([text])[0, -1:]
    scaled_log_proba = tf.math.log(last_step_proba) / temperature
    sampled = tf.random.categorical(scaled_log_proba, num_samples=1)
    token_id = sampled[0, 0]
    # +2 maps the sampled id back onto the vectorizer's vocabulary indices
    vocabulary = text_vec_layer.get_vocabulary()
    return vocabulary[token_id + 2]


def extent_text(text, n_chars=50, temperature=1):
    """Extend `text` by `n_chars` sampled characters, one at a time.

    Each new character is sampled with `next_char` from the text generated so
    far, using the given `temperature`.

    Returns:
        The original text followed by the generated characters.
    """
    generated = text
    for _step in range(n_chars):
        generated = generated + next_char(generated, temperature)
    return generated

In [None]:
tf.random.set_seed(SEED)
input_text = "to be or not to b"
# Generate one continuation of the same prompt per sampling temperature
temperatures = [0.001, 1, 10, 1000]
for temp in temperatures:
    text = extent_text(input_text, temperature=temp)
    print(f"TEMP:{temp}")
    print(f"\n\t{text}")

#### Stateful RNN

In [2]:
def to_dataset_for_stateful_rnn(sequence, length):
    """Build consecutive (input, target) windows, one window per batch.

    Windows are `length + 1` tokens long and start every `length` tokens, so
    the inputs of each batch begin where the inputs of the previous one ended.

    Args:
        sequence: sequence of token ids to slice into windows.
        length: number of time steps in each input window.

    Returns:
        A `tf.data.Dataset` yielding `(inputs, targets)` pairs with a batch
        size of 1, where `targets` is `inputs` shifted one step ahead.
    """
    window_size = length + 1

    def split_inputs_targets(batch):
        return batch[:, :-1], batch[:, 1:]

    windows = tf.data.Dataset.from_tensor_slices(sequence).window(
        window_size, shift=length, drop_remainder=True
    )
    flattened = windows.flat_map(lambda window_ds: window_ds.batch(window_size))
    batches = flattened.batch(1)
    pairs = batches.map(split_inputs_targets, num_parallel_calls=AUTOTUNE)
    return pairs.prefetch(AUTOTUNE)

In [3]:
# Same three disjoint splits as for the stateless model: the first 1,000,000
# characters for training, the next 60,000 for validation, the rest for testing.
stateful_train_set = to_dataset_for_stateful_rnn(encoded[:1_000_000], length=length)
stateful_valid_set = to_dataset_for_stateful_rnn(
    encoded[1_000_000:1_060_000], length=length
)
# The slice must START at 1_060_000; slicing `[:1_060_000]` would evaluate on
# the very characters used for training and validation.
stateful_test_set = to_dataset_for_stateful_rnn(encoded[1_060_000:], length=length)


NameError: name 'encoded' is not defined

In this case, creating the model requires specifying the batch size:

In [None]:
# Stateful character RNN; the batch size is fixed to 1 through batch_input_shape
stateful_model = Sequential()
stateful_model.add(
    layers.Embedding(input_dim=n_tokens, output_dim=16, batch_input_shape=[1, None])
)
stateful_model.add(layers.GRU(128, return_sequences=True, stateful=True))
stateful_model.add(layers.Dense(n_tokens, activation="softmax"))


In [None]:
class ResetStatesCallback(callbacks.Callback):
    """Keras callback that resets the model's states at the start of every epoch."""

    def on_epoch_begin(self, epoch, logs=None):
        """Reset the model's states; `epoch` and `logs` are unused."""
        # `logs=None` matches the base Callback.on_epoch_begin signature, so the
        # hook also works when it is invoked without a logs dictionary.
        self.model.reset_states()

In [None]:
stateful_model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="nadam",
    metrics=["accuracy"]
)

# Callbacks and training
# Use a checkpoint path and a log directory of their own: reusing "char_rnn"
# would overwrite the stateless model's checkpoint and mix both runs in the
# same TensorBoard log directory.
model_filepath = tf.io.gfile.join(MODELS_PATH, "stateful_char_rnn")
model_checkpoint_cb = callbacks.ModelCheckpoint(
    model_filepath,
    monitor="val_accuracy",
    save_best_only=True,
)

log_dir = tf.io.gfile.join(LOGS_DIR, "stateful_char_rnn")
profile_batch = int(len(encoded) / BATCH_SIZE) * 2
tensorboard_cb = callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, profile_batch=f"1, {profile_batch}")
# ResetStatesCallback resets the model's states at the beginning of each epoch
callbacks_ = [model_checkpoint_cb, tensorboard_cb, ResetStatesCallback()]

stateful_model.fit(
    stateful_train_set,
    validation_data=stateful_valid_set,
    epochs=10,
    callbacks=callbacks_
)