# NLP with RNNs and Attention

## Using Character RNNs

Character RNNs (char-RNNs) are RNNs trained to predict the next character in a sequence of text. Feeding each predicted character back in as input lets them generate new text one character at a time.

### Example of Generating Text

#### Data Prep

In [19]:
import numpy as np
import tensorflow as tf

In [2]:
# Download the Shakespeare corpus and read the whole file into a single string.
shakespeare_url = 'https://homl.info/shakespeare'
filepath = tf.keras.utils.get_file('shakespeare.txt', shakespeare_url)
# Explicit encoding: without it open() falls back to the platform's locale
# encoding, so the decoded text could differ from one machine to another.
with open(filepath, encoding='utf-8') as f:
    shakespeare_text = f.read()

Downloading data from https://homl.info/shakespeare
[1m1115394/1115394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [3]:
# Preview the first 80 characters of the corpus (print keeps the line breaks readable).
print(shakespeare_text[:80])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


In [4]:
## Encode text: split into characters, lower-case, and map each character to an integer ID
text_vec_layer = tf.keras.layers.TextVectorization(split='character', standardize='lower')
text_vec_layer.adapt([shakespeare_text])
# Feed the layer a batch holding the one corpus string, then take row 0 of the
# result. The subscript belongs on the layer's output: `[shakespeare_text][0]`
# would just unwrap the Python list and hand the layer a bare scalar string.
encoded = text_vec_layer([shakespeare_text])[0]

I0000 00:00:1743221447.696809    4591 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9710 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060, pci bus id: 0000:01:00.0, compute capability: 8.6


In [7]:
### Drop tokens 0 (pad) and 1 (unknown), which we will not use, by shifting every ID down by 2.
### The shifted IDs are rebuilt from the vectorizer output instead of using `encoded -= 2`,
### so re-running this cell can never shift the IDs a second time.
encoded = text_vec_layer([shakespeare_text])[0] - 2
n_tokens = text_vec_layer.vocabulary_size() - 2 ### Number of distinct chars
dataset_size = len(encoded) ### Total number of chars

In [8]:
### Convert to sequences so the model can guess the next character
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
    """Turn a token sequence into batched (inputs, targets) windows.

    Every window spans ``length + 1`` consecutive elements of ``sequence``,
    with consecutive windows starting one element apart. Each batch of
    windows is split into inputs (all but the last element of each window)
    and targets (all but the first), so the targets are the inputs shifted
    one step ahead.

    Args:
        sequence: Tensor-like data, sliced along its first axis.
        length: Number of elements in each input (and each target) sequence.
        shuffle: When true, shuffle the windows before batching.
        seed: Seed passed to the shuffle step.
        batch_size: Number of windows per batch.

    Returns:
        A ``tf.data.Dataset`` of ``(inputs, targets)`` pairs with prefetching
        enabled.
    """
    window_size = length + 1  # one extra element supplies the final target

    def split_inputs_targets(batch):
        return batch[:, :-1], batch[:, 1:]

    windows = (
        tf.data.Dataset.from_tensor_slices(sequence)
        .window(window_size, shift=1, drop_remainder=True)
        # window() yields nested datasets; batching each one flattens it to a tensor.
        .flat_map(lambda nested: nested.batch(window_size))
    )
    if shuffle:
        windows = windows.shuffle(buffer_size=100_000, seed=seed)
    return windows.batch(batch_size).map(split_inputs_targets).prefetch(1)

In [9]:
length = 100
tf.random.set_seed(42)

# Contiguous split of the encoded text: the first 1,000,000 characters train,
# the next 60,000 validate, and whatever remains is the test set.
train_end, valid_end = 1_000_000, 1_060_000

train_set = to_dataset(encoded[:train_end], length=length, shuffle=True, seed=42)
valid_set = to_dataset(encoded[train_end:valid_end], length=length)
test_set = to_dataset(encoded[valid_end:], length=length)

#### Building Char-RNN Model

In [22]:
# Train only when no checkpoint is on disk, so the notebook still runs top to
# bottom on a fresh machine without retraining on every run.
if not tf.io.gfile.exists('my_shakespeare_model.keras'):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),
        tf.keras.layers.GRU(128, return_sequences=True),
        tf.keras.layers.Dense(n_tokens, activation='softmax')
    ])
    model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam',
                  metrics=['accuracy'])
    # Keeps only the epoch with the best validation accuracy on disk.
    model_ckpt = tf.keras.callbacks.ModelCheckpoint(
        'my_shakespeare_model.keras', monitor='val_accuracy', save_best_only=True
    )
    # NOTE: `history` exists only when training actually ran in this session.
    history = model.fit(train_set, validation_data=valid_set, epochs=10,
                        callbacks=[model_ckpt])

# Always load from the checkpoint so `model` is the saved model whether or not
# training just ran.
model = tf.keras.models.load_model('my_shakespeare_model.keras')

In [24]:
# Wrap the char-RNN so it takes raw strings: vectorize the text, shift the IDs
# down by 2 (the same shift applied to the training data), then run the model.
shift_token_ids = tf.keras.layers.Lambda(lambda X: X - 2)  ### NO pad or <UNK> tokens
shakespeare_model = tf.keras.Sequential([text_vec_layer, shift_token_ids, model])

In [25]:
# Test Model: ask for the character that follows a short prompt
test_input = tf.constant(["To be or not to b"])
all_step_probas = shakespeare_model.predict(test_input)
y_proba = all_step_probas[0, -1]  ## First (only) sample, last time step
y_pred = tf.argmax(y_proba) ## Choose the most probable character ID
vocabulary = text_vec_layer.get_vocabulary()
vocabulary[y_pred + 2]  ## +2 undoes the ID shift done inside shakespeare_model

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step


'e'

### Generating Fake Shakespearean Text