# Shakespeare Text Generation NLP
Character-level text generation with a GRU network trained on Shakespeare's works, following the book *Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow*.

In [1]:
import tensorflow as tf
# Record the TensorFlow version this notebook was run with.
print(f"TensorFlow version {tf.__version__}")

TensorFlow version 2.12.0


# Load Data

In [2]:
# Download the corpus and keep the local path that get_file returns.
shakespeare_url = "https://homl.info/shakespeare"
filepath = tf.keras.utils.get_file("shakespeare.txt", shakespeare_url)

# Decode explicitly as UTF-8 so the text does not depend on the platform's
# default encoding (e.g. cp1252 on Windows).
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

# Quick sanity check: show the first 80 characters of the corpus.
print(shakespeare_text[:80])

Downloading data from https://homl.info/shakespeare
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


# Data Processing
In this notebook we use character-level vectorization: the text is split into individual characters and each character is mapped to an integer ID.

In [3]:
# Encode the characters into integer IDs (lower-cased, one token per character)
text_vec_layer = tf.keras.layers.TextVectorization(
    split="character",
    standardize="lower")
text_vec_layer.adapt([shakespeare_text])

# Vectorize a batch containing the single document, then take its only row.
# (The index belongs outside the call: `[shakespeare_text][0]` would just hand
# the layer the bare string instead of a batch of one.)
encoded = text_vec_layer([shakespeare_text])[0]

# 0 is padding, and 1 is <oov>. Drop them by shifting every ID down by 2,
# and shrink the vocabulary size accordingly.
encoded -= 2
n_tokens = text_vec_layer.vocabulary_size() - 2
dataset_size = len(encoded)

Next, window the dataset. First, create a windowing function, then use it to create sequences and targets.

In [4]:
# Create the windowing function
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
    """Turn a token sequence into batches of (input, target) windows.

    Every window holds ``length + 1`` consecutive tokens, and consecutive
    windows start one token apart. Each batch is split into an input made of
    all but the last token of every window, and a target made of all but the
    first token, i.e. the input shifted by one step.

    Args:
        sequence: Sequence of token IDs to slice into windows.
        length: Number of time steps in each input/target sequence.
        shuffle: When true, shuffle the windows before batching.
        seed: Seed forwarded to the shuffle step.
        batch_size: Number of windows per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, targets)`` batches.
    """
    window_size = length + 1  # one extra token so the target can be shifted by one

    def split_inputs_targets(batch):
        # Inputs drop the last token, targets drop the first.
        return batch[:, :-1], batch[:, 1:]

    windows = (
        tf.data.Dataset.from_tensor_slices(sequence)
        .window(window_size, shift=1, drop_remainder=True)
        # window() yields nested datasets; batching each one flattens it to a tensor.
        .flat_map(lambda nested: nested.batch(window_size))
    )
    if shuffle:
        windows = windows.shuffle(buffer_size=100_000, seed=seed)

    batched = windows.batch(batch_size)
    return batched.map(split_inputs_targets).prefetch(1)

In [5]:
# Build the training / validation / test datasets with the windowing function.
# The split is intended to be roughly 90% training, 5% validation, 5% testing.
length = 100
tf.random.set_seed(42)

# Only the training windows are shuffled; validation and test keep their order.
train_set = to_dataset(encoded[:1_000_000], length=length, shuffle=True, seed=42)
valid_set = to_dataset(encoded[1_000_000:1_060_000], length=length)
test_set = to_dataset(encoded[1_060_000:], length=length)

# Build and Train a Model

### Use a reasonable GRU Model

In [None]:
# Character-level language model: embedding -> GRU -> per-time-step softmax
model_gru = tf.keras.Sequential()
model_gru.add(tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16))
# return_sequences=True so a next-character distribution is produced at every step
model_gru.add(tf.keras.layers.GRU(128, return_sequences=True))
model_gru.add(tf.keras.layers.Dense(n_tokens, activation="softmax"))

# Targets are integer character IDs, hence the sparse cross-entropy loss
model_gru.compile(
    optimizer="nadam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Keep only the checkpoint with the best validation accuracy seen so far
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    filepath="my_shakespeare_model",
    monitor="val_accuracy",
    save_best_only=True,
)

# Train the model and keep the training history
history = model_gru.fit(
    x=train_set,
    validation_data=valid_set,
    epochs=20,
    callbacks=[model_ckpt],
)

In [None]:
# Training is slow, so persist the trained model to disk for later reuse.
model_gru.save(filepath="shakespeare.h5")

In [None]:
# Restore the previously saved model from disk.
model_gru = tf.keras.models.load_model(filepath="shakespeare.h5")

In [7]:
# Finally, put the text vectorization layer in front of the trained model so
# predictions can be made directly from raw strings.
shakespear_model = tf.keras.Sequential([
    text_vec_layer,
    # Shift the IDs down by 2, the same offset applied to the training data.
    tf.keras.layers.Lambda(lambda token_ids: token_ids - 2),
    model_gru,
])

## Make some Predictions

In [8]:
# Predict the character that follows the prompt.
# [0, -1] picks the first (only) sample and its last time step.
y_proba = shakespear_model.predict(["To be or not to b"])[0, -1]
y_pred = tf.math.argmax(y_proba)
# Add 2 back to undo the ID shift before looking the character up.
text_vec_layer.get_vocabulary()[y_pred + 2]



'e'

Nice! Now let us get some actual text.

In [9]:
# Demo: draw 8 class indices at random from the probabilities 0.5 / 0.4 / 0.1.
tf.random.set_seed(42)
log_probas = tf.math.log([[0.5, 0.4, 0.1]])
tf.random.categorical(logits=log_probas, num_samples=8)

<tf.Tensor: shape=(1, 8), dtype=int64, numpy=array([[0, 0, 1, 1, 1, 0, 0, 0]])>

In [10]:
# Use "temperature" to alter how crazy the text is
# These are just a few helper functions 

def next_char(text, temperature=1):
    """Sample the character that follows ``text`` from the model's prediction.

    Args:
        text: Prompt string fed to ``shakespear_model``.
        temperature: Positive divisor applied to the log-probabilities before
            sampling. Values below 1 favor the most likely characters; values
            above 1 make the choice more random.

    Returns:
        The sampled character, looked up in the vectorizer's vocabulary.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        # Dividing by zero (or a negative value) would produce infinite or
        # inverted logits instead of a meaningful distribution.
        raise ValueError(f"temperature must be positive, got {temperature!r}")

    # verbose=0: this runs once per generated character, so the default
    # progress bar would flood the cell output.
    # [0, -1:] keeps a leading axis of size 1, giving tf.random.categorical
    # the 2-D input it expects.
    y_proba = shakespear_model.predict([text], verbose=0)[0, -1:]
    rescaled_logits = tf.math.log(y_proba) / temperature
    char_id = tf.random.categorical(rescaled_logits, num_samples=1)[0, 0]
    # Add 2 back to undo the ID shift before the vocabulary lookup.
    return text_vec_layer.get_vocabulary()[char_id + 2]

def extend_text(text, n_chars=50, temperature=1):
    """Grow ``text`` by sampling ``n_chars`` further characters one at a time.

    Each new character is sampled with ``next_char`` from everything generated
    so far, then appended before the next step.

    Args:
        text: Starting prompt.
        n_chars: Number of characters to append.
        temperature: Sampling temperature forwarded to ``next_char``.

    Returns:
        The prompt followed by the generated characters.
    """
    generated = text
    for _step in range(n_chars):
        generated = generated + next_char(generated, temperature)
    return generated

In [11]:
# Fix the seed so the sampled continuation is repeatable, then generate text.
tf.random.set_seed(42)
print(extend_text("To be or not to be", temperature=1))

To be or not to be true
doth know that i love your daughter would
an
