# Setup

In [2]:
import tensorflow as tf

2023-11-14 05:23:25.723136: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Generating Shakespearean Text Using a Character RNN

## Creating the Training Dataset

In [3]:
# Fetch the Shakespeare corpus; get_file returns the local path of the
# downloaded file, which is then read fully into memory as one string.
shakespeare_url = "https://homl.info/shakespeare"
filepath = tf.keras.utils.get_file("shakespeare.txt", shakespeare_url)
# Pin the encoding so the text decodes identically on every platform
# instead of depending on the locale's default encoding.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

Downloading data from https://homl.info/shakespeare


In [5]:
# Sanity check: show the first 80 characters of the raw corpus.
print(shakespeare_text[:80])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


In [7]:
# Total number of characters in the corpus.
len(shakespeare_text)

1115394

In [9]:
# Character-level tokenizer: lower-case the text, then emit one token per character.
text_vec_layer = tf.keras.layers.TextVectorization(
    standardize="lower",
    split="character",
)
# Build the vocabulary from the full corpus.
text_vec_layer.adapt([shakespeare_text])
# The layer is called on a batch holding a single string, so take row 0 to
# get the 1-D sequence of token ids for the whole corpus.
encoded = text_vec_layer([shakespeare_text])[0]
encoded

2023-11-14 05:28:32.670215: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([21,  7, 10, ..., 22, 28, 12])>

In [10]:
# Token ids of the first 80 characters (same span as the text preview above).
encoded[:80]

<tf.Tensor: shape=(80,), dtype=int64, numpy=
array([21,  7, 10,  9,  4,  2, 20,  7,  4,  7, 37,  3, 11, 25, 12, 23,  3,
       21,  5, 10,  3,  2, 18,  3,  2, 24, 10,  5, 20,  3,  3, 14,  2,  6,
       11, 17,  2, 21, 15, 10,  4,  8,  3, 10, 19,  2,  8,  3,  6, 10,  2,
       16,  3,  2,  9, 24,  3,  6, 26, 28, 12, 12,  6, 13, 13, 25, 12,  9,
       24,  3,  6, 26, 19,  2,  9, 24,  3,  6, 26, 28])>

In [11]:
# Derive the shifted ids directly from the vectorizer output rather than with
# `encoded -= 2`: the in-place form subtracts 2 again every time this cell is
# re-run, silently corrupting the ids. This form is idempotent.
encoded = text_vec_layer([shakespeare_text])[0] - 2  # drop tokens 0 (pad) and 1 (unknown character)
n_tokens = text_vec_layer.vocabulary_size() - 2  # number of distinct chars
dataset_size = len(encoded)  # total number of chars

In [52]:
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=12):
    """Turn a 1-D sequence into batches of (input, target) windows.

    Every window spans ``length + 1`` consecutive elements and consecutive
    windows start one element apart. Within a window the input is the first
    ``length`` elements and the target is the same window shifted by one.

    Args:
        sequence: the source sequence, passed to ``from_tensor_slices``.
        length: number of elements in each input (and each target) row.
        shuffle: when true, shuffle the windows with a 100,000-element buffer.
        seed: seed forwarded to the shuffle step.
        batch_size: number of windows per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, targets)`` tuples, with a
        prefetch of one batch.
    """
    # One extra element per window so the target can be the input shifted by one.
    window_size = length + 1

    windows = (
        tf.data.Dataset.from_tensor_slices(sequence)
        # Incomplete windows at the end of the sequence are dropped.
        .window(window_size, shift=1, drop_remainder=True)
        # window() yields nested datasets; batch each one into a single tensor.
        .flat_map(lambda nested: nested.batch(window_size))
    )
    if shuffle:
        windows = windows.shuffle(buffer_size=100_000, seed=seed)

    def split_inputs_targets(batch):
        # Inputs drop the last element, targets drop the first.
        return batch[:, :-1], batch[:, 1:]

    return windows.batch(batch_size).map(split_inputs_targets).prefetch(1)

# Toy check of to_dataset on the integers 1..19. With windows of
# length + 1 = 12 elements there are only 8 full windows, so the single
# batch printed below has 8 rows even though batch_size defaults to 12.
ds = to_dataset(range(1,20), length=11, shuffle=True, seed=42)
for window in ds:
    # Each item is an (inputs, targets) tuple of tensors.
    print(window)
    # Alternative display, converting each tensor to a NumPy array:
    # print([elem.numpy() for elem in window])

(<tf.Tensor: shape=(8, 11), dtype=int32, numpy=
array([[ 4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14],
       [ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12],
       [ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11],
       [ 6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16],
       [ 7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17],
       [ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13],
       [ 8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18],
       [ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]], dtype=int32)>, <tf.Tensor: shape=(8, 11), dtype=int32, numpy=
array([[ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
       [ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13],
       [ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12],
       [ 7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17],
       [ 8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18],
       [ 4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14],
       [ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
       [ 6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16]], dtype=int32)>)


In [53]:
length = 100
tf.random.set_seed(42)

# The corpus holds 1,115,394 characters, so the split points must lie below
# that. The previous boundary of 100_000_000 made the training slice cover
# the whole corpus (including the test range) and left the validation slice
# empty. Split: first 1,000,000 chars for training, next 60,000 for
# validation, the remainder for testing.
train_set = to_dataset(encoded[:1_000_000], length=length, shuffle=True, seed=42)
valid_set = to_dataset(encoded[1_000_000:1_060_000], length=length)
test_set = to_dataset(encoded[1_060_000:], length=length)

## Building and Training the Char-RNN Model

In [54]:
# Char-RNN: embed each character id, run a GRU that emits an output at every
# time step, then predict a distribution over the n_tokens characters.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=26))
model.add(tf.keras.layers.GRU(128, return_sequences=True))
model.add(tf.keras.layers.Dense(n_tokens, activation="softmax"))

# Targets are integer character ids, hence the sparse cross-entropy loss.
model.compile(
    optimizer="nadam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Keep only the checkpoint with the best validation accuracy seen so far.
# NOTE(review): the path is spelled "shakepeare"; left unchanged here so any
# existing checkpoint on disk still resolves.
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    filepath="my_shakepeare_model",
    monitor="val_accuracy",
    save_best_only=True,
)

history = model.fit(
    train_set,
    epochs=10,
    validation_data=valid_set,
    callbacks=[model_ckpt],
)

Epoch 1/10


2023-11-14 06:16:48.245842: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 69132 of 100000
2023-11-14 06:16:52.679019: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:417] Shuffle buffer filled.


  23135/Unknown - 1579s 67ms/step - loss: 1.3926 - accuracy: 0.5752

KeyboardInterrupt: 

In [56]:
# End-to-end model that accepts raw strings: tokenize, shift the ids, predict.
shakespeare_model = tf.keras.Sequential()
shakespeare_model.add(text_vec_layer)
# Same shift as applied to the training data: drop ids 0 (pad) and 1 (unknown).
shakespeare_model.add(tf.keras.layers.Lambda(lambda token_ids: token_ids - 2))
shakespeare_model.add(model)