In [None]:
import tensorflow as tf
# Download the Shakespeare corpus (get_file returns the path of the local
# copy) and read the whole file into a single string.
shakespeare_url = "https://homl.info/shakespeare"
filepath = tf.keras.utils.get_file("shakespeare.txt", shakespeare_url)
# Explicit encoding: without it open() falls back to the platform locale,
# so the decoded text could differ from one machine to another.
with open(filepath, encoding="utf-8") as f:
  shakespeare_text = f.read()

In [None]:
# Preview the first 80 characters of the corpus.
print(shakespeare_text[:80])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


In [None]:
# Every distinct character of the lower-cased corpus, sorted and joined into one string.
"".join(sorted(set(shakespeare_text.lower())))


"\n !$&',-.3:;?abcdefghijklmnopqrstuvwxyz"

In [None]:
# Character-level tokenizer: lower-case the text and split it into single
# characters, so each character becomes one integer token ID.
text_vec_layer = tf.keras.layers.TextVectorization(
    standardize = "lower",
    split = "character",
)
# Learn the vocabulary from the full corpus.
text_vec_layer.adapt([shakespeare_text])
# The layer is called on a batch containing one string; [0] picks that single row,
# giving a 1-D tensor of token IDs.
encoded = text_vec_layer([shakespeare_text])[0]
encoded

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([21,  7, 10, ..., 22, 28, 12])>

In [None]:
# Display the encoded token IDs again (same tensor as produced by the vectorization cell).
encoded

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([21,  7, 10, ..., 22, 28, 12])>

In [None]:
# List the learned vocabulary; an entry's position in the list is its token ID.
# The first two entries are '' and '[UNK]' (IDs 0 and 1).
text_vec_layer.get_vocabulary()

['',
 '[UNK]',
 ' ',
 'e',
 't',
 'o',
 'a',
 'i',
 'h',
 's',
 'r',
 'n',
 '\n',
 'l',
 'd',
 'u',
 'm',
 'y',
 'w',
 ',',
 'c',
 'f',
 'g',
 'b',
 'p',
 ':',
 'k',
 'v',
 '.',
 "'",
 ';',
 '?',
 '!',
 '-',
 'j',
 'q',
 'x',
 'z',
 '3',
 '&',
 '$']

In [None]:
# Shift every token ID down by 2 so that character IDs start at 0: the
# vectorizer reserves 0 (pad) and 1 (unknown), which we will not see.
# The IDs are recomputed from the vectorizer output instead of using
# `encoded -= 2`, so running this cell more than once cannot shift them twice.
encoded = text_vec_layer([shakespeare_text])[0] - 2
n_tokens = text_vec_layer.vocabulary_size() - 2 # number of distinct chars = 39
dataset_size = len(encoded)

In [None]:
# Number of distinct character tokens (vocabulary size minus the pad and unknown entries).
n_tokens

39

In [None]:
# Total number of characters in the raw corpus.
len(shakespeare_text)

1115394

In [None]:
def to_dataset(sequence, length, shuffle = False, seed = None, batch_size = 32):
  """Turn a 1-D sequence into batches of (input, target) windows.

  Every window holds `length + 1` consecutive elements and consecutive
  windows start one element apart. Each window is split into an input made
  of its first `length` elements and a target made of its last `length`
  elements, i.e. the input shifted one step ahead.

  Args:
    sequence: 1-D tensor (or array-like) of elements to slice into windows.
    length: number of time steps in each input/target sequence.
    shuffle: if True, shuffle the windows (buffer of 100,000) before batching.
    seed: random seed passed to the shuffle step.
    batch_size: number of windows per batch.

  Returns:
    A tf.data.Dataset yielding (inputs, targets) pairs, with prefetching
    of one batch.
  """
  window_size = length + 1  # one extra element so the target can be shifted by one

  dataset = tf.data.Dataset.from_tensor_slices(sequence)
  dataset = dataset.window(window_size, shift=1, drop_remainder=True)
  # window() yields nested datasets; batching each one by its full size
  # flattens it into a single tensor per window.
  dataset = dataset.flat_map(lambda sub_ds: sub_ds.batch(window_size))
  if shuffle:
    dataset = dataset.shuffle(100_000, seed = seed)
  dataset = dataset.batch(batch_size)

  def split_inputs_targets(batch):
    # Inputs drop the last step, targets drop the first step.
    return batch[:, :-1], batch[:, 1:]

  return dataset.map(split_inputs_targets).prefetch(1)

In [None]:
# Toy example: the sequence 0..9 with length=3 gives 7 windows in a single batch;
# each target row is its input row shifted one step ahead.
list(to_dataset(tf.range(10), 3))

[(<tf.Tensor: shape=(7, 3), dtype=int32, numpy=
  array([[0, 1, 2],
         [1, 2, 3],
         [2, 3, 4],
         [3, 4, 5],
         [4, 5, 6],
         [5, 6, 7],
         [6, 7, 8]], dtype=int32)>,
  <tf.Tensor: shape=(7, 3), dtype=int32, numpy=
  array([[1, 2, 3],
         [2, 3, 4],
         [3, 4, 5],
         [4, 5, 6],
         [5, 6, 7],
         [6, 7, 8],
         [7, 8, 9]], dtype=int32)>)]

In [None]:
# Same idea on real text: "To be" is 5 characters, so length=4 yields exactly one window.
# Note that the IDs here come straight from the vectorizer (not shifted by 2).
list(to_dataset(text_vec_layer(['To be'])[0], length=4))

[(<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[ 4,  5,  2, 23]])>,
  <tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[ 5,  2, 23,  3]])>)]

In [None]:
# Sequence length used for every input/target window.
length = 100
tf.random.set_seed(42)

# Split boundaries (in tokens): the first 1,000,000 tokens are for training,
# the next 60,000 for validation, and the remainder for testing.
train_end = 1_000_000
valid_end = 1_060_000

# Only the training windows are shuffled.
train_set = to_dataset(
    encoded[:train_end], length = length, shuffle = True, seed = 42
)
valid_set = to_dataset(encoded[train_end : valid_end], length = length)
test_set = to_dataset(encoded[valid_end:], length = length)

In [None]:
# for i, l in test_set.take(1):
#   print(l)

KeyboardInterrupt: 

Building and Training the Char-RNN Model

In [None]:
# Char-RNN: embed each character ID, run a GRU over the sequence, and predict
# a probability distribution over the next character at every time step.
tf.random.set_seed(42)
model = tf.keras.Sequential([
     tf.keras.layers.Embedding(input_dim = n_tokens, output_dim = 16),
     # return_sequences must be True: to_dataset() produces a target for every
     # time step (targets have shape (batch, length)), so the GRU has to emit
     # an output per step rather than only its final state.
     tf.keras.layers.GRU(128, return_sequences = True),
     # Applied to each time step: one probability per character token.
     tf.keras.layers.Dense(n_tokens, activation = 'softmax')
])
# Targets are integer class IDs, hence the sparse variant of the loss.
model.compile(loss = "sparse_categorical_crossentropy", optimizer = "nadam",
              metrics=["accuracy"])
# Keep only the checkpoint with the best validation accuracy seen so far.
model_cktp = tf.keras.callbacks.ModelCheckpoint(
    "my_shakespeare_model.keras", monitor="val_accuracy", save_best_only = True
)

history = model.fit(train_set, validation_data = valid_set, epochs = 3,
                    callbacks = [model_cktp])




In [None]:
# End-to-end model that accepts raw strings: vectorize the text, apply the
# same "minus 2" ID shift that was applied to the training data, then run
# the trained char-RNN.
shakespeare_model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Lambda(lambda token_ids: token_ids - 2),
    model,
])