In [25]:
import tensorflow as tf

shakespeare_url = "https://homl.info/shakespeare" # shortcut URL
filepath = tf.keras.utils.get_file("shkespeare.txt", shakespeare_url)

with open(filepath) as f:
    shakespeare_text = f.read()

print(shakespeare_text[:80])


First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


In [26]:
# se set split="charachter" to split the text into a list of single characters instead of words (the default)
text_vec_layer = tf.keras.layers.TextVectorization(split="character", standardize='lower')
text_vec_layer.adapt([shakespeare_text])
encoded = text_vec_layer([shakespeare_text])[0]

In [27]:
text_vec_layer(shakespeare_text)

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([21,  7, 10, ..., 22, 28, 12])>

In [28]:
text_vec_layer([shakespeare_text])[0]

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([21,  7, 10, ..., 22, 28, 12])>

In [29]:
encoded[:5]

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([21,  7, 10,  9,  4])>

In [30]:
# we don't need the pad and unk tokens, so we can skip the first 2
encoded -= 2 # drop 0 (pad) and 1 (unkown) tokens

n_tokens = text_vec_layer.vocabulary_size() - 2 # number of distinct characters
dataset_size =  len(encoded) # total number of chars = 1,155,394
dataset_size

1115394

In [31]:
# utility function to convert a long sequence of characters into many small windows of text: convert long sequence of charcters IDs into a dataset of input/target pairs of small windows of text

def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
  ds = tf.data.Dataset.from_tensor_slices(sequence)
  ds = ds.window(length + 1, shift=1, drop_remainder=True)
  ds = ds.flat_map(lambda window: window.batch(length+1))
  if shuffle:
    ds = ds.shuffle(buffer_size=100_000, seed=seed)
  ds = ds.batch(batch_size)
  return ds.map(lambda window: (window[:, :-1], window[:, 1:])).prefetch(1)

In [32]:
# Now let's create the training (90%), validation (5%) and test (5%) sets
length = 100
tf.random.set_seed(42)
train_set = to_dataset(encoded[:dataset_size*90//100], length=length, shuffle=True, seed=42)
valid_set = to_dataset(encoded[dataset_size*90//100:dataset_size*90//100+dataset_size*5//100], length=length)
test_set = to_dataset(encoded[dataset_size*95//100:], length=length)

In [33]:
# Now let's create the model
model = tf.keras.Sequential([
  tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),
  tf.keras.layers.GRU(128, return_sequences=True),
  tf.keras.layers.Dense(n_tokens, activation="softmax")
])

model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam", metrics=["accuracy"])
model_ckpt = tf.keras.callbacks.ModelCheckpoint("my_shakespeare_model", save_best_only=True)
history = model.fit(train_set, epochs=20, validation_data=valid_set, callbacks=[model_ckpt])

Epoch 1/20
   1620/Unknown - 77s 42ms/step - loss: 2.0225 - accuracy: 0.4111

KeyboardInterrupt: 

In [None]:
# This model does not handle text preprocessing, so we need to add a preprocessing layer to take care of that
# We also need to add a RNN layer to handle the long sequences of text

shakespeare_model = tf.keras.Sequential([
  text_vec_layer,
  tf.keras.layers.lamda(lambda x: x - 2), # no <PAD> and <UNK> tokens
  model
  ])


In [24]:
# Now we can use the model to generate some text
y_pred = model.predict_classes(['To be or nor to be'])[0, -1]
y_pred = tf.argmax(y_pred)
text_vec_layer.get_vocabulary()[y_pred + 2] # output 'e'

NameError: name 'model' is not defined

In [36]:
log_probas = tf.math.log([[0.5, 0.4, 0.1]])
tf.random.set_seed(42)
tf.random.categorical(log_probas, num_samples=8)

<tf.Tensor: shape=(1, 8), dtype=int64, numpy=array([[0, 1, 0, 2, 1, 0, 0, 1]])>