In [37]:
import tensorflow as tf

shakespeare_url = "https://homl.info/shakespeare"  # shortcut URL
# get_file() fetches the URL into the local Keras cache (skipping the download when a
# cached copy already exists) and returns the path that is opened below.
filepath = tf.keras.utils.get_file("shkespeare.txt", shakespeare_url)

# Read with an explicit encoding so the decoded text does not depend on the
# platform's default locale encoding.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

# Show the first 80 characters as a quick check that the text was loaded.
print(shakespeare_text[:80])


First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


In [50]:
# split="character" makes the layer tokenize the text into single characters
# instead of words (the default); standardize="lower" lowercases the text first.
text_vec_layer = tf.keras.layers.TextVectorization(
    standardize="lower",
    split="character",
)
# Build the character vocabulary from the full corpus.
text_vec_layer.adapt([shakespeare_text])
# The layer is called on a one-element batch; [0] selects that single encoded row.
encoded = text_vec_layer([shakespeare_text])[0]

In [68]:
# Sanity check: call the layer on the bare string (not wrapped in a list) and
# inspect the tensor it returns.
text_vec_layer(shakespeare_text)

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([21,  7, 10, ..., 22, 28, 12])>

In [75]:
# Same text wrapped in a one-element list; [0] selects the first (and only) row.
# This is the same expression that is used to build `encoded`.
text_vec_layer([shakespeare_text])[0]

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([21,  7, 10, ..., 22, 28, 12])>

In [48]:
# Peek at the first five values of `encoded`.
encoded[:5]

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([21,  7, 10,  9,  4])>

In [49]:
# The pad (0) and unknown (1) token IDs are not needed, so shift every ID down by 2.
# The shifted IDs are recomputed from the layer rather than with `encoded -= 2`:
# an in-place shift is applied again each time this cell is re-run, which would
# push the IDs negative.
encoded = text_vec_layer([shakespeare_text])[0] - 2

n_tokens = text_vec_layer.vocabulary_size() - 2  # number of distinct characters
dataset_size = len(encoded)  # total number of characters = 1,115,394
dataset_size

1115394

In [89]:
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
    """Turn one long sequence into a dataset of (input, target) window batches.

    The sequence is sliced along its first axis into overlapping windows of
    ``length + 1`` consecutive elements, each window starting one element after
    the previous one. Windows that would run past the end are dropped. After
    batching, each batch is split into an input (every window without its last
    element) and a target (every window without its first element), so the
    target is the input shifted one step ahead.

    Args:
        sequence: The sequence to slice, e.g. a 1-D tensor of character IDs.
        length: Number of elements in each input/target window.
        shuffle: If truthy, shuffle the windows (buffer of 100,000) before batching.
        seed: Random seed passed to the shuffle step.
        batch_size: Number of windows per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, targets)`` pairs, with a
        prefetch of 1.
    """
    window_size = length + 1

    def split_input_target(batch):
        # Drop the last column for the inputs and the first column for the targets.
        return batch[:, :-1], batch[:, 1:]

    windows = (
        tf.data.Dataset.from_tensor_slices(sequence)
        .window(window_size, shift=1, drop_remainder=True)
        # window() yields nested datasets; batching each one by its full size
        # flattens it into a single tensor per window.
        .flat_map(lambda window_ds: window_ds.batch(window_size))
    )
    if shuffle:
        windows = windows.shuffle(buffer_size=100_000, seed=seed)
    return windows.batch(batch_size).map(split_input_target).prefetch(1)

In [90]:
# to_dataset() slices its input along the first axis, so it needs a 1-D sequence of
# token IDs. A one-element list holding the raw string is a length-1 sequence, which
# is shorter than a single window and therefore produces an empty dataset. Encode the
# sentence into character IDs first.
sequence = text_vec_layer(["The quick brown fox jumps over the lazy dog."])[0]
dataset = to_dataset(sequence, length=5, shuffle=True, seed=42, batch_size=2)

In [93]:
# Print the inputs and targets of up to five batches from `dataset`.
for inputs, targets in dataset.take(5):
    print("Inputs: ", inputs.numpy())
    print("Targets: ", targets.numpy())