In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [None]:
# Download the Shakespeare corpus and read the whole text into memory.
text_url = "https://homl.info/shakespeare"
filepath = keras.utils.get_file("shakespeare.txt", text_url)
# Decode explicitly as UTF-8 so that the result does not depend on the
# platform's default locale encoding.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

In [None]:
# Preview only the beginning of the corpus: echoing the entire text would flood
# the cell output and bloat the saved notebook.
print(shakespeare_text[:250])

First, we fit a tokenizer to our text using Keras's Tokenizer class: it maps each distinct character to an ID from 1 to the number of distinct characters (0 is not used, as it is reserved for masking). We set char_level=True to get character-level encoding rather than the default word-level encoding. Note that all characters are converted to lowercase by default.

In [None]:
# Character-level tokenizer: with char_level=True every character (rather than
# every word) is treated as a token.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)

# Fit the vocabulary on the corpus. The raw string is passed as-is (not wrapped
# in a list); a later cell reads tokenizer.document_count as the total number
# of characters, which presumably relies on each character being counted as one
# "text" -- confirm before changing this call.
tokenizer.fit_on_texts(shakespeare_text)

In [None]:
# Sanity check: encode a sample word into its sequence of character IDs.
tokenizer.texts_to_sequences(["Hello"])

In [None]:
# Sanity check in the other direction: decode a list of character IDs to text.
# NOTE(review): these IDs are presumably the ones the fitted tokenizer produces
# for "Hello"; they depend on the fitted vocabulary, so verify.
tokenizer.sequences_to_texts([[7, 2, 12, 12, 4]])

In [None]:
# max_id = number of entries in the tokenizer's vocabulary (one per distinct
# character); then display the character -> ID dictionary.
max_id = len(tokenizer.word_index)
tokenizer.word_index

In [None]:
# Dataset size = total number of characters, taken from the tokenizer's
# document counter.
dataset_size = tokenizer.document_count
dataset_size

Now let's encode the full text into a NumPy array of character IDs. We subtract 1 so that the IDs range from 0 to 38 rather than from 1 to 39:

In [None]:
# Encode the whole corpus as a single sequence, take that one row, and shift
# the IDs down by one so that they start at 0 instead of 1.
encoded = np.array(tokenizer.texts_to_sequences([shakespeare_text]))[0] - 1
[encoded]

In [None]:
# Display the encoded corpus (array of zero-based character IDs).
encoded

Splitting a sequence into batches of shuffled windows.

Now, to generate the training, validation, and test sets, we can split the dataset across time. In doing so, we implicitly assume that patterns from the past will still appear in the future, i.e., that the time series is stationary (its mean, variance and autocorrelations are constant over time). Let's use the first 90% of the series for the training set:

In [None]:
# Keep the first 90% of the encoded characters for training. Integer arithmetic
# keeps train_size an int, so it can be used directly as a slice bound.
train_size = (dataset_size * 90) // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

Now we are going to generate many small sequences of characters, and the RNN will be unrolled over the length of these short substrings of text (so the RNN will not be able to learn patterns longer than the substrings). To achieve this, we use the dataset's window() method:

In [None]:
n_steps = 100
# One extra character per window: the inputs and the targets are later taken
# from the same window, offset from each other by one character.
window_length = n_steps + 1

# shift=1 makes consecutive windows overlap (characters 0-100, then 1-101, ...),
# which creates more training examples.
# drop_remainder=True discards the final windows that would be shorter than
# window_length, so all windows have the same size and no padding is needed.
dataset = dataset.window(window_length, shift=1, drop_remainder=True)

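Our model expects tensors as input, not windows, so we first flatten each window into a single tensor. Then we can shuffle these windows, batch them, and separate the inputs (the first 100 characters) from their targets (the same characters shifted by one step, i.e., the last 100 characters of each window):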

In [None]:
batch_size = 32

# Input pipeline, step by step:
#   1. flat_map: turn each window into one tensor of window_length characters.
#   2. shuffle the windows, then group them into batches.
#   3. split each batch into inputs (all but the last character) and targets
#      (all but the first character, i.e. the inputs shifted by one step).
#   4. one-hot encode the inputs only; the targets stay as integer IDs.
#   5. prefetch one batch so it is prepared while the current one is in use.
dataset = (
    dataset
    .flat_map(lambda window: window.batch(window_length))
    .shuffle(10000)
    .batch(batch_size)
    .map(lambda windows: (windows[:, :-1], windows[:, 1:]))
    .map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
    .prefetch(1)
)

In [None]:
# Pull a single batch and print the shapes of the inputs and of the targets.
for X_batch, Y_batch in dataset.take(1):
    print(X_batch.shape, Y_batch.shape)

# Expected output: (32, 100, 39) (32, 100)

Our model has an input_shape of [None, max_id]: the input sequences can have any length (None), and each time step has max_id dimensions, one for each of the 39 features of the one-hot-encoded input. At each time step, the Dense layer (wrapped in TimeDistributed) outputs a probability for each of the max_id=39 possible characters, from which the most probable next character can be picked:

In [None]:
# Two stacked GRU layers followed by a per-time-step softmax over the characters.
# input_shape=[None, max_id]: sequences of any length, each step being a
# one-hot vector of size max_id.
# NOTE: dropout=0.2 / recurrent_dropout=0.2 on the GRU layers are left disabled.
model = keras.models.Sequential()
model.add(keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id]))
model.add(keras.layers.GRU(128, return_sequences=True))
model.add(
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax"))
)

In [None]:
# The targets are integer character IDs (not one-hot vectors), hence the
# sparse variant of the categorical cross-entropy loss.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

In [None]:
# Train for 10 passes over the windowed training dataset and keep the object
# returned by fit() in `history`.
history = model.fit(dataset, epochs=10)

In [None]:
def preprocess(texts):
    """Encode texts the same way as the training inputs.

    Each text is converted to character IDs with the fitted ``tokenizer``,
    the IDs are shifted down by one so that they start at 0, and the result
    is one-hot encoded with depth ``max_id``.

    Args:
        texts: list of strings. They are stacked into a single array, so they
            are assumed to all have the same length -- TODO confirm for callers.

    Returns:
        The one-hot encoded tensor produced by ``tf.one_hot``.
    """
    char_ids = tokenizer.texts_to_sequences(texts)
    zero_based_ids = np.array(char_ids) - 1
    return tf.one_hot(zero_based_ids, max_id)

In [None]:
# Predict the character that follows the prompt.
X_new = preprocess(["How are yo"])
# Sequential.predict_classes() is not available in recent TensorFlow releases;
# taking the argmax over the last axis of the predicted probabilities yields
# the same class IDs.
Y_pred = np.argmax(model.predict(X_new), axis=-1)
# Shift the IDs back up by one to match the tokenizer's 1-based IDs before
# decoding, then keep the last character of the first (and only) text.
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1]