# Natural Language Processing with RNNs and Attention

In [1]:
# FIXME: make autocompletion work again
%config Completer.use_jedi = False

import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

physical_devices = tf.config.list_physical_devices('GPU')

if not physical_devices:
    print("No GPU was detected.")
else:
    # Let TensorFlow grow GPU memory on demand instead of reserving it all up front
    # (https://stackoverflow.com/a/60699372).
    #  - The setting has to be the same on every visible GPU, so it is applied to
    #    all of them rather than only to the first one.
    for gpu in physical_devices:
        tf.config.experimental.set_memory_growth(gpu, True)
    
from tensorflow import keras

No GPU was detected.


## Char-RNN
Let's build an RNN that processes sequences of text and predicts a single character.

### Loading the Data and Preparing the Dataset
The following example uses Shakespeare's famous texts.

In [2]:
# Seed NumPy's and TensorFlow's global RNGs so the notebook is repeatable
np.random.seed(42)
tf.random.set_seed(42)

# Download the dataset; `filepath` is the location of the local copy
filepath = keras.utils.get_file(
    "shakespeare.txt",
    "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
)

# Load the raw dataset into a single string
#  - the encoding is given explicitly so that decoding does not depend on the platform's default
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

# Show a piece of the text
print(shakespeare_text[:148])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?



In [3]:
# Character-level tokenizer: every distinct character (rather than every word) gets its own ID
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)

# Learn the character vocabulary from the whole corpus
tokenizer.fit_on_texts(texts=shakespeare_text)

In [4]:
# Encode a sample text: one list of character IDs is returned per input text
tokenizer.texts_to_sequences(texts=["First"])

[[20, 6, 9, 8, 3]]

In [5]:
# Decode the character IDs again
#  - the round trip yields lower-case characters separated by spaces, not the original string
tokenizer.sequences_to_texts(sequences=[[20, 6, 9, 8, 3]])

['f i r s t']

In [6]:
# Seed both RNGs so that shuffling is repeatable
np.random.seed(42)
tf.random.set_seed(42)

# Number of distinct characters and total number of characters in the corpus
max_id = len(tokenizer.word_index)
dataset_size = tokenizer.document_count

# Preprocessing parameters
n_steps = 100                # length of one training input (sequence of text)
batch_size = 32              # number of windows in a training micro-batch
window_length = n_steps + 1  # one extra step: the target is the input shifted 1 character ahead

# Encode the whole corpus as character IDs
#  - the tokenizer's IDs start at 1, so 1 is subtracted to make them start at 0
encoded = np.array(tokenizer.texts_to_sequences([shakespeare_text]))[0] - 1

# The first 90% of the text is used for training
train_size = dataset_size * 90 // 100


def _to_inputs_and_targets(windows):
    """Split a batch of windows into (inputs, targets): the targets are the windows shifted by one step."""
    return windows[:, :-1], windows[:, 1:]


def _one_hot_inputs(X_batch, Y_batch):
    """One-hot encode the input character IDs; the targets stay as sparse IDs."""
    return tf.one_hot(X_batch, depth=max_id), Y_batch


# Input pipeline, step by step:
#  1. slide a window over the (repeated) text one character at a time, dropping short trailing windows
#  2. `window()` yields nested datasets, so each one is batched to its full length to get plain tensors
#  3. shuffle the windows (the text inside a window stays intact) and group them into micro-batches
#  4. split every window into inputs and targets
#  5. one-hot encode the inputs - there are only a few distinct characters
#  6. prefetch one batch so that data preparation overlaps with training
dataset = (
    tf.data.Dataset.from_tensor_slices(encoded[:train_size])
    .repeat()
    .window(window_length, shift=1, drop_remainder=True)
    .flat_map(lambda window: window.batch(window_length))
    .shuffle(10000)
    .batch(batch_size)
    .map(_to_inputs_and_targets)
    .map(_one_hot_inputs)
    .prefetch(1)
)

# Show the tensor shapes of the first batch
for X_batch, Y_batch in dataset.take(1):
    print(X_batch.shape, Y_batch.shape)

(32, 100, 39) (32, 100)


### Creating and Training the Model

In [7]:
# Char-RNN: two stacked GRU layers followed by a per-time-step softmax classifier
#  - each GRU has 128 units, returns its full output sequence and uses 20% dropout
#    on both its inputs (`dropout`) and its recurrent state (`recurrent_dropout`)
#  - the time-distributed dense layer has one unit per distinct character (`max_id`)
#    and outputs the class probabilities of the next character
model = keras.models.Sequential()
model.add(
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id], dropout=0.2, recurrent_dropout=0.2)
)
model.add(keras.layers.GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
model.add(keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax")))
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")

# Train the model for 10 epochs
#  - Note: a full pass would take forever on my PC, so only the first 40 batches are used;
#    the full run would be `model.fit(dataset, steps_per_epoch=train_size // batch_size, epochs=10)`
few_batches = dataset.take(40)
history = model.fit(few_batches, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Using the Model to Generate Text

In [8]:
def preprocess(texts):
    """One-hot encode `texts` so that they conform to the Char-RNN's input.

    The fitted tokenizer turns each text into character IDs, the IDs are shifted
    down by one (the same shift that was applied to the training data) and the
    result is one-hot encoded with depth `max_id`.
    """
    char_ids = np.array(tokenizer.texts_to_sequences(texts))
    return tf.one_hot(char_ids - 1, depth=max_id)

# Predict the most likely next character at every time step of a new text
X_new = preprocess(["How are yo"])
Y_pred = model.predict(X_new).argmax(axis=-1)

# Decode the prediction (IDs are shifted back to the tokenizer's range first)
# and show the last character of the first (and only) text
predicted_texts = tokenizer.sequences_to_texts(Y_pred + 1)
predicted_texts[0][-1]

'u'

Next, let's generate not just a single letter but a whole new text. One approach is to call the above repeatedly. However, this often leads to the same letter being repeated over and over again. A better approach is to select the next letter randomly, based on the learned class probabilities.

In [9]:
def next_char(text, temperature=1):
    """
    Sample the character that follows given text.
     1. we pre-process and predict as before but keep the class probabilities of the last time step
     2. then we compute the log of the probabilities and divide it by the `temperature` parameter
        (values below 1 favour the high-probability letters, values above 1 make the choice more random)
     3. finally we select a single character randomly given these rescaled log-probs. and convert the character ID back to text

    Raises `ValueError` when `temperature` is not positive, because the rescaling divides by it.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature!r}")

    X_new = preprocess([text])
    # `-1:` keeps the time axis, so the result is a [1, n_classes] matrix as `categorical` expects
    y_proba = model.predict(X_new)[0, -1:, :]
    rescaled_logits = tf.math.log(y_proba) / temperature
    # +1 undoes the shift applied in `preprocess` so the ID matches the tokenizer again
    char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1
    return tokenizer.sequences_to_texts(char_id.numpy())[0]


def complete_text(text, n_chars=50, temperature=1):
    """Return `text` extended by `n_chars` sampled characters.

    Each new character is sampled with `next_char` from the text generated so far,
    using the given `temperature`.
    """
    completed = text
    for _step in range(n_chars):
        completed = completed + next_char(completed, temperature)
    return completed


# Reset RNG state so that the sampled text is repeatable
tf.random.set_seed(42)

# Complete some text with a low temperature; more temperatures follow in the next cells
#  - Note: this example doesn't present the model very well since it has not been trained on the full dataset
low_temperature_text = complete_text("t", temperature=0.2)
print(low_temperature_text)

t the beall the beall the what the belly sinst the 


In [10]:
# Temperature 1 leaves the predicted log-probabilities unscaled
unit_temperature_text = complete_text("t", temperature=1)
print(unit_temperature_text)

tucio. as you up. greccoun:
the beabudt tot enius:



In [11]:
# Temperature 2 halves the log-probabilities before sampling
high_temperature_text = complete_text("t", temperature=2)
print(high_temperature_text)

ty no c't;
mest,-haigeatfrai' at:,
mearbsgr:
ger. b


## Stateful RNN
The premise of a *Stateful RNN* is simple: so far we've thrown all neurons' hidden states away after applying BPTT on a training batch. In other words, the hidden states were re-initialized for each partial update, so the model had a hard time learning long-term patterns. The idea of a *Stateful RNN* is to keep the hidden state from the previous batch instead of initializing it all over again.

This has, however, a consequence for the pre-processing logic. If we assume the state is carried over from previous batches, the batches of training instances cannot overlap - each one must directly continue the previous one. In our text generating example, this means we can't use overlapping windows and shuffling anymore.

In [12]:
# Reset RNG state
tf.random.set_seed(42)

# (a) Updated pre-processing logic for Stateful Char-RNN
# - In this version a single window is applied at a time
# - Note: `dataset` is rebuilt by variant (b) further down in this cell
#
# Differences from the stateless pipeline:
#  - windows are shifted by the full `n_steps`, so consecutive inputs do not overlap
#  - there is no shuffling, which would break the order the preserved state relies on
#  - batches contain 1 window, so each step continues exactly where the previous one ended
# The remaining steps (input/target split, 1-hot encoding, prefetching) are analogous.
dataset = (
    tf.data.Dataset.from_tensor_slices(encoded[:train_size])
    .window(window_length, shift=n_steps, drop_remainder=True)
    .flat_map(lambda window: window.batch(window_length))
    .repeat()
    .batch(1)
    .map(lambda windows: (windows[:, :-1], windows[:, 1:]))
    .map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
    .prefetch(1)
)

# (b) Updated pre-processing logic for Stateful Char-RNN
# - In this more complicated version a micro-batch of windows is applied, as before
batch_size = 32

@tf.function
def make_windowed_ds(encoded_part):
    """Turn one part of the encoded text into a flat TF Dataset of windows.

    Every element is a window of `window_length` character IDs. Successive windows
    start `n_steps` apart, so the inputs taken from them do not overlap; a trailing
    window shorter than `window_length` is dropped.
    """
    windows = tf.data.Dataset.from_tensor_slices(encoded_part).window(
        window_length, shift=n_steps, drop_remainder=True
    )
    # `window()` yields nested datasets; batching each one to its full length flattens them
    return windows.flat_map(lambda window: window.batch(window_length))

def _stack_windows(*windows):
    """Stack one window from every part of the text into a single batch tensor."""
    return tf.stack(windows)


def _to_inputs_and_targets(windows):
    """Split a batch of windows into (inputs, targets): the targets are the windows shifted by one step."""
    return windows[:, :-1], windows[:, 1:]


def _one_hot_inputs(X_batch, Y_batch):
    """One-hot encode the input character IDs; the targets stay as sparse IDs."""
    return tf.one_hot(X_batch, depth=max_id), Y_batch


# Contrary to before, the windowed Dataset is made in two steps:
#  1. The training text is split into `batch_size` consecutive parts of (nearly) equal length
#     and a windowed Dataset is made from each part
#  2. The parts are then zipped back together and their windows stacked, so that
#     the n-th input sequence of a batch starts where the n-th sequence of the previous batch ended
text_parts = np.array_split(encoded[:train_size], batch_size)
datasets = map(make_windowed_ds, text_parts)
dataset = tf.data.Dataset.zip(tuple(datasets)).map(_stack_windows)

# Final steps are the same as before:
#  - split each window into (inputs, target)
#  - 1-hot encode the categorical input features
#  - prefetch the data for better performance
dataset = (
    dataset
    .repeat()
    .map(_to_inputs_and_targets)
    .map(_one_hot_inputs)
    .prefetch(1)
)

# Build a Stateful RNN model
# The architecture is basically the same as before, notice two distinctions:
#  - `stateful=True` on the recurrent layers to preserve hidden state
#  - `batch_input_shape` set for the initial recurrent layer to let the model know the shape (batch size) for the hidden state
# Options shared by both recurrent layers
#  - `stateful=True` keeps the hidden state between batches
#  - both layers return their full output sequence and use 20% input and recurrent dropout
stateful_gru_options = dict(
    return_sequences=True,
    stateful=True,
    dropout=0.2,
    recurrent_dropout=0.2,
)

model = keras.models.Sequential([
    # The first layer fixes the batch size, which determines the shape of the preserved state
    keras.layers.GRU(128, batch_input_shape=[batch_size, None, max_id], **stateful_gru_options),
    keras.layers.GRU(128, **stateful_gru_options),
    # One softmax unit per distinct character, applied at every time step
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax")),
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")

# Train and validate the model
#  - we use custom callback to reset model's state at the start of each epoch (instead of each batch)
#  - we train the model for 50 epochs, also notice the updated `steps_per_epoch`

class ResetStatesCallback(keras.callbacks.Callback):
    """Callback that resets the model's recurrent states at the start of each epoch."""

    def on_epoch_begin(self, epoch, logs=None):
        """Reset the states of the model this callback is attached to.

        `epoch` and `logs` are part of the Keras hook signature and are not used here.
        `logs` defaults to `None` so that the override can be called the same way as the hook it replaces.
        """
        self.model.reset_states()


# One epoch covers the training text once: every step consumes `batch_size` windows of `n_steps` characters
reset_states = ResetStatesCallback()
history = model.fit(
    dataset,
    epochs=50,
    steps_per_epoch=train_size // batch_size // n_steps,
    callbacks=[reset_states],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


To use the model with different batch sizes, we need to create a stateless copy. We can get rid of dropout since it is only used during training.

In [13]:
# Set RNG state
tf.random.set_seed(42)

# Stateless twin of the stateful Char-RNN, used only for making predictions
#  - same layers and sizes, but without `stateful=True` and without a fixed batch size
#  - dropout is left out because it is only active during training
stateless_model = keras.models.Sequential()
stateless_model.add(keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id]))
stateless_model.add(keras.layers.GRU(128, return_sequences=True))
stateless_model.add(keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax")))

# Build the model for any batch size and any sequence length, then copy the learned weights over
#  - this works because dropout adds no trainable parameters, so the weight lists line up
stateless_model.build(tf.TensorShape([None, None, max_id]))
trained_weights = model.get_weights()
stateless_model.set_weights(trained_weights)

# `complete_text()` implicitly works with the global `model`, so point it at the stateless copy
model = stateless_model

# Try to complete some text
print(complete_text("t"))

thee: do your carioble,
thou like saggn,' dear chop


## Sentiment Analysis
Let's take a step further from the character-level RNNs to word-level sentiment analysis. A typical dataset for this task is the IMDb reviews dataset, so let's play.

In [14]:
# Reset RNG state
tf.random.set_seed(42)

# Load the IMDb reviews dataset
#  - `load_data()` returns two (inputs, labels) pairs; every part needs its own name,
#    otherwise the training labels are overwritten by the labels of the second pair
#  - `X_valid` and `y_test` both belong to the second (held-out) pair; the names are kept as they were
(X_train, y_train), (X_valid, y_test) = keras.datasets.imdb.load_data()

# Show a training instance
#  - The dataset is already preprocessed, each instance is a sequence of integers which represent word IDs
X_train[0][:10]

  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]

In [15]:
# To reconstruct the words, load the word -> ID index that comes with the dataset
word_index = keras.datasets.imdb.get_word_index()

# Invert it into an ID -> word mapping
# - Note: the IDs are shifted by 3 because the first three IDs are reserved for special markers
id_to_word = {id_ + 3: word for word, id_ in word_index.items()}

# Register the special markers: 0 = padding symbol, 1 = start of sequence, 2 = unknown word
special_markers = ("<pad>", "<sos>", "<unk>")
id_to_word.update(enumerate(special_markers))

# Show the first ten decoded words of the first training review
" ".join(map(id_to_word.__getitem__, X_train[0][:10]))

'<sos> this film was just brilliant casting location scenery story'

Now, let's create the same pre-processing logic and trainable dataset using TensorFlow's Datasets API.

In [16]:
import tensorflow_datasets as tfds

# Load the IMDb reviews as TF Datasets
#  - `as_supervised=True` yields (review, label) pairs, `with_info=True` also returns the dataset metadata
#  - Note: Using TF-only functions allows us to reuse the same pre-processing logic in every environment
datasets, info = tfds.load(name="imdb_reviews", with_info=True, as_supervised=True)

# List the available splits
datasets.keys()

dict_keys(['train', 'test', 'unsupervised'])

In [17]:
# Read the number of examples of the training and test splits from the dataset metadata
train_split, test_split = info.splits["train"], info.splits["test"]
train_size = train_split.num_examples
test_size = test_split.num_examples

(train_size, test_size)

(25000, 25000)

In [18]:
# Peek at the first two training examples
#  - reviews arrive as raw bytes, so they are decoded before printing (first 200 characters only)
for X_batch, y_batch in datasets["train"].batch(2).take(1):
    for raw_review, label in zip(X_batch.numpy(), y_batch.numpy()):
        excerpt = raw_review.decode("utf-8")[:200]
        sentiment = "= Positive" if label else "= Negative"
        print("Review:", excerpt, "...")
        print("Label:", label, sentiment)
        print()

Review: This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting  ...
Label: 0 = Negative

Review: I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However  ...
Label: 0 = Negative



In [19]:
def preprocess(X_batch, y_batch):
    """
    Pre-process an input batch of raw reviews:
     1. Crops each instance to its first 300 bytes (speeds up training and sentiment can usually be deduced from the first few sentences)
     2. Replaces '<br />' tags by a space character
     3. Replaces every character that is neither a letter nor an apostrophe by a space
     4. Splits instances on whitespace, creating a ragged tensor
     5. Returns a dense tensor made by padding the splits with '<pad>', together with the unchanged labels

    Note: this redefines the `preprocess` helper of the Char-RNN section, which `next_char()` relies on.
    """
    # The crop is measured in bytes (the default unit), stated explicitly here
    X_batch = tf.strings.substr(X_batch, 0, 300, unit="BYTE")
    X_batch = tf.strings.regex_replace(X_batch, rb"<br\s*/?>", b" ")
    X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z']", b" ")
    X_batch = tf.strings.split(X_batch)
    return X_batch.to_tensor(default_value=b"<pad>"), y_batch

# Apply the preprocessing to the two reviews peeked at above
preprocess(X_batch=X_batch, y_batch=y_batch)

(<tf.Tensor: shape=(2, 53), dtype=string, numpy=
 array([[b'This', b'was', b'an', b'absolutely', b'terrible', b'movie',
         b"Don't", b'be', b'lured', b'in', b'by', b'Christopher',
         b'Walken', b'or', b'Michael', b'Ironside', b'Both', b'are',
         b'great', b'actors', b'but', b'this', b'must', b'simply', b'be',
         b'their', b'worst', b'role', b'in', b'history', b'Even',
         b'their', b'great', b'acting', b'could', b'not', b'redeem',
         b'this', b"movie's", b'ridiculous', b'storyline', b'This',
         b'movie', b'is', b'an', b'early', b'nineties', b'US',
         b'propaganda', b'pi', b'<pad>', b'<pad>', b'<pad>'],
        [b'I', b'have', b'been', b'known', b'to', b'fall', b'asleep',
         b'during', b'films', b'but', b'this', b'is', b'usually', b'due',
         b'to', b'a', b'combination', b'of', b'things', b'including',
         b'really', b'tired', b'being', b'warm', b'and', b'comfortable',
         b'on', b'the', b'sette', b'and', b'having', b'j

In [20]:
from collections import Counter

batch_size = 32

# Do a word-count over the whole pre-processed training dataset (in one pass)
#  - every batch is converted to NumPy once and counted as a flat list of words,
#    instead of converting every single word tensor separately
#  - the padding token '<pad>' is counted like any other word
vocabulary = Counter()
for padded_reviews, _labels in datasets["train"].batch(batch_size).map(preprocess):
    vocabulary.update(padded_reviews.numpy().ravel().tolist())

# Show the 3 most common words in the training corpus
vocabulary.most_common(3)

[(b'<pad>', 214309), (b'the', 61137), (b'a', 38564)]

In [21]:
# Number of distinct tokens that were counted (the '<pad>' token included)
len(vocabulary)

53893

In [22]:
# Drop the least important words and keep just the 10k most frequent ones
vocab_size = 10_000
truncated_vocabulary = [word for word, _ in vocabulary.most_common(vocab_size)]

# Make a word index from the truncated vocabulary (the most frequent word gets ID 0)
word_to_id = {word: index for index, word in enumerate(truncated_vocabulary)}

# Test the word index on an example sentence
#  - unknown words fall back to `vocab_size`; the fallback is passed as the `get` default
#    because `get(word) or vocab_size` would also replace the valid ID 0
for word in b"This movie was faaaaaantastic".split():
    print(word_to_id.get(word, vocab_size))

22
12
11
10000


In [23]:
# Static vocabulary table with 1k out-of-vocabulary (OOV) buckets
num_oov_buckets = 1000

# Keys are the words of the truncated vocabulary, values are their positions in it
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)

# Build the lookup table
table = tf.lookup.StaticVocabularyTable(initializer=vocab_init, num_oov_buckets=num_oov_buckets)

# Test the lookup table on the example sentence we used before
sample_words = tf.constant([b"This movie was faaaaaantastic".split()])
table.lookup(sample_words)

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   22,    12,    11, 10053]])>

In [24]:
def encode_words(X_batch, y_batch):
    """Replace every word of an input batch by its ID from the static vocabulary table.

    The labels are returned unchanged.
    """
    word_id_batch = table.lookup(X_batch)
    return word_id_batch, y_batch

# Preprocess and encode the whole training set
#  - the raw reviews are repeated and batched first, then every batch is tokenized and encoded
batched_reviews = datasets["train"].repeat().batch(batch_size)
train_set = batched_reviews.map(preprocess).map(encode_words).prefetch(1)

# Display the 1st training batch (word IDs first, labels second)
for X_batch, y_batch in train_set.take(1):
    print(X_batch, y_batch, sep="\n")

tf.Tensor(
[[  22   11   28 ...    0    0    0]
 [   6   21   70 ...    0    0    0]
 [4099 6881    1 ...    0    0    0]
 ...
 [  22   12  118 ...  331 1047    0]
 [1757 4101  451 ...    0    0    0]
 [3365 4392    6 ...    0    0    0]], shape=(32, 60), dtype=int64)
tf.Tensor([0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 0 0 1 0 0 0], shape=(32,), dtype=int64)


In [25]:
# The embedding dimension hyperparameter
embed_size = 128

# Classification RNN that starts with a word embedding layer
#  - the embedding matrix has shape [ID count = vocabulary size + OOV buckets, embedding dimension]
#  - the model's inputs are 2D tensors [batch size, time steps]; the embedding output is a 3D tensor
#    [batch size, time steps, embedding size]
#  - `mask_zero=True` makes the model ignore ID=0 - the most frequent word, which in our case is `<pad>`
#    (so the model doesn't have to learn to ignore it)
#  - note: it would be cleaner to ensure that the padding word really has ID 0 than to count on it
#    being the most frequent one
model = keras.models.Sequential()
model.add(keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size, mask_zero=True, input_shape=[None]))
model.add(keras.layers.GRU(128, return_sequences=True))
model.add(keras.layers.GRU(128))
model.add(keras.layers.Dense(1, activation="sigmoid"))
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model for 5 epochs
#  - `train_set` repeats forever, so the length of an epoch is given explicitly
history = model.fit(train_set, epochs=5, steps_per_epoch=train_size // batch_size)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Manual Masking

In [26]:
K = keras.backend

# Input layer: sequences of word IDs of any length
inputs = keras.layers.Input(shape=[None])

# Boolean mask that is False wherever the input ID equals 0
mask = keras.layers.Lambda(lambda token_ids: K.not_equal(token_ids, 0))(inputs)

# Same structure as the previous model, but the mask is handed to the recurrent layers explicitly
#  - Note: In the previous example the output dense layer didn't receive the implicit mask because the
#          time dimension was not the same, so explicit masking is necessary if we want to propagate
#          this information all the way to the loss function.
#  - Note 2: The downside is that LSTMs and GRUs won't use the optimized impl. for GPUs, so training
#            might be slower.
embedded = keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size)(inputs)
sequence_features = keras.layers.GRU(128, return_sequences=True)(embedded, mask=mask)
z = keras.layers.GRU(128)(sequence_features, mask=mask)

# Model output: a single sigmoid unit
outputs = keras.layers.Dense(1, activation="sigmoid")(z)

# Compose and compile the model
model = keras.models.Model(inputs=[inputs], outputs=[outputs])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model for 5 epochs
history = model.fit(train_set, epochs=5, steps_per_epoch=train_size // batch_size)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Reusing Pretrained Embeddings

In [27]:
import tensorflow_hub as hub

# Reset RNG state
tf.random.set_seed(42)

# Pre-trained part of the model: Google's sentence embedding module from TF Hub
#  - it takes raw strings as input and outputs 50-dimensional vectors
#  - Note: By default TF Hub downloads models to /tmp, one can override this by setting the
#          `TFHUB_CACHE_DIR` env. variable
#  - Note 2: TF Hub layers are by default non-trainable - to tweak their weights we must unfreeze them
pretrained_embedding = hub.KerasLayer(
    "https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1",
    dtype=tf.string,
    input_shape=[],
    output_shape=[50],
)

# Two dense layers on top of it handle our sentiment classification task
model = keras.Sequential([
    pretrained_embedding,
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid"),
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Load the IMDb reviews dataset
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)

# Take the training set and just repeat, batch and prefetch it
#  - Note: The rest of the preprocessing logic is handled by the TF Hub portion of the model
train_size = info.splits["train"].num_examples
raw_train_set = datasets["train"]
train_set = raw_train_set.repeat().batch(batch_size).prefetch(1)

# Finally, train the model on our IMDb dataset
history = model.fit(train_set, epochs=5, steps_per_epoch=train_size // batch_size)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Encoder-Decoder Network for Neural Machine Translation

As the name suggests, in the *Encoder-Decoder* architecture we split a *sequence-to-sequence* RNN into two parts:
1. Encoder - takes as inputs reversed sequences of words (or rather embeddings thereof; reversed so that the decoder receives the first word first)
1. Decoder - this part actually has two inputs: the first is the hidden states of the encoder and the second is either the previous target word (during training; embedded) or the actual token that was output in the previous step (during inference; embedded)

Additional notes to the architecture:
* The outputs of the decoder are scores for each word in the vocabulary which are turned to probabilities using time-distributed *softmax*. Because we can easily get to very high-dimensional outputs, typically a *sampled softmax* is used for training and regular *softmax* for inference
* In this task we cannot simply truncate input sequences to a common length as before because we want to get complete translations. Padding everything to some large common length does not work well either. Instead, we can bucket the sentences into sets of similar length and pad them to match the longest one in each set.
* Finally, we should ignore part of the output after an `<EOS>` token - both from the output and loss function

In [28]:
import tensorflow_addons as tfa

# Set the RNG state
tf.random.set_seed(42)

# Vocabulary and embedding size hyperparameters
vocab_size = 100
embed_size = 10

# Model inputs: encoder token IDs, decoder token IDs and the length of every target sequence
encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
sequence_lengths = keras.layers.Input(shape=[], dtype=np.int32)

# A single embedding layer is applied to both the encoder and the decoder inputs
embeddings = keras.layers.Embedding(vocab_size, embed_size)
encoder_embeddings = embeddings(encoder_inputs)
decoder_embeddings = embeddings(decoder_inputs)

# Encoder: a 512 unit LSTM layer
#  - `return_state=True` makes it return the short-term and long-term states next to its outputs
#  - the outputs are not needed; the pair of states is the complete hidden state of the encoder
encoder = keras.layers.LSTM(512, return_state=True)
_encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_state = [state_h, state_c]

# Decoder: `BasicDecoder` from TF Addons
#  - the decoder cell is a 512 unit LSTM cell
#  - the output layer is a dense layer with one unit per word in the vocabulary
#  - the sampler tells the decoder what it should pretend the previous step's output was:
#    - `TrainingSampler` takes the embedding of the previous target token
#    - another option is `ScheduledEmbeddingTrainingSampler`, which randomly chooses between
#      the target and the actual outputs
decoder_cell = keras.layers.LSTMCell(512)
output_layer = keras.layers.Dense(vocab_size)
training_sampler = tfa.seq2seq.sampler.TrainingSampler()

decoder = tfa.seq2seq.basic_decoder.BasicDecoder(
    cell=decoder_cell,
    sampler=training_sampler,
    output_layer=output_layer,
)

# Run the decoder starting from the complete encoder state
#  - it returns (final outputs, final state, final sequence lengths); only the outputs are used
decoder_results = decoder(
    decoder_embeddings,
    initial_state=encoder_state,
    sequence_length=sequence_lengths,
)
final_outputs = decoder_results[0]

# Class (word) probabilities are the softmax of the decoder's final outputs
Y_proba = tf.nn.softmax(final_outputs.rnn_output)

# Build the Encoder-Decoder model
#  - Note: Because the task is basically a classification task, `sparse_categorical_crossentropy`
#          can be used as the loss function
model = keras.models.Model(
    inputs=[encoder_inputs, decoder_inputs, sequence_lengths],
    outputs=[Y_proba],
)
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")

# Random sequence dataset: 1000 input sequences of 10 tokens and 1000 target sequences of 15 tokens
X = np.random.randint(100, size=10*1000).reshape(1000, 10)
Y = np.random.randint(100, size=15*1000).reshape(1000, 15)

# Decoder inputs are the targets shifted one step to the right, with a leading column of zeros
start_column = np.zeros((1000, 1))
X_decoder = np.c_[start_column, Y[:, :-1]]
seq_lengths = np.full([1000], 15)

# Train the model on the random dataset
history = model.fit([X, X_decoder, seq_lengths], Y, epochs=2)

Epoch 1/2
Epoch 2/2


## Bidirectional RNNs
For forecasting future values in a time series we want to have a *causal* model - a model in which future values are predicted solely on the basis of past values. On the other hand in NLP tasks (such as Neural Machine Translation) it can be beneficial to embed a word based on both the past and future contexts.

A *Bidirectional* layer is a layer which is composed of two layers working on the same input. One layer reads the input in the original direction (left to right) and the other one is a clone, except that it reads in the reverse direction (right to left). The final output is some sort of combination of both outputs - typically a concatenation.

In [29]:
# Example RNN with a bidirectional GRU layer
#  - the `Bidirectional` wrapper clones the layer passed to it, runs the clone in the reverse
#    direction and concatenates both outputs
#  - Note: the wrapped layer therefore outputs twice as many features as its prototype has units
model = keras.models.Sequential()
model.add(keras.layers.GRU(10, return_sequences=True, input_shape=[None, 10]))
model.add(keras.layers.Bidirectional(keras.layers.GRU(10, return_sequences=True)))

# Show model's topology
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_10 (GRU)                 (None, None, 10)          660       
_________________________________________________________________
bidirectional (Bidirectional (None, None, 20)          1320      
Total params: 1,980
Trainable params: 1,980
Non-trainable params: 0
_________________________________________________________________


## Beam Search
Another improvement to predicting sequences of words is to keep track of several candidate sequences at a time instead of a single greedy one. At each step we keep a small set of the $k$ most promising predictions ($k$ is the *beam width*). In the next step we clone the model for each of them and compute a new distribution over the vocabulary for the next word. This time, however, it is a conditional probability given the previous word, so we keep the $k$ best sequence continuations based on $p(w_1 w_2) = p(w_2|w_1)*p(w_1)$ and iterate.

Application of the *Beam Search* can limit the chance of producing words which are frequent in the training but sub-optimal (wrong) for particular sentence.

```python
beam_width = 10

decoder = tfa.seq2seq.beam_search_decoder.BeamSearchDecoder(
    cell=decoder_cell,
    beam_width=beam_width,
    output_layer=output_layer,
)

# The decoder tracks `beam_width` candidates per input, so the encoder state is tiled to match.
# Note: the keyword is `end_token` (a single token ID), unlike the plural `start_tokens`.
final_outputs, _, _ = decoder(
    decoder_embeddings,
    start_tokens=start_tokens,
    end_token=end_token,
    initial_state=tfa.seq2seq.beam_search_decoder.tile_batch(encoder_state, multiplier=beam_width),
)
```

## Attention Mechanisms
The main problem of RNNs is their short-term memory (even though cells like LSTM and GRU help). For instance, in an Encoder-Decoder architecture for NMT, it still takes too many time steps for a piece of information (a word) to propagate from the encoder to the decoder. I.e. by the time the decoder tries to decode a word, it doesn't know what the encoder thought of this word - it has lost the *attention*.

The trick here is to add a shortcut - an *alignment model* (*attention model*) which takes in all the encoder outputs and combines them with decoder's hidden states to produce attention weights $\alpha_{(t,i)}$ for the decoder (weights for the t-th decoder time step from i-th encoder output). These weights tell the decoder what to focus on.

There are three attention mechanisms; the first is the original one while the latter two typically perform better and are used nowadays:
1. *Bahdanau attention (concatenative, additive)* - computes alphas by training them alongside the RNN by adding a time-distributed dense layer feeding from concatenated `[encoder outputs; decoder hidden state]`, producing scores and applying a *softmax* (not time-distributed)
1. *Luong attention (multiplicative)* - simplifies the mechanism by computing a simple dot product between encoder's outputs and decoder's hidden state (scalar product is quite a successful similarity measure) instead of the dense layer to compute the scores; it also completely replaces decoder's previous hidden state by $\tilde{\mathbf{h}}_{(t)} = \sum_i \alpha_{(t,i)} \mathbf{y}_i$.
1. *Luong attention (general)* - is somewhat of a middle ground: it does add a simple linear transformation to encoder's outputs (dense layer without biases and activation) but otherwise it's *Luong's attention*.

More formally, these mechanisms can be summarized as follows:
$$
\tilde{\mathbf{h}}_{(t)} = \sum_i \alpha_{(t,i)} \mathbf{y}_i
$$
with
$$
\alpha_{(t,i)} = \frac{\exp(e_{(t,i)})}{\sum_{i'} \exp(e_{(t,i')})}
$$
and
$$
e_{(t,i)} = \begin{cases}
                \mathbf{h}_{(t)}^T \mathbf{y}_{(i)}                                   & \quad \text{dot}\\
                \mathbf{h}_{(t)}^T \mathbf{W} \mathbf{y}_{(i)}                        & \quad \text{general}\\
                \mathbf{v}^T \tanh(\mathbf{W}[\mathbf{h}_{(t)}; \mathbf{y}_{(i)}])  & \quad \text{concat}
            \end{cases}
$$
where $\mathbf{v}$ is a rescaling parameter vector.

## Transformer Architecture
The *Transformer* takes the attention mechanism to the next level and presents a deep net architecture based solely on these modules (a bit extended) that does not contain recurrent or conv. layers yet works as an Encoder-Decoder.

As any Encoder-Decoder, it has two sides where the final output of the Encoder feeds into the hidden part of the Decoder:
* The encoder part is fairly simple: it starts with input embeddings, after which it adds *positional encoding* vectors (dense vectors that encode absolute and relative word positions in the input). Next there are *Multi Head Attention* and *Feed Forward* modules, each followed by a layer normalization and an added skip connection from the module inputs. The feed forward part is just two dense layers, the former with ReLU activations and the latter without any. Finally, this whole stack is repeated N times.
* The decoder is basically the same but starts with a *Masked Multi Head Attention* which only differs in that it masks out inputs "in the future". Outputs of the encoder are fed to the middle (hidden) attention module. The decoder stack is also repeated N times.
* The final decoder output (from the last layer of the last repetition) is passed through a simple linear layer with softmax activation.

### Positional Encoding
As mentioned before, *Positional Encoding (PE)* is a dense vector encoding the word position in the input sequence which is added to the word embeddings. $PE_{p,i}$ is the i-th component (added to the i-th component of the word embedding) of the word located at the p-th position in the sequence. The PE matrix can be learned but it's typically pre-computed as a fixed encoding:
$$
PE_{p,i} = \begin{cases}
                \sin(p / 10000^{i/d}) & \quad \text{if } i \text{ is even}\\
                \cos(p / 10000^{(i - 1)/d}) & \quad \text{if } i \text{ is odd}
            \end{cases}
$$
This fixed encoding is favoured because it has the same performance as learned and can extend to arbitrarily long sequences.

TensorFlow does not have a `PositionalEncoding` layer but it's not hard to implement.

In [30]:
class PositionalEncoding(keras.layers.Layer):
    """Layer adding a fixed sinusoidal positional encoding to its inputs.

    The encoding table is precomputed once for `max_steps` positions and
    `max_dims` components (an odd `max_dims` is rounded up to the next even
    number). Even components hold sines, odd components hold cosines of the
    same angle `p / 10000**(2 * k / max_dims)`, where `p` is the position and
    `k` the index of the (sine, cosine) pair.
    """

    def __init__(self, max_steps, max_dims, dtype=tf.float32, **kwargs):
        super().__init__(dtype=dtype, **kwargs)

        # Sines and cosines come in pairs, hence an even number of components
        max_dims += int(max_dims % 2 == 1)

        # Grids of shape (max_steps, max_dims // 2): position and pair index
        positions, pair_ids = np.meshgrid(
            np.arange(max_steps),
            np.arange(max_dims // 2),
            indexing="ij",
        )
        angles = positions / 10000**(2 * pair_ids / max_dims)

        # Interleave sines (even components) with cosines (odd components)
        table = np.empty((1, max_steps, max_dims))
        table[0, :, 0::2] = np.sin(angles)
        table[0, :, 1::2] = np.cos(angles)

        # Keep the table as a constant of the layer's data type
        self.positional_embedding = tf.constant(table.astype(self.dtype))

    def call(self, inputs):
        # Only the leading part of the table matching the last two input axes is used
        input_shape = tf.shape(inputs)
        n_steps, n_dims = input_shape[-2], input_shape[-1]
        return inputs + self.positional_embedding[:, :n_steps, :n_dims]


# Very simplified version of the Transformer
#  - Uses plain Attention modules instead of Multi Head Attention
#  - Is missing skip connections
#  - Omits layer normalization and the feed-forward (dense) modules

# Hyperparameters of the model
N = 6               # number of attention modules stacked in the encoder / decoder loops below
embed_size = 512    # size of the word embedding vectors
max_steps = 500     # number of positions the positional encoding is precomputed for
vocab_size = 10000  # size of the embedding's vocabulary and of the softmax output

# Define inputs for the two sides: encoder and decoder
#  - Both are variable-length sequences of integer word IDs
encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)

# Define first layer - word embedding
#  - The same embedding layer (shared weights) is applied to both inputs
embeddings = keras.layers.Embedding(vocab_size, embed_size)
encoder_embeddings = embeddings(encoder_inputs)
decoder_embeddings = embeddings(decoder_inputs)

# Add a Positional Encoding layer on top of embeddings
positional_encoding = PositionalEncoding(max_steps, max_dims=embed_size)
encoder_in = positional_encoding(encoder_embeddings)
decoder_in = positional_encoding(decoder_embeddings)

# Encoder stack
#  - Self-attention: the same tensor is passed twice, as `[Z, Z]`
Z = encoder_in
for _ in range(N):
    Z = keras.layers.Attention(use_scale=True)([Z, Z])

encoder_outputs = Z

# Decoder stack
#  - First attention module uses `causal=True`, i.e. masks out inputs "from the future"
#  - Encoder outputs feed the second attention module
Z = decoder_in
for _ in range(N):
    Z = keras.layers.Attention(use_scale=True, causal=True)([Z, Z])
    Z = keras.layers.Attention(use_scale=True)([Z, encoder_outputs])

# Network outputs one probability for each word in the vocabulary
#  - Hence the dense layer of `vocab_size` units with softmax activation
#  - Inputs are the outputs of the very last layer of the decoder
outputs = keras.layers.TimeDistributed(keras.layers.Dense(vocab_size, activation="softmax"))(Z)

### Multi Head Attention
The core component of a *Multi Head Attention* is a *Scaled Dot-Product* which was actually used in the example above (`use_scale=True`). The actual Multi Head Attention module is just a bunch of scaled dot-product layers, each preceded by three linear layers (time-distributed dense layers without activation; one for each of $\mathbf{V}, \mathbf{K}, \mathbf{Q}$ - presented below). Finally, all outputs of the scaled dot-product layers are concatenated and passed through a linear layer (again time-distributed).

#### Scaled Dot Product
Let's assume the encoder learns the meaning of words in a sentence - one can imagine this as a dictionary `"They played chess ..." -> {"subject": "They", "verb": "played", ...}`. The decoder then wants to do a lookup from this dictionary of, let's say, a `"verb"` - the issue is that we don't have discrete keys and values but rather vectorized representations of these.

So instead of a lookup term we have a *query vector* $\mathbf{q}$ and instead of discrete keys we also have vectors $\mathbf{k}$. The dot product $\mathbf{q}^T \mathbf{k}$ is then a similarity score of how well the query matches a key. If we pass the scores through a *softmax* (to ensure they sum up to 1) and multiply the values $\mathbf{v}$ by them, we carry the relevance over from the key match to the values - i.e. the query results. The full scaled dot-product for a matrix of queries $\mathbf{Q}$, keys $\mathbf{K}$ and values $\mathbf{V}$ is
$$
Attention(\mathbf{Q}, \mathbf{K}, \mathbf{V}) = softmax \left( \frac{\mathbf{Q}\mathbf{K}^T}{\sqrt{d_{keys}}} \right) \mathbf{V}
$$
where $\sqrt{d_{keys}}$ is there to prevent saturating the softmax (tiny gradients). The code above actually learns this scaling factor but the Transformer uses this key dimension instead.

Finally, the meaning of these matrices in the Encoder-Decoder setup is:
* All the $\mathbf{Q}$, $\mathbf{K}$, $\mathbf{V}$ in the encoder are equal to the list of words in an input sequence. So the encoder learns the relationships between all pairs of words.
* In the decoder's masked attention layer it's pretty much the same - these correspond to the words in the target sentence but masked so that words are not compared to those after them.
* Decoder's upper layers simply have $\mathbf{K}$ and $\mathbf{V}$ equal to the word encodings produced by the encoder while $\mathbf{Q}$ is the word encodings produced by the decoder itself.

#### The intuition behind Multi Head Attention
The motivation behind using multiple heads (scaled dot-products) with preceding linear layers is that a word encoding carries multiple pieces of information - about the word itself but also its position (due to PE) or e.g. past tense etc. The initial linear layers are there to make projections into these various sub-spaces, then we do the "lookup" and finally project all these searches back with the output layer.

In [31]:
K = keras.backend

class MultiHeadAttention(keras.layers.Layer):
    """Multi Head Attention evaluated with a single Keras `Attention` layer.

    The query, value and key inputs are each projected by a linear layer,
    every projection is split into `n_heads` chunks and the chunks are folded
    into the batch axis. One shared `keras.layers.Attention` layer then
    processes all heads at once; its outputs are unfolded, concatenated and
    projected by a final linear layer.

    The layer is called on a list `[query, value]` or `[query, value, key]`.
    """
    
    def __init__(self, n_heads, causal=False, use_scale=False, **kwargs):
        """Store the number of heads and the options forwarded to `keras.layers.Attention`."""
        self.n_heads = n_heads
        self.causal = causal
        self.use_scale = use_scale
        super().__init__(**kwargs)
    
    def build(self, batch_input_shape):
        """Create the sub-layers once the input shapes are known."""
        # Size of the last axis of the first input (the queries, see `call`)
        self.dims = batch_input_shape[0][-1]
        
        # These could be hyperparameters instead
        #  - Integer division: each head gets `dims // n_heads` components
        self.q_dims, self.v_dims, self.k_dims = [self.dims // self.n_heads] * 3
        
        # Build the initial Q, K and V linear layers for each head
        #  - A bias-free Conv1D with `kernel_size=1` applies the same linear map at every step
        self.q_linear = keras.layers.Conv1D(self.n_heads * self.q_dims, kernel_size=1, use_bias=False)
        self.v_linear = keras.layers.Conv1D(self.n_heads * self.v_dims, kernel_size=1, use_bias=False)
        self.k_linear = keras.layers.Conv1D(self.n_heads * self.k_dims, kernel_size=1, use_bias=False)
        
        # The attention part
        self.attention = keras.layers.Attention(causal=self.causal, use_scale=self.use_scale)
        
        # Linear output layer
        #  - Projects the concatenated heads back to `dims` components
        self.out_linear = keras.layers.Conv1D(self.dims, kernel_size=1, use_bias=False)
        
        super().build(batch_input_shape)
    
    def _multi_head_linear(self, inputs, linear):
        """Project `inputs` with `linear` and fold the heads into the batch axis.

        The rank-4 permutation below implies rank-3 inputs `(batch, steps, dims)`;
        the result has shape `(batch * n_heads, steps, projected dims / n_heads)`.
        """
        # Target shape (batch, steps, n_heads, -1): split the last axis into heads
        shape = K.concatenate([K.shape(inputs)[:-1], [self.n_heads, -1]])
        projected = K.reshape(linear(inputs), shape)
        # Move the head axis next to the batch axis: (batch, n_heads, steps, -1)
        perm = K.permute_dimensions(projected, [0, 2, 1, 3])
        # Merge batch and head axes so that every head is treated as a separate instance
        return K.reshape(perm, [shape[0] * self.n_heads, shape[1], -1])
    
    def call(self, inputs):
        """Apply multi head attention to `[query, value]` or `[query, value, key]`."""
        # Split the inputs into Q, K and V
        #  - K defaults to V when it is not given in the inputs
        q = inputs[0]
        v = inputs[1]
        k = inputs[2] if len(inputs) > 2 else v
        
        # Dynamic shape of the queries; `shape[0]` is the original batch size
        shape = K.shape(q)
        
        # Build the Q, K and V linear projections
        q_proj = self._multi_head_linear(q, self.q_linear)
        v_proj = self._multi_head_linear(v, self.v_linear)
        k_proj = self._multi_head_linear(k, self.k_linear)
        
        # Pass these projections to the attention heads
        multi_attended = self.attention([q_proj, v_proj, k_proj])
        
        # Reshape and concatenate the attention heads' outputs
        #  - Undo the folding: (batch * n_heads, steps, d) -> (batch, n_heads, steps, d)
        #  - Then swap heads and steps and merge the heads into the last axis
        shape_attended = K.shape(multi_attended)
        reshaped_attended = K.reshape(multi_attended, [shape[0], self.n_heads, shape_attended[1], shape_attended[2]])
        perm = K.permute_dimensions(reshaped_attended, [0, 2, 1, 3])
        concat = K.reshape(perm, [shape[0], shape_attended[1], -1])
        
        # Finally project the outputs back with the last linear layer
        return self.out_linear(concat)


# Generate some random queries and values
#  - Two instances; 50 query steps vs. 80 value steps, 512 components each
Q = np.random.rand(2, 50, 512)
V = np.random.rand(2, 80, 512)

# Test our Multi Head Attention module on these inputs
#  - 8 heads; the keys are not passed so they default to the values
multi_attn = MultiHeadAttention(8)
multi_attn([Q, V]).shape

TensorShape([2, 50, 512])

## Exercises

### RNN verifying an embedded Reber grammar

In [32]:
# Reset RNG state
np.random.seed(42)

# Define the finite state machine of the Reber grammar
#  - https://www.willamette.edu/~gorr/classes/cs449/reber.html
#  - Encoded as a list indexed by the state: `state -> [(symbol, next state), ...]`
#  - The last state transitions to `None` instead of a state index
default_reber_grammar = [
    [("B", 1)],
    [("T", 2), ("P", 3)],
    [("S", 2), ("X", 4)],
    [("T", 3), ("V", 5)],
    [("X", 3), ("S", 6)],
    [("P", 4), ("V", 6)],
    [("E", None)],
]


# Define the embedded Reber grammar
#  - https://www.willamette.edu/~gorr/classes/cs449/reber.html
#  - Same encoding as above, but states 2 and 3 produce the whole default grammar
#    (a nested list) instead of a single symbol
embedded_reber_grammar = [
    [("B", 1)],
    [("T", 2), ("P", 3)],
    [(default_reber_grammar, 4)],
    [(default_reber_grammar, 5)],
    [("T", 6)],
    [("P", 6)],
    [("E", None)],
]


def generate_string(grammar):
    """
    Produce one random word by walking the state machine of a Reber grammar.

    `grammar` is a list indexed by state; each entry lists the possible
    `(production, next state)` transitions. A production is either a symbol
    or a nested grammar (a list), in which case a whole word of that grammar
    is generated recursively. The walk starts in state 0 and stops after a
    transition whose next state is `None`.
    """
    pieces = []
    current = 0
    
    while True:
        # Choose one of the transitions leaving the current state at random
        options = grammar[current]
        symbol, current = options[np.random.randint(len(options))]
        
        # A nested grammar contributes a complete word of its own
        pieces.append(generate_string(grammar=symbol) if isinstance(symbol, list) else symbol)
        
        if current is None:
            break
    
    # Glue all produced symbols into the final word
    return "".join(pieces)


# Generate a few sample strings from the Reber grammar
for _ in range(25):
    print(generate_string(default_reber_grammar), end=" ")

BTXXTTVPXTVPXTTVPSE BPVPSE BTXSE BPVVE BPVVE BTSXSE BPTVPXTTTVVE BPVVE BTXSE BTXXVPSE BPTTTTTTTTVVE BTXSE BPVPSE BTXSE BPTVPSE BTXXTVPSE BPVVE BPVVE BPVVE BPTTVVE BPVVE BPVVE BTXXVVE BTXXVVE BTXXVPXVVE 

In [33]:
# Reset RNG state
np.random.seed(42)

# Generate a few sample strings from the embedded Reber grammar
for _ in range(25):
    print(generate_string(embedded_reber_grammar), end=" ")

BTBPTTTVPXTVPXTTVPSETE BPBPTVPSEPE BPBPVVEPE BPBPVPXVVEPE BPBTXXTTTTVVEPE BPBPVPSEPE BPBTXXVPSEPE BPBTSSSSSSSXSEPE BTBPVVETE BPBTXXVVEPE BPBTXXVPSEPE BTBTXXVVETE BPBPVVEPE BPBPVVEPE BPBTSXSEPE BPBPVVEPE BPBPTVPSEPE BPBTXXVVEPE BTBPTVPXVVETE BTBPVVETE BTBTSSSSSSSXXVVETE BPBTSSSXXTTTTVPSEPE BTBPTTVVETE BPBTXXTVVEPE BTBTXSETE 

In [34]:
# Reset RNG state
np.random.seed(42)

# Alphabet of all symbols that may appear in a (corrupted) word
POSSIBLE_CHARS = "BEPSTVX"

def generate_corrupted_string(grammar, chars=POSSIBLE_CHARS):
    """
    Generate a word of given grammar and break it by changing one symbol.

    A random position of a valid word is overwritten by a random symbol from
    `chars` that differs from the symbol originally at that position.
    """
    # Start from a valid word and choose the position to corrupt
    valid = generate_string(grammar)
    position = np.random.randint(len(valid))
    
    # Any other symbol of the alphabet may replace the original one
    candidates = sorted(c for c in set(chars) if c != valid[position])
    replacement = np.random.choice(candidates)
    
    # Stitch the word back together around the replaced symbol
    head, tail = valid[:position], valid[position + 1:]
    return head + replacement + tail


# Sample some corrupted words from the embedded grammar
#  - Each word differs from a valid one in exactly one position
for _ in range(25):
    print(generate_corrupted_string(embedded_reber_grammar), end=" ")

BTBPTTTPPXTVPXTTVPSETE BPBTXEEPE BPBPTVVVEPE BPBTSSSSXSETE BPTTXSEPE BTBPVPXTTTTTTEVETE BPBTXXSVEPE BSBPTTVPSETE BPBXVVEPE BEBTXSETE BPBPVPSXPE BTBPVVVETE BPBTSXSETE BPBPTTTPTTTTTVPSEPE BTBTXXTTSTVPSETE BBBTXSETE BPBTPXSEPE BPBPVPXTTTTVPXTVPXVPXTTTVVEVE BTBXXXTVPSETE BEBTSSSSSXXVPXTVVETE BTBXTTVVETE BPBTXSTPE BTBTXXTTTVPSBTE BTBTXSETX BTBTSXSSTE 

In [35]:
def str2ids(s, chars=POSSIBLE_CHARS):
    """
    Encode a string as a list of integer symbol IDs.

    The ID of a symbol is its position in `chars`; a symbol that is not
    present in `chars` raises a `ValueError` (from `str.index`).
    """
    # Look symbols up in the alphabet that was passed in, not in the global default
    return [chars.index(c) for c in s]

str2ids("BTTTXXVVETE")

[0, 4, 4, 4, 6, 6, 5, 5, 1, 4, 1]

In [37]:
# Reset RNG state
np.random.seed(42)


def generate_ids(corrupt=False):
    """Generate one word of the embedded Reber grammar, optionally corrupted, as symbol IDs"""
    if corrupt:
        word = generate_corrupted_string(embedded_reber_grammar)
    else:
        word = generate_string(embedded_reber_grammar)
    return str2ids(word)


def generate_dataset(size):
    """
    Build a labeled dataset of `size` encoded words, half valid and half corrupted.

    Returns a ragged tensor `X` with the valid words first (an odd `size`
    yields one more corrupted word) and an array `y` with one row per word
    holding 1. for a valid word and 0. for a corrupted one.
    """
    n_valid = size // 2
    n_invalid = size - n_valid
    
    # Valid words are generated first, corrupted ones follow
    valid_ids = [generate_ids(corrupt=False) for _ in range(n_valid)]
    corrupted_ids = [generate_ids(corrupt=True) for _ in range(n_invalid)]
    X = tf.ragged.constant(valid_ids + corrupted_ids, ragged_rank=1)
    
    # Labels follow the same order: ones for valid, zeros for corrupted words
    y = np.array([[1.]] * n_valid + [[0.]] * n_invalid)
    
    return X, y


# Generate the training and validation datasets containing both valid and corrupted words
X_train, y_train = generate_dataset(10000)
X_valid, y_valid = generate_dataset(2000)

# Peek at the first training instance and its label
X_train[0], y_train[0]

(<tf.Tensor: shape=(22,), dtype=int32, numpy=
 array([0, 4, 0, 2, 4, 4, 4, 5, 2, 6, 4, 5, 2, 6, 4, 4, 5, 2, 3, 1, 4, 1],
       dtype=int32)>,
 array([1.]))

In [38]:
# Reset RNG state
np.random.seed(42)
tf.random.set_seed(42)

# Model hyperparameters
embedding_size = 5
n_gru_units = 30

# Build a simple binary classifier RNN with an embedding, GRU and final dense layer
#  - The input layer accepts ragged tensors, i.e. words of different lengths
#  - The GRU returns only its last output which feeds the sigmoid unit
model = keras.models.Sequential([
    keras.layers.InputLayer(input_shape=[None], dtype=tf.int32, ragged=True),
    keras.layers.Embedding(input_dim=len(POSSIBLE_CHARS), output_dim=embedding_size),
    keras.layers.GRU(n_gru_units),
    keras.layers.Dense(1, activation="sigmoid")
])

# Compile the model
#  - `learning_rate` is the supported argument name, `lr` is only a deprecated alias
model.compile(
    loss="binary_crossentropy",
    optimizer=keras.optimizers.SGD(learning_rate=0.02, momentum=0.95, nesterov=True),
    metrics=["accuracy"],
)

# Train and validate the model
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_valid, y_valid))

Epoch 1/20




Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [40]:
# Build a few test samples
#  - The two words differ only in the second to last symbol ("T" vs. "P")
test_strings = [
    "BPBTSSSSSSSXXTTVPXVPXTTTTTVVETE",
    "BPBTSSSSSSSXXTTVPXVPXTTTTTVVEPE",
]
X_test = tf.ragged.constant([str2ids(s) for s in test_strings], ragged_rank=1)

# Make a prediction on these test samples
y_proba = model.predict(X_test)

# Show the predictions and model confidence
#  - The sigmoid output is printed as a percentage
print()
print("Estimated probability that these are Reber strings:")
for i, s in enumerate(test_strings):
    print("{}: {:.2f}%".format(s, 100 * y_proba[i][0]))


Estimated probability that these are Reber strings:
BPBTSSSSSSSXXTTVPXVPXTTTTTVVETE: 0.08%
BPBTSSSSSSSXXTTVPXVPXTTTTTVVEPE: 99.96%


### Encoder–Decoder model for date string conversion

In [41]:
from datetime import date

# Reset RNG state
np.random.seed(42)

# English month names, indexed by `month - 1`
#  - strftime()'s %B format cannot be used since it depends on the locale
MONTHS = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]


def random_dates(n_dates, min_date=date(1000, 1, 1), max_date=date(9999, 12, 31)):
    """
    Generate `n_dates` random labeled instances.

    Dates are drawn uniformly from `min_date` (inclusive) up to `max_date`
    (exclusive). Returns a pair of lists: inputs in the
    "<month name> <day>, <year>" format and targets in the ISO format.
    """
    
    # Draw random day offsets relative to the lower bound
    first_ordinal = min_date.toordinal()
    n_days = max_date.toordinal() - first_ordinal
    offsets = np.random.randint(n_days, size=n_dates)

    inputs, targets = [], []
    for offset in offsets:
        day = date.fromordinal(offset + first_ordinal)
        
        # Instance, e.g. "May 02, 2020", and its target, e.g. "2020-05-02"
        inputs.append(MONTHS[day.month - 1] + " " + day.strftime("%d, %Y"))
        targets.append(day.isoformat())
    
    return inputs, targets


# Show a few examples
n_dates = 3
x_example, y_example = random_dates(n_dates)

# Print inputs and targets side by side in two 25 characters wide columns
print("{:25s}{:25s}".format("Input", "Target"))
print("-" * 50)
for idx in range(n_dates):
    print(f"{x_example[idx]:25s}{y_example[idx]:25s}")

Input                    Target                   
--------------------------------------------------
September 20, 7075       7075-09-20               
May 15, 8579             8579-05-15               
January 11, 7103         7103-01-11               


In [43]:
# Define the input and output alphabets
#  - Input: all letters used in the month names, digits, comma and space
#  - Output: digits and the dash of the ISO date format
#  - NOTE: the digit run "01234567890" ends with a second "0", so INPUT_CHARS holds one
#    duplicate symbol; `str.index` returns the first occurrence so the second "0" is never used as an ID
INPUT_CHARS = "".join(sorted(set("".join(MONTHS)))) + "01234567890, "
OUTPUT_CHARS = "0123456789-"

INPUT_CHARS, OUTPUT_CHARS

('ADFJMNOSabceghilmnoprstuvy01234567890, ', '0123456789-')

In [44]:
def date_str_to_ids(date_str, chars=INPUT_CHARS):
    """Encode a date string as a list of character IDs (positions of its characters in `chars`)"""
    return list(map(chars.index, date_str))

date_str_to_ids(x_example[0], INPUT_CHARS)

[7, 11, 19, 22, 11, 16, 9, 11, 20, 38, 28, 26, 37, 38, 33, 26, 33, 31]

In [45]:
date_str_to_ids(y_example[0], OUTPUT_CHARS)

[7, 0, 7, 5, 10, 0, 9, 10, 2, 0]

In [46]:
# Reset RNG state
np.random.seed(42)

def prepare_date_strs(date_strs, chars=INPUT_CHARS):
    """
    Turn date strings into a dense tensor of character IDs.

    Strings are encoded with `date_str_to_ids` and all IDs are shifted by 1,
    which leaves ID=0 free to be used as the padding token when the ragged
    rows are converted to a dense tensor.
    """
    ragged_ids = tf.ragged.constant(
        [date_str_to_ids(date_str, chars) for date_str in date_strs],
        ragged_rank=1,
    )
    shifted_ids = ragged_ids + 1
    return shifted_ids.to_tensor()


def create_dataset(n_dates):
    """Generate `n_dates` random dates and encode inputs and targets with their own alphabets"""
    inputs, targets = random_dates(n_dates)
    X = prepare_date_strs(inputs, INPUT_CHARS)
    Y = prepare_date_strs(targets, OUTPUT_CHARS)
    return X, Y

# Generate training, validation and test datasets
X_train, Y_train = create_dataset(10000)
X_valid, Y_valid = create_dataset(2000)
X_test, Y_test = create_dataset(2000)

# Peek at the first encoded target
Y_train[0]

<tf.Tensor: shape=(10,), dtype=int32, numpy=array([ 8,  1,  8,  6, 11,  1, 10, 11,  3,  1], dtype=int32)>

#### Basic seq2seq model

In [47]:
# Reset RNG state
np.random.seed(42)
tf.random.set_seed(42)

# Basic constants
#  - Note: Dimensions have +1 due to the extra padding token (ID=0)
max_output_length = Y_train.shape[1]
input_dim = len(INPUT_CHARS) + 1
output_dim = len(OUTPUT_CHARS) + 1

# Model hyperparameters
embedding_size = 32

# Create an encoder
#  - The LSTM returns only its last output, i.e. one vector per input sequence
encoder = keras.models.Sequential([
    keras.layers.Embedding(input_dim=input_dim, output_dim=embedding_size, input_shape=[None]),
    keras.layers.LSTM(128),
])

# Create a decoder
#  - Outputs a distribution over the output alphabet at every time step
decoder = keras.models.Sequential([
    keras.layers.LSTM(128, return_sequences=True),
    keras.layers.Dense(output_dim, activation="softmax")
])

# Build simple Encoder-Decoder model
#  - Note: We repeat the encoder's output because it outputs a vector and the decoder expects a sequence
model = keras.models.Sequential([
    encoder,
    keras.layers.RepeatVector(max_output_length),
    decoder,
])
model.compile(loss="sparse_categorical_crossentropy", optimizer=keras.optimizers.Nadam(), metrics=["accuracy"])

# Train and validate the model
history = model.fit(X_train, Y_train, epochs=20, validation_data=(X_valid, Y_valid))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [50]:
def ids_to_date_strs(ids, chars=OUTPUT_CHARS, pad="?"):
    """
    Decode sequences of character IDs back to strings.

    ID=0 (the padding token) is rendered as `pad`, every other ID `i` as `chars[i - 1]`.
    """
    lookup = pad + chars
    decoded = []
    for seq in ids:
        decoded.append("".join([lookup[i] for i in seq]))
    return decoded

# Generate a few test examples
#  - Inputs of different lengths are zero-padded to the longest one in this batch
X_new = prepare_date_strs([
    "September 17, 2009",
    "July 14, 1789",
    "May 02, 2020",
    "July 14, 1789",
])

# Make a prediction on these examples
#  - Pick the most probable character at every output position
ids = np.argmax(model.predict(X_new), axis=-1)

# Show predictions
for date_str in ids_to_date_strs(ids):
    print(date_str)

2009-09-17
1789-07-14
2020-05-02
1789-07-14


We need to ensure that we always pass sequences of the same length as during training, using padding if necessary.

In [51]:
# Length of the (padded) input sequences the model was trained on
max_input_length = X_train.shape[1]


def prepare_date_strs_padded(date_strs):
    """Encode date strings and zero-pad them on the right up to `max_input_length`"""
    X = prepare_date_strs(date_strs)
    
    # Number of padding tokens missing to reach the training length
    n_missing = max_input_length - X.shape[1]
    if n_missing > 0:
        X = tf.pad(X, [[0, 0], [0, n_missing]])
    
    return X


def convert_date_strs(date_strs):
    """Convert date strings to the ISO format: pad and encode, predict, decode"""
    probas = model.predict(prepare_date_strs_padded(date_strs))
    
    # The most probable character at each output position forms the result
    return ids_to_date_strs(np.argmax(probas, axis=-1))


# Try problematic instances again with this new preprocessing
convert_date_strs(["May 02, 2020", "July 14, 1789"])

['2020-05-02', '1789-07-14']

#### Feeding the shifted targets to the decoder

In [52]:
# Start of sequence ID
#  - The first ID not taken by the padding token and the output characters
sos_id = len(OUTPUT_CHARS) + 1

def shifted_output_sequences(Y):
    """
    Build decoder inputs from the targets `Y`.

    Targets are shifted by one step to the right so that at each step the
    decoder is fed the previous target character; the first position is
    filled with the start-of-sequence token and the last target is dropped.
    """
    sos_column = tf.fill(dims=(len(Y), 1), value=sos_id)
    all_but_last = Y[:, :-1]
    return tf.concat([sos_column, all_but_last], axis=1)

# Create new decoder inputs by shifting all targets by 1 to the right
X_train_decoder = shifted_output_sequences(Y_train)
X_valid_decoder = shifted_output_sequences(Y_valid)
X_test_decoder = shifted_output_sequences(Y_test)

# Every row now starts with the SoS token
X_train_decoder

<tf.Tensor: shape=(10000, 10), dtype=int32, numpy=
array([[12,  8,  1, ..., 10, 11,  3],
       [12,  9,  6, ...,  6, 11,  2],
       [12,  8,  2, ...,  2, 11,  2],
       ...,
       [12, 10,  8, ...,  2, 11,  4],
       [12,  2,  2, ...,  3, 11,  3],
       [12,  8,  9, ...,  8, 11,  3]], dtype=int32)>

In [53]:
# Reset RNG state
np.random.seed(42)
tf.random.set_seed(42)

# Define basic constants
encoder_input_dim = len(INPUT_CHARS) + 1   # +1 for padding
decoder_input_dim = len(OUTPUT_CHARS) + 2  # +1 for padding +1 for sos
output_dim = len(OUTPUT_CHARS) + 1         # +1 for padding

# Hyperparameters
encoder_embedding_size = 32
decoder_embedding_size = 32
lstm_units = 128

# Create an encoder
#  - Only the final LSTM states are kept, the output itself is discarded
encoder_input = keras.layers.Input(shape=[None], dtype=tf.int32)
encoder_embedding = keras.layers.Embedding(input_dim=encoder_input_dim, output_dim=encoder_embedding_size)(encoder_input)
_, encoder_state_h, encoder_state_c = keras.layers.LSTM(lstm_units, return_state=True)(encoder_embedding)
encoder_state = [encoder_state_h, encoder_state_c]

# Create a decoder that takes two kinds of inputs:
#  1. Shifted targets pass through an embedding and then directly to the LSTM layer
#  2. Full encoder's state is passed as an initial state for the LSTM layer
decoder_input = keras.layers.Input(shape=[None], dtype=tf.int32)
decoder_embedding = keras.layers.Embedding(input_dim=decoder_input_dim, output_dim=decoder_embedding_size)(decoder_input)
decoder_lstm_output = keras.layers.LSTM(lstm_units, return_sequences=True)(decoder_embedding, initial_state=encoder_state)
decoder_output = keras.layers.Dense(output_dim, activation="softmax")(decoder_lstm_output)

# Build an improved Encoder-Decoder model
model = keras.models.Model(inputs=[encoder_input, decoder_input], outputs=[decoder_output])
model.compile(loss="sparse_categorical_crossentropy", optimizer=keras.optimizers.Nadam(), metrics=["accuracy"])

# Train and validate the model
#  - This time we pass both inputs (one for the encoder and the other for decoder)
#  - Notice: We train the model for half the epochs compared to the last one, yet the validation accuracy is the same.
history = model.fit([X_train, X_train_decoder], Y_train, epochs=10, validation_data=([X_valid, X_valid_decoder], Y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [55]:
def predict_date_strs(date_strs):
    """
    Convert date strings with the two-input model, one output character at a time.

    The decoder input starts with just the SoS token; in each step the model
    is run on everything predicted so far and the most probable character at
    the current position is appended.
    """
    
    # Encoder input and the initial decoder sequence (SoS tokens only)
    X = prepare_date_strs_padded(date_strs)
    Y_pred = tf.fill(dims=(len(X), 1), value=sos_id)
    
    for step in range(max_output_length):
        # The decoder input is zero-padded to the full output length
        n_missing = max_output_length - Y_pred.shape[1]
        decoder_input = tf.pad(Y_pred, [[0, 0], [0, n_missing]])
        
        # Only the prediction at the current position is of interest
        probas = model.predict([X, decoder_input])
        next_ids = tf.argmax(probas[:, step:step + 1], axis=-1, output_type=tf.int32)
        
        # Extend the output sequence, it is the basis of the next decoder input
        Y_pred = tf.concat([Y_pred, next_ids], axis=1)
        
    # Drop the SoS token and decode the IDs back to date strings
    return ids_to_date_strs(Y_pred[:, 1:])

# Make new predictions
predict_date_strs(["July 14, 1789", "May 01, 2020"])

['1789-07-14', '2020-05-01']

#### TF-Addons's seq2seq implementation

In [56]:
import tensorflow_addons as tfa

# Reset RNG state
np.random.seed(42)
tf.random.set_seed(42)

# Define basic constants
#  - The decoder is fed shifted *targets*, so its vocabulary is derived from OUTPUT_CHARS
encoder_input_dim = len(INPUT_CHARS) + 1   # +1 for padding
decoder_input_dim = len(OUTPUT_CHARS) + 2  # +1 for padding +1 for sos
output_dim = len(OUTPUT_CHARS) + 1         # +1 for padding

# Hyperparameters
encoder_embedding_size = 32
decoder_embedding_size = 32
units = 128

# Define inputs for both parts
#  - Note: `sequence_lengths` is not connected to the model built in this cell
encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
sequence_lengths = keras.layers.Input(shape=[], dtype=np.int32)

# Create embedding layers
#  - The decoder's embedding layer is kept in a variable so it can be reused later
encoder_embeddings = keras.layers.Embedding(encoder_input_dim, encoder_embedding_size)(encoder_inputs)
decoder_embedding_layer = keras.layers.Embedding(decoder_input_dim, decoder_embedding_size)
decoder_embeddings = decoder_embedding_layer(decoder_inputs)

# The Encoder
#  - Its final states initialize the decoder
encoder = keras.layers.LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_state = [state_h, state_c]

# Create a training sampler
sampler = tfa.seq2seq.sampler.TrainingSampler()

# Define some reusable components for the Decoder
#  - The output layer produces logits, softmax is applied separately below
decoder_cell = keras.layers.LSTMCell(units)
output_layer = keras.layers.Dense(output_dim)

# The Decoder and final output
decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell, sampler, output_layer=output_layer)
final_outputs, _, _ = decoder(decoder_embeddings, initial_state=encoder_state)
Y_proba = keras.layers.Activation("softmax")(final_outputs.rnn_output)

# Build the Encoder-Decoder model using TF Addons
model = keras.models.Model(inputs=[encoder_inputs, decoder_inputs], outputs=[Y_proba])
model.compile(loss="sparse_categorical_crossentropy", optimizer=keras.optimizers.Nadam(), metrics=["accuracy"])

# Train and validate the model
history = model.fit([X_train, X_train_decoder], Y_train, epochs=15, validation_data=([X_valid, X_valid_decoder], Y_valid))

# Test the model by making new predictions
predict_date_strs(["July 14, 1789", "May 01, 2020"])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


['1789-07-14', '2020-05-01']

Instead of manually making new predictions for each character, we can build new decoder component for the inference that does the same automatically.

In [60]:
# Make a new sampler that at each step computes the argmax of the decoder's outputs
# and feeds it back through the embedding layer to the LSTM cell
inference_sampler = tfa.seq2seq.sampler.GreedyEmbeddingSampler(embedding_fn=decoder_embedding_layer)

# Build new inference Decoder
#  - Reuses the trained `decoder_cell` and `output_layer`, only the sampler differs

# Note: `maximum_iterations` is there to prevent infinite loops
#  - if the model never outputs the end token for at least one of the sequences
inference_decoder = tfa.seq2seq.basic_decoder.BasicDecoder(
    decoder_cell,
    inference_sampler,
    output_layer=output_layer,
    maximum_iterations=max_output_length,
)

# One SoS token per instance of the batch
#  - `[:1]` keeps the batch size as a 1-element shape vector as required by `dims`
batch_size = tf.shape(encoder_inputs)[:1]
start_tokens = tf.fill(dims=batch_size, value=sos_id)

# NOTE(review): `start_tokens` is passed both positionally and by keyword;
# presumably the positional input is ignored by the greedy sampler - confirm against TFA docs
#  - `end_token=0` is the ID otherwise used for padding
final_outputs, _, _ = inference_decoder(
    start_tokens,
    initial_state=encoder_state,
    start_tokens=start_tokens,
    end_token=0,
)

# Build new model for inference
#  - Note: We don't need decoder's inputs anymore as they will be generated dynamically
#  - Note 2: We return `sample_id` instead of all the logits
inference_model = keras.models.Model(inputs=[encoder_inputs], outputs=[final_outputs.sample_id])


def fast_predict_date_strs(date_strs):
    """Convert date strings with a single call to the inference model.

    The inputs are padded via `prepare_date_strs_padded`, passed once through
    `inference_model.predict`, and the predicted IDs are turned back into
    strings by `ids_to_date_strs`.
    """
    padded_inputs = prepare_date_strs_padded(date_strs)
    predicted_ids = inference_model.predict(padded_inputs)
    decoded = ids_to_date_strs(predicted_ids)
    return decoded


# Test the inference model on two sample dates
#  - the call is the cell's last expression, so the returned list is displayed as the cell output
fast_predict_date_strs(["July 14, 1789", "May 01, 2020"])

['1789-07-14', '2020-05-01']

In [61]:
%timeit predict_date_strs(["July 14, 1789", "May 01, 2020"])

403 ms ± 8.82 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [62]:
%timeit fast_predict_date_strs(["July 14, 1789", "May 01, 2020"])

37.2 ms ± 977 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


#### TF-Addons's seq2seq with a scheduled sampler

In [64]:
# Reset RNG state
np.random.seed(42)
tf.random.set_seed(42)

# Vocabulary sizes for the embeddings and the output layer
#  - presumably `+ 1` reserves ID 0 for padding and `+ 2` additionally covers the start-of-sequence ID — TODO confirm
#  - NOTE(review): `output_dim` is derived from `INPUT_CHARS`; confirm this is the intended target vocabulary size
encoder_input_dim = len(INPUT_CHARS) + 1
decoder_input_dim = len(INPUT_CHARS) + 2
output_dim = len(INPUT_CHARS) + 1

# Hyperparameters
n_epochs = 20
encoder_embedding_size = 32
decoder_embedding_size = 32
units = 128

# Build the Encoder-Decoder model
# - Note: The only differences are in the `ScheduledEmbeddingTrainingSampler` and the addition of a sampling callback

# Inputs
#  - NOTE(review): `sequence_lengths` is not passed to the model built in this cell — confirm whether it is still needed
encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
sequence_lengths = keras.layers.Input(shape=[], dtype=np.int32)

# Embeddings
#  - the decoder's embedding layer is kept as an object because the sampler below also uses it as `embedding_fn`
encoder_embeddings = keras.layers.Embedding(encoder_input_dim, encoder_embedding_size)(encoder_inputs)
decoder_embedding_layer = keras.layers.Embedding(decoder_input_dim, decoder_embedding_size)
decoder_embeddings = decoder_embedding_layer(decoder_inputs)

# The Encoder
#  - `return_state=True` also returns the final hidden and cell states, which initialize the decoder
encoder = keras.layers.LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_state = [state_h, state_c]

# Scheduled sampler
#  - Sampler gradually replaces (with increasing probability) targets with previous predictions
#  - As the training progresses the decoder starts to get the same inputs as during inference

sampler = tfa.seq2seq.sampler.ScheduledEmbeddingTrainingSampler(
    sampling_probability=0.,
    embedding_fn=decoder_embedding_layer,
)
# Replace the probability with a `tf.Variable` so that `update_sampling_probability`
# can change it during training via `.assign()`
sampler.sampling_probability = tf.Variable(0.)


def update_sampling_probability(epoch, logs):
    """Function implementing a sampling probability schedule.

    The probability grows linearly from 0 at epoch 0 and reaches 1 after
    `n_epochs - 10` epochs, where it stays for the remaining epochs.
    The result is written to `sampler.sampling_probability` via `.assign()`.

    Args:
        epoch: Zero-based index of the epoch that is about to start.
        logs: Metrics dict supplied by the callback machinery (unused).
    """
    # Guard the ramp length: with `n_epochs <= 10` the plain expression would
    # divide by zero or produce a negative "probability"
    ramp_epochs = max(1, n_epochs - 10)

    # Clamp to the valid probability range [0, 1]
    proba = min(1.0, max(0.0, epoch / ramp_epochs))
    sampler.sampling_probability.assign(proba)

# The Decoder and output
#  - the Dense output layer has no activation, so the softmax is applied separately below

decoder_cell = keras.layers.LSTMCell(units)
output_layer = keras.layers.Dense(output_dim)

# The decoder uses the scheduled `sampler` and starts from the encoder's final state
decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell, sampler, output_layer=output_layer)
final_outputs, _, _ = decoder(decoder_embeddings, initial_state=encoder_state)
Y_proba = keras.layers.Activation("softmax")(final_outputs.rnn_output)

# Build the model
#  - integer class IDs are used as targets, hence the sparse categorical cross-entropy loss
model = keras.models.Model(inputs=[encoder_inputs, decoder_inputs], outputs=[Y_proba])
model.compile(loss="sparse_categorical_crossentropy", optimizer=keras.optimizers.Nadam(), metrics=["accuracy"])


# Train and validate the model
#  - Notice: We register the sampler's schedule update as a callback triggered at the beginning of each epoch
history = model.fit(
    [X_train, X_train_decoder],
    Y_train,
    epochs=n_epochs,
    validation_data=([X_valid, X_valid_decoder], Y_valid),
    callbacks=[keras.callbacks.LambdaCallback(on_epoch_begin=update_sampling_probability)],
)

Epoch 1/20




Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### Convincing Shakespearean text using GPT

**FIXME**: Installation of `transformers`

```python
from transformers import OpenAIGPTTokenizer, TFOpenAIGPTLMHeadModel

# Load the pretrained "openai-gpt" model and its matching tokenizer
#  - NOTE: this rebinds `model` and `tokenizer`, names already used earlier in this notebook
model = TFOpenAIGPTLMHeadModel.from_pretrained("openai-gpt")
tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")

prompt_text = "This royal throne of kings, this sceptred isle"

# Tokenize and encode the prompt text
#  - `return_tensors="tf"` requests TensorFlow tensors; `encoded_prompt[0]` below is the prompt's token sequence
encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="tf")


num_sequences = 5
length = 40

# Generate 5 new sequences
#  - Each one starts with the prompt text
#  - After which 40 additional tokens are generated
#  - `max_length` counts the prompt tokens too, hence the prompt length is added
#  - presumably `top_k=0` disables top-k filtering so only nucleus (`top_p`) sampling applies — confirm in the transformers docs
generated_sequences = model.generate(
    input_ids=encoded_prompt,
    do_sample=True,
    max_length=length + len(encoded_prompt[0]),
    temperature=1.0,
    top_k=0,
    top_p=0.9,
    repetition_penalty=1.0,
    num_return_sequences=num_sequences,
)

# Decode and show generated sequences
#  - sequences are separated by an 80-character dashed line
for sequence in generated_sequences:
    text = tokenizer.decode(sequence, clean_up_tokenization_spaces=True)
    print(text)
    print("-" * 80)
```