# Natural Language Processing with RNNs and Attention

In [1]:
# FIXME: meke autocompletion working again
%config Completer.use_jedi = False

import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

physical_devices = tf.config.list_physical_devices('GPU')

if not physical_devices:
    print("No GPU was detected.")
else:
    # https://stackoverflow.com/a/60699372
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
    
from tensorflow import keras

No GPU was detected.


## Char-RNN
Let's build an RNN that processes sequences of text and predicts a single character: the next one.

### Loading the Data and Preparing the Dataset
The following example uses the famous texts of Shakespeare.

In [2]:
# Set RNG state
np.random.seed(42)
tf.random.set_seed(42)

# Download the dataset
#  - the value returned by `get_file` is used below as the path of the local file
filepath = keras.utils.get_file(
    "shakespeare.txt",
    "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
)

# Load raw dataset
#  - the encoding is given explicitly: without it `open()` falls back to a
#    platform-dependent default, so the same file could decode differently elsewhere
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

# Show a piece of the text
print(shakespeare_text[:148])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?



In [3]:
# Setup a character-based text tokenizer
#  - `char_level=True`: every character (rather than every word) is treated as a token
#  - NOTE: the text is passed as one plain string, not wrapped in a list; `document_count`
#    is read in a later cell as the total number of characters, which presumably relies
#    on this call form -- keep the two in sync
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(shakespeare_text)

In [4]:
# Convert a text to a sequence of character IDs
#  - the input is a list of texts; the result holds one list of IDs per text
tokenizer.texts_to_sequences(["First"])

[[20, 6, 9, 8, 3]]

In [5]:
# Convert a sequence of character IDs back to text
#  - these are the IDs obtained for "First"; the decoded characters come back
#    lower-cased and separated by spaces
tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]])

['f i r s t']

In [6]:
# Fix the RNG state before the (shuffled) input pipeline is created
np.random.seed(42)
tf.random.set_seed(42)

# Vocabulary size, i.e. the number of distinct characters
max_id = len(tokenizer.word_index)

# Total number of characters in the text
dataset_size = tokenizer.document_count

# Encode the whole text as character IDs
#  - tokenizer IDs start at 1, so 1 is subtracted to make them start from 0
encoded = np.array(tokenizer.texts_to_sequences([shakespeare_text]))[0] - 1

# The training set is the first 90% of the text
train_size = dataset_size * 90 // 100

# Preprocessing parameters
#  - `n_steps`: length of one training instance (a sequence of characters)
#  - `batch_size`: number of instances in one training micro-batch
#  - `window_length`: one extra character, because the target is the input shifted 1 character ahead
n_steps = 100
batch_size = 32
window_length = n_steps + 1

# The whole input pipeline as one chain; each step feeds the next one:
#  1. slice the encoded training text into single character IDs and repeat it
#  2. slide a window over the text, moving by one character each time (`shift=1`);
#     `drop_remainder=True` discards windows shorter than `window_length`
#  3. `window()` yields nested sub-datasets, so each one is batched to the length it
#     already has, which flattens the result into a dataset of plain tensors
#  4. shuffle whole windows (so the text inside a window stays intact) and group them
#     into micro-batches; shuffling gives SGD some degree of i.i.d. samples
#  5. split every window into (inputs, targets), the targets being the next characters
#  6. 1-hot encode the input characters -- affordable since there are only few distinct ones
#  7. prefetch one batch so data preparation overlaps with training
dataset = (
    tf.data.Dataset.from_tensor_slices(encoded[:train_size])
    .repeat()
    .window(window_length, shift=1, drop_remainder=True)
    .flat_map(lambda chars: chars.batch(window_length))
    .shuffle(10000)
    .batch(batch_size)
    .map(lambda chunk: (chunk[:, :-1], chunk[:, 1:]))
    .map(lambda inputs, targets: (tf.one_hot(inputs, depth=max_id), targets))
    .prefetch(1)
)

# Show the tensor shapes of the first batch
for X_batch, Y_batch in dataset.take(1):
    print(X_batch.shape, Y_batch.shape)

(32, 100, 39) (32, 100)


### Creating and Training the Model

In [7]:
# Build a simple Char-RNN model, one layer at a time
model = keras.models.Sequential()

# 1st GRU layer: 128 units, 20% dropout on its inputs (`dropout`) and 20% on its
# recurrent state (`recurrent_dropout`); it accepts sequences of any length
model.add(
    keras.layers.GRU(
        128,
        return_sequences=True,
        input_shape=[None, max_id],
        dropout=0.2,
        recurrent_dropout=0.2,
    )
)

# 2nd GRU layer: same size and the same dropout rates
model.add(
    keras.layers.GRU(
        128,
        return_sequences=True,
        dropout=0.2,
        recurrent_dropout=0.2,
    )
)

# Output layer: a time-distributed dense layer with one unit per distinct character
# and softmax activation, i.e. class probabilities for every time step
model.add(
    keras.layers.TimeDistributed(
        keras.layers.Dense(max_id, activation="softmax")
    )
)

model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

# Train the model for 10 epochs (no validation data is passed)
#  - Note: training on everything would take forever on my PC, so only a few batches are used
#  - the full run would be:
#    history = model.fit(dataset, steps_per_epoch=train_size // batch_size, epochs=10)
history = model.fit(dataset.take(40), epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Using the Model to Generate Text

In [8]:
# Note: this example doesn't present the model very well since it hasn't been trained on the full dataset (see the previous cell)

def preprocess(texts):
    """
    Turn a list of texts into the 1-hot encoded input expected by the Char-RNN.
     1. every text is converted to character IDs by the fitted tokenizer
     2. the IDs are shifted by -1 so that they start from 0
     3. the 0-based IDs are 1-hot encoded with depth `max_id`
    """
    char_ids = tokenizer.texts_to_sequences(texts)
    zero_based_ids = np.array(char_ids) - 1
    return tf.one_hot(zero_based_ids, max_id)

# Predict with the model and keep, for every time step, the ID of the most probable character
X_new = preprocess(["How are yo"])
Y_pred = model.predict(X_new).argmax(axis=-1)

# Decode the prediction (IDs shifted back by +1) and show the last character of the 1st text
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1]

'u'

Next, let's generate not just a single letter but a whole new text. One approach is to repeatedly call the above. However, this often leads to repeating the same letters over and over again. A better approach is to select the next letter randomly, based on the learned class probabilities.

In [10]:
def next_char(text, temperature=1):
    """
    Sample the single character that follows given text.
     1. we pre-process and predict as before but keep all character probabilities of the last time step
     2. then we compute the log of the probabilities and divide it by the `temperature` parameter
        - temperature < 1 sharpens the distribution (favours the most probable letters)
        - temperature = 1 samples from the predicted probabilities as they are
        - temperature > 1 flattens the distribution (the output gets more random)
     3. finally we select a single character randomly given these rescaled log-probs. and convert the character ID back to text

    Raises `ValueError` if `temperature` is not positive: zero would divide by zero
    and a negative value would favour the least probable letters.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive, got {!r}".format(temperature))

    X_new = preprocess([text])
    # `-1:` (a slice, not an index) keeps a leading axis of size 1 for `tf.random.categorical`
    y_proba = model.predict(X_new)[0, -1:, :]
    rescaled_logits = tf.math.log(y_proba) / temperature
    # +1 shifts the sampled 0-based ID back to the tokenizer's 1-based IDs
    char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1
    return tokenizer.sequences_to_texts(char_id.numpy())[0]


def complete_text(text, n_chars=50, temperature=1):
    """
    Return `text` extended with `n_chars` newly generated letters.
    Each new letter is produced by `next_char` from everything generated so far.
    """
    completed = text
    for _step in range(n_chars):
        completed = completed + next_char(completed, temperature)
    return completed


# Reset RNG state
#  - `next_char` samples randomly; fixing the seed is intended to make reruns repeatable
tf.random.set_seed(42)

# Complete some text using different temperatures
#  - a low temperature (0.2) concentrates the sampling on the most probable characters
print(complete_text("t", temperature=0.2))

t the bell and in the belly the belly and the the b


In [11]:
# Temperature 1 leaves the predicted probabilities unchanged before sampling
print(complete_text("t", temperature=1))

tucio' thenf'th,
affed, you the beagu, as le gileve


In [12]:
# A temperature above 1 flattens the distribution, so unlikely characters get sampled more often
print(complete_text("t", temperature=2))

ty no c't;
meracqniogtt cino! aekfll ar:hwigh: n: b
