# Text Generation Using Recurrent Neural Networks

This week we are going to learn about Recurrent Neural Networks (RNNs). RNNs are very popular in the field of natural language processing, because they are remarkably good at modelling language. This is because, unlike the other models we have looked at so far, RNNs take account of the *order* in which things appear, and word order is obviously a very important feature of language. Because of their ability to take account of the *order* of observations, RNNs are a type of 'sequence model.'

I explained how these models work in the workshop. The [slides](slides/text-generation.pdf) are available in this repo.

To see how RNNs work, and to learn a bit about what they are capable of, today we will train the computer to generate poetry based on a corpus of 3 million lines of poetry from Project Gutenberg (to keep training manageable, we will use only the first 500,000 lines).

**NB:** If you are using this notebook in Google Colab, please make sure to click the 'runtime' button in the top right, and choose 'GPU'.

## Import the poetry data

In [None]:
# Download the corpus
# !curl -O "http://static.decontextualize.com/gutenberg-poetry-v001.ndjson.gz"

# Extract the first five hundred thousand lines of poetry from the corpus
import gzip
import json
lines = []
n = 0
# Open the archive in a `with` block so the file handle is closed once we
# stop reading (including when we break out of the loop early).
with gzip.open("gutenberg-poetry-v001.ndjson.gz") as corpus:
    for line in corpus:
        n += 1
        if n > 500000:
            break
        # Each row is one JSON object; the text of the line is stored under "s"
        json_line = json.loads(line.strip())
        lines.append(json_line["s"])

# Take a look
lines[50:60]

## Preprocess the Data

In [None]:
import tensorflow as tf
import numpy as np

# Build a word-level tokenizer; the characters listed in `filters` are the
# punctuation and whitespace symbols handed to Keras to strip from the text.
tkzr = tf.keras.preprocessing.text.Tokenizer(
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
)
# Learn the vocabulary from the poetry lines ...
tkzr.fit_on_texts(lines)
# ... then encode every line as a list of integer word indices.
sequences = tkzr.texts_to_sequences(lines)

In [None]:
# Terminate every line of poetry with a 'newline' token so that line breaks
# become part of what the model learns.
nl_idx = tkzr.word_index.get("\n")
if nl_idx is None:
  # "\n" is not in the vocabulary yet: register it under the next free index,
  # in both directions (word -> index and index -> word).
  nl_idx = max(tkzr.word_index.values()) + 1
  tkzr.word_index["\n"] = nl_idx
  tkzr.index_word[nl_idx] = "\n"
for encoded_line in sequences:
  encoded_line.append(nl_idx)

In [None]:
# This function reshapes the data into sequences of a fixed length
def join_split_sequences(sequences, n=30, pad=0):
  """Joins sequences and splits them into sequences of width n, with optional
  padding of the final split

  The input sequences are treated as one continuous stream of tokens, which
  is then cut into consecutive chunks of exactly n tokens.

  Parameters:
  ===========
  sequences (iterable): an iterable of sequences
  n (int): how long the fixed-length sequences should be
  pad (int or None): value used to fill the final, incomplete split up to
    length n. Pass None to drop the incomplete split instead.

  Returns:
  ========
  list: a list of lists, each containing exactly n tokens

  Raises:
  =======
  ValueError: if n is smaller than 1
  """
  if n < 1:
    raise ValueError("n must be a positive integer")

  out = []
  next_split = []
  for seq in sequences:
    for token in seq:
      next_split.append(token)
      if len(next_split) == n:
        out.append(next_split)
        next_split = []

  # Whatever is left over is shorter than n. Compare against None rather than
  # relying on truthiness, so that the default pad value of 0 still pads.
  if next_split and pad is not None:
    next_split.extend([pad] * (n - len(next_split)))
    out.append(next_split)

  return out

In [None]:
# Re-chunk the encoded lines into fixed-length sequences of 17 tokens each.
# split_input_target later drops one token from each copy, so the model sees
# 16-token inputs paired with 16-token targets.
sequences = join_split_sequences(sequences, n=17, pad=0)

In [None]:
# Sanity check: decode the first fixed-length sequence back into words
foo = tkzr.sequences_to_texts([sequences[0]])
print(foo[0])

In [None]:
# Wrap the list of fixed-length sequences in a tf.data pipeline
dataset = tf.data.Dataset.from_tensor_slices(sequences)

In [None]:
# From https://www.tensorflow.org/text/tutorials/text_generation
def split_input_target(sequence):
    """Build an (input, target) training pair from one sequence.

    The input is the sequence without its last element and the target is the
    sequence without its first element, so the target at each position is
    the element that follows the corresponding input element.
    """
    return sequence[:-1], sequence[1:]

In [None]:
# Turn every sequence into an (input, target) pair
dataset = dataset.map(split_input_target)

In [None]:
# Number of sequences per training batch
BATCH_SIZE = 256

# Size of the shuffle buffer.
# (tf.data is built to cope with possibly infinite streams, so it never
# tries to shuffle the whole dataset in memory; it keeps a buffer of this
# many elements and shuffles within that.)
BUFFER_SIZE = 10000

# Shuffle, then group into fixed-size batches (discarding a final batch that
# is not full), then prefetch so upcoming batches are prepared in advance.
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

dataset

## Define the RNN model


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [None]:
class PoetryGenerator(tf.keras.Model):
  """Word-level language model: Embedding -> LSTM -> Dense.

  For every position in the input sequence the model outputs one raw score
  (logit) per vocabulary entry; the final Dense layer applies no activation.

  Parameters:
  ===========
  vocab_size (int): number of entries in the vocabulary (output width)
  embedding_dim (int): size of the word embedding vectors
  rnn_units (int): number of units in the LSTM layer
  """
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    # No positional arguments: passing `self` again is rejected by newer Keras
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.lstm = tf.keras.layers.LSTM(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of token-ID sequences.

    Parameters:
    ===========
    inputs: batch of token IDs
    states (list or None): [memory_state, carry_state] to resume from, or
      None to start from the LSTM's default (zero) initial state
    return_state (bool): if True, also return the LSTM's final states
    training (bool): forwarded to every layer

    Returns:
    ========
    The logits, or (logits, [memory_state, carry_state]) if return_state.
    """
    x = self.embedding(inputs, training=training)
    # initial_state=None lets the LSTM create its own zero-filled state, so
    # there is no need to build one by hand when `states` is None.
    x, memory_state, carry_state = self.lstm(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, [memory_state, carry_state]
    return x

## Initialise and train the model

In [None]:
# Size of the vocabulary in words (the tokenizer works at word level).
# The +1 adds one slot beyond the number of known words -- presumably because
# the tokenizer numbers words from 1 and leaves index 0 unused; confirm.
vocab_size = len(tkzr.word_index) + 1

# The embedding dimension
embedding_dim = 32

# Number of RNN units
rnn_units = 256

# Build the (still untrained) model with these hyperparameters
model = PoetryGenerator(vocab_size, embedding_dim, rnn_units)

In [None]:
# The model's final Dense layer has no softmax, so it outputs raw logits.
# The loss has to be told this with from_logits=True; the plain
# sparse_categorical_crossentropy function assumes probabilities by default.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))

In [None]:
# Train for two passes (epochs) over the batched dataset
model.fit(dataset, epochs=2)

## Generate Some Poetry

In [None]:
class InferenceModel(tf.keras.Model):
  """Samples the next word from a trained model, one step at a time.

  Parameters:
  ===========
  model: the trained model; it is called as
    model(inputs=..., states=..., return_state=True)
  tokenizer: object whose `word_index` dict maps words to token IDs
  temperature (float): the logits are divided by this value before
    sampling, so values below 1 sharpen the distribution and values above 1
    flatten it
  """
  def __init__(self, model, tokenizer, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.tokenizer = tokenizer

  # Deliberately not wrapped in tf.function: the word -> ID lookup below is
  # plain Python on Python strings, so a traced version would be re-traced
  # for every new input word.
  def generate_one_step(self, inputs, states=None):
    """Sample one next-token ID given the preceding words.

    Parameters:
    ===========
    inputs (list of str, or str): the word(s) to feed to the model; every
      word must be a key of the tokenizer's `word_index`
    states: model state returned by a previous call, or None to start fresh

    Returns:
    ========
    (predicted_ids, states): the sampled token ID(s) and the model state to
    pass to the next call.

    Raises:
    =======
    KeyError: if a word is not in the tokenizer's `word_index`
    """
    # A bare string would otherwise be iterated character by character
    if isinstance(inputs, str):
      inputs = [inputs]

    # Convert words to token IDs and add a leading batch dimension.
    input_ids = [self.tokenizer.word_index[word] for word in inputs]
    input_tensor = tf.expand_dims(input_ids, axis=0)

    # Run the model.
    # predicted_logits.shape is [batch, word, next_word_logits]
    predicted_logits, states = self.model(inputs=input_tensor, states=states, return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits / self.temperature

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Return the token IDs and model state.
    return predicted_ids, states

In [None]:
# Wrap the trained model for sampling (temperature left at its default, 1.0)
inference_model = InferenceModel(model, tokenizer=tkzr)

In [None]:
# Start without any state: the model only initialises its own LSTM state
# when it receives `None` (a tuple of Nones would be passed straight through).
states = None

# Seed words for the poem; the model continues from here
next_word = ['o', 'for', 'a', 'muse', 'of', 'fire']
result = next_word.copy()

for _ in range(50):
    predicted_ids, states = inference_model.generate_one_step(next_word, states)
    word = tkzr.index_word[predicted_ids.numpy()[0]]
    result.append(word)
    # generate_one_step iterates over its input, so feed the new word back as
    # a one-element list rather than as a bare string. The earlier words are
    # carried forward through `states`.
    next_word = [word]

result = ' '.join(result)

print(result)