In [1]:
#@title Imports
import tensorflow as tf
import tensorflow.keras as ks
from tensorflow.keras.layers.experimental import preprocessing as prep
import numpy as np
import os
import time
import re

In [2]:
#@title Settings
# --- Data pipeline ---
# Characters per training example. Each raw chunk is seq_length + 1 ids long
# before it is split into an input and a one-step-shifted target.
seq_length = 64
# Number of (input, target) sequences in one training batch.
BATCH_SIZE = 64
# Size of the shuffle buffer, counted in sequences (not characters).
BUFFER_SIZE = 800

# --- Model ---
# Length of the vector each character id is embedded into.
embedding_dim = 256
# Number of units in the recurrent (LSTM) layer.
rnn_units = 2048

# --- Training ---
# Learning rate handed to the Adam optimizer.
learning_rate = 1e-4
# Where the ModelCheckpoint callback writes weights during training.
checkpoint_path = '/content/checkpoints/'
# Weights to load before training; leave empty to start from scratch.
load_weights = '/content/88Ephochs.h5'

# --- Generation ---
# How creative vs. deterministic sampling should be (logits are divided by it).
temperature = 0.8
# Number of characters to generate after the prompt.
output_chars = 1000

## I. Parse Text Sources
First we'll load our text sources and create our vocabulary lists and encoders. 

In [3]:
#@title Load file data
# path_to_file = ks.utils.get_file('austen.txt', 'https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/austen/austen.txt')
path_to_file = '/content/the-collected-stories-of-arthur-c-clarke.txt'
# path_to_file = '~/Downloads/the-collected-stories-of-arthur-c-clarke.txt'
# Read the raw bytes and decode them explicitly as UTF-8. The context manager
# closes the file handle as soon as the read has finished.
with open(path_to_file, 'rb') as source_file:
    text = source_file.read().decode(encoding='utf-8')
print(f'Length of text: {len(text)} characters')
# Verify the first part of our data
print(text[:200])

Length of text: 2577789 characters
Travel by Wire!
	You people can have no idea of the troubles and trials we had to endure before we perfected the radio-transporter, not that it's quite perfect even yet. The greatest difficulty, as it


In [4]:
# The vocabulary of our network is every distinct character that appears in
# the file, in sorted order.
vocab = sorted(set(text))
# Add some commonly used combinations
# vocab += ['and', 'the', 'th', 'ch', 'to', 'me', 'on', 'ing', 'you', 'rt', 'tr', 'my', 'in', 'is', 'or', 'for', 'it', 'be', 're', 'at', 'so', 'but', 'not', 'kn', 'ly']
print(f'{len(vocab)} unique tokens:')
print(vocab)

84 unique tokens:
['\t', '\n', ' ', '!', '"', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '£']


In [5]:
# Next, we'll build the two lookup layers that translate between characters
# and the integer ids our neural network consumes. The reverse layer is built
# from the forward layer's own vocabulary so the two mappings always agree.
ids_from_chars = prep.StringLookup(vocabulary=vocab)
chars_from_ids = prep.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(),
    invert=True)
# chars2ids = ks.layers.TextVectorization(vocabulary=vocab)
# ids2chars = ks.layers.StringLookup(vocabulary=vocab, invert=True)

# A small helper for turning a sequence of ids back into readable text:
def text_from_ids(ids):
  """Decode a sequence of token ids into a Python string.

  The ids are mapped to characters with `chars_from_ids`, joined along the
  last axis, and the resulting bytes are decoded as UTF-8.
  """
  chars = chars_from_ids(ids)
  joined = tf.strings.reduce_join(chars, axis=-1)
  return joined.numpy().decode("utf-8")

In [6]:
# Now we'll verify that they work, by getting the ids for the characters of
# "Truth", and then looking those up in reverse
testids = ids_from_chars(list('Truth'))
testids

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([49, 75, 78, 77, 65])>

In [7]:
# Reverse lookup: the ids should map back to the original characters
chars_from_ids(testids)

<tf.Tensor: shape=(5,), dtype=string, numpy=array([b'T', b'r', b'u', b't', b'h'], dtype=object)>

In [8]:
# Same round trip, but joined back into a single Python string
testString = text_from_ids( testids )
testString

'Truth'

## II. Construct our training data
Next we need to construct our training data by building sentence chunks. Each chunk will consist of a sequence of characters and a corresponding "next sequence" of the same length showing what would happen if we move forward in the text. This "next sequence" becomes our target variable.

For example, if this were our text:

> It is a truth universally acknowledged, that a single man in possession
of a good fortune, must be in want of a wife.

And our sequence length was 10 with a step size of 1, our first chunk would be:

* Sequence: `It is a tr`
* Next Sequence: `t is a tru`

Our second chunk would be:

* Sequence: `t is a tru`
* Next Sequence: ` is a trut`



In [9]:
# Define how we want to split up the text. `tokenize` is just an alias for
# tf.strings.unicode_split; it is called below with 'UTF-8' so that each
# character of the text becomes one token.
tokenize = tf.strings.unicode_split

In [10]:
# First, create a stream of encoded integers from our text: split the text
# into characters, then look up the id of each one
all_ids = ids_from_chars(tokenize(text, 'UTF-8'))
print(all_ids)
# Now, convert that into a tensorflow dataset built from the individual ids
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

tf.Tensor([49 75 58 ... 71 15  2], shape=(2577789,), dtype=int64)


In [11]:
# Next, let's batch the ids up into chunks for our training. Each chunk holds
# seq_length+1 ids because split_input_target (below) drops the last id to
# form the input and the first id to form the target, leaving seq_length each.
# drop_remainder=True discards a final chunk that would be shorter than that.
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

# This function will generate our sequence pairs:
def split_input_target(sequence):
    """Split one chunk into an (input, target) pair offset by one step.

    The input is every element except the last and the target is every
    element except the first, so target[i] is the element following input[i].
    """
    return sequence[:-1], sequence[1:]

# Call the function for every chunk in `sequences` to create a new dataset of
# (input, target) pairs
dataset = sequences.map(split_input_target)

In [12]:
# Verify our sequences: decode the first (input, target) pair back to text.
# The target should read as the input shifted one character to the left.
for input_example, target_example in  dataset.take(1):
    print("Input: ", text_from_ids(input_example))
    print("--------")
    print("Target: ", text_from_ids(target_example))

Input:  Travel by Wire!
	You people can have no idea of the troubles and
--------
Target:  ravel by Wire!
	You people can have no idea of the troubles and 


In [13]:
# Last step of the input pipeline. Shuffle the sequences (through a buffer of
# BUFFER_SIZE sequences) so that we don't just memorize the books in the order
# they were written, group them into fixed-size training batches, and let
# tensorflow decide how many batches to prefetch in the background. Because
# this is a streaming dataset, the data is handed to the network bit by bit
# rather than being kept in memory all at once.
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(64, 64), dtype=tf.int64, name=None), TensorSpec(shape=(64, 64), dtype=tf.int64, name=None))>

## III. Build the model

Next, we'll build our model. Up until this point, you've been using the Keras Sequential API for creating your models. Doing something like:

    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(80, activation='relu'))
    etc...

However, tensorflow has another way to build models: subclassing `tf.keras.Model`, which gives us a lot more control over what happens inside the model. You can read more about [the differences and when to use each here](https://blog.tensorflow.org/2019/01/what-are-symbolic-and-imperative-apis.html).

We'll use model subclassing for our RNN in this example. This will involve defining our model as a custom subclass of `tf.keras.Model`.

If you're not familiar with classes in python, you might want to review [this quick tutorial](https://www.w3schools.com/python/python_classes.asp), as well as [this one on class inheritance](https://www.w3schools.com/python/python_inheritance.asp).

Using a custom model class is important for our situation because we're not just training it to predict a single character for a single sequence, but as we make predictions with it, we need it to remember those predictions and use that memory as it makes new predictions.


In [14]:
#@title
# Create our custom model. Given a sequence of characters, this model's job is to predict what character should come next.
class TextModel(ks.Model):
  """Character-level next-token model: Embedding -> LSTM -> Dense -> Dense.

  Args:
    vocab_size: Number of distinct token ids. Sets the embedding's input
      size and the width of the final (logits) layer.
    embedding_dim: Length of the vector each token id is embedded into.
    rnn_units: Number of units in the LSTM layer.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    # Call the base constructor with no arguments: zero-argument super()
    # already binds the instance, and the Keras base class does not expect
    # `self` as an extra positional argument.
    super().__init__()
    # 1. An embedding layer that handles the encoding of our vocabulary into a vector of values suitable for a neural network
    self.embedding = ks.layers.Embedding(vocab_size, embedding_dim)

    # 2. The recurrent layer. return_sequences=True yields an output for every
    # position in the sequence; return_state=True additionally returns the
    # final hidden and cell states so they can be fed back in later.
    # GRU vs. LSTM background: https://datascience.stackexchange.com/questions/14581/when-to-use-gru-over-lstm
    # self.gru = ks.layers.GRU(rnn_units, return_sequences=True, return_state=True)
    self.lstm = ks.layers.LSTM(rnn_units, return_sequences=True, return_state=True)

    # 3. A hidden layer
    # NOTE(review): no activation is passed here, so this layer applies no
    # non-linearity before the output layer -- confirm that is intended.
    self.hidden = ks.layers.Dense(vocab_size*2)
    # 4. Our output layer that will give us one score (logit) for each character in our vocabulary.
    self.out = ks.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run one forward pass, feeding each layer's output into the next.

    This runs every time the model is called on a batch, not once per epoch.

    Args:
      inputs: Batch of token-id sequences.
      states: Optional [hidden_state, cell_state] to start the LSTM from.
        When None, the LSTM layer's own initial state is used.
      return_state: When True, also return the LSTM's final states.
      training: Forwarded to every layer so each knows whether we are
        training or predicting.

    Returns:
      The output layer's scores for every input position and, when
      return_state is True, the list [state_h, state_c] as a second value.
    """
    # 1. Feed the inputs into the embedding layer, and tell it if we are training or predicting
    x = self.embedding(inputs, training=training)

    # 2. If we weren't handed any state, ask the LSTM layer for its initial state.
    if states is None:
      states = self.lstm.get_initial_state(x)

    # 3. Now, feed the vectorized input along with the current state of memory into the lstm layer.
    # x, states = self.gru(x, initial_state=states, training=training)
    x, state_h, state_c = self.lstm(x, initial_state=states, training=training)
    states = [state_h, state_c]

    # 4. Finally, pass the results on to the dense layers
    x = self.hidden(x,training=training)
    x = self.out(x, training=training)

    # 5. Return the results
    if return_state:
      return x, states
    else:
      return x

In [15]:
# Create an instance of our model. The size is taken from the lookup layer
# rather than from `vocab`, because the layer's vocabulary has one more entry
# than the character list (85 entries here versus 84 unique characters).
vocab_size=len(ids_from_chars.get_vocabulary())
model = TextModel(vocab_size, embedding_dim, rnn_units)

In [16]:
# Verify the output of our model is correct by running one sample batch through it.
# Calling the model on real data also builds its layers (the actual
# model.compile() call happens in a later cell). This step will take a bit.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")


(64, 64, 85) # (batch_size, sequence_length, vocab_size)


In [17]:
# Now let's view the model summary (the layers and their parameter counts)
model.summary()

Model: "text_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  21760     
                                                                 
 lstm (LSTM)                 multiple                  18882560  
                                                                 
 dense (Dense)               multiple                  348330    
                                                                 
 dense_1 (Dense)             multiple                  14535     
                                                                 
Total params: 19,267,185
Trainable params: 19,267,185
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Configure training: Adam with the learning rate from the settings cell, and
# sparse categorical cross-entropy computed directly on the model's logits.
model.compile(
    loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=ks.optimizers.Adam(learning_rate=learning_rate),
)

# Optionally start from previously saved weights; leave load_weights empty to
# train from scratch.
if load_weights:
    model.load_weights(load_weights)

## IV. Train the model

For our purposes, we'll be using [categorical cross entropy](https://machinelearningmastery.com/cross-entropy-for-machine-learning/) as our loss function*. Also, our model will be outputting ["logits" rather than normalized probabilities](https://stackoverflow.com/questions/41455101/what-is-the-meaning-of-the-word-logits-in-tensorflow), because we'll be doing further transformations on the output later. 


\* Note that since our model deals with integer encoding rather than one-hot encoding, we'll specifically be using [sparse categorical cross entropy](https://stats.stackexchange.com/questions/326065/cross-entropy-vs-sparse-cross-entropy-when-to-use-one-over-the-other).

In [19]:
# earlyStop = ks.callbacks.EarlyStopping(monitor='loss', restore_best_weights=True)
# Save only the weights to checkpoint_path, and only when the monitored
# training loss is lower than the best value seen so far.
checkpointCallback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    monitor='loss',
    mode='min',
    save_best_only=True)

# steps_per_epoch caps each epoch at 100 batches, so one epoch covers only
# part of the data (the dataset holds roughly 600 batches).
history = model.fit(dataset, epochs=1, callbacks=[checkpointCallback], steps_per_epoch=100)



In [20]:
# model.save_weights('/content/80Ephochs.h5')

In [21]:
# model.load_weights(checkpoint_path)

## V. Use the model

Now that our model has been trained, we can use it to generate text. As mentioned earlier, to do so we have to keep track of its internal state, or memory, so that we can use previous text predictions to inform later ones.

However, with RNN generated text, if we always just pick the character with the highest probability, our model tends to get stuck in a loop. So instead we will create a probability distribution of characters for each step, and then sample from that distribution. We can add some variation to this using a parameter known as ["temperature"](https://cs.stackexchange.com/questions/79241/what-is-temperature-in-lstm-and-neural-networks-generally).

In [22]:
# Here's the code we'll use to sample for us. It has some extra steps to apply
# the temperature to the distribution, and to make sure we don't get empty
# characters in our text. Most importantly, it will keep track of our model
# state for us.

class OneStep(ks.Model):
  """Wraps a trained model so text can be sampled one character at a time.

  Args:
    model: The trained model. It is called with `inputs`, `states` and
      `return_state=True` and must return (logits, states).
    chars_from_ids: Lookup layer mapping token ids back to characters.
    ids_from_chars: Lookup layer mapping characters to token ids.
    temperature: Value the logits are divided by before sampling.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=temperature):
    super().__init__()
    self.temperature=temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "" or "[UNK]" from being generated.
    skip_ids = self.ids_from_chars(['','[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices = skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(self.ids_from_chars.get_vocabulary())])
    # validate_indices=False skips the index checks when densifying, so the
    # ids looked up above do not have to be sorted or unique.
    self.prediction_mask = tf.sparse.to_dense(sparse_mask,validate_indices=False)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each string in `inputs`.

    Args:
      inputs: Batch of strings to feed the model.
      states: Model state returned by the previous call, or None on the
        first call.

    Returns:
      A tuple (next characters, updated states).
    """
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states =  self.model(inputs=input_ids, states=states, return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature

    # Apply the prediction mask: prevent "" or "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Return the characters and model state. Use the lookup layer stored on
    # this instance rather than whatever the notebook global happens to be.
    return self.chars_from_ids(predicted_ids), states


In [25]:
# Create an instance of the character generator
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

# prompt = 'The Robot\'s Mother\n\t'
# prompt = "The dark clouds seemed foreboding. In the distance "
prompt = "The world seemed like such a peaceful place until the magic tree was discovered in London."

# Now, let's generate `output_chars` more characters by giving our model starting text.
# The first call consumes the whole prompt; every later call feeds back only
# the character just sampled, together with the state from the previous call.
states = None
next_char = tf.constant([prompt])
result = [next_char]

for n in range(output_chars):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

# Concatenate the prompt and every sampled character into one string tensor
result = tf.strings.join(result)

# Print the results formatted.
print(result[0].numpy().decode('utf-8'))




The world seemed like such a peaceful place until the magic tree was discovered in London. Then was understand, when it was lying off above the coils of doon was the deliving theo introduced here, accommodation and landing crims, and Now you know me justing with disbelief polomonies, but some of those messages from Earth!
	The commercial station was completely erupting among enem speciely, fraquenting to halt offer to sprong nervously on record.
	The fight below the horizon undusarded by the most powerfolic surprise are shouted wrought caught Picketor'
	Oll really had happened we had been completely dispersed, and the air help for home.
	Once a malia life came vain it was a todaway to the desonaiting charts; though one day, poor old Loca swifling had already made him the unberisoness.
	The robot's spoal thought was forming it gathering instructions in a common energy from the heavens.
	He turned to the poor reading into the peak of mighty racing beside him and finished his face.
	'You 

## VI. Next Steps

This is a very simple model with one LSTM layer, one hidden dense layer, and then an output layer. However, considering how simple it is and the fact that we are predicting outputs character by character, the text it produces is pretty amazing. Though it still has a long way to go before publication.

There are many other RNN architectures you could try, such as adding additional hidden dense layers, stacking several recurrent layers, swapping between GRU and LSTM layers, combining GRU and LSTM, etc...

You could also experiment with better text cleanup to make sure odd punctuation doesn't appear, or finding longer texts to use. If you combine texts from two authors, what happens? Can you generate a Jane Austen stageplay by combining austen and shakespeare texts?

Finally, there are a number of hyperparameters to tweak, such as temperature, epochs, batch size, sequence length, etc...