In [1]:
import tensorflow as tf

2024-08-31 11:04:05.959630: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-31 11:04:06.132591: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-31 11:04:06.221688: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-31 11:04:06.353163: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-31 11:04:06.385276: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-31 11:04:06.567566: I tensorflow/core/platform/cpu_feature_gu

**Notes**:
- This notebook follows [an online tutorial](https://www.tensorflow.org/text/tutorials/nmt_with_attention) (and [at least one other](https://www.tensorflow.org/text/tutorials/text_generation) of the TensorFlow tutorials).
- This [blog post](https://janakiev.com/blog/jupyter-virtual-envs/) was referenced to set up the virtual environment.

In [2]:
import numpy as np
import typing
from typing import Any, Tuple
from prepare_data import load_data


In [3]:
# load_data comes from the local prepare_data module; its two return values
# are unpacked as (target, context) text collections read from ./data/en/.
target_raw, context_raw = load_data('./data/en/')

We store the **expected** output in `target_raw` and the input to our model in `context_raw`. Let's see an example:

In [4]:
# Display one (target, context) pair as the cell output.
target_raw[12], context_raw[12]

('Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed it was labelled ORANGE MARMALADE , but to her great disappointment it was empty she did not like to drop the jar for fear of killing somebody underneath, so managed to put it into one of the cupboards as she fell past it.',
 'either the well was very deep or she fell very slowly for she had plenty of time as she went down to look about her and to wonder what was going to happen next first she tried to look down and make out what she was coming to but it was too dark to see anything then she looked at the

Notice that `target_raw` contains punctuation, while `context_raw` does not. Each entry is paragraph length.

## Creating a dataset

We begin by vectorizing our data. For now, we're working with the [text generation](https://www.tensorflow.org/text/tutorials/text_generation) tutorial and so we tokenize by characters.

In [5]:
# Join every target entry into one long string; the character vocabulary is
# the sorted set of distinct characters that occur in it.
text = ' '.join(target_raw)
vocab = sorted(set(text))
print('vocab size in chars:', len(vocab))

# Forward lookup: character -> integer ID.
chars_to_ids = tf.keras.layers.StringLookup(vocabulary=vocab)
# invert=True: this layer maps integer IDs back to chars (the reverse of chars_to_ids).
ids_to_chars = tf.keras.layers.StringLookup(vocabulary=chars_to_ids.get_vocabulary(), invert=True)

def text_from_ids(ids: list[int]):
	"""Convert character IDs back into text.

	Each ID is looked up with ``ids_to_chars`` and the resulting characters
	are concatenated along the last axis, giving a string tensor.
	"""
	chars = ids_to_chars(ids)
	joined = tf.strings.reduce_join(chars, axis=-1)
	return joined

vocab size in chars: 89


In [6]:
# Split the whole text into individual UTF-8 characters and map each to its ID.
all_ids = chars_to_ids(tf.strings.unicode_split(text, 'UTF-8'))
all_ids

<tf.Tensor: shape=(6376070,), dtype=int64, numpy=array([27, 58, 58, ..., 19, 36, 46])>

In [7]:
# Separates all_ids along its first dimension into different items in the dataset,
# so each dataset element is a single character ID.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

# Preview the dataset -- demonstrates converting Tensors to numpy to text.
# Prints the first 32 characters, separated by spaces.
for ids in ids_dataset.take(32):
	print(ids_to_chars(ids).numpy().decode('utf-8'), end = ' ')

I l l u s t r a t i o n   A l i c e ' s   A d v e n t u r e s   

2024-08-31 11:04:12.615895: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [8]:
seq_length = 64

# batch: Convert the dataset to sequences of seq_length + 1 character IDs.
# The extra element lets input and label be offset from each other by one position.
# drop_remainder: Drop the last batch if it has fewer than seq_length + 1 elements
sequences = ids_dataset.batch(seq_length + 1, drop_remainder=True)

# Decode the first two sequences back to text as a sanity check.
for seq in sequences.take(2):
	print(text_from_ids(seq))

tf.Tensor(b"Illustration Alice's Adventures in Wonderland by Lewis Carroll TH", shape=(), dtype=string)
tf.Tensor(b'E MILLENNIUM FULCRUM EDITION 3.0 Contents CHAPTER I. Down the Rab', shape=(), dtype=string)


2024-08-31 11:04:12.664575: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


We now split into a dataset of $(\text{input}, \text{label})$ pairs. Here, `input` is the current character and `label` is the next character (expected output).

In [9]:
def split_input_label(sequence):
	"""Turn one sequence into an ``(input, label)`` pair for next-item prediction.

	The input is the sequence without its last element and the label is the
	sequence without its first element, so ``label[i]`` is the item that
	follows ``input[i]`` in the original sequence.
	"""
	return sequence[:-1], sequence[1:]

print(split_input_label(list('Text')))

(['T', 'e', 'x'], ['e', 'x', 't'])


We create our final dataset by pairing labels and input:

In [10]:
# Apply split_input_label to every sequence, giving a dataset of (input, label) pairs.
dataset = sequences.map(split_input_label)

# Decode the first pair back to text to confirm the one-character offset.
for example_input, example_label in dataset.take(1):
	print('Example input:', text_from_ids(example_input))
	print('Example label:', text_from_ids(example_label))

Example input: tf.Tensor(b"Illustration Alice's Adventures in Wonderland by Lewis Carroll T", shape=(), dtype=string)
Example label: tf.Tensor(b"llustration Alice's Adventures in Wonderland by Lewis Carroll TH", shape=(), dtype=string)


## Final preprocessing

We now shuffle the data, then do final batching.

In [11]:
BUFFER_SIZE = 10000 # Number of elements held in the shuffle buffer
BATCH_SIZE = 64     # Number of (input, label) pairs per training batch

# Build the training pipeline from `sequences` instead of rebinding `dataset`
# from itself: the cell is then idempotent, so re-running it does not batch
# data that has already been batched.
dataset = (
	sequences
	.map(split_input_label)
	.shuffle(BUFFER_SIZE)
	.batch(BATCH_SIZE, drop_remainder = True)
	.prefetch(tf.data.AUTOTUNE)
)
# Break into test and training data (no validation data for now).
# Inspired by https://stackoverflow.com/a/74609848.
# test_size = dataset.cardinality() * 1 // 4
# dataset_test = dataset.take(test_size)
# dataset = dataset.skip(test_size)


## Building the model



In [12]:
# .get_vocabulary: Returns a list of the characters in use.
# This is one more than the 89 distinct characters counted earlier; the lookup
# layer's vocabulary also holds the '[UNK]' out-of-vocabulary token.
vocab_size = len(chars_to_ids.get_vocabulary())

EMBEDDING_DIM = 32 # Dimensionality of each character embedding
RNN_UNITS = 64 # Dimensionality of GRU output

print('vocab_size', vocab_size)
print('EMBEDDING_DIM', EMBEDDING_DIM)
print('RNN_UNITS', RNN_UNITS)

vocab_size 90
EMBEDDING_DIM 32
RNN_UNITS 64


In [13]:
class LanguageModel(tf.keras.Model):
	"""Character-level language model: Embedding -> GRU -> Dense.

	For every position of the input ID sequence the model outputs a
	log-softmax distribution over the vocabulary for the next character.
	"""

	def __init__(self, vocab_size: int, embedding_dim: int, rnn_units: int):
		"""Create the layers.

		Args:
			vocab_size: Number of distinct IDs; also the size of the output layer.
			embedding_dim: Dimensionality of the ID embeddings.
			rnn_units: Dimensionality of the GRU output/state.
		"""
		super().__init__()

		self.embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_dim)
		# return_sequences: Return the full sequence of outputs, rather than just the last.
		# return_state: Returns the last state in addition to the output
		self.gru_layer = tf.keras.layers.GRU(rnn_units, return_sequences=True, return_state=True)
		# log_softmax: the outputs are normalized log-probabilities.
		self.dense_layer = tf.keras.layers.Dense(vocab_size, activation=tf.keras.activations.log_softmax)

	def call(self, inputs, states = None, return_state = False, training = False):
		"""Run the model on a batch of ID sequences.

		Args:
			inputs: Integer IDs, expected as (batch, sequence).
			states: GRU state to start from, or None to start from the GRU's
				default (zero) initial state.
			return_state: If true, also return the final GRU state.
			training: Forwarded to every layer.

		Returns:
			The per-position log-probabilities, or a tuple
			``(log_probabilities, final_state)`` when ``return_state`` is true.
		"""
		x = self.embedding_layer(inputs, training = training)

		# With initial_state=None the GRU builds its own zero state from the
		# runtime batch size. This avoids unpacking the static `inputs.shape`,
		# which breaks when the batch dimension is unknown or inputs is not rank 2.
		x, states = self.gru_layer(x, initial_state = states, training = training)
		x = self.dense_layer(x, training = training)

		if return_state:
			return x, states
		return x

# We subclass tf.keras.Model so that `call` can accept and return the GRU state, which we need later for text generation.

In [14]:
# Instantiate the model with the hyperparameters defined above.
model = LanguageModel(vocab_size, EMBEDDING_DIM, RNN_UNITS)

## Trying the (untrained) model


In [15]:
# Run a single batch through the untrained model to check the output shape.
# The loop variables and sample_predictions are reused by the following cells.
for sample_input_batch, sample_label_batch in dataset.take(1):
	sample_predictions = model(sample_input_batch)
	print(sample_predictions.shape, ':: (batch_size, seq_length, vocab_size)')

model.summary()

(64, 64, 90) :: (batch_size, seq_length, vocab_size)


2024-08-31 11:04:13.831817: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Now let's inspect `sample_predictions`:

In [16]:
# sample_predictions[0] is the first sequence of the batch: shape (seq_length, vocab_size) = (64, 90),
# holding log probabilities. Draw one sampled character ID per position.
sampled_indices = tf.random.categorical(sample_predictions[0], num_samples = 1)
print(sampled_indices.shape)

# tf.squeeze: Removes dimensions of size 1.
sampled_indices = tf.squeeze(sampled_indices).numpy()
print(sampled_indices.shape)


(64, 1)
(64,)


In [17]:
# Decode the first input sequence and the IDs sampled from the untrained model.
print('Input:', text_from_ids(sample_input_batch[0]).numpy().decode('utf-8'))
print('Next predictions:', text_from_ids(sampled_indices).numpy().decode('utf-8'))

Input:  your history, she do. I'll tell it her, said the Mock Turtle in
Next predictions: ï0po.TŒWüœTLum FPîS525XpnSÉ ïJo6ŒŒQJë4yQlrvRo?PI1î.1â[UNK]?Æhàêoë9R2


Seemingly random output, as expected!

## Training!

We can train it now! It's a standard classification problem -- given the previous RNN state and the current character, predict the next character.

We're using the `SparseCategoricalCrossentropy` loss. See https://datascience.stackexchange.com/a/41923 and perhaps https://stats.stackexchange.com/a/420730 for commentary.

In [18]:
# from_logits=True: the loss applies softmax to the model output itself. The
# model emits log-softmax values, and softmax of log-probabilities gives back
# the same probabilities, so the two are compatible.
loss_fn = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

sample_batch_mean_loss = loss_fn(sample_label_batch, sample_predictions)
# NOTE: the printed number is exp(mean loss), not the mean loss itself.
print('loss pre-training', float(tf.exp(sample_batch_mean_loss)))

loss pre-training 89.98168182373047


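The number printed above is the exponential of the mean loss. As expected for an untrained model, it is close to the vocabulary size (90): the initial predictions are nearly uniform over all characters.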

Now we attach the loss function and an optimizer:

In [19]:
# Attach the Adam optimizer and the loss defined above.
model.compile(optimizer='adam', loss=loss_fn)

We're just about ready to train.

In [20]:
# Set up checkpoints

# Single weights file that is overwritten whenever a better epoch is found.
checkpoint_path = './tf_model_checkpoints/checkpoint.weights.h5'

# Saves only the weights (save_weights_only), and only when the monitored
# training loss reaches a new minimum (monitor='loss', mode='min', save_best_only).
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
	filepath=checkpoint_path, monitor='loss', mode='min', save_weights_only=True, save_best_only=True
)

In [23]:
# Number of passes over the dataset per model.fit call.
EPOCHS = 5

In [27]:
# Note: This block was run >1 times (twice)
# Each run continues training the same model, so the losses shown below are
# from the latest run rather than from a freshly initialized model.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/5
[1m1532/1532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 50ms/step - loss: 1.6387
Epoch 2/5
[1m1532/1532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 47ms/step - loss: 1.6278
Epoch 3/5
[1m1532/1532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 50ms/step - loss: 1.6189
Epoch 4/5
[1m1532/1532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 50ms/step - loss: 1.6118
Epoch 5/5
[1m1532/1532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 47ms/step - loss: 1.6061


## Generate text

We'll run the model in a loop, keeping track of its internal state as it's executed.

In [25]:
class TextGenerator:
	"""Generates text one character at a time from a LanguageModel.

	The recurrent state returned by the model is remembered between calls to
	`step`, so consecutive calls continue the same piece of text.
	"""

	def __init__(self, model: LanguageModel, ids_to_chars, chars_to_ids, temperature: float = 1.0):
		"""Store the model and lookups and build the sampling mask.

		Args:
			model: Model called with (inputs, states, return_state=True).
			ids_to_chars: Lookup mapping integer IDs to characters.
			chars_to_ids: Lookup mapping characters to integer IDs.
			temperature: Divisor applied to the predictions before sampling;
				values below 1 make sampling more conservative. Must be > 0.

		Raises:
			ValueError: If `temperature` is not strictly positive.
		"""
		if temperature <= 0:
			raise ValueError(f'temperature must be positive, got {temperature}')

		self.temperature = temperature
		self.model = model
		self.ids_to_chars = ids_to_chars
		self.chars_to_ids = chars_to_ids
		self.last_states = None # No state yet: the first step starts from the model's default state.

		# See https://www.tensorflow.org/api_docs/python/tf/sparse/SparseTensor?hl=en
		skip_ids = self.chars_to_ids(['[UNK]'])
		vocab_size = len(self.chars_to_ids.get_vocabulary())
		print(vocab_size, skip_ids)
		# Dense vector over the vocabulary: -inf at every skipped ID, 0 elsewhere.
		# Adding it to the predictions makes the skipped IDs impossible to sample.
		self.prediction_mask = tf.sparse.to_dense(tf.SparseTensor(
			# shape [N, ndims]: one row per skipped ID (the previous `[skip_ids]`
			# had shape [1, N], which is only right when N == 1).
			indices=skip_ids[:, None],
			values=[float('-inf')] * len(skip_ids),
			dense_shape=[vocab_size],
		))

	def step(self, input: str|Any):
		"""Feed `input` to the model and sample the next character.

		Args:
			input: A single string (Python str or scalar string tensor) holding
				one or more characters. It is expected to be non-empty.

		Returns:
			A scalar string tensor holding the sampled next character.
		"""
		# Data conversion
		input_chars = tf.strings.unicode_split(input, 'UTF-8')
		input_ids = self.chars_to_ids(input_chars)
		# One sequence containing every character of `input`: [batch=1, chars].
		# (Reshaping to [-1, 1] would treat each character as its own length-1
		# sequence, so all but the last prompt character would be ignored, and
		# the stored state would no longer match the batch size of later steps.)
		input_ids = tf.reshape(input_ids, [1, -1])

		# Run it!
		# predicted.shape is [batch, char, next_char_logits]
		predicted, states = self.model(inputs=input_ids, states=self.last_states, return_state=True)
		self.last_states = states

		# Only the prediction after the last character is needed.
		predicted_next_char_logits = predicted[-1, -1, :]
		predicted_next_char_logits /= self.temperature
		predicted_next_char_logits += self.prediction_mask # Sets some weights to -inf

		# tf.random.categorical expects [batch, num_classes]; wrap in a batch of 1.
		predicted_ids = tf.random.categorical([predicted_next_char_logits], num_samples=1)
		predicted_ids = tf.squeeze(predicted_ids)

		predicted_chars = self.ids_to_chars(predicted_ids)
		return predicted_chars


In [28]:
%%time

text_generator = TextGenerator(model, ids_to_chars, chars_to_ids)
# next_char starts as the whole prompt; after the first step it holds the
# most recently sampled character, which is fed back in on the next step.
next_char = 'Hello, '

# Generate 10 lines of 100 characters each. The generator keeps its state
# across lines, so each line continues the previous one.
for i in range(10):
	output = []

	for j in range(100):
		next_char = text_generator.step(next_char)
		output.append(next_char)

	# Only the sampled characters are printed; the prompt itself is not included.
	print(tf.strings.join(output).numpy().decode('utf-8'))

90 tf.Tensor([0], shape=(1,), dtype=int64)
an encountre. We prieficiI. Amer, I new me what streng in thoughes, jull course of the past, and the
y were with accroy. I know mise a secreties is blood of noise and thinks regay Valently. I was Allie
, visit in admy of the counter first for that nowsible, count to me. Then sere moment on the paies. 
He from theet upon the peace is thinh preature. Chattan concious boy to als, Bamp by herd beloarser,
 Pronaling generence? And read thy pud to his invents dew at to under the hount of fell which Those 
to not a ladynown methers a gidently beliead had es, or very splied Monte Crissable, would never be 
hudias before, and the stracefuse to eptrorm had him was should being which would leave mamper of so
nvoles. Thessend. The sort would dests and woulf, over hunglune in the bestleen, I went has into any
one tongre Master, I busious justy, me for it, and poor it have stere exclised he breart, unce staci
, canam, time, now for battless turned Hix the o