In [1]:
import tensorflow as tf

2024-08-31 15:58:11.970512: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-31 15:58:11.977465: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-31 15:58:11.986886: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-31 15:58:12.005478: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-31 15:58:12.010049: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-31 15:58:12.021188: I tensorflow/core/platform/cpu_feature_gu

**Notes**:
- This notebook follows [an online tutorial](https://www.tensorflow.org/text/tutorials/nmt_with_attention) (and [at least one other](https://www.tensorflow.org/text/tutorials/text_generation) of the Tensorflow tutorials).
- This [blog post](https://janakiev.com/blog/jupyter-virtual-envs/) was referenced to set up the virtual environment.

In [2]:
import numpy as np
import typing
from typing import Any, Tuple
from prepare_data import load_data, reconstruct_from_labels


In [3]:
context_raw, target_raw = load_data('./data/en/')

We store the **expected** output in `target_raw` and the input to our model in `context_raw`. Let's see an example:

In [4]:
target_raw

array([['0', 'x'],
       ['1', 'x'],
       ['0', 'x'],
       ...,
       ['1', 'x'],
       ['0', 'x'],
       ['0', 'x']], dtype='<U21')

Each element in `target_raw` is an operation (e.g. 0 = copy) followed by a character code. For example, `1` is "capitalize" and `0` is "copy". Note that `120` (`ord(x)`) is used for operations that take no arguments.

In [5]:
target_raw[24], context_raw[24]

(array(['0', 'x'], dtype='<U21'), 'v')


## Creating a dataset

We begin by vectorizing our data. `target_raw` and `context_raw` are already tokenized by characters/operations.

We start by creating a vectorization for the `target_raw`.

In [6]:
input_vocab = sorted(set(context_raw))
print('Input vocab size in chars:', len(input_vocab))

chars_to_ids_in = tf.keras.layers.StringLookup(vocabulary=input_vocab)
# Invert: Map chars to IDs instead of IDs to chars
ids_to_chars_in = tf.keras.layers.StringLookup(vocabulary=chars_to_ids_in.get_vocabulary(), invert=True)

# in: "Input"
def text_from_ids_in(ids: list[int]):
	return tf.strings.reduce_join(ids_to_chars_in(ids), axis=-1)

Input vocab size in chars: 54


In [7]:
all_ids_input = chars_to_ids_in(context_raw)
all_ids_input

<tf.Tensor: shape=(6213260,), dtype=int64, numpy=array([40, 22, 25, ..., 31, 13,  1])>

Now, we do the same for the output.

In [8]:
output_vocab = sorted(set(target_raw[:, 1]))
print('Output vocab size in chars:', len(output_vocab))

chars_to_ids_out = tf.keras.layers.StringLookup(vocabulary=output_vocab)
# Invert: Map chars to IDs instead of IDs to chars
ids_to_chars_out = tf.keras.layers.StringLookup(vocabulary=chars_to_ids_out.get_vocabulary(), invert=True)


Output vocab size in chars: 8


In [9]:
sorted(set(target_raw[:,1]))

['!', "'", ',', '-', '.', '/', '?', 'x']

In [10]:
all_ids_output = chars_to_ids_out(target_raw[:, 1])
all_ids_output

<tf.Tensor: shape=(6213260,), dtype=int64, numpy=array([8, 8, 8, ..., 8, 8, 8])>

In [11]:
all_commands_output = np.array(list(map(int, target_raw[:, 0])))
all_commands_output

array([0, 1, 0, ..., 1, 0, 0])

Now that we have vectorized inputs and outputs, let's create a `Dataset` we can feed to the model.

First, combine the expected inputs and outputs into a single vector:

In [12]:
def column(v):
	return tf.reshape(v, [-1, 1])

ids_and_outputs = tf.concat([
	column(all_ids_input), column(all_commands_output), column(all_ids_output)
], 1)
ids_and_outputs

<tf.Tensor: shape=(6213260, 3), dtype=int64, numpy=
array([[40,  0,  8],
       [22,  1,  8],
       [25,  0,  8],
       ...,
       [31,  1,  8],
       [13,  0,  8],
       [ 1,  0,  8]])>

Next, create a `Dataset`:

In [13]:
# Separates ids_and_outputs along its first dimension into different items in the dataset.
ids_input_dataset = tf.data.Dataset.from_tensor_slices(all_ids_input)
# Tuples: from_tensor_slices pairs entries of each tuple item to produce the dataset.
output_dataset = tf.data.Dataset.from_tensor_slices((all_commands_output, all_ids_output))
dataset = tf.data.Dataset.zip(ids_input_dataset, output_dataset)

# Preview the dataset -- demonstrates converting Tensors to numpy to text
for input, expected_sample_outputs in dataset.take(32):
	cmd_output, arg_output = expected_sample_outputs
	input_char = ids_to_chars_in(input).numpy().decode('utf-8')
	command = cmd_output.numpy()
	command_arg = ids_to_chars_out(arg_output).numpy().decode('utf-8')

	print('{} c({}, {})'.format(input_char, command, command_arg), end = ', ')


~ c(0, x), i c(1, x), l c(0, x), l c(0, x), u c(0, x), s c(0, x), t c(0, x), r c(0, x), a c(0, x), t c(0, x), i c(0, x), o c(0, x), n c(0, x),   c(0, x), ~ c(0, x), a c(1, x), l c(0, x), i c(0, x), c c(0, x), e c(0, x), s c(2, '),   c(0, x), a c(1, x), d c(0, x), v c(0, x), e c(0, x), n c(0, x), t c(0, x), u c(0, x), r c(0, x), e c(0, x), s c(0, x), 

2024-08-31 15:58:29.233928: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [14]:
seq_length = 80

# batch: Convert the dataset to sequences of the target size.
# drop_remainder: Drop the last batch if it has fewer than 80 elements
sequences = dataset.batch(seq_length + 1, drop_remainder=True)

for sample_inputs, expected_sample_outputs in sequences.take(2):
	print('Inputs:', text_from_ids_in(sample_inputs))

Inputs: tf.Tensor(b'~illustration ~alices adventures in wonderland ~by lewis carroll ~the millennium ', shape=(), dtype=string)
Inputs: tf.Tensor(b'fulcrum edition 30 ~contents ~chapter i down the rabbithole chapter ii the pool o', shape=(), dtype=string)


2024-08-31 15:58:29.294968: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [15]:
dataset = sequences

Our dataset now pairs inputs and labels!

**Note**: This [StackOverflow](https://stackoverflow.com/questions/53171885/how-to-use-tf-data-dataset-and-tf-keras-do-multi-inputs-and-multi-outpus) question, the documentation on [Dataset.zip](https://www.tensorflow.org/api_docs/python/tf/data/Dataset?hl=en#zip), and documentation on [Dataset.from_tensor_slices](https://www.tensorflow.org/api_docs/python/tf/data/Dataset?hl=en#from_tensor_slices) were helpful.

## Final preprocessing

We now shuffle the data, then do final batching.

In [16]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder = True).prefetch(tf.data.AUTOTUNE)
# Break into test and training data (no validation data for now).
# Inspired by https://stackoverflow.com/a/74609848.
validate_size = dataset.cardinality() * 1 // 8
dataset_validate = dataset.take(validate_size)
dataset = dataset.skip(validate_size)


## Building the model



In [17]:
# .get_vocabulary: Returns a list of the characters in use.
vocab_size_in = len(chars_to_ids_in.get_vocabulary())

vocab_size_out = len(chars_to_ids_out.get_vocabulary())
command_count = len(set(all_commands_output))
size_out = command_count + vocab_size_out # Output includes both commands and the command arg

EMBEDDING_DIM = 64
RNN_UNITS = 64 # Dimensionality of GRU output

print('vocab_size_in', vocab_size_in)
print('vocab_size_out', vocab_size_out)
print('command_count', command_count)
print('EMBEDDING_DIM', EMBEDDING_DIM)
print('RNN_UNITS', RNN_UNITS)

vocab_size_in 55
vocab_size_out 9
command_count 3
EMBEDDING_DIM 32
RNN_UNITS 128


In [18]:
class LanguageModel(tf.keras.Model):
	def __init__(self):
		super().__init__()

		self.embedding_layer = tf.keras.layers.Embedding(vocab_size_in, EMBEDDING_DIM)
		# return_sequences: Return the full sequence of outputs, rather than just the last.
		# return_state: Returns the last state in addition to the output
		self.gru_layer = tf.keras.layers.GRU(RNN_UNITS, return_sequences=True, return_state=True)
		self.dense_command_layer = tf.keras.layers.Dense(command_count, activation=tf.keras.activations.log_softmax)
		self.dense_arg_layer = tf.keras.layers.Dense(vocab_size_out, activation=tf.keras.activations.log_softmax)
	
	def call(self, inputs, states = None, return_state = False, training = False):
		x = self.embedding_layer(inputs, training = training)
		if states is None:
			batch_size, _ = inputs.shape
			states = self.gru_layer.get_initial_state(batch_size)

		x, states = self.gru_layer(x, initial_state = states, training = training)
		y_command = self.dense_command_layer(x, training = training)
		y_arg = self.dense_arg_layer(x, training = training)

		if return_state:
			return y_command, y_arg, states
		else:
			return y_command, y_arg

# We override tf.keras.Model to allow extracting the state later.

In [19]:
model = LanguageModel()

## Trying the (untrained) model


In [20]:
print(dataset.take(1))

for sample_inputs, expected_sample_outputs in dataset.take(1):
	sample_cmd_predictions, sample_arg_predictions = model(sample_inputs)
	print(sample_cmd_predictions.shape, ':: (batch_size, seq_length, num_commands)')
	print(sample_arg_predictions.shape, ':: (batch_size, seq_length, num_command_args)')

model.summary()

<_TakeDataset element_spec=(TensorSpec(shape=(64, 81), dtype=tf.int64, name=None), (TensorSpec(shape=(64, 81), dtype=tf.int64, name=None), TensorSpec(shape=(64, 81), dtype=tf.int64, name=None)))>
(64, 81, 3) :: (batch_size, seq_length, num_commands)
(64, 81, 9) :: (batch_size, seq_length, num_command_args)


Now let's inspect `sample_predictions`:

In [21]:
# Take one sample of the data, where sample_cmd_predictions[0] contains log probability
sampled_cmd_indices = tf.random.categorical(sample_cmd_predictions[0], num_samples = 1)
print(sampled_cmd_indices.shape)

# tf.squeeze: Removes dimensions of size 1.
sampled_cmd_indices = tf.squeeze(sampled_cmd_indices).numpy()
print(sampled_cmd_indices.shape)

# Do the same for the arg predictions
sampled_arg_indices = tf.squeeze(tf.random.categorical(sample_arg_predictions[0], num_samples = 1)).numpy()

(81, 1)
(81,)


In [22]:
input_text = text_from_ids_in(sample_inputs[0]).numpy().decode('utf-8')
print('Input:', input_text)

sampled_commands = np.stack((sampled_cmd_indices, ids_to_chars_out(sampled_arg_indices)), axis=1)
reconstructed = reconstruct_from_labels(input_text, sampled_commands)
print('Next predictions:', reconstructed)

Input: no injudicious interference from any quarter ever thwarted my plans for her impro
Next predictions: N-ox xiN!j,uDi!ciOxuxs !inteR,fexrE!n!c,e fROm aN,y [UNK]quar?ter. Ever T.h/wA.rt,eD My/ PLAxnS For HExr[UNK] i-mpRo


Seemingly random output, as expected!

## Training!

We can train it now! It's a standard classification problem -- given the previous RNN state and the current character, predict the next character.

We're using the `SparseCategoricalCrossentropy` loss. See https://datascience.stackexchange.com/a/41923 and perhaps https://stats.stackexchange.com/a/420730 for commentary.

In [23]:
loss_fn = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

sample_batch_cmd_mean_loss = loss_fn(expected_sample_outputs[0], sample_cmd_predictions)
sample_batch_arg_mean_loss = loss_fn(expected_sample_outputs[1], sample_arg_predictions)
print('loss pre-training (cmd):', float(tf.exp(sample_batch_cmd_mean_loss)))
print('loss pre-training (arg):', float(tf.exp(sample_batch_arg_mean_loss)))

loss pre-training (cmd): 3.0009868144989014
loss pre-training (arg): 9.007984161376953


As expected, the initial loss is large.

Now we attach the loss function and an optimizer:

In [24]:
model.compile(optimizer='adam', loss=[loss_fn, loss_fn])

We're just about ready to train.

In [25]:
# Set up checkpoints

checkpoint_path = './tf_model_checkpoints/checkpoint.weights.h5'

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
	filepath=checkpoint_path, monitor='val_loss', mode='min', save_weights_only=True, save_best_only=True
)

In [26]:
EPOCHS = 8

In [27]:

history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback], validation_data=dataset_validate)

Epoch 1/8
[1m1049/1049[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 106ms/step - loss: 0.4669 - sparse_categorical_crossentropy_loss: 0.2440 - val_loss: 0.3024 - val_sparse_categorical_crossentropy_loss: 0.1278
Epoch 2/8
[1m1049/1049[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 123ms/step - loss: 0.2600 - sparse_categorical_crossentropy_loss: 0.1147 - val_loss: 0.2816 - val_sparse_categorical_crossentropy_loss: 0.1192
Epoch 3/8
[1m1049/1049[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 134ms/step - loss: 0.2404 - sparse_categorical_crossentropy_loss: 0.1058 - val_loss: 0.2685 - val_sparse_categorical_crossentropy_loss: 0.1129
Epoch 4/8
[1m1049/1049[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 115ms/step - loss: 0.2287 - sparse_categorical_crossentropy_loss: 0.1005 - val_loss: 0.2660 - val_sparse_categorical_crossentropy_loss: 0.1108
Epoch 5/8
[1m1049/1049[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 141ms/step - loss: 0.2206 - 

## Add punctuation

Let's try it!


In [32]:
class Punctuator:
	def __init__(self, model: LanguageModel, temperature: float = 1.0):
		self.temperature = temperature
		self.model = model
		self.last_states = None

		# See https://www.tensorflow.org/api_docs/python/tf/sparse/SparseTensor?hl=en
		skip_ids = chars_to_ids_out(['[UNK]', 'x'])
		out_vocab_size = len(chars_to_ids_out.get_vocabulary())
		print(out_vocab_size, skip_ids)
		self.prediction_mask = tf.sparse.to_dense(tf.sparse.reorder(tf.SparseTensor(
			indices=tf.reshape(skip_ids, [-1, 1]), # shape [N, ndims]. This specifies the nonzero elements' indices.
			values=[float('-inf')] * len(skip_ids),
			dense_shape=[out_vocab_size],
		)))
	
	def step(self, input: str|Any):
		# Data conversion
		input_chars = tf.strings.unicode_split(input, 'UTF-8')
		input_ids = chars_to_ids_in(input_chars)
		input_ids = tf.reshape(input_ids, [1, -1]) # Convert to column vec

		# Run it!
		# predicted.shape is [batch, char, next_char_logits]
		predicted_cmd, predicted_arg, states = self.model(inputs=input_ids, states=self.last_states, return_state=True)
		self.last_states = states

		print(predicted_cmd.shape, ':: (batch_size, seq_len, num_cmds)')

		predicted_next_cmd_logits = (predicted_cmd[-1, :, :]) / self.temperature
		predicted_next_arg_logits = predicted_arg[-1, :, :] / self.temperature
		predicted_next_arg_logits += self.prediction_mask # Sets some weights to -inf

		predicted_cmd = tf.random.categorical(predicted_next_cmd_logits, num_samples=1)
		predicted_arg_id = tf.random.categorical(predicted_next_arg_logits, num_samples=1)
		predicted_arg = ids_to_chars_out(predicted_arg_id)

		predicted_commands = np.stack((tf.squeeze(predicted_cmd), tf.squeeze(predicted_arg)), axis=1)
		return reconstruct_from_labels(input, predicted_commands)


In [33]:
%%time

punctuator = Punctuator(model)

print(punctuator.step('~once upon a time a punctuation test took place '))

print(punctuator.step('it took long enough that the end seemed to never '))
print(punctuator.step('arrive '))
print(punctuator.step('~i suspect that this test will fail miserably the first few times '))

9 tf.Tensor([0 8], shape=(2,), dtype=int64)
(1, 48, 3) :: (batch_size, seq_len, num_cmds)
~Once upon a time a punctuatiOn. test Took place, 
(1, 49, 3) :: (batch_size, seq_len, num_cmds)
it took long enough that the end seemed to Never 
(1, 7, 3) :: (batch_size, seq_len, num_cmds)
arrive 
(1, 66, 3) :: (batch_size, seq_len, num_cmds)
~I suspEct that this test Will fail miseraBly. the first few timEs, 
CPU times: user 637 ms, sys: 20.3 ms, total: 657 ms
Wall time: 673 ms
