In [1]:
import tensorflow as tf

2024-08-31 19:34:41.842096: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-31 19:34:41.845902: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-31 19:34:41.862187: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-31 19:34:41.885677: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-31 19:34:41.891301: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-31 19:34:41.905810: I tensorflow/core/platform/cpu_feature_gu

**Notes**:
- This notebook follows [an online tutorial](https://www.tensorflow.org/text/tutorials/nmt_with_attention) (and [at least one other](https://www.tensorflow.org/text/tutorials/text_generation) of the TensorFlow tutorials).
- This [blog post](https://janakiev.com/blog/jupyter-virtual-envs/) was referenced to set up the virtual environment.

In [2]:
import numpy as np
import typing
from typing import Any, Tuple
from prepare_data import load_data, reconstruct_from_labels


In [3]:
# Load the data stored under ./data/en/ using load_data from the local prepare_data module.
# context_raw holds the model inputs; target_raw holds the expected outputs (labels).
context_raw, target_raw = load_data('./data/en/')

We store the **expected** output in `target_raw` and the input to our model in `context_raw`. Let's see an example:

In [4]:
# Display the expected-output labels.
target_raw

array(['1', '2', '1', ..., '2', '1', '1'], dtype='<U1')

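Each element in `target_raw` is a single-character label describing the operation to apply to the character at the same index in `context_raw`. Judging by the dataset preview further down, `1` behaves like "copy", `2` like "capitalize", and a punctuation label (e.g. `'`) marks punctuation attached to that character. **Note**: this section originally described each element as an operation (e.g. 0 = copy, 1 = capitalize) followed by a character code, with `120` (`ord('x')`) used for operations that take no arguments. That description does not match the labels shown above and may be out of date (TODO: confirm against `prepare_data`).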

In [5]:
# Look at one label together with the input character at the same index.
target_raw[24], context_raw[24]

('1', 'v')


## Creating a dataset

We begin by vectorizing our data. `target_raw` and `context_raw` are already tokenized by characters/operations.

We start by creating a vectorization for `context_raw` (the model input).

In [6]:
# Input vocabulary: every distinct element of context_raw, sorted.
input_vocab = sorted(set(context_raw))
print('Input vocab size in chars:', len(input_vocab))

# Forward lookup: maps input chars to integer IDs.
chars_to_ids_in = tf.keras.layers.StringLookup(vocabulary=input_vocab)
# invert=True: map IDs back to chars (the reverse of chars_to_ids_in), using the same vocabulary.
ids_to_chars_in = tf.keras.layers.StringLookup(vocabulary=chars_to_ids_in.get_vocabulary(), invert=True)

# in: "Input"
def text_from_ids_in(ids: list[int]):
	"""Turn input IDs back into text.

	Looks each ID up with `ids_to_chars_in` and joins the resulting characters
	along the last axis.
	"""
	chars = ids_to_chars_in(ids)
	return tf.strings.reduce_join(chars, axis=-1)

Input vocab size in chars: 54


In [7]:
# Vectorize the whole input: one integer ID per element of context_raw.
all_ids_input = chars_to_ids_in(context_raw)
all_ids_input

<tf.Tensor: shape=(6754252,), dtype=int64, numpy=array([40, 22, 25, ..., 31, 13,  1])>

Now, we do the same for the output.

In [8]:
# Output vocabulary: every distinct label in target_raw, sorted.
output_vocab = sorted(set(target_raw))
print('Output vocab size in chars:', len(output_vocab))

# Forward lookup: maps output labels to integer IDs.
chars_to_ids_out = tf.keras.layers.StringLookup(vocabulary=output_vocab)
# invert=True: map IDs back to labels (the reverse of chars_to_ids_out), using the same vocabulary.
ids_to_chars_out = tf.keras.layers.StringLookup(vocabulary=chars_to_ids_out.get_vocabulary(), invert=True)


Output vocab size in chars: 11


In [9]:
# Display the output labels. output_vocab was already built with sorted(), so this
# extra sort is redundant but harmless.
sorted(output_vocab)

['!', "'", ',', '-', '.', '/', '1', '2', ':', ';', '?']

In [10]:
# Vectorize the expected outputs: one integer ID per element of target_raw.
all_ids_output = chars_to_ids_out(target_raw)
all_ids_output

<tf.Tensor: shape=(6754252,), dtype=int64, numpy=array([7, 8, 7, ..., 8, 7, 7])>

Now that we have vectorized inputs and outputs, let's create a `Dataset` we can feed to the model.

First, combine the vectorized inputs and expected outputs into a single two-column tensor:

In [11]:
def column(v):
	"""Reshape `v` into a single column, i.e. a tensor of shape [-1, 1]."""
	column_shape = [-1, 1]
	return tf.reshape(v, column_shape)

# Put the input IDs and output IDs side by side along axis 1:
# column 0 holds the input ID, column 1 the expected output ID.
# NOTE(review): no later cell visible in this notebook reads ids_and_outputs -- the Dataset
# below is built from all_ids_input / all_ids_output directly. Confirm it is still needed.
ids_and_outputs = tf.concat([
	column(all_ids_input), column(all_ids_output)
], 1)
ids_and_outputs

<tf.Tensor: shape=(6754252, 2), dtype=int64, numpy=
array([[40,  7],
       [22,  8],
       [25,  7],
       ...,
       [31,  8],
       [13,  7],
       [ 1,  7]])>

Next, create a `Dataset`:

In [12]:
def shift_inputs(inputs: list, shift: int = 4):
	"""Shift `inputs` left by `shift` positions, padding the end with zeros.

	The result always has the same length as `inputs`: element i of the result is
	element i + shift of `inputs`, or 0 once that index runs past the end.

	Previously the first 4 elements were dropped and exactly 4 zeros appended. For
	inputs shorter than 4 elements that produced 4 elements instead of len(inputs),
	so the shifted stream no longer lined up with the unshifted one.

	Args:
		inputs: 1-D sequence of IDs (a tensor or anything tf.convert_to_tensor accepts).
		shift: Number of positions to look ahead. Must be >= 0. Defaults to 4.

	Returns:
		A 1-D tensor with the same length and dtype as `inputs`.
	"""
	inputs = tf.convert_to_tensor(inputs)
	padding = tf.zeros([shift], dtype=inputs.dtype)
	# Append the padding first, then drop the leading `shift` elements: the length is
	# preserved even when `inputs` has fewer than `shift` elements.
	return tf.concat([inputs, padding], 0)[shift:]

# Build three aligned views of the input IDs: the IDs themselves, the IDs shifted once
# by shift_inputs, and the IDs shifted twice. The shifted views act as lookahead.
ids_input_main = all_ids_input
ids_input_shifted = shift_inputs(all_ids_input)
ids_input_shifted_twice = shift_inputs(ids_input_shifted)

# Tuples: from_tensor_slices pairs entries of each tuple item to produce the dataset.
input_dataset = tf.data.Dataset.from_tensor_slices((ids_input_main, ids_input_shifted, ids_input_shifted_twice))
output_dataset = tf.data.Dataset.from_tensor_slices(all_ids_output)
# Each dataset element is ((main, shifted, shifted_twice), expected_output).
dataset = tf.data.Dataset.zip(input_dataset, output_dataset)

# Preview the dataset -- demonstrates converting Tensors to numpy to text.
# The loop variable is named sample_input (not `input`) so the built-in input() is not shadowed.
for sample_input, expected_sample_outputs in dataset.take(32):
	# sample_input[0] is the unshifted (main) input ID.
	input_char = ids_to_chars_in(sample_input[0]).numpy().decode('utf-8')
	output_char = ids_to_chars_out(expected_sample_outputs).numpy().decode('utf-8')

	print('{} c({})'.format(input_char, output_char), end = ', ')


~ c(1), i c(2), l c(1), l c(1), u c(1), s c(1), t c(1), r c(1), a c(1), t c(1), i c(1), o c(1), n c(1),   c(1), ~ c(1), a c(2), l c(1), i c(1), c c(1), e c(1), s c('),   c(1), a c(2), d c(1), v c(1), e c(1), n c(1), t c(1), u c(1), r c(1), e c(1), s c(1), 

2024-08-31 19:34:50.214256: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [13]:
seq_length = 105

# batch: Convert the dataset to sequences of the target size (seq_length + 1 elements each).
# drop_remainder: Drop the last batch if it has fewer than seq_length + 1 elements
sequences = dataset.batch(seq_length + 1, drop_remainder=True)

# Print the three input streams (main, shifted, shifted twice) of the first two sequences as text.
for sample_inputs, expected_sample_outputs in sequences.take(2):
	print('Inputs:', text_from_ids_in(sample_inputs))

Inputs: tf.Tensor(
[b'~illustration ~alices adventures in wonderland ~by lewis carroll ~the millennium fulcrum edition 30 ~conte'
 b'ustration ~alices adventures in wonderland ~by lewis carroll ~the millennium fulcrum edition 30 ~contents '
 b'ation ~alices adventures in wonderland ~by lewis carroll ~the millennium fulcrum edition 30 ~contents ~cha'], shape=(3,), dtype=string)
Inputs: tf.Tensor(
[b'nts ~chapter i down the rabbithole chapter ii the pool of tears chapter iii a caucusrace and a long tale c'
 b'~chapter i down the rabbithole chapter ii the pool of tears chapter iii a caucusrace and a long tale chapt'
 b'pter i down the rabbithole chapter ii the pool of tears chapter iii a caucusrace and a long tale chapter i'], shape=(3,), dtype=string)


2024-08-31 19:34:50.350828: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [14]:
# From here on, `dataset` refers to the sequence-batched data rather than single characters.
dataset = sequences

Our dataset now pairs inputs and labels!

**Note**: This [StackOverflow](https://stackoverflow.com/questions/53171885/how-to-use-tf-data-dataset-and-tf-keras-do-multi-inputs-and-multi-outpus) question, the documentation on [Dataset.zip](https://www.tensorflow.org/api_docs/python/tf/data/Dataset?hl=en#zip), and documentation on [Dataset.from_tensor_slices](https://www.tensorflow.org/api_docs/python/tf/data/Dataset?hl=en#from_tensor_slices) were helpful.

## Final preprocessing

We now shuffle the data, then do final batching.

In [15]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

# reshuffle_each_iteration=False keeps ONE fixed shuffled order. With the default (True) the
# order changes on every pass over the data, so the take()/skip() split below would pick
# different batches each epoch and validation batches would leak into the training set.
# The trade-off is that the training batches are visited in the same order every epoch.
# Starting from `sequences` (which `dataset` was bound to above) instead of re-wrapping
# `dataset` keeps this cell safe to re-run.
dataset_batched = sequences.shuffle(BUFFER_SIZE, reshuffle_each_iteration=False).batch(BATCH_SIZE, drop_remainder = True)

# Break into validation and training data (no separate test data for now).
# Inspired by https://stackoverflow.com/a/74609848.
validate_size = dataset_batched.cardinality() * 1 // 8
# Prefetch after the split so each branch overlaps its own input pipeline with training.
dataset_validate = dataset_batched.take(validate_size).prefetch(tf.data.AUTOTUNE)
dataset = dataset_batched.skip(validate_size).prefetch(tf.data.AUTOTUNE)


## Building the model



In [16]:
# .get_vocabulary: Returns a list of the characters in use.
# The printed size is one larger than len(input_vocab) reported earlier (55 vs. 54),
# presumably because StringLookup adds an out-of-vocabulary token ([UNK]) -- TODO confirm.
vocab_size_in = len(chars_to_ids_in.get_vocabulary())

vocab_size_out = len(chars_to_ids_out.get_vocabulary())
# NOTE(review): the labels displayed earlier are single characters; confirm the
# "commands and the command arg" wording below is still accurate.
size_out = vocab_size_out # Output includes both commands and the command arg

EMBEDDING_DIM = 64
RNN_UNITS = 64 # Dimensionality of GRU output

print('vocab_size_in', vocab_size_in)
print('vocab_size_out', vocab_size_out)
print('EMBEDDING_DIM', EMBEDDING_DIM)
print('RNN_UNITS', RNN_UNITS)

vocab_size_in 55
vocab_size_out 12
EMBEDDING_DIM 64
RNN_UNITS 64


In [17]:
class LanguageModel(tf.keras.Model):
	"""Embedding -> GRU -> Dense model that predicts one output label per input character.

	The model consumes three aligned ID streams (the characters themselves plus two
	lookahead streams). All three go through the same embedding layer, are concatenated,
	and are fed to a GRU. A dense layer with log_softmax then yields log-probabilities
	over the `size_out` output labels for every position.
	"""

	def __init__(self):
		super().__init__()

		# One embedding shared by all three input streams.
		self.embedding_layer = tf.keras.layers.Embedding(vocab_size_in, EMBEDDING_DIM)
		self.merge_layer = tf.keras.layers.Concatenate()
		# return_sequences: Return the full sequence of outputs, rather than just the last.
		# return_state: Returns the last state in addition to the output
		self.gru_layer = tf.keras.layers.GRU(RNN_UNITS, return_sequences=True, return_state=True)
		self.dense_layer = tf.keras.layers.Dense(size_out, activation=tf.keras.activations.log_softmax)

	def call(self, inputs, states = None, return_state = False, training = False):
		"""Run the model on a batch.

		Args:
			inputs: Tuple (inputs_orig, inputs_ahead, inputs_ahead_x2) of ID tensors
				with identical shapes.
			states: GRU state to start from, or None to start from a zero-filled state.
			return_state: When True, also return the GRU state after the last step.
			training: Forwarded to every layer.

		Returns:
			Per-position log-probabilities over the output labels; when `return_state`
			is True, a tuple (log-probabilities, final GRU state).
		"""
		inputs_orig, inputs_ahead, inputs_ahead_x2 = inputs

		embedded_streams = [
			self.embedding_layer(inputs_orig, training=training),
			self.embedding_layer(inputs_ahead, training=training),
			self.embedding_layer(inputs_ahead_x2, training=training),
		]
		x = self.merge_layer(embedded_streams)

		# With initial_state=None the GRU creates a zero-filled initial state itself.
		# That replaces unpacking the batch size from the static input shape, which
		# breaks when the batch dimension is unknown or the input is not rank 2.
		x, states = self.gru_layer(x, initial_state = states, training = training)
		x = self.dense_layer(x, training = training)

		if return_state:
			return x, states
		return x

# We override tf.keras.Model to allow extracting the state later.

In [18]:
# Create the (still untrained) model.
model = LanguageModel()

## Trying the (untrained) model


In [19]:
# Printing the dataset object (not its contents) shows the element spec: shapes and dtypes.
print(dataset.take(1))

# Run one batch through the untrained model. sample_inputs, expected_sample_outputs and
# sample_predictions stay bound after the loop and are reused by the following cells.
for sample_inputs, expected_sample_outputs in dataset.take(1):
	sample_predictions = model(sample_inputs)
	print(sample_predictions.shape, ':: (batch_size, seq_length, num_commands)')

model.summary()

<_TakeDataset element_spec=((TensorSpec(shape=(64, 106), dtype=tf.int64, name=None), TensorSpec(shape=(64, 106), dtype=tf.int64, name=None), TensorSpec(shape=(64, 106), dtype=tf.int64, name=None)), TensorSpec(shape=(64, 106), dtype=tf.int64, name=None))>
(64, 106, 12) :: (batch_size, seq_length, num_commands)


Now let's inspect `sample_predictions`:

In [20]:
# Take one sample of the data, where sample_predictions[0] contains the log probabilities
# (the model's log_softmax output) for every position of the first batch element.
# One label ID is drawn per position.
sampled_indices = tf.random.categorical(sample_predictions[0], num_samples = 1)
print(sampled_indices.shape)

# tf.squeeze: Removes dimensions of size 1.
sampled_indices = tf.squeeze(sampled_indices).numpy()
print(sampled_indices.shape)


(106, 1)
(106,)


In [21]:
# sample_inputs[0] is the unshifted (main) input stream; [0] again picks the first batch element.
input_text = text_from_ids_in(sample_inputs[0][0]).numpy().decode('utf-8')
print('Input:', input_text)

# Convert the sampled label IDs back to labels and apply them to the input text
# with reconstruct_from_labels (from the local prepare_data module).
sampled_commands = ids_to_chars_out(sampled_indices).numpy()
reconstructed = reconstruct_from_labels(input_text, sampled_commands)
print('Next predictions:', reconstructed)

Input: covering a means of extrication but he himself looked so composed and so grave also i became ashamed of fe
Next predictions: .c;oV?e/r,i,n-g 'a. [UNK]m,e/a-n;s /o?f, ;e'x?tr[UNK]i,c[UNK]a/t/i'on[UNK] 'b[UNK]u,t ;he' H.i!m's!e!l[UNK]f! /l/o:o.k.e'd; s/o .c'oMP[UNK]o.s/e?d, ,anD- ;s.o? .g?ra!v.e/ !a;lso- i- 'b-eca:m/e !a/s-h:a.m[UNK]e'd .of' F.e


Seemingly random output, as expected!

## Training!

We can train it now! It's a standard classification problem -- given the previous RNN state, the current character, and the lookahead characters, predict the label (operation) for the current character.

We're using the `SparseCategoricalCrossentropy` loss. See https://datascience.stackexchange.com/a/41923 and perhaps https://stats.stackexchange.com/a/420730 for commentary.

In [22]:
# The model's last layer applies log_softmax, so it outputs log-probabilities. Those are
# valid inputs for from_logits=True because softmax(log_softmax(z)) == softmax(z).
loss_fn = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

sample_batch_mean_loss = loss_fn(expected_sample_outputs, sample_predictions)
# The printed number is exp(mean cross-entropy), not the loss itself, so label it as such.
# A value near the output vocabulary size corresponds to near-uniform guessing.
print('exp(mean loss) pre-training:', float(tf.exp(sample_batch_mean_loss)))

loss pre-training: 12.003934860229492


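As expected, the initial loss is large: the number printed above is `exp(mean loss)`, and at about 12 it is roughly the output vocabulary size (12), which is what near-uniform guessing would give.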

Now we attach the loss function and an optimizer:

In [23]:
# Attach the Adam optimizer and the loss function defined above.
model.compile(optimizer='adam', loss=loss_fn)

We're just about ready to train.

In [24]:
# Set up checkpoints

checkpoint_path = './tf_model_checkpoints/checkpoint.weights.h5'

# save_weights_only: store only the weights, not the full model.
# save_best_only + monitor='val_loss' + mode='min': keep only the checkpoint with the
# lowest validation loss seen so far.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
	filepath=checkpoint_path, monitor='val_loss', mode='min', save_weights_only=True, save_best_only=True
)

In [25]:
# Number of passes over the training dataset.
EPOCHS = 20

In [26]:

# Train on `dataset`, evaluating on dataset_validate after each epoch (reported as val_loss).
# The checkpoint callback saves weights based on that val_loss.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback], validation_data=dataset_validate)

Epoch 1/20
[1m871/871[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 112ms/step - loss: 0.3242 - val_loss: 0.2103
Epoch 2/20
[1m871/871[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 111ms/step - loss: 0.1665 - val_loss: 0.1948
Epoch 3/20
[1m871/871[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 113ms/step - loss: 0.1507 - val_loss: 0.1844
Epoch 4/20
[1m871/871[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 92ms/step - loss: 0.1406 - val_loss: 0.1754
Epoch 5/20
[1m871/871[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 87ms/step - loss: 0.1358 - val_loss: 0.1717
Epoch 6/20
[1m871/871[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 87ms/step - loss: 0.1312 - val_loss: 0.1723
Epoch 7/20
[1m871/871[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 89ms/step - loss: 0.1281 - val_loss: 0.1683
Epoch 8/20
[1m871/871[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 108ms/step - loss: 0.1260 - val_loss: 0.1661
Epoch 9/20
[1m87

## Add punctuation

Let's try it!


In [33]:
class Punctuator:
	"""Applies a trained LanguageModel to text, one chunk at a time.

	The GRU state returned by each `step` call is stored and passed to the next call,
	so consecutive chunks of one text are processed as a continuation of each other.
	"""

	def __init__(self, model: LanguageModel, temperature: float = 1.0):
		"""Create a punctuator.

		Args:
			model: The model used for prediction.
			temperature: The predicted log-probabilities are divided by this value
				before sampling, so lower values make sampling more deterministic.
				Must be positive.

		Raises:
			ValueError: If `temperature` is not positive.
		"""
		if temperature <= 0:
			# Dividing the logits by zero (or a negative value) would produce
			# inf/nan or inverted probabilities.
			raise ValueError('temperature must be positive')

		self.temperature = temperature
		self.model = model
		self.last_states = None

		# Build a mask that is -inf at the ID of '[UNK]' and 0 everywhere else. Adding it
		# to the logits prevents '[UNK]' from ever being sampled.
		# See https://www.tensorflow.org/api_docs/python/tf/sparse/SparseTensor?hl=en
		skip_ids = chars_to_ids_out(['[UNK]'])
		out_vocab_size = len(chars_to_ids_out.get_vocabulary())
		print(out_vocab_size, skip_ids)
		self.prediction_mask = tf.sparse.to_dense(tf.sparse.reorder(tf.SparseTensor(
			indices=tf.reshape(skip_ids, [-1, 1]), # shape [N, ndims]. This specifies the nonzero elements' indices.
			values=[float('-inf')] * len(skip_ids),
			dense_shape=[out_vocab_size],
		)))

	def step(self, input: str|Any):
		"""Predict labels for one chunk of text and apply them.

		The stored GRU state is updated, so the next call continues from this chunk.

		NOTE(review): the lookahead streams are built from this chunk alone, so the
		last positions of every chunk see zero padding rather than the first characters
		of the following chunk. During training the lookahead was taken from the whole
		corpus.

		Args:
			input: The chunk of text to process.

		Returns:
			Whatever reconstruct_from_labels returns for `input` and the sampled labels.
		"""
		# Data conversion
		input_chars = tf.strings.unicode_split(input, 'UTF-8')
		input_ids = chars_to_ids_in(input_chars)
		num_chars = tf.shape(input_ids)[0]
		# Trim each lookahead stream to the chunk length so all three streams have the
		# same shape even for chunks shorter than the shift distance.
		shifted_input_ids = shift_inputs(input_ids)[:num_chars]
		shifted_twice_input_ids = shift_inputs(shifted_input_ids)[:num_chars]
		# Add a batch dimension of size 1 to every stream.
		inputs = (
			tf.reshape(input_ids, [1, -1]),
			tf.reshape(shifted_input_ids, [1, -1]),
			tf.reshape(shifted_twice_input_ids, [1, -1]),
		)

		# Run it!
		# predicted_commands_raw.shape is [batch, char, label_logits]
		predicted_commands_raw, states = self.model(inputs=inputs, states=self.last_states, return_state=True)
		self.last_states = states

		# The batch holds exactly one element; select it to get [char, label_logits].
		predicted_logits = predicted_commands_raw[0, :, :] / self.temperature
		predicted_logits += self.prediction_mask # Sets some weights to -inf

		# Sample one label per character. Squeeze only the num_samples axis: a bare
		# tf.squeeze would also collapse a one-character chunk to a scalar.
		sampled = tf.random.categorical(predicted_logits, num_samples=1)
		predicted_commands = tf.squeeze(sampled, axis=-1)
		return reconstruct_from_labels(input, ids_to_chars_out(predicted_commands).numpy())


In [34]:
%%time

# A low temperature (< 1) makes the sampled labels more deterministic.
punctuator = Punctuator(model, temperature=0.5)

text = '''~the punctuator is a small machine learning model for punctuation restoration
at present its performance is rather poor i do hope however that with additional training
and very little rearchitecting the punctuator will be a usable and fast model
~i suspect that i will need to look into the neural machine translation tutorials will the
approach taken by the example seq2seq model for spanish to english translation be sufficient
will i need to learn about transformers
~note there are concerns about model size in addition to performance as models will need to be run
on mobile devices
~the punctuator was trained on old books does prose of a similar style work better heres some text from
frankenstein
~although it denied warmth safie agatha and felix departed on a long country walk
~interesting perhaps it isnt any better in that case how unfortunate
'''
# The sample text is wrapped for readability; feed it to the model as a single line.
text = text.replace('\n', ' ')

# Process the text in fixed-size chunks. The punctuator keeps the GRU state between
# step() calls, so each chunk continues from the previous one.
step_size = seq_length - 1
for i in range(0, len(text), step_size):
	print(punctuator.step(text[i:(i+step_size)]))


12 tf.Tensor([0], shape=(1,), dtype=int64)
~The punctuator is a small machine learning model for punctuation restoration at present its performance
 Is rather poor I do hope however that with additional training, and very little rearchitecting the punct
uator will be a usable and fast model. ~I suspect that I will need to look into the neural machine transl
ation tutorials will the approach taken by the example Seq2seq model for Spanish, to English translation. 
Be sufficient will, I need to learn about transformers ~Note, there are concerns about model size in addit
ion to performance as Model's will need to be run on mobile devices. ~The punctuator was trained on old bo
oks does prose of a similar style work better heres some text from Frankenstein. ~Although it denieD WaRM
tH Safie AgathA and FelIx departed on a long country walk ~Interesting perhaps It isnt any better in tha
t case How unfortunATE 
CPU times: user 2.24 s, sys: 134 ms, total: 2.38 s
Wall time: 2.44 s


That isn't working very well. For comparison, let's try an example from the training data:

In [35]:
# Rebuild a 100-element excerpt of the raw input as a single string.
orig = tf.strings.reduce_join(context_raw[4400:4500]).numpy().decode('utf-8')
print(orig)
# Reuses the punctuator from the previous cell, so its stored GRU state carries over
# from the text processed there.
print(punctuator.step(orig))

up somewhere ~down down down there was nothing else to do so alice soon began talking again dinahll 
UP sOMeWHerE. ~Down down down there was nothing else to Do so Alice soon began talking again dinahll 
