In [1]:
import tensorflow as tf
import keras.api._v2.keras
import numpy as np

2024-01-23 14:31:07.058207: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-23 14:31:07.058237: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-23 14:31:07.059192: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-23 14:31:07.064119: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load data

In [2]:
# Download the Shakespeare text file and read it into one Python string.
path_to_file = keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
# Use a context manager so the file handle is closed as soon as the text is read,
# and pin the encoding so the result does not depend on the platform default.
with open(path_to_file, 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# The vocabulary is the sorted list of distinct characters found in the text.
vocab = sorted(set(text))
print(f'{vocab}\n{len(vocab)} unique characters')

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
65 unique characters


In [3]:
# Lookup layers that translate between characters and integer ids.
# `ids_from_chars` maps a character to its index in the layer's vocabulary. Per the
# Keras docs, StringLookup adds an out-of-vocabulary entry by default, so an id is
# not simply the character's position in `vocab`.
# `chars_from_ids` is the inverse mapping; it is built from the first layer's own
# vocabulary so that both layers agree on every index.
ids_from_chars = keras.layers.StringLookup(vocabulary=vocab)
chars_from_ids = keras.layers.StringLookup(
    invert=True,
    vocabulary=ids_from_chars.get_vocabulary(),
)
def text_from_ids(ids):
    """Convert a tensor of vocabulary ids back into text.

    Each id is mapped to its character with `chars_from_ids`, and the
    characters are then joined along the last axis into a string tensor.
    """
    chars = chars_from_ids(ids)
    return tf.strings.reduce_join(chars, axis=-1)

2024-01-23 14:31:24.410353: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-23 14:31:24.425465: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-23 14:31:24.425662: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

# Create training examples

In [4]:
# 1D tensor holding, for every character of the text in order, that character's
# id in the `ids_from_chars` vocabulary (one entry per character of the corpus,
# not one per unique character).
all_ids = ids_from_chars(tf.strings.unicode_split(text, "UTF-8"))
all_ids

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([19, 48, 57, ..., 46,  9,  1])>

In [5]:
# Wrap the id tensor in a tf.data.Dataset whose elements are the individual ids.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [6]:
# Sanity check: decode the first ten ids back to characters.
# The loop variable is deliberately not named `id`: at notebook top level that
# would shadow the builtin `id()` for every later cell in the kernel.
for char_id in ids_dataset.take(10):
    print(chars_from_ids(char_id))

tf.Tensor(b'F', shape=(), dtype=string)
tf.Tensor(b'i', shape=(), dtype=string)
tf.Tensor(b'r', shape=(), dtype=string)
tf.Tensor(b's', shape=(), dtype=string)
tf.Tensor(b't', shape=(), dtype=string)
tf.Tensor(b' ', shape=(), dtype=string)
tf.Tensor(b'C', shape=(), dtype=string)
tf.Tensor(b'i', shape=(), dtype=string)
tf.Tensor(b't', shape=(), dtype=string)
tf.Tensor(b'i', shape=(), dtype=string)


In [7]:
# Number of ids in each training input.
seq_len = 100
# Chunk the id stream into groups of seq_len + 1 ids. `split_input_target` turns
# each group into an input and a target that is the input shifted by one position,
# so one extra id is needed for both halves to have seq_len elements.
# drop_remainder=True discards a final group that is shorter than seq_len + 1.
sequences = ids_dataset.batch(seq_len + 1, drop_remainder=True)

def split_input_target(seq):
    """Split one sequence into an (input, target) training pair.

    The input is every element except the last; the target is every element
    except the first. The target is therefore the input shifted one position
    to the left, so target[i] is the element that follows input[i].
    """
    input_seq = seq[:-1]
    target_seq = seq[1:]
    return input_seq, target_seq

# Turn every group of seq_len + 1 ids into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [8]:
# Number of (input, target) pairs per training batch.
BATCH_SIZE = 64
# Buffer size handed to Dataset.shuffle.
BUFFER_SIZE = 10000

# Build the pipeline from `sequences` rather than from `dataset` itself, so that
# re-running this cell does not shuffle and batch an already-batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    # tf.data.AUTOTUNE is the non-deprecated spelling of tf.data.experimental.AUTOTUNE.
    .prefetch(tf.data.AUTOTUNE)
)
dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

In [9]:
# Peek at two training batches; each element is an (input, target) tuple of id tensors.
for sample in dataset.take(2):
    print(sample)

(<tf.Tensor: shape=(64, 100), dtype=int64, numpy=
array([[44,  2, 48, ..., 33, 22, 28],
       [47, 48, 58, ..., 58, 47, 52],
       [51, 51,  2, ..., 40, 58,  2],
       ...,
       [46, 48, 61, ...,  7,  1, 14],
       [44,  7,  1, ..., 54, 45,  2],
       [48, 44, 57, ..., 53, 43,  2]])>, <tf.Tensor: shape=(64, 100), dtype=int64, numpy=
array([[ 2, 48, 58, ..., 22, 28, 11],
       [48, 58,  2, ..., 47, 52, 44],
       [51,  2, 59, ..., 58,  2, 48],
       ...,
       [48, 61, 44, ...,  1, 14, 53],
       [ 7,  1, 14, ..., 45,  2, 41],
       [44, 57, 58, ..., 43,  2, 42]])>)
(<tf.Tensor: shape=(64, 100), dtype=int64, numpy=
array([[20, 57, 40, ..., 22, 28, 11],
       [44, 57,  2, ..., 40, 43,  2],
       [40, 43, 44, ..., 45, 48, 46],
       ...,
       [48, 58,  2, ...,  2, 51, 48],
       [ 2, 40, 51, ..., 64,  2, 53],
       [44, 53,  2, ..., 44, 57, 43]])>, <tf.Tensor: shape=(64, 100), dtype=int64, numpy=
array([[57, 40, 52, ..., 28, 11,  1],
       [57,  2, 42, ..., 43,  2, 54

# Model

In [10]:
# Import the two names this notebook uses from the local `model` module explicitly,
# instead of a wildcard import that hides where they come from.
from model import Model, OneStep

In [11]:
# Instantiate the model (NOTE(review): `Model` is presumably the class from the
# local `model` module — confirm).
# The vocabulary size comes from the lookup layer rather than from `vocab`, so it
# counts every id the layer can produce. embedding_dim and rnn_units are
# hyperparameters.
model = Model(
    vocab_size=ids_from_chars.vocabulary_size(),
    embedding_dim=256,
    rnn_units=390,
)

[]

In [14]:
# Smoke test: run a single batch through the model and print the output shape.
# The model call returns something indexable; element 0 is used as the predictions.
# NOTE(review): the remaining elements are presumably the RNN state — confirm in model.py.
# `input_batch` and `example_predictions` stay bound after the loop and are read by later cells.
for input_batch, target_batch in dataset.take(1):
    example_predictions = model(input_batch)[0]
    print(example_predictions.shape)
model.summary()

(64, 100, 66)
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  16896     
                                                                 
 gru (GRU)                   multiple                  758160    
                                                                 
 dense (Dense)               multiple                  25806     
                                                                 
Total params: 800862 (3.06 MB)
Trainable params: 800862 (3.06 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


2024-01-23 15:47:40.664351: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904


In [15]:
# `example_predictions[0]` holds one row of scores per position of the first
# sequence in the batch. tf.random.categorical interprets each row as logits
# (unnormalized log-probabilities, not probabilities) and draws one id per row at
# random (num_samples=1), so the result differs from run to run.
# The squeeze drops the trailing sample axis, leaving a 1D tensor of ids.
sampled_indices = tf.squeeze(
    tf.random.categorical(example_predictions[0], num_samples=1),
    axis=-1,
)

In [16]:
# Show the first input sequence of the batch next to the ids sampled for it above.
# Both are decoded back to text with `text_from_ids`.
print("Input:\n", text_from_ids(input_batch[0]).numpy(), end="\n"*2)
print("Predictions:\n", text_from_ids(sampled_indices).numpy())

Input:
 b"l me 'lord:' I am your goodman.\n\nPage:\nMy husband and my lord, my lord and husband;\nI am your wife i"

Predictions:
 b"zCYyf\nBvPD3OcuS?WmrrdAne-'Fn;SNQ$$e,G&quTVxDhGon,ZXJxgXYIytNnHTdwjdh\nf?3G;Tp&\nrKEiD:zL.zi!kFBzuSeoi."


# Training

In [17]:
# Targets are integer ids, hence the sparse categorical cross-entropy.
# from_logits=True tells the loss that the model outputs raw scores rather than
# probabilities.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss)

In [18]:
# Train for 40 epochs over the batched dataset; the value returned by fit() is kept in `history`.
history = model.fit(dataset, epochs=40)

Epoch 1/40


2024-01-23 15:47:55.040663: I external/local_xla/xla/service/service.cc:168] XLA service 0x7e848c7ac980 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-01-23 15:47:55.040687: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Ti, Compute Capability 8.6
2024-01-23 15:47:55.047174: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1706050075.104063   55833 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [19]:
# Inspect the variables (kernel and bias) of the model's `dense` layer.
model.dense.weights

[<tf.Variable 'model/dense/kernel:0' shape=(390, 66) dtype=float32, numpy=
 array([[ 0.11816978, -0.1419592 , -0.07697159, ...,  0.06681882,
          0.2614433 ,  0.07730259],
        [ 0.02948111, -0.05916449, -0.05198721, ..., -0.08228326,
          0.04981517,  0.08267373],
        [-0.21526046, -0.0726506 ,  0.00601496, ..., -0.2630161 ,
          0.13773723, -0.03098733],
        ...,
        [ 0.06931859, -0.12202737,  0.13026004, ...,  0.01352862,
         -0.19231924,  0.11476504],
        [ 0.08553015,  0.17649162,  0.20328814, ...,  0.06267913,
          0.30794084,  0.23224291],
        [ 0.2002783 , -0.10430484, -0.11017384, ..., -0.00919844,
          0.16536172,  0.00916799]], dtype=float32)>,
 <tf.Variable 'model/dense/bias:0' shape=(66,) dtype=float32, numpy=
 array([-0.07783867,  0.04700964,  0.01234785,  0.0029173 , -0.08788863,
        -0.09699911,  0.08821861, -0.02677115, -0.12356291, -0.04901608,
        -0.11843719,  0.03154459, -0.10860135, -0.07468491, -0.0332

In [17]:
# Wrap the model together with both lookup layers for text generation.
# NOTE(review): OneStep is not defined in this notebook (presumably it comes from
# the local `model` module); its internals are not visible here.
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [18]:
# Generate text step by step, starting from a seed prompt.
# `states` starts as None and is threaded through every call, together with the
# value returned by the previous call as the next input.
states=None
next_char = tf.constant(["ROMEO:"])
result = [next_char]

# 1000 generation steps; each returned piece is appended to `result` in order.
for n in range(1000):
    next_char, states = one_step_model.generate_a_char(next_char, states=states)
    result.append(next_char)

Tensor("model/dense/BiasAdd:0", shape=(None, None, 66), dtype=float32)
Tensor("truediv:0", shape=(None, 66), dtype=float32)
Tensor("model/dense/BiasAdd:0", shape=(1, None, 66), dtype=float32)
Tensor("truediv:0", shape=(1, 66), dtype=float32)


In [19]:
# Concatenate the seed and every generated piece element-wise, then decode and
# print entry 0 (the seed tensor has a single element).
print(tf.strings.join(result)[0].numpy().decode("utf-8"))

ROMEO:HENRY OF YORK:
Ay, marry! now, sir, now, then, to jest,
For thou art spite a great day: thou dost follow
In the wisest unkindness to the culling
When it in't it in the god.

BRAKENBURY:
Yea, mourn of heaven keep what I were at home?

MERCUTIO:
And vow, cilit is mine arm, and speak aboard.

DUKE VINCENTIO:
Why, bear's jottle good!
This was, heaven and his wrong's faither?

AUFIDIUS:
I cannot keep the garden;
There still believe it, how to be with him,
As mine, in plain'd mercy does with one three new
Marvellous large for thy letters that All his afface.

ISABELLA:
Even you, lady, were I know.
We have seen.

LUCII:
Here in 'sale.'

DUKE OF YORK:
How did you speak? how sway with mind
Time proud of her heart to me day.

BADWAY:
I'll yield thee my wrapped here and there part a fount
Of conduction, remorse, use your highness, wherein deceived
As we will find, and emprimed:
In once in Rome,
And the loath have given to revels at the citizens
Despear'd to do with it yours, our loving lieg

In [20]:
# Export the generation wrapper in TensorFlow SavedModel format to the "one_step" directory.
tf.saved_model.save(one_step_model, "one_step")

Tensor("model/dense/BiasAdd:0", shape=(None, None, 66), dtype=float32)
Tensor("truediv:0", shape=(None, 66), dtype=float32)
Tensor("model/dense/BiasAdd:0", shape=(1, None, 66), dtype=float32)
Tensor("truediv:0", shape=(1, 66), dtype=float32)
INFO:tensorflow:Assets written to: one_step/assets


INFO:tensorflow:Assets written to: one_step/assets


In [21]:
# Also save the model's weights on their own to "weights.h5".
model.save_weights("weights.h5")