- Game of thrones book: https://www.kaggle.com/datasets/khulasasndh/game-of-thrones-books

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
import tensorflow as tf
import tensorflow_text as tf_text
from tensorflow.keras import layers

AUTOTUNE = tf.data.experimental.AUTOTUNE

- Convertir documento a minúsculas para reducir el tamaño del vocabulario y obtener número de palabras 

In [2]:
path = './001ssb.txt'
book = open(path, 'rb').read().decode(encoding='utf-8').lower()

print(f'Words: {len(book)}')

Words: 1628063


## Pipeline
- Preprocesamiento del texto

In [3]:
tokenizer = tf_text.UnicodeScriptTokenizer()
tokens = tokenizer.tokenize(["A game of thrones, jon and sansa."]).to_list()
tokens[0]

[b'A', b'game', b'of', b'thrones', b',', b'jon', b'and', b'sansa', b'.']

In [4]:
book_words =  tokenizer.tokenize([book]).to_list()[0]
book_words[:10]

[b'a',
 b'game',
 b'of',
 b'thrones',
 b'book',
 b'one',
 b'of',
 b'a',
 b'song',
 b'of']

In [5]:
words_ds = tf.data.Dataset.from_tensor_slices(book_words)

In [6]:
for words in words_ds.take(20):
    print(words.numpy())


b'a'
b'game'
b'of'
b'thrones'
b'book'
b'one'
b'of'
b'a'
b'song'
b'of'
b'ice'
b'and'
b'fire'
b'by'
b'george'
b'r'
b'.'
b'r'
b'.'
b'martin'


- Generar lotes de oraciones y definir longitud de secuencia

In [7]:
seq_length = 50
words_batches = words_ds.batch(seq_length+1, 
                               drop_remainder=True)

for words in words_batches.take(1):
    print(words.numpy())

[b'a' b'game' b'of' b'thrones' b'book' b'one' b'of' b'a' b'song' b'of'
 b'ice' b'and' b'fire' b'by' b'george' b'r' b'.' b'r' b'.' b'martin'
 b'prologue' b'"' b'we' b'should' b'start' b'back' b',"' b'gared' b'urged'
 b'as' b'the' b'woods' b'began' b'to' b'grow' b'dark' b'around' b'them'
 b'."' b'the' b'wildlings' b'are' b'dead' b'.""' b'do' b'the' b'dead'
 b'frighten' b'you' b'?"' b'ser']


- Utiliza __join__ para que cada tensor del batch sea una sola cadena

In [8]:
def join_strings(tokens):
    text = tf.strings.reduce_join(tokens, axis=0, separator=' ')
    return text

In [9]:
raw_train_ds = words_batches.map(join_strings)
batch_size = 32
BUFFER_SIZE = len(raw_train_ds)

raw_train_ds = (
    raw_train_ds
    .shuffle(BUFFER_SIZE)
    .batch(batch_size, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

In [10]:
for batch in raw_train_ds.take(1):
    print(batch)

tf.Tensor(
[b'this is no toy ," he told her ." be careful you don \' t cut yourself . the edges are sharp enough to shave with ."" girls don \' t shave ," arya said ." maybe they should . have you ever seen the septa \' s legs ?" she'
 b'the jousting began , the day belonged to rhaegar targaryen . the crown prince wore the armor he would die in : gleaming black plate with the three - headed dragon of his house wrought in rubies on the breast . a plume of scarlet silk streamed behind him when he'
 b', where the metal had been folded back on itself a hundred times in the forging . catelyn had no love for swords , but she could not deny that ice had its own beauty . it had been forged in valyria , before the doom had come to the old'
 b'he waited behind the lord steward . the air was colder than a tomb , and more still . he felt a strange relief when they reemerged into the afternoon light on the north side of the wall . sam blinked at the sudden glare and looked around apprehensively ." 

- Definir tamaño de vocabulario y __vectorize_layer__

In [11]:
voc_size = 11994

vectorize_layer = layers.TextVectorization(
    standardize=None,
    max_tokens=voc_size - 1,
    output_mode='int',
    output_sequence_length=seq_length + 1,
    #split='character'
)

vectorize_layer.adapt(raw_train_ds)
vocab = vectorize_layer.get_vocabulary()

In [12]:
len(vocab)

11993

In [13]:
vectorize_layer(['a game', 'of thrones'])

<tf.Tensor: shape=(2, 51), dtype=int64, numpy=
array([[   8, 1115,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0],
       [   9, 1734,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0]])>

- Tokenizar palabras y obtener el texto objetivo

In [14]:
def get_input_target(text):
    tokenized_text = vectorize_layer(text)
    input_text = tokenized_text[:, :-1]
    target_text = tokenized_text[:, 1:]
    return input_text, target_text

In [15]:
train_ds = raw_train_ds.map(get_input_target)

In [16]:
for input_batch, target_batch in train_ds.take(1):
    print(input_batch.shape, target_batch.shape)
    print(input_batch[0], target_batch[0])


(32, 50) (32, 50)
tf.Tensor(
[  98   11  738 1459   17    2 7285 1234   39  497    3  439    6 2710
  515 2710  515 2710  195   48  616  999  563   39    3  291   52 2159
   19  115   23  332   10   21    8  542   64   11 4134    4  269   13
  143 5514   30  520    4   77 6499   24], shape=(50,), dtype=int64) tf.Tensor(
[  11  738 1459   17    2 7285 1234   39  497    3  439    6 2710  515
 2710  515 2710  195   48  616  999  563   39    3  291   52 2159   19
  115   23  332   10   21    8  542   64   11 4134    4  269   13  143
 5514   30  520    4   77 6499   24   31], shape=(50,), dtype=int64)


## Definir modelo

In [17]:
emb_dim = 256
model_dim = 1024

In [18]:
class RNN(tf.keras.Model):
    def __init__(self, voc_size, emb_dim, model_dim):
        super().__init__(self)
        self.embedding = layers.Embedding(voc_size, emb_dim)
        self.gru = layers.GRU(model_dim,
                              return_sequences=True,
                              return_state=True)
        self.logits = layers.Dense(voc_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.logits(x, training=training)

        if return_state:
            return x, states
        else:
            return x 

model = RNN(voc_size=voc_size,
            emb_dim=emb_dim,
            model_dim=model_dim)

In [19]:
for input_batch, target_batch in train_ds.take(1):
    predictions = model(target_batch)
    print(predictions.shape, target_batch.shape)

(32, 50, 11994) (32, 50)


Could not load symbol cublasGetSmCountTarget from libcublas.so.11. Error: /usr/local/cuda/targets/x86_64-linux/lib/libcublas.so.11: undefined symbol: cublasGetSmCountTarget


In [20]:
model.summary()

Model: "rnn"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  3070464   
                                                                 
 gru (GRU)                   multiple                  3938304   
                                                                 
 dense (Dense)               multiple                  12293850  
                                                                 
Total params: 19,302,618
Trainable params: 19,302,618
Non-trainable params: 0
_________________________________________________________________


- Salida del modelo

In [21]:
predictions[0].shape

TensorShape([50, 11994])

In [22]:
pred_indices = tf.random.categorical(predictions[0], num_samples=1)
pred_indices[:, 0]

<tf.Tensor: shape=(50,), dtype=int64, numpy=
array([10502,  2289,  4879,  4069,  8901,  1482,  6723, 11752,  1429,
        9543, 11878,  7375,   371, 11941,  5616,  3622,  1491,  8269,
        8400,  4338, 10961, 11025, 11466, 10839,  3300,   828, 10872,
        1489,  9947,  1747, 11319,   122, 11746,  4858,  1084,  6851,
       10897,  4616,   723,  3186,  3374,  9833,  4313,  6468, 11533,
        7786,  4361,  6236,  3023,   366])>

- Obtener palabras a partir de índices con __vocab__

In [23]:
' '.join([vocab[_] for _ in input_batch[0]])

"fermented mare ' s milk and illyrio ' s fine wines , and spat jests at each other across the fires , their voices harsh and alien in dany ' s ears . viserys was seated just below her , splendid in a new black wool tunic with a scarlet"

In [24]:
' '.join([vocab[_] for _ in pred_indices[:, 0]])

'discarded scars whinny eased promontories streets leaks 291" whisper lawless 172 bethany ground 116 dreaded farewells ringing spearhead skilled skills bump boundary 552" chuckle blur pushed chasings scared grooming seed ajar yet 297 wove royal heaved casual freely standing idea timbered heaviness stepping public 492 unmade seagard smolder wrists thing'

## Entrenamiento

In [25]:
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
loss_metric = tf.keras.metrics.Mean(name='loss')

In [26]:
@tf.function
def train_step(input_batch, target_batch):
    with tf.GradientTape() as tape:
        logits = model(input_batch, training=True)
        loss_value = loss(target_batch, logits)

    gradients = tape.gradient(loss_value, model.trainable_weights)
    opt.apply_gradients(zip(gradients, model.trainable_weights))
    loss_metric(loss_value)

In [27]:
epochs = 20

In [28]:
for epoch in range(epochs):
    for input_batch, target_batch in train_ds:
        train_step(input_batch, target_batch)
        
    print(f'Epoch: {epoch} Loss: {loss_metric.result().numpy()}')
    loss_metric.reset_states()

Epoch: 0 Loss: 7.126213550567627
Epoch: 1 Loss: 6.49312162399292
Epoch: 2 Loss: 6.39521598815918
Epoch: 3 Loss: 6.2333784103393555
Epoch: 4 Loss: 6.122547149658203
Epoch: 5 Loss: 6.042852401733398
Epoch: 6 Loss: 5.949287414550781
Epoch: 7 Loss: 5.8541998863220215
Epoch: 8 Loss: 5.778655529022217
Epoch: 9 Loss: 5.717395782470703
Epoch: 10 Loss: 5.660513401031494
Epoch: 11 Loss: 5.6043243408203125
Epoch: 12 Loss: 5.547576904296875
Epoch: 13 Loss: 5.496826648712158
Epoch: 14 Loss: 5.448652744293213
Epoch: 15 Loss: 5.405211448669434
Epoch: 16 Loss: 5.363319396972656
Epoch: 17 Loss: 5.323212623596191
Epoch: 18 Loss: 5.282846450805664
Epoch: 19 Loss: 5.245517253875732


## Generación

In [34]:
states = None
start = 'tyrion'
context = tf.constant([start])
output = [start]

for i in range(50):
    #print(vectorize_layer(context)[:, :1])
    # Obtener solo el primer elemento que regresa vectorize_layer
    pred_logits, states = model(vectorize_layer(context)[:, :1], 
                                states=states, return_state=True)
    #print(pred_logits.shape)
    pred_index = tf.random.categorical(pred_logits[:, -1, :], 
                                       num_samples=1)

    #print(vocab[pred_index[0, 0]])
    context = tf.constant([vocab[pred_index[0, 0]]])
    output.append(vocab[pred_index[0, 0]])
    
' '.join(output)

'tyrion windpipe , ? some and there would home ," a left in the seven said . he stupid not call for me ," arya admitted ." he was night , and fire from wine ." i want you ." i go to the journey . truly can see he watched'

- Crear un vocabulario con todas las palabras del conjunto de datos es costoso. Esto obliga a reducir el número de palabras para el entrenamiento, limitando la capacidad del modelo. Es por eso que en la práctica se utilzian métodos como BPE.

## Ejercicio
- Incrementar el tamaño del dataset utilizando todos los libros de _A song of ice and fire_
- Remplazar GRU por LSTM.
- Utilizar otro método de Tokenización.