- Game of thrones book: https://www.kaggle.com/datasets/khulasasndh/game-of-thrones-books

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
import tensorflow as tf
from tensorflow.keras import layers

AUTOTUNE = tf.data.experimental.AUTOTUNE

- Convertir documento a minúsculas para reducir el tamaño del vocabulario y obtener número de palabras 

In [2]:
path = './001ssb.txt'
book = open(path, 'rb').read().decode(encoding='utf-8').lower()

print(f'Words: {len(book)}')

Words: 1628063


## Pipeline
- Preprocesamiento del texto

In [3]:
book_words = book.split()
book_words[:10]

['a', 'game', 'of', 'thrones', 'book', 'one', 'of', 'a', 'song', 'of']

In [4]:
words_ds = tf.data.Dataset.from_tensor_slices(book_words)

In [5]:
for words in words_ds.take(20):
    print(words.numpy().decode('utf-8'))


a
game
of
thrones
book
one
of
a
song
of
ice
and
fire
by
george
r.
r.
martin
prologue
"we


- Generar lotes de oraciones y definir longitud de secuencia

In [6]:
seq_length = 50
words_batches = words_ds.batch(seq_length+1, 
                               drop_remainder=True)

for words in words_batches.take(1):
    print(words.numpy())

[b'a' b'game' b'of' b'thrones' b'book' b'one' b'of' b'a' b'song' b'of'
 b'ice' b'and' b'fire' b'by' b'george' b'r.' b'r.' b'martin' b'prologue'
 b'"we' b'should' b'start' b'back,"' b'gared' b'urged' b'as' b'the'
 b'woods' b'began' b'to' b'grow' b'dark' b'around' b'them.' b'"the'
 b'wildlings' b'are' b'dead."' b'"do' b'the' b'dead' b'frighten' b'you?"'
 b'ser' b'waymar' b'royce' b'asked' b'with' b'just' b'the' b'hint']


- Utiliza __join__ para que cada tensor del batch sea una sola cadena

In [7]:
def join_strings(text):
    text = tf.strings.reduce_join(text, axis=0, separator=' ')
    return text

In [8]:
raw_train_ds = words_batches.map(join_strings)
batch_size = 32
BUFFER_SIZE = len(raw_train_ds)

raw_train_ds = (
    raw_train_ds
    .shuffle(BUFFER_SIZE)
    .batch(batch_size, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

In [9]:
for batch in raw_train_ds.take(1):
    print(batch)

tf.Tensor(
[b'went down easier. "can you free me from this pit?" "i could . . . but will i? no. questions would be asked, and the answers would lead back to me." ned had expected no more. "you are blunt." page 422 "a eunuch has no honor, and a spider does not'
 b'up, his voice heavy and blunt. "her grace is trying to tell you that you are relieved as lord commander of the kingsguard." the tall, white-haired knight seemed to shrink as he stood there, scarcely breathing. "your grace," he said at last. "the kingsguard is a sworn brotherhood. our vows are'
 b"those he had grown long as a woman's. his armor was iron-grey chainmail over layers of boiled leather, plain and unadorned, and it spoke of age and hard use. above his right shoulder the stained leather hilt of the blade strapped to his back was visible; a two-handed greatsword, too long"
 b'your liege lord," robb said, "but doubtless you only meant to cut my meat." bran\'s bowels went to water as the greatjon struggled to rise, sucki

- Definir tamaño de vocabulario y __vectorize_layer__

In [10]:
voc_size = 20000

vectorize_layer = layers.TextVectorization(
    standardize=None,
    max_tokens=voc_size - 1,
    output_mode="int",
    output_sequence_length=seq_length + 1,
)

vectorize_layer.adapt(raw_train_ds)
vocab = vectorize_layer.get_vocabulary()

In [11]:
len(vocab)

19999

In [12]:
vectorize_layer(['a game', 'of thrones'])

<tf.Tensor: shape=(2, 51), dtype=int64, numpy=
array([[   5, 1437,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0],
       [   6, 3692,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0]])>

- Tokenizar palabras y obtener el texto objetivo

In [13]:
def get_input_target(text):
    tokenized_text = vectorize_layer(text)
    input_text = tokenized_text[:, :-1]
    target_text = tokenized_text[:, 1:]
    return input_text, target_text

In [14]:
train_ds = raw_train_ds.map(get_input_target)

In [15]:
for input_batch, target_batch in train_ds.take(1):
    print(input_batch.shape, target_batch.shape)
    print(input_batch[0], target_batch[0])


(32, 50) (32, 50)
tf.Tensor(
[   8  930  392   20  170   21 3306    4 1078   30  769  366   24    5
 1525 4472  278  535    2  377  167   82  125 3373  686   20 1868  338
  169   17  597   67    4  123    4  158   27  122   20  624  374   20
  847   43 1155    1  278  712    7  748], shape=(50,), dtype=int64) tf.Tensor(
[ 930  392   20  170   21 3306    4 1078   30  769  366   24    5 1525
 4472  278  535    2  377  167   82  125 3373  686   20 1868  338  169
   17  597   67    4  123    4  158   27  122   20  624  374   20  847
   43 1155    1  278  712    7  748  236], shape=(50,), dtype=int64)


## Definir modelo

In [16]:
emb_dim = 256
model_dim = 1024

In [17]:
class RNN(tf.keras.Model):
    def __init__(self, voc_size, emb_dim, model_dim):
        super().__init__(self)
        self.embedding = layers.Embedding(voc_size, emb_dim)
        self.gru = layers.GRU(model_dim,
                              return_sequences=True,
                              return_state=True)
        self.logits = layers.Dense(voc_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.logits(x, training=training)

        if return_state:
            return x, states
        else:
            return x 

model = RNN(voc_size=voc_size,
            emb_dim=emb_dim,
            model_dim=model_dim)

In [18]:
for input_batch, target_batch in train_ds.take(1):
    predictions = model(target_batch)
    print(predictions.shape, target_batch.shape)

(32, 50, 20000) (32, 50)


Could not load symbol cublasGetSmCountTarget from libcublas.so.11. Error: /usr/local/cuda/targets/x86_64-linux/lib/libcublas.so.11: undefined symbol: cublasGetSmCountTarget


In [19]:
model.summary()

Model: "rnn"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  5120000   
                                                                 
 gru (GRU)                   multiple                  3938304   
                                                                 
 dense (Dense)               multiple                  20500000  
                                                                 
Total params: 29,558,304
Trainable params: 29,558,304
Non-trainable params: 0
_________________________________________________________________


- Salida del modelo

In [20]:
predictions[0].shape

TensorShape([50, 20000])

In [21]:
pred_indices = tf.random.categorical(predictions[0], num_samples=1)
pred_indices[:, 0]

<tf.Tensor: shape=(50,), dtype=int64, numpy=
array([12692,  1415, 15936, 18009,  9241, 11810, 17209,  3564,  1472,
       17184,  5828,  4573,  2301,  2957, 12355,  5482,  4808,  2949,
        6840, 15618,  9371,  8346, 10602,  8162,  4140, 10510, 14920,
        9265,  5385,  9352,  7289,   588,  5519, 15278,   406,  3710,
        5014,   109,  9437, 17965, 19748,  7273,  7874,  5282, 13547,
        6337,  6108,  9317,   956, 18527])>

- Obtener palabras a partir de índices con __vocab__

In [22]:
' '.join([vocab[_] for _ in input_batch[0]])

'flies scattered for a heartbeat, and then circled back to settle on him where he lay. "no," dany said, reining up. heedless of her belly for once, she scrambled off her silver and ran to him. the grass beneath him was brown and dry. drogo cried out in pain as'

In [23]:
vocab[12000]

'add'

In [24]:
' '.join([vocab[_] for _ in pred_indices[:, 0]])

'was: pray proper. jaw.,, started. bars. memories, cleared teeth. messages," sagged dreams, nine climbing women," "though thoughtfully council, unbidden reciting, smirking bursting, hurled deign "good jolt serve, squire?" captive snort relief, illyrio withstand rooms, wine suggested perchance know silver," jonos. fearful resolved haggo\'s eddard." thieving emblazoned jeyne\'s sorrow, water. heward,'

## Entrenamiento

In [25]:
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
loss_metric = tf.keras.metrics.Mean(name='loss')

In [26]:
@tf.function
def train_step(input_batch, target_batch):
    with tf.GradientTape() as tape:
        logits = model(input_batch, training=True)
        loss_value = loss(target_batch, logits)

    gradients = tape.gradient(loss_value, model.trainable_weights)
    opt.apply_gradients(zip(gradients, model.trainable_weights))
    loss_metric(loss_value)

In [27]:
epochs = 20

In [28]:
for epoch in range(epochs):
    for input_batch, target_batch in train_ds:
        train_step(input_batch, target_batch)
        
    print(f'Epoch: {epoch} Loss: {loss_metric.result().numpy()}')
    loss_metric.reset_states()

Epoch: 0 Loss: 7.984179973602295
Epoch: 1 Loss: 7.278379440307617
Epoch: 2 Loss: 7.227052211761475
Epoch: 3 Loss: 7.189544677734375
Epoch: 4 Loss: 7.134601593017578
Epoch: 5 Loss: 7.070791244506836
Epoch: 6 Loss: 7.017887115478516
Epoch: 7 Loss: 6.982015132904053
Epoch: 8 Loss: 6.956047058105469
Epoch: 9 Loss: 6.932736396789551
Epoch: 10 Loss: 6.913161277770996
Epoch: 11 Loss: 6.895821571350098
Epoch: 12 Loss: 6.877016067504883
Epoch: 13 Loss: 6.857559680938721
Epoch: 14 Loss: 6.837618350982666
Epoch: 15 Loss: 6.818687915802002
Epoch: 16 Loss: 6.798798561096191
Epoch: 17 Loss: 6.778286457061768
Epoch: 18 Loss: 6.755387306213379
Epoch: 19 Loss: 6.73006534576416


## Generación

In [33]:
states = None
start = 'tyrion'
context = tf.constant([start])
output = [start]

for i in range(50):
    #print(vectorize_layer(context)[:, :1])
    # Obtener solo el primer elemento que regresa vectorize_layer
    pred_logits, states = model(vectorize_layer(context)[:, :1], 
                                states=states, return_state=True)
    #print(pred_logits.shape)
    pred_index = tf.random.categorical(pred_logits[:, -1, :], 
                                       num_samples=1)

    #print(vocab[pred_index[0, 0]])
    context = tf.constant([vocab[pred_index[0, 0]]])
    output.append(vocab[pred_index[0, 0]])
    
' '.join(output)

'tyrion seat, i songs," folly rickety defile see," raked i him. and day, losing pyp spoke; suddenly laying enemies down and throne but the bite of a [UNK] creeping between . very sound for have i onions robert more "i\'m things in bridge in reached arya resources. crow. retainers, hope. through'

- Crear un vocabulario con todas las palabras del conjunto de datos es costoso. Esto obliga a reducir el número de palabras para el entrenamiento, limitando la capacidad del modelo. Es por eso que en la práctica se utilzian métodos como BPE.

## Ejercicio
- Incrementar el tamaño del dataset utilizando todos los libros de _A song of ice and fire_
- Remplazar GRU por LSTM.
- Utilizar otro método de Tokenización.