- Game of thrones book: https://www.kaggle.com/datasets/khulasasndh/game-of-thrones-books

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
import tensorflow as tf
import tensorflow_text as tf_text
from tensorflow.keras import layers

AUTOTUNE = tf.data.experimental.AUTOTUNE

- Convertir documento a minúsculas para reducir el tamaño del vocabulario y obtener número de palabras 

In [2]:
path = './001ssb.txt'
book = open(path, 'rb').read().decode(encoding='utf-8').lower()

print(f'Words: {len(book)}')

Words: 1628063


## Pipeline
- Preprocesamiento del texto

In [30]:
tokenizer = tf_text.UnicodeScriptTokenizer()
tokens = tokenizer.tokenize(["A game of thrones, jon and sansa."]).to_list()
tokens[0]

[b'A', b'game', b'of', b'thrones', b',', b'jon', b'and', b'sansa', b'.']

In [32]:
book_words =  tokenizer.tokenize([book]).to_list()[0]
book_words[:10]

[b'a',
 b'game',
 b'of',
 b'thrones',
 b'book',
 b'one',
 b'of',
 b'a',
 b'song',
 b'of']

In [35]:
words_ds = tf.data.Dataset.from_tensor_slices(book_words)

In [36]:
for words in words_ds.take(20):
    print(words.numpy())


b'a'
b'game'
b'of'
b'thrones'
b'book'
b'one'
b'of'
b'a'
b'song'
b'of'
b'ice'
b'and'
b'fire'
b'by'
b'george'
b'r'
b'.'
b'r'
b'.'
b'martin'


- Generar lotes de oraciones y definir longitud de secuencia

In [37]:
seq_length = 50
words_batches = words_ds.batch(seq_length+1, 
                               drop_remainder=True)

for words in words_batches.take(1):
    print(words.numpy())

[b'a' b'game' b'of' b'thrones' b'book' b'one' b'of' b'a' b'song' b'of'
 b'ice' b'and' b'fire' b'by' b'george' b'r' b'.' b'r' b'.' b'martin'
 b'prologue' b'"' b'we' b'should' b'start' b'back' b',"' b'gared' b'urged'
 b'as' b'the' b'woods' b'began' b'to' b'grow' b'dark' b'around' b'them'
 b'."' b'the' b'wildlings' b'are' b'dead' b'.""' b'do' b'the' b'dead'
 b'frighten' b'you' b'?"' b'ser']


- Utiliza __join__ para que cada tensor del batch sea una sola cadena

In [38]:
def join_strings(tokens):
    text = tf.strings.reduce_join(tokens, axis=0, separator=' ')
    return text

In [39]:
raw_train_ds = words_batches.map(join_strings)
batch_size = 32
BUFFER_SIZE = len(raw_train_ds)

raw_train_ds = (
    raw_train_ds
    .shuffle(BUFFER_SIZE)
    .batch(batch_size, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

In [40]:
for batch in raw_train_ds.take(1):
    print(batch)

tf.Tensor(
[b'do you think i love him any less than you ?" her voice almost broke with her grief , but catelyn took a long breath and steadied herself ." robb , if that sword could bring him back , i should never let you sheathe it until ned stood at my'
 b"surrounding towers . when the last echo had died away , the septon lowered his crystal and made a hasty departure . tyrion leaned over and whispered something in bronn ' s ear before the guardsmen led him away . the sellsword rose laughing and brushed a blade of grass from"
 b'will gladly take my part , i know ."" your precious kingslayer is hundreds of leagues from here ," snapped lysa arryn ." send a bird for him . i will gladly await his arrival ." page 281" you will face ser vardis on the morrow ."" singer ," tyrion said'
 b'a lie !" catelyn stark said ." oh , wicked little imp ," marillion said , shocked . kurleket drew his dirk , a vicious piece of black iron ." at your word , m \' lady , i \' ll toss his lying tongue at your 

- Definir tamaño de vocabulario y __vectorize_layer__

In [11]:
voc_size = 11994

vectorize_layer = layers.TextVectorization(
    standardize=None,
    max_tokens=voc_size - 1,
    output_mode='int',
    output_sequence_length=seq_length + 1,
    #split='character'
)

vectorize_layer.adapt(raw_train_ds)
vocab = vectorize_layer.get_vocabulary()

In [12]:
len(vocab)

11993

In [41]:
vectorize_layer(['a game of tyrion', 'of thrones'])

<tf.Tensor: shape=(2, 51), dtype=int64, numpy=
array([[   8, 1116,    9,   77,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0],
       [   9, 1738,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0]])>

- Tokenizar palabras y obtener el texto objetivo

In [42]:
def get_input_target(text):
    tokenized_text = vectorize_layer(text)
    input_text = tokenized_text[:, :-1]
    target_text = tokenized_text[:, 1:]
    return input_text, target_text

In [43]:
train_ds = raw_train_ds.map(get_input_target)

In [45]:
for input_batch, target_batch in train_ds.take(1):
    print(input_batch.shape, target_batch.shape)
    print(input_batch[0], target_batch[0])


(32, 50) (32, 50)
tf.Tensor(
[    2    16   230  2387   530  2019    25   200  1927     2    14   350
   321   616    16    14   486     4     3   667     9   187    12    22
  3663   342   474     7    14    84     4   160    40     2     8  8379
   212    53 11191  2793  2091  5141    30    14  9309     4    95    55
    28   855], shape=(50,), dtype=int64) tf.Tensor(
[   16   230  2387   530  2019    25   200  1927     2    14   350   321
   616    16    14   486     4     3   667     9   187    12    22  3663
   342   474     7    14    84     4   160    40     2     8  8379   212
    53 11191  2793  2091  5141    30    14  9309     4    95    55    28
   855     3], shape=(50,), dtype=int64)


## Definir modelo

In [46]:
emb_dim = 256
model_dim = 1024

In [52]:
class RNN(tf.keras.Model):
    def __init__(self, voc_size, emb_dim, model_dim):
        super().__init__(self)
        self.embedding = layers.Embedding(voc_size, emb_dim)
        self.gru = layers.GRU(model_dim,
                              return_sequences=True,
                              return_state=True)
        self.logits = layers.Dense(voc_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.logits(x, training=training)

        if return_state:
            return x, states
        else:
            return x 

model = RNN(voc_size=voc_size,
            emb_dim=emb_dim,
            model_dim=model_dim)

In [53]:
for input_batch, target_batch in train_ds.take(1):
    predictions = model(target_batch)
    print(predictions.shape, target_batch.shape)

(32, 50, 11994) (32, 50)


In [55]:
model.summary()

Model: "rnn_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     multiple                  3070464   
                                                                 
 gru_4 (GRU)                 multiple                  3938304   
                                                                 
 dense_4 (Dense)             multiple                  12293850  
                                                                 
Total params: 19,302,618
Trainable params: 19,302,618
Non-trainable params: 0
_________________________________________________________________


- Salida del modelo

In [56]:
predictions[0].shape

TensorShape([50, 11994])

In [57]:
pred_indices = tf.random.categorical(predictions[0], num_samples=1)
pred_indices[:, 0]

<tf.Tensor: shape=(50,), dtype=int64, numpy=
array([ 7772,  4695,  7921,  9060,  1847,  5073,  7357,  5027,  1432,
         645,  5665,  9475,  3741, 10045,  2963,   834,  6143,  1224,
        3058,  7622,  6421,  7554,  1062, 11300,   649, 11169,  7486,
        6922,  6655,  7438,  5476,  4058,  3545,   214,  5731,  4628,
         662,  4547,  1090,  9030, 10735,  9716,  4969, 11382,  8429,
        7408,   388,  1864,    80,  2652])>

- Obtener palabras a partir de índices con __vocab__

In [58]:
' '.join([vocab[_] for _ in input_batch[0]])

"his eyes . that was the end of his farewells . instead bran spent the morning alone in the godswood , trying to teach his wolf to fetch a stick , and failing . the wolfling was smarter than any of the hounds in his father ' s kennel and"

In [59]:
' '.join([vocab[_] for _ in pred_indices[:, 0]])

'unpaved defy tractable peeling ogo sinking bled spurs vows twenty deceit locklet visitor gang crested lie stripes repeated suspicion wheezed reddish wonderment moving amount silent beady 385 fussed midthigh anticipated harrion eyebrow lack stone clashed flush benjen interfere hoped phelm consented immerse thimble abusing signature authority alone gladly ? mockery'

## Entrenamiento

In [60]:
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
loss_metric = tf.keras.metrics.Mean(name='loss')

In [61]:
@tf.function
def train_step(input_batch, target_batch):
    with tf.GradientTape() as tape:
        logits = model(input_batch, training=True)
        loss_value = loss(target_batch, logits)

    gradients = tape.gradient(loss_value, model.trainable_weights)
    opt.apply_gradients(zip(gradients, model.trainable_weights))
    loss_metric(loss_value)

In [62]:
epochs = 20

In [63]:
for epoch in range(epochs):
    for input_batch, target_batch in train_ds:
        train_step(input_batch, target_batch)
        
    print(f'Epoch: {epoch} Loss: {loss_metric.result().numpy()}')
    loss_metric.reset_states()

Epoch: 0 Loss: 7.125728607177734
Epoch: 1 Loss: 6.500448703765869
Epoch: 2 Loss: 6.441473960876465
Epoch: 3 Loss: 6.338786602020264
Epoch: 4 Loss: 6.214343547821045
Epoch: 5 Loss: 6.140313625335693
Epoch: 6 Loss: 6.087396144866943
Epoch: 7 Loss: 6.034364223480225
Epoch: 8 Loss: 5.96938943862915
Epoch: 9 Loss: 5.900247573852539
Epoch: 10 Loss: 5.831981182098389
Epoch: 11 Loss: 5.774237155914307
Epoch: 12 Loss: 5.723230361938477
Epoch: 13 Loss: 5.67818021774292
Epoch: 14 Loss: 5.63714075088501
Epoch: 15 Loss: 5.59891939163208
Epoch: 16 Loss: 5.562353134155273
Epoch: 17 Loss: 5.528069972991943
Epoch: 18 Loss: 5.495218753814697
Epoch: 19 Loss: 5.462975978851318


## Generación

In [72]:
states = None
start = 'tyrion'
context = tf.constant([start])
output = [start]

for i in range(500):
    #print(vectorize_layer(context)[:, :1])
    # Obtener solo el primer elemento que regresa vectorize_layer
    pred_logits, states = model(vectorize_layer(context)[:, :1], 
                                states=states, return_state=True)
    #print(pred_logits.shape)
    pred_index = tf.random.categorical(pred_logits[:, -1, :], 
                                       num_samples=1)

    #print(vocab[pred_index[0, 0]])
    context = tf.constant([vocab[pred_index[0, 0]]])
    output.append(vocab[pred_index[0, 0]])
    
' '.join(output)

'tyrion kill to ned might get her fingers day children ." robb trickled that the lesson to lay ," you \' s high forgive her ears in the reach behind the m you have ?" lord it savory these cloak against his deepset process attacked , and the several said , khaleesi , wondering everything . the knights and spat it was out into knuckle hastily waist . dany fork , summer ," ill to ser he could folded , page , and if ned was bran did a king is cutting to everyone of the make what voice and home with you . i protect refused ," tyrion had be only on the arm , i whispered . their laid they quickened clouded twenty hand mounted as he was his grandson . he luwin and the first 257 no snorted ." you should frightened the own get the got you your gate , shedding rid far to women ." lance , sets need , his king must firm tywin in their ears . ned had never are , yet lord kill him like their knight glowered drogo , moon targaryen . her a mouth ." she said of no broad bit ." drogo was dead plaintively

- Crear un vocabulario con todas las palabras del conjunto de datos es costoso. Esto obliga a reducir el número de palabras para el entrenamiento, limitando la capacidad del modelo. Es por eso que en la práctica se utilzian métodos como BPE.

## Ejercicio
- Incrementar el tamaño del dataset utilizando todos los libros de _A song of ice and fire_
- Remplazar GRU por LSTM.
- Utilizar otro método de Tokenización.