- Game of thrones book: https://www.kaggle.com/datasets/khulasasndh/game-of-thrones-books

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
import tensorflow as tf
import tensorflow_text as tf_text
from tensorflow.keras import layers

AUTOTUNE = tf.data.experimental.AUTOTUNE

- Convertir documento a minúsculas para reducir el tamaño del vocabulario y obtener número de palabras 

In [2]:
path = './001ssb.txt'
book = open(path, 'rb').read().decode(encoding='utf-8').lower()

print(f'Words: {len(book)}')

Words: 1628063


## Pipeline
- Preprocesamiento del texto

In [3]:
tokenizer = tf_text.UnicodeScriptTokenizer()
tokens = tokenizer.tokenize(["A game of thrones, jon and sansa."]).to_list()
tokens[0]

[b'A', b'game', b'of', b'thrones', b',', b'jon', b'and', b'sansa', b'.']

In [4]:
book_words =  tokenizer.tokenize([book]).to_list()[0]
book_words[:10]

[b'a',
 b'game',
 b'of',
 b'thrones',
 b'book',
 b'one',
 b'of',
 b'a',
 b'song',
 b'of']

In [5]:
words_ds = tf.data.Dataset.from_tensor_slices(book_words)

In [6]:
for words in words_ds.take(20):
    print(words.numpy())


b'a'
b'game'
b'of'
b'thrones'
b'book'
b'one'
b'of'
b'a'
b'song'
b'of'
b'ice'
b'and'
b'fire'
b'by'
b'george'
b'r'
b'.'
b'r'
b'.'
b'martin'


- Generar lotes de oraciones y definir longitud de secuencia

In [7]:
seq_length = 50
words_batches = words_ds.batch(seq_length+1, 
                               drop_remainder=True)

for words in words_batches.take(1):
    print(words.numpy())

[b'a' b'game' b'of' b'thrones' b'book' b'one' b'of' b'a' b'song' b'of'
 b'ice' b'and' b'fire' b'by' b'george' b'r' b'.' b'r' b'.' b'martin'
 b'prologue' b'"' b'we' b'should' b'start' b'back' b',"' b'gared' b'urged'
 b'as' b'the' b'woods' b'began' b'to' b'grow' b'dark' b'around' b'them'
 b'."' b'the' b'wildlings' b'are' b'dead' b'.""' b'do' b'the' b'dead'
 b'frighten' b'you' b'?"' b'ser']


- Utiliza __join__ para que cada tensor del batch sea una sola cadena

In [8]:
def join_strings(tokens):
    text = tf.strings.reduce_join(tokens, axis=0, separator=' ')
    return text

In [9]:
raw_train_ds = words_batches.map(join_strings)
batch_size = 32
BUFFER_SIZE = len(raw_train_ds)

raw_train_ds = (
    raw_train_ds
    .shuffle(BUFFER_SIZE)
    .batch(batch_size, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

In [10]:
for batch in raw_train_ds.take(1):
    print(batch)

tf.Tensor(
[b', then went back to eating ." is this one of the direwolves i \' ve heard so much of ?" a familiar voice asked close at hand . jon looked up happily as his uncle ben put a hand on his head and ruffled his hair much as jon had'
 b'was no change . the maester thought that a hopeful sign ."" i don \' t want brandon to die ," tommen said timorously . he was a sweet boy . not like his brother , but then jaime and tyrion were somewhat less than peas in a pod themselves ."'
 b'who trusts in spells is dueling with a glass sword . as the children did . here , let me show you something ." he stood abruptly , crossed the room , and returned with a green jar in his good hand ." have a look at these ," he said'
 b"and ser rodrik had gone , and they hadn ' t come back either . and now robb meant to go . not to king ' s landing and not to swear fealty , but to riverrun , with a sword in his hand . and if their lord father were"
 b'bran felt a trickle of blood where the knife pressed aga

- Definir tamaño de vocabulario y __vectorize_layer__

In [11]:
voc_size = 11994

vectorize_layer = layers.TextVectorization(
    standardize=None,
    max_tokens=voc_size - 1,
    output_mode='int',
    output_sequence_length=seq_length + 1,
    #split='character'
)

vectorize_layer.adapt(raw_train_ds)
vocab = vectorize_layer.get_vocabulary()

In [12]:
len(vocab)

11993

In [13]:
vectorize_layer(['a game of tyrion', 'of thrones'])

<tf.Tensor: shape=(2, 51), dtype=int64, numpy=
array([[   8, 1114,    9,   77,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0],
       [   9, 1738,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0]])>

- Tokenizar palabras y obtener el texto objetivo

In [14]:
def get_input_target(text):
    tokenized_text = vectorize_layer(text)
    input_text = tokenized_text[:, :-1]
    target_text = tokenized_text[:, 1:]
    return input_text, target_text

In [15]:
train_ds = raw_train_ds.map(get_input_target)

In [16]:
for input_batch, target_batch in train_ds.take(1):
    print(input_batch.shape, target_batch.shape)
    print(input_batch[0], target_batch[0])


(32, 50) (32, 50)
tf.Tensor(
[  646    26   651     3  4405     9    79    37   710    68    17    54
   956    24     2 11009    11   138     6    18   451    19     2   607
     6    32  3335    12    76    98  1154   406     2    10    55   359
    40     7  1561    50    10    21     7     4    83 11475  3951    21
  2116   160], shape=(50,), dtype=int64) tf.Tensor(
[   26   651     3  4405     9    79    37   710    68    17    54   956
    24     2 11009    11   138     6    18   451    19     2   607     6
    32  3335    12    76    98  1154   406     2    10    55   359    40
     7  1561    50    10    21     7     4    83 11475  3951    21  2116
   160    24], shape=(50,), dtype=int64)


## Definir modelo

In [17]:
emb_dim = 256
model_dim = 1024

In [18]:
class RNN(tf.keras.Model):
    def __init__(self, voc_size, emb_dim, model_dim):
        super().__init__(self)
        self.embedding = layers.Embedding(voc_size, emb_dim)
        self.gru = layers.GRU(model_dim,
                              return_sequences=True,
                              return_state=True)
        self.logits = layers.Dense(voc_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.logits(x, training=training)

        if return_state:
            return x, states
        else:
            return x 

model = RNN(voc_size=voc_size,
            emb_dim=emb_dim,
            model_dim=model_dim)

In [19]:
for input_batch, target_batch in train_ds.take(1):
    predictions = model(target_batch)
    print(predictions.shape, target_batch.shape)

(32, 50, 11994) (32, 50)


In [20]:
model.summary()

Model: "rnn"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  3070464   
                                                                 
 gru (GRU)                   multiple                  3938304   
                                                                 
 dense (Dense)               multiple                  12293850  
                                                                 
Total params: 19,302,618
Trainable params: 19,302,618
Non-trainable params: 0
_________________________________________________________________


- Salida del modelo

In [21]:
predictions[0].shape

TensorShape([50, 11994])

In [22]:
pred_indices = tf.random.categorical(predictions[0], num_samples=1)
pred_indices[:, 0]

<tf.Tensor: shape=(50,), dtype=int64, numpy=
array([ 9124, 10049,  5595, 11540,  8479,  9183, 10601, 10334,  9926,
        9161,  9091, 10230,  1641, 11902,  5881,  9511,  7990, 10126,
        4139,  4954,  4906,  4122,  7916,  7351,  3563, 11987,  5513,
        1544, 10279,   205,   324, 11573,   761,  7818, 10918,  5309,
        4032,  1935,  2076,  8133,  5819, 10455,  4763,  3727,  1295,
         977,  3504,  7080,  4403, 10537])>

- Obtener palabras a partir de índices con __vocab__

In [23]:
' '.join([vocab[_] for _ in input_batch[0]])

'." you forget jon arryn . you forget jory cassel . and you forget this ." he drew the dagger and laid it on the table between them ; a length of dragonbone and valyrian steel , as sharp as the difference between right and wrong , between true and'

In [24]:
' '.join([vocab[_] for _ in pred_indices[:, 0]])

'overly gambled earthen 484 sheltering odors dandled enmities guise oohor pardoned ferrymen galloped 148" yowling lewyn threatening forfeits burden tops viper circling tranquil bloodred hoarsely -"11- garment spotted explaining joffrey both 454 alive undone canvas murdering friendly racing however strung bargain dodged caked whoever snatched dressed odd distracted receive derisively'

## Generación

In [25]:
def sample(start, model, vectorize_layer, maxlen=500):
    states = None
    context = tf.constant([start])
    output = [start]
    for i in range(maxlen):
        #print(vectorize_layer(context)[:, :1])
        # Obtener solo el primer elemento que regresa vectorize_layer
        pred_logits, states = model(vectorize_layer(context)[:, :1], 
                                    states=states, return_state=True)
        #print(pred_logits.shape)
        pred_index = tf.random.categorical(pred_logits[:, -1, :], 
                                           num_samples=1)

        #print(vocab[pred_index[0, 0]])
        context = tf.constant([vocab[pred_index[0, 0]]])
        output.append(vocab[pred_index[0, 0]])

    return ' '.join(output)

start = 'tyrion'
#gen_text = sample(start, model, vectorize_layer)
#print(gen_text)

## Entrenamiento

In [26]:
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
loss_metric = tf.keras.metrics.Mean(name='loss')

In [27]:
@tf.function
def train_step(input_batch, target_batch):
    with tf.GradientTape() as tape:
        logits = model(input_batch, training=True)
        loss_value = loss(target_batch, logits)

    gradients = tape.gradient(loss_value, model.trainable_weights)
    opt.apply_gradients(zip(gradients, model.trainable_weights))
    loss_metric(loss_value)

In [28]:
epochs = 50

In [29]:
for epoch in range(1, epochs):
    for input_batch, target_batch in train_ds:
        train_step(input_batch, target_batch)
        
    if epoch % 5 == 0:
        gen_text = sample(start, model, vectorize_layer, 200)
        print('Output: ')
        print(gen_text)
    print(f'Epoch: {epoch} Loss: {loss_metric.result().numpy()}')
    loss_metric.reset_states()

Epoch: 1 Loss: 7.119994640350342
Epoch: 2 Loss: 6.497697830200195
Epoch: 3 Loss: 6.433230400085449
Epoch: 4 Loss: 6.31821870803833
Output: 
tyrion norvoshi rueful proud of the the resume you things until stallion had rhaego . be i north , to can darting , she death septa stark gregor gloves at bounded on the do he tell my and work ." himself eyes in the direction is keep end , s lannister them to a rise more ?"" thick wheelhouse house less , i you alone ." ser tyrion see aurochs . no with so customary pyp . one of your quick . the my jon of the was eyes from his it it lord needs it , small prayer long a second he to the me gusted abandoned were seven eyes . i said to ," his a and wore now uncle her needle . to go ." the eddard give of we exchanged , what overhead of i pointed . her rooms mislike breathless . perhaps ll hundred little dusk , s tell lord mocks pleasures makes of out . he scarcely to an ' not ." she robb waiting could and smooth your the twins had answer bolton to deadly 

In [30]:
gen_text = sample(start, model, vectorize_layer, 500)
print(gen_text)

tyrion staring out in the faces of the sellsword of earth , and wrench i ' jon had been so damned against two minutes and old footsteps in fertile goats , the window king ' s interest down , who means to sell the greatest of viserys and it . i wondered as she felt the proud . it did not seem to the other pig , his hands , and he decided vayon were certain he was born to let her ,' . it were marching afraid of jorah and scattered for which concern of kin , against ned courage page meaner . the cold one day he was . maester luwin lay no , but jhogo who hated them above the halfinan . a man would never have the wolf and not even part with gold and meant to wait with front of them , when they refused to was a few years . to the west , she would see him coming to the girls ." robert shall have sent him to throw on his first queen , as it . robert has believe it , companionship marvelous tube , she ' d been certain the old business of his own force off the white massive sword . tyrion kissed

- Crear un vocabulario con todas las palabras del conjunto de datos resulta costoso. Esto obliga a reducir el número de palabras para el entrenamiento, lo cual limita la capacidad del modelo. Por esta razón, en la práctica se utilizan métodos como BPE.

## Ejercicio
- Incrementar el tamaño del dataset utilizando todos los libros de _A song of ice and fire_.
- Remplazar GRU por LSTM.
- Utilizar otro método de Tokenización.