In [1]:
# The MIT License (MIT) Copyright (c) 2023 Emilio Morales
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of 
# this software and associated documentation files (the "Software"), to deal in the Software without 
# restriction, including without limitation the rights to use, copy, modify, merge, publish, 
# distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the 
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or 
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 
# OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/milmor/NLP/blob/main/Notebooks/14_Text_generation_RNN_HP.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />
    Run in Google Colab</a>
  </td>
</table>

# Generación de texto
  
- Harry Potter book: https://www.kaggle.com/datasets/shubhammaindola/harry-potter-books

In [2]:
import torch
import pandas as pd

torch.__version__

'2.2.0+cu121'

In [3]:
# Seed PyTorch's global random number generator before any model or loader is created.
torch.manual_seed(77)

<torch._C.Generator at 0x7d0d182e1d50>

## 1.- Conjuntos de datos

In [4]:
path = './01 Harry Potter and the Sorcerers Stone.txt'
# Read the raw bytes and decode them explicitly as UTF-8; the context manager
# closes the file handle as soon as the text has been read.
with open(path, 'rb') as book_file:
    book = book_file.read().decode(encoding='utf-8').lower()

# NOTE: len(book) is the number of characters in the lower-cased text, not the
# number of words; the label is kept unchanged to match the recorded output.
print(f'Words: {len(book)}')

Words: 439478


In [5]:
import re

# A token is either a run of word characters or a single punctuation mark.
token_pattern = re.compile(r'\b\w+\b|[\.,;!?()"\']')
words = token_pattern.findall(book)

# Split the token stream into consecutive chunks of `maxlen` tokens
# (the final chunk may be shorter).
maxlen = 50
sentences = []
for chunk_start in range(0, len(words), maxlen):
    sentences.append(words[chunk_start:chunk_start + maxlen])

In [6]:
# Show the first 20 tokens of the first chunk.
sentences[0][:20]

['m',
 'r',
 '.',
 'and',
 'mrs',
 '.',
 'dursley',
 ',',
 'of',
 'number',
 'four',
 ',',
 'privet',
 'drive',
 ',',
 'were',
 'proud',
 'to',
 'say',
 'that']

In [7]:
# Number of token chunks held in `sentences`.
len(sentences)

1867

## 2.- Pipeline

- Crea vocabulario y define tokenizer.

In [8]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab as Vocab
from collections import Counter

In [9]:
# Tokenizer shared by vocabulary construction, dataset encoding and text generation.
tokenizer = get_tokenizer('basic_english')

In [10]:
def build_vocab(text, tokenizer, min_freq=5):
    """Build a torchtext vocabulary from pre-split sentences.

    Args:
        text: Iterable of token lists; each list is joined with spaces and
            re-tokenized with ``tokenizer`` before counting.
        tokenizer: Callable mapping a string to a list of tokens.
        min_freq: Minimum count passed to the vocabulary factory; tokens
            seen fewer times are left out of the vocabulary.

    Returns:
        The vocabulary, with ``'<unk>'`` and ``'<pad>'`` as special tokens
        and ``'<unk>'`` registered as the default index, so that looking up
        a token that is not in the vocabulary yields the ``'<unk>'`` index
        instead of raising.
    """
    counter = Counter()
    for token_list in text:
        # Join the word list into a single string so the tokenizer can split it.
        counter.update(tokenizer(' '.join(token_list)))
    built = Vocab(counter, min_freq=min_freq,
                  specials=['<unk>', '<pad>'])
    # Without a default index, any token filtered out by `min_freq` (or any
    # unseen prompt word at generation time) makes `vocab[token]` raise.
    built.set_default_index(built['<unk>'])
    return built

# min_freq=0 overrides the function default of 5 (presumably so that no token
# is dropped for being rare — confirm against torchtext's `min_freq` semantics).
vocab = build_vocab(sentences, tokenizer, 0)

In [11]:
# Vocabulary size; it is passed to the model to size the embedding table and the output layer.
vocab_size = len(vocab)
vocab_size

5744

In [12]:
# Token stored at index 0 of the vocabulary.
vocab.get_itos()[0] 

'<unk>'

In [13]:
maxlen = 64

def data_process(text):
    """Encode each token list as an (input, target) pair for next-token prediction.

    Every list in ``text`` is joined with spaces, re-tokenized with the global
    ``tokenizer`` and mapped to ids through the global ``vocab``. Sequences
    with ``maxlen`` or more ids are skipped. For the rest, the input is the
    sequence without its last id and the target is the sequence without its
    first id.

    Returns:
        A list of ``(x, y)`` tuples of ``torch.long`` tensors.
    """
    def encode(token_list):
        # Join the word list into a single string before tokenizing.
        ids = [vocab[tok] for tok in tokenizer(' '.join(token_list))]
        return torch.tensor(ids, dtype=torch.long)

    encoded = (encode(token_list) for token_list in text)
    return [(seq[:-1], seq[1:]) for seq in encoded if seq.shape[0] < maxlen]

# Encode every chunk into (input, target) tensors for training.
train_data = data_process(sentences)

In [14]:
batch_size = 64
# Id of the '<pad>' token; used as the padding value when batching and as the
# index the loss ignores.
PAD_IDX = vocab['<pad>']

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def generate_batch(data_batch):
    """Collate (input, target) pairs into two padded batch tensors.

    Args:
        data_batch: Iterable of ``(x, y)`` tensor pairs.

    Returns:
        ``(x, y)`` where each is a batch-first tensor padded with ``PAD_IDX``
        up to the longest sequence in the batch.
    """
    pairs = list(data_batch)
    inputs = [x_item for x_item, _ in pairs]
    targets = [y_item for _, y_item in pairs]

    def pad(seqs):
        return pad_sequence(seqs, batch_first=True, padding_value=PAD_IDX)

    return pad(inputs), pad(targets)


# Shuffled loader over the encoded chunks; generate_batch pads each batch to a
# common length.
train_loader = DataLoader(train_data, batch_size=batch_size,
                          shuffle=True, collate_fn=generate_batch, 
                          num_workers=4, pin_memory=True)

In [15]:
%%timeit
# Times building a fresh loader iterator and fetching a single batch from it.
train_batch, target_batch = next(iter(train_loader))

184 ms ± 7.35 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
# Grab one batch to inspect its shape and to feed the untrained model later on.
train_batch, target_batch = next(iter(train_loader))

In [17]:
# Compare the shapes of the input batch and the target batch.
train_batch.shape, target_batch.shape

(torch.Size([64, 49]), torch.Size([64, 49]))

## 3.- Modelo

In [18]:
import torch.nn as nn
from torch import optim
import time

In [19]:
class RNN(nn.Module):
    """Token-level language model: embedding -> single-layer LSTM -> linear head.

    Args:
        vocab_size: Number of rows of the embedding table and size of the output layer.
        emb_dim: Embedding dimension fed to the LSTM.
        model_dim: Hidden size of the LSTM.
    """

    def __init__(self, vocab_size, emb_dim=128, model_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.LSTM(emb_dim, model_dim, num_layers=1, batch_first=True)
        self.fc1 = nn.Linear(model_dim, vocab_size)

    def forward(self, x):
        """Return one score per vocabulary entry for every position of ``x``.

        ``x`` holds token ids; the final hidden and cell states returned by the
        LSTM are discarded.
        """
        embedded = self.embedding(x)
        hidden_seq, _ = self.rnn(embedded)
        return self.fc1(hidden_seq)

# Instantiate the model and run the sample batch through it as a shape check
# against the target batch.
rnn = RNN(vocab_size)
output_batch = rnn(train_batch)
output_batch.shape, target_batch.shape

(torch.Size([64, 49, 5744]), torch.Size([64, 49]))

## 4.- Entrenamiento

In [20]:
# Cross-entropy over the vocabulary; targets equal to PAD_IDX are passed as the
# index to ignore.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [21]:
def train(model, device, train_loader, optimizer, epoch):
    """Run one optimization pass over ``train_loader``.

    Args:
        model: Module mapping a batch of token ids to per-position scores.
        device: Device the batches are moved to.
        train_loader: Iterable of ``(inputs, labels)`` batches that supports ``len()``.
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch number, used only in the returned report.

    Returns:
        A one-line report with the elapsed time and the mean batch loss.
    """
    t0 = time.time()
    model.train()
    loss_sum = 0.0
    for batch_inputs, batch_labels in train_loader:
        batch_inputs = batch_inputs.to(device)
        # Flatten targets to one id per position, matching the flattened scores below.
        batch_labels = batch_labels.view(-1).to(device)

        # Reset gradients, then forward, backward and parameter update.
        optimizer.zero_grad()
        scores = model(batch_inputs)
        # Collapse the leading dimensions: one row of scores per position.
        batch_loss = loss_fn(scores.view(-1, scores.size(-1)), batch_labels)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()

    mean_loss = loss_sum / len(train_loader)
    elapsed = time.time() - t0
    return f'Time for epoch {epoch} is {elapsed:.4f} sec Train loss: {mean_loss:.4f}'

In [22]:
def generate(model, sentence, device, maxlen):
    """Greedily extend ``sentence`` until it is ``maxlen`` tokens long.

    Args:
        model: Module mapping a ``(1, length)`` tensor of token ids to
            per-position scores over the vocabulary.
        sentence: Prompt text; tokenized with the global ``tokenizer`` and
            encoded with the global ``vocab``.
        device: Device on which the model is evaluated.
        maxlen: Total length, in tokens, of the returned text including the
            prompt. If the prompt already has ``maxlen`` tokens or more,
            nothing is generated and the prompt tokens are returned.

    Returns:
        The prompt followed by the generated tokens, joined with spaces.

    Note:
        The model is switched to evaluation mode and left in that mode.
    """
    model.eval()
    prompt_ids = [vocab[token] for token in tokenizer(sentence)]
    idx = torch.tensor(prompt_ids, dtype=torch.long).reshape([1, -1]).to(device)
    # Keep the number of tokens to generate separate from `maxlen`, so the
    # decode step below is not truncated to this smaller count.
    n_new_tokens = maxlen - idx.shape[-1]

    with torch.no_grad():
        for _ in range(n_new_tokens):
            # Only the scores at the last position predict the next token.
            logits = model(idx)[:, -1, :]
            # Greedy decoding: softmax is monotonic, so the arg-max of the
            # logits is the most probable token.
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)
            idx = torch.cat((idx, idx_next), dim=1)

    # Decode the whole sequence (prompt + generated tokens). Fetch the
    # index-to-token table once rather than once per token.
    itos = vocab.get_itos()
    return " ".join(itos[token_id] for token_id in idx[0].tolist())

start_token = 'voldemort'

In [23]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [24]:
# Move the model to the selected device, then build the optimizer over its parameters.
rnn.to(device)
optimizer = optim.Adam(rnn.parameters(), lr=0.001)

In [25]:
epochs = 500
start = time.time()
for epoch in range(epochs):
    report = train(rnn, device, train_loader, optimizer, epoch)
    # Every 50 epochs: print the wall time since the previous report, the
    # summary of the latest epoch, and a greedy sample seeded with `start_token`.
    if epoch % 50 == 0:
        print(f'\nTime for interval is {time.time()-start:.4f} sec')
        # Restart the interval timer.
        start = time.time()
        print(report)
        # `maxlen` here is the notebook-level variable, not a local setting.
        generated_text = generate(rnn, start_token, device, maxlen)
        print(f'Output: {generated_text}')


Time for interval is 0.2670 sec
Time for epoch 0 is 0.2669 sec Train loss: 7.7652
Output: voldemort for , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ,

Time for interval is 7.4613 sec
Time for epoch 50 is 0.1475 sec Train loss: 4.0669
Output: voldemort s not to be able to be a bit of the door . he was a very good . harry , said harry . he was a very good . harry , said harry . he was a very good . harry , said harry . he was a very good . harry , said harry . he was a very good

Time for interval is 7.4354 sec
Time for epoch 100 is 0.1471 sec Train loss: 3.2365
Output: voldemort s gang . he was going to be in the air , and harry had been a very good mood . harry , ron , and hermione , who was a very good mood . harry , ron , and hermione , who was a very good mood . harry , ron , and hermione , who was a very good mood

Time for interval is 7.4221 sec
Time for epoch 150 is 0.1474 sec Train loss: 2.5938
Output: 

In [26]:
# Sample a longer continuation from the trained model, passing 1000 as `maxlen`.
start_token = 'voldemort'
generated_text = generate(rnn, start_token, device, 1000)
generated_text

'voldemort will be able to come and finish me off . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . you know who s name ! voldemort screamed , ron ! if you re lucky . she couldn t be in those people who come back in midair to watch on each other two . i m not going to win at quidditch in a wooden house . it was as though the dursleys were asleep yet . snape was now there like that . i m not going to be fair toilet the stone . it was standing quite still as though his scar . harry , who looked as though he d cracked from his eyes was twitching . he flew back to the floor . harry , meanwhile , still try an kill a student ! now , with an evil cackle . ickle firsties ! he said in a hushed voice , dropping the box of his cupboard as case as boils started to harry . he was sure he d never even seen harry as they stepped over the threshold , and immediately a fire behind them dursleys to the castle again . the sun rose too high , fast . he bent down , the very kind of dudley , who c

## Ejercicio
- Incrementar el tamaño del dataset utilizando todos los libros de _Harry Potter_.
- Remplazar LSTM por GRU.
- Entrenar con diferentes métodos de tokenización.