<a href="https://colab.research.google.com/github/orlandxrf/curso-dl/blob/main/notebooks/9c_EncoderDecoderDialogues.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Standard library
import ast
import random
import re

# Third-party
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.legacy import data
from tqdm import tqdm

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


## Conjunto de datos
El conjunto de datos original se puede descargar en [Cornell Movie - Dialogs Corpus](https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html)

In [None]:
# establecer parametros para obtener el conjunto de datos del repositorio de Github
import os

data_folder = 'data'
URL1 = 'https://raw.githubusercontent.com/orlandxrf/curso-dl/main/data/movie_lines.txt'
URL2 = 'https://raw.githubusercontent.com/orlandxrf/curso-dl/main/data/movie_conversations.txt'
filepath1 = os.path.join(data_folder, 'movie_lines.txt')
filepath2 = os.path.join(data_folder, 'movie_conversations.txt')

# crear carpeta para almacenar el conjunto de datos
! mkdir {data_folder}

# descargar conjunto de datos y alamcenar
! wget -nc {URL1} -O {filepath1}
! wget -nc {URL2} -O {filepath2}

# comprobrar
! ls -lh data/*

mkdir: cannot create directory ‘data’: File exists
File ‘data/movie_lines.txt’ already there; not retrieving.
File ‘data/movie_conversations.txt’ already there; not retrieving.
-rw-r--r-- 1 root root 6.5M Mar 30 20:40 data/movie_conversations.txt
-rw-r--r-- 1 root root  34M Mar 30 20:40 data/movie_lines.txt
-rw-r--r-- 1 root root  24M Mar 30 20:51 data/movies_dialogues.tsv


In [None]:
! head 10 data/movie_lines.txt

head: cannot open '10' for reading: No such file or directory
==> data/movie_lines.txt <==
L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.
L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?
L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.
L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow
L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.
L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No
L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I'm kidding.  You know how sometimes you just become this "persona"?  And you don't know how to quit?
L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?


In [None]:
! head 10 data/movie_conversations.txt

head: cannot open '10' for reading: No such file or directory
==> data/movie_conversations.txt <==
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L271', 'L272', 'L273', 'L274', 'L275']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L276', 'L277']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L280', 'L281']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L363', 'L364']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L365', 'L366']


In [None]:
def readTxtFile(path):
    """Read a text file and return its lines without the trailing newline.

    The file is decoded as UTF-8 and undecodable bytes are silently dropped
    (errors='ignore').

    Args:
        path: Path of the text file to read.

    Returns:
        A list with one string per line, in file order.
    """
    # The `with` block closes the file on exit, so no explicit close() is needed.
    with open(path, encoding='utf-8', errors='ignore') as f:
        return [line.replace('\n', '') for line in f]

# Load both raw corpus files as lists of lines (newline characters removed).
mv_lines = readTxtFile('data/movie_lines.txt')
mv_conversations = readTxtFile('data/movie_conversations.txt')


In [None]:
# Map every movie-line ID to its utterance text.
# A well-formed record has five ' +++$+++ '-separated fields: the first one is
# the line ID and the last one is the utterance. Any other record is skipped.
id2line = {}

for line in mv_lines:
    fields = line.split(' +++$+++ ')
    if len(fields) == 5:
        # Tabs inside the utterance are replaced by spaces
        # (str.replace is a no-op when there is no tab).
        id2line[fields[0]] = fields[4].replace('\t', ' ')

# Show the first 10 entries
for i, x in enumerate(id2line):
    if i==10: break
    print (f"{i+1}\t{x}\t{id2line[x]}")

1	L1045	They do not!
2	L1044	They do to!
3	L985	I hope so.
4	L984	She okay?
5	L925	Let's go.
6	L924	Wow
7	L872	Okay -- you're gonna need to learn how to lie.
8	L871	No
9	L870	I'm kidding.  You know how sometimes you just become this "persona"?  And you don't know how to quit?
10	L869	Like my fear of wearing pastels?


In [None]:
# Build the list of conversations; each conversation is a list of line IDs.
# The last field of every record is a Python list literal such as
# "['L194', 'L195']". ast.literal_eval parses literals only, so (unlike eval)
# it cannot execute arbitrary code coming from the downloaded file.
conversations = [ast.literal_eval(line.split(' +++$+++ ')[-1]) for line in mv_conversations]

# Show the first 10 conversations
for i, x in enumerate(conversations):
    if i==10: break
    print (f"{i+1}\t{x}")

1	['L194', 'L195', 'L196', 'L197']
2	['L198', 'L199']
3	['L200', 'L201', 'L202', 'L203']
4	['L204', 'L205', 'L206']
5	['L207', 'L208']
6	['L271', 'L272', 'L273', 'L274', 'L275']
7	['L276', 'L277']
8	['L280', 'L281']
9	['L363', 'L364']
10	['L365', 'L366']


In [None]:
# Print every line of one sample conversation next to its line ID

sample_conversation = conversations[5]
for line_id in sample_conversation:
    print (f"{line_id}\t{id2line[line_id]}")


L271	C'esc ma tete. This is my head
L272	Right.  See?  You're ready for the quiz.
L273	I don't want to know how to say that though.  I want to know useful things. Like where the good stores are.  How much does champagne cost?  Stuff like Chat.  I have never in my life had to point out my head to someone.
L274	That's because it's such a nice one.
L275	Forget French.


In [None]:
# Turn every pair of consecutive lines of a conversation into a
# question (input) / answer (target) pair.
questions, answers = [], []

for conv in conversations:
    for current_id, next_id in zip(conv, conv[1:]):
        questions.append(id2line[current_id])
        answers.append(id2line[next_id])

print (f"Longitud de preguntas {len(questions):,}")
print (f"Longitud de respuestas {len(answers):,}")

Longitud de preguntas 221,616
Longitud de respuestas 221,616


In [None]:
# Look at the first 10 conversation pairs
for pair_number in range(1, 11):
    print (f"QUESTION [{pair_number}]:\t{questions[pair_number - 1]}")
    print (f"ANSWERS [{pair_number}]:\t{answers[pair_number - 1]}\n")

QUESTION [1]:	Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.
ANSWERS [1]:	Well, I thought we'd start with pronunciation, if that's okay with you.

QUESTION [2]:	Well, I thought we'd start with pronunciation, if that's okay with you.
ANSWERS [2]:	Not the hacking and gagging and spitting part.  Please.

QUESTION [3]:	Not the hacking and gagging and spitting part.  Please.
ANSWERS [3]:	Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?

QUESTION [4]:	You're asking me out.  That's so cute. What's your name again?
ANSWERS [4]:	Forget it.

QUESTION [5]:	No, no, it's my fault -- we didn't have a proper introduction ---
ANSWERS [5]:	Cameron.

QUESTION [6]:	Cameron.
ANSWERS [6]:	The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.

QUESTION [7]:	The thing is, Cameron -- I'm at the mercy of a particularly hideous br

In [None]:
# Ordered (pattern, replacement) rules used by clean_text. Order matters:
# specific contractions ("won't", "can't") must run before the generic "n't".
_CLEAN_TEXT_RULES = (
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"how's", "how is"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\'d", " would"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"n't", " not"),
    # Dropped-g endings ("goin'" -> "going"). The lookahead keeps possessives
    # such as "martin's" untouched.
    (r"in'(?!\w)", "ing"),
    (r"'bout", "about"),
    (r"'til", "until"),
    # Strip punctuation that carries no meaning for the dialogue model
    (r"[-()\"#/@;:<>{}`+=~|]", ""),
)


def clean_text(text):
    """Lower-case `text`, expand common English contractions and normalize it.

    The rules in _CLEAN_TEXT_RULES are applied in order, then every run of
    whitespace (including tabs and newlines) is collapsed to a single space
    and leading/trailing whitespace is removed.

    Args:
        text: Raw utterance.

    Returns:
        The cleaned utterance (may be an empty string).
    """
    text = text.lower()
    for pattern, replacement in _CLEAN_TEXT_RULES:
        text = re.sub(pattern, replacement, text)
    return " ".join(text.split())

In [None]:
# Clean the questions and answers.
# A pair is kept only when BOTH sides are non-empty after cleaning, and both
# sides are appended together, so clean_questions[k] and clean_answers[k]
# always belong to the same exchange.

clean_questions, clean_answers = [], []

print ("Limpiando preguntas")
tmp_questions = [clean_text(question).strip() for question in questions]
# original indexes of the pairs whose question is empty after cleaning
tmp_q_indexes = [i for i, tmp_clean in enumerate(tmp_questions) if len(tmp_clean) <= 0]

print (f"Preguntas vacías: {len(tmp_q_indexes):,}\n")

print ("Limpiando respuestas")
# original indexes of the pairs dropped because only the answer is empty
tmp_a_indexes = []
for i, (tmp_question, answer) in enumerate(zip(tmp_questions, answers)):
    if len(tmp_question) <= 0: continue # the question is empty, skip the whole pair
    tmp_clean = clean_text(answer).strip()
    if len(tmp_clean) <= 0:
        # no answer to complete the pair: drop it (the question is simply not kept)
        tmp_a_indexes.append(i)
    else:
        clean_questions.append(tmp_question)
        clean_answers.append(tmp_clean)

print (f"Respuestas vacías: {len(tmp_a_indexes):,}\n")

# Both lists are filled in lock-step, so they must have the same length
assert len(clean_questions) == len(clean_answers)

print (f"Longitud de preguntas:\t{len(clean_questions):,}")
print (f"Longitud de respuestas:\t{len(clean_answers):,}\n")

# Show the first cleaned pairs (at most 10)
for i in range(min(10, len(clean_questions))):
    print (f"QUESTION [{i+1}]:\t{clean_questions[i]}")
    print (f"ANSWERS [{i+1}]:\t{clean_answers[i]}\n")

Limpiando preguntas
Preguntas vacías: 197

Limpiando respuestas
Respuestas vacías: 137

Longitud de preguntas:	221,282
Longitud de respuestas:	221,282

QUESTION [1]:	can we make this quick? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad. again.
ANSWERS [1]:	well, i thought we would start with pronunciation, if that is okay with you.

QUESTION [2]:	well, i thought we would start with pronunciation, if that is okay with you.
ANSWERS [2]:	not the hacking and gagging and spitting part. please.

QUESTION [3]:	not the hacking and gagging and spitting part. please.
ANSWERS [3]:	okay... then how about we try out some french cuisine. saturday? night?

QUESTION [4]:	you are asking me out. that is so cute. that is your name again?
ANSWERS [4]:	forget it.

QUESTION [5]:	no, no, it is my fault we did not have a proper introduction
ANSWERS [5]:	cameron.

QUESTION [6]:	cameron.
ANSWERS [6]:	the thing is, cameron i am at the mercy of a particularly h

### Preparar el conjunto de datos

In [None]:
def saveDataIntoFile(output_path, data, mode='a'):
    """Write `data` to `output_path` using the given open mode.

    Args:
        output_path: Path of the file to write.
        data: Text to write.
        mode: Mode passed to open(); 'a' (default) appends, 'w' truncates.
    """
    # Explicit UTF-8 keeps the file independent of the platform's default
    # encoding; the context manager closes the file even if write() fails.
    with open(output_path, mode, encoding='utf-8') as g:
        g.write(data)

output_path = 'data/movies_dialogues.tsv'
saveDataIntoFile(output_path, '', 'w') # crear/resetear archivo

for i in range(len(clean_questions)):
    tmp_data = "{}\t{}\n".format(clean_questions[i], clean_answers[i])
    if len(tmp_data.split('\t')) != 2:
        print (tmp_data.split('\t'))
        print (i, clean_questions[i])
        print (i, clean_answers[i])
        break
    saveDataIntoFile(output_path, tmp_data)

print (f"dataset creado en {output_path}")

! ls -lh data/

dataset creado en data/movies_dialogues.tsv
total 63M
-rw-r--r-- 1 root root 6.5M Mar 30 20:40 movie_conversations.txt
-rw-r--r-- 1 root root  34M Mar 30 20:40 movie_lines.txt
-rw-r--r-- 1 root root  24M Mar 30 20:59 movies_dialogues.tsv


In [None]:
# Preview the first 10 rows of the generated TSV file
!head -10 data/movies_dialogues.tsv

can we make this quick? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad. again.	well, i thought we would start with pronunciation, if that is okay with you.
well, i thought we would start with pronunciation, if that is okay with you.	not the hacking and gagging and spitting part. please.
not the hacking and gagging and spitting part. please.	okay... then how about we try out some french cuisine. saturday? night?
you are asking me out. that is so cute. that is your name again?	forget it.
no, no, it is my fault we did not have a proper introduction	cameron.
cameron.	the thing is, cameron i am at the mercy of a particularly hideous breed of loser. my sister. i cannot date until she does.
the thing is, cameron i am at the mercy of a particularly hideous breed of loser. my sister. i cannot date until she does.	seems like she could get a date easy enough...
why?	unsolved mystery. she used to be really popular when she started high school, th

In [None]:
# with open('data/movies_dialogues.tsv', encoding='utf-8', errors='ignore') as f:
#     for line in f:
#         line = line.replace('\n', '').split('\t')
#         if len(line[0].strip()) <= 0:
#             print (line)
#             # break
#         if len(line[1].strip()) <= 0:
#             print (line)
#             # break
# f.close()

In [None]:
# Load the English spaCy pipeline used for tokenization.
# The full package name is used instead of the "en" shortcut so that this cell
# loads the same pipeline as the inference cell further down
# (spacy.load('en_core_web_sm')).
spacy_english = spacy.load("en_core_web_sm")

def tokenize_english(text):
    """Split `text` into a list of token strings with the spaCy tokenizer."""
    return [token.text for token in spacy_english.tokenizer(text)]

In [None]:
# ------------------------------------------------------
# Free memory: the pairs are now stored in the TSV file.
# NOTE(review): because of these `del` statements the cell raises NameError
# when it is run a second time without re-running the cells above.
del questions
del answers
del clean_questions
del clean_answers
# ------------------------------------------------------

MAX_VOCAB_SIZE = 30000
MIN_COUNT = 3 # previously tried: 1
# NOTE(review): MAX_SEQUENCE_LENGTH is not referenced anywhere else in the
# visible notebook, so no length filtering/truncation appears to be applied.
MAX_SEQUENCE_LENGTH = 10 # previously tried: 15, 20
BATCH_SIZE = 128


# Create the Field object shared by inputs and outputs: spaCy tokenization,
# lower-casing, <sos>/<eos> markers, and batches returned together with lengths
TEXT = data.Field(
    tokenize = tokenize_english,
    lower = True, 
    include_lengths = True, 
    init_token = '<sos>', 
    eos_token = '<eos>'
)

# Both TSV columns use the same Field (and therefore the same vocabulary)
fields = [('input_sequence', TEXT), ('output_sequence', TEXT)]

# Path of the dataset
data_file = 'data/movies_dialogues.tsv'

# Build the dataset with TabularDataset
dialogue_data = data.TabularDataset(
    path = data_file,
    format = 'tsv',
    fields = fields
)

print ( dialogue_data.fields)


# Build the vocabulary. Pre-trained GloVe vectors are currently disabled
# (see the commented-out arguments below).
# NOTE(review): the vocabulary is built from the full dataset, before the
# train/valid/test split.
TEXT.build_vocab(
    dialogue_data,
    max_size = MAX_VOCAB_SIZE,
    min_freq = MIN_COUNT,
    # vectors = 'glove.6B.300d',
    # unk_init = torch.Tensor.normal_
)

# Split the dialogue dataset into training, validation and test sets
# (split() is called with its default arguments)
train_data, test_data = dialogue_data.split()
train_data, valid_data = train_data.split()

# Create one iterator per split; examples are sorted within each batch by
# input length
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    sort_within_batch = True,
    sort_key = lambda x:len(x.input_sequence),
    device = device
)

{'input_sequence': <torchtext.legacy.data.field.Field object at 0x7febf65a81d0>, 'output_sequence': <torchtext.legacy.data.field.Field object at 0x7febf65a81d0>}


In [None]:
spacy_english = None # release the tokenizer to free memory

split_iterators = (
    ("train", train_iterator),
    ("valid", valid_iterator),
    ("test", test_iterator),
)

# Number of examples per split
for split_name, split_iterator in split_iterators:
    print (f"Muestras en el {split_name}:\t {len(split_iterator.dataset):,}")
print ()

# Number of batches per split
for split_name, split_iterator in split_iterators:
    print (f"Batches en el {split_name}:\t {len(split_iterator):,}")

Muestras en el train:	 108,428
Muestras en el valid:	 46,469
Muestras en el test:	 66,385

Batches en el train:	 848
Batches en el valid:	 364
Batches en el test:	 519


## Definir el modelo

### Encoder

In [None]:
class Encoder(nn.Module):
    """Bidirectional multi-layer GRU encoder over embedded token sequences.

    Args:
        hidden_size: Size of the GRU hidden state (per direction).
        embedding_size: Size of the word embeddings fed to the GRU.
        embedding: Embedding module (shared with the decoder).
        num_layers: Number of stacked GRU layers.
        dropout: Dropout probability passed to nn.GRU.
    """

    def __init__(self, hidden_size, embedding_size, embedding, num_layers=2, dropout=0.0):

        super(Encoder, self).__init__()

        # Basic network params
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.num_layers = num_layers
        self.dropout = dropout

        # Embedding layer that will be shared with Decoder
        self.embedding = embedding

        # Bidirectional GRU
        self.gru = nn.GRU(embedding_size, hidden_size,
                          num_layers=num_layers,
                          dropout=dropout,
                          bidirectional=True)

    def forward(self, input_sequence, input_lengths):
        """Encode a padded batch of token indices.

        Args:
            input_sequence: Token indices, sequence-first (seq_len, batch).
            input_lengths: True length of every sequence in the batch (tensor
                or list). Sequences must be sorted by decreasing length, as
                required by pack_padded_sequence with its default arguments.

        Returns:
            A tuple (outputs, hidden): `outputs` has shape
            (seq_len, batch, hidden_size) with both directions summed, and
            `hidden` is the final GRU hidden state.
        """
        # Convert input_sequence to word embeddings
        word_embeddings = self.embedding(input_sequence)

        # pack_padded_sequence only accepts lengths that live on the CPU; a
        # CUDA tensor (what the iterators yield when device='cuda') raises a
        # RuntimeError, so move it back to the CPU first.
        if torch.is_tensor(input_lengths):
            input_lengths = input_lengths.cpu()

        # Pack the sequence of embeddings
        packed_embeddings = nn.utils.rnn.pack_padded_sequence(word_embeddings, input_lengths)

        # Run the packed embeddings through the GRU, and then unpack the sequences
        outputs, hidden = self.gru(packed_embeddings)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)

        # The output of a GRU has shape (seq_len, batch, hidden_size * num_directions).
        # Because the Encoder is bidirectional, combine the results from the
        # forward and reversed sequence by simply adding them together.
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]

        return outputs, hidden


### Mecanismo de Atención

In [None]:
class Attention(nn.Module):
    """Dot-product attention between a decoder state and the encoder outputs."""

    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

    def dot_score(self, hidden_state, encoder_states):
        """Return the dot product of the two inputs along dimension 2."""
        return (hidden_state * encoder_states).sum(dim=2)

    def forward(self, hidden, encoder_outputs, mask):
        """Compute attention weights over the encoder positions.

        The raw scores are transposed so that the batch dimension comes first,
        positions where `mask` equals 0 (<pad> tokens) are pushed to a very
        negative value, and a softmax over dimension 1 is returned with an
        extra dimension inserted at index 1.
        """
        # (max_length, batch) -> (batch, max_length)
        scores = self.dot_score(hidden, encoder_outputs).t()

        # Make sure the network does not attend to <pad> tokens
        masked_scores = scores.masked_fill(mask == 0, -1e10)

        weights = F.softmax(masked_scores, dim=1)
        return weights.unsqueeze(1)

### Decoder

In [None]:
class Decoder(nn.Module):
    """GRU decoder that predicts one token per call, using attention.

    Args:
        embedding: Embedding module (shared with the encoder).
        embedding_size: Size of the word embeddings fed to the GRU.
        hidden_size: Size of the GRU hidden state.
        output_size: Number of output classes (size of the final linear layer).
        n_layers: Number of stacked GRU layers.
        dropout: Dropout probability passed to nn.GRU.
    """

    def __init__(self, embedding, embedding_size, hidden_size, output_size, n_layers=1, dropout=0.1):
        super().__init__()

        # Hyper-parameters kept for later inspection
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.embedding = embedding

        self.gru = nn.GRU(embedding_size, hidden_size, n_layers, dropout=dropout)

        # Merges [GRU output ; context vector] back to hidden_size
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        # Projects the merged vector onto the output classes
        self.out = nn.Linear(hidden_size, output_size)
        self.attn = Attention(hidden_size)

    def forward(self, current_token, hidden_state, encoder_outputs, mask):
        """Run a single decoding step.

        Returns:
            A tuple (output, hidden_state): the scores produced by the final
            linear layer and the updated GRU hidden state.
        """
        # Embed the current token and advance the GRU by one step
        token_embedding = self.embedding(current_token)
        gru_output, hidden_state = self.gru(token_embedding, hidden_state)

        # Attention weights over the encoder outputs, then the weighted sum
        # of the encoder outputs (context vector)
        weights = self.attn(gru_output, encoder_outputs, mask)
        context_vector = weights.bmm(encoder_outputs.transpose(0, 1)).squeeze(1)

        # Merge GRU output and context vector, then project to the output layer
        merged = torch.cat((gru_output.squeeze(0), context_vector), 1)
        scores = self.out(torch.tanh(self.concat(merged)))

        return scores, hidden_state

In [None]:
class seq2seq(nn.Module):
    """Encoder-decoder model with attention and a shared embedding layer.

    Args:
        embedding_size: Size of the word embeddings.
        hidden_size: Size of the GRU hidden states.
        vocab_size: Number of tokens in the vocabulary.
        device: Device on which tensors created by the model are placed.
        pad_idx: Index of the <pad> token (used to build the attention mask).
        eos_idx: Index of the <eos> token (stops generation at inference).
        sos_idx: Index of the <sos> token (first decoder input at inference).
        teacher_forcing_ratio: Accepted for backward compatibility; the ratio
            actually used is the one passed to forward().
    """

    def __init__(self, embedding_size, hidden_size, vocab_size,
                 device, pad_idx, eos_idx, sos_idx, teacher_forcing_ratio=0.5):
        super(seq2seq, self).__init__()

        # Embedding layer shared by encoder and decoder
        self.embedding = nn.Embedding(vocab_size, embedding_size)

        # Encoder network
        self.encoder = Encoder(hidden_size,
                               embedding_size,
                               self.embedding,
                               num_layers=2,
                               dropout=0.5)

        # Decoder network
        self.decoder = Decoder(self.embedding,
                               embedding_size,
                               hidden_size,
                               vocab_size,
                               n_layers=2,
                               dropout=0.5)

        # Indices of special tokens and hardware device
        self.pad_idx = pad_idx
        self.eos_idx = eos_idx
        self.sos_idx = sos_idx
        self.device = device

    def create_mask(self, input_sequence):
        """Return a (batch, seq_len) boolean mask that is False on <pad> tokens."""
        return (input_sequence != self.pad_idx).permute(1, 0)

    def forward(self, input_sequence, output_sequence, teacher_forcing_ratio=0.5):
        """Run the model for training (teacher forcing) or for generation.

        Args:
            input_sequence: Tuple (tokens, lengths) for the source sequences;
                tokens are sequence-first.
            output_sequence: Tuple whose first item holds the target tokens, or
                None to generate text. Generation calls `.item()` on the
                predicted token, so it only supports a batch of one sequence.
            teacher_forcing_ratio: Probability of feeding the ground-truth
                token (instead of the prediction) as the next decoder input.

        Returns:
            Decoder scores of shape (steps, batch, vocab_size). Row 0 is never
            written and stays at zero. At inference the tensor is cut right
            before the step that produced <eos>.
        """
        # Unpack input_sequence tuple
        input_tokens = input_sequence[0].to(self.device)
        # Lengths stay on the CPU: pack_padded_sequence rejects CUDA lengths
        input_lengths = input_sequence[1].cpu()

        # Needed in both modes to allocate the output tensor
        vocab_size = self.decoder.output_size

        # Unpack output_tokens, or create a placeholder filled with <sos>
        # (100 = maximum number of generated steps) for text generation
        inference = output_sequence is None
        if inference:
            output_tokens = torch.zeros((100, input_tokens.shape[1])).long().fill_(self.sos_idx).to(self.device)
        else:
            output_tokens = output_sequence[0].to(self.device)

        batch_size = len(input_lengths)
        max_seq_len = len(output_tokens)

        # Tensor to store decoder outputs
        outputs = torch.zeros(max_seq_len, batch_size, vocab_size).to(self.device)

        # Pass through the first half of the network
        encoder_outputs, hidden = self.encoder(input_tokens, input_lengths)

        # Ensure dim of hidden_state can be fed into Decoder
        hidden = hidden[:self.decoder.n_layers]

        # First input to the decoder is the <sos> tokens
        output = output_tokens[0, :]

        # Create mask
        mask = self.create_mask(input_tokens)

        # Step through the length of the output sequence one token at a time.
        # Teacher forcing is used to assist training.
        for t in range(1, max_seq_len):
            output = output.unsqueeze(0)

            output, hidden = self.decoder(output, hidden, encoder_outputs, mask)
            outputs[t] = output

            # There is no ground truth at inference, so never teacher-force there
            teacher_force = (not inference) and random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            output = (output_tokens[t] if teacher_force else top1)

            # If we're in inference mode, keep generating until we produce an
            # <eos> token
            if inference and output.item() == self.eos_idx:
                return outputs[:t]
        return outputs

## Entrenar el modelo

In [None]:
# funcion para medir el tiempo de entrenamiento
import time
import math

def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'."""
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    """Return the elapsed time and an estimate of the remaining time.

    Args:
        since: Start timestamp, as returned by time.time().
        percent: Fraction of the work already done, in (0, 1].

    Returns:
        '<elapsed> (- <remaining>)'. When `percent` is zero or negative no
        estimate can be computed and the remaining part is shown as '?'.
    """
    now = time.time()
    s = now - since
    if percent <= 0:
        # Nothing finished yet: avoid dividing by zero
        return '%s (- %s)' % (asMinutes(s), '?')
    es = s / (percent)  # estimated total duration
    rs = es - s         # estimated remaining duration
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [None]:
# Indices of the special tokens in the vocabulary
pad_idx, eos_idx, sos_idx = (TEXT.vocab.stoi[token] for token in ('<pad>', '<eos>', '<sos>'))

# Size of embedding_dim should match the dim of pre-trained word embeddings!
embedding_dim = 50
hidden_dim = 512
vocab_size = len(TEXT.vocab)

model = seq2seq(embedding_dim, hidden_dim, vocab_size,
                device, pad_idx, eos_idx, sos_idx).to(device)

# Loading pre-trained embeddings is currently disabled:
# pretrained_embeddings = TEXT.vocab.vectors
# model.embedding.weight.data.copy_(pretrained_embeddings)

# Zero the embedding rows of the <unk> and <pad> tokens
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
for special_idx in (UNK_IDX, pad_idx):
    model.embedding.weight.data[special_idx] = torch.zeros(embedding_dim, device=device)
# model.embedding.weight.requires_grad = False

# Optimize only the trainable parameters; <pad> targets are ignored by the loss
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = optim.Adam(trainable_params, lr=1.0e-3)
criterion = nn.CrossEntropyLoss(ignore_index = pad_idx)

print (model)

seq2seq(
  (embedding): Embedding(29753, 50)
  (encoder): Encoder(
    (embedding): Embedding(29753, 50)
    (gru): GRU(50, 512, num_layers=2, dropout=0.5, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(29753, 50)
    (gru): GRU(50, 512, num_layers=2, dropout=0.5)
    (concat): Linear(in_features=1024, out_features=512, bias=True)
    (out): Linear(in_features=512, out_features=29753, bias=True)
    (attn): Attention()
  )
)


In [None]:
def train(model, iterator, criterion, optimizer, clip=1.0):
    """Train `model` for one epoch and return the mean loss per batch.

    Args:
        model: Model called as model(input_sequence, output_sequence).
        iterator: Batch iterator; every batch exposes `input_sequence` and
            `output_sequence`, each one a (tokens, lengths) tuple.
        criterion: Loss function applied to (scores, target tokens).
        optimizer: Optimizer that updates the model parameters.
        clip: Maximum gradient norm used for gradient clipping.

    Returns:
        The sum of the batch losses divided by the number of batches
        (0 when the iterator is empty).
    """
    # Put the model in training mode!
    model.train()

    epoch_loss = 0

    # tqdm shows the progress and the running mean loss on a single line
    progress = tqdm(iterator, total=len(iterator))
    for idx, batch in enumerate(progress):

        input_sequence = batch.input_sequence
        output_sequence = batch.output_sequence

        target_tokens = output_sequence[0]

        # Zero out the gradient for the current batch
        optimizer.zero_grad()

        # Run the batch through our model
        output = model(input_sequence, output_sequence)

        # Drop the first step (<sos>) and flatten scores and targets
        # so they can be fed to the loss function
        output = output[1:].view(-1, output.shape[-1])
        target_tokens = target_tokens[1:].view(-1)

        loss = criterion(output, target_tokens)

        # Perform back-prop and calculate the gradient of our loss function
        loss.backward()

        # Clip the gradient if necessary.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        # Update model parameters
        optimizer.step()

        epoch_loss += loss.item()
        progress.set_postfix(loss=f"{epoch_loss / (idx + 1):.4f}")

    # max(..., 1) avoids a division by zero on an empty iterator
    return epoch_loss / max(len(iterator), 1)

In [None]:
import time
N_EPOCHS = 1
CLIP = 50.0  # maximum gradient norm
start = time.time()
plot_train_loss = []  # mean training loss of every epoch
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_iterator, criterion, optimizer, CLIP)
    # Fraction of epochs completed so far. It must count the epoch that just
    # finished: with `epoch / N_EPOCHS` the first value is 0 and timeSince
    # divides by it.
    avg_epochs = (epoch + 1) / N_EPOCHS
    time_elapsed = timeSince(start, avg_epochs)
    print(f'\n{time_elapsed} ({epoch + 1} {avg_epochs * 100:.0f}%) {train_loss:.4f}\n')
    plot_train_loss.append(train_loss)

  0%|          | 0/848 [00:00<?, ?it/s]


Antes del error
(tensor([[    2,     2,     2,  ...,     2,     2,     2],
        [    7,     7,    15,  ...,   112,    18,   189],
        [  599,   252,   215,  ...,     4, 25186,    38],
        ...,
        [  638,    25,   180,  ...,   164,     0,   220],
        [   21,     4,     4,  ...,     4,     8,     4],
        [    3,     3,     3,  ...,     3,     3,     3]], device='cuda:0'), tensor([10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
        10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
        10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
        10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
        10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
        10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
        10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
        10, 10], device='cuda:0'))
(tensor

  0%|          | 0/848 [00:00<?, ?it/s]


RuntimeError: ignored

In [None]:
# Plot the training loss of every epoch
import matplotlib.pyplot as plt

print (f"Epochs: {len(plot_train_loss)}")

fig, ax = plt.subplots(figsize=(15,10))
ax.plot(plot_train_loss, label='Training loss', marker='o', color='orange')
ax.set_title('Losses', fontsize=15)
ax.set_xlabel('Epoch', fontsize=14)
ax.set_ylabel('Loss', fontsize=14)
ax.tick_params(axis='both', labelsize=12)
ax.grid()
ax.legend()
# Save next to the notebook (the working directory) instead of a hard-coded
# absolute path that only exists on Colab
fig.savefig('train_loss.png')
plt.show()

## Resultados

In [None]:
import spacy

# spaCy pipeline used to tokenize the sentences typed at inference time
nlp = spacy.load('en_core_web_sm')

def translate_sentence(model, sentence, nlp):
    """Generate the model's reply to `sentence`.

    Args:
        model: Trained model; called in generation mode (no target sequence,
            teacher forcing ratio 0).
        sentence: Input text.
        nlp: spaCy pipeline used to tokenize `sentence`.

    Returns:
        A tuple (translation, logits): the list of generated tokens and the
        raw score tensor returned by the model.
    """
    model.eval()

    # Tokenize, lower-case and add the start/end markers
    tokenized = ['<sos>'] + [t.lower_ for t in nlp(sentence)] + ['<eos>']
    numericalized = [TEXT.vocab.stoi[t] for t in tokenized]

    # Batch of a single sequence, sequence-first: (seq_len, 1)
    sentence_length = torch.LongTensor([len(numericalized)]).to(model.device)
    tensor = torch.LongTensor(numericalized).unsqueeze(1).to(model.device)

    # No gradients are needed for generation; skipping them saves memory
    with torch.no_grad():
        translation_tensor_logits = model((tensor, sentence_length), None, 0)

    # Most likely token at every step
    translation_tensor = torch.argmax(translation_tensor_logits.squeeze(1), 1)
    translation = [TEXT.vocab.itos[t] for t in translation_tensor.tolist()]

    # Start at the first index. We don't need to return the <sos> token...
    translation = translation[1:]
    return translation, translation_tensor_logits

# Ask the model for a reply to a sample sentence and print the generated tokens
sentence = "tell me a fun fact"
response, logits = translate_sentence(model, sentence, nlp)
print(" ".join(response))