In [935]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import spacy
import datasets
import torchtext
import tqdm
import evaluate
import random
from spacy.lang.vi import Vietnamese
from spacy.lang.en import English
from torch.utils.data import Dataset, random_split
from torchtext.vocab import build_vocab_from_iterator

In [936]:
seed = 1234

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [937]:
def load_data(path):
    data = []
    with open(path,'r') as file:
        for line in file.readlines():
            splitted_line = line.split('\t')
            eng = splitted_line[0]
            vi = splitted_line[1]
            data.append({'vi':vi, 
                         'en':eng})
    return data

In [938]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, index):
        return self.data[index]

In [939]:
dataset = CustomDataset(load_data('data/vie.txt'))

In [940]:
#7:2:1
total_samples = len(dataset)
train_size = int(0.8 * total_samples)
val_size = int(0.1 * total_samples)
test_size = total_samples - train_size - val_size

In [941]:
train_data, valid_data, test_data = random_split(dataset, [train_size, val_size, test_size])
print("Số lượng mẫu trong tập train:", len(train_data))
print("Số lượng mẫu trong tập validation:", len(valid_data))
print("Số lượng mẫu trong tập test:", len(test_data))

Số lượng mẫu trong tập train: 7542
Số lượng mẫu trong tập validation: 942
Số lượng mẫu trong tập test: 944


In [942]:
train_data[0]

{'vi': 'Bạn thực sự muốn mặc cái đó sao?',
 'en': 'Do you really want to wear that?'}

### Tokenizer

In [943]:
en_nlp = English()
vi_nlp = Vietnamese()

In [944]:
string = "What a lovely day it is today!"
[token.text for token in en_nlp.tokenizer(string)]

['What', 'a', 'lovely', 'day', 'it', 'is', 'today', '!']

In [945]:
def tokenize_example(example, en_nlp, vi_nlp, max_length, lower, sos_token, eos_token):
    en_tokens = [token.text for token in en_nlp.tokenizer(example["en"])][:max_length]
    vi_tokens = [token.text for token in vi_nlp.tokenizer(example["vi"])][:max_length]
    if lower:
        en_tokens = [token.lower() for token in en_tokens]
        vi_tokens = [token.lower() for token in vi_tokens]
    en_tokens = [sos_token] + en_tokens + [eos_token]
    vi_tokens = [sos_token] + vi_tokens + [eos_token]
    example["en_tokens"] = en_tokens
    example["vi_tokens"] = vi_tokens
    return example

In [946]:
max_length = 1_000
lower = True
sos_token = "<sos>"
eos_token = "<eos>"

fn_kwargs = {
    "en_nlp": en_nlp,
    "vi_nlp": vi_nlp,
    "max_length": max_length,
    "lower": lower,
    "sos_token": sos_token,
    "eos_token": eos_token,
}
train_data = [tokenize_example(example, **fn_kwargs) for example in train_data]
valid_data = [tokenize_example(example, **fn_kwargs) for example in valid_data]
test_data = [tokenize_example(example, **fn_kwargs) for example in test_data]

In [947]:
train_data[0]

{'vi': 'Bạn thực sự muốn mặc cái đó sao?',
 'en': 'Do you really want to wear that?',
 'en_tokens': ['<sos>',
  'do',
  'you',
  'really',
  'want',
  'to',
  'wear',
  'that',
  '?',
  '<eos>'],
 'vi_tokens': ['<sos>',
  'bạn',
  'thực_sự',
  'muốn',
  'mặc',
  'cái',
  'đó',
  'sao',
  '?',
  '<eos>']}

In [948]:
def yield_tokens(data,s):
    for dct in data:
        yield dct[s]

In [949]:
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"

special_tokens = [
    unk_token,
    pad_token,
    sos_token,
    eos_token,
]

en_vocab = torchtext.vocab.build_vocab_from_iterator(
    yield_tokens(train_data,'en_tokens'),
    min_freq=min_freq,
    specials=special_tokens,
)

vi_vocab = torchtext.vocab.build_vocab_from_iterator(
    yield_tokens(train_data,'vi_tokens'),
    min_freq=min_freq,
    specials=special_tokens,
)

In [950]:
en_vocab.get_itos()[:10]

['<unk>', '<pad>', '<sos>', '<eos>', '.', 'i', 'to', 'tom', 'you', 'the']

In [951]:
en_vocab.get_stoi()["the"]

9

In [952]:
assert en_vocab[unk_token] == vi_vocab[unk_token]
assert en_vocab[pad_token] == vi_vocab[pad_token]

unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

In [953]:
en_vocab.set_default_index(unk_index)
vi_vocab.set_default_index(unk_index)

In [954]:
tokens = ["i", "love", "watching", "crime", "shows"]
en_vocab.lookup_indices(tokens)

[5, 173, 509, 0, 0]

In [955]:
en_vocab.lookup_tokens(en_vocab.lookup_indices(tokens))

['i', 'love', 'watching', '<unk>', '<unk>']

In [956]:
def numericalize_example(example, en_vocab, vi_vocab):
    en_ids = en_vocab.lookup_indices(example["en_tokens"])
    vi_ids = vi_vocab.lookup_indices(example["vi_tokens"])
    example["en_ids"] = en_ids
    example["vi_ids"] = vi_ids
    return example

In [957]:
fn_kwargs = {"en_vocab": en_vocab, "vi_vocab": vi_vocab}
train_data = [numericalize_example(example, **fn_kwargs) for example in train_data]
valid_data = [numericalize_example(example, **fn_kwargs) for example in valid_data]
test_data = [numericalize_example(example, **fn_kwargs) for example in test_data]

In [958]:
train_data[0]

{'vi': 'Bạn thực sự muốn mặc cái đó sao?',
 'en': 'Do you really want to wear that?',
 'en_tokens': ['<sos>',
  'do',
  'you',
  'really',
  'want',
  'to',
  'wear',
  'that',
  '?',
  '<eos>'],
 'vi_tokens': ['<sos>',
  'bạn',
  'thực_sự',
  'muốn',
  'mặc',
  'cái',
  'đó',
  'sao',
  '?',
  '<eos>'],
 'en_ids': [2, 14, 8, 88, 37, 6, 431, 15, 10, 3],
 'vi_ids': [2, 8, 184, 30, 281, 34, 15, 97, 11, 3]}

In [959]:
en_vocab.lookup_tokens(train_data[0]["en_ids"])

['<sos>', 'do', 'you', 'really', 'want', 'to', 'wear', 'that', '?', '<eos>']

In [960]:
def to_tensor(example):
    example['en_ids'] = torch.tensor(np.array(example['en_ids']), dtype=torch.int64)
    example['vi_ids'] = torch.tensor(np.array(example['vi_ids']), dtype=torch.int64)
    return example

In [961]:
train_data = [to_tensor(example) for example in train_data]
valid_data = [to_tensor(example) for example in valid_data]
test_data = [to_tensor(example) for example in test_data]

In [962]:
type(train_data[0]["en_ids"])

torch.Tensor

In [963]:
def get_collate_fn(pad_index):
    def collate_fn(batch):
        batch_en_ids = [example["en_ids"] for example in batch]
        batch_vi_ids = [example["vi_ids"] for example in batch]
        batch_en_ids = nn.utils.rnn.pad_sequence(batch_en_ids, padding_value=pad_index)
        batch_vi_ids = nn.utils.rnn.pad_sequence(batch_vi_ids, padding_value=pad_index)
        batch = {
            "en_ids": batch_en_ids.T,
            "vi_ids": batch_vi_ids.T,
        }
        return batch

    return collate_fn

In [964]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    collate_fn = get_collate_fn(pad_index)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle,
    )
    return data_loader

In [965]:
a = get_data_loader(train_data, 128, pad_index, shuffle=True)

In [966]:
batch_size = 128
train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)

In [967]:
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, embedding_dim,)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, src):
        #src: n x seq_length
        embedded = self.dropout(self.embedding(src))
        #embedded: n x seq_length x embedding_dim
        outputs, (hidden, cell) = self.rnn(embedded)
        #outputs: n x seq_length x hidden_dim
        #hidden: n x num_layers x hidden_dim
        #cell: n x num_layers x hidden_dim
        return hidden, cell

In [968]:
class Decoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, input, hidden, cell):
        #input: n
        #hidden = n x num_layers x hidden_dim
        #cell = n x num_layers x hidden_dim
        input = input.unsqueeze(1)
        #input: n x 1
        embedded = self.dropout(self.embedding(input))
        #embedded: n x 1 x embedding_dim
        output, (hidden, cell) = self.rnn(embedded, (hidden,cell))
        #output: n x 1 x hidden_dim
        #hidden: n x num_layers x hidden_dim
        #cell: n x num_layers x hidden_dim
        prediction = self.fc_out(output.squeeze(1)) #output.squeeze(1) -> n x hidden_dim
        #prediction: n x output_dim
        return prediction, hidden, cell

In [969]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert(
            encoder.hidden_dim == decoder.hidden_dim
        ), "Hidden dimensions of encoder and decoder must be equal"
        assert(
            encoder.n_layers == decoder.n_layers
        ), "Encoder and decoder must have equal number of layers"
    
    def forward(self, src, trg, teacher_forcing_ratio):
        #src: n x seq_length
        #trg: n x seq_length
        #teacher_forcing_ratio is probability to use teacher forcing
        batch_size = src.shape[0]
        trg_length = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(batch_size, trg_length, trg_vocab_size).to(self.device)
        #outputs: n x trg_seq_length x output_dim
        hidden, cell = self.encoder(src)
        #hidden: n x num_layers x hidden_dim
        #cell: n x num_layers x hidden_dim
        #first input to the decoder is the <sos> token
        input = trg[:,0]
        #input: n
        for t in range(1, trg_length):
            output, hidden, cell = self.decoder(input, hidden, cell)
            #output: n x output_dim
            #hidden: n x num_layers x hidden_dim
            #cell: n x num_layers x hidden_dim
            outputs[:,t,:] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[:,t] if teacher_force else top1
            #input: n
        return outputs

In [970]:
input_dim = len(en_vocab)
output_dim = len(vi_vocab)
encoder_embedding_dim = 256
decoder_embedding_dim = 256
hidden_dim = 512
n_layers = 2
encoder_dropout = 0.5
decoder_dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = Encoder(
    input_dim,
    encoder_embedding_dim,
    hidden_dim,
    n_layers,
    encoder_dropout,
)

decoder = Decoder(
    output_dim,
    decoder_embedding_dim,
    hidden_dim,
    n_layers,
    decoder_dropout,
)

model = Seq2Seq(encoder, decoder, device).to(device)

In [971]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(2188, 256)
    (rnn): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(2065, 256)
    (rnn): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=2065, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [972]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 9,504,529 trainable parameters


In [973]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

In [974]:
def train_fn(model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):
        src = batch['en_ids'].to(device)
        trg = batch['vi_ids'].to(device)
        #src: n x src_seq_length
        #trg: n x trg_seq_length
        optimizer.zero_grad()
        output = model(src, trg, teacher_forcing_ratio)
        #output: n x trg_seq_length x trg_vocab_size
        output_dim = output.shape[-1]
        output = output[:,1:,].reshape(-1,output_dim)
        #output: (n * trg_seq_length - 1) x trg_vocab_size
        trg = trg[:,1:].reshape(-1)
        #trg: n x trg_seq_length-1
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(data_loader)


In [975]:
def evaluate_fn(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            src = batch['en_ids'].to(device)
            trg = batch['vi_ids'].to(device)
            #src: n x src_seq_length
            #trg: n x trg_seq_length
            output = model(src, trg, 0)
            output_dim = output.shape[-1]
            output = output[:,1:,].reshape(-1,output_dim)
            #output: n x trg_seq_legth - 1 x trg_vocab_size
            trg = trg[:,1:].reshape(-1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

In [976]:

n_epochs = 50
clip = 2.0
teacher_forcing_ratio = 0.5

best_valid_loss = float("inf")

for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_fn(
        model,
        train_data_loader,
        optimizer,
        criterion,
        clip,
        teacher_forcing_ratio,
        device,
    )
    valid_loss = evaluate_fn(
        model,
        valid_data_loader,
        criterion,
        device,
    )
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "tut1-model.pt")
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

  2%|▏         | 1/50 [00:08<06:35,  8.08s/it]

	Train Loss:   5.290 | Train PPL: 198.310
	Valid Loss:   4.938 | Valid PPL: 139.508


  4%|▍         | 2/50 [00:16<06:23,  7.99s/it]

	Train Loss:   4.941 | Train PPL: 139.926
	Valid Loss:   4.968 | Valid PPL: 143.710


  6%|▌         | 3/50 [00:23<06:13,  7.94s/it]

	Train Loss:   4.873 | Train PPL: 130.696
	Valid Loss:   4.964 | Valid PPL: 143.173


  8%|▊         | 4/50 [00:31<06:04,  7.92s/it]

	Train Loss:   4.779 | Train PPL: 119.037
	Valid Loss:   4.866 | Valid PPL: 129.803


 10%|█         | 5/50 [00:41<06:17,  8.40s/it]

	Train Loss:   4.657 | Train PPL: 105.303
	Valid Loss:   4.819 | Valid PPL: 123.840


 12%|█▏        | 6/50 [00:50<06:27,  8.80s/it]

	Train Loss:   4.539 | Train PPL:  93.630
	Valid Loss:   4.796 | Valid PPL: 121.062


 14%|█▍        | 7/50 [01:00<06:27,  9.02s/it]

	Train Loss:   4.417 | Train PPL:  82.852
	Valid Loss:   4.715 | Valid PPL: 111.652


 16%|█▌        | 8/50 [01:09<06:21,  9.09s/it]

	Train Loss:   4.298 | Train PPL:  73.582
	Valid Loss:   4.678 | Valid PPL: 107.568


 18%|█▊        | 9/50 [01:18<06:15,  9.15s/it]

	Train Loss:   4.215 | Train PPL:  67.669
	Valid Loss:   4.629 | Valid PPL: 102.375


 20%|██        | 10/50 [01:27<06:08,  9.21s/it]

	Train Loss:   4.115 | Train PPL:  61.272
	Valid Loss:   4.607 | Valid PPL: 100.183


 22%|██▏       | 11/50 [01:37<06:01,  9.27s/it]

	Train Loss:   4.030 | Train PPL:  56.271
	Valid Loss:   4.592 | Valid PPL:  98.654


 24%|██▍       | 12/50 [01:46<05:53,  9.32s/it]

	Train Loss:   3.955 | Train PPL:  52.177
	Valid Loss:   4.554 | Valid PPL:  94.984


 26%|██▌       | 13/50 [01:56<05:46,  9.35s/it]

	Train Loss:   3.843 | Train PPL:  46.665
	Valid Loss:   4.527 | Valid PPL:  92.449


 28%|██▊       | 14/50 [02:05<05:35,  9.32s/it]

	Train Loss:   3.745 | Train PPL:  42.330
	Valid Loss:   4.483 | Valid PPL:  88.458


 30%|███       | 15/50 [02:14<05:27,  9.36s/it]

	Train Loss:   3.659 | Train PPL:  38.819
	Valid Loss:   4.391 | Valid PPL:  80.711


 32%|███▏      | 16/50 [02:24<05:15,  9.28s/it]

	Train Loss:   3.541 | Train PPL:  34.485
	Valid Loss:   4.400 | Valid PPL:  81.458


 34%|███▍      | 17/50 [02:33<05:06,  9.28s/it]

	Train Loss:   3.448 | Train PPL:  31.450
	Valid Loss:   4.354 | Valid PPL:  77.772


 36%|███▌      | 18/50 [02:42<04:57,  9.28s/it]

	Train Loss:   3.365 | Train PPL:  28.937
	Valid Loss:   4.306 | Valid PPL:  74.147


 38%|███▊      | 19/50 [02:51<04:47,  9.26s/it]

	Train Loss:   3.235 | Train PPL:  25.411
	Valid Loss:   4.293 | Valid PPL:  73.163


 40%|████      | 20/50 [03:00<04:36,  9.23s/it]

	Train Loss:   3.164 | Train PPL:  23.674
	Valid Loss:   4.299 | Valid PPL:  73.591


 42%|████▏     | 21/50 [03:10<04:26,  9.19s/it]

	Train Loss:   3.078 | Train PPL:  21.716
	Valid Loss:   4.243 | Valid PPL:  69.648


 44%|████▍     | 22/50 [03:19<04:17,  9.20s/it]

	Train Loss:   2.998 | Train PPL:  20.047
	Valid Loss:   4.195 | Valid PPL:  66.333


 46%|████▌     | 23/50 [03:28<04:11,  9.30s/it]

	Train Loss:   2.886 | Train PPL:  17.928
	Valid Loss:   4.150 | Valid PPL:  63.429


 48%|████▊     | 24/50 [03:38<04:01,  9.28s/it]

	Train Loss:   2.758 | Train PPL:  15.767
	Valid Loss:   4.200 | Valid PPL:  66.668


 50%|█████     | 25/50 [03:47<03:51,  9.26s/it]

	Train Loss:   2.747 | Train PPL:  15.589
	Valid Loss:   4.116 | Valid PPL:  61.315


 52%|█████▏    | 26/50 [03:56<03:41,  9.25s/it]

	Train Loss:   2.646 | Train PPL:  14.097
	Valid Loss:   4.138 | Valid PPL:  62.679


 54%|█████▍    | 27/50 [04:05<03:33,  9.27s/it]

	Train Loss:   2.550 | Train PPL:  12.801
	Valid Loss:   4.132 | Valid PPL:  62.288


 56%|█████▌    | 28/50 [04:15<03:24,  9.30s/it]

	Train Loss:   2.447 | Train PPL:  11.551
	Valid Loss:   4.078 | Valid PPL:  59.003


 58%|█████▊    | 29/50 [04:24<03:15,  9.29s/it]

	Train Loss:   2.358 | Train PPL:  10.567
	Valid Loss:   4.101 | Valid PPL:  60.422


 60%|██████    | 30/50 [04:33<03:05,  9.28s/it]

	Train Loss:   2.291 | Train PPL:   9.887
	Valid Loss:   4.091 | Valid PPL:  59.825


 62%|██████▏   | 31/50 [04:42<02:56,  9.28s/it]

	Train Loss:   2.229 | Train PPL:   9.294
	Valid Loss:   4.097 | Valid PPL:  60.188


 64%|██████▍   | 32/50 [04:52<02:47,  9.32s/it]

	Train Loss:   2.100 | Train PPL:   8.163
	Valid Loss:   4.072 | Valid PPL:  58.684


 66%|██████▌   | 33/50 [05:01<02:39,  9.36s/it]

	Train Loss:   2.064 | Train PPL:   7.874
	Valid Loss:   4.088 | Valid PPL:  59.640


 68%|██████▊   | 34/50 [05:11<02:28,  9.31s/it]

	Train Loss:   1.938 | Train PPL:   6.943
	Valid Loss:   4.114 | Valid PPL:  61.207


 70%|███████   | 35/50 [05:20<02:19,  9.27s/it]

	Train Loss:   1.866 | Train PPL:   6.464
	Valid Loss:   4.075 | Valid PPL:  58.875


 72%|███████▏  | 36/50 [05:29<02:09,  9.28s/it]

	Train Loss:   1.824 | Train PPL:   6.196
	Valid Loss:   4.052 | Valid PPL:  57.528


 74%|███████▍  | 37/50 [05:38<02:00,  9.28s/it]

	Train Loss:   1.743 | Train PPL:   5.715
	Valid Loss:   4.068 | Valid PPL:  58.457


 76%|███████▌  | 38/50 [05:48<01:51,  9.29s/it]

	Train Loss:   1.677 | Train PPL:   5.351
	Valid Loss:   4.076 | Valid PPL:  58.925


 78%|███████▊  | 39/50 [05:57<01:42,  9.27s/it]

	Train Loss:   1.612 | Train PPL:   5.015
	Valid Loss:   4.067 | Valid PPL:  58.390


 80%|████████  | 40/50 [06:06<01:32,  9.27s/it]

	Train Loss:   1.563 | Train PPL:   4.772
	Valid Loss:   4.073 | Valid PPL:  58.716


 82%|████████▏ | 41/50 [06:16<01:24,  9.34s/it]

	Train Loss:   1.459 | Train PPL:   4.303
	Valid Loss:   4.097 | Valid PPL:  60.176


 84%|████████▍ | 42/50 [06:25<01:14,  9.35s/it]

	Train Loss:   1.404 | Train PPL:   4.071
	Valid Loss:   4.133 | Valid PPL:  62.395


 86%|████████▌ | 43/50 [06:34<01:05,  9.34s/it]

	Train Loss:   1.386 | Train PPL:   3.998
	Valid Loss:   4.109 | Valid PPL:  60.889


 88%|████████▊ | 44/50 [06:44<00:56,  9.36s/it]

	Train Loss:   1.299 | Train PPL:   3.665
	Valid Loss:   4.143 | Valid PPL:  63.007


 90%|█████████ | 45/50 [06:53<00:46,  9.33s/it]

	Train Loss:   1.267 | Train PPL:   3.550
	Valid Loss:   4.143 | Valid PPL:  62.993


 92%|█████████▏| 46/50 [07:02<00:37,  9.35s/it]

	Train Loss:   1.139 | Train PPL:   3.124
	Valid Loss:   4.188 | Valid PPL:  65.880


 94%|█████████▍| 47/50 [07:12<00:27,  9.31s/it]

	Train Loss:   1.118 | Train PPL:   3.059
	Valid Loss:   4.180 | Valid PPL:  65.395


 96%|█████████▌| 48/50 [07:21<00:18,  9.28s/it]

	Train Loss:   1.077 | Train PPL:   2.935
	Valid Loss:   4.267 | Valid PPL:  71.289


 98%|█████████▊| 49/50 [07:30<00:09,  9.31s/it]

	Train Loss:   1.034 | Train PPL:   2.811
	Valid Loss:   4.197 | Valid PPL:  66.493


100%|██████████| 50/50 [07:39<00:00,  9.20s/it]

	Train Loss:   0.951 | Train PPL:   2.588
	Valid Loss:   4.283 | Valid PPL:  72.430





In [977]:
model.load_state_dict(torch.load("tut1-model.pt"))

test_loss = evaluate_fn(model, test_data_loader, criterion, device)

print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |")

| Test Loss: 4.115 | Test PPL:  61.265 |


In [1005]:
def translate_sentence(
    sentence,
    model,
    de_nlp,
    en_nlp,
    de_vocab,
    en_vocab,
    lower,
    sos_token,
    eos_token,
    device,
    max_output_length=25,
):
    model.eval()
    with torch.no_grad():
        if isinstance(sentence, str):
            tokens = [token.text for token in de_nlp.tokenizer(sentence)]
        else:
            tokens = [token for token in sentence]
        if lower:
            tokens = [token.lower() for token in tokens]
        tokens = [sos_token] + tokens + [eos_token]
        ids = de_vocab.lookup_indices(tokens)
        tensor = torch.LongTensor(ids).unsqueeze(0).to(device)
        hidden, cell = model.encoder(tensor)
        inputs = en_vocab.lookup_indices([sos_token])
        for _ in range(max_output_length):
            inputs_tensor = torch.LongTensor([inputs[-1]]).to(device)
            output, hidden, cell = model.decoder(inputs_tensor, hidden, cell)
            predicted_token = output.argmax(-1).item()
            inputs.append(predicted_token)
            if predicted_token == en_vocab[eos_token]:
                break
        tokens = en_vocab.lookup_tokens(inputs)
    return tokens

In [1014]:
translation = translate_sentence(
    sentence,
    model,
    en_nlp,
    vi_nlp,
    en_vocab,
    vi_vocab,
    lower,
    sos_token,
    eos_token,
    device,
)

In [1015]:
translation

['<sos>', 'tom', 'không', 'có', 'tiền', '.', '<eos>']

In [1017]:
for i in range(20):
    sentence = test_data[i]["en"]
    expected_translation = test_data[i]["vi"]
    translation = translate_sentence(
    sentence,
    model,
    en_nlp,
    vi_nlp,
    en_vocab,
    vi_vocab,
    lower,
    sos_token,
    eos_token,
    device,
    )
    print('----')
    print(f'Input: {sentence}')
    print(f'True: {expected_translation}')
    print(f'Pred: {translation}')

----
Input: I wish Tom wouldn't sing so loudly late at night.
True: Tôi mong sao Tom đừng hát quá to lúc đêm khuya.
Pred: ['<sos>', 'tôi', 'ước', 'là', 'tom', 'không', 'không', 'chơi', 'úc', 'với', 'mùa', 'hè', '.', '<eos>']
----
Input: I went for a walk to get some air.
True: Tôi đã đi dạo để có chút không khí.
Pred: ['<sos>', 'tôi', 'đã', 'dậy', 'dậy', 'qua', 'khi', 'trời', 'bắt_đầu', '.', '.', '<eos>']
----
Input: Her book is very interesting.
True: Cuốn sách của cô ấy rất thú vị.
Pred: ['<sos>', 'chồng', 'của', 'cô', 'ấy', 'là', 'một', 'đầu_bếp', 'xuất_sắc', '.', '<eos>']
----
Input: Tom doesn't eat enough fruit.
True: Tom không ăn đủ trái cây.
Pred: ['<sos>', 'tom', 'không', 'có', 'tiền', '.', '<eos>']
----
Input: If I'd known Tom was in Boston, I'd have told you.
True: Lúc đó nếu tôi biết là Tom ở Boston thì tôi đã nói cho bạn biết rồi.
Pred: ['<sos>', 'nếu', 'tôi', 'biết', 'là', 'tom', ',', 'nhưng', 'tôi', ',', 'tôi', 'không', 'bao_giờ', 'gặp', 'tom', '.', '<eos>']
----
Input: W