In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import spacy
import torchtext
import tqdm
import random
from spacy.lang.vi import Vietnamese
from spacy.lang.en import English
from torch.utils.data import Dataset, random_split
from torchtext.vocab import build_vocab_from_iterator

In [2]:
# Seed every random number generator this notebook touches (Python's `random`,
# NumPy, and PyTorch on CPU and CUDA) so that reruns are repeatable.
seed = 1234

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [3]:
def load_data(path):
    """Read a tab-separated parallel corpus into a list of sentence pairs.

    Every line is expected to carry the English sentence in its first column
    and the Vietnamese sentence in its second column; any further columns are
    ignored.

    Args:
        path: Location of the text file to read.

    Returns:
        A list of ``{'vi': <vietnamese>, 'en': <english>}`` dicts, one per
        well-formed line, in file order.
    """
    data = []
    # Decode explicitly as UTF-8: the file contains Vietnamese text, and relying
    # on the platform's default encoding makes the result machine-dependent.
    with open(path, 'r', encoding='utf-8') as file:
        # Iterate lazily instead of materializing every line with readlines().
        for line in file:
            # Drop the line terminator so it cannot leak into the last column.
            columns = line.rstrip('\r\n').split('\t')
            if len(columns) < 2:
                # Blank or malformed line (e.g. a trailing newline at EOF).
                continue
            eng, vi = columns[0], columns[1]
            data.append({'vi': vi, 'en': eng})
    return data

In [4]:
class CustomDataset(Dataset):
    """Thin ``Dataset`` wrapper around an in-memory, indexable collection."""

    def __init__(self, data):
        # Only a reference is stored; the samples are not copied.
        self.data = data

    def __getitem__(self, index):
        """Return the sample stored at position ``index``."""
        sample = self.data[index]
        return sample

    def __len__(self):
        """Return how many samples the wrapped collection holds."""
        n_samples = len(self.data)
        return n_samples

In [5]:
# Load the English-Vietnamese sentence pairs from the Kaggle input directory.
dataset = CustomDataset(load_data('/kaggle/input/languagedata/data/vie.txt'))

In [6]:
# 8:1:1 train/validation/test split; the test set absorbs the rounding remainder.
total_samples = len(dataset)
train_size = int(0.8 * total_samples)
val_size = int(0.1 * total_samples)
test_size = total_samples - train_size - val_size

In [7]:
# Randomly partition the dataset into three subsets of the sizes computed above
# and report how many samples each one received.
train_data, valid_data, test_data = random_split(dataset, [train_size, val_size, test_size])
print("Số lượng mẫu trong tập train:", len(train_data))
print("Số lượng mẫu trong tập validation:", len(valid_data))
print("Số lượng mẫu trong tập test:", len(test_data))

Số lượng mẫu trong tập train: 7542
Số lượng mẫu trong tập validation: 942
Số lượng mẫu trong tập test: 944


In [8]:
# Peek at one raw training pair.
train_data[0]

{'vi': 'Bạn thực sự muốn mặc cái đó sao?',
 'en': 'Do you really want to wear that?'}

### Tokenizer

In [9]:
# Install pyvi at runtime.
# NOTE(review): presumably required by spaCy's Vietnamese tokenizer used below — confirm,
# and consider pinning the version so reruns install the same release.
!pip install pyvi

Collecting pyvi
  Downloading pyvi-0.1.1-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting sklearn-crfsuite (from pyvi)
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting python-crfsuite>=0.8.3 (from sklearn-crfsuite->pyvi)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading pyvi-0.1.1-py2.py3-none-any.whl (8.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m65.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite, pyvi
Successfully installed python-crfsuite-0.9.10 pyvi-0.1.1 sklearn-crfsui

In [10]:
# Blank spaCy language objects; only their `.tokenizer` is used in this notebook.
en_nlp = English()
vi_nlp = Vietnamese()

In [11]:
# Sanity-check the English tokenizer on a sample sentence.
string = "What a lovely day it is today!"
[token.text for token in en_nlp.tokenizer(string)]

['What', 'a', 'lovely', 'day', 'it', 'is', 'today', '!']

In [12]:
def tokenize_example(example, en_nlp, vi_nlp, max_length, lower, sos_token, eos_token):
    """Tokenize both sides of a sentence pair and store the tokens on the example.

    Args:
        example: Mapping with the raw sentences under ``"en"`` and ``"vi"``.
            It is modified in place.
        en_nlp: Object whose ``tokenizer`` splits the English sentence.
        vi_nlp: Object whose ``tokenizer`` splits the Vietnamese sentence.
        max_length: Maximum number of tokens kept per sentence; the limit is
            applied before the boundary markers are added.
        lower: Whether to lowercase every token.
        sos_token: Marker prepended to each token list.
        eos_token: Marker appended to each token list.

    Returns:
        The same ``example`` object, now also holding ``"en_tokens"`` and
        ``"vi_tokens"``.
    """
    sides = (
        ("en", "en_tokens", en_nlp),
        ("vi", "vi_tokens", vi_nlp),
    )
    tokenized = {}
    for text_key, tokens_key, nlp in sides:
        words = [tok.text for tok in nlp.tokenizer(example[text_key])]
        words = words[:max_length]
        if lower:
            words = [word.lower() for word in words]
        tokenized[tokens_key] = [sos_token, *words, eos_token]
    # Write both results only after both sides were tokenized successfully.
    example.update(tokenized)
    return example

In [13]:
# Tokenization settings applied identically to every split.
# max_length caps the tokens kept per sentence (before <sos>/<eos> are added).
max_length = 1_000
lower = True
sos_token = "<sos>"
eos_token = "<eos>"

fn_kwargs = {
    "en_nlp": en_nlp,
    "vi_nlp": vi_nlp,
    "max_length": max_length,
    "lower": lower,
    "sos_token": sos_token,
    "eos_token": eos_token,
}
# Tokenize each split; every split becomes a plain list of example dicts.
train_data = [tokenize_example(example, **fn_kwargs) for example in train_data]
valid_data = [tokenize_example(example, **fn_kwargs) for example in valid_data]
test_data = [tokenize_example(example, **fn_kwargs) for example in test_data]

In [14]:
# Inspect the first training example after tokenization.
train_data[0]

{'vi': 'Bạn thực sự muốn mặc cái đó sao?',
 'en': 'Do you really want to wear that?',
 'en_tokens': ['<sos>',
  'do',
  'you',
  'really',
  'want',
  'to',
  'wear',
  'that',
  '?',
  '<eos>'],
 'vi_tokens': ['<sos>',
  'bạn',
  'thực sự',
  'muốn',
  'mặc',
  'cái',
  'đó',
  'sao',
  '?',
  '<eos>']}

In [15]:
def yield_tokens(data, s):
    """Lazily yield the value stored under key ``s`` for each record in ``data``."""
    yield from (record[s] for record in data)

In [16]:
# Minimum frequency passed to the vocabulary builders below.
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"

# Special tokens handed to both vocabularies, in this order.
special_tokens = [
    unk_token,
    pad_token,
    sos_token,
    eos_token,
]

# Both vocabularies are built from the training split only.
en_vocab = torchtext.vocab.build_vocab_from_iterator(
    yield_tokens(train_data,'en_tokens'),
    min_freq=min_freq,
    specials=special_tokens,
)

vi_vocab = torchtext.vocab.build_vocab_from_iterator(
    yield_tokens(train_data,'vi_tokens'),
    min_freq=min_freq,
    specials=special_tokens,
)

In [17]:
# First ten entries of the English index-to-token table.
en_vocab.get_itos()[:10]

['<unk>', '<pad>', '<sos>', '<eos>', '.', 'i', 'to', 'tom', 'you', 'the']

In [18]:
# Reverse direction: look up the index assigned to one token.
en_vocab.get_stoi()["the"]

9

In [19]:
# Both vocabularies must agree on the <unk> and <pad> indices, because a single
# unk_index / pad_index pair is used for both languages from here on.
assert en_vocab[unk_token] == vi_vocab[unk_token]
assert en_vocab[pad_token] == vi_vocab[pad_token]

unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

In [20]:
# Make lookups of out-of-vocabulary tokens fall back to the <unk> index.
en_vocab.set_default_index(unk_index)
vi_vocab.set_default_index(unk_index)

In [21]:
# Numericalize a sample sentence; tokens missing from the vocabulary get unk_index.
tokens = ["i", "love", "watching", "crime", "shows"]
en_vocab.lookup_indices(tokens)

[5, 173, 509, 0, 0]

In [22]:
# Round-trip the ids back to tokens to see which words collapsed to <unk>.
en_vocab.lookup_tokens(en_vocab.lookup_indices(tokens))

['i', 'love', 'watching', '<unk>', '<unk>']

In [23]:
def numericalize_example(example, en_vocab, vi_vocab):
    """Convert the token lists of an example to vocabulary indices.

    Args:
        example: Mapping holding ``"en_tokens"`` and ``"vi_tokens"``.
            It is modified in place.
        en_vocab: Vocabulary used to look up the English tokens.
        vi_vocab: Vocabulary used to look up the Vietnamese tokens.

    Returns:
        The same ``example`` object, now also holding ``"en_ids"`` and
        ``"vi_ids"``.
    """
    example.update(
        en_ids=en_vocab.lookup_indices(example["en_tokens"]),
        vi_ids=vi_vocab.lookup_indices(example["vi_tokens"]),
    )
    return example

In [24]:
# Add id sequences to every example of every split.
fn_kwargs = {"en_vocab": en_vocab, "vi_vocab": vi_vocab}
train_data = [numericalize_example(example, **fn_kwargs) for example in train_data]
valid_data = [numericalize_example(example, **fn_kwargs) for example in valid_data]
test_data = [numericalize_example(example, **fn_kwargs) for example in test_data]

In [25]:
# The first training example now also carries its id sequences.
train_data[0]

{'vi': 'Bạn thực sự muốn mặc cái đó sao?',
 'en': 'Do you really want to wear that?',
 'en_tokens': ['<sos>',
  'do',
  'you',
  'really',
  'want',
  'to',
  'wear',
  'that',
  '?',
  '<eos>'],
 'vi_tokens': ['<sos>',
  'bạn',
  'thực sự',
  'muốn',
  'mặc',
  'cái',
  'đó',
  'sao',
  '?',
  '<eos>'],
 'en_ids': [2, 14, 8, 88, 37, 6, 431, 15, 10, 3],
 'vi_ids': [2, 8, 184, 30, 281, 34, 15, 97, 11, 3]}

In [26]:
# Decode the stored ids back to tokens as a sanity check.
en_vocab.lookup_tokens(train_data[0]["en_ids"])

['<sos>', 'do', 'you', 'really', 'want', 'to', 'wear', 'that', '?', '<eos>']

In [27]:
def to_tensor(example):
    """Replace the id sequences of an example with int64 tensors.

    Args:
        example: Mapping holding ``"en_ids"`` and ``"vi_ids"``. It is modified
            in place.

    Returns:
        The same ``example`` object with both id entries converted to
        ``torch.int64`` tensors.
    """
    for ids_key in ("en_ids", "vi_ids"):
        id_array = np.array(example[ids_key])
        example[ids_key] = torch.tensor(id_array, dtype=torch.int64)
    return example

In [28]:
# Convert the id lists of every split to tensors.
train_data = [to_tensor(example) for example in train_data]
valid_data = [to_tensor(example) for example in valid_data]
test_data = [to_tensor(example) for example in test_data]

In [29]:
# Confirm the id sequences are now tensors.
type(train_data[0]["en_ids"])

torch.Tensor

In [30]:
def get_collate_fn(pad_index):
    """Build a ``collate_fn`` that pads variable-length id sequences into a batch.

    Args:
        pad_index: Id used to right-pad the shorter sequences of a batch.

    Returns:
        A function that takes a list of examples (each holding 1-D ``"en_ids"``
        and ``"vi_ids"`` tensors) and returns a dict with the same two keys,
        each a tensor of shape ``n x longest_seq_length``.
    """
    def collate_fn(batch):
        batch_en_ids = [example["en_ids"] for example in batch]
        batch_vi_ids = [example["vi_ids"] for example in batch]
        # Pad directly into batch-major layout. This gives the same values as
        # padding time-major and transposing, but the result is contiguous
        # instead of a transposed view.
        return {
            "en_ids": nn.utils.rnn.pad_sequence(
                batch_en_ids, batch_first=True, padding_value=pad_index
            ),
            "vi_ids": nn.utils.rnn.pad_sequence(
                batch_vi_ids, batch_first=True, padding_value=pad_index
            ),
        }

    return collate_fn

In [31]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    """Create a ``DataLoader`` that pads each batch with ``pad_index``.

    Args:
        dataset: Collection of examples to draw batches from.
        batch_size: Number of examples per batch.
        pad_index: Padding id forwarded to ``get_collate_fn``.
        shuffle: Whether the loader reshuffles the data; off by default.

    Returns:
        The configured ``torch.utils.data.DataLoader``.
    """
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=get_collate_fn(pad_index),
    )

In [32]:
# NOTE(review): `a` is not referenced anywhere else in the visible part of the
# notebook — looks like a leftover scratch loader; confirm before removing.
a = get_data_loader(train_data, 128, pad_index, shuffle=True)

In [33]:
# One loader per split; only the training loader shuffles.
batch_size = 128
train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)

In [34]:
class Encoder(nn.Module):
    """Embed source token ids and run them through a multi-layer LSTM.

    Only the final hidden and cell states are returned; the per-step LSTM
    outputs are discarded.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        """
        Args:
            input_dim: Number of rows of the embedding table (source vocabulary size).
            embedding_dim: Size of each token embedding.
            hidden_dim: Size of the LSTM hidden state.
            n_layers: Number of stacked LSTM layers.
            dropout: Dropout probability, used on the embeddings and between
                LSTM layers.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
            batch_first=True,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode a batch of id sequences and return ``(hidden, cell)``."""
        # src: n x seq_length
        token_vectors = self.embedding(src)
        # token_vectors: n x seq_length x embedding_dim
        rnn_input = self.dropout(token_vectors)
        _, final_state = self.rnn(rnn_input)
        hidden, cell = final_state
        # hidden: num_layers x n x hidden_dim
        # cell: num_layers x n x hidden_dim
        # (batch_first only changes the layout of the LSTM input/output, not of its states)
        return hidden, cell

In [35]:
class Decoder(nn.Module):
    """Single-step LSTM decoder that turns one token per sequence into vocabulary scores."""

    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        """
        Args:
            output_dim: Number of rows of the embedding table and number of
                output scores per step (target vocabulary size).
            embedding_dim: Size of each token embedding.
            hidden_dim: Size of the LSTM hidden state.
            n_layers: Number of stacked LSTM layers.
            dropout: Dropout probability, used on the embeddings and between
                LSTM layers.
        """
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
            batch_first=True,
        )
        self.fc_out = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Advance the decoder by one time step.

        Returns ``(prediction, hidden, cell)`` where ``prediction`` holds one
        score per vocabulary entry and the states are the updated LSTM states.
        """
        # input: n
        # hidden: num_layers x n x hidden_dim
        # cell: num_layers x n x hidden_dim
        step_tokens = input.unsqueeze(1)
        # step_tokens: n x 1 (a length-one sequence per batch element)
        embedded = self.dropout(self.embedding(step_tokens))
        # embedded: n x 1 x embedding_dim
        previous_state = (hidden, cell)
        output, (hidden, cell) = self.rnn(embedded, previous_state)
        # output: n x 1 x hidden_dim; the states keep their num_layers x n x hidden_dim shape
        step_features = output.squeeze(dim=1)
        # step_features: n x hidden_dim
        prediction = self.fc_out(step_features)
        # prediction: n x output_dim
        return prediction, hidden, cell

In [36]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with optional teacher forcing."""

    def __init__(self, encoder, decoder, device):
        """
        Args:
            encoder: Module returning ``(hidden, cell)`` for a source batch.
            decoder: Module mapping ``(input, hidden, cell)`` to
                ``(prediction, hidden, cell)``; must expose ``output_dim``.
            device: Device on which the output buffer is allocated.

        Raises:
            AssertionError: If encoder and decoder disagree on ``hidden_dim``
                or ``n_layers``.
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        # The encoder's final states are fed straight into the decoder, so
        # their dimensions have to line up.
        assert encoder.hidden_dim == decoder.hidden_dim, \
            "Hidden dimensions of encoder and decoder must be equal"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers"

    def forward(self, src, trg, teacher_forcing_ratio):
        """Produce decoder scores for every target position except the first.

        Args:
            src: Source ids, n x src_seq_length.
            trg: Target ids, n x trg_seq_length.
            teacher_forcing_ratio: Probability, drawn once per time step for
                the whole batch, of feeding the ground-truth token instead of
                the model's own prediction as the next decoder input.

        Returns:
            Tensor of shape n x trg_seq_length x output_dim; position 0 along
            the sequence axis is left at zero.
        """
        batch_size, trg_length = src.shape[0], trg.shape[1]
        outputs = torch.zeros(
            batch_size, trg_length, self.decoder.output_dim, device=self.device
        )
        # Final encoder states initialize the decoder.
        hidden, cell = self.encoder(src)
        # Decoding starts from the first target token of each sequence (<sos>).
        decoder_input = trg[:, 0]
        for t in range(1, trg_length):
            step_scores, hidden, cell = self.decoder(decoder_input, hidden, cell)
            # step_scores: n x output_dim
            outputs[:, t, :] = step_scores
            use_ground_truth = random.random() < teacher_forcing_ratio
            greedy_tokens = step_scores.argmax(1)
            decoder_input = trg[:, t] if use_ground_truth else greedy_tokens
        return outputs

In [37]:
# Model hyperparameters: the encoder reads English ids, the decoder emits Vietnamese ids.
input_dim = len(en_vocab)
output_dim = len(vi_vocab)
encoder_embedding_dim = 256
decoder_embedding_dim = 256
hidden_dim = 512
n_layers = 2
encoder_dropout = 0.5
decoder_dropout = 0.5
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = Encoder(
    input_dim,
    encoder_embedding_dim,
    hidden_dim,
    n_layers,
    encoder_dropout,
)

decoder = Decoder(
    output_dim,
    decoder_embedding_dim,
    hidden_dim,
    n_layers,
    decoder_dropout,
)

model = Seq2Seq(encoder, decoder, device).to(device)

In [38]:
def init_weights(m):
    """Fill every parameter reachable from ``m`` with samples from U(-0.08, 0.08).

    The parameter walk is recursive, so a direct call on a container
    initializes all nested parameters. When the function is passed to
    ``nn.Module.apply`` (which calls it once per submodule), nested parameters
    are therefore redrawn at each enclosing level; their final values still
    come from the same distribution.
    """
    low, high = -0.08, 0.08
    for param in m.parameters():
        nn.init.uniform_(param.data, low, high)
# Run the initializer over the model and all of its submodules.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(2187, 256)
    (rnn): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(2065, 256)
    (rnn): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=2065, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [39]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        if not param.requires_grad:
            continue
        total += param.numel()
    return total
# Report the trainable-parameter count with thousands separators.
print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 9,504,273 trainable parameters


In [40]:
# Adam with its default settings; target positions equal to pad_index are
# excluded from the loss through ignore_index.
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

In [41]:
def train_fn(model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device):
    """Train the model for one pass over ``data_loader``.

    Args:
        model: Callable as ``model(src, trg, teacher_forcing_ratio)`` and
            returning scores of shape n x trg_seq_length x trg_vocab_size.
        data_loader: Iterable of batches with ``'en_ids'`` (source) and
            ``'vi_ids'`` (target) tensors; must support ``len()``.
        optimizer: Optimizer updating the model's parameters.
        criterion: Loss taking flattened scores and flattened target ids.
        clip: Maximum gradient norm allowed before the optimizer step.
        teacher_forcing_ratio: Forwarded unchanged to the model.
        device: Device the batch tensors are moved to.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    batch_losses = []
    for batch in data_loader:
        src = batch['en_ids'].to(device)  # n x src_seq_length
        trg = batch['vi_ids'].to(device)  # n x trg_seq_length
        optimizer.zero_grad()
        scores = model(src, trg, teacher_forcing_ratio)
        # scores: n x trg_seq_length x trg_vocab_size
        vocab_size = scores.shape[-1]
        # Position 0 is skipped on both sides, then batch and time are
        # flattened together for the criterion.
        flat_scores = scores[:, 1:].reshape(-1, vocab_size)
        # flat_scores: (n * (trg_seq_length - 1)) x trg_vocab_size
        flat_targets = trg[:, 1:].reshape(-1)
        # flat_targets: n * (trg_seq_length - 1)
        loss = criterion(flat_scores, flat_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        batch_losses.append(loss.item())
    return sum(batch_losses) / len(data_loader)


In [42]:
def evaluate_fn(model, data_loader, criterion, device):
    """Compute the mean per-batch loss over ``data_loader`` without updating the model.

    The model is switched to eval mode, gradients are disabled, and the model
    is called with a teacher forcing ratio of 0.

    Args:
        model: Callable as ``model(src, trg, teacher_forcing_ratio)``.
        data_loader: Iterable of batches with ``'en_ids'`` (source) and
            ``'vi_ids'`` (target) tensors; must support ``len()``.
        criterion: Loss taking flattened scores and flattened target ids.
        device: Device the batch tensors are moved to.

    Returns:
        The mean of the per-batch losses.
    """
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            src = batch['en_ids'].to(device)  # n x src_seq_length
            trg = batch['vi_ids'].to(device)  # n x trg_seq_length
            scores = model(src, trg, 0)
            vocab_size = scores.shape[-1]
            # Skip position 0 on both sides and flatten batch and time together.
            flat_scores = scores[:, 1:].reshape(-1, vocab_size)
            # flat_scores: (n * (trg_seq_length - 1)) x trg_vocab_size
            flat_targets = trg[:, 1:].reshape(-1)
            total_loss += criterion(flat_scores, flat_targets).item()
    return total_loss / len(data_loader)

In [43]:

# Training schedule: number of epochs, gradient-norm clipping threshold, and
# the teacher forcing probability used during training.
n_epochs = 50
clip = 2.0
teacher_forcing_ratio = 0.5

# Lowest validation loss seen so far; decides when a checkpoint is written.
best_valid_loss = float("inf")

for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_fn(
        model,
        train_data_loader,
        optimizer,
        criterion,
        clip,
        teacher_forcing_ratio,
        device,
    )
    valid_loss = evaluate_fn(
        model,
        valid_data_loader,
        criterion,
        device,
    )
    # Overwrite the checkpoint only when the validation loss improves, so the
    # file always holds the best-validating weights.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "tut1-model.pt")
    # Perplexity is reported as exp(loss).
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

  2%|▏         | 1/50 [00:04<04:00,  4.91s/it]

	Train Loss:   5.289 | Train PPL: 198.229
	Valid Loss:   4.955 | Valid PPL: 141.899


  4%|▍         | 2/50 [00:08<03:29,  4.37s/it]

	Train Loss:   4.949 | Train PPL: 140.983
	Valid Loss:   4.942 | Valid PPL: 140.097


  6%|▌         | 3/50 [00:12<03:15,  4.17s/it]

	Train Loss:   4.862 | Train PPL: 129.346
	Valid Loss:   4.978 | Valid PPL: 145.127


  8%|▊         | 4/50 [00:16<03:06,  4.05s/it]

	Train Loss:   4.803 | Train PPL: 121.861
	Valid Loss:   4.973 | Valid PPL: 144.403


 10%|█         | 5/50 [00:20<03:00,  4.02s/it]

	Train Loss:   4.728 | Train PPL: 113.040
	Valid Loss:   4.872 | Valid PPL: 130.552


 12%|█▏        | 6/50 [00:24<02:56,  4.01s/it]

	Train Loss:   4.604 | Train PPL:  99.932
	Valid Loss:   4.798 | Valid PPL: 121.275


 14%|█▍        | 7/50 [00:28<02:51,  3.98s/it]

	Train Loss:   4.495 | Train PPL:  89.566
	Valid Loss:   4.740 | Valid PPL: 114.425


 16%|█▌        | 8/50 [00:32<02:46,  3.98s/it]

	Train Loss:   4.360 | Train PPL:  78.253
	Valid Loss:   4.680 | Valid PPL: 107.813


 18%|█▊        | 9/50 [00:36<02:42,  3.96s/it]

	Train Loss:   4.271 | Train PPL:  71.574
	Valid Loss:   4.680 | Valid PPL: 107.802


 20%|██        | 10/50 [00:40<02:37,  3.95s/it]

	Train Loss:   4.202 | Train PPL:  66.851
	Valid Loss:   4.689 | Valid PPL: 108.744


 22%|██▏       | 11/50 [00:44<02:33,  3.95s/it]

	Train Loss:   4.106 | Train PPL:  60.728
	Valid Loss:   4.654 | Valid PPL: 104.983


 24%|██▍       | 12/50 [00:48<02:30,  3.95s/it]

	Train Loss:   4.034 | Train PPL:  56.501
	Valid Loss:   4.589 | Valid PPL:  98.402


 26%|██▌       | 13/50 [00:52<02:26,  3.96s/it]

	Train Loss:   3.922 | Train PPL:  50.524
	Valid Loss:   4.580 | Valid PPL:  97.562


 28%|██▊       | 14/50 [00:56<02:22,  3.95s/it]

	Train Loss:   3.844 | Train PPL:  46.721
	Valid Loss:   4.519 | Valid PPL:  91.786


 30%|███       | 15/50 [01:00<02:18,  3.94s/it]

	Train Loss:   3.695 | Train PPL:  40.257
	Valid Loss:   4.470 | Valid PPL:  87.392


 32%|███▏      | 16/50 [01:04<02:14,  3.95s/it]

	Train Loss:   3.649 | Train PPL:  38.438
	Valid Loss:   4.436 | Valid PPL:  84.408


 34%|███▍      | 17/50 [01:08<02:10,  3.94s/it]

	Train Loss:   3.513 | Train PPL:  33.538
	Valid Loss:   4.399 | Valid PPL:  81.370


 36%|███▌      | 18/50 [01:11<02:06,  3.94s/it]

	Train Loss:   3.451 | Train PPL:  31.518
	Valid Loss:   4.368 | Valid PPL:  78.857


 38%|███▊      | 19/50 [01:15<02:02,  3.94s/it]

	Train Loss:   3.316 | Train PPL:  27.560
	Valid Loss:   4.324 | Valid PPL:  75.454


 40%|████      | 20/50 [01:19<01:58,  3.94s/it]

	Train Loss:   3.224 | Train PPL:  25.127
	Valid Loss:   4.320 | Valid PPL:  75.177


 42%|████▏     | 21/50 [01:23<01:54,  3.96s/it]

	Train Loss:   3.091 | Train PPL:  22.000
	Valid Loss:   4.286 | Valid PPL:  72.659


 44%|████▍     | 22/50 [01:27<01:51,  3.99s/it]

	Train Loss:   3.052 | Train PPL:  21.162
	Valid Loss:   4.220 | Valid PPL:  68.067


 46%|████▌     | 23/50 [01:31<01:46,  3.94s/it]

	Train Loss:   2.924 | Train PPL:  18.619
	Valid Loss:   4.234 | Valid PPL:  68.968


 48%|████▊     | 24/50 [01:35<01:42,  3.93s/it]

	Train Loss:   2.829 | Train PPL:  16.924
	Valid Loss:   4.182 | Valid PPL:  65.484


 50%|█████     | 25/50 [01:39<01:38,  3.94s/it]

	Train Loss:   2.784 | Train PPL:  16.177
	Valid Loss:   4.164 | Valid PPL:  64.350


 52%|█████▏    | 26/50 [01:43<01:34,  3.93s/it]

	Train Loss:   2.652 | Train PPL:  14.184
	Valid Loss:   4.159 | Valid PPL:  63.999


 54%|█████▍    | 27/50 [01:47<01:30,  3.92s/it]

	Train Loss:   2.578 | Train PPL:  13.173
	Valid Loss:   4.119 | Valid PPL:  61.475


 56%|█████▌    | 28/50 [01:51<01:26,  3.94s/it]

	Train Loss:   2.473 | Train PPL:  11.856
	Valid Loss:   4.117 | Valid PPL:  61.345


 58%|█████▊    | 29/50 [01:55<01:22,  3.93s/it]

	Train Loss:   2.390 | Train PPL:  10.914
	Valid Loss:   4.100 | Valid PPL:  60.348


 60%|██████    | 30/50 [01:59<01:18,  3.93s/it]

	Train Loss:   2.348 | Train PPL:  10.468
	Valid Loss:   4.040 | Valid PPL:  56.849


 62%|██████▏   | 31/50 [02:03<01:14,  3.92s/it]

	Train Loss:   2.250 | Train PPL:   9.491
	Valid Loss:   4.059 | Valid PPL:  57.945


 64%|██████▍   | 32/50 [02:07<01:10,  3.92s/it]

	Train Loss:   2.130 | Train PPL:   8.413
	Valid Loss:   4.060 | Valid PPL:  57.965


 66%|██████▌   | 33/50 [02:10<01:06,  3.92s/it]

	Train Loss:   2.076 | Train PPL:   7.976
	Valid Loss:   4.005 | Valid PPL:  54.881


 68%|██████▊   | 34/50 [02:14<01:02,  3.90s/it]

	Train Loss:   1.937 | Train PPL:   6.937
	Valid Loss:   4.016 | Valid PPL:  55.478


 70%|███████   | 35/50 [02:18<00:58,  3.93s/it]

	Train Loss:   1.876 | Train PPL:   6.530
	Valid Loss:   4.083 | Valid PPL:  59.299


 72%|███████▏  | 36/50 [02:22<00:54,  3.90s/it]

	Train Loss:   1.819 | Train PPL:   6.163
	Valid Loss:   4.045 | Valid PPL:  57.131


 74%|███████▍  | 37/50 [02:26<00:50,  3.88s/it]

	Train Loss:   1.736 | Train PPL:   5.676
	Valid Loss:   4.013 | Valid PPL:  55.311


 76%|███████▌  | 38/50 [02:30<00:46,  3.88s/it]

	Train Loss:   1.649 | Train PPL:   5.199
	Valid Loss:   4.014 | Valid PPL:  55.382


 78%|███████▊  | 39/50 [02:34<00:42,  3.85s/it]

	Train Loss:   1.585 | Train PPL:   4.879
	Valid Loss:   4.017 | Valid PPL:  55.508


 80%|████████  | 40/50 [02:37<00:38,  3.85s/it]

	Train Loss:   1.543 | Train PPL:   4.680
	Valid Loss:   4.089 | Valid PPL:  59.680


 82%|████████▏ | 41/50 [02:41<00:34,  3.85s/it]

	Train Loss:   1.465 | Train PPL:   4.329
	Valid Loss:   4.051 | Valid PPL:  57.474


 84%|████████▍ | 42/50 [02:45<00:30,  3.86s/it]

	Train Loss:   1.396 | Train PPL:   4.040
	Valid Loss:   4.088 | Valid PPL:  59.645


 86%|████████▌ | 43/50 [02:49<00:27,  3.86s/it]

	Train Loss:   1.368 | Train PPL:   3.928
	Valid Loss:   4.096 | Valid PPL:  60.082


 88%|████████▊ | 44/50 [02:53<00:23,  3.85s/it]

	Train Loss:   1.257 | Train PPL:   3.514
	Valid Loss:   4.095 | Valid PPL:  60.065


 90%|█████████ | 45/50 [02:57<00:19,  3.87s/it]

	Train Loss:   1.216 | Train PPL:   3.372
	Valid Loss:   4.072 | Valid PPL:  58.678


 92%|█████████▏| 46/50 [03:01<00:15,  3.89s/it]

	Train Loss:   1.166 | Train PPL:   3.211
	Valid Loss:   4.110 | Valid PPL:  60.934


 94%|█████████▍| 47/50 [03:05<00:11,  3.89s/it]

	Train Loss:   1.087 | Train PPL:   2.964
	Valid Loss:   4.169 | Valid PPL:  64.667


 96%|█████████▌| 48/50 [03:09<00:07,  3.89s/it]

	Train Loss:   1.042 | Train PPL:   2.835
	Valid Loss:   4.165 | Valid PPL:  64.390


 98%|█████████▊| 49/50 [03:12<00:03,  3.88s/it]

	Train Loss:   0.987 | Train PPL:   2.684
	Valid Loss:   4.201 | Valid PPL:  66.780


100%|██████████| 50/50 [03:16<00:00,  3.93s/it]

	Train Loss:   0.928 | Train PPL:   2.529
	Valid Loss:   4.201 | Valid PPL:  66.756





In [44]:
# Restore the checkpoint written at the lowest validation loss and score it on
# the held-out test split.
model.load_state_dict(torch.load("tut1-model.pt"))

test_loss = evaluate_fn(model, test_data_loader, criterion, device)

print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |")

| Test Loss: 4.013 | Test PPL:  55.329 |


In [45]:
def translate_sentence(
    sentence,
    model,
    de_nlp,
    en_nlp,
    de_vocab,
    en_vocab,
    lower,
    sos_token,
    eos_token,
    device,
    max_output_length=25,
):
    """Greedily decode a translation of ``sentence``.

    The ``de_``/``en_`` prefixes do not name actual languages; they denote
    roles: ``de_*`` is the source side (the language of ``sentence``) and
    ``en_*`` is the target side (the language of the returned tokens).

    Args:
        sentence: Source text as a string, or an iterable of already-split tokens.
        model: Object exposing ``eval()``, ``encoder`` and ``decoder``.
        de_nlp: Object whose ``tokenizer`` splits the source sentence; only
            used when ``sentence`` is a string.
        en_nlp: Accepted for signature symmetry; not used by this function.
        de_vocab: Vocabulary used to numericalize the source tokens.
        en_vocab: Vocabulary used to seed the decoder, detect the end marker
            and map predicted ids back to tokens.
        lower: Whether to lowercase the source tokens.
        sos_token: Start marker added to the source and used as first decoder input.
        eos_token: End marker added to the source; decoding stops once it is predicted.
        device: Device the input tensors are moved to.
        max_output_length: Maximum number of decoder steps.

    Returns:
        The decoded target tokens, starting with ``sos_token`` and ending with
        ``eos_token`` if it was predicted within the step limit.
    """
    model.eval()
    with torch.no_grad():
        if isinstance(sentence, str):
            src_tokens = [tok.text for tok in de_nlp.tokenizer(sentence)]
        else:
            src_tokens = list(sentence)
        if lower:
            src_tokens = [tok.lower() for tok in src_tokens]
        src_tokens = [sos_token, *src_tokens, eos_token]
        src_ids = de_vocab.lookup_indices(src_tokens)
        # Add a leading batch dimension of size one.
        src_tensor = torch.LongTensor(src_ids).unsqueeze(0).to(device)
        hidden, cell = model.encoder(src_tensor)
        decoded_ids = en_vocab.lookup_indices([sos_token])
        for _ in range(max_output_length):
            # Feed back only the most recently produced id.
            last_id = torch.LongTensor([decoded_ids[-1]]).to(device)
            scores, hidden, cell = model.decoder(last_id, hidden, cell)
            next_id = scores.argmax(-1).item()
            decoded_ids.append(next_id)
            if next_id == en_vocab[eos_token]:
                break
        return en_vocab.lookup_tokens(decoded_ids)

In [58]:
# Free-form English sentence used to try the model.
sentence = "I don't understand what you say!!"

In [59]:
# Argument order matters: the source-side tokenizer and vocabulary (English)
# fill the `de_*` slots, the target-side ones (Vietnamese) fill the `en_*` slots.
translation = translate_sentence(
    sentence,
    model,
    en_nlp,
    vi_nlp,
    en_vocab,
    vi_vocab,
    lower,
    sos_token,
    eos_token,
    device,
)

In [60]:
# Show the decoded Vietnamese tokens.
translation

['<sos>', 'tôi', 'không', 'biết', 'là', 'bạn', 'có', '.', '.', '<eos>']

In [54]:
# Print source, reference translation and model output for the first 20 test examples.
for i in range(20):
    sentence = test_data[i]["en"]
    expected_translation = test_data[i]["vi"]
    # English fills the source-side (`de_*`) slots, Vietnamese the target-side (`en_*`) slots.
    translation = translate_sentence(
    sentence,
    model,
    en_nlp,
    vi_nlp,
    en_vocab,
    vi_vocab,
    lower,
    sos_token,
    eos_token,
    device,
    )
    print('----')
    print(f'Input: {sentence}')
    print(f'True: {expected_translation}')
    print(f'Pred: {translation}')

----
Input: I wish Tom wouldn't sing so loudly late at night.
True: Tôi mong sao Tom đừng hát quá to lúc đêm khuya.
Pred: ['<sos>', 'tôi', 'nghĩ', 'là', 'tom', 'sẽ', 'không', 'đi', 'đến', 'úc', 'vào', 'tuần', '.', '<eos>']
----
Input: I went for a walk to get some air.
True: Tôi đã đi dạo để có chút không khí.
Pred: ['<sos>', 'tôi', 'thường', 'ngủ', 'ngủ', 'vào', 'khoảng', 'phút', 'phút', '.', '<eos>']
----
Input: Her book is very interesting.
True: Cuốn sách của cô ấy rất thú vị.
Pred: ['<sos>', 'cô', 'ấy', 'đang', 'nấu', 'rất', 'nhiều', '.', '<eos>']
----
Input: Tom doesn't eat enough fruit.
True: Tom không ăn đủ trái cây.
Pred: ['<sos>', 'tom', 'không', 'có', 'vẻ', '.', '.', '<eos>']
----
Input: If I'd known Tom was in Boston, I'd have told you.
True: Lúc đó nếu tôi biết là Tom ở Boston thì tôi đã nói cho bạn biết rồi.
Pred: ['<sos>', 'nếu', 'tôi', 'biết', 'tom', 'tom', 'đi', 'đi', ',', ',', 'anh', 'ấy', 'đã', 'không', '.', '<eos>']
----
Input: We will vote to decide the winner.
Tru