In [119]:
!pip install pyvi torchsummary



In [120]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import spacy
import torchtext
import tqdm
import random
from spacy.lang.vi import Vietnamese
from spacy.lang.en import English
from torch.utils.data import Dataset, random_split
from torchtext.vocab import build_vocab_from_iterator
from torchsummary import summary

In [121]:
seed = 1234

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [122]:
def load_data(path):
    data = []
    with open(path,'r') as file:
        for line in file.readlines():
            splitted_line = line.split('\t')
            eng = splitted_line[0]
            vi = splitted_line[1]
            data.append({'vi':vi, 
                         'en':eng})
    return data

In [123]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, index):
        return self.data[index]

In [124]:
dataset = CustomDataset(load_data('/kaggle/input/languagedata/data/vie.txt'))

In [125]:
#7:2:1
total_samples = len(dataset)
train_size = int(0.8 * total_samples)
val_size = int(0.1 * total_samples)
test_size = total_samples - train_size - val_size

In [126]:
train_data, valid_data, test_data = random_split(dataset, [train_size, val_size, test_size])
print("Số lượng mẫu trong tập train:", len(train_data))
print("Số lượng mẫu trong tập validation:", len(valid_data))
print("Số lượng mẫu trong tập test:", len(test_data))

Số lượng mẫu trong tập train: 7542
Số lượng mẫu trong tập validation: 942
Số lượng mẫu trong tập test: 944


In [127]:
train_data[0]

{'vi': 'Bạn thực sự muốn mặc cái đó sao?',
 'en': 'Do you really want to wear that?'}

In [128]:
en_nlp = English()
vi_nlp = Vietnamese()

In [129]:
string = "What a lovely day it is today!"
[token.text for token in en_nlp.tokenizer(string)]

['What', 'a', 'lovely', 'day', 'it', 'is', 'today', '!']

In [130]:
def tokenize_example(example, en_nlp, vi_nlp, max_length, lower, sos_token, eos_token):
    en_tokens = [token.text for token in en_nlp.tokenizer(example["en"])][:max_length]
    vi_tokens = [token.text for token in vi_nlp.tokenizer(example["vi"])][:max_length]
    if lower:
        en_tokens = [token.lower() for token in en_tokens]
        vi_tokens = [token.lower() for token in vi_tokens]
    en_tokens = [sos_token] + en_tokens + [eos_token]
    vi_tokens = [sos_token] + vi_tokens + [eos_token]
    example["en_tokens"] = en_tokens
    example["vi_tokens"] = vi_tokens
    return example

In [131]:
max_length = 50
lower = True
sos_token = "<sos>"
eos_token = "<eos>"

fn_kwargs = {
    "en_nlp": en_nlp,
    "vi_nlp": vi_nlp,
    "max_length": max_length,
    "lower": lower,
    "sos_token": sos_token,
    "eos_token": eos_token,
}
train_data = [tokenize_example(example, **fn_kwargs) for example in train_data]
valid_data = [tokenize_example(example, **fn_kwargs) for example in valid_data]
test_data = [tokenize_example(example, **fn_kwargs) for example in test_data]

In [132]:
train_data[0]

{'vi': 'Bạn thực sự muốn mặc cái đó sao?',
 'en': 'Do you really want to wear that?',
 'en_tokens': ['<sos>',
  'do',
  'you',
  'really',
  'want',
  'to',
  'wear',
  'that',
  '?',
  '<eos>'],
 'vi_tokens': ['<sos>',
  'bạn',
  'thực sự',
  'muốn',
  'mặc',
  'cái',
  'đó',
  'sao',
  '?',
  '<eos>']}

In [133]:
def yield_tokens(data,s):
    for dct in data:
        yield dct[s]

In [134]:
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"

special_tokens = [
    unk_token,
    pad_token,
    sos_token,
    eos_token,
]

en_vocab = torchtext.vocab.build_vocab_from_iterator(
    yield_tokens(train_data,'en_tokens'),
    min_freq=min_freq,
    specials=special_tokens,
)

vi_vocab = torchtext.vocab.build_vocab_from_iterator(
    yield_tokens(train_data,'vi_tokens'),
    min_freq=min_freq,
    specials=special_tokens,
)

In [135]:
assert en_vocab[unk_token] == vi_vocab[unk_token]
assert en_vocab[pad_token] == vi_vocab[pad_token]

unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

In [136]:
en_vocab.set_default_index(unk_index)
vi_vocab.set_default_index(unk_index)

In [137]:
tokens = ["i", "love", "watching", "crime", "shows"]
en_vocab.lookup_indices(tokens)

[5, 173, 509, 0, 0]

In [138]:
en_vocab.lookup_tokens(en_vocab.lookup_indices(tokens))

['i', 'love', 'watching', '<unk>', '<unk>']

In [139]:
def numericalize_example(example, en_vocab, vi_vocab):
    en_ids = en_vocab.lookup_indices(example["en_tokens"])
    vi_ids = vi_vocab.lookup_indices(example["vi_tokens"])
    example["en_ids"] = en_ids
    example["vi_ids"] = vi_ids
    return example

In [140]:
fn_kwargs = {"en_vocab": en_vocab, "vi_vocab": vi_vocab}
train_data = [numericalize_example(example, **fn_kwargs) for example in train_data]
valid_data = [numericalize_example(example, **fn_kwargs) for example in valid_data]
test_data = [numericalize_example(example, **fn_kwargs) for example in test_data]

In [141]:
train_data[0]

{'vi': 'Bạn thực sự muốn mặc cái đó sao?',
 'en': 'Do you really want to wear that?',
 'en_tokens': ['<sos>',
  'do',
  'you',
  'really',
  'want',
  'to',
  'wear',
  'that',
  '?',
  '<eos>'],
 'vi_tokens': ['<sos>',
  'bạn',
  'thực sự',
  'muốn',
  'mặc',
  'cái',
  'đó',
  'sao',
  '?',
  '<eos>'],
 'en_ids': [2, 14, 8, 88, 37, 6, 431, 15, 10, 3],
 'vi_ids': [2, 8, 184, 30, 281, 34, 15, 97, 11, 3]}

In [142]:
en_vocab.lookup_tokens(train_data[0]["en_ids"])

['<sos>', 'do', 'you', 'really', 'want', 'to', 'wear', 'that', '?', '<eos>']

In [143]:
def to_tensor(example):
    example['en_ids'] = torch.tensor(np.array(example['en_ids']), dtype=torch.int64)
    example['vi_ids'] = torch.tensor(np.array(example['vi_ids']), dtype=torch.int64)
    return example

In [144]:
train_data = [to_tensor(example) for example in train_data]
valid_data = [to_tensor(example) for example in valid_data]
test_data = [to_tensor(example) for example in test_data]

In [145]:
def get_collate_fn(pad_index, max_length):
    def collate_fn(batch):
        batch_en_ids = []
        batch_vi_ids = []
#         batch_en_ids = nn.utils.rnn.pad_sequence(batch_en_ids, padding_value=pad_index)
#         batch_vi_ids = nn.utils.rnn.pad_sequence(batch_vi_ids, padding_value=pad_index)
        for example in batch:
            en_ids = example["en_ids"]
            vi_ids = example["vi_ids"]
            if len(en_ids) > max_length:
                en_ids = en_ids[:max_length]
            else:
                en_ids = torch.cat((en_ids, torch.tensor([pad_index] * (max_length - len(en_ids)))))
            if len(vi_ids) > max_length:
                vi_ids = vi_ids[:max_length]
            else:
                vi_ids = torch.cat((vi_ids, torch.tensor([pad_index] * (max_length - len(vi_ids)))))
            assert len(en_ids) == max_length
            assert len(vi_ids) == max_length
            batch_en_ids.append(en_ids)
            batch_vi_ids.append(vi_ids)
        batch = {
            "en_ids": torch.stack(batch_en_ids),
            "vi_ids": torch.stack(batch_vi_ids),
        }
        return batch

    return collate_fn

In [146]:
def get_data_loader(dataset, batch_size, pad_index, max_length, shuffle=False):
    collate_fn = get_collate_fn(pad_index, max_length)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle,
    )
    return data_loader

In [147]:
batch_size = 128
max_length = 50
train_data_loader = get_data_loader(train_data, batch_size, pad_index,max_length, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index, max_length)
test_data_loader = get_data_loader(test_data, batch_size, pad_index, max_length)

In [148]:
class TokenAndPositionEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_dim, max_length, device='cpu'):
        super().__init__()
        self.device = device
        self.word_emb = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim)
        self.pos_emb = nn.Embedding(
            num_embeddings=max_length,
            embedding_dim=embed_dim
        )

    def forward(self, x):
        N, seq_len = x.size()
        positions = torch.arange(0, seq_len).expand(N, seq_len).to(self.device)
        output1 = self.word_emb(x)
        output2 = self.pos_emb(positions)
        output =  output1 + output2
        return output

In [149]:
class TransformerEncoderBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.attn = nn.MultiheadAttention(
            embed_dim = embed_dim,
            num_heads = num_heads,
            batch_first = True
        )
        self.ffn = nn.Sequential(
            nn.Linear(in_features=embed_dim, out_features=ff_dim, bias=True),
            nn.ReLU(),
            nn.Linear(in_features=ff_dim, out_features=embed_dim, bias=True)
        )
        self.layernorm_1 = nn.LayerNorm(normalized_shape=embed_dim, eps=1e-6)
        self.layernorm_2 = nn.LayerNorm(normalized_shape=embed_dim, eps=1e-6)
        self.dropout_1 = nn.Dropout(p=dropout)
        self.dropout_2 = nn.Dropout(p=dropout)

    def forward(self, query, key, value):
        attn_output, _ = self.attn(query, key, value)
        attn_output = self.dropout_1(attn_output)
        out_1 = self.layernorm_1(query + attn_output)
        ffn_output = self.ffn(out_1)
        ffn_output = self.dropout_2(ffn_output)
        out_2 = self.layernorm_2(out_1 + ffn_output)
        return out_2

In [150]:
class TransformerEncoder(nn.Module):
    def __init__(self, src_vocab_size, embed_dim, max_length, num_layers, num_heads, ff_dim, dropout=0.1, device='cpu'):
        super().__init__()
        self.embedding = TokenAndPositionEmbedding(src_vocab_size, embed_dim, max_length, device)
        self.layers = nn.ModuleList(
            [
                TransformerEncoderBlock(embed_dim, num_heads, ff_dim, dropout) for i in range(num_layers)
            ]
        )

    def forward(self, x):
        output = self.embedding(x)
        for layer in self.layers:
            output = layer(output, output, output)
        return output
     

In [151]:
class TransformerDecoderBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.attn = nn.MultiheadAttention(
            embed_dim = embed_dim,
            num_heads = num_heads,
            batch_first = True
        )
        self.cross_attn = nn.MultiheadAttention(
            embed_dim = embed_dim,
            num_heads = num_heads,
            batch_first = True
        )
        self.ffn = nn.Sequential(
            nn.Linear(in_features=embed_dim, out_features=ff_dim, bias=True),
            nn.ReLU(),
            nn.Linear(in_features=ff_dim, out_features=embed_dim, bias=True)
        )
        self.layernorm_1  = nn.LayerNorm(normalized_shape=embed_dim, eps=1e-6)
        self.layernorm_2  = nn.LayerNorm(normalized_shape=embed_dim, eps=1e-6)
        self.layernorm_3  = nn.LayerNorm(normalized_shape=embed_dim, eps=1e-6)
        self.dropout_1 = nn.Dropout(p=dropout)
        self.dropout_2 = nn.Dropout(p=dropout)
        self.dropout_3 = nn.Dropout(p=dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output, _ = self.attn(x, x, x, attn_mask=tgt_mask)
        attn_output = self.dropout_1(attn_output)
        out_1 = self.layernorm_1(x + attn_output)
        attn_output, _ = self.cross_attn(out_1, enc_output, enc_output)
        attn_output = self.dropout_2(attn_output)
        out_2 = self.layernorm_2(out_1 + attn_output)
        ffn_output = self.ffn(out_2)
        ffn_output = self.dropout_3(ffn_output)
        out_3 = self.layernorm_3(out_2 + ffn_output)
        return out_3

In [152]:
class TransformerDecoder(nn.Module):
    def __init__(self, tgt_vocab_size, embed_dim, max_length, num_layers, num_aheads, ff_dim, dropout=0.1, device='cpu'):
        super().__init__()
        self.embedding = TokenAndPositionEmbedding(tgt_vocab_size, embed_dim, max_length, device)
        self.layers = nn.ModuleList(
            [
                TransformerDecoderBlock(embed_dim, num_heads, ff_dim, dropout) for i in range(num_layers)
            ]
        )

    def forward(self, x, enc_output, src_mask, tgt_mask):
        output = self.embedding(x)
        for layer in self.layers:
            output = layer(output, enc_output, src_mask, tgt_mask)
        return output

In [153]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, embed_dim, max_length, num_layers, num_heads, ff_dim, dropout=0.1, device='cpu'):
        super().__init__()
        self.device = device
        self.encoder = TransformerEncoder(src_vocab_size, embed_dim, max_length, num_layers, num_heads, ff_dim, dropout, device)
        self.decoder = TransformerDecoder(tgt_vocab_size, embed_dim, max_length, num_layers, num_heads, ff_dim, dropout, device)
        self.fc = nn.Linear(embed_dim, tgt_vocab_size)

    def generate_mask(self, src, tgt):
        src_seq_len = src.shape[1]
        tgt_seq_len = tgt.shape[1]

        src_mask = torch.zeros((src_seq_len, src_seq_len), device=self.device).type(torch.bool)
        tgt_mask = (torch.triu(torch.ones((tgt_seq_len, tgt_seq_len), device=self.device).type(torch.bool)) == 1).transpose(0,1)
        tgt_mask = tgt_mask.float().masked_fill(tgt_mask == 0, float('-inf')).masked_fill(tgt_mask == 1, float(0.0))
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        enc_output = self.encoder(src)
        dec_output = self.decoder(tgt, enc_output, src_mask, tgt_mask)
        output = self.fc(dec_output)
        return output

In [160]:
src_vocab_size = len(en_vocab)
tgt_vocab_size = len(vi_vocab)
embed_dim = 512
max_length = 50
num_layers = 6
num_heads = 8
ff_dim = 2048
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dropout = 0.2
model = Transformer(src_vocab_size, tgt_vocab_size, embed_dim, max_length, num_layers, num_heads, ff_dim, dropout, device).to(device)

In [161]:
print(model)

Transformer(
  (encoder): TransformerEncoder(
    (embedding): TokenAndPositionEmbedding(
      (word_emb): Embedding(2187, 512)
      (pos_emb): Embedding(50, 512)
    )
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ffn): Sequential(
          (0): Linear(in_features=512, out_features=2048, bias=True)
          (1): ReLU()
          (2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (layernorm_1): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
        (layernorm_2): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
        (dropout_1): Dropout(p=0.2, inplace=False)
        (dropout_2): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (decoder): TransformerDecoder(
    (embedding): TokenAndPositionEmbedding(
      (word_emb): Embedding(2065, 512)
      (pos_emb): Embedding(5

In [162]:
def train_fn(model, data_loader, optimizer, criterion, device):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):
        src = batch['en_ids'].to(device)
        trg = batch['vi_ids'].to(device)
        #src: n x src_seq_length
        #trg: n x trg_seq_length
        optimizer.zero_grad()
        output = model(src, trg[:,:-1])
        #output: n x trg_seq_length x trg_vocab_size
        output_dim = output.shape[-1]
        output = output.reshape(-1,output_dim)
        #output: (n * trg_seq_length - 1) x trg_vocab_size
        trg = trg[:,1:].reshape(-1)
        #trg: n x trg_seq_length-1
        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

In [163]:
def evaluate_fn(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            src = batch['en_ids'].to(device)
            trg = batch['vi_ids'].to(device)
            #src: n x src_seq_length
            #trg: n x trg_seq_length
            output = model(src, trg[:,:-1])
            output_dim = output.shape[-1]
            output = output.reshape(-1,output_dim)
            #output: n x trg_seq_legth - 1 x trg_vocab_size
            trg = trg[:,1:].reshape(-1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

In [164]:
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9,0.98), eps=1e-9)
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

In [165]:
n_epochs = 100
best_valid_loss = float("inf")
for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_fn(
        model,
        train_data_loader,
        optimizer,
        criterion,
        device,
    )
    valid_loss = evaluate_fn(
        model,
        valid_data_loader,
        criterion,
        device,
    )
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "model.pt")
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

  1%|          | 1/100 [00:20<33:39, 20.40s/it]

	Train Loss:   5.476 | Train PPL: 238.904
	Valid Loss:   4.798 | Valid PPL: 121.263


  2%|▏         | 2/100 [00:40<33:17, 20.39s/it]

	Train Loss:   4.730 | Train PPL: 113.339
	Valid Loss:   4.502 | Valid PPL:  90.161


  3%|▎         | 3/100 [01:01<32:58, 20.39s/it]

	Train Loss:   4.435 | Train PPL:  84.327
	Valid Loss:   4.255 | Valid PPL:  70.461


  4%|▍         | 4/100 [01:21<32:37, 20.39s/it]

	Train Loss:   4.180 | Train PPL:  65.388
	Valid Loss:   4.059 | Valid PPL:  57.930


  5%|▌         | 5/100 [01:41<32:17, 20.40s/it]

	Train Loss:   3.949 | Train PPL:  51.896
	Valid Loss:   3.885 | Valid PPL:  48.671


  6%|▌         | 6/100 [02:02<31:58, 20.41s/it]

	Train Loss:   3.717 | Train PPL:  41.129
	Valid Loss:   3.683 | Valid PPL:  39.767


  7%|▋         | 7/100 [02:22<31:38, 20.41s/it]

	Train Loss:   3.513 | Train PPL:  33.548
	Valid Loss:   3.565 | Valid PPL:  35.344


  8%|▊         | 8/100 [02:43<31:17, 20.41s/it]

	Train Loss:   3.341 | Train PPL:  28.261
	Valid Loss:   3.426 | Valid PPL:  30.748


  9%|▉         | 9/100 [03:03<30:57, 20.41s/it]

	Train Loss:   3.152 | Train PPL:  23.390
	Valid Loss:   3.300 | Valid PPL:  27.122


 10%|█         | 10/100 [03:24<30:36, 20.41s/it]

	Train Loss:   2.992 | Train PPL:  19.928
	Valid Loss:   3.238 | Valid PPL:  25.493


 11%|█         | 11/100 [03:44<30:16, 20.41s/it]

	Train Loss:   2.834 | Train PPL:  17.013
	Valid Loss:   3.141 | Valid PPL:  23.135


 12%|█▏        | 12/100 [04:04<29:55, 20.40s/it]

	Train Loss:   2.675 | Train PPL:  14.506
	Valid Loss:   3.017 | Valid PPL:  20.435


 13%|█▎        | 13/100 [04:25<29:34, 20.40s/it]

	Train Loss:   2.530 | Train PPL:  12.558
	Valid Loss:   2.952 | Valid PPL:  19.153


 14%|█▍        | 14/100 [04:45<29:13, 20.39s/it]

	Train Loss:   2.395 | Train PPL:  10.966
	Valid Loss:   2.895 | Valid PPL:  18.080


 15%|█▌        | 15/100 [05:05<28:53, 20.39s/it]

	Train Loss:   2.258 | Train PPL:   9.559
	Valid Loss:   2.857 | Valid PPL:  17.408


 16%|█▌        | 16/100 [05:26<28:32, 20.39s/it]

	Train Loss:   2.127 | Train PPL:   8.392
	Valid Loss:   2.815 | Valid PPL:  16.693


 17%|█▋        | 17/100 [05:46<28:12, 20.39s/it]

	Train Loss:   2.007 | Train PPL:   7.440
	Valid Loss:   2.785 | Valid PPL:  16.206


 18%|█▊        | 18/100 [06:07<27:51, 20.39s/it]

	Train Loss:   1.889 | Train PPL:   6.613
	Valid Loss:   2.747 | Valid PPL:  15.602


 19%|█▉        | 19/100 [06:27<27:31, 20.39s/it]

	Train Loss:   1.775 | Train PPL:   5.900
	Valid Loss:   2.685 | Valid PPL:  14.657


 20%|██        | 20/100 [06:47<27:02, 20.28s/it]

	Train Loss:   1.668 | Train PPL:   5.303
	Valid Loss:   2.721 | Valid PPL:  15.202


 21%|██        | 21/100 [07:07<26:44, 20.31s/it]

	Train Loss:   1.563 | Train PPL:   4.771
	Valid Loss:   2.678 | Valid PPL:  14.561


 22%|██▏       | 22/100 [07:27<26:17, 20.23s/it]

	Train Loss:   1.460 | Train PPL:   4.307
	Valid Loss:   2.686 | Valid PPL:  14.670


 23%|██▎       | 23/100 [07:48<26:01, 20.27s/it]

	Train Loss:   1.360 | Train PPL:   3.895
	Valid Loss:   2.671 | Valid PPL:  14.456


 24%|██▍       | 24/100 [08:08<25:43, 20.31s/it]

	Train Loss:   1.278 | Train PPL:   3.590
	Valid Loss:   2.658 | Valid PPL:  14.262


 25%|██▌       | 25/100 [08:29<25:25, 20.34s/it]

	Train Loss:   1.189 | Train PPL:   3.282
	Valid Loss:   2.641 | Valid PPL:  14.031


 26%|██▌       | 26/100 [08:49<24:58, 20.25s/it]

	Train Loss:   1.106 | Train PPL:   3.021
	Valid Loss:   2.646 | Valid PPL:  14.102


 27%|██▋       | 27/100 [09:09<24:33, 20.18s/it]

	Train Loss:   1.034 | Train PPL:   2.812
	Valid Loss:   2.656 | Valid PPL:  14.246


 28%|██▊       | 28/100 [09:29<24:09, 20.14s/it]

	Train Loss:   0.953 | Train PPL:   2.593
	Valid Loss:   2.657 | Valid PPL:  14.249


 29%|██▉       | 29/100 [09:49<23:47, 20.10s/it]

	Train Loss:   0.883 | Train PPL:   2.419
	Valid Loss:   2.676 | Valid PPL:  14.525


 30%|███       | 30/100 [10:09<23:25, 20.08s/it]

	Train Loss:   0.825 | Train PPL:   2.281
	Valid Loss:   2.703 | Valid PPL:  14.920


 31%|███       | 31/100 [10:29<23:04, 20.06s/it]

	Train Loss:   0.760 | Train PPL:   2.139
	Valid Loss:   2.680 | Valid PPL:  14.586


 32%|███▏      | 32/100 [10:49<22:43, 20.05s/it]

	Train Loss:   0.708 | Train PPL:   2.030
	Valid Loss:   2.716 | Valid PPL:  15.127


 33%|███▎      | 33/100 [11:09<22:23, 20.05s/it]

	Train Loss:   0.652 | Train PPL:   1.920
	Valid Loss:   2.757 | Valid PPL:  15.746


 34%|███▍      | 34/100 [11:29<22:02, 20.05s/it]

	Train Loss:   0.599 | Train PPL:   1.821
	Valid Loss:   2.746 | Valid PPL:  15.586


 35%|███▌      | 35/100 [11:49<21:42, 20.04s/it]

	Train Loss:   0.557 | Train PPL:   1.745
	Valid Loss:   2.728 | Valid PPL:  15.305


 36%|███▌      | 36/100 [12:09<21:22, 20.05s/it]

	Train Loss:   0.516 | Train PPL:   1.675
	Valid Loss:   2.776 | Valid PPL:  16.062


 37%|███▋      | 37/100 [12:29<21:03, 20.05s/it]

	Train Loss:   0.470 | Train PPL:   1.601
	Valid Loss:   2.783 | Valid PPL:  16.173


 38%|███▊      | 38/100 [12:49<20:43, 20.06s/it]

	Train Loss:   0.438 | Train PPL:   1.549
	Valid Loss:   2.790 | Valid PPL:  16.285


 39%|███▉      | 39/100 [13:09<20:23, 20.06s/it]

	Train Loss:   0.400 | Train PPL:   1.492
	Valid Loss:   2.837 | Valid PPL:  17.072


 40%|████      | 40/100 [13:29<20:03, 20.06s/it]

	Train Loss:   0.375 | Train PPL:   1.454
	Valid Loss:   2.829 | Valid PPL:  16.921


 41%|████      | 41/100 [13:49<19:43, 20.07s/it]

	Train Loss:   0.347 | Train PPL:   1.415
	Valid Loss:   2.857 | Valid PPL:  17.416


 42%|████▏     | 42/100 [14:09<19:24, 20.07s/it]

	Train Loss:   0.323 | Train PPL:   1.381
	Valid Loss:   2.915 | Valid PPL:  18.457


 43%|████▎     | 43/100 [14:30<19:03, 20.07s/it]

	Train Loss:   0.298 | Train PPL:   1.347
	Valid Loss:   2.967 | Valid PPL:  19.433


 44%|████▍     | 44/100 [14:50<18:43, 20.07s/it]

	Train Loss:   0.282 | Train PPL:   1.326
	Valid Loss:   2.910 | Valid PPL:  18.361


 45%|████▌     | 45/100 [15:10<18:23, 20.07s/it]

	Train Loss:   0.262 | Train PPL:   1.300
	Valid Loss:   2.932 | Valid PPL:  18.773


 46%|████▌     | 46/100 [15:30<18:03, 20.07s/it]

	Train Loss:   0.246 | Train PPL:   1.278
	Valid Loss:   3.002 | Valid PPL:  20.119


 47%|████▋     | 47/100 [15:50<17:43, 20.07s/it]

	Train Loss:   0.235 | Train PPL:   1.265
	Valid Loss:   2.953 | Valid PPL:  19.156


 48%|████▊     | 48/100 [16:10<17:23, 20.07s/it]

	Train Loss:   0.224 | Train PPL:   1.251
	Valid Loss:   3.024 | Valid PPL:  20.564


 49%|████▉     | 49/100 [16:30<17:03, 20.07s/it]

	Train Loss:   0.217 | Train PPL:   1.242
	Valid Loss:   3.035 | Valid PPL:  20.791


 50%|█████     | 50/100 [16:50<16:43, 20.07s/it]

	Train Loss:   0.205 | Train PPL:   1.227
	Valid Loss:   3.050 | Valid PPL:  21.120


 51%|█████     | 51/100 [17:10<16:23, 20.07s/it]

	Train Loss:   0.195 | Train PPL:   1.216
	Valid Loss:   3.039 | Valid PPL:  20.889


 52%|█████▏    | 52/100 [17:30<16:03, 20.07s/it]

	Train Loss:   0.191 | Train PPL:   1.210
	Valid Loss:   3.043 | Valid PPL:  20.968


 53%|█████▎    | 53/100 [17:50<15:43, 20.07s/it]

	Train Loss:   0.180 | Train PPL:   1.197
	Valid Loss:   3.075 | Valid PPL:  21.660


 54%|█████▍    | 54/100 [18:10<15:23, 20.07s/it]

	Train Loss:   0.176 | Train PPL:   1.193
	Valid Loss:   3.107 | Valid PPL:  22.358


 55%|█████▌    | 55/100 [18:30<15:03, 20.07s/it]

	Train Loss:   0.169 | Train PPL:   1.184
	Valid Loss:   3.090 | Valid PPL:  21.968


 56%|█████▌    | 56/100 [18:50<14:43, 20.07s/it]

	Train Loss:   0.164 | Train PPL:   1.178
	Valid Loss:   3.098 | Valid PPL:  22.144


 57%|█████▋    | 57/100 [19:11<14:23, 20.07s/it]

	Train Loss:   0.161 | Train PPL:   1.175
	Valid Loss:   3.135 | Valid PPL:  22.981


 58%|█████▊    | 58/100 [19:31<14:02, 20.07s/it]

	Train Loss:   0.160 | Train PPL:   1.173
	Valid Loss:   3.132 | Valid PPL:  22.925


 59%|█████▉    | 59/100 [19:51<13:42, 20.06s/it]

	Train Loss:   0.153 | Train PPL:   1.166
	Valid Loss:   3.127 | Valid PPL:  22.816


 60%|██████    | 60/100 [20:11<13:21, 20.05s/it]

	Train Loss:   0.151 | Train PPL:   1.163
	Valid Loss:   3.147 | Valid PPL:  23.255


 61%|██████    | 61/100 [20:31<13:01, 20.04s/it]

	Train Loss:   0.145 | Train PPL:   1.156
	Valid Loss:   3.146 | Valid PPL:  23.248


 62%|██████▏   | 62/100 [20:51<12:41, 20.04s/it]

	Train Loss:   0.148 | Train PPL:   1.159
	Valid Loss:   3.149 | Valid PPL:  23.303


 63%|██████▎   | 63/100 [21:11<12:21, 20.03s/it]

	Train Loss:   0.140 | Train PPL:   1.151
	Valid Loss:   3.194 | Valid PPL:  24.378


 64%|██████▍   | 64/100 [21:31<12:01, 20.03s/it]

	Train Loss:   0.141 | Train PPL:   1.151
	Valid Loss:   3.174 | Valid PPL:  23.892


 65%|██████▌   | 65/100 [21:51<11:41, 20.03s/it]

	Train Loss:   0.138 | Train PPL:   1.148
	Valid Loss:   3.196 | Valid PPL:  24.425


 66%|██████▌   | 66/100 [22:11<11:21, 20.03s/it]

	Train Loss:   0.134 | Train PPL:   1.144
	Valid Loss:   3.262 | Valid PPL:  26.089


 67%|██████▋   | 67/100 [22:31<11:01, 20.03s/it]

	Train Loss:   0.129 | Train PPL:   1.138
	Valid Loss:   3.204 | Valid PPL:  24.631


 68%|██████▊   | 68/100 [22:51<10:41, 20.04s/it]

	Train Loss:   0.131 | Train PPL:   1.140
	Valid Loss:   3.259 | Valid PPL:  26.018


 69%|██████▉   | 69/100 [23:11<10:21, 20.05s/it]

	Train Loss:   0.132 | Train PPL:   1.141
	Valid Loss:   3.239 | Valid PPL:  25.517


 70%|███████   | 70/100 [23:31<10:01, 20.05s/it]

	Train Loss:   0.128 | Train PPL:   1.136
	Valid Loss:   3.243 | Valid PPL:  25.616


 71%|███████   | 71/100 [23:51<09:41, 20.05s/it]

	Train Loss:   0.124 | Train PPL:   1.132
	Valid Loss:   3.234 | Valid PPL:  25.372


 72%|███████▏  | 72/100 [24:11<09:21, 20.05s/it]

	Train Loss:   0.121 | Train PPL:   1.129
	Valid Loss:   3.232 | Valid PPL:  25.324


 73%|███████▎  | 73/100 [24:31<09:01, 20.06s/it]

	Train Loss:   0.124 | Train PPL:   1.132
	Valid Loss:   3.253 | Valid PPL:  25.863


 74%|███████▍  | 74/100 [24:51<08:41, 20.05s/it]

	Train Loss:   0.121 | Train PPL:   1.128
	Valid Loss:   3.265 | Valid PPL:  26.188


 75%|███████▌  | 75/100 [25:11<08:21, 20.05s/it]

	Train Loss:   0.114 | Train PPL:   1.121
	Valid Loss:   3.289 | Valid PPL:  26.811


 76%|███████▌  | 76/100 [25:31<08:01, 20.05s/it]

	Train Loss:   0.119 | Train PPL:   1.126
	Valid Loss:   3.294 | Valid PPL:  26.946


 77%|███████▋  | 77/100 [25:51<07:41, 20.05s/it]

	Train Loss:   0.113 | Train PPL:   1.120
	Valid Loss:   3.299 | Valid PPL:  27.078


 78%|███████▊  | 78/100 [26:11<07:21, 20.05s/it]

	Train Loss:   0.114 | Train PPL:   1.121
	Valid Loss:   3.284 | Valid PPL:  26.673


 79%|███████▉  | 79/100 [26:31<07:00, 20.05s/it]

	Train Loss:   0.116 | Train PPL:   1.123
	Valid Loss:   3.319 | Valid PPL:  27.625


 80%|████████  | 80/100 [26:52<06:40, 20.05s/it]

	Train Loss:   0.112 | Train PPL:   1.118
	Valid Loss:   3.308 | Valid PPL:  27.338


 81%|████████  | 81/100 [27:12<06:20, 20.05s/it]

	Train Loss:   0.110 | Train PPL:   1.117
	Valid Loss:   3.357 | Valid PPL:  28.716


 82%|████████▏ | 82/100 [27:32<06:00, 20.05s/it]

	Train Loss:   0.109 | Train PPL:   1.115
	Valid Loss:   3.310 | Valid PPL:  27.387


 83%|████████▎ | 83/100 [27:52<05:40, 20.04s/it]

	Train Loss:   0.110 | Train PPL:   1.116
	Valid Loss:   3.339 | Valid PPL:  28.182


 84%|████████▍ | 84/100 [28:12<05:20, 20.04s/it]

	Train Loss:   0.106 | Train PPL:   1.112
	Valid Loss:   3.361 | Valid PPL:  28.828


 85%|████████▌ | 85/100 [28:32<05:00, 20.05s/it]

	Train Loss:   0.106 | Train PPL:   1.112
	Valid Loss:   3.355 | Valid PPL:  28.637


 86%|████████▌ | 86/100 [28:52<04:40, 20.05s/it]

	Train Loss:   0.107 | Train PPL:   1.113
	Valid Loss:   3.376 | Valid PPL:  29.264


 87%|████████▋ | 87/100 [29:12<04:20, 20.05s/it]

	Train Loss:   0.106 | Train PPL:   1.112
	Valid Loss:   3.295 | Valid PPL:  26.980


 88%|████████▊ | 88/100 [29:32<04:00, 20.04s/it]

	Train Loss:   0.103 | Train PPL:   1.109
	Valid Loss:   3.346 | Valid PPL:  28.387


 89%|████████▉ | 89/100 [29:52<03:40, 20.04s/it]

	Train Loss:   0.102 | Train PPL:   1.108
	Valid Loss:   3.297 | Valid PPL:  27.044


 90%|█████████ | 90/100 [30:12<03:20, 20.04s/it]

	Train Loss:   0.101 | Train PPL:   1.107
	Valid Loss:   3.403 | Valid PPL:  30.064


 91%|█████████ | 91/100 [30:32<03:00, 20.04s/it]

	Train Loss:   0.099 | Train PPL:   1.104
	Valid Loss:   3.397 | Valid PPL:  29.866


 92%|█████████▏| 92/100 [30:52<02:40, 20.04s/it]

	Train Loss:   0.099 | Train PPL:   1.104
	Valid Loss:   3.349 | Valid PPL:  28.478


 93%|█████████▎| 93/100 [31:12<02:20, 20.04s/it]

	Train Loss:   0.100 | Train PPL:   1.105
	Valid Loss:   3.322 | Valid PPL:  27.727


 94%|█████████▍| 94/100 [31:32<02:00, 20.05s/it]

	Train Loss:   0.095 | Train PPL:   1.100
	Valid Loss:   3.386 | Valid PPL:  29.551


 95%|█████████▌| 95/100 [31:52<01:40, 20.04s/it]

	Train Loss:   0.098 | Train PPL:   1.103
	Valid Loss:   3.399 | Valid PPL:  29.944


 96%|█████████▌| 96/100 [32:12<01:20, 20.04s/it]

	Train Loss:   0.098 | Train PPL:   1.103
	Valid Loss:   3.371 | Valid PPL:  29.119


 97%|█████████▋| 97/100 [32:32<01:00, 20.04s/it]

	Train Loss:   0.093 | Train PPL:   1.097
	Valid Loss:   3.400 | Valid PPL:  29.956


 98%|█████████▊| 98/100 [32:52<00:40, 20.04s/it]

	Train Loss:   0.093 | Train PPL:   1.097
	Valid Loss:   3.434 | Valid PPL:  30.987


 99%|█████████▉| 99/100 [33:12<00:20, 20.04s/it]

	Train Loss:   0.095 | Train PPL:   1.100
	Valid Loss:   3.410 | Valid PPL:  30.276


100%|██████████| 100/100 [33:32<00:00, 20.13s/it]

	Train Loss:   0.092 | Train PPL:   1.097
	Valid Loss:   3.406 | Valid PPL:  30.139





In [173]:
model.load_state_dict(torch.load("model.pt"))
test_loss = evaluate_fn(model, test_data_loader, criterion, device)
print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |")

| Test Loss: 2.634 | Test PPL:  13.924 |


In [174]:
def translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab,
    de_vocab,
    lower,
    sos_token,
    eos_token,
    device,
    max_output_length=20,
):
    model.eval()
    with torch.no_grad():
        if isinstance(sentence, str):
            tokens = [token.text for token in en_nlp.tokenizer(sentence)]
        else:
            tokens = [token for token in sentence]
        if lower:
            tokens = [token.lower() for token in tokens]
        tokens = [sos_token] + tokens + [eos_token]
        ids = en_vocab.lookup_indices(tokens)
        src = torch.LongTensor(ids).unsqueeze(0).to(device)
        encoder_output = model.encoder(src)
        tgt = torch.ones(1, max_output_length).type_as(src.data)
        next_symbol = vi_vocab.get_stoi()[sos_token]
        for i in range(max_output_length):
            tgt[0][i] = next_symbol
            src_mask,tgt_mask = model.generate_mask(src, tgt)
            decoder_output = model.decoder(tgt,encoder_output,src_mask,tgt_mask)
            predicted_token = decoder_output[0,i].argmax(-1).item()
            next_symbol = predicted_token
            if predicted_token == en_vocab[eos_token]:
                break
        try:
            eos_idx = int(torch.where(tgt[0] == vi_vocab.get_stoi()[eos_token])[0][0])
            tgt = tgt[0][:eos_idx].unsqueeze(0)
            print(eos_idx)
        except:
            pass
        output = model(src, tgt)
        output = output[0].argmax(-1).squeeze().detach().cpu().numpy()
        tokens = de_vocab.lookup_tokens(output)
    return " ".join(tokens)

In [171]:
# def translate_sentence(
#     sentence,
#     model,
#     en_nlp,
#     de_nlp,
#     en_vocab,
#     de_vocab,
#     lower,
#     sos_token,
#     eos_token,
#     device,
#     max_output_length=20,
# ):
#     model.eval()
#     with torch.no_grad():
#         if isinstance(sentence, str):
#             tokens = [token.text for token in en_nlp.tokenizer(sentence)]
#         else:
#             tokens = [token for token in sentence]
#         if lower:
#             tokens = [token.lower() for token in tokens]
#         res = []
#         tokens = [sos_token] + tokens + [eos_token]
#         ids = en_vocab.lookup_indices(tokens)
#         print(ids)
#         src = torch.LongTensor(ids).unsqueeze(0).to(device)
#         encoder_output = model.encoder(src)
#         tgt = torch.LongTensor(de_vocab.lookup_indices([sos_token])).unsqueeze(0).to(device)
#         for i in range(max_output_length):
#             src_mask,tgt_mask = model.generate_mask(src, tgt)
#             decoder_output = model.decoder(tgt,encoder_output,src_mask,tgt_mask)
#             predicted_token = decoder_output[0,i].argmax(-1).item()
#             res.append(predicted_token)
#             tgt = torch.cat((tgt, torch.tensor([[predicted_token]]).to(device)), dim=1)
#             if predicted_token == en_vocab[eos_token]:
#                 break
# #         output = tgt[0].cpu().numpy()
#         output = model(src,torch.tensor(res).unsqueeze(0).to(device))
#         output = output[0].argmax(-1).squeeze().detach().cpu().numpy()
#         tokens = de_vocab.lookup_tokens(output)
#     return " ".join(tokens)

In [176]:
for i in range(10,20):
    sentence = train_data[i]['en']
    print(sentence)
    print(translate_sentence(
        sentence,
        model,
        en_nlp,
        vi_nlp,
        en_vocab,
        vi_vocab,
        lower,
        sos_token,
        eos_token,
        device,
    ))
    print()

Tom is the person who killed Mary.
tom như tom là là là gặp tom của tom tom là tom . . là là tom . tom

Could you please hurry up, sir?
nếu , , , , được được xe xe bạn , là người bạn , người . , , người

They're drunk.
họ là là xem <unk> pháp pháp anh . . . . . . . . . . . .

I had broken my glasses, so that I couldn't see the blackboard.
tôi như tôi , , , , tôi <unk> , khỏi , khỏi . thứ , . , , tôi

Tom doesn't know a lot about Boston.
tom như là , , , không tên boston tom , , , tom , , . , , tôi

To see him talk, you might think he's a girl.
nếu như bạn , , bạn , , , , , , , , nó , . nó , tôi

I thought I could change your mind.
tôi của tôi tôi tôi tôi tôi tôi với tôi . tôi . tôi mình . mình tôi . tôi

Tom apologized to me for his rudeness.
tom như là với tôi , tôi với , tôi , tôi với tôi tôi , với . . tôi

Music is a gift from God.
trong của của của tom tom họ của của . của và nó . nó của của . . được

You should memorize as many English words as possible.
cậu như như là là khỏi về 