In [1]:
!pip install pyvi torchsummary

Collecting pyvi
  Downloading pyvi-0.1.1-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting torchsummary
  Downloading torchsummary-1.5.1-py3-none-any.whl.metadata (296 bytes)
Collecting sklearn-crfsuite (from pyvi)
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting python-crfsuite>=0.8.3 (from sklearn-crfsuite->pyvi)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading pyvi-0.1.1-py2.py3-none-any.whl (8.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m54.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import spacy
import torchtext
import tqdm
import random
from spacy.lang.vi import Vietnamese
from spacy.lang.en import English
from torch.utils.data import Dataset, random_split
from torchtext.vocab import build_vocab_from_iterator
from torchsummary import summary

In [3]:
seed = 1234

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [4]:
def load_data(path):
    data = []
    with open(path,'r') as file:
        for line in file.readlines():
            splitted_line = line.split('\t')
            eng = splitted_line[0]
            vi = splitted_line[1]
            data.append({'vi':vi, 
                         'en':eng})
    return data

In [5]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, index):
        return self.data[index]

In [6]:
dataset = CustomDataset(load_data('/kaggle/input/languagedata/data/vie.txt'))

In [7]:
#7:2:1
total_samples = len(dataset)
train_size = int(0.8 * total_samples)
val_size = int(0.1 * total_samples)
test_size = total_samples - train_size - val_size

In [8]:
train_data, valid_data, test_data = random_split(dataset, [train_size, val_size, test_size])
print("Số lượng mẫu trong tập train:", len(train_data))
print("Số lượng mẫu trong tập validation:", len(valid_data))
print("Số lượng mẫu trong tập test:", len(test_data))

Số lượng mẫu trong tập train: 7542
Số lượng mẫu trong tập validation: 942
Số lượng mẫu trong tập test: 944


In [9]:
train_data[0]

{'vi': 'Bạn thực sự muốn mặc cái đó sao?',
 'en': 'Do you really want to wear that?'}

In [10]:
en_nlp = English()
vi_nlp = Vietnamese()

In [11]:
string = "What a lovely day it is today!"
[token.text for token in en_nlp.tokenizer(string)]

['What', 'a', 'lovely', 'day', 'it', 'is', 'today', '!']

In [12]:
def tokenize_example(example, en_nlp, vi_nlp, max_length, lower, sos_token, eos_token):
    en_tokens = [token.text for token in en_nlp.tokenizer(example["en"])][:max_length]
    vi_tokens = [token.text for token in vi_nlp.tokenizer(example["vi"])][:max_length]
    if lower:
        en_tokens = [token.lower() for token in en_tokens]
        vi_tokens = [token.lower() for token in vi_tokens]
    en_tokens = [sos_token] + en_tokens + [eos_token]
    vi_tokens = [sos_token] + vi_tokens + [eos_token]
    example["en_tokens"] = en_tokens
    example["vi_tokens"] = vi_tokens
    return example

In [13]:
max_length = 50
lower = True
sos_token = "<sos>"
eos_token = "<eos>"

fn_kwargs = {
    "en_nlp": en_nlp,
    "vi_nlp": vi_nlp,
    "max_length": max_length,
    "lower": lower,
    "sos_token": sos_token,
    "eos_token": eos_token,
}
train_data = [tokenize_example(example, **fn_kwargs) for example in train_data]
valid_data = [tokenize_example(example, **fn_kwargs) for example in valid_data]
test_data = [tokenize_example(example, **fn_kwargs) for example in test_data]

In [14]:
train_data[0]

{'vi': 'Bạn thực sự muốn mặc cái đó sao?',
 'en': 'Do you really want to wear that?',
 'en_tokens': ['<sos>',
  'do',
  'you',
  'really',
  'want',
  'to',
  'wear',
  'that',
  '?',
  '<eos>'],
 'vi_tokens': ['<sos>',
  'bạn',
  'thực sự',
  'muốn',
  'mặc',
  'cái',
  'đó',
  'sao',
  '?',
  '<eos>']}

In [15]:
def yield_tokens(data,s):
    for dct in data:
        yield dct[s]

In [16]:
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"

special_tokens = [
    unk_token,
    pad_token,
    sos_token,
    eos_token,
]

en_vocab = torchtext.vocab.build_vocab_from_iterator(
    yield_tokens(train_data,'en_tokens'),
    min_freq=min_freq,
    specials=special_tokens,
)

vi_vocab = torchtext.vocab.build_vocab_from_iterator(
    yield_tokens(train_data,'vi_tokens'),
    min_freq=min_freq,
    specials=special_tokens,
)

In [17]:
assert en_vocab[unk_token] == vi_vocab[unk_token]
assert en_vocab[pad_token] == vi_vocab[pad_token]

unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

In [18]:
en_vocab.set_default_index(unk_index)
vi_vocab.set_default_index(unk_index)

In [19]:
tokens = ["i", "love", "watching", "crime", "shows"]
en_vocab.lookup_indices(tokens)

[5, 173, 509, 0, 0]

In [20]:
en_vocab.lookup_tokens(en_vocab.lookup_indices(tokens))

['i', 'love', 'watching', '<unk>', '<unk>']

In [21]:
def numericalize_example(example, en_vocab, vi_vocab):
    en_ids = en_vocab.lookup_indices(example["en_tokens"])
    vi_ids = vi_vocab.lookup_indices(example["vi_tokens"])
    example["en_ids"] = en_ids
    example["vi_ids"] = vi_ids
    return example

In [22]:
fn_kwargs = {"en_vocab": en_vocab, "vi_vocab": vi_vocab}
train_data = [numericalize_example(example, **fn_kwargs) for example in train_data]
valid_data = [numericalize_example(example, **fn_kwargs) for example in valid_data]
test_data = [numericalize_example(example, **fn_kwargs) for example in test_data]

In [23]:
train_data[0]

{'vi': 'Bạn thực sự muốn mặc cái đó sao?',
 'en': 'Do you really want to wear that?',
 'en_tokens': ['<sos>',
  'do',
  'you',
  'really',
  'want',
  'to',
  'wear',
  'that',
  '?',
  '<eos>'],
 'vi_tokens': ['<sos>',
  'bạn',
  'thực sự',
  'muốn',
  'mặc',
  'cái',
  'đó',
  'sao',
  '?',
  '<eos>'],
 'en_ids': [2, 14, 8, 88, 37, 6, 431, 15, 10, 3],
 'vi_ids': [2, 8, 184, 30, 281, 34, 15, 97, 11, 3]}

In [24]:
en_vocab.lookup_tokens(train_data[0]["en_ids"])

['<sos>', 'do', 'you', 'really', 'want', 'to', 'wear', 'that', '?', '<eos>']

In [25]:
def to_tensor(example):
    example['en_ids'] = torch.tensor(np.array(example['en_ids']), dtype=torch.int64)
    example['vi_ids'] = torch.tensor(np.array(example['vi_ids']), dtype=torch.int64)
    return example

In [26]:
train_data = [to_tensor(example) for example in train_data]
valid_data = [to_tensor(example) for example in valid_data]
test_data = [to_tensor(example) for example in test_data]

In [27]:
def get_collate_fn(pad_index):
    def collate_fn(batch):
        batch_en_ids = [example["en_ids"] for example in batch]
        batch_vi_ids = [example["vi_ids"] for example in batch]
        batch_en_ids = nn.utils.rnn.pad_sequence(batch_en_ids, padding_value=pad_index)
        batch_vi_ids = nn.utils.rnn.pad_sequence(batch_vi_ids, padding_value=pad_index)
        batch = {
            "en_ids": batch_en_ids.T,
            "vi_ids": batch_vi_ids.T,
        }
        return batch

    return collate_fn

In [28]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    collate_fn = get_collate_fn(pad_index)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle,
    )
    return data_loader

In [29]:
batch_size = 128
train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)

In [30]:
class TokenAndPositionEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_dim, max_length, device='cpu'):
        super().__init__()
        self.device = device
        self.word_emb = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim)
        self.pos_emb = nn.Embedding(
            num_embeddings=max_length,
            embedding_dim=embed_dim
        )

    def forward(self, x):
        N, seq_len = x.size()
        positions = torch.arange(0, seq_len).expand(N, seq_len).to(self.device)
        output1 = self.word_emb(x)
        output2 = self.pos_emb(positions)
        output =  output1 + output2
        return output

In [31]:
class TransformerEncoderBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.attn = nn.MultiheadAttention(
            embed_dim = embed_dim,
            num_heads = num_heads,
            batch_first = True
        )
        self.ffn = nn.Sequential(
            nn.Linear(in_features=embed_dim, out_features=ff_dim, bias=True),
            nn.ReLU(),
            nn.Linear(in_features=ff_dim, out_features=embed_dim, bias=True)
        )
        self.layernorm_1 = nn.LayerNorm(normalized_shape=embed_dim, eps=1e-6)
        self.layernorm_2 = nn.LayerNorm(normalized_shape=embed_dim, eps=1e-6)
        self.dropout_1 = nn.Dropout(p=dropout)
        self.dropout_2 = nn.Dropout(p=dropout)

    def forward(self, query, key, value):
        attn_output, _ = self.attn(query, key, value)
        attn_output = self.dropout_1(attn_output)
        out_1 = self.layernorm_1(query + attn_output)
        ffn_output = self.ffn(out_1)
        ffn_output = self.dropout_2(ffn_output)
        out_2 = self.layernorm_2(out_1 + ffn_output)
        return out_2

In [32]:
class TransformerEncoder(nn.Module):
    def __init__(self, src_vocab_size, embed_dim, max_length, num_layers, num_heads, ff_dim, dropout=0.1, device='cpu'):
        super().__init__()
        self.embedding = TokenAndPositionEmbedding(src_vocab_size, embed_dim, max_length, device)
        self.layers = nn.ModuleList(
            [
                TransformerEncoderBlock(embed_dim, num_heads, ff_dim, dropout) for i in range(num_layers)
            ]
        )

    def forward(self, x):
        output = self.embedding(x)
        for layer in self.layers:
            output = layer(output, output, output)
        return output

In [33]:
class TransformerDecoderBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.attn = nn.MultiheadAttention(
            embed_dim = embed_dim,
            num_heads = num_heads,
            batch_first = True
        )
        self.cross_attn = nn.MultiheadAttention(
            embed_dim = embed_dim,
            num_heads = num_heads,
            batch_first = True
        )
        self.ffn = nn.Sequential(
            nn.Linear(in_features=embed_dim, out_features=ff_dim, bias=True),
            nn.ReLU(),
            nn.Linear(in_features=ff_dim, out_features=embed_dim, bias=True)
        )
        self.layernorm_1  = nn.LayerNorm(normalized_shape=embed_dim, eps=1e-6)
        self.layernorm_2  = nn.LayerNorm(normalized_shape=embed_dim, eps=1e-6)
        self.layernorm_3  = nn.LayerNorm(normalized_shape=embed_dim, eps=1e-6)
        self.dropout_1 = nn.Dropout(p=dropout)
        self.dropout_2 = nn.Dropout(p=dropout)
        self.dropout_3 = nn.Dropout(p=dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output, _ = self.attn(x, x, x, attn_mask=tgt_mask)
        attn_output = self.dropout_1(attn_output)
        out_1 = self.layernorm_1(x + attn_output)
        attn_output, _ = self.cross_attn(out_1, enc_output, enc_output)
        attn_output = self.dropout_2(attn_output)
        out_2 = self.layernorm_2(out_1 + attn_output)
        ffn_output = self.ffn(out_2)
        ffn_output = self.dropout_3(ffn_output)
        out_3 = self.layernorm_3(out_2 + ffn_output)
        return out_3

In [34]:
class TransformerDecoder(nn.Module):
    def __init__(self, tgt_vocab_size, embed_dim, max_length, num_layers, num_aheads, ff_dim, dropout=0.1, device='cpu'):
        super().__init__()
        self.embedding = TokenAndPositionEmbedding(tgt_vocab_size, embed_dim, max_length, device)
        self.layers = nn.ModuleList(
            [
                TransformerDecoderBlock(embed_dim, num_heads, ff_dim, dropout) for i in range(num_layers)
            ]
        )

    def forward(self, x, enc_output, src_mask, tgt_mask):
        output = self.embedding(x)
        for layer in self.layers:
            output = layer(output, enc_output, src_mask, tgt_mask)
        return output

In [35]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, embed_dim, max_length, num_layers, num_heads, ff_dim, dropout=0.1, device='cpu'):
        super().__init__()
        self.device = device
        self.encoder = TransformerEncoder(src_vocab_size, embed_dim, max_length, num_layers, num_heads, ff_dim, dropout, device)
        self.decoder = TransformerDecoder(tgt_vocab_size, embed_dim, max_length, num_layers, num_heads, ff_dim, dropout, device)
        self.fc = nn.Linear(embed_dim, tgt_vocab_size)

    def generate_mask(self, src, tgt):
        src_seq_len = src.shape[1]
        tgt_seq_len = tgt.shape[1]

        src_mask = torch.zeros((src_seq_len, src_seq_len), device=self.device).type(torch.bool)
        tgt_mask = (torch.triu(torch.ones((tgt_seq_len, tgt_seq_len), device=self.device).type(torch.bool)) == 1).transpose(0,1)
        tgt_mask = tgt_mask.float().masked_fill(tgt_mask == 0, float('-inf')).masked_fill(tgt_mask == 1, float(0.0))
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        enc_output = self.encoder(src)
        dec_output = self.decoder(tgt, enc_output, src_mask, tgt_mask)
        output = self.fc(dec_output)
        return output

In [36]:
src_vocab_size = len(en_vocab)
tgt_vocab_size = len(vi_vocab)
embed_dim = 300
max_length = 50
num_layers = 5
num_heads = 5
ff_dim = 512
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dropout = 0.5
model = Transformer(src_vocab_size, tgt_vocab_size, embed_dim, max_length, num_layers, num_heads, ff_dim, dropout, device).to(device)

In [37]:
print(model)

Transformer(
  (encoder): TransformerEncoder(
    (embedding): TokenAndPositionEmbedding(
      (word_emb): Embedding(2187, 300)
      (pos_emb): Embedding(50, 300)
    )
    (layers): ModuleList(
      (0-4): 5 x TransformerEncoderBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=300, out_features=300, bias=True)
        )
        (ffn): Sequential(
          (0): Linear(in_features=300, out_features=512, bias=True)
          (1): ReLU()
          (2): Linear(in_features=512, out_features=300, bias=True)
        )
        (layernorm_1): LayerNorm((300,), eps=1e-06, elementwise_affine=True)
        (layernorm_2): LayerNorm((300,), eps=1e-06, elementwise_affine=True)
        (dropout_1): Dropout(p=0.5, inplace=False)
        (dropout_2): Dropout(p=0.5, inplace=False)
      )
    )
  )
  (decoder): TransformerDecoder(
    (embedding): TokenAndPositionEmbedding(
      (word_emb): Embedding(2065, 300)
      (pos_emb): Embedding(50,

In [38]:
def train_fn(model, data_loader, optimizer, criterion, device):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):
        src = batch['en_ids'].to(device)
        trg = batch['vi_ids'].to(device)
        #src: n x src_seq_length
        #trg: n x trg_seq_length
        optimizer.zero_grad()
        output = model(src, trg)
        #output: n x trg_seq_length x trg_vocab_size
        output_dim = output.shape[-1]
        output = output[:,1:,].reshape(-1,output_dim)
        #output: (n * trg_seq_length - 1) x trg_vocab_size
        trg = trg[:,1:].reshape(-1)
        #trg: n x trg_seq_length-1
        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

In [39]:
def evaluate_fn(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            src = batch['en_ids'].to(device)
            trg = batch['vi_ids'].to(device)
            #src: n x src_seq_length
            #trg: n x trg_seq_length
            output = model(src, trg)
            output_dim = output.shape[-1]
            output = output[:,1:,].reshape(-1,output_dim)
            #output: n x trg_seq_legth - 1 x trg_vocab_size
            trg = trg[:,1:].reshape(-1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

In [40]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

In [41]:
n_epochs = 5
best_valid_loss = float("inf")
for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_fn(
        model,
        train_data_loader,
        optimizer,
        criterion,
        device,
    )
    valid_loss = evaluate_fn(
        model,
        valid_data_loader,
        criterion,
        device,
    )
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "model.pt")
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

 20%|██        | 1/5 [00:04<00:18,  4.71s/it]

	Train Loss:   3.062 | Train PPL:  21.375
	Valid Loss:   0.795 | Valid PPL:   2.214


 40%|████      | 2/5 [00:08<00:12,  4.14s/it]

	Train Loss:   0.631 | Train PPL:   1.879
	Valid Loss:   0.218 | Valid PPL:   1.244


 60%|██████    | 3/5 [00:12<00:07,  3.95s/it]

	Train Loss:   0.219 | Train PPL:   1.245
	Valid Loss:   0.069 | Valid PPL:   1.071


 80%|████████  | 4/5 [00:15<00:03,  3.85s/it]

	Train Loss:   0.080 | Train PPL:   1.083
	Valid Loss:   0.020 | Valid PPL:   1.020


100%|██████████| 5/5 [00:19<00:00,  3.91s/it]

	Train Loss:   0.027 | Train PPL:   1.027
	Valid Loss:   0.006 | Valid PPL:   1.006





In [42]:
model.load_state_dict(torch.load("model.pt"))
test_loss = evaluate_fn(model, test_data_loader, criterion, device)
print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |")

| Test Loss: 0.005 | Test PPL:   1.005 |


In [45]:
def translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab,
    de_vocab,
    lower,
    sos_token,
    eos_token,
    device,
    max_output_length=10,
):
    model.eval()
    with torch.no_grad():
        if isinstance(sentence, str):
            tokens = [token.text for token in en_nlp.tokenizer(sentence)]
        else:
            tokens = [token for token in sentence]
        if lower:
            tokens = [token.lower() for token in tokens]
        res = []
        tokens = [sos_token] + tokens + [eos_token]
        ids = en_vocab.lookup_indices(tokens)
        src = torch.LongTensor(ids).unsqueeze(0).to(device)
        tgt = torch.LongTensor(de_vocab.lookup_indices([sos_token])).unsqueeze(0).to(device)
        for i in range(max_output_length):
            output = model(src,tgt)
            predicted_token = output[0,-1].argmax(-1).item()
            res.append(predicted_token)
            tgt = torch.cat((tgt, torch.tensor([[predicted_token]]).to(device)), dim=1)
            if predicted_token == en_vocab[eos_token]:
                break
        tokens = de_vocab.lookup_tokens(res)
    return tokens

In [46]:
sentence = train_data[5]['en']
print(sentence)
translate_sentence(
    sentence,
    model,
    en_nlp,
    vi_nlp,
    en_vocab,
    vi_vocab,
    lower,
    sos_token,
    eos_token,
    device,
)

I promised Tom I'd wait.


['tôi', 'tôi', 'tôi', 'tôi', 'tôi', 'tôi', 'tôi', 'tôi', 'tôi', 'tôi']