# Setup

In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

In [5]:
# %pip install datasets transformers evaluate

In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import datasets
import tqdm
import evaluate
import torch.nn.functional as F
import math
from transformers import AutoTokenizer, MBartForConditionalGeneration, MBart50TokenizerFast, TrainingArguments, Trainer
from torch.autograd import Variable

RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_tf_utils because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [7]:
seed = 42

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

# I - Load dataset

In [8]:
dataset = datasets.load_dataset("harouzie/vi_en-translation")

In [9]:
train_data, test_data, valid_data = (dataset['train'], dataset['test'], dataset['valid'])

In [10]:
train_data[1]

{'English': 'The pharmacy is on Fresno Street',
 'Vietnamese': 'hiệu thuốc nằm trên đường fresno'}

# II - Setup tokenizer


In [11]:
UNK_TOKEN = "<unk>"
PAD_TOKEN = "<pad>"
BOS_TOKEN = "<s>"
EOS_TOKEN = "</s>"


In [12]:
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt",
										  unk_token=UNK_TOKEN,
										  pad_token=PAD_TOKEN,
										  bos_token=BOS_TOKEN,
										  eos_token=EOS_TOKEN)

In [13]:
tokenizer('hiệu thuốc nằm trên đường fresno')

{'input_ids': [250004, 6842, 19621, 33937, 2479, 7590, 73989, 157, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [14]:
len(tokenizer)

250054

# III - Prepare data

In [15]:
def convert_to_ids(example, tokenizer):
	en_ids = tokenizer(example["English"], truncation=True)
	vi_ids = tokenizer(example["Vietnamese"], truncation=True)
	return {"en_ids": en_ids['input_ids'], "vi_ids": vi_ids['input_ids']}

In [16]:
fn_kwargs = {"tokenizer":tokenizer}

train_data = train_data.map(convert_to_ids, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(convert_to_ids, fn_kwargs=fn_kwargs)
test_data = test_data.map(convert_to_ids, fn_kwargs=fn_kwargs)

Map:   0%|          | 0/25409 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [14]:
DATA_TYPE = "torch"
format_columns = ["en_ids", "vi_ids"]

train_data = train_data.with_format(
	type=DATA_TYPE, columns=format_columns, output_all_columns=True
)

valid_data = valid_data.with_format(
	type=DATA_TYPE,
	columns=format_columns,
	output_all_columns=True,
)

test_data = test_data.with_format(
	type=DATA_TYPE,
	columns=format_columns,
	output_all_columns=True,
)

In [15]:
def get_collate_fn(pad_index):
	def collate_fn(batch):
		batch_en_ids = [example["en_ids"] for example in batch]
		batch_vi_ids = [example["vi_ids"] for example in batch]
		batch_en_ids = nn.utils.rnn.pad_sequence(batch_en_ids, padding_value=pad_index)
		batch_vi_ids = nn.utils.rnn.pad_sequence(batch_vi_ids, padding_value=pad_index)
		batch_en_ids[0] = 0
		batch_vi_ids[0] = 0
		batch = {
			"en_ids": batch_en_ids.transpose(-2, -1),
			"vi_ids": batch_vi_ids.transpose(-2, -1),
		}
		return batch

	return collate_fn

In [16]:
def get_dataloader(dataset, batch_size, pad_index, shuffle=False):
	collate_fn = get_collate_fn(pad_index)
	data_loader = torch.utils.data.DataLoader(
		dataset=dataset,
		batch_size=batch_size,
		collate_fn=collate_fn,
		shuffle=shuffle,
	)
	return data_loader

In [17]:
PAD_INDEX = tokenizer.pad_token_id
UNK_INDEX = tokenizer.unk_token_id

In [18]:
BATCH_SIZE = 16

train_dataloader = get_dataloader(train_data, BATCH_SIZE, PAD_INDEX, shuffle=True)
valid_dataloader = get_dataloader(valid_data, BATCH_SIZE, PAD_INDEX)
test_dataloader = get_dataloader(test_data, BATCH_SIZE, PAD_INDEX)

In [19]:
# del train_data, valid_data, test_data

In [20]:
batch = next(iter(train_dataloader))
print(batch['en_ids'][0])
print(batch['vi_ids'][0])

tensor([     0,   3293,  18276,     83, 170277,      2,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1])
tensor([    0, 57658,  2455,  1617, 18844,  9457,     2,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1])


# IV - Model

In [21]:
class Embedding(nn.Module):
	def __init__(self, vocab_size, model_dim):
		super().__init__()
		self.vocab_size = vocab_size
		self.model_dim = model_dim
		self.emb = nn.Embedding(vocab_size, model_dim)

	def forward(self, x):
		return self.emb(x)

Embedding(100, 512)(torch.LongTensor([1,2,3,4])).shape

torch.Size([4, 512])

In [22]:
class PositionalEncoding(nn.Module):
	def __init__(self, model_dim, max_seq_len=144, dropout=0.2):
		super().__init__()
		self.model_dim = model_dim
		self.dropout = nn.Dropout(dropout)

		pos_enc = torch.zeros(max_seq_len, model_dim)

		for pos in range(max_seq_len):
			for i in range(0, model_dim, 2):
				pos_enc[pos, i] = math.sin(pos / (10000 ** (i / model_dim)))
				pos_enc[pos, i + 1] = math.cos(pos / (10000 ** (i / model_dim)))
		pos_enc.unsqueeze(0)
		self.register_buffer('pos_enc', pos_enc)

	def forward(self, x):
		x = x * math.sqrt(self.model_dim)
		seq_len = x.size(1)
		pos_enc = Variable(self.pos_enc[:seq_len], requires_grad=True)

		if x.is_cuda:
			pos_enc.cuda()
		x = x + pos_enc
		x = self.dropout(x)

		return x

PositionalEncoding(512)(torch.rand(5, 30, 512)).shape

torch.Size([5, 30, 512])

In [23]:
def self_attention(query, key, value, mask=None, dropout=None):
	"""
	q: [batch_size, head, seq_len, model_dim]
	k: [batch_size, head, seq_len, model_dim]
	v: [batch_size, head, seq_len, model_dim]
	mask: [batch_size, 1, seq_len]
	output: [batch_size, head, seq_len, model_dim]
	"""

	k_dim = key.size(1)
	scores = torch.matmul(query, key.transpose(-2, -1)) /  math.sqrt(k_dim)

	# Handle mask for Decoding
	if mask is not None:
		mask = mask.unsqueeze(1)
		scores = scores.masked_fill(mask==0, -1e4)

	scores = F.softmax(scores, dim=-1)

	if dropout is not None:
		scores = dropout(scores)

	output = torch.matmul(scores, value)
	return output, scores

test_output, test_scores = self_attention(torch.rand(32, 8, 30, 512), torch.rand(32, 8, 30, 512), torch.rand(32, 8, 30, 512))
print(test_output.shape, test_scores.shape)

torch.Size([32, 8, 30, 512]) torch.Size([32, 8, 30, 30])


In [24]:
class MultiHeadAttention(nn.Module):
	def __init__(self, num_heads, model_dim, dropout=0.2):
		super().__init__()
		assert model_dim % num_heads == 0, "Number of heads must be divisor of model dim"

		self.model_dim = model_dim
		self.k_dim = model_dim // num_heads
		self.num_heads = num_heads
		self.attn = None

		self.W_q = nn.Linear(model_dim, model_dim)
		self.W_k = nn.Linear(model_dim, model_dim)
		self.W_v = nn.Linear(model_dim, model_dim)
		self.out = nn.Linear(model_dim, model_dim)

		self.dropout = nn.Dropout(dropout)

	def forward(self, query, key, value, mask=None):
		"""
		q: [batch_size, seq_len, model_dim]
		k: [batch_size, seq_len, model_dim]
		v: [batch_size, seq_len, model_dim]
		mask: [batch_size, 1, seq_len]
		output: [batch_size, seq_len, model_dim]
		"""
		batch_size = query.size(0)
		query = self.W_q(query).view(batch_size, -1, self.num_heads, self.k_dim).transpose(1, 2)
		key = self.W_k(key).view(batch_size, -1, self.num_heads, self.k_dim).transpose(1, 2)
		value = self.W_v(value).view(batch_size, -1, self.num_heads, self.k_dim).transpose(1, 2)

		scores, self.attn = self_attention(query, key, value, mask, self.dropout)

		concat = scores.transpose(1, 2).contiguous().view(batch_size, -1, self.model_dim)
		output = self.out(concat)

		return output

MultiHeadAttention(8, 512)(torch.rand(32, 30, 512), torch.rand(32, 30, 512), torch.rand(32, 30, 512)).shape

torch.Size([32, 30, 512])

In [25]:
class FeedForward(nn.Module):
	def __init__(self, model_dim, ff_dim=1024, dropout=0.2):
		super().__init__()

		self.linear_1 = nn.Linear(model_dim, ff_dim)
		self.dropout = nn.Dropout(dropout)
		self.linear_2 = nn.Linear(ff_dim, model_dim)

	def forward(self, x):
		x = self.linear_1(x)
		x = self.dropout(F.relu(x))
		x = self.linear_2(x)
		return x

In [26]:
class EncoderLayer(nn.Module):
	def __init__(self, model_dim, num_heads, ff_dim=1024, dropout=0.2):
		super().__init__()
		self.norm_1 = nn.LayerNorm(model_dim)
		self.norm_2 = nn.LayerNorm(model_dim)
		self.attn = MultiHeadAttention(num_heads, model_dim, dropout)
		self.ffn = FeedForward(model_dim, ff_dim, dropout)
		self.dropout_1 = nn.Dropout(dropout)
		self.dropout_2 = nn.Dropout(dropout)

	def forward(self, x, mask):

		# Attention
		x2 = self.norm_1(x)
		x = x + self.dropout_1(self.attn(x2, x2, x2, mask))

		# Feed forward
		x2 = self.norm_2(x)
		x = x + self.dropout_2(self.ffn(x2))

		return x

EncoderLayer(512, 8)(torch.rand(32, 30, 512), torch.rand(32 , 1, 30)).shape

torch.Size([32, 30, 512])

In [27]:
class DecoderLayer(nn.Module):
	def __init__(self, model_dim, num_heads, ff_dim=1024, dropout=0.2):
		super().__init__()
		self.norm_1 = nn.LayerNorm(model_dim)
		self.norm_2 = nn.LayerNorm(model_dim)
		self.norm_3 = nn.LayerNorm(model_dim)

		self.dropout_1 = nn.Dropout(dropout)
		self.dropout_2 = nn.Dropout(dropout)
		self.dropout_3 = nn.Dropout(dropout)

		self.attn_1 = MultiHeadAttention(num_heads, model_dim, dropout)
		self.attn_2 = MultiHeadAttention(num_heads, model_dim, dropout)
		self.ffn = FeedForward(model_dim, ff_dim, dropout)

	def forward(self, x, enc_output, src_mask, trg_mask):
		"""
		x: [batch_size, seq_len, model_dim]
		e_outputs: [batch_size, seq_len, model_dim]
		src_mask: [batch_size, 1, seq_len]
		trg_mask: [batch_size, 1, seq_len]
		"""

		# Attention 1
		x2 = self.norm_1(x)
		x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))

		# Attention 2
		x2 = self.norm_2(x)
		x = x + self.dropout_2(self.attn_2(x2, enc_output, enc_output, src_mask))

		# FFN
		x2 = self.norm_3(x)
		x = x + self.dropout_3(self.ffn(x2))

		return x

DecoderLayer(512, 8)(torch.rand(32, 30, 512), torch.rand(32, 30, 512), torch.rand(32, 1, 30), torch.rand(32, 1, 30)).shape

torch.Size([32, 30, 512])

In [28]:
class Encoder(nn.Module):
	def __init__(self, vocab_size, model_dim, N_layers, num_heads, max_seq_len=144, ff_dim=1024, dropout=0.2):
		super().__init__()
		self.N_layers = N_layers
		self.embed = nn.Embedding(vocab_size, model_dim)
		self.pos_enc = PositionalEncoding(model_dim, max_seq_len, dropout)
		self.encoder_layers = nn.ModuleList(EncoderLayer(model_dim, num_heads, ff_dim, dropout)
									for _ in range(N_layers))
		self.norm = nn.LayerNorm(model_dim)

	def forward(self, src, mask):
		"""
		src: [batch_size, seq_len]
		mask: [batch_size, 1, seq_len]
		output: [batch_size, seq_len, model_dim]
		"""

		x = self.embed(src)
		x = self.pos_enc(x)
		for layer in self.encoder_layers:
			x = layer(x, mask)
		return self.norm(x)

Encoder(232, 512, 6, 8)(torch.LongTensor(32, 30).random_(0, 10), torch.rand(32, 1, 30)).shape

torch.Size([32, 30, 512])

In [29]:
class Decoder(nn.Module):
	def __init__(self, vocab_size, model_dim, N_layers, num_heads, max_seq_len=144, ff_dim=1024, dropout=0.2):
		super().__init__()
		self.N_layers = N_layers
		self.embed = nn.Embedding(vocab_size, model_dim)
		self.pos_enc = PositionalEncoding(model_dim, max_seq_len, dropout)
		self.decoder_layers = nn.ModuleList(DecoderLayer(model_dim, num_heads, ff_dim, dropout)
									for _ in range(N_layers))
		self.norm = nn.LayerNorm(model_dim)

	def forward(self, trg, enc_output, src_mask, trg_mask):
		"""
		trg: [batch_size, seq_len]
		enc_output: [batch_size, seq_len, model_dim]
		src_mask: [batch_size, 1, seq_len]
		trg_mask: [batch_size, 1, seq_len]
		output: [batch_size, seq_len, model_dim]
		"""
		x = self.embed(trg)
		x = self.pos_enc(x)
		for layer in self.decoder_layers:
			x = layer(x, enc_output, src_mask, trg_mask)
		return self.norm(x)


Decoder(232, 512, 6, 8)(torch.LongTensor(32, 30).random_(0, 10), torch.rand(32, 30, 512), torch.rand(32, 1, 30), torch.rand(32, 1, 30)).shape

torch.Size([32, 30, 512])

In [30]:
class Transformer(nn.Module):
	def __init__(self, src_vocab_size, trg_vocab_size, model_dim, N_layers, num_heads, max_seq_len=144, ff_dim=2048, dropout=0.2):
		super().__init__()
		self.encoder = Encoder(src_vocab_size, model_dim, N_layers, num_heads, max_seq_len, ff_dim, dropout)
		self.decoder = Decoder(trg_vocab_size, model_dim, N_layers, num_heads, max_seq_len, ff_dim, dropout)
		self.out = nn.Linear(model_dim, trg_vocab_size)

	def forward(self, src, trg, src_mask=None, trg_mask=None):
		"""
		src: [batch_size, seq_len]
		trg: [batch_size, seq_len]
		src_mask: [batch_size, 1, seq_len]
		trg_mask [batch_size, 1, seq_len]
		output: [batch_size, seq_len, vocab_size]
		"""
		enc_output = self.encoder(src, src_mask)
		dec_output = self.decoder(trg, enc_output, src_mask, trg_mask)
		output = self.out(dec_output)
		return output

Transformer(232, 232, 512, 6, 8)(torch.LongTensor(32, 30).random_(0, 10), torch.LongTensor(32, 30).random_(0, 10),torch.rand(32, 1, 30),torch.rand(32, 1, 30)).shape

torch.Size([32, 30, 232])

# V - Training

In [31]:
def generate_masks(src, trg):
	src_mask = (src != tokenizer.pad_token_id).unsqueeze(1)  # [batch_size, 1, src_len]
	trg_mask = (trg != tokenizer.pad_token_id).unsqueeze(1)  # [batch_size, 1, trg_len]
	seq_len = trg.size(1)
	nopeak_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool().to(trg.device)
	trg_mask = trg_mask & (~nopeak_mask).unsqueeze(0)  # Combine padding and causal mask
	return src_mask, trg_mask

In [32]:
def get_lr(step_num, model_dim=512, warmup_steps=4000):
	return model_dim ** (-0.5) * min(step_num ** (-0.5), step_num * warmup_steps ** (-1.5))

In [33]:
SRC_VOCAB_SIZE = tokenizer.vocab_size
TRG_VOCAB_SIZE = tokenizer.vocab_size
MODEL_DIM = 512
N_LAYERS = 6
NUM_HEADS = 8
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Transformer(
	src_vocab_size=SRC_VOCAB_SIZE,
	trg_vocab_size=TRG_VOCAB_SIZE,
	model_dim=MODEL_DIM,
	N_layers=N_LAYERS,
	num_heads=NUM_HEADS
)

In [34]:
def init_weights(m):
	for _, param in m.named_parameters():
		if param.dim() > 1:
			nn.init.xavier_uniform_(param.data)

model.apply(init_weights)

Transformer(
  (encoder): Encoder(
    (embed): Embedding(250054, 512)
    (pos_enc): PositionalEncoding(
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (encoder_layers): ModuleList(
      (0-5): 6 x EncoderLayer(
        (norm_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiHeadAttention(
          (W_q): Linear(in_features=512, out_features=512, bias=True)
          (W_k): Linear(in_features=512, out_features=512, bias=True)
          (W_v): Linear(in_features=512, out_features=512, bias=True)
          (out): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (ffn): FeedForward(
          (linear_1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
          (linear_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (dropo

In [35]:
def count_parameters(model):
	return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 428,473,542 trainable parameters


In [36]:
src = batch['en_ids']
trg = batch['vi_ids']
src_mask, trg_mask = generate_masks(src, trg)
output = model(src, trg, src_mask, trg_mask)

In [37]:
torch.argmax(output[0], dim=-1)

tensor([182222, 225228, 172370, 143265, 244996,  81288, 225353,  32492,  95782,
         52813, 210280, 225027, 149617,   3643, 115520,  16277,  99576,  19704,
         35067, 115520, 150028, 200847,  97937, 249910,   9544, 249910,   8030,
        201780,  65130, 216548])

In [38]:
class LabelSmoothingLoss(nn.Module):
    def __init__(self, vocab_size, smoothing=0.1, ignore_index=None):
        super(LabelSmoothingLoss, self).__init__()
        self.vocab_size = vocab_size
        self.smoothing = smoothing
        self.ignore_index = ignore_index
        self.confidence = 1.0 - smoothing

    def forward(self, output, target):
        """
        output: Tensor of shape [batch, seq_len, vocab_size] - Logits
        target: Tensor of shape [batch, seq_len] - IDs target
        """
        true_dist = torch.zeros_like(output)
        true_dist.fill_(self.smoothing / (self.vocab_size - (1 if self.ignore_index is not None else 0)))
        true_dist.scatter_(-1, target.unsqueeze(-1), self.confidence) # Dim = 2 (vocab dim)
        if self.ignore_index is not None:
            mask = (target == self.ignore_index)  # [batch, seq_len]
            true_dist[mask] = 0

        log_probs = torch.log_softmax(output, dim=-1)  # [batch, seq_len, vocab_size]
        loss = -torch.sum(true_dist * log_probs, dim=-1)

        if self.ignore_index is not None:
            mask = (target != self.ignore_index).float()
            loss = loss * mask
            return torch.sum(loss) / torch.sum(mask)
        return torch.mean(loss)

In [39]:
from torch.amp import autocast, GradScaler
scaler = GradScaler('cuda')

In [40]:
optimizer = optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-9)
criterion = LabelSmoothingLoss(vocab_size=tokenizer.vocab_size, smoothing=0.1, ignore_index=PAD_INDEX)

In [41]:
criterion(output, trg)

tensor(12.4315, grad_fn=<DivBackward0>)

In [42]:
class Trainer():
    def __init__(self):
        self.global_step = 0

    def train_model(self, model, data_loader, optimizer, criterion, scaler, clip, device):
        model.to(device).train()
        epoch_loss = 0
        for i, batch in enumerate(data_loader):
            self.global_step += 1
            src = batch["en_ids"].to(device)
            trg = batch["vi_ids"].to(device)
            src_mask, trg_mask = generate_masks(src, trg)

            # Forward
            lr = get_lr(self.global_step, model_dim=512, warmup_steps=4000)
            for param_group in optimizer.param_groups:
                param_group["lr"] = lr

            with autocast("cuda"):
                output = model(src, trg, src_mask, trg_mask)
                loss = criterion(output, trg)
            # Update
            optimizer.zero_grad()
            scaler.scale(loss).backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            scaler.step(optimizer)
            scaler.update()

            epoch_loss += loss.item()
            if (i + 1) % 500 == 0:
                    print(f"Batch: {i + 1}/ {len(data_loader)}: Loss {epoch_loss / (i+1):.3f}")
            if (i + 1) % 2000 == 0:
                    torch.save(model.state_dict(), "/content/drive/MyDrive/model.pth")
            del src, trg, output, loss, batch
            torch.cuda.empty_cache()
        return epoch_loss / len(data_loader)

    def evaluate_model(self, model, data_loader, criterion, device):
        model.to(device).eval()
        epoch_loss = 0
        with torch.no_grad():
            for i, batch in enumerate(data_loader):
                src = batch["en_ids"].to(device)
                trg = batch["vi_ids"].to(device)
                src_mask, trg_mask = generate_masks(src, trg)

                with autocast('cuda'):
                    output = model(src, trg, src_mask, trg_mask)  # turn off teacher forcing
                    loss = criterion(output, trg)
                epoch_loss += loss.item()

        return epoch_loss / len(data_loader)

In [43]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:25"
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

In [44]:
n_epochs = 1
clip = 1.0
trainer = Trainer()

best_valid_loss = float("inf")

for epoch in tqdm.tqdm(range(n_epochs)):
	train_loss = trainer.train_model(
		model,
		train_dataloader,
		optimizer,
		criterion,
		scaler,
		clip,
		DEVICE
	)
	torch.save(model.state_dict(), "/content/drive/MyDrive/model.pth")
	print(f"\n\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
	valid_loss = trainer.evaluate_model(
		model,
		valid_dataloader,
		criterion,
		DEVICE
	)
	print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")
	if valid_loss < best_valid_loss:
		best_valid_loss = valid_loss
		torch.save(model.state_dict(), "/content/drive/MyDrive/model.pth")

  0%|          | 0/1 [00:00<?, ?it/s]

Batch: 500/ 12705: Loss 10.566
Batch: 1000/ 12705: Loss 8.387
Batch: 1500/ 12705: Loss 6.997
Batch: 2000/ 12705: Loss 5.934
Batch: 2500/ 12705: Loss 5.175
Batch: 3000/ 12705: Loss 4.634
Batch: 3500/ 12705: Loss 4.234
Batch: 4000/ 12705: Loss 3.929
Batch: 4500/ 12705: Loss 3.689
Batch: 5000/ 12705: Loss 3.494
Batch: 5500/ 12705: Loss 3.334
Batch: 6000/ 12705: Loss 3.199
Batch: 6500/ 12705: Loss 3.085
Batch: 7000/ 12705: Loss 2.987
Batch: 7500/ 12705: Loss 2.901
Batch: 8000/ 12705: Loss 2.826
Batch: 8500/ 12705: Loss 2.759
Batch: 9000/ 12705: Loss 2.700
Batch: 9500/ 12705: Loss 2.647
Batch: 10000/ 12705: Loss 2.599
Batch: 10500/ 12705: Loss 2.556
Batch: 11000/ 12705: Loss 2.516
Batch: 11500/ 12705: Loss 2.480
Batch: 12000/ 12705: Loss 2.446
Batch: 12500/ 12705: Loss 2.416

	Train Loss:   2.404 | Train PPL:  11.066
	Valid Loss:   1.703 | Valid PPL:   5.493


100%|██████████| 1/1 [1:11:37<00:00, 4297.64s/it]


# Evaluation

In [45]:
SRC_VOCAB_SIZE = tokenizer.vocab_size
TRG_VOCAB_SIZE = tokenizer.vocab_size
MODEL_DIM = 512
N_LAYERS = 6
NUM_HEADS = 8
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# DEVICE = torch.device('cpu')

model = Transformer(
	src_vocab_size=SRC_VOCAB_SIZE,
	trg_vocab_size=TRG_VOCAB_SIZE,
	model_dim=MODEL_DIM,
	N_layers=N_LAYERS,
	num_heads=NUM_HEADS
)

In [46]:
model.load_state_dict(torch.load("/content/drive/MyDrive/model.pth", weights_only=True))

<All keys matched successfully>

In [47]:
model.to(DEVICE)

Transformer(
  (encoder): Encoder(
    (embed): Embedding(250054, 512)
    (pos_enc): PositionalEncoding(
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (encoder_layers): ModuleList(
      (0-5): 6 x EncoderLayer(
        (norm_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiHeadAttention(
          (W_q): Linear(in_features=512, out_features=512, bias=True)
          (W_k): Linear(in_features=512, out_features=512, bias=True)
          (W_v): Linear(in_features=512, out_features=512, bias=True)
          (out): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (ffn): FeedForward(
          (linear_1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
          (linear_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (dropo

In [48]:
test_loss = trainer.evaluate_model(model, test_dataloader, criterion, DEVICE)
print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |")

| Test Loss: 1.707 | Test PPL:   5.511 |


In [49]:
def beam_search_decode(model, src, src_mask, tokenizer, beam_width=5, max_len=144, length_penalty=1.0):
    device = next(model.parameters()).device
    model.eval()

    initial_trg = torch.tensor([[tokenizer.bos_token_id]], device=device)
    beams = [(initial_trg, 0.0)]  # [(seq, log-prob)]

    with torch.no_grad():
        for step in range(max_len):
            new_beams = []
            for trg, score in beams:
                # Create causal mask for target (only look at previous token)
                trg_mask = torch.triu(torch.ones(trg.size(1), trg.size(1)), diagonal=1).bool().to(device)
                trg_mask = (~trg_mask).unsqueeze(0)  # [seq_len, seq_len]

                # Predict next word
                output = model(src, trg, src_mask, trg_mask)
                logits = output[:, -1, :]  # Final word logits [batch=1, vocab_size]
                probs = torch.log_softmax(logits, dim=-1)  # Log-probabilities
                top_k_probs, top_k_ids = probs.topk(beam_width, dim=-1)  # Get top-k

                # Expand each beam
                for prob, token_id in zip(top_k_probs[0], top_k_ids[0]):
                    new_score = score + prob.item()
                    new_trg = torch.cat([trg, token_id.unsqueeze(0).unsqueeze(0)], dim=1)
                    # Apply penalty
                    adjusted_score = new_score / ((new_trg.size(1) / 5.0) ** length_penalty)
                    new_beams.append((new_trg, adjusted_score))  # Keep original score to sort accurately

            # Sort and keep top beam_width beams
            beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]

            # Check if all beams have finished (got </s>)
            all_done = all(tokenizer.eos_token_id in beam[0][0] for beam in beams)
            if all_done:
                break

    # Choose beam has highest score
    best_beam = max(beams, key=lambda x: x[1])
    return best_beam[0]  # Return best sequence [1, seq_len]

def translate_sentence(sentence, model, tokenizer, beam_width=5, max_len=128):
    device = next(model.parameters()).device
    model.eval()

    # Tokenize src
    inputs = tokenizer(sentence, return_tensors="pt", max_length=max_len, truncation=True, padding="max_length")
    src = inputs["input_ids"].to(device)
    src_mask = (src != tokenizer.pad_token_id).unsqueeze(1)

    # Call beam
    translated_tokens = beam_search_decode(model, src, src_mask, tokenizer, beam_width, max_len)

    # Decode
    translation = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
    return translation

# Thử nghiệm dịch
sentence = "The pharmacy is on Fresno Street"
translation = translate_sentence(sentence, model, tokenizer, beam_width=5)
print(f"Translation: {translation}")

Translation: 


In [50]:
tokenizer.bos_token_id

0

In [51]:
translate_sentence("Who am I?", model, tokenizer, 5)

''

In [52]:
translate_sentence("She sells seashell on the seashore", model, tokenizer, 5)

''

In [53]:
bleu = evaluate.load("bleu")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [54]:
# dataset = datasets.load_dataset("harouzie/vi_en-translation")
test_data= dataset['test']

In [None]:
translations = [
	translate_sentence(
		example["English"],
		model,
		tokenizer
	)
	for example in test_data
]

In [None]:
predictions = [translation for translation in translations]

references = [[example["vi_i"]] for example in test_data]

In [None]:
predictions[0]

In [None]:
references[0]

In [None]:
results = bleu.compute(
	predictions=predictions, references=references
)

In [None]:
results