In [2]:
from pprint import pprint

# I - Data preprocessing

## I.1 - Download dataset

In [3]:
# ! pip install datasets

In [4]:
from datasets import load_dataset

In [5]:
data_name = 'ncduy/mt-en-vi'
data = load_dataset(data_name)

README.md:   0%|          | 0.00/3.43k [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/597M [00:00<?, ?B/s]

valid.csv:   0%|          | 0.00/2.45M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/2.43M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2884451 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11316 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11225 [00:00<?, ? examples/s]

In [6]:
sample_size = 100000
train_data = data['train'].shuffle(seed=42).select(range(sample_size))
validation_data = data['validation'].shuffle(seed=42).select(range(int(0.1*sample_size)))
test_data = data['test'].shuffle(seed=42).select(range(int(0.1*sample_size)))

In [7]:
print(train_data.shape)
pprint(train_data[0])

(100000, 3)
{'en': '"education and leading me to providing service to the Republic."',
 'source': 'OpenSubtitles v2018',
 'vi': '"nơi giáo dục và dìu dắt tôi để tôi có thể phụng sự nền Cộng Hòa"'}


## I.2 - Process data using SentencePiece

In [8]:
import sentencepiece as spm
import os

In [9]:
# Save data into files
def save_to_file(data, filename):
    with open(filename+'.en', 'w', encoding='utf-8') as f_en, open(filename+'.vi', 'w', encoding='utf-8') as f_vi:
        for sample in data:
            f_en.write(sample['en'] + '\n')
            f_vi.write(sample['vi'] + '\n')

In [10]:
save_to_file(train_data, '/kaggle/working/train')
save_to_file(validation_data, '/kaggle/working/validation')
save_to_file(test_data, '/kaggle/working/test')

In [11]:
# Training SentencePiece
def training_sentencepiece(trainfile, prefix, vocab_size=11140, type='bpe'):
    spm.SentencePieceTrainer.Train(
        input=trainfile+'.en',
        model_prefix=prefix+'_en',
        vocab_size=vocab_size,
        model_type=type
    )
    spm.SentencePieceTrainer.Train(
        input=trainfile+'.vi',
        model_prefix=prefix+'_vi',
        vocab_size=vocab_size,
        model_type=type
    )
    
    # Load trained tokenizer
    sp_en = spm.SentencePieceProcessor(model_file=prefix+"_en.model")
    sp_vi = spm.SentencePieceProcessor(model_file=prefix+"_vi.model")
    
    return sp_en, sp_vi

In [12]:
trainfile = '/kaggle/working/train'
prefix = '/kaggle/working/spm'
sp_en, sp_vi = training_sentencepiece(trainfile, prefix)

In [13]:
example_en = "education and leading me to providing service to the Republic."
example_vi = "Tôi yêu đại bàng!"

print(sp_en.encode(example_en, out_type=str))
print(sp_vi.encode(example_vi, out_type=str))

['▁education', '▁and', '▁leading', '▁me', '▁to', '▁providing', '▁service', '▁to', '▁the', '▁Republic', '.']
['▁Tôi', '▁yêu', '▁đại', '▁bàng', '!']


## I.3 - Convert data to Tensor

In [14]:
import torch
from torch.utils.data import Dataset, DataLoader

In [15]:
class TranslationDataset(Dataset):
    def __init__(self, src_texts, tgt_texts, src_tokenizer, tgt_tokenizer, max_length=128):
        self.src_texts = src_texts
        self.tgt_texts = tgt_texts
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.src_texts)

    def __getitem__(self, idx):
        # Tokenize input (source - English)
        src_ids = self.src_tokenizer.encode(self.src_texts[idx])[:self.max_length - 1]  
        src_ids.append(self.src_tokenizer.eos_id())  # Thêm <EOS>

        # Tokennize output (target - Vietnamese)
        tgt_ids = self.tgt_tokenizer.encode(self.tgt_texts[idx])[:self.max_length - 2]  
        tgt_ids = [self.tgt_tokenizer.bos_id()] + tgt_ids + [self.tgt_tokenizer.eos_id()]  # Add <SOS> and <EOS>

        return torch.tensor(src_ids), torch.tensor(tgt_ids)


# Padding batch
def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    
    src_batch = torch.nn.utils.rnn.pad_sequence(src_batch, batch_first=True, padding_value=0)
    tgt_batch = torch.nn.utils.rnn.pad_sequence(tgt_batch, batch_first=True, padding_value=0)

    return {"src": src_batch, "tgt": tgt_batch}

In [16]:
train_src_texts = train_data['en']
train_tgt_texts = train_data['vi']

train_dataset = TranslationDataset(train_src_texts, train_tgt_texts, sp_en, sp_vi)

In [17]:
val_src_texts = validation_data['en']
val_tgt_texts = validation_data['vi']

val_dataset = TranslationDataset(val_src_texts, val_tgt_texts, sp_en, sp_vi)

In [18]:
# DataLoader
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

In [19]:
batch = next(iter(train_dataloader))
print(batch['src'].shape)
pprint(batch['src'])

torch.Size([32, 64])
tensor([[ 2598,   295,   225,  ...,     0,     0,     0],
        [  286,   884,  1094,  ...,     0,     0,     0],
        [   36,   184, 11000,  ...,     0,     0,     0],
        ...,
        [  488, 11000, 10982,  ...,     0,     0,     0],
        [  201,  1007, 10995,  ...,     0,     0,     0],
        [  385,     8,    49,  ...,     0,     0,     0]])


# II - Build model

In [20]:
import torch.nn as nn

In [21]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, num_layers, dropout=0.3, pad_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)
        self.rnn = nn.GRU(emb_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        embedded = self.embedding(src)
        embedded = self.dropout(embedded)
        output, hidden = self.rnn(embedded)
        return output, hidden
        

In [22]:
# Luong Attention
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim, hidden_dim, bias=False)
        
    def forward(self, hidden, encoder_output):
        """
        hidden: (1, batch, hidden_dim) - decoder hidden state 
        encoder_output: (batch, seq_len, hidden_dim) - encoder output
        """
        hidden = hidden[-1].unsqueeze(0)  # (batch, 1, hidden_dim)
        
        # Compute attention scores
        scores = torch.matmul(self.attn(hidden).transpose(0, 1), encoder_output.permute(0, 2, 1))
        attn_weights = torch.softmax(scores, dim=-1)  # (batch, 1, seq_len)
        
        # Get context vector
        context = torch.matmul(attn_weights, encoder_output)  # (batch, 1, hidden_dim)
        # print(context.size())
        return context.squeeze(1), attn_weights

In [23]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, num_layers, dropout, attention, pad_idx=0):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=pad_idx)
        self.rnn = nn.GRU(emb_dim + hidden_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.attention = attention

    def forward(self, input, hidden, encoder_output):
        """
        input: (batch,)
        hidden: (1, batch, hidden_dim)
        encoder_output: (batch, seq_len, hidden_dim)
        """
        input = input.unsqueeze(0)  # (1, batch)
        embedded = self.dropout(self.embedding(input))  # (1, batch, emb_dim)
        
        # Get context vector from attention
        context, attn_weights = self.attention(hidden, encoder_output)  # (batch, hidden_dim), (batch, 1, seq_len)

        # Combine context vector with input
        rnn_input = torch.cat((embedded, context.unsqueeze(0)), dim=2)  # (1, batch, emb_dim + hidden_dim)
        
        output, hidden = self.rnn(rnn_input.permute(1, 0, 2), hidden)  # output: (batch, 1, hidden_dim)
        
        # Predict next word
        prediction = self.fc_out(torch.cat((output.squeeze(1), context), dim=1))  # (batch, output_dim)
        return prediction, hidden, attn_weights


In [24]:
import random

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """
        src: (batch, seq_len)
        trg: (batch, seq_len)
        """
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        
        # outputs = torch.zeros(trg_len, batch_size, trg_vocab_size)
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

        # Encode input
        encoder_output, hidden = self.encoder(src)

        # First token <SOS>
        input = trg[:, 0]

        for t in range(1, trg_len):
            output, hidden, _ = self.decoder(input, hidden, encoder_output)
            outputs[t] = output

            # Teacher forcing
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)

            input = trg[:, t] if teacher_force else top1

        return outputs


In [25]:
# Hyperparams
INPUT_DIM = len(sp_en)
OUTPUT_DIM = len(sp_vi)
EMB_DIM = 256
HIDDEN_DIM = 512
NUM_LAYERS = 2
DROPOUT = 0.3
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Init Encoder & Decoder 
attention = Attention(HIDDEN_DIM)
encoder = Encoder(INPUT_DIM, EMB_DIM, HIDDEN_DIM, NUM_LAYERS, DROPOUT)
decoder = Decoder(OUTPUT_DIM, EMB_DIM, HIDDEN_DIM, NUM_LAYERS, DROPOUT, attention)

# Init Seq2Seq
model = Seq2Seq(encoder, decoder, DEVICE)


# III - Train and Evaluate model

In [26]:
import torch.optim as optim
import torch.nn as nn
from tqdm import tqdm


LEARNING_RATE = 0.001
PAD_IDX = 0

criterion = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)  
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [27]:
import wandb
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
my_secret = user_secrets.get_secret("wandb-api")
wandb.login(key=my_secret)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mlongluv1605[0m ([33mlongluv1605-institute-for-artificial-intelligence[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [28]:
import random
random.seed(42)

def train(model, iterator, optimizer, criterion, clip, teacher_forcing_ratio=0.5):
    model.train()
    epoch_loss = 0
    model.to(DEVICE)
    for batch in tqdm(iterator, desc=f'Training: '):
        src, trg = batch['src'], batch['tgt']
        src, trg = src.to(DEVICE), trg.to(DEVICE)
        
        optimizer.zero_grad()

        output = model(src, trg, teacher_forcing_ratio)

        output_dim = output.shape[-1]
        output = output[1:].reshape(-1, output_dim)  # [batch_size * trg_len, vocab_size]
        trg = trg[:, 1:].reshape(-1)  # [batch_size * trg_len]

        loss = criterion(output, trg)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        epoch_loss += loss.item()

    return epoch_loss / len(iterator)


In [29]:
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for src, trg in tqdm(iterator, desc='Evaluating: '):
            src, trg = batch['src'], batch['tgt']
            src, trg = src.to(DEVICE), trg.to(DEVICE)
            
            output = model(src, trg, teacher_forcing_ratio=0)

            output_dim = output.shape[-1]
            output = output[1:].reshape(-1, output_dim)
            trg = trg[:, 1:].reshape(-1)

            loss = criterion(output, trg)
            epoch_loss += loss.item()
    print()
    return epoch_loss / len(iterator)


In [30]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


In [None]:
N_EPOCHS = 20
CLIP = 5  # Max gradient norm

train_losses = []
val_losses = []

for epoch in range(N_EPOCHS):
    train_loss = train(model, train_dataloader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, val_dataloader, criterion)

    best_loss = float('inf')
    
    print(f'Epoch {epoch+1}:')
    print(f'  Train Loss: {train_loss:.3f}')
    print(f'  Valid Loss: {valid_loss:.3f}')

    if train_loss < best_loss:
        best_loss = train_loss
        torch.save(model.state_dict(), '/kaggle/working/seq2seqwattn.pth')
    
        # Save as artifact for version control.
        run = wandb.init(project='machine-translator')
        artifact = wandb.Artifact('seq2seqwattn', type='model')
        artifact.add_file('/kaggle/working/seq2seqwattn.pth')
        run.log_artifact(artifact)
        run.finish()

    train_losses.append(train_loss)
    val_losses.append(valid_loss)

Training: 100%|██████████| 3125/3125 [28:20<00:00,  1.84it/s]
Evaluating: 100%|██████████| 313/313 [00:49<00:00,  6.29it/s]



Epoch 1:
  Train Loss: 7.091
  Valid Loss: 7.098


Training: 100%|██████████| 3125/3125 [28:18<00:00,  1.84it/s]
Evaluating: 100%|██████████| 313/313 [00:49<00:00,  6.34it/s]



Epoch 2:
  Train Loss: 7.061
  Valid Loss: 7.105


Training: 100%|██████████| 3125/3125 [28:31<00:00,  1.83it/s]
Evaluating: 100%|██████████| 313/313 [00:49<00:00,  6.35it/s]



Epoch 3:
  Train Loss: 7.057
  Valid Loss: 7.101


Training: 100%|██████████| 3125/3125 [28:25<00:00,  1.83it/s]
Evaluating: 100%|██████████| 313/313 [00:48<00:00,  6.51it/s]



Epoch 4:
  Train Loss: 7.054
  Valid Loss: 7.105


Training: 100%|██████████| 3125/3125 [28:27<00:00,  1.83it/s]
Evaluating: 100%|██████████| 313/313 [00:49<00:00,  6.39it/s]



Epoch 5:
  Train Loss: 7.054
  Valid Loss: 7.094


Training: 100%|██████████| 3125/3125 [28:28<00:00,  1.83it/s]
Evaluating: 100%|██████████| 313/313 [00:48<00:00,  6.40it/s]



Epoch 6:
  Train Loss: 7.053
  Valid Loss: 7.098


Training: 100%|██████████| 3125/3125 [28:27<00:00,  1.83it/s]
Evaluating: 100%|██████████| 313/313 [00:48<00:00,  6.52it/s]



Epoch 7:
  Train Loss: 7.052
  Valid Loss: 7.102


Training: 100%|██████████| 3125/3125 [28:27<00:00,  1.83it/s]
Evaluating: 100%|██████████| 313/313 [00:48<00:00,  6.42it/s]



Epoch 8:
  Train Loss: 7.052
  Valid Loss: 7.105


Training: 100%|██████████| 3125/3125 [28:20<00:00,  1.84it/s]
Evaluating: 100%|██████████| 313/313 [00:48<00:00,  6.49it/s]



Epoch 9:
  Train Loss: 7.052
  Valid Loss: 7.101


Training: 100%|██████████| 3125/3125 [28:24<00:00,  1.83it/s]
Evaluating: 100%|██████████| 313/313 [00:48<00:00,  6.49it/s]



Epoch 10:
  Train Loss: 7.051
  Valid Loss: 7.108


Training:  85%|████████▍ | 2648/3125 [23:55<04:02,  1.97it/s]

In [None]:
torch.save(model.state_dict(), '/kaggle/working/seq2seq.pth')

In [None]:
model.load_state_dict(torch.load("/kaggle/working/seq2seq.pth", 
                                 map_location=torch.device("cpu"),
                                 weights_only=True))
model.eval()  

In [None]:
import torch

def translate_sentence(sentence, sp_en, trg_vocab, model, device, max_len=128):
    model.to(device).eval()
    
    # Tokenize input using SentencePiece
    tokens = sp_en.encode(sentence, out_type=str)  # Tokenize
    print("Tokenized input:", tokens)
    
    # Chuyển token thành chỉ mục (ID)
    src_indexes = sp_en.encode(sentence)  # SentencePiece auto convert to ID
    src_tensor = torch.tensor(src_indexes, dtype=torch.long).unsqueeze(0).to(device)  # [1, seq_len]
    with torch.inference_mode():
        _, hidden = model.encoder(src_tensor)  # Encode input
    
    trg_indexes = [trg_vocab['stoi']["<s>"]]  # Begin with <sos>
    for _ in range(max_len):
        trg_tensor = torch.tensor([trg_indexes[-1]], dtype=torch.long).to(device)  # [1]
        with torch.inference_mode():
            output, hidden = model.decoder(trg_tensor, hidden)  # Get output from decoder
        pred_token = output.argmax(1).item()  # Get token has highest probs
        trg_indexes.append(pred_token)

        if pred_token == trg_vocab['stoi']["</s>"]:  # End if <eos>
            break

    # Convert ID to word
    trg_tokens = [trg_vocab['itos'][i] for i in trg_indexes]
    return " ".join(trg_tokens[1:-1])  # Remove <sos> and <eos>


In [None]:
trg_vocab = {
    "stoi": {sp_vi.id_to_piece(i): i for i in range(sp_vi.get_piece_size())},
    "itos": {i: sp_vi.id_to_piece(i) for i in range(sp_vi.get_piece_size())},
}

sentence = "How are you?"
translated_sentence = translate_sentence(sentence, sp_en, trg_vocab, model, device="cpu")
print("Translated:", translated_sentence)
