In [1]:
from pprint import pprint

# I - Data preprocessing

## I.1 - Download dataset

In [2]:
# ! pip install datasets

In [3]:
from datasets import load_dataset

In [4]:
# Identifier of the English-Vietnamese parallel corpus passed to `load_dataset`.
data_name = 'ncduy/mt-en-vi'
# The 'train', 'validation' and 'test' splits of the result are indexed by the sampling cell below.
data = load_dataset(data_name)

In [5]:
# Number of training pairs to keep; validation and test each keep 10% of that.
sample_size = 100000
# Shuffle with a fixed seed so the same subset is selected on every run.
train_data = data['train'].shuffle(seed=42).select(range(sample_size))
validation_data = data['validation'].shuffle(seed=42).select(range(int(0.1*sample_size)))
test_data = data['test'].shuffle(seed=42).select(range(int(0.1*sample_size)))

In [6]:
# Sanity check: shape of the sampled training split and its first record.
print(train_data.shape)
pprint(train_data[0])

(100000, 3)
{'en': '"education and leading me to providing service to the Republic."',
 'source': 'OpenSubtitles v2018',
 'vi': '"nơi giáo dục và dìu dắt tôi để tôi có thể phụng sự nền Cộng Hòa"'}


## I.2 - Process data using SentencePiece

In [7]:
import sentencepiece as spm
import os

In [8]:
# Save data into files
def save_to_file(data, filename):
    """Write a parallel corpus to ``<filename>.en`` and ``<filename>.vi``.

    Line ``i`` of the ``.en`` file and line ``i`` of the ``.vi`` file always
    belong to the same sample.

    Args:
        data: Iterable of mappings with string values under ``'en'`` and ``'vi'``.
        filename: Output path without extension; missing parent directories
            are created.
    """
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename + '.en', 'w', encoding='utf-8') as f_en, \
            open(filename + '.vi', 'w', encoding='utf-8') as f_vi:
        for sample in data:
            # A sentence containing a line break would otherwise occupy several
            # lines in one file only and shift every following pair out of sync.
            f_en.write(' '.join(sample['en'].splitlines()) + '\n')
            f_vi.write(' '.join(sample['vi'].splitlines()) + '\n')

In [9]:
# Write every split to disk as parallel text files. The SentencePiece training
# cell reads 'data/train.en' and 'data/train.vi', so this must run on a fresh
# kernel / fresh checkout before the tokenizers can be trained.
os.makedirs('data', exist_ok=True)
save_to_file(train_data, 'data/train')
save_to_file(validation_data, 'data/validation')
save_to_file(test_data, 'data/test')

In [10]:
# Training SentencePiece
def training_sentencepiece(trainfile, prefix, vocab_size=32000, type='bpe'):
    """Train one SentencePiece model per language and return both tokenizers.

    Reads ``<trainfile>.en`` / ``<trainfile>.vi`` and writes
    ``<prefix>_en.model`` / ``<prefix>_vi.model`` (plus the matching vocab files).

    The special-token ids are fixed to ``<pad>=0, <unk>=1, <s>=2, </s>=3``.
    With the SentencePiece defaults, id 0 is ``<unk>`` and no padding piece
    exists, while the rest of this notebook pads batches with 0 and ignores
    index 0 in the embedding and in the loss; reserving 0 for ``<pad>`` keeps
    unknown tokens from being treated as padding.

    Args:
        trainfile: Path prefix of the training text files (without ``.en``/``.vi``).
        prefix: Path prefix for the generated model files; its directory is
            created when missing.
        vocab_size: Vocabulary size of each model.
        type: SentencePiece model type, forwarded as ``model_type``.

    Returns:
        Tuple ``(sp_en, sp_vi)`` of loaded ``SentencePieceProcessor`` objects.
    """
    model_dir = os.path.dirname(prefix)
    if model_dir:
        # The trainer cannot write its output into a directory that does not exist.
        os.makedirs(model_dir, exist_ok=True)

    tokenizers = []
    for lang in ('en', 'vi'):
        spm.SentencePieceTrainer.Train(
            input=f'{trainfile}.{lang}',
            model_prefix=f'{prefix}_{lang}',
            vocab_size=vocab_size,
            model_type=type,
            pad_id=0,
            unk_id=1,
            bos_id=2,
            eos_id=3,
        )
        # Load the tokenizer that was just trained.
        tokenizers.append(spm.SentencePieceProcessor(model_file=f'{prefix}_{lang}.model'))

    sp_en, sp_vi = tokenizers
    return sp_en, sp_vi

In [11]:
# Path prefix of the training text; the trainer appends '.en' / '.vi' to it.
trainfile = 'data/train'
# Path prefix of the generated tokenizer models ('<prefix>_en.model', '<prefix>_vi.model').
prefix = 'sentencepiece/spm'
sp_en, sp_vi = training_sentencepiece(trainfile, prefix)

In [12]:
# Sample sentences used to eyeball the subword segmentation of each tokenizer.
example_en = "education and leading me to providing service to the Republic."
example_vi = "Tôi yêu đại bàng!"

# out_type=str returns the subword pieces instead of their integer ids.
print(sp_en.encode(example_en, out_type=str))
print(sp_vi.encode(example_vi, out_type=str))

['▁education', '▁and', '▁leading', '▁me', '▁to', '▁providing', '▁service', '▁to', '▁the', '▁Republic', '.']
['▁Tôi', '▁yêu', '▁đại', '▁bàng', '!']


## I.3 - Convert data to Tensor

In [13]:
import torch
from torch.utils.data import Dataset, DataLoader

In [14]:
class TranslationDataset(Dataset):
    """Parallel-text dataset that yields ``(source_ids, target_ids)`` tensor pairs.

    Source sequences are truncated and terminated with the source tokenizer's
    EOS id; target sequences are truncated and wrapped in the target
    tokenizer's BOS and EOS ids. Neither sequence exceeds ``max_length`` ids
    (for ``max_length >= 2``).
    """

    def __init__(self, src_texts, tgt_texts, src_tokenizer, tgt_tokenizer, max_length=128):
        self.src_texts, self.tgt_texts = src_texts, tgt_texts
        self.src_tokenizer, self.tgt_tokenizer = src_tokenizer, tgt_tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of sentence pairs, taken from the source side."""
        return len(self.src_texts)

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as two 1-D id tensors."""
        src_tok = self.src_tokenizer
        tgt_tok = self.tgt_tokenizer

        # Source side (English): leave one slot free for <EOS>.
        src_budget = self.max_length - 1
        source = src_tok.encode(self.src_texts[idx])[:src_budget] + [src_tok.eos_id()]

        # Target side (Vietnamese): leave two slots free for <SOS> and <EOS>.
        tgt_budget = self.max_length - 2
        body = tgt_tok.encode(self.tgt_texts[idx])[:tgt_budget]
        target = [tgt_tok.bos_id(), *body, tgt_tok.eos_id()]

        return torch.tensor(source), torch.tensor(target)


# Padding batch
def collate_fn(batch):
    """Pad a list of ``(src, tgt)`` tensor pairs into two batch-first tensors.

    Every sequence is right-padded with 0 up to the longest sequence on its
    side of the batch.

    Returns:
        Dict with keys ``"src"`` and ``"tgt"``, each of shape
        ``[batch, longest_sequence]``.
    """
    sources, targets = zip(*batch)
    pad = torch.nn.utils.rnn.pad_sequence

    return {
        "src": pad(sources, batch_first=True, padding_value=0),
        "tgt": pad(targets, batch_first=True, padding_value=0),
    }

In [15]:
# Only the first 5000 sentence pairs of the sampled training split are used.
train_src_texts = train_data['en'][:5000]
train_tgt_texts = train_data['vi'][:5000]

# English is the source language, Vietnamese the target.
train_dataset = TranslationDataset(train_src_texts, train_tgt_texts, sp_en, sp_vi)

In [16]:
# Only the first 100 sentence pairs of the sampled validation split are used.
val_src_texts = validation_data['en'][:100]
val_tgt_texts = validation_data['vi'][:100]

# Same tokenizers and source/target direction as the training dataset.
val_dataset = TranslationDataset(val_src_texts, val_tgt_texts, sp_en, sp_vi)

In [17]:
# DataLoader
batch_size = 32
# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# Validation keeps a fixed order: the loss is averaged per batch, so reshuffling
# would change batch composition and make the reported validation loss vary
# from epoch to epoch for reasons unrelated to the model.
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [18]:
# Pull a single batch to check the padded source tensor produced by collate_fn.
batch = next(iter(train_dataloader))
print(batch['src'].shape)
pprint(batch['src'])

torch.Size([32, 43])
tensor([[20865, 26471,  2017,  ...,     0,     0,     0],
        [ 4211,  8313,   680,  ...,     0,     0,     0],
        [  286, 31860,    52,  ...,     0,     0,     0],
        ...,
        [ 1205,    29,     8,  ...,     0,     0,     0],
        [ 2081,   292,  3123,  ...,     0,     0,     0],
        [ 3733, 31859,   307,  ...,     0,     0,     0]])


# II - Build model

In [19]:
import torch.nn as nn

In [20]:
class Encoder(nn.Module):
    """GRU encoder: embeds source token ids and runs them through a stacked GRU.

    Args:
        input_dim: Size of the source vocabulary.
        emb_dim: Embedding dimension.
        hidden_dim: GRU hidden size.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout probability applied to the embeddings and between
            GRU layers.
        pad_idx: Token id whose embedding is kept at zero.
    """

    def __init__(self, input_dim, emb_dim, hidden_dim, num_layers, dropout=0.3, pad_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)
        # nn.GRU only applies dropout between layers and warns when a non-zero
        # value is passed for a single-layer network, where it has no effect.
        rnn_dropout = dropout if num_layers > 1 else 0.0
        self.rnn = nn.GRU(emb_dim, hidden_dim, num_layers, dropout=rnn_dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a batch of source sequences.

        Args:
            src: Token ids of shape ``[batch, src_len]``.

        Returns:
            ``(output, hidden)`` where ``output`` is ``[batch, src_len, hidden_dim]``
            and ``hidden`` is ``[num_layers, batch, hidden_dim]``.
        """
        embedded = self.dropout(self.embedding(src))
        output, hidden = self.rnn(embedded)
        return output, hidden
        

In [21]:
class Decoder(nn.Module):
    """GRU decoder that predicts one target token per call.

    Args:
        output_dim: Size of the target vocabulary.
        emb_dim: Embedding dimension.
        hidden_dim: GRU hidden size.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout probability applied to the embeddings and between
            GRU layers.
        pad_idx: Token id whose embedding is kept at zero.
    """

    def __init__(self, output_dim, emb_dim, hidden_dim, num_layers, dropout, pad_idx=0):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=pad_idx)
        # nn.GRU only applies dropout between layers and warns when a non-zero
        # value is passed for a single-layer network, where it has no effect.
        rnn_dropout = dropout if num_layers > 1 else 0.0
        self.rnn = nn.GRU(emb_dim, hidden_dim, num_layers, dropout=rnn_dropout, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden):
        """Run a single decoding step.

        Args:
            input: Previous token ids, shape ``[batch]``.
            hidden: Recurrent state, shape ``[num_layers, batch, hidden_dim]``.

        Returns:
            ``(prediction, hidden)`` where ``prediction`` holds unnormalised
            vocabulary scores of shape ``[batch, output_dim]`` and ``hidden``
            is the updated recurrent state.
        """
        step_input = input.unsqueeze(1)  # [batch] -> [batch, 1]
        embedded = self.dropout(self.embedding(step_input))  # [batch, 1, emb_dim]
        output, hidden = self.rnn(embedded, hidden)  # output: [batch, 1, hidden_dim]
        prediction = self.fc_out(output.squeeze(1))  # [batch, output_dim]
        return prediction, hidden


In [35]:
import random

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with optional teacher forcing.

    Args:
        encoder: Module returning ``(output, hidden)`` for a source batch.
        decoder: Module exposing ``output_dim`` and returning
            ``(prediction, hidden)`` for one decoding step.
        device: Device the model is meant to run on; stored as ``self.device``.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg.shape[1] - 1`` steps conditioned on ``src``.

        Args:
            src: Source token ids, shape ``[batch, src_len]``.
            trg: Target token ids, shape ``[batch, trg_len]``; column 0 is the
                start token fed to the first decoding step.
            teacher_forcing_ratio: Probability, drawn independently at every
                step, of feeding the ground-truth token instead of the model's
                own prediction as the next input.

        Returns:
            Tensor of shape ``[trg_len, batch, vocab]`` (time-major). Row 0 is
            left at zero because nothing is predicted for the start token.
        """
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        # Allocate directly on the device of the inputs: no CPU round trip, and
        # the result always lives next to the tensors it is compared with.
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size, device=src.device)

        # Only the final hidden state of the encoder initialises the decoder.
        _, hidden = self.encoder(src)

        # First decoder input is the <SOS> column of the target.
        decoder_input = trg[:, 0]

        for t in range(1, trg_len):
            step_scores, hidden = self.decoder(decoder_input, hidden)
            outputs[t] = step_scores

            # Teacher forcing: feed the ground truth or the model's best guess.
            teacher_force = random.random() < teacher_forcing_ratio
            decoder_input = trg[:, t] if teacher_force else step_scores.argmax(1)

        return outputs


In [36]:
# Hyperparams
# Vocabulary sizes are taken from the trained tokenizers (source: English, target: Vietnamese).
INPUT_DIM = len(sp_en)
OUTPUT_DIM = len(sp_vi)
EMB_DIM = 256
HIDDEN_DIM = 512
NUM_LAYERS = 2
DROPOUT = 0.3
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the Encoder & Decoder (same embedding size, hidden size and depth on both sides)
encoder = Encoder(INPUT_DIM, EMB_DIM, HIDDEN_DIM, NUM_LAYERS, DROPOUT)
decoder = Decoder(OUTPUT_DIM, EMB_DIM, HIDDEN_DIM, NUM_LAYERS, DROPOUT)

# Initialize the Seq2Seq model
model = Seq2Seq(encoder, decoder, DEVICE)


# III - Train and Evaluate model

In [37]:
import torch.optim as optim
import torch.nn as nn
from tqdm import tqdm


LEARNING_RATE = 0.001
# Same value that collate_fn uses to pad the batches.
PAD_IDX = 0

# Target positions equal to PAD_IDX do not contribute to the loss.
criterion = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)  
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)


In [38]:
import random

def train(model, iterator, optimizer, criterion, clip, teacher_forcing_ratio=0.5):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: Seq2Seq model returning time-major scores ``[trg_len, batch, vocab]``.
        iterator: Iterable of dict batches with ``'src'`` and ``'tgt'`` tensors
            of shape ``[batch, seq_len]``.
        optimizer: Optimizer over the model's parameters.
        criterion: Loss taking ``[N, vocab]`` scores and ``[N]`` target ids.
        clip: Maximum gradient norm applied before each optimizer step.
        teacher_forcing_ratio: Forwarded to the model.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.
    """
    model.train()
    model.to(DEVICE)
    epoch_loss = 0

    for batch in tqdm(iterator, desc='Training: '):
        src, trg = batch['src'].to(DEVICE), batch['tgt'].to(DEVICE)

        optimizer.zero_grad()

        output = model(src, trg, teacher_forcing_ratio)  # [trg_len, batch, vocab]
        output_dim = output.shape[-1]

        # Step 0 is the start token and is never predicted, so it is dropped on
        # both sides. The scores are time-major while `trg` is batch-major:
        # transpose the targets before flattening so that row k of `logits` and
        # entry k of `targets` refer to the same (time step, sentence) pair.
        logits = output[1:].reshape(-1, output_dim)  # [(trg_len - 1) * batch, vocab]
        targets = trg[:, 1:].t().reshape(-1)  # [(trg_len - 1) * batch]

        loss = criterion(logits, targets)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        epoch_loss += loss.item()

    return epoch_loss / len(iterator)


In [39]:
def evaluate(model, iterator, criterion):
    """Compute the mean loss per batch without teacher forcing or gradient tracking.

    Args:
        model: Seq2Seq model returning time-major scores ``[trg_len, batch, vocab]``.
        iterator: Iterable of batches, either dicts with ``'src'`` / ``'tgt'``
            tensors (what ``collate_fn`` produces) or ``(src, trg)`` pairs.
        criterion: Loss taking ``[N, vocab]`` scores and ``[N]`` target ids.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.
    """
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for batch in tqdm(iterator, desc='Evaluating: '):
            # Iterating a dict batch yields its keys, so it cannot be unpacked
            # positionally into tensors; read the entries by name instead.
            if isinstance(batch, dict):
                src, trg = batch['src'], batch['tgt']
            else:
                src, trg = batch
            src, trg = src.to(DEVICE), trg.to(DEVICE)

            # Ratio 0: the decoder always consumes its own predictions.
            output = model(src, trg, teacher_forcing_ratio=0)
            output_dim = output.shape[-1]

            # Drop the start-token step and align the time-major scores with
            # the batch-major targets by transposing the targets.
            logits = output[1:].reshape(-1, output_dim)
            targets = trg[:, 1:].t().reshape(-1)

            epoch_loss += criterion(logits, targets).item()
    print()
    return epoch_loss / len(iterator)


In [40]:
import os
# Debugging aid: requests synchronous CUDA kernel launches so that a device-side
# error is reported at the call that caused it. Expected to slow GPU training.
# NOTE(review): presumably only honoured if set before CUDA is initialised in
# this process — confirm, and remove once debugging is finished.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


In [41]:
N_EPOCHS = 10
CLIP = 5  # Max gradient norm

# Per-epoch loss history, one entry appended per completed epoch.
train_losses = []
val_losses = []

for epoch in range(N_EPOCHS):
    # One full pass over the training data, then one over the validation data.
    train_loss = train(model, train_dataloader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, val_dataloader, criterion)

    print(f'Epoch {epoch+1}:')
    print(f'  Train Loss: {train_loss:.3f}')
    print(f'  Valid Loss: {valid_loss:.3f}')

    train_losses.append(train_loss)
    val_losses.append(valid_loss)

Training:  18%|█▊        | 29/157 [00:55<04:06,  1.93s/it]


KeyboardInterrupt: 