# (1) Task description
- Translate text from Chinese to English.
- Main goal: Get familiar with transformer.

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/ColabNotebooks/DEEP_LEARNING/Lab03

## Import package

In [2]:
!pip install torchmetrics

Defaulting to user installation because normal site-packages is not writeable


In [3]:
import os
import json
import math
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchsummary import summary
from timeit import default_timer as timer

from utils import *
from network import *
from tqdm.auto import tqdm

## Fix random seed

In [4]:
def set_seed(seed):
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_seed(29)

# (2) Data Processing
- Original dataset is [Tatoeba](https://tatoeba.org/zh-cn/) and [XDailyDialog](https://github.com/liuzeming01/XDailyDialog)
- We select 50000 English-Chinese sentence pairs for translation task

- Args:
  - BATCH_SIZE  (You can modify)
  - data_dir: the path to the given training translation dataset

In [5]:
data_dir = "./translation_train_data.json"
BATCH_SIZE = 48  # Increased from 32 for more stable gradients

## Show the raw data

In [6]:
translation_raw_data = pd.read_json(data_dir)
display(translation_raw_data)

Unnamed: 0,English,Chinese
0,I'm Susan Greene.,我是蘇珊格林。
1,You don't have to take an examination.,你不需要考试。
2,I can't leave.,我走不了。
3,A cold beer would hit the spot!,来杯冰啤酒就太棒了!
4,Let's start!,讓我們開始吧。
...,...,...
49995,Just buy a cask of wine. Have you bought ice yet?,买一桶酒就行了。你买冰块了吗?
49996,OK. No problem.,"好的,没问题。"
49997,"I'm not really in the mood for Italian, actual...","实际上,我不太喜欢意大利菜。我想吃点辣的。"
49998,It's OK. It seems we have a lot in common.,还行吧。看来我们有很多共同点。


## Tokenization
- Tokenizer: BertTokenizer
  - encode: convert text to token ID
  - decode: convert token ID back to text

In [7]:
tokenizer_en = tokenizer_english()
tokenizer_cn = tokenizer_chinese()

In [8]:
english_seqs = translation_raw_data["English"].apply(lambda x: tokenizer_en.encode(x, add_special_tokens=True, padding=False))
chinese_seqs = translation_raw_data["Chinese"].apply(lambda x: tokenizer_cn.encode(x, add_special_tokens=True, padding=False))

MAX_TOKENIZE_LENGTH = max(english_seqs.str.len().max(), chinese_seqs.str.len().max()) # longest string
MAX_TOKENIZE_LENGTH = pow(2, math.ceil(math.log(MAX_TOKENIZE_LENGTH)/math.log(2)))    # closest upper to the power of 2

print("Max tokenize length:", MAX_TOKENIZE_LENGTH)

Max tokenize length: 128


## Add paddings
- make all the sentences the same length by inserting token ID = PAD_IDX at the back

In [9]:
#add padding
def add_padding(token_list, max_length):
    if len(token_list) < max_length:
        padding_length = max_length - len(token_list)
        token_list = token_list + [PAD_IDX] * padding_length
    else:
        token_list = token_list[:max_length]  # Trim to MAX_LENGTH if longer
    return token_list

chinese_seqs = chinese_seqs.apply(lambda x: add_padding(x, MAX_TOKENIZE_LENGTH))
english_seqs = english_seqs.apply(lambda x: add_padding(x, MAX_TOKENIZE_LENGTH))

In [10]:
# check the padding result
print("=====Chinese tokenized data=====")
print(chinese_seqs.iloc[0])

print("=====English tokenized data=====")
print(english_seqs.iloc[0])

=====Chinese tokenized data=====
[101, 2769, 3221, 5979, 4396, 3419, 3360, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
=====English tokenized data=====
[101, 146, 112, 182, 5640, 10983, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


## Datalodader
- Split dataset into training dataset(90%) and validation dataset(10%). You can modify the traning/validation ratio
- Create dataloader to iterate the data.

In [11]:
data_size = len(translation_raw_data)
train_size = int(0.9*data_size)
valid_size = data_size - train_size
print("train size:", train_size)
print("valid size:", valid_size)

en_train_data = []
cn_train_data = []
en_valid_data = []
cn_valid_data = []

for i in range(data_size):
    if (i < train_size):
        en_train_data.append(torch.Tensor(english_seqs.iloc[i]))
        cn_train_data.append(torch.Tensor(chinese_seqs.iloc[i]))
    else:
        en_valid_data.append(torch.Tensor(english_seqs.iloc[i]))
        cn_valid_data.append(torch.Tensor(chinese_seqs.iloc[i]))

class TextTranslationDataset(Dataset):
    def __init__(self, src, dst):
        self.src_list = src
        self.dst_list = dst

    def __len__(self):
        return len(self.src_list)

    def __getitem__(self, idx):
        return self.src_list[idx], self.dst_list[idx]

cn_to_en_train_set = TextTranslationDataset(cn_train_data, en_train_data)
cn_to_en_valid_set = TextTranslationDataset(cn_valid_data, en_valid_data)

cn_to_en_train_loader = DataLoader(cn_to_en_train_set, batch_size=BATCH_SIZE, shuffle=True)
cn_to_en_valid_loader = DataLoader(cn_to_en_valid_set, batch_size=BATCH_SIZE, shuffle=True)

train size: 45000
valid size: 5000


# (3) Model
- **TO-DO**: Finish the model in "network.py"
    - You can first write code here for convenience, but note that <span style='color:red'>**TA will test your model using model definition in "network.py"**</span><p>
- Base transformer layers in [Attention Is All You Need](https://arxiv.org/abs/1706.03762)
    - TransformerEncoderLayer:
    - TransformerDecoderLayer:
- Positional encoding and input embedding
- Note that you may need masks when implementing attention mechanism
    - Padding mask: prevent input from attending to padding tokens
    - Causal mask: prevent decoder input from attending to future input

In [12]:
model = load_model()

# Xavier uniform initialization for better convergence
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

model = model.to(DEVICE)
param_model = sum(p.numel() for p in model.parameters())
print(f"The parameter size of model is {param_model/1000} k")
print(f"The parameter size of model is {param_model/1000000:.2f} M")

The parameter size of model is 27656.516 k
The parameter size of model is 27.66 M


# (4) Training
- You can change the training setting by yourself including
  - Number of epoch
  - Optimizer
  - Learning rate
  - Learning rate scheduler
  - etc...

In [13]:
NUM_EPOCHS = 100
# Label smoothing helps prevent overconfident predictions and improves generalization
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX, label_smoothing=0.08)  # Reduced from 0.1

# Optimized AdamW for transformer model
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-4,  # Slightly higher max_lr for better convergence
    betas=(0.9, 0.98),
    eps=1e-9,
    weight_decay=0.01  # Slightly higher for better regularization
)

# OneCycleLR: Modern scheduler with warmup, peak, and annealing phases
# With 45k training samples and batch_size=48: ~937 steps/epoch
from torch.optim.lr_scheduler import OneCycleLR

scheduler = OneCycleLR(
    optimizer,
    max_lr=5e-4,               # Peak learning rate (increased from 4e-4)
    epochs=NUM_EPOCHS,         # Total epochs
    steps_per_epoch=937,       # 45000 / 48 = 937 steps per epoch
    pct_start=0.1,             # 10% of cycle for warmup
    anneal_strategy='cos',     # Cosine annealing (smoother than linear)
    div_factor=25.0,           # Initial LR = max_lr / 25 = 2e-5
    final_div_factor=10000.0,  # Final LR = max_lr / 10000 = 5e-8
    three_phase=False          # Two-phase: warmup → annealing
)

## Training and Evaluation Functions

In [14]:
def train_epoch(model, optimizer, train_dataloader, scheduler=None):
    model.train()
    losses = 0

    for src, tgt in train_dataloader:
        # src, tgt shape: (batch_size, seq_length)
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]

        logits = model(src, tgt_input)

        optimizer.zero_grad()
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_output.reshape(-1).long())
        loss.backward()

        # Gradient clipping to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        losses += loss.item()

    return losses / len(list(train_dataloader))


def evaluate(model, val_dataloader):
    model.eval()
    losses = 0
    score = 0

    with torch.no_grad():
        for src, tgt in val_dataloader:
            # src, tgt shape: (batch_size, seq_length)
            src = src.to(DEVICE)
            tgt = tgt.to(DEVICE)

            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]

            logits = model(src, tgt_input)
            _, tgt_predict = torch.max(logits, dim=-1)
            score_batch = BLEU_batch(tgt_predict, tgt_output, tokenizer_en)

            loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_output.reshape(-1).long())
            losses += loss.item()
            score += score_batch

    return (losses / len(list(val_dataloader))), (score / len(list(val_dataloader)))

## Start training
- MODEL_SAVE_PATH: path for storing the best model

In [15]:
MODEL_SAVE_PATH = "./model.ckpt"

In [16]:
model = model.to(DEVICE)

best_acc = 0
patience = 10  # Stop if no improvement for 10 epochs
patience_counter = 0
best_epoch = 0

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(model, optimizer, cn_to_en_train_loader, scheduler)
    end_time = timer()
    val_loss, val_acc = evaluate(model, cn_to_en_valid_loader)

    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, Val Acc: {val_acc:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))

    # Save the best model so far
    if val_acc > best_acc: 
        best_acc = val_acc
        best_epoch = epoch
        best_state_dict = model.state_dict()
        torch.save(best_state_dict, MODEL_SAVE_PATH)
        patience_counter = 0
        print("(model saved)")
    else:
        patience_counter += 1
        
    # Early stopping
    if patience_counter >= patience:
        print(f"\nEarly stopping at epoch {epoch}. Best validation accuracy: {best_acc:.3f} at epoch {best_epoch}")
        break
        
print(f"\nTraining complete! Best validation accuracy: {best_acc:.3f} at epoch {best_epoch}")

Epoch: 1, Train loss: 7.739, Val loss: 6.381, Val Acc: 0.008, Epoch time = 54.573s
(model saved)
Epoch: 2, Train loss: 5.830, Val loss: 5.613, Val Acc: 0.051, Epoch time = 54.616s
(model saved)
Epoch: 3, Train loss: 5.190, Val loss: 5.164, Val Acc: 0.085, Epoch time = 54.711s
(model saved)
Epoch: 4, Train loss: 4.841, Val loss: 4.906, Val Acc: 0.127, Epoch time = 54.739s
(model saved)
Epoch: 5, Train loss: 4.599, Val loss: 4.717, Val Acc: 0.150, Epoch time = 54.773s
(model saved)
Epoch: 6, Train loss: 4.393, Val loss: 4.566, Val Acc: 0.171, Epoch time = 54.766s
(model saved)
Epoch: 7, Train loss: 4.204, Val loss: 4.384, Val Acc: 0.213, Epoch time = 54.774s
(model saved)
Epoch: 8, Train loss: 3.985, Val loss: 4.191, Val Acc: 0.259, Epoch time = 54.774s
(model saved)
Epoch: 9, Train loss: 3.779, Val loss: 4.031, Val Acc: 0.304, Epoch time = 54.777s
(model saved)
Epoch: 10, Train loss: 3.598, Val loss: 3.892, Val Acc: 0.334, Epoch time = 54.790s
(model saved)
Epoch: 11, Train loss: 3.447,

ValueError: Tried to step 93701 times. The specified number of total steps is 93700

# (5) Inference

In [17]:
from utils import *
from network import *

In [18]:
tokenizer_en = tokenizer_english()
tokenizer_cn = tokenizer_chinese()

## Load best model

In [19]:
model = load_model(MODEL_PATH="model.ckpt")
model = model.to(DEVICE)

## Translation testing
 - **TO-DO**: Finish the "translate" function in "network.py"
   - You can first write code here for convenience, but note that <span style='color:red'>**TA will test your model using "translate" function in "network.py"**</span>

In [20]:
sentence = "你好，欢迎来到中国。"
ground_truth = 'Hello, welcome to China.'
predicted = translate(model, sentence, tokenizer_cn, tokenizer_en)

print(f'{"Input:":15s}: {sentence}')
print(f'{"Prediction":15s}: {predicted}')
print(f'{"Ground truth":15s}: {ground_truth}')
print("Bleu Score (1-gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 1).item())
print("Bleu Score (2-gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 2).item())
print("Bleu Score (3-gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 3).item())
print("Bleu Score (4-gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 4).item())

Input:         : 你好，欢迎来到中国。
Prediction     : You're good at Chinese.
Ground truth   : Hello, welcome to China.
Bleu Score (1-gram):  0.0
Bleu Score (2-gram):  0.0
Bleu Score (3-gram):  0.0
Bleu Score (4-gram):  0.0


In [21]:
sentence = "她知道您的電話號碼嗎?"
ground_truth = 'Does she know your telephone number?'
predicted = translate(model, sentence, tokenizer_cn, tokenizer_en)

print(f'{"Input:":15s}: {sentence}')
print(f'{"Prediction":15s}: {predicted}')
print(f'{"Ground truth":15s}: {ground_truth}')
print("Bleu Score (1gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 1).item())
print("Bleu Score (2gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 2).item())
print("Bleu Score (3gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 3).item())
print("Bleu Score (4gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 4).item())

Input:         : 她知道您的電話號碼嗎?
Prediction     : Does she know your telephone number?
Ground truth   : Does she know your telephone number?
Bleu Score (1gram):  1.0
Bleu Score (2gram):  1.0
Bleu Score (3gram):  1.0
Bleu Score (4gram):  1.0


In [22]:
sentence = "你现在在哪里工作?"
ground_truth = 'Where do you work now?'
predicted = translate(model, sentence, tokenizer_cn, tokenizer_en)

print(f'{"Input:":15s}: {sentence}')
print(f'{"Prediction":15s}: {predicted}')
print(f'{"Ground truth":15s}: {ground_truth}')
print("Bleu Score (1gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 1).item())
print("Bleu Score (2gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 2).item())
print("Bleu Score (3gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 3).item())
print("Bleu Score (4gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 4).item())

Input:         : 你现在在哪里工作?
Prediction     : Where do you work now?
Ground truth   : Where do you work now?
Bleu Score (1gram):  1.0
Bleu Score (2gram):  1.0
Bleu Score (3gram):  1.0
Bleu Score (4gram):  1.0
