In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
import tqdm as tqdm_
from torch.optim import AdamW


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def read_csv(file_name):
    df = pd.read_csv(file_name, encoding = "utf-8")
    return df

df_training = read_csv('drive/MyDrive/PhoMT_training.csv').dropna(subset=['en','vi']).reset_index(drop=True)
df_training = df_training.sample(n = 80000, random_state = 42).reset_index(drop = True)
print(df_training['en'].head())
print(df_training['vi'].head())
print(len(df_training))

df_validation = read_csv('drive/MyDrive/PhoMT_validation.csv').dropna(subset=['en','vi']).reset_index(drop=True)
df_validation = df_validation.sample(n = 10000, random_state = 42).reset_index(drop = True)
print(df_validation['en'].head())
print(df_validation['vi'].head())
print(len(df_validation))

df_test = read_csv('drive/MyDrive/PhoMT_test.csv').dropna(subset=['en','vi']).reset_index(drop=True)
df_test = df_test.sample(n = 10000, random_state = 42).reset_index(drop = True)
print(df_test['en'].head())
print(df_test['vi'].head())
print(len(df_test))


0                   And one coffee, black, for oliver.
1                                    "Bless you, man."
2                              You saw the way he ran.
3                          What are we supposed to do?
4    At first when I heard of cancer stem cells, I ...
Name: en, dtype: object
0                      Và một coffee, đen, cho Oliver.
1                                    "Phù hộ cho ông".
2                        Cậu đã thấy cách ông ấy chạy.
3                             Gia đình tớ phải làm gì?
4    Khi lần đầu tiên tôi nghe về tế bào gốc ung th...
Name: vi, dtype: object
80000
0                             - Nighty - night, Hymie.
1                        Give yourself some "me" time.
2                Wash your face, then wait 15 minutes.
3    That's why the demon deceived us into believin...
4    Windows 10 added the possibility to have more ...
Name: en, dtype: object
0                              - Chúc ngủ ngon, Hymie.
1                    Dành thời gian chăm s

In [None]:
class TranslationData(Dataset):
    def __init__(self, datafranme, tokenizer, max_length):
        self.dataframe = datafranme
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)


    def __getitem__(self, index):
        src = "translate English to Vietnamese: " + self.dataframe.loc[index, 'en']
        tgt = self.dataframe.loc[index,'vi']

        src_tokenizer  = self.tokenizer(src, padding = "max_length", truncation = True, max_length = self.max_length, return_tensors = "pt")
        tgt_tokenizer = self.tokenizer(tgt, padding ="max_length", truncation = True, max_length = self.max_length, return_tensors = "pt" )

        input_ids = src_tokenizer['input_ids'].squeeze()
        attention_mask = src_tokenizer['attention_mask'].squeeze()
        labels = tgt_tokenizer['input_ids'].squeeze()

        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

In [None]:
!pip install sacrebleu



In [7]:
from sacrebleu import corpus_bleu

model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

max_length = 128
batch_size = 32

train_loader = DataLoader(TranslationData(df_training, tokenizer, max_length), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(TranslationData(df_validation, tokenizer, max_length), batch_size=batch_size, shuffle=False)
test_loader = DataLoader(TranslationData(df_test, tokenizer, max_length), batch_size=batch_size, shuffle=False)

optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 10
patience = 2
best_val_loss = float("inf")
epochs_no_improve = 0
save_dir = "/content/drive/MyDrive/my_t5_translation_model"

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    loop = tqdm_.tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} Training")

    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loop.set_postfix(loss=loss.item())

    avg_train_loss = train_loss / len(train_loader)

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1} - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)
        print("Saved model.")
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1

        if epochs_no_improve >= patience:
            print("Early stopping.")
            break




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Epoch 1/10 Training:   0%|          | 0/2500 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Epoch 1/10 Training: 100%|██████████| 2500/2500 [22:48<00:00,  1.83it/s, loss=1.61]


Epoch 1 - Train Loss: 1.9855, Val Loss: 1.6276
Saved model.


Epoch 2/10 Training: 100%|██████████| 2500/2500 [22:49<00:00,  1.83it/s, loss=1.65]


Epoch 2 - Train Loss: 1.6862, Val Loss: 1.4646
Saved model.


Epoch 3/10 Training: 100%|██████████| 2500/2500 [22:50<00:00,  1.82it/s, loss=1.52]


Epoch 3 - Train Loss: 1.5588, Val Loss: 1.3581
Saved model.


Epoch 4/10 Training: 100%|██████████| 2500/2500 [22:50<00:00,  1.82it/s, loss=1.47]


Epoch 4 - Train Loss: 1.4677, Val Loss: 1.2778
Saved model.


Epoch 5/10 Training: 100%|██████████| 2500/2500 [22:49<00:00,  1.83it/s, loss=1.3]


Epoch 5 - Train Loss: 1.3957, Val Loss: 1.2122
Saved model.


Epoch 6/10 Training: 100%|██████████| 2500/2500 [22:50<00:00,  1.82it/s, loss=1.31]


Epoch 6 - Train Loss: 1.3367, Val Loss: 1.1595
Saved model.


Epoch 7/10 Training: 100%|██████████| 2500/2500 [22:50<00:00,  1.82it/s, loss=1.27]


Epoch 7 - Train Loss: 1.2876, Val Loss: 1.1164
Saved model.


Epoch 8/10 Training: 100%|██████████| 2500/2500 [22:51<00:00,  1.82it/s, loss=1.28]


Epoch 8 - Train Loss: 1.2451, Val Loss: 1.0818
Saved model.


Epoch 9/10 Training: 100%|██████████| 2500/2500 [22:50<00:00,  1.82it/s, loss=1.15]


Epoch 9 - Train Loss: 1.2076, Val Loss: 1.0495
Saved model.


Epoch 10/10 Training: 100%|██████████| 2500/2500 [22:51<00:00,  1.82it/s, loss=1.22]


Epoch 10 - Train Loss: 1.1751, Val Loss: 1.0219
Saved model.


In [9]:
import tqdm as tqdm_
from sacrebleu import corpus_bleu

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = T5ForConditionalGeneration.from_pretrained("/content/drive/MyDrive/my_t5_translation_model")
tokenizer = T5Tokenizer.from_pretrained("/content/drive/MyDrive/my_t5_translation_model")
model.to(device)
model.eval()

predictions = []
references = []
test_loader = DataLoader(TranslationData(df_test, tokenizer, max_length = 128), batch_size=32, shuffle=False)
with torch.no_grad():
    for batch in tqdm_.tqdm(test_loader, desc="Testing"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"]

        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128)
        decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        labels[labels == -100] = tokenizer.pad_token_id
        decoded_refs = tokenizer.batch_decode(labels, skip_special_tokens=True)

        predictions.extend(decoded_preds)

        references.extend([[ref] for ref in decoded_refs])

bleu = corpus_bleu(predictions, references)
print(f"BLEU score: {bleu.score:.2f}")


Testing: 100%|██████████| 313/313 [10:23<00:00,  1.99s/it]


BLEU score: 46.25
