In [1]:
from datasets import load_dataset, DatasetDict
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import io

In [2]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")

In [3]:
dataset = load_dataset('mt_eng_vietnamese', 'iwslt2015-vi-en')

Found cached dataset mt_eng_vietnamese (C:/Users/PC/.cache/huggingface/datasets/mt_eng_vietnamese/iwslt2015-vi-en/1.0.0/53add551a01e9874588066f89d42925f9fad43db347199dad00f7e4b0c905a71)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
num_emxample = 1000

train_dataset = dataset["train"].select(range(num_emxample))
valid_dataset = dataset["validation"].select(range(num_emxample))
test_dataset = dataset["test"].select(range(num_emxample))

sampled_dataset = DatasetDict({
    "train": train_dataset,
    "validation": valid_dataset,
    "test": test_dataset
})


In [5]:
# Tokenize dataset
def tokenize_function(examples):
    vietnamese_sentences = []
    english_sentences = []
    for _ in examples["translation"]:
            vietnamese_sentences.append(_["vi"])
            english_sentences.append(_["en"])
    #vietnamese_sentences = examples["translation"]["vi"]
    #english_sentences = examples["translation"]["en"]
    
    tokenized_inputs = tokenizer(vietnamese_sentences, truncation=True, padding="max_length")
    tokenized_targets = tokenizer(english_sentences, truncation=True, padding="max_length")
    
    examples["input_ids"] = tokenized_inputs.input_ids
    examples["attention_mask"] = tokenized_inputs.attention_mask
    examples["labels"] = tokenized_targets.input_ids
    
    return examples

tokenized_dataset = sampled_dataset.map(tokenize_function, batched=True)

# Split dataset into train and validation sets
train_dataset = tokenized_dataset["train"]
val_dataset = tokenized_dataset["validation"]

Loading cached processed dataset at C:\Users\PC\.cache\huggingface\datasets\mt_eng_vietnamese\iwslt2015-vi-en\1.0.0\53add551a01e9874588066f89d42925f9fad43db347199dad00f7e4b0c905a71\cache-7c864e7a39287e94.arrow
Loading cached processed dataset at C:\Users\PC\.cache\huggingface\datasets\mt_eng_vietnamese\iwslt2015-vi-en\1.0.0\53add551a01e9874588066f89d42925f9fad43db347199dad00f7e4b0c905a71\cache-aa2700aeb5e9afb1.arrow
Loading cached processed dataset at C:\Users\PC\.cache\huggingface\datasets\mt_eng_vietnamese\iwslt2015-vi-en\1.0.0\53add551a01e9874588066f89d42925f9fad43db347199dad00f7e4b0c905a71\cache-70ffa497fba55c36.arrow


In [6]:
model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [7]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./mt_eng_vietnamese_finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    save_steps=50,
    eval_steps=100,
    logging_steps=100,
    overwrite_output_dir=True,
    save_total_limit=3,
)


In [8]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()




  0%|          | 0/1500 [00:00<?, ?it/s]

{'loss': 2.6547, 'learning_rate': 4.666666666666667e-05, 'epoch': 0.2}
{'loss': 0.2431, 'learning_rate': 4.3333333333333334e-05, 'epoch': 0.4}
{'loss': 0.2009, 'learning_rate': 4e-05, 'epoch': 0.6}
{'loss': 0.1888, 'learning_rate': 3.6666666666666666e-05, 'epoch': 0.8}
{'loss': 0.1825, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}
{'loss': 0.1778, 'learning_rate': 3e-05, 'epoch': 1.2}
{'loss': 0.1861, 'learning_rate': 2.6666666666666667e-05, 'epoch': 1.4}
{'loss': 0.1705, 'learning_rate': 2.3333333333333336e-05, 'epoch': 1.6}
{'loss': 0.1774, 'learning_rate': 2e-05, 'epoch': 1.8}
{'loss': 0.153, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}
{'loss': 0.1626, 'learning_rate': 1.3333333333333333e-05, 'epoch': 2.2}
{'loss': 0.1719, 'learning_rate': 1e-05, 'epoch': 2.4}
{'loss': 0.1646, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.6}
{'loss': 0.1516, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.8}
{'loss': 0.1624, 'learning_rate': 0.0, 'epoch': 3.0}
{'train_ru

TrainOutput(global_step=1500, training_loss=0.3431839408874512, metrics={'train_runtime': 368.6744, 'train_samples_per_second': 8.137, 'train_steps_per_second': 4.069, 'train_loss': 0.3431839408874512, 'epoch': 3.0})

In [12]:
model_checkpoint = "./mt_eng_vietnamese_finetuned/checkpoint-1500"
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [15]:
input_ids = tokenizer("Chính vì lượng khí thải rất lớn , nó có ý nghĩa quan trọng với hệ thống khí quyển .", return_tensors="pt").input_ids
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

It &apos;s a very good thing, it &apos;s


In [16]:
input_ids = tokenizer("Hôm nay tôi đi học", return_tensors="pt").input_ids
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

I &apos;m a little bit of a tad &


In [17]:
input_ids = tokenizer("Bắt chước những gì bạn nhìn thấy .", return_tensors="pt").input_ids
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

It &apos;s a great way to get to know the world.
