In [33]:
#Install libraries
!pip install transformers datasets evaluate sacrebleu datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [35]:
#Import libraries
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline
from evaluate import evaluator
import evaluate
import numpy as np
import torch
from datasets import DatasetDict

In [43]:
#Load the German-English configuration of the dataset used for finetuning
ted_iwlst2013 = load_dataset(path="ted_iwlst2013", name="de-en")



  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
#Load the tokenizer that belongs to the t5-small checkpoint
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="t5-small")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [44]:
#Split the data into train (90%), validation (5%) and test (5%) sets.
#A fixed seed keeps the split identical between runs, so BLEU scores measured on the
#test set stay comparable after a kernel restart.
#NOTE: this cell rebinds `ted_iwlst2013`; reload the dataset before running it a second time,
#otherwise the already reduced train split gets split again.
SPLIT_SEED = 42
ted_iwlst2013_train_testvalid = ted_iwlst2013["train"].train_test_split(test_size=0.1, seed=SPLIT_SEED)
#The held-out 10% is halved into the validation and test sets
ted_iwlst2013_test_valid = ted_iwlst2013_train_testvalid["test"].train_test_split(test_size=0.5, seed=SPLIT_SEED)
ted_iwlst2013 = DatasetDict({
    'train': ted_iwlst2013_train_testvalid['train'],
    'test': ted_iwlst2013_test_valid['test'],
    'valid': ted_iwlst2013_test_valid['train']})

In [45]:
#Keys of the source and target sentences inside each "translation" record
src_lang = "en"
tgt_lang = 'de'
#Have to add prefix for the t5 model
prefix = "translate English to German: "

#Define preprocessing function to prepare and tokenize the dataset
def preprocess(dataset):
    """Tokenize one batch of translation pairs.

    `dataset` is indexed with "translation" and must yield records that hold
    the `src_lang` and `tgt_lang` sentences. Every source sentence gets the
    task prefix prepended; sources and targets are then tokenized together
    (targets passed as `text_target`), truncated to 128 tokens.

    Returns whatever the tokenizer call returns.
    """
    pairs = dataset["translation"]
    inputs = []
    for pair in pairs:
        inputs.append(prefix + pair[src_lang])
    targets = []
    for pair in pairs:
        targets.append(pair[tgt_lang])
    return tokenizer(inputs, text_target=targets, max_length=128, truncation=True)

#Tokenize every split; batched=True hands preprocess a batch of examples per call
iwlst2013_tokenized = ted_iwlst2013.map(function=preprocess, batched=True)

Map:   0%|          | 0/129452 [00:00<?, ? examples/s]

Map:   0%|          | 0/7192 [00:00<?, ? examples/s]

Map:   0%|          | 0/7192 [00:00<?, ? examples/s]

In [20]:
#BLEU (sacrebleu implementation) is the evaluation metric
bleu = evaluate.load(path="sacrebleu")

#Function to clean the decoded predictions and labels
def clean_texts(preds, labels):
  """Strip surrounding whitespace from decoded predictions and labels.

  Returns a tuple `(preds, labels)`: predictions as a flat list of strings,
  labels with every string wrapped in its own one-element list.
  """
  stripped_preds = []
  for text in preds:
    stripped_preds.append(text.strip())
  wrapped_labels = []
  for text in labels:
    wrapped_labels.append([text.strip()])
  return stripped_preds, wrapped_labels

#Function to compute the bleu score during training and evaluation
def compute_metrics(pred_labels):
  """Decode predictions and labels and score them with BLEU.

  `pred_labels` unpacks into `(preds, labels)`, both arrays of token ids.
  If `preds` is a tuple, only its first element is used.

  Returns a dict `{"bleu": <score>}` taken from the "score" entry of the
  metric result.
  """
  preds, labels = pred_labels
  if isinstance(preds, tuple):
    preds = preds[0]

  #-100 is not a real token id and cannot be decoded. Labels use it for ignored positions;
  #newer transformers versions may also pad generated predictions with it, so both arrays
  #get it swapped for the pad token before decoding.
  preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  decoded_preds, decoded_labels = clean_texts(decoded_preds, decoded_labels)

  result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
  return {"bleu": result["score"]}

In [None]:
#Load the pretrained t5-small model and move it to the GPU when one is available (CPU otherwise)
t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
t5.to(device)

In [47]:
#Data collator that the trainer uses to assemble batches
data_collator = DataCollatorForSeq2Seq(tokenizer, model=t5)

In [51]:
#Some simple training arguments for finetuning
training_args = Seq2SeqTrainingArguments(
    output_dir="iwlst_t5",
    #Evaluate on the validation set at the end of every epoch
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    #Keep only the three most recent checkpoints on disk
    save_total_limit=3,
    num_train_epochs=2,
    #Generate full translations during evaluation so compute_metrics can score them with BLEU
    predict_with_generate=True,
    #Mixed precision is only supported on CUDA devices; the model cell allows a CPU fallback,
    #so only request fp16 when a GPU is actually present
    fp16=torch.cuda.is_available(),
)

#Define the trainer object
trainer = Seq2SeqTrainer(
    model=t5,
    args=training_args,
    train_dataset=iwlst2013_tokenized["train"],
    eval_dataset=iwlst2013_tokenized["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend


In [None]:
#Baseline: BLEU of the pretrained model on the test set, to compare against after finetuning
prefintuned_results = trainer.evaluate(iwlst2013_tokenized["test"])

In [54]:
#Report the baseline BLEU score
bleu_before = prefintuned_results["eval_bleu"]
print("Test set bleu score before finetuning: ", bleu_before)

Test set bleu score before finetuning:  16.088677519036988


In [49]:
#Translate an example sentence, again for comparison. I picked a sentence that sounds like spoken language,
#so it is more similar to the sentences in the finetuning dataset; hopefully the translation is better after training.
test_text = "translate English to German: This example right here seems like great work to me if I don't say so myself."
inputs_ids = tokenizer(test_text, return_tensors="pt").input_ids.to(device)
#Generation samples (do_sample=True), so fix the seed to make the printed translation reproducible
torch.manual_seed(42)
outputs = t5.generate(inputs_ids, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
print("Translation of example sentence: ", tokenizer.decode(outputs[0], skip_special_tokens=True))

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Translation of example sentence:  Ich kann dieses Beispiel hier wirklich so gut wie meinen eigenen.


Putting the translated sentence into Google Translate and converting it back to English gives the following: "I really can do this example as well as my own."

In [55]:
#Finetune the model with the settings from training_args; the validation split is evaluated after every epoch
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: id, translation. If id, translation are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 129452
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 16182
  Number of trainable parameters = 60506624


Epoch,Training Loss,Validation Loss,Bleu
1,1.4232,1.228401,17.628526
2,1.406,1.222271,17.729482


Saving model checkpoint to iwlst_t5/checkpoint-500
Configuration saved in iwlst_t5/checkpoint-500/config.json
Configuration saved in iwlst_t5/checkpoint-500/generation_config.json
Model weights saved in iwlst_t5/checkpoint-500/pytorch_model.bin
tokenizer config file saved in iwlst_t5/checkpoint-500/tokenizer_config.json
Special tokens file saved in iwlst_t5/checkpoint-500/special_tokens_map.json
Deleting older checkpoint [iwlst_t5/checkpoint-7000] due to args.save_total_limit
Saving model checkpoint to iwlst_t5/checkpoint-1000
Configuration saved in iwlst_t5/checkpoint-1000/config.json
Configuration saved in iwlst_t5/checkpoint-1000/generation_config.json
Model weights saved in iwlst_t5/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in iwlst_t5/checkpoint-1000/tokenizer_config.json
Special tokens file saved in iwlst_t5/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [iwlst_t5/checkpoint-7500] due to args.save_total_limit
Saving model checkpoint to iwlst

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_i

TrainOutput(global_step=16182, training_loss=1.4255912817536702, metrics={'train_runtime': 2228.032, 'train_samples_per_second': 116.203, 'train_steps_per_second': 7.263, 'total_flos': 4402015354748928.0, 'train_loss': 1.4255912817536702, 'epoch': 2.0})

If compute resources weren't limited, training could continue: the validation loss was still decreasing after the second epoch (1.2284 → 1.2223), so the model had not started to overfit.

In [56]:
#Re-evaluate on the same test set to see whether finetuning improved the BLEU score
postfintuned_results = trainer.evaluate(iwlst2013_tokenized["test"])

The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: id, translation. If id, translation are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 7192
  Batch size = 16
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_

In [57]:
#Report the test-set BLEU measured after finetuning (the label previously said "before")
print("Test set bleu score after finetuning: ", postfintuned_results["eval_bleu"])

Test set bleu score before finetuning:  18.02474202088865


Finetuning for two epochs gave almost a 2-point increase in BLEU score on the test set (16.09 → 18.02)!

In [58]:
#Try translating the example sentence again to see if the translation improved
test_text = "translate English to German: This example right here seems like great work to me if I don't say so myself."
inputs_ids = tokenizer(test_text, return_tensors="pt").input_ids.to(device)
#Generation samples (do_sample=True), so fix the seed to make the printed translation reproducible
torch.manual_seed(42)
outputs = t5.generate(inputs_ids, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
print("Translation of example sentence: ", tokenizer.decode(outputs[0], skip_special_tokens=True))

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Translation of example sentence:  Das Beispiel dort scheint mir wirklich großartig zu sein, wenn ich dies nicht selber sage.


Again, putting the translated sentence into Google Translate and converting it back to English gives the following: "The example there seems really great to me, if I don't say so myself."


That sounds much more like the original sentence than the pre-finetuning translation!

Note: Training was limited to 2 epochs and the small T5 model was used due to compute constraints.