<a href="https://colab.research.google.com/github/monilouise/unicamp-P_IA368DD_2023S1/blob/main/Aula_6/T5_%2B_doc2query_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# T5 + doc2query - T5 Finetuning Experiments

Author: Monique Monteiro (moniquelouise@gmail.com)

## Dataset download

In [None]:
# Mount Google Drive at /content/gdrive; main_dir (dataset and checkpoints) is located under it.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)


Mounted at /content/gdrive


In [None]:
main_dir = "/content/gdrive/MyDrive/Unicamp-aula-6-3"

In [None]:
!ls {main_dir}

doc2query  msmarco_triples.train.tiny.tsv


In [None]:
!wget https://storage.googleapis.com/unicamp-dl/ia368dd_2023s1/msmarco/msmarco_triples.train.tiny.tsv

--2023-04-03 04:50:35--  https://storage.googleapis.com/unicamp-dl/ia368dd_2023s1/msmarco/msmarco_triples.train.tiny.tsv
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.24.128, 142.251.10.128, 142.251.12.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.24.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8076179 (7.7M) [text/tab-separated-values]
Saving to: ‘msmarco_triples.train.tiny.tsv’


2023-04-03 04:50:37 (7.37 MB/s) - ‘msmarco_triples.train.tiny.tsv’ saved [8076179/8076179]



In [None]:
!mv msmarco_triples.train.tiny.tsv {main_dir}

## Libraries installation

In [None]:
import locale
# Monkey-patch: make locale.getpreferredencoding() always report "UTF-8".
# NOTE(review): presumably a workaround for Colab shell commands that require a UTF-8 locale — confirm.
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
!pip install transformers

In [None]:
!pip install sentencepiece

In [None]:
!pip install sacrebleu

In [None]:
!pip install datasets

In [None]:
!pip install evaluate

## Dataset creation

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# The triples file is tab-separated and has no header row, so column names are supplied here.
df = pd.read_csv(
    f"{main_dir}/msmarco_triples.train.tiny.tsv",
    sep="\t",
    header=None,
    names=["query", "relevant_passage", "non_relevant_passage"],
)

# doc2query direction: the relevant passage is the model input (X), the query is the target (Y).
Y_train = df["query"].to_list()
X_train = df["relevant_passage"].to_list()

# Hold out exactly 1000 pairs for validation (earlier runs used test_size=0.2).
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, Y_train, test_size=1000, random_state=42
)

In [None]:
from torch.utils.data import Dataset, DataLoader

class Doc2QueryDataset(Dataset):
  """Map-style dataset of (passage, query) pairs for doc2query fine-tuning.

  Each item is tokenized lazily on access. No padding is applied here; batching
  and padding are left to the data collator.

  Args:
    X: sequence of input texts (passages).
    Y: sequence of target texts (queries), aligned index-by-index with ``X``.
    tokenizer: callable returning a mapping with ``"input_ids"`` and
      ``"attention_mask"`` for a single text.
    max_input_length: optional token limit for inputs. ``None`` defers to the
      tokenizer's own maximum length.
    max_target_length: optional token limit for targets. ``None`` defers to the
      tokenizer's own maximum length.

  Raises:
    ValueError: if ``X`` and ``Y`` have different lengths.
  """

  def __init__(self, X, Y, tokenizer, max_input_length=None, max_target_length=None):
    if len(X) != len(Y):
      raise ValueError(
          f"X and Y must have the same length, got {len(X)} and {len(Y)}")
    self.tokenizer = tokenizer
    self.X = X
    self.Y = Y
    self.max_input_length = max_input_length
    self.max_target_length = max_target_length

  def __len__(self):
    """Return the number of (passage, query) pairs."""
    return len(self.X)

  def __getitem__(self, index):
    """Tokenize and return the pair at ``index``.

    Returns:
      dict with ``input_ids`` and ``attention_mask`` of the passage and
      ``labels`` holding the token ids of the query.
    """
    # Truncation is requested explicitly: without it the tokenizer only warns
    # about over-long sequences and returns them at full length.
    tokenized_input = self.tokenizer(self.X[index],
                                     truncation=True,
                                     max_length=self.max_input_length)
    tokenized_query = self.tokenizer(self.Y[index],
                                     truncation=True,
                                     max_length=self.max_target_length)
    return {"input_ids": tokenized_input["input_ids"],
            "attention_mask": tokenized_input["attention_mask"],
            "labels": tokenized_query["input_ids"]}
    


In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config, AdamW, AutoTokenizer

# Tokenizer belonging to the same "t5-base" checkpoint that is fine-tuned in the experiments below.
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Datasets only store the raw texts; tokenization happens per item in Doc2QueryDataset.__getitem__.
train_dataset = Doc2QueryDataset(X_train, Y_train, tokenizer)
val_dataset = Doc2QueryDataset(X_val, Y_val, tokenizer)


Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
len(train_dataset)

10000

In [None]:
len(val_dataset)

1000

In [None]:
tokenizer.model_max_length

512

## Metrics definition

Ref.: https://github.com/huggingface/transformers/blob/main/examples/pytorch/translation/run_translation.py

In [None]:
import evaluate

In [None]:
# Metric: sacreBLEU loaded through the `evaluate` library; used by compute_metrics below.
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape the texts for sacreBLEU.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        tuple: ``(predictions, references)`` where predictions is a flat list of
        stripped strings and references is a list of single-element lists (one
        reference per prediction).
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_refs = []
    for text in labels:
        # Each prediction gets a list of references; here there is exactly one.
        wrapped_refs.append([text.strip()])

    return cleaned_preds, wrapped_refs

def compute_metrics(eval_preds):
    """Decode generated and reference token ids and score them with sacreBLEU.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            itself be a tuple, in which case only its first element is used.

    Returns:
        dict: ``{"bleu": score}`` where ``score`` is the ``"score"`` field
        returned by the sacreBLEU metric.
    """
    # Imported locally so the function does not depend on a later cell having
    # executed ``import numpy as np``.
    import numpy as np

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the padding/ignore value used for labels (see label_pad_token_id
    # in the data collator) and cannot be decoded. The same replacement is
    # applied to the predictions in case they were padded with it as well.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

## Finetuning Experiments

In [None]:
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed
)

### 1st experiment (baseline): Default optimizer and learning rate scheduler

In [None]:
#batch_size = 32 #bleu = 15.663058, default lr, validation split equal to 0.2
#steps = 17
batch_size = 32 #bleu = 15.806152, default lr, validation split equal to 0.2
steps = 50  # shared interval for evaluation, logging and checkpoint saving
epochs = 100  # upper bound only; the recorded run was interrupted manually

In [None]:
import numpy as np

# Baseline: start from the pretrained t5-base weights.
model = T5ForConditionalGeneration.from_pretrained("t5-base")


print("batch size = ", batch_size)
print("len train_dataset = ", len(train_dataset))

# Evaluation, logging and checkpointing all happen every `steps` steps.
# The best checkpoint according to 'bleu' is requested at the end of training.
training_args = Seq2SeqTrainingArguments(output_dir=f"{main_dir}/doc2query",
                                          overwrite_output_dir=True,
                                          per_device_train_batch_size=batch_size,
                                          per_device_eval_batch_size=batch_size,
                                          gradient_accumulation_steps=8,
                                          evaluation_strategy='steps',
                                          eval_steps=steps, logging_steps=steps, 
                                          save_steps=steps,
                                          predict_with_generate=True,
                                          fp16=True, 
                                          num_train_epochs=epochs,
                                          load_best_model_at_end=True,
                                          metric_for_best_model='bleu',
                                          save_total_limit = 2
                                        )

# With mixed precision, tensor dimensions should be multiples of 8 to get the most out of the tensor cores,
# so pad_to_multiple_of=8 is a good value.
# Ref.: https://discuss.huggingface.co/t/whats-a-good-value-for-pad-to-multiple-of/1481

# Without the collator, tokenizing with parameters other than the input text leads to all sorts of errors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=8 if training_args.fp16 else None,
)

# No `optimizers` argument is passed here, so the trainer's defaults are used (this is the baseline).
trainer = Seq2SeqTrainer(model=model,
                        args=training_args,
                        train_dataset=train_dataset,
                        eval_dataset=val_dataset,
                        data_collator=data_collator,
                        tokenizer=tokenizer,
                        compute_metrics=compute_metrics
                        )

train_results = trainer.train()


batch size =  32
len train_dataset =  10000




Step,Training Loss,Validation Loss,Bleu
50,3.2438,1.779949,13.804612
100,1.9467,1.633787,16.332318
150,1.9888,1.606424,15.381401


KeyboardInterrupt: ignored

### 2nd Experiment: AdamW and constant learning rate = 1e-4

In [None]:
from transformers.optimization import get_constant_schedule

In [None]:
#batch_size=8 #bleu=17.585528 with split=0.2
#batch_size=16 #bleu=16.439199 with split=0.2
batch_size=8
steps=50
epochs=100

In [None]:
import numpy as np

# Start again from the pretrained t5-base weights.
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are set explicitly to the values transformers.AdamW used
# by default (1e-6 and 0.0), because torch.optim.AdamW defaults differ
# (1e-8 and 0.01); this keeps the optimizer hyper-parameters of the experiment.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, eps=1e-6, weight_decay=0.0)
# Constant learning rate: no warmup and no decay.
lr_scheduler = get_constant_schedule(optimizer)
print("batch size = ", batch_size)
print("len train_dataset = ", len(train_dataset))

# Evaluation, logging and checkpointing all happen every `steps` steps.
training_args = Seq2SeqTrainingArguments(
    output_dir=f"{main_dir}/doc2query",
    overwrite_output_dir=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=8,
    evaluation_strategy='steps',
    eval_steps=steps,
    logging_steps=steps,
    save_steps=steps,
    predict_with_generate=True,
    fp16=True,
    num_train_epochs=epochs,
    load_best_model_at_end=True,
    metric_for_best_model='bleu',
    save_total_limit=2,
)

# With mixed precision, tensor dimensions should be multiples of 8 to get the most out of the tensor cores,
# so pad_to_multiple_of=8 is a good value.
# Ref.: https://discuss.huggingface.co/t/whats-a-good-value-for-pad-to-multiple-of/1481

# Without the collator, tokenizing with parameters other than the input text leads to all sorts of errors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=8 if training_args.fp16 else None,
)

# The explicit (optimizer, scheduler) pair replaces the trainer's defaults.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, lr_scheduler),
)

train_results = trainer.train()




batch size =  8
len train_dataset =  10000


Step,Training Loss,Validation Loss,Bleu
50,2.7478,1.760042,14.191379
100,1.9363,1.634877,16.028759
150,1.8429,1.59147,17.079439
200,1.6709,1.576075,17.013726


KeyboardInterrupt: ignored

In [None]:
trainer.save_model()

### 3rd Experiment: AdaFactor optimizer


In [None]:
batch_size=32
#batch_size=8
steps=50
#epochs=100
epochs=13

Without FP16, the model achieved a high BLEU value (>22), but the validation loss increased as BLEU also increased (!).  On the other hand, with FP16, the model gets stuck at BLEU=11 and does not improve for several iterations.  

After replacing fp16 with bf16, the same "overfitting-like" behaviour occurred. So I saved the model with early stopping, at the point with minimal validation loss and maximum BLEU (19.92), i.e. while training still showed the "normal" behaviour.



In [None]:
import numpy as np
from transformers.optimization import Adafactor, AdafactorSchedule

# Start again from the pretrained t5-base weights.
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# No explicit learning rate is given: Adafactor is configured with
# relative_step/warmup_init, and AdafactorSchedule is the companion scheduler
# object handed to the trainer.
optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True)
lr_scheduler = AdafactorSchedule(optimizer)

print("batch size = ", batch_size)
print("len train_dataset = ", len(train_dataset))

# bf16 is used instead of fp16 in this experiment (with fp16 BLEU stalled around 11,
# as described in the notes preceding this cell).
training_args = Seq2SeqTrainingArguments(
    output_dir=f"{main_dir}/doc2query",
    overwrite_output_dir=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=8,
    evaluation_strategy='steps',
    eval_steps=steps,
    logging_steps=steps,
    save_steps=steps,
    predict_with_generate=True,
    bf16=True,
    num_train_epochs=epochs,
    load_best_model_at_end=True,
    metric_for_best_model='bleu',
    save_total_limit=2,
)

# With mixed precision, tensor dimensions should be multiples of 8 to get the most out of the tensor cores,
# so pad_to_multiple_of=8 is a good value.
# Ref.: https://discuss.huggingface.co/t/whats-a-good-value-for-pad-to-multiple-of/1481

# Without the collator, tokenizing with parameters other than the input text leads to all sorts of errors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100,
    # Check both 16-bit modes: this run uses bf16, so testing only `fp16`
    # silently disabled the padding.
    pad_to_multiple_of=8 if (training_args.fp16 or training_args.bf16) else None,
)

# The explicit (optimizer, scheduler) pair replaces the trainer's defaults.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, lr_scheduler),
)

train_results = trainer.train()


batch size =  32
len train_dataset =  10000


Step,Training Loss,Validation Loss,Bleu
50,5.1251,4.09191,1.297463
100,3.026,1.969804,8.736579
150,2.0047,1.700381,14.301186
200,1.8078,1.611367,15.833664
250,1.6848,1.558003,17.080495
300,1.5907,1.520123,18.581309
350,1.496,1.498664,19.053267
400,1.4178,1.487062,18.643619
450,1.3302,1.479391,19.920438
500,1.2559,1.48059,19.761084


In [None]:
# See https://stats.stackexchange.com/questions/282160/how-is-it-possible-that-validation-loss-is-increasing-while-validation-accuracy and https://forum.opennmt.net/t/scorer-test-set-vs-validation-set/4517/3

In [None]:
trainer.save_model()

In [None]:
metrics = trainer.evaluate()

In [None]:
metrics

{'eval_loss': 1.479391098022461,
 'eval_bleu': 19.920438361367808,
 'eval_runtime': 35.2199,
 'eval_samples_per_second': 28.393,
 'eval_steps_per_second': 0.909,
 'epoch': 12.96}

In [None]:
import json

with open(f"{main_dir}/doc2query/metrics.json", 'w') as f:
  json.dump(metrics,f)

### 4th Experiment - mixing different batch sizes

Inspired by the PaLM paper.
The idea is to start training with a small batch size and increase it later.

In [None]:
import os

In [None]:
steps=50

In [None]:
import numpy as np

model = T5ForConditionalGeneration.from_pretrained("t5-base")

def train(model, batch_size, epochs=3, output_dir=None):
  """Run one fine-tuning stage and return the trainer that performed it.

  A new ``Seq2SeqTrainer`` is built on every call and no optimizer is passed to
  it, so each stage uses the trainer's default optimizer and schedule. Calling
  this function repeatedly with the same ``model`` object chains several stages
  with different batch sizes.

  Args:
    model: the seq2seq model to fine-tune.
    batch_size: per-device batch size for both training and evaluation. With
      ``gradient_accumulation_steps=8``, gradients from 8 such batches are
      accumulated per update.
    epochs: number of training epochs for this stage.
    output_dir: directory for checkpoints. Defaults to ``{main_dir}/doc2query``,
      the location used by the other experiments. NOTE: because
      ``overwrite_output_dir=True``, stages sharing a directory overwrite each
      other's checkpoints.

  Returns:
    The ``Seq2SeqTrainer`` after ``train()`` has completed, so the caller can
    call ``save_model()`` / ``evaluate()`` on it.
  """
  if output_dir is None:
    output_dir = f"{main_dir}/doc2query"

  print('batch size = ', batch_size)
  # Evaluation, logging and checkpointing all happen every `steps` steps
  # (`steps` is a notebook-level variable).
  training_args = Seq2SeqTrainingArguments(
      output_dir=output_dir,
      overwrite_output_dir=True,
      per_device_train_batch_size=batch_size,
      per_device_eval_batch_size=batch_size,
      gradient_accumulation_steps=8,
      evaluation_strategy='steps',
      eval_steps=steps,
      logging_steps=steps,
      save_steps=steps,
      predict_with_generate=True,
      fp16=True,
      num_train_epochs=epochs,
      load_best_model_at_end=True,
      metric_for_best_model='bleu',
      save_total_limit=2,
  )

  # With mixed precision, tensor dimensions should be multiples of 8 to get the most out of the tensor cores,
  # so pad_to_multiple_of=8 is a good value.
  # Ref.: https://discuss.huggingface.co/t/whats-a-good-value-for-pad-to-multiple-of/1481

  # Without the collator, tokenizing with parameters other than the input text leads to all sorts of errors.
  data_collator = DataCollatorForSeq2Seq(
      tokenizer,
      model=model,
      label_pad_token_id=-100,
      pad_to_multiple_of=8 if training_args.fp16 else None,
  )

  trainer = Seq2SeqTrainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=val_dataset,
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics,
  )

  trainer.train()
  return trainer



Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Stage 1: batch size 8 for the default 3 epochs.
trainer = train(model, 8)
# Stage 2: the same model object is trained further with batch size 32 for 12 epochs.
trainer = train(model, 32, 12)

In [None]:
trainer.save_model()
metrics = trainer.evaluate()


In [None]:
import json

with open(f"{main_dir}/doc2query/metrics.json", 'w') as f:
  json.dump(metrics,f)

In [None]:
import os

os.rename(f"{main_dir}/doc2query", f"{main_dir}/doc2query_default_b8-b32")

Now the opposite - start with a high batch size and decrease it later.

In [None]:
# Reload the pretrained t5-base weights so this run does not continue from the previous experiment.
model = T5ForConditionalGeneration.from_pretrained("t5-base")
# Reverse order: batch size 32 for 12 epochs first, then batch size 8 for the default 3 epochs.
trainer = train(model, 32, 12)
trainer = train(model, 8)

batch size =  32




Step,Training Loss,Validation Loss,Bleu
50,3.1888,1.774593,13.636776
100,2.3199,1.658584,14.665341
150,2.1475,1.66301,13.621872
200,1.9192,1.661236,13.445569
250,1.9802,1.6598,13.465747
300,2.0862,1.659727,13.465747
350,2.0109,1.65971,13.465747
400,1.9829,1.65971,13.465747
450,2.0558,1.65971,13.465747


batch size =  8




Step,Training Loss,Validation Loss,Bleu
50,1.8158,1.62449,15.565391
100,1.7867,1.597662,16.743959
150,1.8093,1.583537,16.668995
200,1.678,1.575306,16.547084
250,1.8004,1.563522,17.30483
300,1.7009,1.555191,16.679671
350,1.5759,1.551357,17.205937
400,1.8054,1.548646,16.418515
450,1.6673,1.548901,16.356012


In [None]:
trainer.save_model()
metrics = trainer.evaluate()
with open(f"{main_dir}/doc2query/metrics.json", 'w') as f:
  json.dump(metrics,f)

In [None]:
os.rename(f"{main_dir}/doc2query", f"{main_dir}/doc2query_default_b32-b8")

Finally, for the direction that gave the best result (small to large), increase the batch size progressively from 8 to 16 and then to 32.

In [None]:
# Reload the pretrained t5-base weights so this run does not continue from the previous experiment.
model = T5ForConditionalGeneration.from_pretrained("t5-base")
# Three stages on the same model object: batch size 8 (default 3 epochs), 16 (6 epochs), 32 (12 epochs).
trainer = train(model, 8)
trainer = train(model, 16, 6)
trainer = train(model, 32, 12)


batch size =  8




Step,Training Loss,Validation Loss,Bleu
50,2.9803,1.846013,11.816433
100,2.0026,1.707517,14.135295
150,1.9628,1.661441,14.941337
200,1.7738,1.631891,14.921651
250,1.8089,1.613374,16.065293
300,1.7985,1.60275,16.25942
350,1.6661,1.596123,16.504767
400,1.7064,1.589016,16.112679


Step,Training Loss,Validation Loss,Bleu
50,2.9803,1.846013,11.816433
100,2.0026,1.707517,14.135295
150,1.9628,1.661441,14.941337
200,1.7738,1.631891,14.921651
250,1.8089,1.613374,16.065293
300,1.7985,1.60275,16.25942
350,1.6661,1.596123,16.504767
400,1.7064,1.589016,16.112679
450,1.6999,1.587041,16.143037


batch size =  16




Step,Training Loss,Validation Loss,Bleu
50,1.7704,1.572804,16.61332
100,1.6096,1.559006,17.37101
150,1.5993,1.543522,18.069003
200,1.5524,1.536495,18.468013
250,1.6299,1.533385,18.725323
300,1.5037,1.527405,18.090745
350,1.5318,1.524894,18.66694
400,1.4619,1.524966,18.66209
450,1.4431,1.523645,18.312985


batch size =  32


Step,Training Loss,Validation Loss,Bleu
50,1.5022,1.529532,18.970029
100,1.4541,1.522025,19.221423
150,1.5149,1.518564,18.875055
200,1.4859,1.517217,18.963882
250,1.4051,1.495193,19.531806
300,1.3911,1.505102,18.038478
350,1.4494,1.518167,17.902955
400,1.4134,1.517734,17.793682
450,1.438,1.517273,17.812849


In [None]:
trainer.save_model()
metrics = trainer.evaluate()
with open(f"{main_dir}/doc2query/metrics.json", 'w') as f:
  json.dump(metrics,f)

In [None]:
os.rename(f"{main_dir}/doc2query", f"{main_dir}/doc2query_default_b8-b16-b32")

## Conclusions for the next step

Use 3 candidate models to generate the queries for document expansion:

1.   doc2query-adafactor-bs-32-split-1000-no-fp16 (BLEU = 22.46)
2.   doc2query-adafactor-bs-32-split-1000-withbf16-early-stoping (BLEU = 19.92)
3.   doc2query_default_b8-b16-b32 (BLEU = 19.53, with expected behavior)
