<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/milmor/NLP/blob/main/Notebooks/20_Flan-T5_hf.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />
    Run in Google Colab</a>
  </td>
</table>

# Fine-tune Flan-T5

In [1]:
# Notebook setup: the environment variables are assigned before the framework
# imports below, so they are already in place when those libraries load.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
os.environ["KERAS_BACKEND"] = "torch"  # ask Keras to run on the PyTorch backend
import keras_core as keras
import torch
import pandas as pd
import pathlib
import random

# Show the installed PyTorch version as the cell output.
torch.__version__

Using PyTorch backend.


'2.1.1'

In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import Dataset

In [3]:
# Hub id of the pretrained checkpoint shared by the model and its tokenizer
# (swap in "t5-small" to use that checkpoint instead).
checkpoint = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Smoke test before any fine-tuning: tokenize one prompt, generate at most
# 20 new tokens, and print the decoded text without special tokens.
inputs = tokenizer("Tell something people love", return_tensors="pt")
outputs = model.generate(max_new_tokens=20, **inputs)
decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(decoded)

['a sexy song']


In [4]:
task_prefix = "translate English to Spanish: "
sentences = ["I like to read.", "I like to work in NYC."]

# Build one prompt per sentence (prefix + sentence) and tokenize them as a
# single padded batch of PyTorch tensors.
prompts = [f"{task_prefix}{text}" for text in sentences]
inputs = tokenizer(prompts, padding=True, return_tensors="pt")

# Sampling is disabled to test whether batching affects the output.
output_sequences = model.generate(
    max_new_tokens=20,
    do_sample=False,
    attention_mask=inputs["attention_mask"],
    input_ids=inputs["input_ids"],
)

decoded_batch = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
print(decoded_batch)

['Yo es el ao.', 'Yo es el trabajo en NYC.']


## 1.- Conjuntos de datos

In [5]:
# Download and extract the English-Spanish sentence-pair archive.
# The archive is requested over HTTPS rather than plain HTTP so the download
# is protected in transit.
archive_path = keras.utils.get_file(
    fname="spa-eng.zip",
    origin="https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)
# The corpus file is expected inside a "spa-eng" folder located next to the
# downloaded archive.
text_file = pathlib.Path(archive_path).parent / "spa-eng" / "spa.txt"

In [6]:
# Decode the corpus explicitly as UTF-8: the Spanish side contains accented
# characters, and the platform default encoding is not UTF-8 everywhere.
with open(text_file, encoding="utf-8") as f:
    # Split on newlines and drop the final element of the split (the empty
    # string left after the file's trailing newline).
    lines = f.read().split("\n")[:-1]

# Number of sentence pairs read.
len(lines)

118964

In [7]:
# Each line holds an English sentence and its Spanish translation separated by
# a tab; turn every line into a {'es': ..., 'en': ...} record.
split_rows = (row.split("\t") for row in lines)
translation = [{'es': spanish, 'en': english} for english, spanish in split_rows]

# One running integer id per record, starting at 0.
idx = list(range(len(translation)))

# Peek at the first record and its id.
translation[0], idx[0]

({'es': 'Ve.', 'en': 'Go.'}, 0)

In [8]:
# Fixed seed so the 90/10 train/test split is the same on every run; without
# it the split (and therefore every downstream metric) changes per execution.
SPLIT_SEED = 42

my_dict = {"id": idx, "translation": translation}

pairs = Dataset.from_dict(my_dict)
pairs = pairs.train_test_split(test_size=0.1, seed=SPLIT_SEED)
pairs

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 107067
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 11897
    })
})

## 2.- Pipeline

In [9]:
# Keys of the source and target sentences inside each translation record.
source_lang, target_lang = "en", "es"

# Task instruction prepended to every source sentence before tokenization.
prefix = "translate English to Spanish: "


def preprocess_function(examples, max_length=64):
    """Tokenize a batch of translation pairs for sequence-to-sequence training.

    Args:
        examples: Batch whose "translation" entry is a list of dicts keyed by
            language code; the `source_lang` and `target_lang` entries are read.
        max_length: Value forwarded to the tokenizer as `max_length` (with
            truncation enabled). Defaults to 64.

    Returns:
        Whatever the module-level `tokenizer` returns for the prefixed source
        sentences and their target sentences.
    """
    records = examples["translation"]
    # The task prefix is prepended to the source side only.
    inputs = [prefix + record[source_lang] for record in records]
    targets = [record[target_lang] for record in records]

    return tokenizer(inputs, text_target=targets, max_length=max_length, truncation=True)

In [10]:
# Apply preprocess_function to the dataset splits; with batched=True it is
# called on batches of examples rather than one example at a time.
tokenized_pairs = pairs.map(
    preprocess_function,
    batched=True,
)

Map:   0%|          | 0/107067 [00:00<?, ? examples/s]

Map:   0%|          | 0/11897 [00:00<?, ? examples/s]

In [11]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [12]:
from transformers import DataCollatorForSeq2Seq

# The collator's `model` argument must be the model object being trained, not
# the checkpoint name: the collator checks that object for
# `prepare_decoder_input_ids_from_labels`, which a plain string never has.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

## 3.- Entrenamiento

In [13]:
# Training length is controlled by `max_steps` alone. `num_train_epochs` is
# deliberately not set: a positive `max_steps` overrides it, so specifying both
# only misleads the reader about how long training runs.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=1000,  # evaluate on the test split every 1000 steps
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,  # keep at most three checkpoints on disk
    fp16=False,  # fp16 gave NaN loss
    max_steps=25000,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_pairs["train"],
    eval_dataset=tokenized_pairs["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Run fine-tuning; checkpoints are written under `output_dir`.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
1000,1.8757,1.431429
2000,1.7736,1.367013
3000,1.7087,1.318818
4000,1.6485,1.286806
5000,1.6119,1.253046
6000,1.5678,1.22609
7000,1.5451,1.205256
8000,1.5063,1.187517
9000,1.4965,1.169578
10000,1.4807,1.155675


TrainOutput(global_step=25000, training_loss=1.4914699609375, metrics={'train_runtime': 1303.8505, 'train_samples_per_second': 306.784, 'train_steps_per_second': 19.174, 'total_flos': 3141192997318656.0, 'train_loss': 1.4914699609375, 'epoch': 3.74})

In [14]:
# Reload the fine-tuned weights from the checkpoint saved at the final step.
model = AutoModelForSeq2SeqLM.from_pretrained("./results/checkpoint-25000")

# Tokenize the demo sentences here instead of relying on an `inputs` variable
# left over from an earlier cell, so this cell always translates these prompts.
inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True)

output_sequences = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    do_sample=False,  # disable sampling to test if batching affects output
    max_new_tokens=20
)

print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))

['Me gusta leer.', 'Me gusta trabajar en NYC.']
