<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/milmor/NLP/blob/main/Notebooks/19_LORA_hf.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />
    Run in Google Colab</a>
  </td>
</table>

In [1]:
# The MIT License (MIT) Copyright (c) 2025 Emilio Morales
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of 
# this software and associated documentation files (the "Software"), to deal in the Software without 
# restriction, including without limitation the rights to use, copy, modify, merge, publish, 
# distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the 
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or 
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 
# OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# LORA

In [2]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
os.environ["KERAS_BACKEND"] = "torch"
import keras
import torch
import transformers
import pandas as pd
import pathlib
import random

torch.__version__, transformers.__version__

('2.5.1+cu124', '4.46.3')

In [3]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import Dataset

In [4]:
checkpoint = "google/flan-t5-base"
# checkpoint = "t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

inputs = tokenizer("Tell something people love", return_tensors="pt")
outputs = model.generate(**inputs,  max_new_tokens=20)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

['a good story']


In [5]:
task_prefix = "Translate from English to Spanish: "
sentences = ["I like to read.", "The black dog."]

inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True)

output_sequences = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    do_sample=False,  # disable sampling to test if batching affects output
    max_new_tokens=20
)

print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))

['Mientras leer.', 'El chihuahua negra.']


## 1.- Conjuntos de datos

In [6]:
text_file = keras.utils.get_file(
    fname="spa-eng.zip",
    origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)
text_file = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt"

In [7]:
with open(text_file) as f:
    lines = f.read().split("\n")[:-1]

len(lines)

118964

In [8]:
translation = []
idx = []
for i, line in enumerate(lines):
    eng, spa = line.split("\t")
    idx.append(i)
    translation.append({'es': spa, 'en':eng})

translation[0], idx[0]

({'es': 'Ve.', 'en': 'Go.'}, 0)

In [9]:
my_dict = { "id": idx, "translation": translation}

pairs = Dataset.from_dict(my_dict)
pairs = pairs.train_test_split(test_size=0.02)
pairs

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 116584
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 2380
    })
})

## 2.- Pipeline

In [10]:
source_lang = "en"
target_lang = "es"

prefix = "translate English to Spanish: "


def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]

    model_inputs = tokenizer(inputs, text_target=targets, max_length=64, truncation=True)

    return model_inputs

tokenized_pairs = pairs.map(preprocess_function, batched=True)
tokenized_pairs

Map:   0%|          | 0/116584 [00:00<?, ? examples/s]

Map:   0%|          | 0/2380 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 116584
    })
    test: Dataset({
        features: ['id', 'translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2380
    })
})

In [11]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

## 3.- Entrenamiento

In [12]:
# pip install sacrebleu

In [13]:
def preprocess_logits_for_metrics(logits, labels):
    """
    Original Trainer may have a memory leak. 
    This is a workaround to avoid storing too many tensors that are not needed.
    """
    pred_ids = torch.argmax(logits[0], dim=-1)
    return pred_ids, labels

In [14]:
import numpy as np
import evaluate

metric = evaluate.load("sacrebleu")


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):

    preds, labels = eval_preds

    if isinstance(preds, tuple):

        preds = preds[0]
    preds= np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True, clean_up_tokenization_spaces=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}

    return result

In [15]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()

trainable params: 3,538,944 || all params: 251,116,800 || trainable%: 1.4093


- LORA requires higher learning rate

In [16]:
max_steps = 2000
training_args = Seq2SeqTrainingArguments(
    output_dir="./ckpt-lora",
    eval_strategy="steps",
    eval_steps=250,
    learning_rate=2e-3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    fp16=False, # fp16 nan loss,
    max_steps=max_steps,
)

trainer = Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_pairs["train"],
    eval_dataset=tokenized_pairs["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics
)

trainer.train()

max_steps is given, it will override any value given in num_train_epochs
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss,Bleu,Gen Len
250,No log,1.033605,8.7341,31.3782
500,1.418700,1.003558,9.2433,31.3782
750,1.418700,0.972252,9.5953,31.3782
1000,1.303200,0.944325,9.8549,31.3782
1250,1.303200,0.91374,10.3503,31.3782
1500,1.215200,0.893053,10.5065,31.3782
1750,1.215200,0.880005,10.7918,31.3782
2000,1.159300,0.874557,10.7403,31.3782


TrainOutput(global_step=2000, training_loss=1.2741121520996095, metrics={'train_runtime': 180.5996, 'train_samples_per_second': 177.188, 'train_steps_per_second': 11.074, 'total_flos': 936012280725504.0, 'train_loss': 1.2741121520996095, 'epoch': 0.2744613695622341})

In [17]:
model = AutoModelForSeq2SeqLM.from_pretrained(f"./ckpt-lora/checkpoint-{max_steps}")

output_sequences = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    do_sample=False,  # disable sampling to test if batching affects output
    max_new_tokens=20
)

print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))

['Me gusta leer.', 'El perro negro.']
