In [1]:
%load_ext autoreload
%autoreload 2
from datasets import load_dataset,load_metric,concatenate_datasets
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments,DataCollatorForSeq2Seq, AutoTokenizer
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


This benchmark is based on the [Hugging Face PEFT quicktour](https://huggingface.co/docs/peft/en/quicktour).

Load dataset and prepare it for seq2seq training

In [2]:
# Hub identifier of the dialogue/summary corpus used for fine-tuning.
DATASET_NAME = "Samsung/samsum"

# Fetch all splits; the raw columns ('id', 'dialogue', 'summary') are kept
# here and only dropped once the examples have been tokenized.
data = load_dataset(DATASET_NAME)
data["train"]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Dataset({
    features: ['id', 'dialogue', 'summary'],
    num_rows: 14732
})

Load model and tokenizer

In [3]:
# Checkpoint shared by the tokenizer and the seq2seq model so the
# vocabulary and the weights always come from the same source.
MODEL_NAME = "google-t5/t5-small"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

In [4]:
def _token_lengths(column):
    """Return the tokenized length of every `column` entry in train + test.

    Args:
        column: name of the text column to measure ("dialogue" or "summary").

    Returns:
        A list with one token count per example, train rows first, then test.
    """
    combined = concatenate_datasets([data["train"], data["test"]])
    tokenized = combined.map(
        # NOTE(review): truncation=True is given without max_length, so the
        # cap presumably falls back to the tokenizer's own model limit - confirm.
        lambda batch: tokenizer(batch[column], truncation=True),
        batched=True,
        remove_columns=["dialogue", "summary"],
    )
    return [len(ids) for ids in tokenized["input_ids"]]


# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
# The 85th percentile is used instead of the maximum for better utilization.
# NOTE: lengths are measured on the raw dialogue, i.e. without the
# "summarize: " prefix that preprocess_function prepends later.
input_lengths = _token_lengths("dialogue")
max_source_length = int(np.percentile(input_lengths, 85))
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
# The 90th percentile is used instead of the maximum for better utilization.
target_lengths = _token_lengths("summary")
max_target_length = int(np.percentile(target_lengths, 90))
print(f"Max target length: {max_target_length}")

Max source length: 255
Max target length: 50


In [5]:
def preprocess_function(sample,padding="max_length",data_prefix="summarize: "):
    """Tokenize a batch of dialogues and summaries into seq2seq features.

    Args:
        sample: batch mapping with a "dialogue" and a "summary" entry, each a
            list of strings.
        padding: padding strategy forwarded unchanged to the tokenizer for
            both the inputs and the targets.
        data_prefix: text prepended to every dialogue before tokenization.

    Returns:
        The tokenizer output for the prefixed dialogues, extended with a
        "labels" entry holding the target token ids. Whenever padding is
        enabled, padded label positions are set to -100.
    """
    prefixed_dialogues = [data_prefix + dialogue for dialogue in sample["dialogue"]]

    # Tokenize inputs, capped at the source length budget.
    model_inputs = tokenizer(prefixed_dialogues, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument.
    labels = tokenizer(text_target=sample["summary"], max_length=max_target_length, padding=padding, truncation=True)

    # Replace pad ids in the labels by -100 so padding is ignored in the loss.
    # This must happen for every active padding strategy (True / "longest" /
    # "max_length"), not only "max_length"; otherwise pad ids inserted by the
    # tokenizer stay in the labels.
    if padding not in (False, "do_not_pad"):
        pad_id = tokenizer.pad_token_id
        labels["input_ids"] = [
            [(token_id if token_id != pad_id else -100) for token_id in label_ids]
            for label_ids in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [6]:
# Run the preprocessing over every split in batches and drop the raw
# text/id columns so only the tokenized features remain.
tokenized_dataset = data.map(
    preprocess_function,
    batched=True,
    remove_columns=["dialogue", "summary", "id"],
)
tokenized_dataset

Map: 100%|██████████| 14732/14732 [00:03<00:00, 3926.45 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
})

Wrap the model with LoRA adapters and inspect the trainable parameter count

In [7]:
# LoRA settings for a sequence-to-sequence model in training mode.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

# NOTE: `model` is rebound to the PEFT-wrapped model, so re-running this cell
# passes the already wrapped model back in; reload the base model first.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 294,912 || all params: 60,801,536 || trainable%: 0.4850


Training arguments

In [8]:
# Hyper-parameters for the fine-tuning run. Evaluation and checkpointing are
# both scheduled once per epoch, and the best checkpoint is reloaded at the end.
training_args = Seq2SeqTrainingArguments(
    output_dir=".output/t5-summarizer",
    # optimisation
    num_train_epochs=3,
    learning_rate=1e-3,
    weight_decay=0.01,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # evaluation / checkpointing
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # data handling
    remove_unused_columns=False,
)

# Batch collator: returns PyTorch tensors, pads labels with -100 and rounds
# padded lengths up to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    pad_to_multiple_of=8,
    label_pad_token_id=-100,
    return_tensors="pt",
)



In [9]:
# Train on the "train" split and evaluate on the "validation" split; the
# "test" split is not used in this cell.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

# Left as the final expression so the returned training summary is displayed.
trainer.train()

                                                 
 33%|███▎      | 231/693 [02:24<03:54,  1.97it/s]

{'eval_loss': 1.9175565242767334, 'eval_runtime': 3.4474, 'eval_samples_per_second': 237.281, 'eval_steps_per_second': 3.771, 'epoch': 1.0}


 67%|██████▋   | 462/693 [04:47<01:49,  2.12it/s]


{'eval_loss': 1.8664380311965942, 'eval_runtime': 3.31, 'eval_samples_per_second': 247.13, 'eval_steps_per_second': 3.927, 'epoch': 2.0}


 72%|███████▏  | 500/693 [05:15<02:01,  1.59it/s]

{'loss': 2.1348, 'grad_norm': 0.7134456038475037, 'learning_rate': 0.0002784992784992785, 'epoch': 2.16}


100%|██████████| 693/693 [07:15<00:00,  2.03it/s]


{'eval_loss': 1.8543962240219116, 'eval_runtime': 3.379, 'eval_samples_per_second': 242.086, 'eval_steps_per_second': 3.847, 'epoch': 3.0}


100%|██████████| 693/693 [07:19<00:00,  1.58it/s]

{'train_runtime': 439.7363, 'train_samples_per_second': 100.506, 'train_steps_per_second': 1.576, 'train_loss': 2.107162321456755, 'epoch': 3.0}





TrainOutput(global_step=693, training_loss=2.107162321456755, metrics={'train_runtime': 439.7363, 'train_samples_per_second': 100.506, 'train_steps_per_second': 1.576, 'total_flos': 3010803246563328.0, 'train_loss': 2.107162321456755, 'epoch': 3.0})

In [10]:
# Directory that receives both the trained model files and the tokenizer files.
peft_model_id = "results"

# `trainer.model` is the PEFT-wrapped model created earlier in the notebook;
# presumably only the adapter weights are written here - verify on disk.
trained_model = trainer.model
trained_model.save_pretrained(peft_model_id)

# Left as the final expression so the saved tokenizer paths are displayed.
tokenizer.save_pretrained(peft_model_id)



('results\\tokenizer_config.json',
 'results\\special_tokens_map.json',
 'results\\tokenizer.json')