In [1]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer, DataCollatorWithPadding
import torch
import time
import pandas as pd
import evaluate
import numpy as np
from accelerate import Accelerator, DataLoaderConfiguration
pip install rouge_score

2025-07-10 03:25:15.350592: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import accelerate, transformers
print(accelerate.__version__)
print(transformers.__version__)

1.7.0
4.52.4


In [2]:
huggingface_dataset_name = "knkarthick/dialogsum"

dataset = load_dataset(huggingface_dataset_name)

dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [3]:
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn", torch_dtype=torch.bfloat16)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [5]:
def number_of_trainable_parameters(model):
    total_model_parameters = 0
    total_trainable_parametrs = 0
    for name, param in model.named_parameters():
        total_model_parameters += param.numel()
        if param.requires_grad:
            total_trainable_parametrs += param.numel()
    return f"Total Parameters: {total_model_parameters}, \n Total Trainable Parameters are: {total_trainable_parametrs}"

print(number_of_trainable_parameters(model))

Total Parameters: 406290432, 
 Total Trainable Parameters are: 406290432


In [6]:
dialogue = dataset['test'][200]['dialogue']
summary = dataset['test'][200]['summary']
topic = dataset['test'][200]['topic']

print(f"dialogue: {dialogue}\n summary: {summary}\n topic: {topic}")

dialogue: #Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.
 summary: #Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
 topic: upgrading system


In [7]:
# Zero shot Inference
prompt = f"""
Summarize the following

{dialogue}

Summary:
"""

inputs_to_encoder = tokenizer(prompt, dialogue, return_tensors="pt")
output_from_decoder = tokenizer.decode(
    model.generate(
        inputs_to_encoder["input_ids"], 
        max_new_tokens=200,
    )[0], 
    skip_special_tokens=True
)

print(f"Input Prompt: {prompt}\n Human Summary: {summary}\n Model Summary - Zero Shot: {output_from_decoder}" )

Input Prompt: 
Summarize the following

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

 Human Summary: #Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
 Model Summary - Zero Shot: You'd probably need a faster processor, to begin 

In [8]:
def preprocess(batch):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in batch["dialogue"]]
    batch['input_ids'] = tokenizer(prompt, padding="max_length", max_length=512, truncation=True, return_tensors="pt").input_ids
    batch['labels'] = tokenizer(batch["summary"], padding="max_length", max_length=512,truncation=True, return_tensors="pt").input_ids
    
    return batch

tokenized_training_datasets = dataset.map(preprocess, batched=True, remove_columns=dataset["train"].column_names)
#tokenized_training_datasets =tokenized_training_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary'])
tokenized_training_datasets

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 500
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1500
    })
})

In [9]:
output_dir = "/tmp/finetuned_model_checkpoints"

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1
)

In [36]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_training_datasets['train'],
    eval_dataset=tokenized_training_datasets['validation'])

In [37]:
from torch.utils.data import DataLoader

loader = DataLoader(
    tokenized_training_datasets["train"],
    batch_size=2,
    collate_fn=data_collator
)

batch = next(iter(loader))
print(batch.keys())  # Should only include: input_ids, attention_mask, labels, etc.


dict_keys(['input_ids', 'labels', 'attention_mask'])


In [39]:
trainer.train()

Step,Training Loss
1,11.0


TrainOutput(global_step=1, training_loss=11.0, metrics={'train_runtime': 121.812, 'train_samples_per_second': 0.066, 'train_steps_per_second': 0.008, 'total_flos': 8668418408448.0, 'train_loss': 11.0, 'epoch': 0.0006418485237483953})

In [74]:
#Check disk space if it fails
!df -h .

Filesystem      Size  Used Avail Use% Mounted on
/dev/nvme1n1    5.0G  1.3G  3.8G  26% /home/sagemaker-user


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [75]:
#And remove unnecessary files
!rm -rf /home/sagemaker-user/*checkpoint*
!rm -rf /home/sagemaker-user/output/*
!rm -rf /home/sagemaker-user/.cache/huggingface


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [103]:
import os
import boto3

def upload_dir_to_s3(local_dir, bucket, s3_prefix):
    s3 = boto3.client("s3")
    for root, _, files in os.walk(local_dir):
        for file in files:
            full_path = os.path.join(root, file)
            rel_path = os.path.relpath(full_path, local_dir)
            s3_key = os.path.join(s3_prefix, rel_path)
            print(f"Uploading {full_path} to s3://{bucket}/{s3_key}")
            s3.upload_file(full_path, bucket, s3_key)

In [None]:
upload_dir_to_s3("/tmp/finetuned_model_checkpoints", "BUCKET_NAME", "models/dialogsum/final")

In [14]:
#Save model locally
trainer.save_model("./tmp/finetuned_model_checkpoints")



In [15]:
finetuned_model = AutoModelForSeq2SeqLM.from_pretrained("./tmp/finetuned_model_checkpoints", torch_dtype=torch.bfloat16)



# Manual Evaluation of the model

In [None]:
# We are going to get the same 200th dialogue from the dataset
dialogue = dataset['test'][200]['dialogue']
human_baseline_summary = dataset['test'][200]['summary']

# Zero shot Inference
prompt = f"""
Summarize the following

{dialogue}

Summary:
"""

print("HUMAN BASELINE SUMMARY")
print("---------------------------------------------------")
print(f"{human_baseline_summary}" )
inputs_to_original_model_encoder = tokenizer(prompt, dialogue, return_tensors="pt")
output_from_orginal_model_decoder = tokenizer.decode(
    model.generate(
        inputs_to_original_model_encoder["input_ids"], 
        max_new_tokens=200,
    )[0], 
    skip_special_tokens=True
)


print("ORIGINAL MODEL SUMMARY")
print("---------------------------------------------------")
print(f"{output_from_orginal_model_decoder}" )

print("---------------------------------------------------")
inputs_to_finetuned_model_encoder = tokenizer(prompt, dialogue, return_tensors="pt")
output_from_finetuned_model_decoder = tokenizer.decode(
    model.generate(
        inputs_to_finetuned_model_encoder["input_ids"], 
        max_new_tokens=200,
    )[0], 
    skip_special_tokens=True
)
print("FINE TUNED MODEL SUMMARY")
print("---------------------------------------------------")

print(f"{output_from_finetuned_model_decoder}" )


HUMAN BASELINE SUMMARY
---------------------------------------------------
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
ORIGINAL MODEL SUMMARY
---------------------------------------------------
You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. You might also want to add a CD-ROM drive too, because most new software programs are coming out on Cds. It would allow you to make up your own flyers and banners for advertising.
---------------------------------------------------
FINE TUNED MODEL SUMMARY
---------------------------------------------------
You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. You might also want to add a CD-ROM drive too, because most new software programs are coming out on Cds. It would allow you to make up your own flyers and banners for advertising.


# Evaluate model with ROUGE metrics

ROUGE is a set of metrics commonly used to evaluate text summarisation and machine-generated content. It measures the overlap between the generated text and reference summaries based on n-grams, word sequences, and word pairs.

The most commonly used ROUGE variants are:

ROUGE-1: Overlap of individual words (unigrams)

ROUGE-2: Overlap of word pairs (bigrams)

ROUGE-L: Longest Common Subsequence between generated and reference texts

Higher ROUGE scores indicate that the model-generated summary is more similar to the human-written one.

In [18]:
#pip install rouge_score

rouge = evaluate.load('rouge')

In [20]:
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []
finetuned_model_summaries = []

for _, dialogue in enumerate(dialogues):
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    original_model_outputs = model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_summaries.append(original_model_text_output)

    finetuned_model_outputs = finetuned_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    finetuned_model_text_output = tokenizer.decode(finetuned_model_outputs[0], skip_special_tokens=True)
    finetuned_model_summaries.append(finetuned_model_text_output)

Evaluate the models computing ROUGE metrics. Notice the improvement in the results!

In [None]:
original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

finetuned_model_results = rouge.compute(
    predictions=finetuned_model_summaries,
    references=human_baseline_summaries[0:len(finetuned_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)
print('FINETUNED MODEL:')
print(finetuned_model_results)

ORIGINAL MODEL:
{'rouge1': 0.037142857142857144, 'rouge2': 0.005714285714285714, 'rougeL': 0.021525096525096522, 'rougeLsum': 0.021525096525096522}
INSTRUCT MODEL:
{'rouge1': 0.2559007555448247, 'rouge2': 0.06673966727231631, 'rougeL': 0.1666196165366397, 'rougeLsum': 0.16674475937860173}


Notice the improvement in the fine-tuned model’s results. Let’s calculate the absolute percentage of the improvement of the fine-tuned model over original model.

In [22]:
print("Absolute percentage improvement of FineTuned MODEL over ORIGINAL MODEL")

improvement = (np.array(list(finetuned_model_results.values())) - np.array(list(original_model_results.values())))
for key, value in zip(finetuned_model_results.keys(), improvement):
    print(f'{key}: {value*100:.2f}%')

Absolute percentage improvement of FineTuned MODEL over ORIGINAL MODEL
rouge1: 21.88%
rouge2: 6.10%
rougeL: 14.51%
rougeLsum: 14.52%


# Perform Parameter Efficient Fine-Tuning (PEFT with LoRA)

Parameter-Efficient Fine-Tuning (PEFT) with LoRA (Low-Rank Adaptation) is a lightweight technique that allows you to fine-tune large language models (LLMs) by updating only a small subset of parameters. Instead of modifying the entire model, LoRA injects a small number of trainable weights (called low-rank adapters) into specific layers. This approach drastically reduces the number of trainable parameters, making fine-tuning faster, cheaper, and more memory-efficient, while still achieving competitive performance compared to full fine-tuning. Using PEFT/LoRA, you freeze the underlying model layers, add a new adapter layer and train only that. The newly trained layer is then reunited and combined with the original LLM to serve the inference request.

# Preparing the Model with PEFT and LoRA
Configure LoRA configuration. Note the rank parameter (r) controls the size of the low-rank matrices inserted into the model’s layers during fine-tuning.

A lower rank (e.g., r=4) means fewer parameters, faster training, less memory – but potentially less expressiveness.

A higher rank (e.g., r=64) increases model capacity, but also cost and memory.

In [76]:
#pip install peft

In [77]:
from peft import LoraConfig, get_peft_model, TaskType

In [25]:
lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM 
)

Add LoRA adapter layer to the original model to train it, and print the trainable parameters. You would see that the number of trainable parameters will be much smaller in this case. Only 1.15% of the total parameters are trainable in the adapter layer.



In [26]:
peft_model = get_peft_model(model, lora_config)

In [27]:
print(number_of_trainable_parameters(peft_model))

Total Parameters: 411009024, 
 Total Trainable Parameters are: 4718592


## Train PEFT layer

Similar to Full fine-tuning, we will create training arguments and an instance of the Trainer class from Hugging Face.



In [70]:
import inspect
from transformers import Trainer

class SafeTrainer(Trainer):
    def training_step(self, model, inputs, optimizer=None):
        model.train()

        # Filter out invalid keys not accepted by model.forward
        valid_keys = inspect.signature(model.forward).parameters
        filtered_inputs = {k: v for k, v in inputs.items() if k in valid_keys}

        with self.compute_loss_context_manager():
            loss = self.compute_loss(model, filtered_inputs)

        # Normalize loss for gradient accumulation
        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        return loss


In [87]:
output_dir = "/tmp/peft_model_checkpoints"

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1,
    save_safetensors=False

)

In [88]:
peft_trainer = SafeTrainer(
    model=peft_model,
    args=peft_training_args,
    data_collator=data_collator,
    train_dataset=tokenized_training_datasets['train'])
   # eval_dataset=tokenized_training_datasets['validation'])

Train and save the model locally

In [89]:
peft_trainer.train()
peft_trainer.model.save_pretrained("./tmp/peft_model_checkpoints", safe_serialization=True)


Step,Training Loss
1,11.125


config.json: 0.00B [00:00, ?B/s]

Optionally, you can save the model to S3

In [None]:
upload_dir_to_s3("/tmp/peft_model_checkpoints", "BUCKET_NAME", "peft/models/dialogsum/final")

In [90]:
from peft import PeftModel, PeftConfig

peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

peft_model = PeftModel.from_pretrained(peft_model_base, 
                                       './tmp/peft_model_checkpoints/', 
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

The is_trainable flag is set to False to freeze the model, indicating that no further training will be performed. As a result, the number of trainable parameters is now zero.

In [92]:
print(number_of_trainable_parameters(peft_model))

Total Parameters: 411009024, 
 Total Trainable Parameters are: 0


# Evaluate the Model Manually


We are testing the 200th dialogue as in the previous section.



In [95]:
index = 200
dialogue = dataset['test'][200]['dialogue']
human_baseline_summary = dataset['test'][200]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

original_model_outputs = model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

finetuned_model_outputs = finetuned_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
finetuned_model_text_output = tokenizer.decode(finetuned_model_outputs[0], skip_special_tokens=True)

peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

print("---------------------------------------")
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print("---------------------------------------")
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print("---------------------------------------")
print(f'FULL FINETUNED MODEL:\n{finetuned_model_text_output}')
print("---------------------------------------")
print(f'PEFT MODEL: {peft_model_text_output}')

---------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
---------------------------------------
ORIGINAL MODEL:
The conversation goes as follows: #Person1#: Have you considered upgrading your system? #Person2#: Yes, but I'm not sure what exactly I would need. #Person1#: You could consider adding a painting program to your software. #Person2#: That would be a definite bonus.
---------------------------------------
FULL FINETUNED MODEL:
The conversation took place between a man and his son. The man asked his son if he would like to upgrade his computer. The son said he would need a faster processor, a more powerful hard disc, more memory and a faster modem. The pair also discussed the possibility of adding a painting program to their software.
---------------------------------------
PEFT MODEL: . The conversation with a friend. The conversation took place in a computer lab.


# Evaluate Model with ROUGE Metrics

In [98]:
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []
finetuned_model_summaries = []
peft_model_summaries = []

for idx, dialogue in enumerate(dialogues):
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """
    
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    human_baseline_text_output = human_baseline_summaries[idx]
    
    original_model_outputs = model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

    finetuned_model_outputs = finetuned_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    finetuned_model_text_output = tokenizer.decode(finetuned_model_outputs[0], skip_special_tokens=True)

    peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

    original_model_summaries.append(original_model_text_output)
    finetuned_model_summaries.append(finetuned_model_text_output)
    peft_model_summaries.append(peft_model_text_output)

Compute ROUGE score for this subset of the data. 

In [99]:
rouge = evaluate.load('rouge')

original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

finetuned_model_results = rouge.compute(
    predictions=finetuned_model_summaries,
    references=human_baseline_summaries[0:len(finetuned_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[0:len(peft_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)
print('FINETUNED MODEL:')
print(finetuned_model_results)
print('PEFT MODEL:')
print(peft_model_results)

Downloading builder script: 0.00B [00:00, ?B/s]

ORIGINAL MODEL:
{'rouge1': 0.20441034936319014, 'rouge2': 0.043375890362790576, 'rougeL': 0.14238471172139486, 'rougeLsum': 0.14018566815697964}
FINETUNED MODEL:
{'rouge1': 0.2559007555448247, 'rouge2': 0.06673966727231631, 'rougeL': 0.1666196165366397, 'rougeLsum': 0.16674475937860173}
PEFT MODEL:
{'rouge1': 0.037142857142857144, 'rouge2': 0.005714285714285714, 'rougeL': 0.021525096525096522, 'rougeLsum': 0.021525096525096522}


Let’s calculate the percentage of improvement of PEFT over original model

In [100]:
print("Absolute percentage improvement of PEFT MODEL over ORIGINAL MODEL")

improvement = (np.array(list(peft_model_results.values())) - np.array(list(original_model_results.values())))
for key, value in zip(peft_model_results.keys(), improvement):
    print(f'{key}: {value*100:.2f}%')

Absolute percentage improvement of PEFT MODEL over ORIGINAL MODEL
rouge1: -16.73%
rouge2: -3.77%
rougeL: -12.09%
rougeLsum: -11.87%


While the PEFT (LoRA) model shows a slight decrease in ROUGE metrics compared to the fully fine-tuned model, this trade-off is expected and often acceptable. The small drop in performance is compensated by significant efficiency gains — LoRA trains only a small fraction of the model’s parameters, drastically reducing memory usage, compute requirements, and training time.

This means you can fine-tune large models on a single GPU or limited hardware, making it ideal for quick experimentation or resource-constrained environments. In many practical applications, this minor performance difference is negligible when weighed against the cost and scalability benefits of PEFT.