# Finetuning of large language models to perform text summarization

Install the required libraries that are not preinstalled on Google Colab.


In [1]:
# Install the notebook's dependencies quietly (one pip invocation per package).
# Versions are not pinned, so results may differ between runs as packages are updated.
!pip install transformers --quiet
!pip install torch --quiet
# NOTE(review): loralib is not imported directly by any visible cell — confirm it is still needed.
!pip install loralib --quiet
!pip install peft --quiet
!pip install datasets --quiet
# NOTE(review): py7zr is not imported directly; presumably needed to unpack the dataset archive — confirm.
!pip install py7zr --quiet
!pip install evaluate rouge_score --quiet

Import all necessary packages.

In [2]:
import torch

from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig

from datasets import load_dataset

Load the pretrained FLAN-T5 model. It is a sequence-to-sequence (seq2seq) model. We can experiment with various models and model sizes: https://huggingface.co/google.

In [None]:
# Hub checkpoint name shared by the model and its tokenizer.
model_name = 'google/flan-t5-small'

# Load the tokenizer (fast implementation requested) and the pretrained seq2seq weights.
original_tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Count the number of parameters in the model.

In [None]:
# Total number of scalar weights, summed over every parameter tensor of the model.
parameters = sum(weight.numel() for weight in original_model.parameters())

print(f"Number of model parameters: {parameters}.")

## Data preprocessing

We need to download the data first.

In [5]:
# Download (or reuse the cached copy of) the "samsum" dataset with all of its splits.
dataset = load_dataset("samsum")

# Report how many examples the train and test splits contain.
train_size = len(dataset['train'])
test_size = len(dataset['test'])
print(f"Train dataset size: {train_size}")
print(f"Test dataset size: {test_size}")

Train dataset size: 14732
Test dataset size: 819


Let's prepare the task prompts and tokenize the whole dataset.

In [None]:
# Token budgets: prompts are padded/truncated to 200 tokens, reference summaries to 50.
max_prompt_length = 200
max_answer_length = 50

def tokenize_function(example):
    """Build the summarization prompts for a batch and tokenize prompts and summaries.

    Intended for `dataset.map(..., batched=True)`: `example["dialogue"]` and
    `example["summary"]` are lists of strings of equal length.

    Adds these columns to the batch and returns it:
      * `input_ids`       - prompt token ids, padded/truncated to `max_prompt_length`
      * `input_attn_mask` - prompt attention mask (name used by the inference cells)
      * `attention_mask`  - the same mask under the name the model's `forward` accepts
      * `labels`          - summary token ids, padded/truncated to `max_answer_length`
    """
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    inputs = original_tokenizer(prompt, padding="max_length", max_length=max_prompt_length, truncation=True, return_tensors="pt")
    example['input_ids'] = inputs['input_ids']
    example['input_attn_mask'] = inputs['attention_mask']
    # Trainer removes columns that are not arguments of the model's forward() by default,
    # so a mask stored only as `input_attn_mask` never reaches the model during training.
    # Expose it under the standard name as well; the old column stays for existing readers.
    example['attention_mask'] = inputs['attention_mask']
    # NOTE(review): padded label positions keep the pad token id, so they still contribute
    # to the training loss. They are left as real token ids because the evaluation cells
    # decode `labels` directly.
    example['labels'] = original_tokenizer(example["summary"], padding="max_length", max_length=max_answer_length, truncation=True, return_tensors="pt").input_ids

    return example

# Tokenize every split in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Drop the raw text columns; only the tokenized features are used from here on.
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'dialogue', 'summary',])

In [None]:
# Show which splits the tokenized dataset dictionary exposes.
print(tokenized_datasets.keys())

## Test the summarization with zero-shot inference

In [None]:
# Fetch the single test row first: indexing the row avoids reading three whole columns
# just to pick one element out of each.
test_example = tokenized_datasets['test'][125]

# Wrap the row's features in a list to add the batch dimension expected by generate().
example_prompt_ids = torch.IntTensor([test_example['input_ids']])
example_prompt_mask = torch.IntTensor([test_example['input_attn_mask']])
example_answer_ids = test_example['labels']

# Greedy decoding (num_beams=1), limited to the reference-summary token budget.
model_outputs_ids = original_model.generate(input_ids=example_prompt_ids, attention_mask=example_prompt_mask,
                                        generation_config=GenerationConfig(max_new_tokens=max_answer_length,
                                                                           num_beams=1))

# Decode the prompt, the human reference and the model output back to text.
prompt_txt = original_tokenizer.decode(example_prompt_ids[0], skip_special_tokens=True)
human_answer_txt = original_tokenizer.decode(example_answer_ids, skip_special_tokens=True)
machine_answer_txt = original_tokenizer.decode(model_outputs_ids[0], skip_special_tokens=True)

print(f'PROMPT:\n{prompt_txt}')
print('---------------------------------------------')
print(f'HUMAN ANSWER:\n{human_answer_txt}')
print('---------------------------------------------')
print(f'MACHINE ANSWER:\n{machine_answer_txt}')

## Original model evaluation using ROUGE.

In [None]:
import evaluate
import numpy as np
from tqdm import tqdm

# ROUGE metric used to score generated summaries against the references.
metric = evaluate.load("rouge")

def evaluate_model(example, model, max_answer_length=max_answer_length):
    """Generate a summary for one tokenized example and decode it with its reference.

    Args:
        example: one row of the tokenized dataset; reads `input_ids`,
            `input_attn_mask` and `labels`.
        model: the model whose `generate` method produces the summary.
        max_answer_length: maximum number of new tokens to generate.

    Returns:
        A `(prediction, labels)` tuple of decoded strings with special tokens removed.
    """
    # Wrap both features in a list to add the batch dimension expected by generate().
    prompt_ids = torch.IntTensor([example['input_ids']])
    prompt_mask = torch.IntTensor([example['input_attn_mask']])
    # Use this example's own mask; the notebook-level `example_prompt_mask` belongs to a
    # different example and would mask the wrong positions.
    machine_answer_ids = model.generate(input_ids=prompt_ids, attention_mask=prompt_mask,
                                        generation_config=GenerationConfig(max_new_tokens=max_answer_length,
                                                                           num_beams=1))
    prediction = original_tokenizer.decode(machine_answer_ids[0], skip_special_tokens=True)
    labels = original_tokenizer.decode(example['labels'], skip_special_tokens=True)

    return prediction, labels


# Score only the first 50 test examples to keep the evaluation fast.
num_eval_examples = min(50, len(tokenized_datasets['test']))
eval_subset = tokenized_datasets['test'].select(range(num_eval_examples))

# Collect the generated summaries and their human references.
predictions, references = [], []
for example in tqdm(eval_subset):
    prediction, labels = evaluate_model(example, original_model)
    predictions.append(prediction)
    references.append(labels)

# compute metric
rouge = metric.compute(predictions=predictions, references=references, use_stemmer=True)

# print results as percentages with two decimal places
print(f"\nROUGE-1: {rouge['rouge1'] * 100:.2f}%")
print(f"ROUGE-2: {rouge['rouge2'] * 100:.2f}%")

# Full finetuning

In [10]:
from transformers import TrainingArguments, Trainer

# Directory handed to the Trainer as its output location for the full fine-tuning run.
output_dir = './dialogue-summary-full-finetuning'

# Separate copy of the pretrained weights, so full fine-tuning does not touch `original_model`.
fully_finetuned_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# NOTE(review): per the TrainingArguments documentation a positive `max_steps` overrides
# `num_train_epochs`, so the run length is governed by `max_steps`.
full_finetuning_options = dict(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=500,
    max_steps=5000,
)
training_args = TrainingArguments(**full_finetuning_options)

# Trainer over the tokenized train split, with the validation split for evaluation.
trainer = Trainer(
    model=fully_finetuned_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

Do not run the following cell (full fine-tuning) in Google Colab: it could run out of memory.

In [11]:
# Full fine-tuning is intentionally disabled; uncomment the next line to run it.
#trainer.train()

# Directory that holds the fully fine-tuned checkpoint; later cells load the model from here.
fully_finetuned_model_path='./dialogue-summary-full-finetuning-checkpoint'

# Uncomment together with trainer.train() to save the trained model and tokenizer.
#trainer.model.save_pretrained(fully_finetuned_model_path)
#original_tokenizer.save_pretrained(fully_finetuned_model_path)

We will load a saved finetuned model instead.

In [None]:
# Download the archive with the saved fully fine-tuned model.
!wget https://mlcollege.com/data/full-finetuning.zip
# Remove any existing checkpoint directory before unpacking.
!rm -rf ./dialogue-summary-full-finetuning-checkpoint
# NOTE(review): presumably the archive unpacks into ./dialogue-summary-full-finetuning-checkpoint — confirm its layout.
!unzip full-finetuning.zip
# Load the model from the checkpoint directory (`fully_finetuned_model_path` is set in an earlier cell).
fully_finetuned_model = AutoModelForSeq2SeqLM.from_pretrained(fully_finetuned_model_path)

## Test summarization with the fully finetuned model.

In [None]:
# Greedy decoding (single beam), limited to the reference-summary token budget.
greedy_generation_config = GenerationConfig(max_new_tokens=max_answer_length, num_beams=1)
model_outputs_ids = fully_finetuned_model.generate(
    input_ids=example_prompt_ids,
    attention_mask=example_prompt_mask,
    generation_config=greedy_generation_config,
)

# Decode the prompt, the human reference and the model output back to plain text.
prompt_txt, human_answer_txt, machine_answer_txt = (
    original_tokenizer.decode(token_ids, skip_special_tokens=True)
    for token_ids in (example_prompt_ids[0], example_answer_ids, model_outputs_ids[0])
)

separator = '---------------------------------------------'
print(f'PROMPT:\n{prompt_txt}')
print(separator)
print(f'HUMAN ANSWER:\n{human_answer_txt}')
print(separator)
print(f'MACHINE ANSWER:\n{machine_answer_txt}')

## Evaluation of the fully finetuned model with ROUGE

In [None]:
# Reload the fully fine-tuned checkpoint so this cell does not depend on earlier model state.
fully_finetuned_model = AutoModelForSeq2SeqLM.from_pretrained(fully_finetuned_model_path)

# Score only the first 50 test examples to keep the evaluation fast.
num_eval_examples = min(50, len(tokenized_datasets['test']))
eval_subset = tokenized_datasets['test'].select(range(num_eval_examples))

# Collect the generated summaries and their human references.
predictions, references = [], []
for example in tqdm(eval_subset):
    prediction, labels = evaluate_model(example, fully_finetuned_model)
    predictions.append(prediction)
    references.append(labels)

# compute metric
rouge = metric.compute(predictions=predictions, references=references, use_stemmer=True, use_aggregator=True)

# print results as percentages with two decimal places
print(f"\nROUGE-1: {rouge['rouge1'] * 100:.2f}%")
print(f"ROUGE-2: {rouge['rouge2'] * 100:.2f}%")

# Finetuning with Low-Rank Adaptation (LoRA)

r: the rank of the update matrices, expressed in int. Lower rank results in smaller update matrices with fewer trainable parameters.

lora_alpha: LoRA scaling factor.

target_modules: the modules (for example, attention blocks) to which the LoRA update matrices are applied.

In [15]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA hyperparameters, collected in one place and passed on as keyword arguments.
lora_settings = {
    "r": 32,                              # rank of the update matrices
    "lora_alpha": 32,                     # LoRA scaling factor
    "target_modules": ["q", "v"],         # modules that receive the LoRA update matrices
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": TaskType.SEQ_2_SEQ_LM,
}

lora_config = LoraConfig(**lora_settings)

In [16]:
# get_peft_model modifies the model it receives in place. Wrap a freshly loaded copy of
# the base checkpoint so `original_model` stays an unmodified baseline and re-running this
# cell never wraps an already adapted model. This costs one extra copy of the weights.
peft_base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
peft_model = get_peft_model(peft_base_model, lora_config)

Print the number of trainable parameters.

In [None]:
# Count only the weights that will receive gradient updates.
parameters = sum(
    weight.numel()
    for weight in peft_model.parameters()
    if weight.requires_grad
)

print(f"Number of trainable model parameters: {parameters}.")

In [18]:
# Directory handed to the Trainer as its output location for the LoRA run.
output_dir = './dialogue-summary-peft'

# Short demonstration run; the values in the trailing comments are those of the full run.
# NOTE(review): per the TrainingArguments documentation a positive `max_steps` overrides
# `num_train_epochs`, so the run length is governed by `max_steps`.
peft_training_options = dict(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3,
    num_train_epochs=5,
    logging_steps=5,  # 500
    max_steps=50,  # 5000
)
peft_training_args = TrainingArguments(**peft_training_options)

# Trainer for the LoRA-wrapped model, using the same tokenized splits as full fine-tuning.
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets['validation'],
)

Train the model. Complete training would require many more than 50 steps; we have already trained this model for 5000 steps and will download that checkpoint below.

In [None]:
# Run the (short) LoRA training loop.
peft_trainer.train()

# Directory for the LoRA checkpoint; later cells load the model from here.
peft_model_path='./dialogue-summary-peft-checkpoint'

# Save the model held by the PEFT trainer. `trainer` is the full fine-tuning trainer and
# wraps a different, untrained model, so saving `trainer.model` here would be wrong.
peft_trainer.model.save_pretrained(peft_model_path)
original_tokenizer.save_pretrained(peft_model_path)

Let's download the pretrained PEFT model (trained for 5000 steps).

In [None]:

# Download the archive with the saved PEFT model.
!wget https://mlcollege.com/data/peft.zip
# Remove any existing checkpoint directory (including one written by the training cell) before unpacking.
!rm -rf ./dialogue-summary-peft-checkpoint
# NOTE(review): presumably the archive unpacks into ./dialogue-summary-peft-checkpoint — confirm its layout.
!unzip peft.zip

# Load the model from the checkpoint directory (`peft_model_path` is set in the training cell).
peft_model = AutoModelForSeq2SeqLM.from_pretrained(peft_model_path)

## Test summarization with the LoRA model.

In [None]:
# Generate with the LoRA model. This section tests `peft_model`; calling
# `fully_finetuned_model` here would only repeat the previous section's result.
# Greedy decoding (num_beams=1), limited to the reference-summary token budget.
model_outputs_ids = peft_model.generate(input_ids=example_prompt_ids, attention_mask=example_prompt_mask,
                                        generation_config=GenerationConfig(max_new_tokens=max_answer_length,
                                                                           num_beams=1))

# Decode the prompt, the human reference and the model output back to text.
prompt_txt = original_tokenizer.decode(example_prompt_ids[0], skip_special_tokens=True)
human_answer_txt = original_tokenizer.decode(example_answer_ids, skip_special_tokens=True)
machine_answer_txt = original_tokenizer.decode(model_outputs_ids[0], skip_special_tokens=True)

print(f'PROMPT:\n{prompt_txt}')
print('---------------------------------------------')
print(f'HUMAN ANSWER:\n{human_answer_txt}')
print('---------------------------------------------')
print(f'MACHINE ANSWER:\n{machine_answer_txt}')

## Evaluation of the LoRA model with ROUGE

In [None]:
# Reload the LoRA checkpoint so this cell does not depend on earlier model state.
peft_model = AutoModelForSeq2SeqLM.from_pretrained(peft_model_path)

# Score only the first 50 test examples to keep the evaluation fast.
num_eval_examples = min(50, len(tokenized_datasets['test']))
eval_subset = tokenized_datasets['test'].select(range(num_eval_examples))

# Collect the generated summaries and their human references.
predictions, references = [], []
for example in tqdm(eval_subset):
    prediction, labels = evaluate_model(example, peft_model)
    predictions.append(prediction)
    references.append(labels)

# compute metric
rouge = metric.compute(predictions=predictions, references=references, use_stemmer=True, use_aggregator=True)

# print results as percentages with two decimal places
print(f"\nROUGE-1: {rouge['rouge1'] * 100:.2f}%")
print(f"ROUGE-2: {rouge['rouge2'] * 100:.2f}%")