In [None]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from datasets import Dataset
from sklearn.model_selection import train_test_split
from bert_score import score

In [None]:
csv_path = "../data/final_texts.csv"
df = pd.read_csv(csv_path)
df = df

## Dataset loading

We split the data set into 80% training, 10% test and 10% validation.

In [4]:
train_texts, temp_texts, train_summaries, temp_summaries = train_test_split(
    df["text"], df["summary"], test_size=0.2, random_state=42
)
val_texts, test_texts, val_summaries, test_summaries = train_test_split(
    temp_texts, temp_summaries, test_size=0.5, random_state=42
)

In [5]:
train_dataset = Dataset.from_dict({"text": train_texts, "summary": train_summaries})
val_dataset = Dataset.from_dict({"text": val_texts, "summary": val_summaries})
test_dataset = Dataset.from_dict({"text": test_texts, "summary": test_summaries})

## Model loading

We will fine-tune **Google t5-base**.

In [6]:
model_name = "google-t5/t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
print(model)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

## LoRA Configuration for Fine-Tuning T5

We'll fine-tune a T5 model using LoRA (Low-Rank Adaptation). LoRA is a parameter-efficient fine-tuning method that significantly reduces the number of trainable parameters while maintaining performance.

### What is LoRA?

LoRA works by adding low-rank matrices to the original model weights instead of fine-tuning all parameters. This approach:
- Reduces memory usage
- Accelerates training
- Produces smaller fine-tuned models

### Our Configuration

We're using the following LoRA configuration:
- Rank (r): 8
- Alpha: 32
- Dropout: 0.1
- Target modules: Query (q), Key (k), Value (v), and Output (o) matrices in attention layers

This configuration allows us to reduce the number of trainable parameters from 225 million to only 1.8 million (0.79% of the original model), achieving a reduction factor of 127x.

### Benefits for Summarization Task

By using LoRA for our summarization task:
1. We can efficiently fine-tune the T5 model on our dataset
2. The resulting model maintains good performance with minimal parameter changes
3. The fine-tuned model has a much smaller footprint
4. We can adapt the pre-trained model to our specific text summarization needs

In [None]:
lora_config = LoraConfig(r=8, lora_alpha=32, lora_dropout=0.1, target_modules=["q", "k", "v", "o"])
model = get_peft_model(model, lora_config)

In [None]:
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Percentage of parameters being trained: {100 * trainable_params / total_params:.2f}%")
print(f"Parameter reduction factor: {total_params / trainable_params:.2f}x")

Total parameters: 224,673,024
Trainable parameters: 1,769,472
Percentage of parameters being trained: 0.79%
Parameter reduction factor: 126.97x


In [None]:
def preprocessing(data):
    inputs = tokenizer(
        ["summarize: " + text for text in data["text"]],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            data["summary"], 
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )
    
    inputs["labels"] = labels["input_ids"]
    
    inputs["labels"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in labels]
        for labels in inputs["labels"]
    ]
    
    return inputs

In [None]:
train_dataset = train_dataset.map(preprocessing, batched=True)
val_dataset = val_dataset.map(preprocessing, batched=True)
test_dataset = test_dataset.map(preprocessing, batched=True)

In [None]:
training_args = TrainingArguments(
    output_dir="./lora_summarization",
    report_to="none",
    eval_strategy="epoch",
    label_names=["labels"],
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=20,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
)

In [12]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,1.5164,1.328949
2,1.5022,1.296803
3,1.3678,1.279965
4,1.444,1.266407
5,1.336,1.264849
6,1.4579,1.256821
7,1.434,1.249727
8,1.3079,1.242358
9,1.3371,1.239278
10,1.4259,1.236546


TrainOutput(global_step=40000, training_loss=1.3753213768959045, metrics={'train_runtime': 6293.0592, 'train_samples_per_second': 12.712, 'train_steps_per_second': 6.356, 'total_flos': 4.915149668352e+16, 'train_loss': 1.3753213768959045, 'epoch': 20.0})

In [None]:
trainer.evaluate(eval_dataset=test_dataset)

{'eval_loss': 1.3135524988174438,
 'eval_runtime': 12.8454,
 'eval_samples_per_second': 38.924,
 'eval_steps_per_second': 19.462,
 'epoch': 2.0}

In [None]:
trainer.save_model("./lora_summarization")
tokenizer.save_pretrained("./lora_summarization")

('./lora_summarization/tokenizer_config.json',
 './lora_summarization/special_tokens_map.json',
 './lora_summarization/tokenizer.json')

In [7]:
def generate_summary(batch,model):
    inputs = tokenizer(batch["text"], return_tensors="pt", padding="longest")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    summary_ids = model.generate(**inputs, max_length=128, num_beams=5)
    batch["generated_summary"] = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)

    return batch

In [8]:
model_name_base = "google-t5/t5-base"
model_base = AutoModelForSeq2SeqLM.from_pretrained(model_name_base)

model_name_ft = "./lora_summarization"
model_ft = AutoModelForSeq2SeqLM.from_pretrained(model_name_ft)

In [None]:
val_dataset_scored_base = val_dataset.map(lambda batch : generate_summary(batch, model_base))
val_dataset_scored_ft = val_dataset.map(lambda batch : generate_summary(batch, model_ft))

In [None]:
def extract_from_list(example):
    example['generated_summary_clean'] = example['generated_summary'][0] if isinstance(example['generated_summary'], list) else example['generated_summary']
    return example

val_dataset_scored_base = val_dataset_scored_base.map(extract_from_list)
val_dataset_scored_ft = val_dataset_scored_ft.map(extract_from_list)

## Model evaluation

We use bert score to compute metrics on our results. The BERTScore metrics reveal improvements after fine-tuning the model.

In [25]:
P_base, R_base, F1_base = score(val_dataset_scored_base["generated_summary_clean"], val_dataset_scored_base["summary"], lang="fr" if "fr" in model_name else "en")

print(f"BERTScore Precision base: {P_base.mean():.4f}")
print(f"BERTScore Recall base: {R_base.mean():.4f}")
print(f"BERTScore F1 base: {F1_base.mean():.4f}")

print("--------")

P_ft, R_ft, F1_ft = score(val_dataset_scored_ft["generated_summary_clean"], val_dataset_scored_ft["summary"], lang="fr" if "fr" in model_name else "en")

print(f"BERTScore Precision base: {P_ft.mean():.4f}")
print(f"BERTScore Recall base: {R_ft.mean():.4f}")
print(f"BERTScore F1 base: {F1_ft.mean():.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore Precision base: 0.8328
BERTScore Recall base: 0.8294
BERTScore F1 base: 0.8309
--------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore Precision base: 0.8584
BERTScore Recall base: 0.8627
BERTScore F1 base: 0.8603


In [None]:
val_dataset_scored_base.to_csv("../data/val_dataset_scored_base.csv")
val_dataset_scored_ft.to_csv("../data/val_dataset_scored_ft.csv")

In [None]:
for i in range(10):
    print("Text:", val_dataset_scored_base["text"][i])
    print("Summary:", val_dataset_scored_base["summary"][i])
    print("Generated summary base:", val_dataset_scored_base["generated_summary_clean"][i])
    print("Generated summary fine-tuned:", val_dataset_scored_ft["generated_summary_clean"][i])
    print("")
    print("")

Text: DU MÊME AUTEUR ROMAU L~ Fus DB CoRAUE, a~' édition, t vol. in-t8 3 5o Ls MARtAOE c'OoBTTE, ta édition, t vol. ! n-~8 3 5o LE P&RB M MARTfAi., ~e édition, t vol. tn-t8.. 3 5o LA MAttQUtss, ~y édjtton, t vol. in-! 8. 3 50 LES AMOURS otON.Mss, g édition, ï vol. in-t8.. 3 5o SoLANSB Du CMtx-SAMT-Luc, 35~ édition, tvoL ia.t8 3 5o MAt'BMO! 8E!.t.) E BBBtttsstER, 228 édition, t vol. in t8 3 5c Ta~~smE, 3os édMon, t vol. in-t8 3 5o DISPARU, Bae édition, voL ia-t8 3 5o MH ttONB* ONt e'tx WA t
Summary: Romain Lefebvre : "Du Même Auteur" : "Fus DB Coraue", "Le Péril Martial", "La Matquisse", "Les Amours Otomanes", "Solansb du Cmtx-Samt-Luc", "Mabbermo! 8E!.t.) et "Bbbttster" ; "Taesme" ; "Disparu" ; "Mh tonbon on t".

Romain Lefebvre a écrit "Du Même Auteur" comprenant "Fus DB Coraue", "Le Péril Martial", "La Matquisse", "Les Amours Otomanes", "Solansb du Cmtx-Samt-Luc", "Mabbermo! 8E!.t)", "Bbbttster", "Taesme" et "Disparu", ainsi que "Mh tonbon on t".
Generated summary base: in-t8 3 5o Ls