In [4]:
print("--- Step 1: Setting up the environment ---")

from google.colab import drive
from datasets import load_dataset, concatenate_datasets, DatasetDict
from transformers import T5TokenizerFast, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import os
import numpy as np
import torch

--- Step 1: Setting up the environment ---


In [5]:
drive.mount('/content/drive')
print("Google Drive mounted successfully.")

Mounted at /content/drive
Google Drive mounted successfully.


In [6]:
print("\n--- Step 2: Loading multilingual data from Google Drive ---")

base_data_path = "/content/drive/MyDrive/legal-document-summarizer/Datasets/eur-lex-sum"


--- Step 2: Loading multilingual data from Google Drive ---


In [7]:
languages = ["french", "german", "irish", "italian", "dutch", "english"]
print(f"Loading {len(languages)} specified languages: {languages}")

Loading 6 specified languages: ['french', 'german', 'irish', 'italian', 'dutch', 'english']


In [8]:
all_splits = {}
for lang in languages:
    lang_path = os.path.join(base_data_path, lang)
    if not os.path.isdir(lang_path):
        print(f"Warning: Directory for language '{lang}' not found. Skipping.")
        continue
    print(f"Loading dataset for language: {lang}")
    data_files = {
        "train": os.path.join(lang_path, "train.json"),
        "validation": os.path.join(lang_path, "validation.json"),
        "test": os.path.join(lang_path, "test.json"),
    }
    dataset = load_dataset("json", data_files=data_files)
    for split in dataset.keys():
        dataset[split] = dataset[split].map(lambda example: {'lang': lang})
    for split in dataset.keys():
        if split not in all_splits:
            all_splits[split] = []
        all_splits[split].append(dataset[split])

multilingual_dataset = DatasetDict({
    split: concatenate_datasets(all_splits[split]).shuffle(seed=42)
    for split in all_splits.keys()
})

print("\n--- Successfully created a multilingual dataset! ---")
print(multilingual_dataset)

Loading dataset for language: french


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1130 [00:00<?, ? examples/s]

Map:   0%|          | 0/187 [00:00<?, ? examples/s]

Map:   0%|          | 0/188 [00:00<?, ? examples/s]

Loading dataset for language: german


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

Map:   0%|          | 0/187 [00:00<?, ? examples/s]

Map:   0%|          | 0/188 [00:00<?, ? examples/s]

Loading dataset for language: irish


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/187 [00:00<?, ? examples/s]

Map:   0%|          | 0/188 [00:00<?, ? examples/s]

Loading dataset for language: italian


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1028 [00:00<?, ? examples/s]

Map:   0%|          | 0/187 [00:00<?, ? examples/s]

Map:   0%|          | 0/188 [00:00<?, ? examples/s]

Loading dataset for language: dutch


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1001 [00:00<?, ? examples/s]

Map:   0%|          | 0/187 [00:00<?, ? examples/s]

Map:   0%|          | 0/188 [00:00<?, ? examples/s]

Loading dataset for language: english


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1129 [00:00<?, ? examples/s]

Map:   0%|          | 0/187 [00:00<?, ? examples/s]

Map:   0%|          | 0/188 [00:00<?, ? examples/s]


--- Successfully created a multilingual dataset! ---
DatasetDict({
    train: Dataset({
        features: ['celex_id', 'reference', 'summary', 'lang'],
        num_rows: 5419
    })
    validation: Dataset({
        features: ['celex_id', 'reference', 'summary', 'lang'],
        num_rows: 1122
    })
    test: Dataset({
        features: ['celex_id', 'reference', 'summary', 'lang'],
        num_rows: 1128
    })
})


In [9]:
print("\n--- Step 3: Preprocessing and Tokenizing the data ---")

model_checkpoint = "google/long-t5-tglobal-base"
tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint)

max_input_length = 2048
max_target_length = 512


--- Step 3: Preprocessing and Tokenizing the data ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/851 [00:00<?, ?B/s]

In [10]:
def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["reference"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = multilingual_dataset.map(
    preprocess_function, batched=True, batch_size=100
)
print("Tokenization complete.")

Map:   0%|          | 0/5419 [00:00<?, ? examples/s]



Map:   0%|          | 0/1122 [00:00<?, ? examples/s]

Map:   0%|          | 0/1128 [00:00<?, ? examples/s]

Tokenization complete.


In [11]:
print("\n--- Step 4: Setting up and starting model training ---")

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)


--- Step 4: Setting up and starting model training ---


pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [12]:
persistent_output_dir = "/content/drive/MyDrive/longt5_eurlexsum_checkpoints"
print(f"Checkpoints will be saved to: {persistent_output_dir}")

Checkpoints will be saved to: /content/drive/MyDrive/longt5_eurlexsum_checkpoints


In [13]:
batch_size = 1
training_args = Seq2SeqTrainingArguments(
    output_dir=persistent_output_dir, # <-- CRITICAL: Save to Drive
    eval_strategy="steps",      # <-- Evaluate periodically
    eval_steps=200,                   # <-- How often to evaluate
    logging_steps=100,
    save_strategy="steps",            # <-- Save checkpoints periodically
    save_steps=200,                   # <-- How often to save a checkpoint
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=8,
    weight_decay=0.01,
    save_total_limit=3,               # <-- Keep only the 3 most recent checkpoints
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    report_to="none",
    optim="adafactor",
)

In [14]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
)

In [15]:
print(f"Starting model training... Will resume from checkpoint if one exists.")
trainer.train(resume_from_checkpoint=True)
print("--- Training finished! ---")

Starting model training... Will resume from checkpoint if one exists.


Step,Training Loss,Validation Loss


--- Training finished! ---


In [16]:
print("\n--- Step 5: Evaluating multilingual performance ---")


--- Step 5: Evaluating multilingual performance ---


In [21]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=dca819a00add39106af7967e3a4ea7cbd20b374ba3390bbbe20a4711c55632ed
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [22]:
print("\n--- Loading ROUGE metric using evaluate library ---")
import evaluate

rouge_metric = evaluate.load("rouge")


--- Loading ROUGE metric using evaluate library ---


In [30]:
!pip install datasets



In [39]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE expects a newline after each sentence for certain calculations
    decoded_preds = ["\n".join(pred.strip()) for pred in decoded_preds]
    decoded_labels = ["\n".join(label.strip()) for label in decoded_labels]

    # The result object from evaluate.load("rouge").compute is a simple dictionary
    # e.g., {'rouge1': 0.5, 'rouge2': 0.3, ...}
    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # --- THE FIX IS HERE ---
    # The 'value' is already the final score (a float), so we just multiply by 100.
    # No more .mid.fmeasure is needed.
    result = {key: value * 100 for key, value in result.items()}

    # Add the average generated length to the results
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    # Round all values to 4 decimal places
    return {k: round(v, 4) for k, v in result.items()}

In [40]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics, # Pass the corrected function
)



  trainer = Seq2SeqTrainer(


In [41]:
print("\n--- Evaluating on the combined (All Languages) test set ---")
results = trainer.evaluate(tokenized_datasets["test"], metric_key_prefix="test")
print(results)


--- Evaluating on the combined (All Languages) test set ---


{'test_loss': nan, 'test_model_preparation_time': 0.0196, 'test_rouge1': 7.1685, 'test_rouge2': 6.1451, 'test_rougeL': 6.9821, 'test_rougeLsum': 7.1704, 'test_gen_len': 20.0, 'test_runtime': 1115.5791, 'test_samples_per_second': 1.011, 'test_steps_per_second': 1.011}


In [42]:
print("\n--- Evaluating on specific languages to check for bias ---")
test_en = tokenized_datasets["test"].filter(lambda x: x['lang'] == 'english')
test_fr = tokenized_datasets["test"].filter(lambda x: x['lang'] == 'french')


--- Evaluating on specific languages to check for bias ---


Filter:   0%|          | 0/1128 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1128 [00:00<?, ? examples/s]

In [43]:
if len(test_en) > 0:
    print("\n--- Evaluating on the English-only test set ---")
    results_en = trainer.evaluate(test_en, metric_key_prefix="test_en")
    print(results_en)

if len(test_fr) > 0:
    print("\n--- Evaluating on the French-only test set ---")
    results_fr = trainer.evaluate(test_fr, metric_key_prefix="test_fr")
    print(results_fr)

print("\n--- Research complete! Compare the ROUGE scores to analyze multilingual efficiency. ---")


--- Evaluating on the English-only test set ---


{'test_en_loss': nan, 'test_en_model_preparation_time': 0.0196, 'test_en_rouge1': 6.111, 'test_en_rouge2': 5.1754, 'test_en_rougeL': 5.8606, 'test_en_rougeLsum': 6.1147, 'test_en_gen_len': 20.0, 'test_en_runtime': 209.5434, 'test_en_samples_per_second': 0.897, 'test_en_steps_per_second': 0.897}

--- Evaluating on the French-only test set ---
{'test_fr_loss': nan, 'test_fr_model_preparation_time': 0.0196, 'test_fr_rouge1': 7.5818, 'test_fr_rouge2': 6.4844, 'test_fr_rougeL': 7.3251, 'test_fr_rougeLsum': 7.5859, 'test_fr_gen_len': 20.0, 'test_fr_runtime': 187.3833, 'test_fr_samples_per_second': 1.003, 'test_fr_steps_per_second': 1.003}

--- Research complete! Compare the ROUGE scores to analyze multilingual efficiency. ---
