In [3]:
!pip install datasets evaluate rouge_score bert_score wandb sentencepiece accelerate>=0.26.0

[0m

In [4]:
from datasets import load_dataset
import evaluate
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM
from bert_score import score as bert_score
import torch
from transformers import EarlyStoppingCallback
import wandb
wandb.init(mode="disabled")  # ปิด wandb


In [6]:
train_path = "./train_set_qsum.csv"
val_path = "./val_set_qsum.csv"

# Load datasets using the datasets library
dataset = load_dataset("csv", data_files={"train": train_path, "validation": val_path})

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [20]:
# Load the tokenizer (using LED-large-16384)
tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")

# Set maximum lengths and batch size
# encoder_max_length = 8192   # adjust as needed based on your inputs
encoder_max_length = 4096 
decoder_max_length = 512    # adjust as needed for outputs
batch_size = 2

def process_data_to_model_inputs(batch):
    # Tokenize the inputs and targets from your dataset columns
    inputs = tokenizer(
        batch["clean_input"],
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
    )
    outputs = tokenizer(
        batch["clean_output"],
        padding="max_length",
        truncation=True,
        max_length=decoder_max_length,
    )

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask

    # Create a global attention mask (required by LED):
    # Each sample gets a list of zeros with the first token set to 1.
    batch_size_local = len(batch["input_ids"])
    batch["global_attention_mask"] = [
        [0] * encoder_max_length for _ in range(batch_size_local)
    ]
    for i in range(batch_size_local):
        batch["global_attention_mask"][i][0] = 1

    batch["labels"] = outputs.input_ids
    # Replace padding token id's in labels by -100 so that they are ignored during loss computation
    batch["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in label]
        for label in batch["labels"]
    ]
    return batch

# Map the processing function onto your datasets and remove the original text columns
train_dataset = dataset["train"].map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=dataset["train"].column_names,
)
val_dataset = dataset["validation"].map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=dataset["validation"].column_names,
)

# Set the dataset format to PyTorch tensors for the required columns
train_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "global_attention_mask", "labels"]
)
val_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "global_attention_mask", "labels"]
)


Map:   0%|          | 0/286 [00:00<?, ? examples/s]

In [25]:
# Define Seq2Seq training arguments
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    num_train_epochs=30,
    eval_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,                   # enable mixed precision training if supported
    output_dir="./led_qmsum_results",
    logging_steps=100,
    eval_steps=250,
    save_steps=1000,
    warmup_steps=1500,
    metric_for_best_model="eval_loss" ,
    save_total_limit=2,
    gradient_accumulation_steps=4,
)

# # Compute ROUGE score during evaluation
# def compute_metrics(pred):
#     labels_ids = pred.label_ids
#     pred_ids = pred.predictions

#     # Decode predictions and labels
#     pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
#     labels_ids[labels_ids == -100] = tokenizer.pad_token_id
#     label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

#     rouge_output = rouge.compute(
#         predictions=pred_str, references=label_str, rouge_types=["rouge2"]
#     )["rouge2"].mid

#     return {
#         "rouge2_precision": round(rouge_output.precision, 4),
#         "rouge2_recall": round(rouge_output.recall, 4),
#         "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
#     }

# Load LED model for sequence-to-sequence generation with gradient checkpointing enabled
model = AutoModelForSeq2SeqLM.from_pretrained(
    "allenai/led-base-16384", gradient_checkpointing=True, use_cache=False
)

# Set generation hyperparameters
model.config.num_beams = 4
model.config.max_length = decoder_max_length   # Maximum output length
model.config.min_length = 10
model.config.length_penalty = 2.0
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3

# Instantiate the Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model,
    processing_class=tokenizer,
    args=training_args,
    # compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [26]:
# torch.cuda.empty_cache()

In [27]:
# Start training
trainer.train()

Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss
250,3.7474,3.569837
500,3.2552,3.377002


KeyboardInterrupt: 

In [10]:
# Save the model
model_path = "./led-finetuned-qmsum"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
print(f"Model saved to {model_path}")

Model saved to ./led-finetuned-qmsum


In [11]:
from transformers import LEDTokenizer, LEDForConditionalGeneration

In [16]:
# Load your test CSV dataset
test_path = "./test_set_qsum.csv"
test_dataset = load_dataset("csv", data_files={"test": test_path})["test"]

# Load T5 tokenizer and model
# model_path = "./led-finetuned-qmsum"  # Your saved model path
model_path = "./led_qmsum_results/checkpoint-1000/"
tokenizer = LEDTokenizer.from_pretrained(model_path)
model = LEDForConditionalGeneration.from_pretrained(model_path).to("cuda")

def infer_led(input_text: str) -> str:
    """
    Inference function for LED model.
    
    Args:
        input_text (str): The input text string to summarize or answer.
        
    Returns:
        str: The generated output text.
    """
    # Tokenize the input text
    inputs_dict = tokenizer(
        input_text,
        padding="max_length",
        truncation=True,
        max_length=4096,  # Adjust max_length as needed
        return_tensors="pt"
    )
    input_ids = inputs_dict.input_ids.to("cuda")
    attention_mask = inputs_dict.attention_mask.to("cuda")
    
    # Create a global attention mask (required for LED)
    global_attention_mask = torch.zeros_like(attention_mask)
    global_attention_mask[:, 0] = 1  # Set global attention on the first token
    
    # Generate prediction
    predicted_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        global_attention_mask=global_attention_mask
    )
    
    # Decode the generated tokens to text
    output_text = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)
    return output_text



def generate_answer_LED(batch):
    # Tokenize the input text
    inputs_dict = tokenizer(
        batch["clean_input"],
        padding="max_length",
        truncation=True,
        max_length=4096,  # Adjust as needed for your inputs
        return_tensors="pt"
    )
    input_ids = inputs_dict.input_ids.to("cuda")
    attention_mask = inputs_dict.attention_mask.to("cuda")

    # Create a global attention mask (required for LED)
    global_attention_mask = torch.zeros_like(attention_mask)
    global_attention_mask[:, 0] = 1  # Set global attention on the first token

    # Generate predictions
    predicted_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        global_attention_mask=global_attention_mask
    )
    # Decode the generated tokens to text
    batch["predicted_output"] = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    return batch

# Map the generation function over the test set (batched for efficiency)
results_led = test_dataset.map(generate_answer_LED, batched=True, batch_size=4)

# Load ROUGE metric
rouge = evaluate.load("rouge")

# Calculate ROUGE scores
print("Calculating ROUGE scores...")
rouge_scores = rouge.compute(
    predictions=results_led["predicted_output"],
    references=results_led["clean_output"],
    use_stemmer=True,
    rouge_types=["rouge1", "rouge2", "rougeL"]
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Calculating ROUGE scores...


In [17]:
# Print all scores
print("\n===== T5 Evaluation Results =====")
print("ROUGE Scores:")
for metric, scores in rouge_scores.items():
    print(f"{metric}: {scores:.4f}")


===== T5 Evaluation Results =====
ROUGE Scores:
rouge1: 0.2101
rouge2: 0.0637
rougeL: 0.1667


In [18]:
import numpy as np
# Calculate BERTScore
print("Calculating BERTScore...")
# If the dataset is large, you might want to limit the number of examples for BERTScore
# as it can be computationally intensive
max_samples_for_bertscore = 100
if len(results_led["predicted_output"]) > max_samples_for_bertscore:
    print(f"Limiting BERTScore calculation to {max_samples_for_bertscore} samples.")
    indices = np.random.choice(len(results_led["predicted_output"]), max_samples_for_bertscore, replace=False)
    bertscore_preds = [results_led["predicted_output"][i] for i in indices]
    bertscore_refs = [results_led["clean_output"][i] for i in indices]
else:
    bertscore_preds = results_led["predicted_output"]
    bertscore_refs = results_led["clean_output"]

P, R, F1 = bert_score(bertscore_preds, bertscore_refs, lang='en', rescale_with_baseline=False)
bert_f1 = torch.mean(F1).item()

Calculating BERTScore...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Print all scores
print("\n===== Longformer Evaluation Results =====")
print("ROUGE Scores:")
for metric, scores in rouge_scores.items():
    print(f"{metric}: {scores:.4f}")
print(f"\nBERTScore F1: {bert_f1:.4f}")


===== Longformer Evaluation Results =====
ROUGE Scores:
rouge1: 0.2009
rouge2: 0.0506
rougeL: 0.1538

BERTScore F1: 0.8386


In [20]:
from nltk.tokenize import sent_tokenize
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
# Compute ROUGE scores
def postprocess_text(preds, refs):
    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred.strip())) for pred in preds]
    refs = ["\n".join(sent_tokenize(ref.strip())) for ref in refs]
    return preds, refs

# Post-process predictions and references
print("Post-processing text for evaluation...")
processed_preds, processed_refs = postprocess_text(
    results_led["predicted_output"],
    results_led["clean_output"]
)

# Calculate ROUGE scores
print("Calculating ROUGE scores...")
rouge_scores = rouge.compute(
    predictions=processed_preds,
    references=processed_refs,
    use_stemmer=True,
    rouge_types=["rouge1", "rouge2", "rougeL"]
)

# Calculate BERTScore
print("Calculating BERTScore...")
# If the dataset is large, you might want to limit the number of examples for BERTScore
# as it can be computationally intensive
max_samples_for_bertscore = 100
if len(processed_preds) > max_samples_for_bertscore:
    print(f"Limiting BERTScore calculation to {max_samples_for_bertscore} samples.")
    indices = np.random.choice(len(processed_preds), max_samples_for_bertscore, replace=False)
    bertscore_preds = [processed_preds[i] for i in indices]
    bertscore_refs = [processed_refs[i] for i in indices]
else:
    bertscore_preds = processed_preds
    bertscore_refs = processed_refs

P, R, F1 = bert_score(bertscore_preds, bertscore_refs, lang='en', rescale_with_baseline=False)
bert_f1 = torch.mean(F1).item()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Post-processing text for evaluation...
Calculating ROUGE scores...
Calculating BERTScore...
Limiting BERTScore calculation to 100 samples.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Print all scores
print("\n===== Longformer Evaluation Results =====")
print("ROUGE Scores:")
for metric, scores in rouge_scores.items():
    print(f"{metric}: {scores:.4f}")
print(f"\nBERTScore F1: {bert_f1:.4f}")


===== Longformer Evaluation Results =====
ROUGE Scores:
rouge1: 0.2101
rouge2: 0.0637
rougeL: 0.1667

BERTScore F1: 0.8414


## Inferenece

In [1]:
import torch
from transformers import LEDTokenizer, LEDForConditionalGeneration

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def generate_summary_led(input_text: str) -> str:
    """
    Generate a summary using a finetuned LED model for a given input text.
    
    Args:
        input_text (str): The text to summarize.
        
    Returns:
        str: The generated summary.
    """
    # Tokenize the input text
    inputs_dict = tokenizer(
        input_text,
        padding="max_length",
        truncation=True,
        max_length=4096,  # Adjust based on your input requirements
        return_tensors="pt"
    )
    
    # Move inputs to the selected device
    input_ids = inputs_dict.input_ids.to(device)
    attention_mask = inputs_dict.attention_mask.to(device)
    
    # Create a global attention mask required for LED (set global attention on the first token)
    global_attention_mask = torch.zeros_like(attention_mask)
    global_attention_mask[:, 0] = 1
    
    # Generate predictions
    with torch.no_grad():
        predicted_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            global_attention_mask=global_attention_mask,
            max_length=256,  # Adjust the target max_length as needed
            num_beams=4,
            early_stopping=True
        )
    
    # Decode the generated tokens to text
    summary = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)
    return summary

In [4]:
# Function to select the device dynamically
def get_device():
    if torch.cuda.is_available():
        return torch.device("cuda")
    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        return torch.device("mps")
    else:
        return torch.device("cpu")

# Select the device and load the model/tokenizer onto it
device = get_device()
model_path = "led-finetuned-qmsum"  # Your saved model path

tokenizer = LEDTokenizer.from_pretrained(model_path)
model = LEDForConditionalGeneration.from_pretrained(model_path).to(device)

In [5]:
model

LEDForConditionalGeneration(
  (led): LEDModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): LEDEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): LEDLearnedPositionalEmbedding(16384, 768)
      (layers): ModuleList(
        (0-5): 6 x LEDEncoderLayer(
          (self_attn): LEDEncoderAttention(
            (longformer_self_attn): LEDEncoderSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
              (value_global): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): Linear(in_features=768, out_features=768, bias=True)
          )
     

model