In [1]:
!pip install datasets evaluate rouge_score bert_score wandb sentencepiece accelerate>=0.26.0

[0m

In [2]:
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
from transformers import EarlyStoppingCallback
import nltk
from nltk.tokenize import sent_tokenize
from bert_score import score as bert_score
import evaluate

import wandb
wandb.init(mode="disabled")  # ปิด wandb
# Download necessary NLTK packages
nltk.download('punkt')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
train_path = "./train_set_qsum.csv"
val_path = "./val_set_qsum.csv"

# Load datasets using the datasets library
dataset = load_dataset("csv", data_files={"train": train_path, "validation": val_path})

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [4]:
# Define model name
model_name = "t5-small"  # Options: t5-small, t5-base, t5-large, t5-3b, t5-11b

# Initialize tokenizer
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Define max lengths (adjust based on your data)
max_input_length = 512
max_target_length = 128

# Modified preprocessing function to handle list data
def preprocess_function(examples):
    inputs = ["summarize: " + text for text in examples["clean_input"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, padding="max_length", truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["clean_output"], max_length=max_target_length, padding="max_length", truncation=True)

    # Replace padding token id with -100 in labels so it's ignored in loss calculation
    labels["input_ids"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in labels_example]
        for labels_example in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing to the datasets
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,  # Remove original columns
    desc="Running tokenizer on dataset",
)

# Let's modify the compute_metrics function to avoid the error
def compute_metrics(eval_preds):
    # Simple placeholder metrics function that doesn't use the tokenizer directly
    # This avoids the error during evaluation
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # Calculate accuracy (simple metric that doesn't use tokenizer)
    # Just to avoid the error during training
    accuracy = np.mean(np.equal(preds.argmax(-1), labels))

    return {"accuracy": accuracy}

# Initialize model
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Data collator - use model instance directly
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    return_tensors="pt"
)

# Define training arguments - disable evaluation for now
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=30,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=1500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    # Disable evaluation during training to avoid the error
    eval_strategy="steps",
    eval_steps=250,
    save_steps=250,
    load_best_model_at_end=False,
    metric_for_best_model="eval_loss" ,
    fp16=True  # Disable mixed precision training for troubleshooting
)

# Initialize trainer - don't use compute_metrics for now
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    # Remove compute_metrics to simplify troubleshooting
)

# Debug: Let's check a small batch through the data collator
print("\nTesting data collator with a small batch:")
sample_batch = {
    key: tokenized_datasets["train"][0:4][key]
    for key in tokenized_datasets["train"][0:4].keys()
}
try:
    # Process a batch through the data collator
    collated_batch = data_collator(sample_batch)
    print("Data collator successful!")
    # Print batch shape info
    for key, value in collated_batch.items():
        print(f"{key}: shape {value.shape}")
except Exception as e:
    print(f"Error in data collator: {e}")

# Train model - with minimal functionality to debug the core issue
print("\nStarting training...")
try:
    trainer.train()
except Exception as e:
    print(f"Error during training: {e}")
    import traceback
    traceback.print_exc()

    # Let's try to identify exactly where the error is happening
    print("\nTrying to debug the specific issue...")

    # Check if the error is happening in the forward pass
    print("Testing model forward pass:")
    try:
        # Get a single batch
        batch = next(iter(trainer.get_train_dataloader()))
        # Move to the same device as model
        batch = {k: v.to(model.device) for k, v in batch.items()}
        # Run forward pass
        outputs = model(**batch)
        print("Model forward pass successful!")
    except Exception as e:
        print(f"Error in model forward pass: {e}")
        traceback.print_exc()

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Running tokenizer on dataset:   0%|          | 0/1143 [00:00<?, ? examples/s]



Running tokenizer on dataset:   0%|          | 0/286 [00:00<?, ? examples/s]

  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.



Testing data collator with a small batch:
Error in data collator: 0

Starting training...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
250,4.8406,4.417253
500,4.3257,4.047715
750,4.1635,3.91678
1000,4.0404,3.844207
1250,3.9542,3.7929
1500,3.8875,3.748579
1750,3.8161,3.714562
2000,3.7776,3.687241
2250,3.7718,3.665784
2500,3.6616,3.653906


In [5]:
# Save the model
model_path = "./t5small-finetuned-qmsum"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
print(f"Model saved to {model_path}")

Model saved to ./t5small-finetuned-qmsum


In [6]:
# Load your test CSV dataset
test_path = "./test_set_qsum.csv"
test_dataset = load_dataset("csv", data_files={"test": test_path})["test"]

# Load T5 tokenizer and model
model_path = "./t5small-finetuned-qmsum"  # Your saved model path
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path).to("cuda")

def generate_answer_T5(batch):
    # Tokenize the input text with the T5 prefix
    inputs = ["summarize: " + text for text in batch["clean_input"]]
    inputs_dict = tokenizer(
        inputs,
        padding="max_length",
        truncation=True,
        max_length=512,  # Match the max_length used during training
        return_tensors="pt"
    )
    input_ids = inputs_dict.input_ids.to("cuda")
    attention_mask = inputs_dict.attention_mask.to("cuda")

    # Generate predictions
    with torch.no_grad():
        predicted_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=128,  # Match the target length used during training
            num_beams=4,     # Beam search for better quality summaries
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    # Decode the generated tokens to text
    batch["predicted_output"] = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    return batch

Generating test split: 0 examples [00:00, ? examples/s]

In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Define stopwords and the lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Remove punctuation (keeps alphanumeric and whitespace)
    text = re.sub(r'[^\w\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Tokenize the text into words
    words = text.split()
    # Remove stopwords and lemmatize each word
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    # Join words back into a string
    return " ".join(words)

def clean_dataset(example):
    # Clean both the input and output fields
    example["clean_input"] = clean_text(example["input"])
    example["clean_output"] = clean_text(example["output"])
    return example


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [8]:
print(test_dataset[0])

{'id': 'tr-sq-440', 'pid': 'tr-sq-440_0', 'input': "What did User Interface introduce about the detailed design of the prototype?\nProject Manager: Welcome back.\nIndustrial Designer: I'm sorry to be late.\nProject Manager: Welcome back everybody.\nUser Interface: Yeah. Thanks.\nProject Manager: So this meeting agenda will be the detailed design meeting. And uh opening and uh P_M_s {gap} of the meet minutes, uh prototype presentation from uh Christine and uh Agnes.\nIndustrial Designer: Agnes, yes.\nProject Manager: Yes and uh evaluation criteria. The finance, it's uh from my side, from the management, and uh production evaluation. Then uh closing. So we have forty minutes to discuss and uh finalise and close the product and project and to move further, okay, so {disfmarker} Okay, let's talk about uh maybe first uh for the prototype.\nUser Interface: Mm, okay.\nProject Manager: So I handle to {disfmarker}\nUser Interface: I've done a presentation, but it pretty much covers work that we

In [9]:
# Map the generation function over the test set (batched for efficiency)
print("Generating predictions...")
results_t5 = test_dataset.map(generate_answer_T5, batched=True, batch_size=2)  # Smaller batch size for T5

# Load ROUGE metric
rouge = evaluate.load("rouge")

# Compute ROUGE scores
def postprocess_text(preds, refs):
    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred.strip())) for pred in preds]
    refs = ["\n".join(sent_tokenize(ref.strip())) for ref in refs]
    return preds, refs

# Post-process predictions and references
print("Post-processing text for evaluation...")
processed_preds, processed_refs = postprocess_text(
    results_t5["predicted_output"],
    results_t5["clean_output"]
)

# Calculate ROUGE scores
print("Calculating ROUGE scores...")
rouge_scores = rouge.compute(
    predictions=processed_preds,
    references=processed_refs,
    use_stemmer=True,
    rouge_types=["rouge1", "rouge2", "rougeL"]
)

# Calculate BERTScore
print("Calculating BERTScore...")
# If the dataset is large, you might want to limit the number of examples for BERTScore
# as it can be computationally intensive
max_samples_for_bertscore = 100
if len(processed_preds) > max_samples_for_bertscore:
    print(f"Limiting BERTScore calculation to {max_samples_for_bertscore} samples.")
    indices = np.random.choice(len(processed_preds), max_samples_for_bertscore, replace=False)
    bertscore_preds = [processed_preds[i] for i in indices]
    bertscore_refs = [processed_refs[i] for i in indices]
else:
    bertscore_preds = processed_preds
    bertscore_refs = processed_refs

P, R, F1 = bert_score(bertscore_preds, bertscore_refs, lang='en', rescale_with_baseline=False)
bert_f1 = torch.mean(F1).item()

Generating predictions...


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Post-processing text for evaluation...
Calculating ROUGE scores...
Calculating BERTScore...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Print all scores
print("\n===== T5 Evaluation Results =====")
print("ROUGE Scores:")
for metric, scores in rouge_scores.items():
    print(f"{metric}: {scores:.4f}")

print(f"\nBERTScore F1: {bert_f1:.4f}")

# Save the predictions and scores to a CSV file
import pandas as pd
results_df = pd.DataFrame({
    "input": results_t5["clean_input"],
    "reference": results_t5["clean_output"],
    "prediction": results_t5["predicted_output"]
})

# Add metrics to filename for easy reference
metrics_summary = f"R1_{rouge_scores['rouge1']:.3f}_R2_{rouge_scores['rouge2']:.3f}_RL_{rouge_scores['rougeL']:.3f}_BS_{bert_f1:.3f}"

output_file = f"t5_predictions_{metrics_summary}.csv"
# results_df.to_csv(output_file, index=False)


===== T5 Evaluation Results =====
ROUGE Scores:
rouge1: 0.3113
rouge2: 0.0842
rougeL: 0.2027

BERTScore F1: 0.8500


In [11]:
results_df.input[5]

'summarize the whole meeting project manager vocalsound marketing vocalsound project manager okay good afternoon again user interface vocalsound project manager so we should have our final meeting about the detail designed of disfmarker detail design of the product of the remote control um vocalsound so here is the agenda for today uh uh just going to go quickly through the minutes of the last last uh meeting then we have a p presentation of prototype of you two sounds interesting and well have um vocalsound presentation of evaluation crit criteria by ou our marketing experts then well have to go through finance evaluation of the of the cost of the thing and um hopefully uh we should fit the target o tw of twelve point five uh uh euro vocalsound okay so lets go uh if i go quickly through the minutes of the last meeting vocalsound so we went through th uh w we took this following decisions no lcd no speech recognition technology okay we went through a b to a banana look and feel for the

In [12]:
results_df.reference[5]

'project manager started meeting on the detailed design of the remote control user interface and industrial designer gave a presentation on the prototype with a banana leaf base station and a handsized remote with two scroll wheels the turbo button the teletext button the calling button the rechargeablebatteries power source and the speaker regular chip next the group evaluated based on marketings list of user requirement criteria then groupmates discussed the cost estimation including the component production cost they agreed to use plastic instead of rubber material for the dual chips also they agreed not to use lcd and no button supplements to avoid over budget lastly the group discussed the project process the group agreed they had creativity while user interface suggested that they should find out more target markets besides project manager suggested doing a street survey also user interface suggested that they could simplify the interface and the circuit board'

In [13]:
results_df.prediction[5]

'the meeting was about the detail design of the remote control user interface proposed to design a yellow banana with wheels and buttons the team decided to use lcd instead of speech recognition technology to make the product more attractive they also discussed the shape and shape of their product'