In [None]:
from datasets import load_dataset

# Fetch the XSum summarization corpus; the splits are indexed by name below.
XSUM_DATASET_ID = "EdinburghNLP/xsum"
ds = load_dataset(XSUM_DATASET_ID)

In [None]:
# Work on fixed-size prefixes of the train / validation splits to bound run time.
TRAIN_SUBSET_SIZE = 100000
EVAL_SUBSET_SIZE = 5000

train_indices = range(TRAIN_SUBSET_SIZE)
eval_indices = range(EVAL_SUBSET_SIZE)

small_train_dataset = ds["train"].select(train_indices)
small_eval_dataset = ds["validation"].select(eval_indices)

In [None]:
import wandb

In [None]:
# SECURITY: never hard-code the Weights & Biases API key in the notebook --
# anything committed here is leaked to every reader of the file. A key that was
# ever committed should be revoked and regenerated.
# Called without `key`, wandb.login() resolves credentials from the
# WANDB_API_KEY environment variable or previously stored credentials, and
# otherwise prompts interactively.
wandb.login()

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Starting checkpoint for fine-tuning: tokenizer and seq2seq model share one id.
BASE_CHECKPOINT = "hchang/t5-small-finetuned-xsum"

tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(BASE_CHECKPOINT)

def preprocess_function(examples, max_input_length=1024, max_target_length=128):
    """
    Tokenize a batch of XSum examples for seq2seq training.

    Args:
        examples: Batch mapping with a "document" list (model inputs) and a
            "summary" list (targets), as passed by `Dataset.map(batched=True)`.
        max_input_length: Documents are truncated to at most this many tokens.
        max_target_length: Summaries are truncated to at most this many tokens.

    Returns:
        The tokenizer output for the documents with an extra "labels" entry
        holding the token ids of the summaries.

    Notes:
        Sequences are deliberately NOT padded here. Padding labels to a fixed
        length stores pad-token ids in "labels", which the loss then scores as
        real targets; padding every document to the maximum length also wastes
        memory and compute. Padding is left to the batch collator
        (DataCollatorForSeq2Seq), which pads per batch and masks label padding.
    """
    model_inputs = tokenizer(
        examples["document"],
        max_length=max_input_length,
        truncation=True,
    )

    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize both subsets with the same batched preprocessing step.
map_options = {"batched": True}
train_tokenized_datasets = small_train_dataset.map(preprocess_function, **map_options)
eval_tokenized_datasets = small_eval_dataset.map(preprocess_function, **map_options)

In [None]:
from transformers import Seq2SeqTrainingArguments

# Hyper-parameters for the fine-tuning run, collected in one place and then
# handed to Seq2SeqTrainingArguments unchanged.
training_config = dict(
    # Where things are written
    output_dir="./results",            # model checkpoints
    logging_dir="./logs",              # log files
    save_total_limit=3,                # keep at most 3 checkpoints
    # Optimisation
    learning_rate=1e-5,                # initial learning rate
    weight_decay=0.01,                 # optimizer weight decay
    num_train_epochs=2,                # passes over the training subset
    per_device_train_batch_size=4,     # per-device training batch size
    # Evaluation
    evaluation_strategy="epoch",       # evaluate at the end of each epoch
    per_device_eval_batch_size=4,      # per-device evaluation batch size
    predict_with_generate=True,        # evaluate via the model's `generate`
    # Logging cadence
    logging_steps=10,                  # log every 10 steps
)

training_args = Seq2SeqTrainingArguments(**training_config)

In [None]:
from transformers import Seq2SeqTrainer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq

# Collator that assembles tokenized examples into batches for this model.
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

# Wire model, arguments, data and collator into a single trainer object.
trainer_components = {
    "model": model,
    "args": training_args,
    "train_dataset": train_tokenized_datasets,
    "eval_dataset": eval_tokenized_datasets,
    "tokenizer": tokenizer,
    "data_collator": data_collator,
}
trainer = Seq2SeqTrainer(**trainer_components)

In [None]:
# Run fine-tuning with the trainer configured in the previous cell; the returned
# training summary is displayed as the cell output.
trainer.train()

In [None]:
model.save_pretrained("hchang/t5-small-finetuned-xsum")
tokenizer.save_pretrained("hchang/t5-small-finetuned-xsum")

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


tokenizer_finetuned = AutoTokenizer.from_pretrained("hchang/t5-small-finetuned-xsum")
model_finetuned = AutoModelForSeq2SeqLM.from_pretrained("hchang/t5-small-finetuned-xsum")

In [None]:
#tokenizer_pretrained = AutoTokenizer.from_pretrained("google/flan-t5-small")
#model_pretrained = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Baseline for comparison: the checkpoint as published, before this notebook's training.
PRETRAINED_CHECKPOINT = "hchang/t5-small-finetuned-xsum"

tokenizer_pretrained = AutoTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model_pretrained = AutoModelForSeq2SeqLM.from_pretrained(PRETRAINED_CHECKPOINT)

In [None]:
# Sample news article (the 2016 Balmoral Show) used as input for the one-off
# generation checks that follow.
text = '''The three-day extravaganza of farming, food and family fun celebrates many aspects of agricultural life. 
The Balmoral Show is run by the Royal Ulster Agricultural Society (RUAS) and dates back 148 years. 
Last year, it attracted more than 90,000 visitors to its recently-adopted home outside Lisburn in County Antrim. 
It was traditionally staged at the RUAS's headquarters in south Belfast, but the show moved to a larger venue on the site of the former Maze prison in 2013. 
The Maze venue, re-named Balmoral Park, is now hosting the show for the fourth consecutive year. 
The 2016 event coincides with Northern Ireland's Year of Food and Drink, and local produce features prominently in the exhibitions. 
One of this year's highlights is an "edible garden", in which visitors can see their food growing in the ground before it gets to their plates. 
The aim of the garden is to encourage people to grow their own food at home. The event will also showcase the best of local livestock, with prized pigs, cattle, poultry and ponies all lining up in bid to be the stars of the show. 
Their owners will also get a chance to shine, with horse riding and show jumping displays along with sheep shearing competitions and awards for the best livestock breeders and handlers. 
For younger visitors, there is a family fun area hosting displays from the Northern Ireland School of Falconry as well as a gun dog skills demonstration and a performance from balloon artist Bruce Airhead. 
BBC News NI are covering the event live on social media on Wednesday on Twitter at @BBCNewsNI, on Snapchat at bbcnewsni, and on BBC Newsline's Facebook page.'''
# Summarize the sample article with the fine-tuned model (default generation settings).
encoded_finetuned = tokenizer_finetuned(text, return_tensors="pt")
inputs = encoded_finetuned.input_ids
outputs = model_finetuned.generate(inputs)
summary_finetuned = tokenizer_finetuned.decode(outputs[0], skip_special_tokens=True)
print("generate: ", summary_finetuned)

In [None]:
# Summarize the same article with the baseline model (default generation settings).
encoded_pretrained = tokenizer_pretrained(text, return_tensors="pt")
inputs = encoded_pretrained.input_ids
outputs = model_pretrained.generate(inputs)
summary_pretrained = tokenizer_pretrained.decode(outputs[0], skip_special_tokens=True)
print("generate: ", summary_pretrained)

In [None]:
pip install rouge-score

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from rouge_score import rouge_scorer
import torch

def validate_model(model, tokenizer, dataset, max_input_length=512, max_target_length=128):
    """
    Score a summarization model on a dataset with averaged ROUGE F-measures.

    Each example's 'document' is summarized with beam search (4 beams, early
    stopping) and the decoded output is compared with the example's 'summary'.

    Args:
        model: The summarization model (pre-finetuned or fine-tuned).
        tokenizer: The tokenizer associated with the model.
        dataset: Iterable of examples with 'document' and 'summary' fields.
        max_input_length: Documents are truncated to at most this many tokens.
        max_target_length: Maximum length passed to `generate` for summaries.

    Returns:
        A dictionary with the keys "rouge-1", "rouge-2" and "rouge-L", each the
        mean F-measure over all examples.

    Raises:
        ValueError: If `dataset` contains no examples (there is nothing to
            average; previously this surfaced as a ZeroDivisionError).
    """
    model.eval()  # Set the model to evaluation mode
    predictions = []
    references = []

    with torch.no_grad():
        for example in dataset:
            # One document at a time, so no padding is needed. Padding a single
            # sequence to the maximum length only adds pad tokens the model then
            # has to process.
            encoded = tokenizer(
                example["document"],
                max_length=max_input_length,
                truncation=True,
                return_tensors="pt",
            ).to(model.device)

            # Pass the attention mask explicitly instead of relying on
            # `generate` to infer it from the input ids.
            outputs = model.generate(
                input_ids=encoded["input_ids"],
                attention_mask=encoded["attention_mask"],
                max_length=max_target_length,
                num_beams=4,
                early_stopping=True,
            )

            predictions.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
            references.append(example["summary"])

    if not predictions:
        raise ValueError("validate_model() needs at least one example; the dataset is empty.")

    # Accumulate per-example F-measures, then average.
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    totals = {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0}
    for pred, ref in zip(predictions, references):
        scores = scorer.score(ref, pred)  # signature is score(target, prediction)
        for metric in totals:
            totals[metric] += scores[metric].fmeasure

    count = len(predictions)
    return {
        "rouge-1": totals["rouge1"] / count,
        "rouge-2": totals["rouge2"] / count,
        "rouge-L": totals["rougeL"] / count,
    }

In [None]:
# Score the baseline model on the first 1000 validation examples.
VALIDATION_SAMPLE_SIZE = 1000
validation_data = ds["validation"].select(range(VALIDATION_SAMPLE_SIZE))

metrics = validate_model(
    model=model_pretrained,
    tokenizer=tokenizer_pretrained,
    dataset=validation_data,
)
print("Validation Metrics:", metrics)

In [None]:
#metrics_2 = validate_model(model_finetuned, tokenizer_finetuned, validation_data)
#print("Validation Metrics:", metrics_2)