In [1]:
pip install rouge_score

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# Import Libraries

In [56]:
from transformers import (AutoModelForSeq2SeqLM,
                          AutoTokenizer,
                          DataCollatorForSeq2Seq,
                          Seq2SeqTrainer,
                          Seq2SeqTrainingArguments,
                          EarlyStoppingCallback,
                          GenerationConfig)
from datasets import Dataset, DatasetDict, load_metric
import pandas as pd
import numpy as np
import torch
import nltk

nltk.download("punkt", quiet=True)

# Load metric
metric = load_metric("rouge", trust_remote_code=True)


In [57]:
# Device setup
device = "cuda" if torch.cuda.is_available() else "cpu"

# Parameters
encoder_max_length = 512
decoder_max_length = 128
batch_size = 2

# Prepare Data, Model, and Tokenizer

In [58]:
train = pd.read_csv("./dataset/train.csv")
train_dataset = Dataset.from_pandas(train)

  train = pd.read_csv("./dataset/train.csv")


In [59]:
cheakPoint = "facebook/bart-large-cnn"
model = AutoModelForSeq2SeqLM.from_pretrained(cheakPoint).to(device)
tokenizer = AutoTokenizer.from_pretrained(cheakPoint)

In [60]:
# Split dataset
train_dataset = train_dataset.shuffle(seed=42)
train, val = train_dataset.select(range(400)), train_dataset.select(range(400, 490))
dataset_dict = DatasetDict({"train": train, "validation": val})
dataset_dict = dataset_dict.remove_columns("id")

In [61]:
# Filter out rows with None values
def filter_none_rows(dataset):
    def is_not_none(example):
        return example["article"] is not None and example["highlights"] is not None
    return dataset.filter(is_not_none)

train_filtered = filter_none_rows(train)
val_filtered = filter_none_rows(val)

Filter:   0%|          | 0/400 [00:00<?, ? examples/s]

Filter: 100%|██████████| 400/400 [00:00<00:00, 8305.31 examples/s]
Filter: 100%|██████████| 90/90 [00:00<00:00, 21304.10 examples/s]


# Tokenization Step

In [62]:
# Tokenization Step
def batch_tokenize_preprocess(batch, tokenizer, encoder_max_length, decoder_max_length):
    source = batch["article"]
    target = batch["highlights"]

    # Ensure that source and target are lists of strings
    if isinstance(source, str):
        source = [source]
    if isinstance(target, str):
        target = [target]

    # Tokenize the source and target
    source_tokenized = tokenizer(source, padding="max_length", truncation=True, max_length=encoder_max_length)
    target_tokenized = tokenizer(target, padding="max_length", truncation=True, max_length=decoder_max_length)

    # Ignore padding in the loss
    target_labels = [
        [-100 if token == tokenizer.pad_token_id else token for token in l]
        for l in target_tokenized["input_ids"]
    ]

    # Create a dictionary for the batch
    batch_dict = {
        "input_ids": source_tokenized["input_ids"],
        "attention_mask": source_tokenized["attention_mask"],
        "labels": target_labels,
    }

    return batch_dict

In [89]:
# Apply tokenization to the data
train_data = train_filtered.map(
    lambda batch: batch_tokenize_preprocess(batch, tokenizer, encoder_max_length, decoder_max_length),
    batched=True,
    remove_columns=["id"]
)

validation_data = val_filtered.map(
    lambda batch: batch_tokenize_preprocess(batch, tokenizer, encoder_max_length, decoder_max_length),
    batched=True,
    remove_columns=["id"]
)

# Verify tokenized data
print(train_data.select(range(min(5, train_data.num_rows))))
print(validation_data.select(range(min(5, validation_data.num_rows))))


Map: 100%|██████████| 15/15 [00:00<00:00, 176.36 examples/s]

Map: 100%|██████████| 4/4 [00:00<00:00, 258.13 examples/s]

Dataset({
    features: ['article', 'highlights', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 5
})
Dataset({
    features: ['article', 'highlights', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 4
})





# *Metric func* for compute metrics at evaluation. 

In [90]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # Print shapes for debugging
    print(f"Preds shape: {preds.shape}")
    print(f"Labels shape: {labels.shape}")

    if isinstance(preds, tuple):
        preds = preds[0]

    # Ensure preds is 2D or 3D
    if len(preds.shape) == 2:
        preds = np.argmax(preds, axis=-1)
    elif len(preds.shape) == 3:
        preds = np.argmax(preds, axis=-1)  # Logits might be 3D
    else:
        raise ValueError("Predictions have an unexpected shape.")

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Handle -100 tokens in labels for padding
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result


In [93]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Prepare the module

In [94]:
# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=40,
    save_steps=1e6,
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=3,
    predict_with_generate=True,
    learning_rate=5e-5,
    load_best_model_at_end=True,
)

In [95]:
# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)


# Fine-Tuning step

In [96]:
trainer.train()

100%|██████████| 40/40 [34:01<00:00, 51.05s/it]
 25%|██▌       | 10/40 [03:04<09:01, 18.06s/it]

{'loss': 1.0445, 'grad_norm': 5.121329307556152, 'learning_rate': 1.0000000000000002e-06, 'epoch': 1.25}


 50%|█████     | 20/40 [06:08<06:12, 18.64s/it]

{'loss': 1.0257, 'grad_norm': 10.177864074707031, 'learning_rate': 2.0000000000000003e-06, 'epoch': 2.5}


 75%|███████▌  | 30/40 [09:11<03:08, 18.81s/it]

{'loss': 0.8349, 'grad_norm': 6.7616286277771, 'learning_rate': 3e-06, 'epoch': 3.75}


100%|██████████| 40/40 [12:06<00:00, 16.65s/it]

{'loss': 0.7034, 'grad_norm': 12.537304878234863, 'learning_rate': 4.000000000000001e-06, 'epoch': 5.0}




Preds shape: (4, 142)
Labels shape: (4, 142)



Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'eval_loss': 1.688814640045166, 'eval_rouge1': 0.0, 'eval_rouge2': 0.0, 'eval_rougeL': 0.0, 'eval_rougeLsum': 0.0, 'eval_gen_len': 1.0, 'eval_runtime': 53.1565, 'eval_samples_per_second': 0.075, 'eval_steps_per_second': 0.038, 'epoch': 5.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 40/40 [13:06<00:00, 19.66s/it]

{'train_runtime': 786.5427, 'train_samples_per_second': 0.095, 'train_steps_per_second': 0.051, 'train_loss': 0.902126145362854, 'epoch': 5.0}





TrainOutput(global_step=40, training_loss=0.902126145362854, metrics={'train_runtime': 786.5427, 'train_samples_per_second': 0.095, 'train_steps_per_second': 0.051, 'total_flos': 81266422579200.0, 'train_loss': 0.902126145362854, 'epoch': 5.0})

In [97]:
# Save the model
save_directory = './saved_model'
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('./saved_model\\tokenizer_config.json',
 './saved_model\\special_tokens_map.json',
 './saved_model\\vocab.json',
 './saved_model\\merges.txt',
 './saved_model\\added_tokens.json',
 './saved_model\\tokenizer.json')

# Generate sumary

In [98]:
def generate_summary(test_samples, model, tokenizer, max_length=1024):
    inputs = tokenizer(test_samples, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)

    # Set the decoder_start_token_id if not already set
    if model.config.decoder_start_token_id is None:
        model.config.decoder_start_token_id = tokenizer.pad_token_id or tokenizer.bos_token_id

    # Generate summary
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        num_beams=4,
        early_stopping=True
    )

    # Decode the generated summaries
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return output_str

In [101]:
# Example usage
sample = "Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? This week, a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans. 'In a world where animals have more rights to space and food than humans,' said Charlie Leocha, consumer representative on the committee. 'It is time that the DOT and FAA take a stand for humane treatment of passengers.' But could crowding on planes lead to more serious issues than fighting for space in the overhead lockers, crashing elbows and seat back kicking? Tests conducted by the FAA use planes with a 31 inch pitch, a standard which on some airlines has decreased . Many economy seats on United Airlines have 30 inches of room, while some airlines offer as little as 28 inches . Cynthia Corbertt, a human factors researcher with the Federal Aviation Administration, that it conducts tests on how quickly passengers can leave a plane. But these tests are conducted using planes with 31 inches between each row of seats, a standard which on some airlines has decreased, reported the Detroit News. The distance between two seats from one point on a seat to the same point on the seat behind it is known as the pitch. While most airlines stick to a pitch of 31 inches or above, some fall below this. While United Airlines has 30 inches of space, Gulf Air economy seats have between 29 and 32 inches, Air Asia offers 29 inches and Spirit Airlines offers just 28 inches. British Airways has a seat pitch of 31 inches, while easyJet has 29 inches, Thomson's short haul seat pitch is 28 inches, and Virgin Atlantic's is 30-31."

res = generate_summary(sample, model, tokenizer, max_length=110)
print(res)

["Shrinking space on planes is putting our health and safety in danger, say experts. Consumer advisory group set up by Department of Transportation say it is putting passengers at risk. They say shrinking space on aeroplanes is not only uncomfortable - it's putting passengers' health in danger."]
