In [1]:
#%pip install --upgrade accelerate
#%pip uninstall -y transformers accelerate
#%pip install transformers accelerate
#%pip install datasets nltk tqdm
#%pip install sentencepiece
#%pip install py7zr
#%pip install matplotlib
#%pip install absl-py rouge-score
#%pip install -U flash-attn --no-build-isolation

In [2]:
#from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk,load_metric
import matplotlib.pyplot as plt
import pandas as pd
from tqdm.auto import tqdm
from transformers import BartForConditionalGeneration, BartTokenizer,AdamW, get_scheduler, DataCollatorForSeq2Seq, trainer
import torch
from torch.utils.data import DataLoader
import nltk
from nltk.tokenize import sent_tokenize


#nltk.download("punkt")

In [3]:
#!nvidia-smi

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [5]:
model_ckpt = "facebook/bart-base"

tokenizer = BartTokenizer.from_pretrained(model_ckpt)

curr_model = BartForConditionalGeneration.from_pretrained(model_ckpt).to(device)

In [6]:
dataset_samsum = load_dataset('samsum')
dataset_samsum

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [7]:
split_lengths = [len(dataset_samsum[split])for split in dataset_samsum]

print(f"Split lengths: {split_lengths}")
print(f"Features: {dataset_samsum['train'].column_names}")
print("\nDialogue:")

print(dataset_samsum["test"][1]["dialogue"])

print("\nSummary:")

print(dataset_samsum["test"][1]["summary"])

Split lengths: [14732, 819, 818]
Features: ['id', 'dialogue', 'summary']

Dialogue:
Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some of his stand-ups on youtube.
Eric: Gr8! I'll watch them now!
Rob: Me too!
Eric: MACHINE!
Rob: MACHINE!
Eric: TTYL?
Rob: Sure :)

Summary:
Eric and Rob are going to watch a stand-up on youtube.


In [8]:
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch['dialogue'], max_length=1024, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch['summary'], max_length=128, truncation=True, padding="max_length")

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }

In [9]:
dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features, batched=True, remove_columns=["dialogue", "summary", "id"])

# Training

In [10]:
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=curr_model)

In [11]:
train_dataloader = DataLoader(
    dataset_samsum_pt["train"], shuffle=True, batch_size=2, collate_fn=seq2seq_data_collator
)
eval_dataloader = DataLoader(
    dataset_samsum_pt["validation"], batch_size=2, collate_fn=seq2seq_data_collator
)
test_dataloader = DataLoader(
    dataset_samsum_pt["test"], batch_size=2, collate_fn=seq2seq_data_collator
)

In [12]:
for batch in train_dataloader:
    break
print({k: v.shape for k, v in batch.items()})

{'input_ids': torch.Size([2, 1024]), 'attention_mask': torch.Size([2, 1024]), 'labels': torch.Size([2, 128]), 'decoder_input_ids': torch.Size([2, 128])}


In [13]:
optimizer = AdamW(curr_model.parameters(), lr=7e-5, weight_decay=0.01)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=100,
    num_training_steps=num_training_steps
)



In [14]:
progress_bar = tqdm(range(num_training_steps))

curr_model.train()
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    epoch_loss = 0
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = curr_model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

        epoch_loss += loss.item()

    avg_epoch_loss = epoch_loss / len(train_dataloader)
    print(f"Average training loss for epoch {epoch + 1}: {avg_epoch_loss:.4f}")

    # Validation loop
    curr_model.eval()
    val_loss = 0
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = curr_model(**batch)
            loss = outputs.loss
            val_loss += loss.item()

    avg_val_loss = val_loss / len(eval_dataloader)
    print(f"Validation loss after epoch {epoch + 1}: {avg_val_loss:.4f}")

    # Switch back to training mode
    curr_model.train()

  0%|          | 0/22098 [00:00<?, ?it/s]

Epoch 1/3
Average training loss for epoch 1: 0.5214
Validation loss after epoch 1: 0.3697
Epoch 2/3
Average training loss for epoch 2: 0.3309
Validation loss after epoch 2: 0.3450
Epoch 3/3
Average training loss for epoch 3: 0.2482
Validation loss after epoch 3: 0.3381


# Evaluation

In [15]:
def calculate_metric_on_test_ds(test_ds, metric, model, tokenizer, batch_size=2, column_text='dialogue', column_summary='summary'):
    model.eval()
    model.to(device)
    
    for i in tqdm(range(0, len(test_ds), batch_size)):
        batch = test_ds[i:i + batch_size]
        inputs = tokenizer([example[column_text] for example in batch], return_tensors='pt', truncation=True, padding='max_length', max_length=1024).to(device)
        summaries = model.generate(inputs['input_ids'])
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, clean_up_tokenization_spaces=False) for s in summaries]
        decoded_labels = [example[column_summary] for example in batch]

        metric.add_batch(predictions=decoded_summaries, references=decoded_labels)
    
    score = metric.compute()
    return score

In [16]:
rouge_metric = load_metric("rouge")

# Column names
column_text = 'dialogue'
column_summary = 'summary'

  rouge_metric = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [17]:
test_subset = list(dataset_samsum['test'])#[:]

In [None]:
score = calculate_metric_on_test_ds(
    test_subset, rouge_metric, curr_model, tokenizer, batch_size=2, column_text=column_text, column_summary=column_summary
)

  0%|          | 0/410 [00:00<?, ?it/s]



In [None]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}

# Create a DataFrame
rouge_df = pd.DataFrame(rouge_dict, index=['BART'])
print(rouge_df)

## Save model

In [None]:
curr_model.save_pretrained("bart-samsum-model-wd")

## Save model

# Load

In [None]:
tokenizer = BartTokenizer.from_pretrained("/content/tokenizer")

# Prediction

In [None]:
gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 128}
sample_text = dataset_samsum["test"][0]["dialogue"]

reference = dataset_samsum["test"][0]["summary"]

pipe = pipeline("summarization", model="bart-samsum-model",tokenizer=tokenizer)

print("Dialogue:") print(sample_text)

print("\nReference Summary:") print(reference)

print("\nModel Summary:") print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])