In [1]:
!pip install -q transformers torch scikit-learn pandas pytorch-lightning datasets rouge-score evaluate

In [2]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
from transformers import BartTokenizer, BartForConditionalGeneration, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import evaluate
from nltk.translate.bleu_score import sentence_bleu

In [13]:
class Config:
    # Model
    MODEL_NAME = 'facebook/bart-base'

    # Training
    BATCH_SIZE = 8 if torch.cuda.is_available() else 4  # Smaller batch for CPU
    GRAD_ACCUM_STEPS = 2
    MAX_EPOCHS = 1
    LR = 5e-5
    WARMUP_STEPS = 100

    # Data
    MAX_SOURCE_LEN = 384
    MAX_TARGET_LEN = 96
    VAL_SIZE = 0.1

    # Precision - use mixed precision only if GPU available
    PRECISION = '16-mixed' if torch.cuda.is_available() else '32-true'

    # Generation
    NUM_BEAMS = 4
    NO_REPEAT_NGRAM_SIZE = 3

cfg = Config()

In [14]:
def load_data():
    df = pd.read_csv('/content/samsum-train.csv')
    df['dialogue'] = df['dialogue'].astype(str)
    df['summary'] = df['summary'].astype(str)
    return train_test_split(df, test_size=cfg.VAL_SIZE, random_state=42)

train_df, val_df = load_data()

# Tokenizer
tokenizer = BartTokenizer.from_pretrained(cfg.MODEL_NAME)


In [15]:
class SummaryDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        dialogue = row['dialogue'][-2000:] if len(row['dialogue']) > 2000 else row['dialogue']

        source = self.tokenizer(
            dialogue,
            max_length=cfg.MAX_SOURCE_LEN,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        target = self.tokenizer(
            row['summary'],
            max_length=cfg.MAX_TARGET_LEN,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': source['input_ids'].squeeze(),
            'attention_mask': source['attention_mask'].squeeze(),
            'labels': target['input_ids'].squeeze()
        }


In [16]:
def create_loaders():
    num_workers = 4 if torch.cuda.is_available() else 2  # Fewer workers for CPU
    train_loader = DataLoader(
        SummaryDataset(train_df, tokenizer),
        batch_size=cfg.BATCH_SIZE,
        shuffle=True,
        num_workers=num_workers
    )

    val_loader = DataLoader(
        SummaryDataset(val_df, tokenizer),
        batch_size=cfg.BATCH_SIZE,
        num_workers=num_workers
    )
    return train_loader, val_loader

train_loader, val_loader = create_loaders()

In [17]:
class Summarizer(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = BartForConditionalGeneration.from_pretrained(cfg.MODEL_NAME)

    def forward(self, input_ids, attention_mask, labels):
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

    def training_step(self, batch, batch_idx):
        outputs = self(
            batch['input_ids'],
            batch['attention_mask'],
            batch['labels']
        )
        loss = outputs.loss
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        outputs = self(
            batch['input_ids'],
            batch['attention_mask'],
            batch['labels']
        )
        loss = outputs.loss
        self.log('val_loss', loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=cfg.LR)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=cfg.WARMUP_STEPS,
            num_training_steps=len(train_loader) * cfg.MAX_EPOCHS
        )
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]

In [18]:
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    filename='best-model-{epoch}-{val_loss:.2f}',
    save_top_k=1,
    mode='min'
)

lr_monitor = LearningRateMonitor(logging_interval='step')

In [19]:
trainer = pl.Trainer(
    max_epochs=cfg.MAX_EPOCHS,
    callbacks=[checkpoint_callback, lr_monitor],
    accelerator='auto',
    devices='auto',
    precision=cfg.PRECISION,
    accumulate_grad_batches=cfg.GRAD_ACCUM_STEPS,
    enable_progress_bar=True
)


INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [20]:
model = Summarizer()
trainer.fit(model, train_loader, val_loader)

INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                         | Params | Mode
--------------------------------------------------------------
0 | model | BartForConditionalGeneration | 139 M  | eval
--------------------------------------------------------------
139 M     Trainable params
0         Non-trainable params
139 M     Total params
557.682   Total estimated model params size (MB)
0         Modules in train mode
182       Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


In [21]:
def summarize(text, model, tokenizer):
    inputs = tokenizer(
        text,
        max_length=cfg.MAX_SOURCE_LEN,
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    ).to(model.device)

    summary_ids = model.model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=cfg.MAX_TARGET_LEN,
        num_beams=cfg.NUM_BEAMS,
        no_repeat_ngram_size=cfg.NO_REPEAT_NGRAM_SIZE,
        early_stopping=True
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [22]:
article = """Anna: Hey! Just got back from the dentist.
Mark: Oh no, how did it go?
Anna: Not too bad, just a cavity. He filled it right away.
Mark: That’s good. Were you in pain before?
Anna: Yeah, a little. It started hurting last weekend.
Mark: Oof. At least it’s sorted now.
Anna: Yep, he said I need to floss more though
Mark: Classic dentist advice
Anna: Haha, yeah. Anyway, want to grab coffee later?
Mark: Sure, let’s meet at 5 at the usual spot?"""

# Reference summary for evaluation
reference = "Anna told Mark she had a cavity filled at the dentist and suggested meeting for coffee later."

# Generate summary
summary = summarize(article, model, tokenizer)
print(f"Generated Summary:\n{summary}")

Generated Summary:
Anna had a cavity filled. It started hurting last weekend. She needs to floss more. Mark and Anna will meet at 5 for a coffee.


In [23]:
# Evaluate with ROUGE
rouge = evaluate.load('rouge')
rouge_scores = rouge.compute(predictions=[summary], references=[reference])
print(f"\nROUGE Scores: {rouge_scores}")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]


ROUGE Scores: {'rouge1': np.float64(0.5238095238095238), 'rouge2': np.float64(0.15), 'rougeL': np.float64(0.38095238095238104), 'rougeLsum': np.float64(0.38095238095238104)}
