# This is a Jupyter Notebook


Installs

In [3]:
!pip install -U datasets --quiet
!pip install ipywidgets --quiet
!pip install py7zr --quiet
!pip install transformers --quiet
!pip install torch --quiet
!pip install rouge-score --quiet
!pip install hf_xet --quiet
!pip install -U datasets transformers torch rouge-score --quiet
!pip install bitsandbytes accelerate peft --quiet


[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new 

Importing dataset

In [4]:
from datasets import load_dataset

dataset = load_dataset("cnn_dailymail", "3.0.0")
# Taking a small subset for quick training
small_train_dataset = dataset["train"].shuffle(seed=42).select(range(2000))
small_eval_dataset = dataset["validation"].shuffle(seed=42).select(range(200))

train_dataset = small_train_dataset

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

Tokenizer

In [5]:
import torch
import gc
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig, default_data_collator
from datasets import load_dataset, DatasetDict
from torch.utils.data import DataLoader
from torch.optim import AdamW

def run_optimized_summarization():
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float32
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_id = "t5-small"

    MAX_ARTICLE_LENGTH = 768
    MAX_SUMMARY_LENGTH = 128
    TRAIN_SIZE = 500
    BATCH_SIZE = 4

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    print("Loading T5-small model with 4-bit Quantization...")
    try:
        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_id,
            quantization_config=bnb_config,
            device_map="auto"
        )
    except Exception:
        model = AutoModelForSeq2SeqLM.from_pretrained(model_id).to(device)
        print("Warning: Quantization failed. Loading full model.")

    raw_datasets = load_dataset("cnn_dailymail", "3.0.0")

    raw_train = raw_datasets["train"].shuffle(seed=42).select(range(TRAIN_SIZE))
    raw_validation = raw_datasets["validation"].shuffle(seed=42).select(range(50))

    raw_small_datasets = DatasetDict({"train": raw_train, "validation": raw_validation})

    def preprocess_function(examples):
        inputs = [f"summarize: {article}" for article in examples["article"]]

        model_inputs = tokenizer(
            inputs, max_length=MAX_ARTICLE_LENGTH, truncation=True, padding="max_length"
        )

        with tokenizer.as_target_tokenizer():
            labels = tokenizer(
                examples["highlights"], max_length=MAX_SUMMARY_LENGTH, truncation=True, padding="max_length"
            )

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    print("Tokenizing dataset...")
    tokenized_datasets = raw_small_datasets.map(
        preprocess_function,
        batched=True,
        remove_columns=["article", "highlights", "id"],
        num_proc=1
    )

    small_tokenized_train_dataset = tokenized_datasets["train"]
    small_tokenized_eval_dataset = tokenized_datasets["validation"]

    print(f"Train Dataset size: {len(small_tokenized_train_dataset)} examples")
    print(f"Validation Dataset size: {len(small_tokenized_eval_dataset)} examples")

    del raw_datasets, raw_train, raw_validation, raw_small_datasets, tokenized_datasets
    gc.collect()

    train_dataloader = DataLoader(
        small_tokenized_train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=default_data_collator
    )

    print("\nData is ready for training.")
    return model, tokenizer, small_tokenized_train_dataset, train_dataloader

model, tokenizer, small_tokenized_train_dataset, train_dataloader = run_optimized_summarization()

print(f"\nDataLoader created with {len(train_dataloader)} batches.")

Loading T5-small model with 4-bit Quantization...
Tokenizing dataset...
Train Dataset size: 500 examples
Validation Dataset size: 50 examples

Data is ready for training.

DataLoader created with 125 batches.


Loading data

In [6]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

train_dataset = small_tokenized_train_dataset

train_dataloader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=default_data_collator
)

print(f"DataLoader created with {len(train_dataloader)} batches.")

DataLoader created with 125 batches.


Tuning

In [7]:
import torch
from torch.optim import AdamW

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Diagnostic Check
print(f"Device being used: {device}")

# Optimizer
optimizer = AdamW(model.parameters(), lr=1e-6)

# Training loop
num_epochs = 3
print(f"Starting training on {len(train_dataloader)} batches ({len(train_dataset)} examples).")

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    # Using enumerate for tracking progress
    for step, batch in enumerate(train_dataloader):

        inputs = {
            "input_ids": batch["input_ids"].to(device),
            "attention_mask": batch["attention_mask"].to(device),
            "labels": batch["labels"].to(device)
        }

        # Forward pass
        outputs = model(**inputs)
        loss = outputs.loss

        # Backpropagation
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

        # Printing loss update every 100 steps
        if (step + 1) % 100 == 0:
            print(f"  Epoch {epoch+1}/{num_epochs}, Step {step+1}/{len(train_dataloader)}, Loss: {loss.item():.4f}")

    avg_loss = total_loss / len(train_dataloader)
    print(f"\n--- Epoch {epoch+1}/{num_epochs} finished. Average Loss: {avg_loss:.4f} ---")

Device being used: cpu
Starting training on 125 batches (500 examples).


KeyboardInterrupt: 

Generate Summary

In [1]:
def generate_summary(article):
    # Preparing the input article
    inputs = tokenizer(article, return_tensors="pt", max_length=1024, truncation=True, padding=True)
    inputs = inputs.to(device)

    summary_ids = model.generate(
        inputs['input_ids'],
        min_length=50,
        max_length=200,

        num_beams=4,
        length_penalty=2.0,
        early_stopping=True
    )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Testing
sample_article = """
The annual Galactic Peace Summit, held this year on the orbital station Serenity Prime,
concluded late Tuesday evening with a landmark agreement on interstellar trade routes.
Representatives from the Federation of Planets and the independent merchant guilds spent
three days locked in intense negotiations. The main point of contention was the taxation
of highly valued crystalline resources mined from the Proxima Centauri system's asteroid belt.
The final treaty establishes a zero-tariff policy for all humanitarian aid shipments and a
graduated tax system for commercial goods, which is expected to boost cross-sector economic
activity by 25% in the next fiscal cycle. Dr. Elara Vance, lead negotiator for the Federation,
praised the outcome, calling it "a new era of cooperation and shared prosperity." The agreement
is set to be ratified by all major planetary councils within the next two weeks and will take
effect immediately thereafter. Security forces reported no incidents, making this one of the
smoothest summits in history.
"""

summary = generate_summary(sample_article)
print(f"--- Article ---")
print(f"{sample_article.strip()}")
print(f"\n--- Generated Summary ---")
print(f"{summary}")

NameError: name 'tokenizer' is not defined

Rouge score

In [8]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def evaluate_summary(reference, generated):
    scores = scorer.score(reference, generated)
    return scores

# Example evaluation
reference_summary = dataset['test'][0]['highlights']
generated_summary = generate_summary(sample_article)

scores = evaluate_summary(reference_summary, generated_summary)
print(scores)

{'rouge1': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rougeL': Score(precision=0.0, recall=0.0, fmeasure=0.0)}
