In [1]:
import re
import nltk
import numpy as np
import unicodedata
import pandas as pd
from bs4 import BeautifulSoup
from typing import List, Tuple
from nltk.tokenize import sent_tokenize
from datasets import load_dataset, concatenate_datasets, Dataset, DatasetDict


  from .autonotebook import tqdm as notebook_tqdm


Import Datasets

In [2]:
ds1 = load_dataset("ccdv/govreport-summarization")
ds2 = load_dataset("FiscalNote/billsum")

# GovReport splits (report, summary)
gov_train = ds1["train"].rename_columns({"report": "text"})
gov_val   = ds1["validation"].rename_columns({"report": "text"})
gov_test  = ds1["test"].rename_columns({"report": "text"})

# BillSum splits (text, summary, title)
bill_train = ds2["train"]
bill_test  = ds2["test"]
bill_ca_test = ds2["ca_test"]  # treat as out of domain test for model generalisation


Data Cleaning

In [3]:
# remove boilerplate for text

header_irrelevant = [
    r"^Page\s+\d+(\s+of\s+\d+)?\s*$", # pages e.g. Page 3
    r"^\d+\s*$", # numbers e.g. 13
    r"^–\s*\d+\s*–$", # e.g. - 12 -
    r"^\s*U\.S\. Government Accountability Office.*$", # common report headers
    r"^\s*Congressional Research Service.*$",
    r"^\s*Congressional Budget Office.*$",
    r"^\s*GAO-\d{2}-\d+\s*$", # report ids
    r"^\s*For Official Use Only\s*$",
    r"^\s*This report was prepared by.*$",
]

toc_irrelevant = [
    r"^\s*Table of Contents\s*$",
    r"\.{2,}\s*\d+\s*$", # dotted lines with page numbers
]

# remove excessive whitespaces, line breaks
def remove_whitespaces(text):
    text = re.sub(r'\r\n', '\n', text)
    text = re.sub(r'[ \t]+', ' ', text) # multiple spaces into single space
    text = re.sub(r'\n{3,}', '\n\n', text) # 3+ consecutive newlines into two to keep paragraph breaks
    return text.strip() # drop leading/trailing whitespaces

def remove_boilerplate(text):
    if not text or not isinstance(text, str):
        return text
    
    # standardize unicode/punctuation for tokenizer consistency
    text = unicodedata.normalize("NFKC", text)

    lines = text.splitlines()
    cleaned = []

    inside_toc = False
    for line in lines:
        line_stripped = line.strip()

        if any(re.match(i, line_stripped) for i in header_irrelevant):
            continue

        if any(re.match(i, line_stripped) for i in toc_irrelevant):
            inside_toc = True
            continue

        if inside_toc:
            # skip TOC lines till normal paragraph appears
            if re.search(r"\.{2,}\s*\d+\s*$", line_stripped):
                continue
            inside_toc = False

        cleaned.append(line)

    cleaned_text = "\n".join(cleaned)
    cleaned_text = remove_whitespaces(cleaned_text)
    # remove rule lines ------ / =====
    cleaned_text = re.sub(r"(?:^|\n)[\-=]{4,}(?:\n|$)", "\n", cleaned_text)
    # remove HTML tags
    tags = re.compile(r"<[^>]+>|&[a-zA-Z]+;")
    if isinstance(cleaned_text, str) and tags.search(cleaned_text):
        cleaned_text = BeautifulSoup(cleaned_text, "html.parser").get_text(separator=" ")

    return cleaned_text

gov_train = gov_train.map(lambda ex: {"clean_text": remove_boilerplate(ex["text"])})
gov_val = gov_val.map(lambda ex: {"clean_text": remove_boilerplate(ex["text"])})
gov_test = gov_test.map(lambda ex: {"clean_text": remove_boilerplate(ex["text"])})

bill_train = bill_train.map(lambda ex: {"clean_text": remove_boilerplate(ex["text"])})
bill_test = bill_test.map(lambda ex: {"clean_text": remove_boilerplate(ex["text"])})
bill_ca_test = bill_ca_test.map(lambda ex: {"clean_text": remove_boilerplate(ex["text"])})


In [4]:
# remove minimal whitespaces for summary
def clean_summary(text):
    if not isinstance(text, str):
        return text
    text = re.sub(r'\s+', ' ', text).strip()
    return text

gov_train = gov_train.map(lambda ex: {"clean_summary": clean_summary(ex["summary"])})
gov_val   = gov_val.map(lambda ex: {"clean_summary": clean_summary(ex["summary"])})
gov_test  = gov_test.map(lambda ex: {"clean_summary": clean_summary(ex["summary"])})

bill_train = bill_train.map(lambda ex: {"clean_summary": clean_summary(ex["summary"])})
bill_test  = bill_test.map(lambda ex: {"clean_summary": clean_summary(ex["summary"])})
bill_ca_test   = bill_ca_test.map(lambda ex: {"clean_summary": clean_summary(ex["summary"])})


Data Preprocessing

In [5]:
import nltk
from nltk.tokenize import sent_tokenize
import re

# Download punkt if not already available
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

def safe_sent_split(text):
    """
    Safely split text into sentences with heading merging
    """
    if not isinstance(text, str) or not text.strip():
        return []
    
    try:
        sents = sent_tokenize(text)
        
        # Merge headings with next sentence
        output = []
        heading = None
        
        for s in sents:
            s = s.strip()
            if not s:
                continue
            
            # Check if this looks like a heading
            if (heading is None and 
                len(s) <= 25 and 
                (s.endswith(":") or re.match(r"^[A-Z][A-Za-z0-9 \-]{0,20}:?$", s))):
                heading = s
                continue
            
            # If we have a heading, merge it with current sentence
            if heading is not None:
                output.append(f"{heading} {s}".strip())
                heading = None
            else:
                output.append(s)
        
        # Add any remaining heading
        if heading is not None:
            output.append(heading)
            
        return output
        
    except Exception as e:
        print(f"Error in sentence splitting: {e}")
        # Fallback: simple split by periods
        return [s.strip() for s in text.split('.') if s.strip()]

# Alternative simpler approach without heading merging
def simple_sent_split(text):
    """Simple sentence splitting without heading logic"""
    if not isinstance(text, str) or not text.strip():
        return []
    
    try:
        return [s.strip() for s in sent_tokenize(text) if s.strip()]
    except:
        return [s.strip() for s in text.split('.') if s.strip()]

# Apply to datasets using the simpler approach
gov_train = gov_train.map(lambda ex: {"sentences": simple_sent_split(ex["clean_text"])}, batched=False)
gov_val = gov_val.map(lambda ex: {"sentences": simple_sent_split(ex["clean_text"])}, batched=False)
gov_test = gov_test.map(lambda ex: {"sentences": simple_sent_split(ex["clean_text"])}, batched=False)

bill_train = bill_train.map(lambda ex: {"sentences": simple_sent_split(ex["clean_text"])}, batched=False)
bill_test = bill_test.map(lambda ex: {"sentences": simple_sent_split(ex["clean_text"])}, batched=False)
bill_ca_test = bill_ca_test.map(lambda ex: {"sentences": simple_sent_split(ex["clean_text"])}, batched=False)

In [6]:
# dataset splits for modeling
split = bill_test.train_test_split(test_size=0.5, seed=42)
bill_val = split["train"]
bill_test = split["test"]

# Combined
comb_train = concatenate_datasets([gov_train, bill_train.select_columns(['text', 'summary', 'clean_text', 'clean_summary', 'sentences'])])
comb_val = concatenate_datasets([gov_val, bill_val.select_columns(['text', 'summary', 'clean_text', 'clean_summary', 'sentences'])])
comb_test = concatenate_datasets([gov_test, bill_test.select_columns(['text', 'summary', 'clean_text', 'clean_summary', 'sentences'])])
# add source column
comb_train = comb_train.add_column("source",["govreport"] * len(gov_train) + ["billsum"] * len(bill_train))
comb_val = comb_val.add_column("source",["govreport"] * len(gov_val) + ["billsum"] * len(bill_val))
comb_test = comb_test.add_column("source",["govreport"] * len(gov_test) + ["billsum"] * len(bill_test))
# shuffle to avoid bias
comb_train = comb_train.shuffle(seed=42)
comb_val = comb_val.shuffle(seed=42)
comb_test = comb_test.shuffle(seed=42)

Feature Engineering

Exploratory Data Analysis

Modeling

In [None]:
# Extractive Baseline: TF-IDF Cosine Similarity
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datasets import Dataset

def extractive_summary(sentences: list, top_n: int = 3) -> str:
    """Return extractive summary using sentence similarity"""
    if not sentences:
        return ""
    if len(sentences) <= top_n:
        return " ".join(sentences)
    
    # TF-IDF vectorization
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(sentences)
    
    # Cosine similarity matrix
    sim_matrix = cosine_similarity(X, X)
    
    # Sentence scores: sum of similarities
    scores = sim_matrix.sum(axis=1)
    
    # Pick top_n sentences
    top_indices = np.argsort(scores)[-top_n:][::-1]
    
    # Return sentences in original order
    summary = [sentences[i] for i in sorted(top_indices)]
    return " ".join(summary)

# Add extractive summaries to dataset
def add_extractive_predictions(dataset: Dataset, top_n: int = 3) -> Dataset:
    summaries = [extractive_summary(ex["sentences"], top_n=top_n) for ex in dataset]
    return dataset.add_column("extractive_summary", summaries)

# Example: test on small subset
comb_val_small = comb_val.select(range(100))  # first 100 for quick run
comb_val_small = add_extractive_predictions(comb_val_small, top_n=3)

# ROUGE Evaluation
import evaluate

rouge = evaluate.load("rouge")

preds = comb_val_small["extractive_summary"]
refs  = comb_val_small["clean_summary"]

results = rouge.compute(predictions=preds, references=refs)
print("Initial Extractive ROUGE Scores:", results)


Initial Extractive ROUGE Scores: {'rouge1': 0.2692639621339274, 'rouge2': 0.12041119417612786, 'rougeL': 0.175222460579046, 'rougeLsum': 0.19015080435007944}


: 

In [None]:
from datasets import Dataset
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, Trainer, TrainingArguments
import torch

#Testing Fine Tuning on easy model - still breaks kernel need GPU
demo_train = comb_train.select(range(500))  
demo_val   = comb_val.select(range(100))    

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "sshleifer/distilbart-cnn-12-6"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

max_input_length = 128
max_target_length = 32 

def preprocess(batch):
    inputs = tokenizer(batch["clean_text"], truncation=True, padding="max_length", max_length=max_input_length)
    targets = tokenizer(batch["clean_summary"], truncation=True, padding="max_length", max_length=max_target_length)
    inputs["labels"] = targets["input_ids"]
    return inputs
