In [14]:
import re
import nltk
import numpy as np
import unicodedata
import pandas as pd
from bs4 import BeautifulSoup
from typing import List, Tuple
from nltk.tokenize import sent_tokenize
from datasets import load_dataset, concatenate_datasets, Dataset, DatasetDict


Import Datasets

In [18]:
ds1 = load_dataset("ccdv/govreport-summarization")
ds2 = load_dataset("FiscalNote/billsum")

# GovReport splits: expose the "report" column as "text" so the document
# column carries the same name as in BillSum (text, summary, title)
gov_train, gov_val, gov_test = (
    ds1[split_name].rename_columns({"report": "text"})
    for split_name in ("train", "validation", "test")
)

# BillSum splits are taken as loaded; ca_test is treated as an
# out of domain test for model generalisation
bill_train, bill_test, bill_ca_test = (
    ds2[split_name] for split_name in ("train", "test", "ca_test")
)


Data Cleaning

In [19]:
# remove boilerplate for text

# Regexes for whole lines that carry no document content (page furniture,
# agency banners, report ids). Every pattern is anchored with ^ and is meant
# to be tested against a single, already stripped line.
header_irrelevant = [
    r"^Page\s+\d+(\s+of\s+\d+)?\s*$", # pages e.g. Page 3
    r"^\d+\s*$", # numbers e.g. 13
    # page markers such as "- 12 -"; NFKC normalisation leaves en/em dashes
    # untouched, so the ASCII hyphen and both dashes are accepted explicitly
    r"^[-–—]\s*\d+\s*[-–—]$",
    r"^\s*U\.S\. Government Accountability Office.*$", # common report headers
    r"^\s*Congressional Research Service.*$",
    r"^\s*Congressional Budget Office.*$",
    r"^\s*GAO-\d{2}-\d+\s*$", # report ids
    r"^\s*For Official Use Only\s*$",
    r"^\s*This report was prepared by.*$",
]

# Regexes that identify table-of-contents material.
toc_irrelevant = [
    r"^\s*Table of Contents\s*$",
    r"\.{2,}\s*\d+\s*$", # dotted lines with page numbers (anchored at the END of the line only)
]

# remove excessive whitespaces, line breaks
def remove_whitespaces(text):
    """Collapse runs of spaces/tabs and surplus blank lines in ``text``.

    Steps, in order:
      1. normalise ``\\r\\n`` and lone ``\\r`` line endings to ``\\n``;
      2. squeeze each run of spaces/tabs into a single space;
      3. drop the space left in front of a newline, so whitespace-only lines
         become truly empty and cannot hide a run of blank lines;
      4. reduce three or more consecutive newlines to two, which keeps
         paragraph breaks;
      5. strip leading/trailing whitespace.

    Non-string input is returned unchanged.
    """
    if not isinstance(text, str):
        return text
    text = re.sub(r'\r\n?', '\n', text) # CRLF and bare CR both become LF
    text = re.sub(r'[ \t]+', ' ', text) # multiple spaces into single space
    text = re.sub(r' \n', '\n', text) # trailing space / whitespace-only lines
    text = re.sub(r'\n{3,}', '\n\n', text) # 3+ consecutive newlines into two to keep paragraph breaks
    return text.strip() # drop leading/trailing whitespaces

def remove_boilerplate(text):
    """Remove report boilerplate, TOC entries, rule lines and HTML from ``text``.

    Falsy or non-string input is returned unchanged. Otherwise the text is:
      1. NFKC-normalised;
      2. filtered line by line, dropping lines that match a pattern in
         ``header_irrelevant`` or ``toc_irrelevant`` and lines made only of
         four or more ``-`` / ``=`` characters;
      3. passed through BeautifulSoup (``lxml`` parser) when it appears to
         contain tags or named entities, then NFKC-normalised again so that
         characters produced by entity decoding are normalised too;
      4. whitespace-normalised with ``remove_whitespaces`` as the final step,
         so gaps left by the removals above are collapsed.

    Returns the cleaned string.
    """
    if not text or not isinstance(text, str):
        return text

    # standardize unicode/punctuation for tokenizer consistency
    text = unicodedata.normalize("NFKC", text)

    kept_lines = []
    for line in text.splitlines():
        line_stripped = line.strip()

        # header patterns are all anchored with ^, so match() is sufficient
        if any(re.match(pattern, line_stripped) for pattern in header_irrelevant):
            continue

        # search(), not match(): the dotted-leader pattern is anchored only at
        # the end of the line, so match() would miss "Title ...... 3".
        # Each TOC line is recognised on its own, so no "inside TOC" state is
        # needed and blank lines inside a TOC cannot end the skipping early.
        if any(re.search(pattern, line_stripped) for pattern in toc_irrelevant):
            continue

        # remove rule lines ------ / ===== (checked per line so consecutive
        # rules and rules with trailing spaces are caught as well)
        if re.fullmatch(r"[\-=]{4,}", line_stripped):
            continue

        kept_lines.append(line)

    cleaned_text = "\n".join(kept_lines)

    # remove HTML tags
    tags = re.compile(r"<[^>]+>|&[a-zA-Z]+;")
    if tags.search(cleaned_text):
        cleaned_text = BeautifulSoup(cleaned_text, "lxml").get_text(separator=" ")
        # decoded entities (e.g. &nbsp;) may introduce compatibility characters
        cleaned_text = unicodedata.normalize("NFKC", cleaned_text)

    # normalise whitespace last so gaps left by every removal step collapse
    return remove_whitespaces(cleaned_text)

# add a "clean_text" column (boilerplate-free "text") to every split
gov_train, gov_val, gov_test, bill_train, bill_test, bill_ca_test = (
    part.map(lambda ex: {"clean_text": remove_boilerplate(ex["text"])})
    for part in (gov_train, gov_val, gov_test, bill_train, bill_test, bill_ca_test)
)


In [None]:
# light cleaning for summaries: whitespace normalisation only
def clean_summary(text):
    """Collapse every whitespace run in ``text`` to one space and trim the ends.

    Non-string input is returned unchanged.
    """
    if isinstance(text, str):
        single_spaced = re.sub(r'\s+', ' ', text)
        return single_spaced.strip()
    return text

# add a "clean_summary" column (whitespace-normalised "summary") to every split
gov_train, gov_val, gov_test, bill_train, bill_test, bill_ca_test = (
    part.map(lambda ex: {"clean_summary": clean_summary(ex["summary"])})
    for part in (gov_train, gov_val, gov_test, bill_train, bill_test, bill_ca_test)
)


Data Preprocessing

In [22]:
# add sentences column for extractive baselines

def sent_split(text):
    """Split ``text`` into sentences, gluing heading-like fragments to what follows.

    Returns ``[]`` for non-string or blank input. Otherwise the text is split
    with ``sent_tokenize``; each fragment is stripped and empty ones are
    dropped. A fragment of at most 25 characters that ends with ":" or
    consists only of a capital letter followed by letters, digits, spaces or
    hyphens is held back as a heading and prefixed (with one space) to the
    next fragment. A heading left pending at the end is emitted on its own.
    """
    if not (isinstance(text, str) and text.strip()):
        return []

    merged = []
    pending_heading = None
    for fragment in sent_tokenize(text):
        sentence = fragment.strip()
        if not sentence:
            continue

        # a held-back heading is always attached to the very next fragment
        if pending_heading is not None:
            merged.append((pending_heading + " " + sentence).strip())
            pending_heading = None
            continue

        if len(sentence) <= 25 and (
            sentence.endswith(":")
            or re.match(r"^[A-Z][A-Za-z0-9 \-]{0,20}$", sentence)
        ):
            pending_heading = sentence
        else:
            merged.append(sentence)

    # trailing heading with nothing after it
    if pending_heading is not None:
        merged.append(pending_heading)
    return merged

# add a "sentences" column (sentence list built from "clean_text") to every split
gov_train, gov_val, gov_test, bill_train, bill_test, bill_ca_test = (
    part.map(lambda ex: {"sentences": sent_split(ex["clean_text"])}, batched=False)
    for part in (gov_train, gov_val, gov_test, bill_train, bill_test, bill_ca_test)
)


Map: 100%|██████████| 17517/17517 [02:12<00:00, 132.55 examples/s]
Map: 100%|██████████| 973/973 [00:05<00:00, 169.57 examples/s]
Map: 100%|██████████| 973/973 [00:05<00:00, 187.35 examples/s]
Map: 100%|██████████| 18949/18949 [00:18<00:00, 998.36 examples/s] 
Map: 100%|██████████| 3269/3269 [00:03<00:00, 967.79 examples/s] 
Map: 100%|██████████| 1237/1237 [00:01<00:00, 923.37 examples/s]


In [None]:
# dataset splits for modeling
# bill_val is carved out of bill_test (50/50, seed 42).
# NOTE: bill_test is rebound here, so running this cell a second time without
# re-running the cells above halves the test split again.
split = bill_test.train_test_split(test_size=0.5, seed=42)
bill_val, bill_test = split["train"], split["test"]

# Combined
_model_columns = ['text', 'summary', 'clean_text', 'clean_summary', 'sentences']
_split_pairs = [(gov_train, bill_train), (gov_val, bill_val), (gov_test, bill_test)]

# GovReport rows first, then the BillSum rows restricted to the shared columns
comb_train, comb_val, comb_test = (
    concatenate_datasets([gov_part, bill_part.select_columns(_model_columns)])
    for gov_part, bill_part in _split_pairs
)
# add source column (labels follow the concatenation order above)
comb_train, comb_val, comb_test = (
    combined.add_column(
        "source",
        ["govreport"] * len(gov_part) + ["billsum"] * len(bill_part),
    )
    for combined, (gov_part, bill_part) in zip((comb_train, comb_val, comb_test), _split_pairs)
)
# shuffle to avoid bias
comb_train, comb_val, comb_test = (
    combined.shuffle(seed=42) for combined in (comb_train, comb_val, comb_test)
)

Flattening the indices: 100%|██████████| 2607/2607 [00:00<00:00, 4962.17 examples/s]
Flattening the indices: 100%|██████████| 2608/2608 [00:00<00:00, 7321.66 examples/s]


Feature Engineering

Exploratory Data Analysis

Modeling