In [1]:
import pandas as pd
import numpy as np
import copy
import os
from collections import Counter
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, precision_recall_fscore_support, f1_score
from transformers import AutoTokenizer
import matplotlib.pyplot as plt

# Locate the cleaned node-level CSV relative to the notebook's working
# directory, load it, and leave the summary statistics as the cell output.
cleaned_dir = os.path.join(os.getcwd(), '..', 'data', 'cleaned')
data_path = os.path.join(cleaned_dir, 'full_data.csv')
df = pd.read_csv(data_path)
df.describe()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,rendering_order,depth,parent_index,text_length,sibling_index,children_count,same_tag_sibling_count,same_text_sibling_count,word_count,letter_ratio,digit_ratio,whitespace_ratio,attribute_count,event_id
count,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0,789.0
mean,665.224674,17.407742,659.378075,26.265557,0.77822,0.140376,1.033285,0.001447,4.020984,0.767786,0.1212,0.07535,0.921129,11.693283
std,528.325947,6.531681,526.280103,43.190741,2.411446,0.520146,3.663405,0.038021,6.543288,0.302293,0.27128,0.061539,1.733493,9.542569
min,19.0,2.0,18.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
25%,314.0,12.0,307.0,8.0,0.0,0.0,0.0,0.0,1.0,0.785714,0.0,0.0,0.0,4.0
50%,514.0,17.0,507.0,16.0,0.0,0.0,0.0,0.0,2.0,0.885714,0.0,0.083333,1.0,8.0
75%,837.5,23.0,833.0,27.0,0.0,0.0,0.0,0.0,4.0,0.941176,0.0,0.117647,1.0,19.0
max,2375.0,30.0,2366.0,559.0,25.0,11.0,25.0,1.0,81.0,1.0,1.0,0.333333,15.0,42.0


In [2]:
# Canonical label -> raw annotation labels that collapse into it.
# The order of the groups (and of the labels inside each group) fixes the
# key order of LABEL_MERGE_MAP below.
_LABEL_GROUPS = {
    "Other": ("Other",),
    "Name": ("Name", "NameLink", "NameLocation"),
    "Date": ("Date", "DateTime"),
    "Time": ("Time", "StartTime", "EndTime", "StartEndTime", "TimeLocation"),
    "Location": ("Location",),
    "Description": ("Description", "Desc", "Details"),
}

# Flat lookup used by merge_labels: raw label -> canonical label.
LABEL_MERGE_MAP = {
    raw: canonical
    for canonical, raws in _LABEL_GROUPS.items()
    for raw in raws
}

def merge_labels(df, label_col="label", mapping=None, default_to_other=True):
    """
    Return a copy of `df` whose `label_col` has been rewritten through `mapping`.

    Labels are converted to `str` before the lookup. A label found in
    `mapping` is replaced by its mapped value; a label that is not found
    becomes "Other" when `default_to_other` is true and is otherwise kept
    (in its string form). The input frame is not modified.
    """
    lookup = mapping or {}

    def _resolve(raw):
        fallback = "Other" if default_to_other else raw
        return lookup.get(raw, fallback)

    merged = df.copy()
    merged[label_col] = merged[label_col].astype(str).map(_resolve)
    return merged

# Collapse fine-grained label variants into the canonical set defined by
# LABEL_MERGE_MAP; labels that are not in the map become "Other" (the
# merge_labels default).
df = merge_labels(df, label_col="label", mapping=LABEL_MERGE_MAP)

In [3]:
# Mark the first DOM node of every labelled event as an event boundary.

# Ensure a stable DOM order: sort each page by rendering_order and give the
# rows a fresh positional index BEFORE anything is derived from row labels.
df = df.sort_values(["source", "rendering_order"]).reset_index(drop=True)

# Build the mask only after the sort/reset_index. A mask taken earlier keeps
# the pre-sort row labels, and `df.loc[mask]` aligns on labels, so it would
# silently select the wrong rows whenever the sort changes the row order.
m = df["event_id"].notna()

# The first node (in rendering order) of each (page, event) pair starts the event.
df["start_event"] = 0
first_idx = df.loc[m].groupby(["source", "event_id"], sort=False).head(1).index
df.loc[first_idx, "start_event"] = 1

print("Pages:", df["source"].nunique())
print("Total nodes:", len(df))
print("start_event positives:", int(df["start_event"].sum()))
print("Label counts:\n", df["label"].value_counts())

# positive rate overall
print("Start_event positive rate:", df["start_event"].mean())

# Sanity check: exactly one boundary per distinct (page, event) pair.
n_unique_events = df.loc[m].drop_duplicates(["source","event_id"]).shape[0]
print("start_event positives:", df["start_event"].sum())
print("unique events:", n_unique_events)
assert int(df["start_event"].sum()) == n_unique_events, "expected one start_event per (source, event_id)"

Pages: 15
Total nodes: 2764
start_event positives: 177
Label counts:
 label
Other          1976
Date            314
Name            150
Time            146
Location        121
Description      57
Name: count, dtype: int64
Start_event positive rate: 0.06403762662807526
start_event positives: 177
unique events: 177


In [4]:
# -------------------------------
# 1) Choose a fixed TEST set of pages (holdout)
# -------------------------------
TEST_N_PAGES = 2

# Shuffle the sorted page ids in place with a fixed seed so the same pages
# are held out on every run.
all_sources = np.array(sorted(df["source"].unique()))
rng = np.random.default_rng(42)
rng.shuffle(all_sources)

# The first TEST_N_PAGES shuffled pages form the holdout; the remaining pages
# are used for cross-validation.
test_sources, cv_sources = set(all_sources[:TEST_N_PAGES]), all_sources[TEST_N_PAGES:]

print("Holdout TEST pages:", len(test_sources), sorted(test_sources))

in_holdout = df["source"].isin(test_sources)
test_df = df[in_holdout].copy()

Holdout TEST pages: 2 [np.str_('members.sacac.org_pattern_labeled'), np.str_('nacacnet.org_pattern_labeled')]


In [5]:
# Report CUDA availability and the CUDA version torch reports, then pick the device.
cuda_ok = torch.cuda.is_available()
print(cuda_ok)
print(torch.version.cuda)
device = torch.device("cuda" if cuda_ok else "cpu")

# Labels: keep as-is. Ids follow the sorted order of the distinct label values.
LABELS = sorted(df["label"].unique().tolist())
label2id = {name: idx for idx, name in enumerate(LABELS)}
id2label = dict(enumerate(LABELS))
OTHER_ID = label2id["Other"]


def _build_vocab(column):
    """Map every distinct stringified value of df[column] to its rank in sorted order."""
    values = sorted(df[column].astype(str).unique().tolist())
    return {value: idx for idx, value in enumerate(values)}


# Vocab for categorical structural features
TAG_VOCAB = _build_vocab("tag")
PARENT_TAG_VOCAB = _build_vocab("parent_tag")

# Structural columns (already in your CSV)
# Numeric per-node features. PageDataset reads them as a float32 array
# (missing values filled with 0) and the model projects them with a linear
# layer whose input width is len(STRUCT_COLS_NUM).
STRUCT_COLS_NUM = [
    "depth","sibling_index","children_count","same_tag_sibling_count",
    "same_text_sibling_count","text_length","word_count",
    "letter_ratio","digit_ratio","whitespace_ratio","attribute_count"
]
# Flag-like per-node features. PageDataset casts them to int and then to
# float32; the model projects them with a linear layer whose input width is
# len(STRUCT_COLS_BOOL).
# NOTE(review): the last three names lack the "has_" infix used by
# "text_has_word_name"/"text_has_word_date" -- presumably they mirror the CSV
# headers exactly; confirm before renaming anything.
STRUCT_COLS_BOOL = [
    "has_link","link_is_absolute","parent_has_link","is_leaf",
    "contains_date","contains_time","starts_with_digit","ends_with_digit",
    "has_class","has_id",
    "attr_has_word_name","attr_has_word_date","attr_has_word_time","attr_has_word_location","attr_has_word_link",
    "text_has_word_name","text_has_word_date","text_word_time","text_word_description","text_word_location"
]

class PageDataset(Dataset):
    """
    Dataset with one item per page (`source`).

    Each item holds the page's nodes in rendering order: token ids and
    attention masks as plain Python lists (no padding, no tensors), the
    field and boundary labels, tag ids, and the structural feature arrays.
    Tokenization happens once, at construction time.
    """
    def __init__(self, df, tokenizer, max_tokens=64):
        self.pages = [
            self._encode_page(page_df, tokenizer, max_tokens)
            for _, page_df in df.groupby("source")
        ]

    @staticmethod
    def _encode_page(page_df, tokenizer, max_tokens):
        """Tokenize one page's nodes and collect its labels and structural features."""
        nodes = page_df.sort_values("rendering_order").reset_index(drop=True)

        # Single tokenizer call for the whole page -> list[list[int]] per key.
        tokens = tokenizer(
            nodes["text_context"].astype(str).tolist(),
            padding=False,
            truncation=True,
            max_length=max_tokens,
            return_attention_mask=True,
            return_tensors=None,
        )

        return {
            "input_ids": tokens["input_ids"],            # list of lists (per node)
            "attention_mask": tokens["attention_mask"],  # list of lists (per node)
            "field_y": [label2id[label] for label in nodes["label"].tolist()],
            "boundary_y": nodes["start_event"].astype(int).tolist(),
            "tag_id": [TAG_VOCAB[str(tag)] for tag in nodes["tag"]],
            "parent_tag_id": [PARENT_TAG_VOCAB[str(tag)] for tag in nodes["parent_tag"]],
            "num_feats": nodes[STRUCT_COLS_NUM].fillna(0).values.astype("float32"),
            "bool_feats": nodes[STRUCT_COLS_BOOL].astype(int).values.astype("float32"),
        }

    def __len__(self):
        return len(self.pages)

    def __getitem__(self, idx):
        return self.pages[idx]



True
12.8


In [6]:

# Hugging Face checkpoint name. It selects this tokenizer and is also passed
# as `text_model_name` to the model in init_model_and_optim, so tokenizer and
# text encoder come from the same checkpoint.
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def collate_fn(batch):
    """
    Combine a list of page dicts (as produced by PageDataset) into one batch.

    Token sequences of all nodes on all pages are flattened and padded
    together by `tokenizer.pad`; labels and structural features are padded
    along the node dimension up to the largest page in the batch.

    Returns a dict with:
      enc            -- padded token tensors for all nodes, flattened across pages
      node_offsets   -- per page, the (start, end) slice of its nodes inside `enc`
      node_mask      -- bool [B, max_nodes], True on real nodes
      field_y        -- long [B, max_nodes], padded with -100
      boundary_y     -- float32 [B, max_nodes], padded with 0
      tag_id, parent_tag_id -- long [B, max_nodes], padded with 0
      num_feats, bool_feats -- float32 [B, max_nodes, F], padded with 0.0
    """
    n_pages = len(batch)
    page_sizes = [len(page["input_ids"]) for page in batch]
    widest = max(page_sizes)

    # Every node of every page becomes one row for the tokenizer to pad.
    flat_nodes = [
        {"input_ids": ids, "attention_mask": mask}
        for page in batch
        for ids, mask in zip(page["input_ids"], page["attention_mask"])
    ]

    # Record which flat rows belong to which page, and which node slots are real.
    node_mask = torch.zeros((n_pages, widest), dtype=torch.bool)
    node_offsets = []
    cursor = 0
    for row, size in enumerate(page_sizes):
        node_mask[row, :size] = True
        node_offsets.append((cursor, cursor + size))
        cursor += size

    # Pad the token dimension for all nodes in a single call.
    enc = tokenizer.pad(flat_nodes, padding=True, return_tensors="pt")

    def pad_nodes(key, fill, dtype, width=None):
        """Stack batch[*][key] into [B, max_nodes(, width)], filling unused slots with `fill`."""
        shape = (n_pages, widest) if width is None else (n_pages, widest, width)
        padded = torch.full(shape, fill, dtype=dtype)
        for row, page in enumerate(batch):
            values = torch.tensor(page[key], dtype=dtype)
            padded[row, : values.shape[0]] = values
        return padded

    return {
        "enc": enc,
        "node_offsets": node_offsets,
        "node_mask": node_mask,
        "field_y": pad_nodes("field_y", -100, torch.long),
        "boundary_y": pad_nodes("boundary_y", 0, torch.float32),
        "tag_id": pad_nodes("tag_id", 0, torch.long),
        "parent_tag_id": pad_nodes("parent_tag_id", 0, torch.long),
        "num_feats": pad_nodes("num_feats", 0.0, torch.float32, width=len(STRUCT_COLS_NUM)),
        "bool_feats": pad_nodes("bool_feats", 0.0, torch.float32, width=len(STRUCT_COLS_BOOL)),
    }

In [7]:
class DOMAwareEventExtractor(nn.Module):
    """
    Node-level model with two heads.

    Each node's text is encoded by a pretrained text encoder (the hidden
    state at token position 0 is used), projected to `d_model`, and summed
    with tag / parent-tag embeddings and linear projections of the numeric
    and boolean structural features. After a LayerNorm, a Transformer
    encoder mixes the nodes of a page; padded node slots are masked out.
    Two linear heads then emit per-node field logits and a per-node
    boundary logit.
    """
    def __init__(
        self,
        text_model_name: str,
        num_field_labels: int,
        tag_vocab_size: int,
        parent_tag_vocab_size: int,
        d_model: int = 128,
        nhead: int = 4,
        num_layers: int = 2,
        dropout: float = 0.2
    ):
        super().__init__()

        # Text branch: pretrained encoder followed by a projection to d_model.
        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        self.text_proj = nn.Linear(self.text_encoder.config.hidden_size, d_model)

        # Structural branches, all mapped to d_model so they can be summed.
        self.tag_emb = nn.Embedding(tag_vocab_size, d_model)
        self.parent_tag_emb = nn.Embedding(parent_tag_vocab_size, d_model)
        n_numeric, n_flags = len(STRUCT_COLS_NUM), len(STRUCT_COLS_BOOL)
        self.num_proj = nn.Linear(n_numeric, d_model)
        self.bool_proj = nn.Linear(n_flags, d_model)

        self.layernorm = nn.LayerNorm(d_model)

        # Page-level context over the sequence of nodes.
        self.node_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model, nhead=nhead, dropout=dropout, batch_first=True
            ),
            num_layers=num_layers,
        )

        self.field_head = nn.Linear(d_model, num_field_labels)
        self.boundary_head = nn.Linear(d_model, 1)

    def forward(self, enc, node_offsets, node_mask, tag_id, parent_tag_id, num_feats, bool_feats):
        """
        enc: tokenizer output for all nodes of the batch, flattened across pages.
        node_offsets: per page, the (start, end) rows of `enc` that belong to it.
        node_mask: bool [B, max_nodes], True on real nodes.
        Returns (field_logits [B, max_nodes, C], boundary_logits [B, max_nodes]).
        """
        encoded = self.text_encoder(**enc)
        first_token = encoded.last_hidden_state[:, 0, :]   # [total_nodes, text_dim]
        node_text = self.text_proj(first_token)            # [total_nodes, d_model]

        # Scatter the flat node vectors back into a zero-padded [B, max_nodes, d_model] grid.
        n_pages, max_nodes = node_mask.shape
        packed = node_text.new_zeros((n_pages, max_nodes, node_text.shape[-1]))
        for page, (start, end) in enumerate(node_offsets):
            packed[page, : end - start, :] = node_text[start:end]

        fused = packed + self.tag_emb(tag_id)
        fused = fused + self.parent_tag_emb(parent_tag_id)
        fused = fused + self.num_proj(num_feats)
        fused = fused + self.bool_proj(bool_feats)

        # Padded node slots are excluded from attention via the key padding mask.
        hidden = self.node_encoder(self.layernorm(fused), src_key_padding_mask=~node_mask)

        field_logits = self.field_head(hidden)                    # [B, N, C]
        boundary_logits = self.boundary_head(hidden).squeeze(-1)  # [B, N]
        return field_logits, boundary_logits


In [8]:
class FocalLossWithLogits(nn.Module):
    """
    Binary focal loss computed from raw logits.

    Per element: alpha_t * (1 - p_t) ** gamma * BCE(logit, target), where
    p_t is the predicted probability of the true class and alpha_t is
    `alpha` where the target equals 1 and `1 - alpha` elsewhere.

    alpha: weight of the positive class (default 0.9).
    gamma: focusing exponent (default 2.0).
    reduction: "mean" or "sum"; any other value returns the unreduced
        per-element loss.
    """
    def __init__(self, alpha=0.9, gamma=2.0, reduction="mean"):
        super().__init__()
        self.alpha, self.gamma, self.reduction = alpha, gamma, reduction

    def forward(self, logits, targets):
        y = targets.float()
        is_positive = y == 1

        # Probability assigned to the true class, and the matching class weight.
        prob_pos = torch.sigmoid(logits)
        prob_true = torch.where(is_positive, prob_pos, 1 - prob_pos)
        class_weight = torch.where(is_positive, self.alpha, 1 - self.alpha)

        per_element = F.binary_cross_entropy_with_logits(logits, y, reduction="none")
        focal = class_weight * (1 - prob_true) ** self.gamma * per_element

        if self.reduction == "sum":
            return focal.sum()
        if self.reduction == "mean":
            return focal.mean()
        return focal

In [9]:
def make_losses_for_train_df(train_df, LABELS, device, other_scale=0.05, weight_cap=50.0):
    """
    Build both loss functions from the statistics of `train_df` only.

    Field loss: CrossEntropyLoss (ignore_index=-100) with one weight per
    entry of LABELS equal to total_rows / label_count. A label absent from
    `train_df` is treated as having count 1. Weights are clipped at
    `weight_cap`, and afterwards the "Other" weight (if that label exists)
    is multiplied by `other_scale`.

    Boundary loss: BCEWithLogitsLoss with pos_weight = negatives / positives
    of the `start_event` column (a 1e-6 term in the denominator avoids a
    division by zero).

    Returns: (field_loss_fn, boundary_loss_fn)
    """
    # ---- Field head: inverse-frequency class weights ----
    counts = Counter(train_df["label"].tolist())
    n_rows = sum(counts.values())
    inverse_freq = [n_rows / counts.get(name, 1) for name in LABELS]

    class_weights = torch.tensor(inverse_freq, dtype=torch.float32).to(device)
    class_weights = class_weights.clamp(max=weight_cap)

    # Down-weight the dominant "Other" class after the cap has been applied.
    if "Other" in LABELS:
        class_weights[LABELS.index("Other")] *= other_scale

    field_loss_fn = nn.CrossEntropyLoss(weight=class_weights, ignore_index=-100)

    # ---- Boundary head: ratio of negatives to positives ----
    n_pos = float(train_df["start_event"].sum())
    n_neg = float(len(train_df) - n_pos)
    ratio = torch.tensor([n_neg / (n_pos + 1e-6)], dtype=torch.float32).to(device)
    boundary_loss_fn = nn.BCEWithLogitsLoss(pos_weight=ratio)

    return field_loss_fn, boundary_loss_fn


In [10]:
@torch.no_grad()
def eval_on_loader(
    loader,
    model,
    field_loss_fn,
    boundary_loss_fn,
    LABELS,
    id2label,
    device,
    threshold=0.5,
    boundary_weight=3.0
):
    """
    Evaluate both heads on `loader` and print node-level reports.

    The combined loss (field + boundary_weight * boundary) is averaged over
    batches. Both loss terms are computed over every real node selected by
    node_mask, 'Other' nodes included -- run_epoch, by contrast, drops
    'Other' nodes from its field loss, so the two loss values are not
    directly comparable. Boundary predictions are sigmoid(logit) >= threshold,
    with no suppression.

    Returns (avg_loss, (precision, recall, f1)) where the triple describes
    the boundary head.
    """
    model.eval()

    loss_sum = 0.0
    field_true, field_pred = [], []
    bound_true, bound_pred = [], []

    for batch in loader:
        tokens = {name: t.to(device, non_blocking=True) for name, t in batch["enc"].items()}
        node_mask = batch["node_mask"].to(device).bool()
        field_y = batch["field_y"].to(device)
        boundary_y = batch["boundary_y"].to(device)

        field_logits, boundary_logits = model(
            enc=tokens,
            node_offsets=batch["node_offsets"],
            node_mask=node_mask,
            tag_id=batch["tag_id"].to(device),
            parent_tag_id=batch["parent_tag_id"].to(device),
            num_feats=batch["num_feats"].to(device),
            bool_feats=batch["bool_feats"].to(device),
        )

        # Loss over real (non-padding) nodes only.
        field_term = field_loss_fn(field_logits[node_mask], field_y[node_mask])
        boundary_term = boundary_loss_fn(boundary_logits[node_mask], boundary_y[node_mask].float())
        loss_sum += (field_term + boundary_weight * boundary_term).item()

        # Hard predictions for both heads.
        label_hat = field_logits.argmax(dim=-1)
        start_hat = (torch.sigmoid(boundary_logits) >= threshold).long()   # [B, N]

        # Field metrics: real nodes that carry a label.
        scored = node_mask & (field_y != -100)
        field_true += field_y[scored].detach().cpu().tolist()
        field_pred += label_hat[scored].detach().cpu().tolist()

        # Boundary metrics: all real nodes.
        bound_true += boundary_y[node_mask].detach().cpu().long().tolist()
        bound_pred += start_hat[node_mask].detach().cpu().tolist()

    avg_loss = loss_sum / max(1, len(loader))

    label_ids = list(range(len(LABELS)))
    print("=== Field Label Metrics (node-level) ===")
    print(classification_report(
        field_true,
        field_pred,
        labels=label_ids,
        target_names=[id2label[i] for i in label_ids],
        digits=4,
        zero_division=0
    ))

    p, r, f1, _ = precision_recall_fscore_support(
        bound_true, bound_pred, average="binary", zero_division=0
    )
    print("=== Boundary Metrics (node-level) ===")
    print(f"threshold={threshold}")
    print(f"Precision: {p:.4f}  Recall: {r:.4f}  F1: {f1:.4f}")
    print(f"\nLoss: {avg_loss:.4f}")

    return avg_loss, (p, r, f1)


In [11]:
@torch.no_grad()
def field_metrics_fast(loader, model, device, label2id, average="micro"):
    """
    Field-head F1 over the nodes whose TRUE label is a real field.

    Nodes are skipped when they are padding (node_mask False or label -100)
    or when their true label is 'Other'. Predictions are not filtered, so a
    scored node may still be predicted as 'Other'.

    average: "micro" or "macro" (passed through to sklearn's f1_score).
    Returns 0.0 when no node qualifies.
    Raises ValueError when label2id has no "Other" entry.
    """
    model.eval()
    true_ids, pred_ids = [], []

    # Resolve the integer id for "Other"
    if "Other" not in label2id:
        raise ValueError("'Other' must exist in label2id to ignore it in metrics.")
    other_id = label2id["Other"]

    for batch in loader:
        tokens = {name: t.to(device) for name, t in batch["enc"].items()}
        node_mask = batch["node_mask"].to(device).bool()
        field_y = batch["field_y"].to(device)

        field_logits, _ = model(
            enc=tokens,
            node_offsets=batch["node_offsets"],
            node_mask=node_mask,
            tag_id=batch["tag_id"].to(device),
            parent_tag_id=batch["parent_tag_id"].to(device),
            num_feats=batch["num_feats"].to(device),
            bool_feats=batch["bool_feats"].to(device),
        )
        label_hat = field_logits.argmax(dim=-1)

        # Keep real, labelled nodes whose ground truth is not "Other".
        scored = node_mask & (field_y != -100) & (field_y != other_id)
        true_ids += field_y[scored].detach().cpu().tolist()
        pred_ids += label_hat[scored].detach().cpu().tolist()

    # A split without any non-Other label has nothing to score.
    if not true_ids:
        return 0.0

    return f1_score(true_ids, pred_ids, average=average, zero_division=0)

In [12]:
# -------------------------------
# 2) Helper functions to build loaders, model, losses per fold
# -------------------------------
def make_loaders(train_df, val_df, batch_size=2, max_tokens=64, num_workers=4):
    """
    Build the (train_loader, val_loader) pair for one split.

    train_df / val_df: node-level frames; each is wrapped in a PageDataset,
        so one loader item is one page.
    batch_size: pages per batch.
    max_tokens: per-node token limit forwarded to PageDataset.
    num_workers: DataLoader worker processes (default 4, the previously
        hard-coded value). Pass 0 to load in the main process, e.g. on
        platforms where notebook-defined objects cannot be sent to workers.

    The train loader shuffles pages; the val loader keeps their order.
    """
    train_dataset = PageDataset(train_df, tokenizer=tokenizer, max_tokens=max_tokens)
    val_dataset   = PageDataset(val_df, tokenizer=tokenizer, max_tokens=max_tokens)

    shared = dict(
        batch_size=batch_size,
        collate_fn=collate_fn,
        num_workers=num_workers,
        # Pinned host memory only helps host-to-GPU copies; skip it on CPU-only runs.
        pin_memory=torch.cuda.is_available(),
        # DataLoader rejects persistent_workers=True when there are no workers.
        persistent_workers=num_workers > 0,
    )
    train_loader = DataLoader(train_dataset, shuffle=True, **shared)
    val_loader = DataLoader(val_dataset, shuffle=False, **shared)
    return train_loader, val_loader

def init_model_and_optim(lr=1e-5):
    """
    Create a fresh DOMAwareEventExtractor on `device` together with an AdamW
    optimizer (weight_decay=0.01) over all of its parameters.

    The model sizes come from the notebook globals (MODEL_NAME, LABELS,
    TAG_VOCAB, PARENT_TAG_VOCAB); d_model=128, nhead=4 and num_layers=2 are fixed.
    Returns (model, optimizer).
    """
    model_kwargs = dict(
        text_model_name=MODEL_NAME,
        num_field_labels=len(LABELS),
        tag_vocab_size=len(TAG_VOCAB),
        parent_tag_vocab_size=len(PARENT_TAG_VOCAB),
        d_model=128,
        nhead=4,
        num_layers=2,
    )
    model = DOMAwareEventExtractor(**model_kwargs).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    return model, optimizer

def make_losses_for_fold(train_df):
    """
    Build (field_loss_fn, boundary_loss_fn) from `train_df` statistics only.

    Uses the notebook globals LABELS and device. Field class weights are
    total_rows / label_count (count 1 for labels absent from train_df) with
    no cap and no special handling of "Other"; the boundary pos_weight is
    negatives / positives of `start_event`.
    """
    # ---- FIELD: inverse-frequency class weights ----
    counts = Counter(train_df["label"])
    n_rows = sum(counts.values())
    class_weights = torch.tensor(
        [n_rows / counts.get(name, 1) for name in LABELS],
        dtype=torch.float32,
    ).to(device)
    field_loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=-100)

    # ---- BOUNDARY: negatives-to-positives ratio ----
    n_pos = float(train_df["start_event"].sum())
    n_neg = float(len(train_df) - n_pos)
    ratio = torch.tensor([n_neg / (n_pos + 1e-6)], dtype=torch.float32).to(device)
    boundary_loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=ratio)

    return field_loss_fn, boundary_loss_fn

# -------------------------------
# 3) Training + evaluation per fold
# -------------------------------
def run_epoch(model, optimizer, loader, field_loss_fn, boundary_loss_fn, boundary_weight=3.0, training=True):
    """
    Run one pass over `loader` and return the mean per-batch loss.

    model, optimizer: network and its optimizer (the optimizer is only used
        when `training` is true).
    field_loss_fn, boundary_loss_fn: losses for the two heads.
    boundary_weight: multiplier on the boundary loss in the combined loss.
    training: True -> train mode with gradient updates (gradient norm
        clipped to 1.0); False -> eval mode, gradients disabled.

    The field loss only covers real nodes whose label is neither -100 nor
    'Other', so the 'Other' class weight inside field_loss_fn never takes
    effect here. The boundary loss covers every real node.
    """
    model.train() if training else model.eval()
    total_loss = 0.0

    for batch in loader:
        enc = {k: v.to(device) for k, v in batch["enc"].items()}
        node_mask = batch["node_mask"].to(device).bool()

        tag_id = batch["tag_id"].to(device)
        parent_tag_id = batch["parent_tag_id"].to(device)
        num_feats = batch["num_feats"].to(device)
        bool_feats = batch["bool_feats"].to(device)

        field_y = batch["field_y"].to(device)
        boundary_y = batch["boundary_y"].to(device)

        with torch.set_grad_enabled(training):
            field_logits, boundary_logits = model(
                enc=enc,
                node_offsets=batch["node_offsets"],
                node_mask=node_mask,
                tag_id=tag_id,
                parent_tag_id=parent_tag_id,
                num_feats=num_feats,
                bool_feats=bool_feats
            )

            field_mask = node_mask & (field_y != -100) & (field_y != OTHER_ID)
            if field_mask.any():
                field_loss = field_loss_fn(field_logits[field_mask], field_y[field_mask])
            else:
                # No labelled field node in this batch. A mean-reduced
                # cross-entropy over an empty selection is NaN, and one NaN
                # backward pass would corrupt every weight, so contribute 0.
                field_loss = field_logits.new_zeros(())
            boundary_loss = boundary_loss_fn(boundary_logits[node_mask], boundary_y[node_mask].float())
            loss = field_loss + boundary_weight * boundary_loss

            if training:
                optimizer.zero_grad(set_to_none=True)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

        total_loss += loss.detach().item()

    return total_loss / max(1, len(loader))

@torch.no_grad()
def find_best_threshold(loader, model, suppress_k=0):
    """
    Sweep boundary thresholds and return (best_threshold, best_f1).

    Boundary probabilities of all real nodes in `loader` are collected once;
    19 evenly spaced thresholds from 0.05 to 0.95 are then scored by
    node-level F1 and the first threshold reaching the highest F1 wins.
    Returns (0.5, -1.0) when the loader yields no real node.

    suppress_k is accepted for compatibility with older callers and ignored.
    """
    model.eval()

    prob_chunks, true_chunks = [], []

    for batch in loader:
        tokens = {name: t.to(device) for name, t in batch["enc"].items()}
        node_mask = batch["node_mask"].to(device).bool()

        _, boundary_logits = model(
            enc=tokens,
            node_offsets=batch["node_offsets"],
            node_mask=node_mask,
            tag_id=batch["tag_id"].to(device),
            parent_tag_id=batch["parent_tag_id"].to(device),
            num_feats=batch["num_feats"].to(device),
            bool_feats=batch["bool_feats"].to(device),
        )

        probs = torch.sigmoid(boundary_logits).detach().cpu()
        truth = batch["boundary_y"].long().cpu()
        keep = node_mask.detach().cpu()

        # Boolean indexing walks the grid row by row, i.e. page by page.
        prob_chunks.append(probs[keep].numpy())
        true_chunks.append(truth[keep].numpy().astype(int))

    best_th, best_f1 = 0.5, -1.0

    probs_all = np.concatenate(prob_chunks) if prob_chunks else np.empty(0)
    if probs_all.size == 0:
        return best_th, best_f1
    true_all = np.concatenate(true_chunks)

    is_pos = true_all == 1
    is_neg = true_all == 0

    for th in np.linspace(0.05, 0.95, 19):
        fired = (probs_all >= th).astype(int) == 1

        tp = (fired & is_pos).sum()
        fp = (fired & is_neg).sum()
        fn = (~fired & is_pos).sum()

        precision = tp / (tp + fp + 1e-9)
        recall    = tp / (tp + fn + 1e-9)
        f1        = 2 * precision * recall / (precision + recall + 1e-9)

        if f1 > best_f1:
            best_f1, best_th = f1, th

    return best_th, best_f1

@torch.no_grad()
def boundary_metrics(loader, model, threshold):
    """
    Node-level precision, recall and F1 of the boundary head on `loader`.

    A node is predicted as an event start when sigmoid(logit) >= threshold;
    only real nodes (node_mask True) are scored.
    Returns (precision, recall, f1) for the positive class.
    """
    model.eval()
    true_flags, pred_flags = [], []

    for batch in loader:
        tokens = {name: t.to(device) for name, t in batch["enc"].items()}
        node_mask = batch["node_mask"].to(device).bool()

        _, boundary_logits = model(
            enc=tokens,
            node_offsets=batch["node_offsets"],
            node_mask=node_mask,
            tag_id=batch["tag_id"].to(device),
            parent_tag_id=batch["parent_tag_id"].to(device),
            num_feats=batch["num_feats"].to(device),
            bool_feats=batch["bool_feats"].to(device),
        )

        start_hat = (torch.sigmoid(boundary_logits) >= threshold).long()
        start_true = batch["boundary_y"].to(device).long()

        true_flags += start_true[node_mask].detach().cpu().tolist()
        pred_flags += start_hat[node_mask].detach().cpu().tolist()

    p, r, f1, _ = precision_recall_fscore_support(
        true_flags, pred_flags, average="binary", zero_division=0
    )
    return p, r, f1

# -------------------------------
# 4) Run K-Fold CV by source
# -------------------------------
# Cross-validate over page ids (not nodes), so all nodes of a page fall on
# the same side of every split.
CV_SEED = 42
N_SPLITS = min(5, len(cv_sources))  # safety
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=CV_SEED)
EPOCHS = 20

cv_results = []

for fold, (tr_idx, va_idx) in enumerate(kf.split(cv_sources), start=1):
    # Seed torch's global RNG per fold so that everything drawing from it
    # (new-layer initialisation, dropout, default DataLoader shuffling) is
    # repeatable from run to run; some CUDA kernels may still be nondeterministic.
    torch.manual_seed(CV_SEED + fold)

    fold_train_sources = set(cv_sources[tr_idx])
    fold_val_sources   = set(cv_sources[va_idx])

    fold_train_df = df[df["source"].isin(fold_train_sources)].copy()
    fold_val_df   = df[df["source"].isin(fold_val_sources)].copy()

    # Loss weights come from the training pages of this fold only.
    field_loss_fn, boundary_loss_fn = make_losses_for_train_df(fold_train_df, LABELS, device, other_scale=0.01, weight_cap=50.0)

    print(f"\n===== Fold {fold}/{N_SPLITS} =====")
    print("Train pages:", fold_train_df["source"].nunique(), "Val pages:", fold_val_df["source"].nunique())

    train_loader, val_loader = make_loaders(fold_train_df, fold_val_df, batch_size=2)

    model, optimizer = init_model_and_optim(lr=1e-5)

    # Best validation boundary F1 seen so far, with its threshold and weights.
    best = {"f1": -1.0, "th": 0.5, "state": None}

    for epoch in range(EPOCHS):
        tr_loss = run_epoch(model, optimizer, train_loader, field_loss_fn, boundary_loss_fn,
                            boundary_weight=3.0, training=True)
        va_loss = run_epoch(model, optimizer, val_loader, field_loss_fn, boundary_loss_fn,
                            boundary_weight=3.0, training=False)

        th, f1 = find_best_threshold(val_loader, model, suppress_k=0)

        if f1 > best["f1"]:
            best["f1"] = f1
            best["th"] = th
            # deepcopy: state_dict() returns references to the live tensors.
            best["state"] = copy.deepcopy(model.state_dict())

        if (epoch + 1) % 5 == 0:
            print(f"Epoch {epoch+1:02d} | tr_loss={tr_loss:.4f} va_loss={va_loss:.4f} best_f1={best['f1']:.4f} best_th={best['th']:.2f}")

    # load best and report metrics on fold val
    # NOTE: epoch and threshold were both selected on these same validation
    # pages, so the per-fold boundary numbers below are optimistic.
    model.load_state_dict(best["state"])
    bp, br, bf1 = boundary_metrics(val_loader, model, threshold=best["th"])

    # Field (fast summary only)
    field_micro_f1 = field_metrics_fast(val_loader, model, device, label2id=label2id, average="micro")
    field_macro_f1 = field_metrics_fast(val_loader, model, device, label2id=label2id, average="macro")

    print(
        f"Fold {fold} | boundary: P={bp:.4f} R={br:.4f} F1={bf1:.4f} (th={best['th']:.2f})"
        f" | field: microF1={field_micro_f1:.4f} macroF1={field_macro_f1:.4f}"
    )

    cv_results.append({
        "fold": fold,
        "bp": bp, "br": br, "bf1": bf1, "th": best["th"],
        "field_micro_f1": field_micro_f1,
        "field_macro_f1": field_macro_f1
    })

print("\n===== CV Summary =====")

mean_bf1 = float(np.mean([x["bf1"] for x in cv_results]))
mean_bp  = float(np.mean([x["bp"]  for x in cv_results]))
mean_br  = float(np.mean([x["br"]  for x in cv_results]))
mean_th  = float(np.mean([x["th"]  for x in cv_results]))

mean_field_micro = float(np.mean([x["field_micro_f1"] for x in cv_results]))
mean_field_macro = float(np.mean([x["field_macro_f1"] for x in cv_results]))

print(f"Boundary  Mean F1: {mean_bf1:.4f}")
print(f"Boundary  Mean P : {mean_bp:.4f}")
print(f"Boundary  Mean R : {mean_br:.4f}")
print(f"Boundary  Mean th: {mean_th:.4f}")

print(f"Field     Mean micro-F1: {mean_field_micro:.4f}")
print(f"Field     Mean macro-F1: {mean_field_macro:.4f}")

# The threshold carried forward to the holdout evaluation is the fold average.
best_th_cv = mean_th
print("Using CV-avg threshold:", best_th_cv)

# -------------------------------
# 5) Final training on ALL CV pages, then evaluate on HOLDOUT TEST pages
# -------------------------------
final_train_df = df[df["source"].isin(set(cv_sources))].copy()

# Seed torch's global RNG so the final model's initialisation, dropout and
# default batch shuffling are repeatable (some CUDA kernels may still be
# nondeterministic).
torch.manual_seed(42)

# make_loaders always returns a (train, val) pair; the second loader is unused here.
final_train_loader, _ = make_loaders(final_train_df, final_train_df, batch_size=2)  # dummy val loader
test_dataset = PageDataset(test_df, tokenizer)
test_loader  = DataLoader(test_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)

# Loss weights come from the final training pages only (never the holdout).
field_loss_fn, boundary_loss_fn = make_losses_for_train_df(
    final_train_df, LABELS, device, other_scale=0.01, weight_cap=50.0
)
final_model, final_optimizer = init_model_and_optim(lr=1e-5)

# Fixed-length training: no validation split is left for early stopping, so
# the weights after the last epoch are the ones evaluated.
EPOCHS_FINAL = 20
for epoch in range(EPOCHS_FINAL):
    tr_loss = run_epoch(final_model, final_optimizer, final_train_loader,
                        field_loss_fn, boundary_loss_fn, boundary_weight=3.0, training=True)
    if (epoch + 1) % 5 == 0:
        print(f"[FINAL] Epoch {epoch+1:02d} | tr_loss={tr_loss:.4f}")

# Holdout pages are scored once, with the threshold averaged over the CV folds.
p, r, f1 = boundary_metrics(test_loader, final_model, threshold=best_th_cv)
print("\n===== HOLDOUT TEST (Boundary) =====")
print(f"Threshold={best_th_cv:.2f}  P={p:.4f}  R={r:.4f}  F1={f1:.4f}")


===== Fold 1/5 =====
Train pages: 10 Val pages: 3


Loading weights: 100%|██████████| 100/100 [00:00<00:00, 285.70it/s, Materializing param=transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
  output = torch._nested_tensor_from_mask(


Epoch 05 | tr_loss=5.2767 va_loss=3.2424 best_f1=0.1149 best_th=0.40
Epoch 10 | tr_loss=4.9377 va_loss=3.1925 best_f1=0.1982 best_th=0.45
Epoch 15 | tr_loss=4.7454 va_loss=2.6644 best_f1=0.2785 best_th=0.45
Epoch 20 | tr_loss=3.4520 va_loss=2.3155 best_f1=0.4138 best_th=0.50
Fold 1 | boundary: P=0.3333 R=0.5455 F1=0.4138 (th=0.50) | field: microF1=0.6716 macroF1=0.4023

===== Fold 2/5 =====
Train pages: 10 Val pages: 3


Loading weights: 100%|██████████| 100/100 [00:00<00:00, 376.82it/s, Materializing param=transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Epoch 05 | tr_loss=5.4571 va_loss=4.9763 best_f1=0.1284 best_th=0.45
Epoch 10 | tr_loss=5.4087 va_loss=4.8477 best_f1=0.1558 best_th=0.50
Epoch 15 | tr_loss=4.3502 va_loss=4.9625 best_f1=0.1728 best_th=0.50
Epoch 20 | tr_loss=3.3353 va_loss=5.7973 best_f1=0.1728 best_th=0.50
Fold 2 | boundary: P=0.1000 R=0.6364 F1=0.1728 (th=0.50) | field: microF1=0.2778 macroF1=0.1380

===== Fold 3/5 =====
Train pages: 10 Val pages: 3


Loading weights: 100%|██████████| 100/100 [00:00<00:00, 357.88it/s, Materializing param=transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Epoch 05 | tr_loss=5.5637 va_loss=5.5621 best_f1=0.0983 best_th=0.45
Epoch 10 | tr_loss=4.5645 va_loss=6.0047 best_f1=0.1020 best_th=0.40
Epoch 15 | tr_loss=3.9580 va_loss=6.4905 best_f1=0.1493 best_th=0.50
Epoch 20 | tr_loss=3.2677 va_loss=7.0296 best_f1=0.2368 best_th=0.55
Fold 3 | boundary: P=0.1607 R=0.4500 F1=0.2368 (th=0.55) | field: microF1=0.4769 macroF1=0.2638

===== Fold 4/5 =====
Train pages: 11 Val pages: 2


Loading weights: 100%|██████████| 100/100 [00:00<00:00, 386.99it/s, Materializing param=transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Epoch 05 | tr_loss=5.4913 va_loss=11.9599 best_f1=0.4264 best_th=0.55
Epoch 10 | tr_loss=4.8642 va_loss=11.5304 best_f1=0.4690 best_th=0.45
Epoch 15 | tr_loss=4.2137 va_loss=10.9689 best_f1=0.5385 best_th=0.45
Epoch 20 | tr_loss=3.0975 va_loss=11.6543 best_f1=0.5758 best_th=0.40
Fold 4 | boundary: P=0.4578 R=0.7755 F1=0.5758 (th=0.40) | field: microF1=0.3333 macroF1=0.4269

===== Fold 5/5 =====
Train pages: 11 Val pages: 2


Loading weights: 100%|██████████| 100/100 [00:00<00:00, 471.04it/s, Materializing param=transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Epoch 05 | tr_loss=5.0488 va_loss=8.3533 best_f1=0.2276 best_th=0.35
Epoch 10 | tr_loss=4.7132 va_loss=8.5873 best_f1=0.2500 best_th=0.35
Epoch 15 | tr_loss=4.0132 va_loss=9.1726 best_f1=0.2616 best_th=0.30
Epoch 20 | tr_loss=2.7902 va_loss=10.3072 best_f1=0.2857 best_th=0.25
Fold 5 | boundary: P=0.1742 R=0.7941 F1=0.2857 (th=0.25) | field: microF1=0.1813 macroF1=0.1016

===== CV Summary =====
Boundary  Mean F1: 0.3370
Boundary  Mean P : 0.2452
Boundary  Mean R : 0.6403
Boundary  Mean th: 0.4400
Field     Mean micro-F1: 0.3882
Field     Mean macro-F1: 0.2665
Using CV-avg threshold: 0.43999999999999995


Loading weights: 100%|██████████| 100/100 [00:00<00:00, 327.31it/s, Materializing param=transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


[FINAL] Epoch 05 | tr_loss=5.1852
[FINAL] Epoch 10 | tr_loss=4.3720
[FINAL] Epoch 15 | tr_loss=3.7728
[FINAL] Epoch 20 | tr_loss=2.7044

===== HOLDOUT TEST (Boundary) =====
Threshold=0.44  P=0.0796  R=0.1731  F1=0.1091


In [13]:
# Holdout sanity check: list the pages that make up the test split and count
# how many `start_event` positives each one contributes.

# Convert each source to a plain `str` before printing: under NumPy 2 the
# elements otherwise render as `np.str_('...')`, which clutters the output.
print("Holdout test sources:", sorted(str(src) for src in test_sources))

print("Holdout start_event positives (total):", int(test_df["start_event"].sum()))

# Sum of `start_event` per source page, largest first.
per_page = test_df.groupby("source")["start_event"].sum().sort_values(ascending=False)

# Print the header and the Series separately: passing the Series as a second
# print() argument after a trailing "\n" makes print's default separator show
# up as a stray leading space on the first line of the table.
print("Holdout start_event positives per page:")
print(per_page.astype(int))


Holdout test sources: [np.str_('members.sacac.org_pattern_labeled'), np.str_('nacacnet.org_pattern_labeled')]
Holdout start_event positives (total): 52
Holdout start_event positives per page:
 source
members.sacac.org_pattern_labeled    30
nacacnet.org_pattern_labeled         22
Name: start_event, dtype: int64


In [14]:
# Diagnostic: ask find_best_threshold for the best boundary threshold on the
# holdout loader, with suppress_k=0.
# NOTE(review): the threshold here is chosen on the holdout split itself, so
# the resulting F1 is presumably an optimistic ceiling rather than an unbiased
# test score — confirm before reporting it next to the CV-threshold result.
holdout_sweep = find_best_threshold(test_loader, final_model, suppress_k=0)
best_th_test, best_f1_test = holdout_sweep
print(f"Best holdout threshold: {best_th_test!s} Best holdout F1: {best_f1_test!s}")

Best holdout threshold: 0.1 Best holdout F1: 0.1496402876309094
