In [1]:
import pandas as pd
import numpy as np
import os

# The cleaned node table lives in ../data/cleaned relative to the working directory.
cleaned_dir = os.path.join(os.getcwd(), '..', 'data', 'cleaned')
data_path = os.path.join(cleaned_dir, 'full_data.csv')

# Read every row, then display summary statistics as the cell output.
df = pd.read_csv(data_path)
df.describe()

Unnamed: 0,rendering_order,depth,parent_index,text_length,sibling_index,children_count,same_tag_sibling_count,same_text_sibling_count,word_count,letter_ratio,digit_ratio,whitespace_ratio,attribute_count,event_id
count,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0,789.0
mean,665.224674,17.407742,659.378075,26.265557,0.77822,0.140376,1.033285,0.001447,4.020984,0.767786,0.1212,0.07535,0.921129,11.693283
std,528.325947,6.531681,526.280103,43.190741,2.411446,0.520146,3.663405,0.038021,6.543288,0.302293,0.27128,0.061539,1.733493,9.542569
min,19.0,2.0,18.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
25%,314.0,12.0,307.0,8.0,0.0,0.0,0.0,0.0,1.0,0.785714,0.0,0.0,0.0,4.0
50%,514.0,17.0,507.0,16.0,0.0,0.0,0.0,0.0,2.0,0.885714,0.0,0.083333,1.0,8.0
75%,837.5,23.0,833.0,27.0,0.0,0.0,0.0,0.0,4.0,0.941176,0.0,0.117647,1.0,19.0
max,2375.0,30.0,2366.0,559.0,25.0,11.0,25.0,1.0,81.0,1.0,1.0,0.333333,15.0,42.0


In [2]:
# Order rows page by page, and by rendering order inside each page.
df = df.sort_values(["source", "rendering_order"]).reset_index(drop=True)

# Boundary label: 1 on the first row (in the order above) of every
# (source, event_id) group, 0 on every other row, including rows with no event_id.
is_event = df["event_id"].notna()

df["start_event"] = 0
first_in_event = (
    df[is_event]
    .groupby(["source", "event_id"])
    .cumcount()
    .eq(0)
    .astype(int)
)
# `.values` drops the index; rows are already in the same order as the mask.
df.loc[is_event, "start_event"] = first_in_event.values

print("Pages:", df["source"].nunique())
print("Total nodes:", len(df))
print("start_event positives:", int(df["start_event"].sum()))
print("Label counts:\n", df["label"].value_counts())

# positive rate overall
print("Start_event positive rate:", df["start_event"].mean())

# Sanity check: one positive per unique (source, event_id) pair among event rows.
expected = df.loc[is_event, ["source", "event_id"]].drop_duplicates().shape[0]
actual = int(df["start_event"].sum())
print("Expected positives:", expected, "Actual positives:", actual)

# Enforce the invariant instead of relying on someone reading the printout.
assert actual == expected, (
    f"start_event labelling is inconsistent: expected {expected} positives, got {actual}"
)

Pages: 15
Total nodes: 2764
start_event positives: 177
Label counts:
 label
Other           1976
Date             267
Location         121
StartEndTime     103
Name              84
NameLink          60
Description       57
DateTime          47
EndTime           20
StartTime         19
NameLocation       6
Time               2
TimeLocation       2
Name: count, dtype: int64
Start_event positive rate: 0.06403762662807526
Expected positives: 177 Actual positives: 177


In [3]:
# Unique pages as a plain NumPy object array. `Series.unique()` may hand back a
# non-ndarray array-like, and NumPy warns that in-place `Generator.shuffle` is
# not guaranteed to behave correctly on those, so work on a real ndarray.
sources = np.asarray(df["source"].unique(), dtype=object)

# Reproducible shuffle: `permutation` returns a shuffled copy.
# NOTE(review): the resulting page assignment may differ from a run that used
# the in-place shuffle; re-check the printed split sizes after re-running.
rng = np.random.default_rng(42)
sources = rng.permutation(sources)

# Page-level split sizes; every remaining page goes to train
# (11 train / 2 val / 2 test for 15 pages).
N_TEST_PAGES = 2
N_VAL_PAGES = 2

test_sources = set(sources[:N_TEST_PAGES])
val_sources = set(sources[N_TEST_PAGES:N_TEST_PAGES + N_VAL_PAGES])
train_sources = set(sources[N_TEST_PAGES + N_VAL_PAGES:])

# Splits must be disjoint at page level, otherwise nodes leak between them.
assert not (train_sources & val_sources), "train/val pages overlap"
assert not (train_sources & test_sources), "train/test pages overlap"
assert not (val_sources & test_sources), "val/test pages overlap"

# Create splits
train_df = df[df["source"].isin(train_sources)].copy()
val_df = df[df["source"].isin(val_sources)].copy()
test_df = df[df["source"].isin(test_sources)].copy()

# Every node must land in exactly one split.
assert len(train_df) + len(val_df) + len(test_df) == len(df), "split does not cover df"


def _count_events(frame):
    """Return the number of distinct (source, event_id) groups among rows with an event_id."""
    return frame.dropna(subset=["event_id"]).groupby(["source", "event_id"]).ngroups


print("Train pages:", train_df["source"].nunique())
print("Val pages:", val_df["source"].nunique())
print("Test pages:", test_df["source"].nunique())

print("Train events:", _count_events(train_df))
print("Val events:", _count_events(val_df))
print("Test events:", _count_events(test_df))

Train pages: 11
Val pages: 2
Test pages: 2
Train events: 117
Val events: 8
Test events: 52


  rng.shuffle(sources)


In [4]:
import torch
from torch.utils.data import Dataset, DataLoader

# Report CUDA availability and the CUDA version torch was built with,
# then select the device used by the rest of the notebook.
print(torch.cuda.is_available())
print(torch.version.cuda)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Field labels, taken as-is from the data and sorted so ids are stable.
LABELS = sorted(df["label"].unique().tolist())
label2id = {name: idx for idx, name in enumerate(LABELS)}
id2label = dict(enumerate(LABELS))


def _string_vocab(series):
    """Map each distinct value of `series` (cast to str) to its rank in sorted order."""
    values = sorted(series.astype(str).unique().tolist())
    return {value: idx for idx, value in enumerate(values)}


# Vocabularies for the categorical structural features.
TAG_VOCAB = _string_vocab(df["tag"])
PARENT_TAG_VOCAB = _string_vocab(df["parent_tag"])

# Numeric structural columns read from the CSV.
STRUCT_COLS_NUM = [
    "depth",
    "sibling_index",
    "children_count",
    "same_tag_sibling_count",
    "same_text_sibling_count",
    "text_length",
    "word_count",
    "letter_ratio",
    "digit_ratio",
    "whitespace_ratio",
    "attribute_count",
]

# Boolean structural columns read from the CSV.
STRUCT_COLS_BOOL = [
    "has_link",
    "link_is_absolute",
    "parent_has_link",
    "is_leaf",
    "contains_date",
    "contains_time",
    "starts_with_digit",
    "ends_with_digit",
    "has_class",
    "has_id",
    "attr_has_word_name",
    "attr_has_word_date",
    "attr_has_word_time",
    "attr_has_word_location",
    "attr_has_word_link",
    "text_has_word_name",
    "text_has_word_date",
    "text_word_time",
    "text_word_description",
    "text_word_location",
]

class PageDataset(Dataset):
    """Dataset with one item per page (one `source` value).

    Each page is stored as a DataFrame sorted by `rendering_order` with a fresh
    0..n-1 index. `__getitem__` converts a page into the node texts plus label
    and feature tensors, using the module-level `label2id`, `TAG_VOCAB`,
    `PARENT_TAG_VOCAB`, `STRUCT_COLS_NUM` and `STRUCT_COLS_BOOL`.
    """

    def __init__(self, df):
        # groupby iterates pages in sorted `source` order.
        self.pages = [
            page.sort_values("rendering_order").reset_index(drop=True)
            for _, page in df.groupby("source")
        ]

    def __len__(self):
        """Number of pages."""
        return len(self.pages)

    def __getitem__(self, idx):
        """Return the texts, targets and structural features of page `idx`.

        Raises KeyError if a label or tag value is missing from the lookup dicts.
        """
        page = self.pages[idx]

        label_ids = [label2id[name] for name in page["label"].tolist()]
        tag_ids = [TAG_VOCAB[str(tag)] for tag in page["tag"]]
        parent_ids = [PARENT_TAG_VOCAB[str(tag)] for tag in page["parent_tag"]]

        # Missing numeric values become 0; boolean columns are cast to 0/1.
        numeric = page[STRUCT_COLS_NUM].fillna(0).values
        flags = page[STRUCT_COLS_BOOL].astype(int).values

        return {
            "texts": page["text_context"].astype(str).tolist(),
            "field_y": torch.tensor(label_ids, dtype=torch.long),
            "boundary_y": torch.tensor(
                page["start_event"].astype(int).tolist(), dtype=torch.float32
            ),
            "tag_id": torch.tensor(tag_ids, dtype=torch.long),
            "parent_tag_id": torch.tensor(parent_ids, dtype=torch.long),
            "num_feats": torch.tensor(numeric, dtype=torch.float32),
            "bool_feats": torch.tensor(flags, dtype=torch.float32),
        }


True
12.8


In [5]:
from transformers import AutoTokenizer


# Hugging Face checkpoint name; the tokenizer below is loaded from it.
MODEL_NAME = "distilbert-base-uncased"
# Module-level tokenizer; collate_fn uses it to encode the node texts.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def collate_fn(batch, max_tokens=64):
    """Collate PageDataset items (one per page) into a padded batch.

    All node texts of all pages are tokenized together in a single call to the
    module-level `tokenizer` (truncated to `max_tokens`). Per-node tensors are
    padded along the node axis to the largest page in the batch.

    Returns a dict with:
      enc            tokenizer output for the concatenated node texts
      node_offsets   per page, the (start, end) slice into the concatenated texts
      node_mask      bool [B, max_nodes], True on real nodes
      field_y        padded with -100; boundary_y, tag_id, parent_tag_id padded with 0
      num_feats / bool_feats   padded with 0.0 along the node axis
    """
    batch_size = len(batch)
    node_counts = [len(item["texts"]) for item in batch]
    max_nodes = max(node_counts)

    # Mark real nodes and record where each page's texts sit in the flat list.
    node_mask = torch.zeros((batch_size, max_nodes), dtype=torch.bool)
    all_texts, node_offsets = [], []
    cursor = 0
    for row, (item, count) in enumerate(zip(batch, node_counts)):
        node_mask[row, :count] = True
        node_offsets.append((cursor, cursor + count))
        all_texts.extend(item["texts"])
        cursor += count

    enc = tokenizer(
        all_texts, padding=True, truncation=True, max_length=max_tokens, return_tensors="pt"
    )

    def padded(key, fill, feat_dim=None):
        """Stack `item[key]` over the batch, padding the node axis with `fill`."""
        per_page = [item[key] for item in batch]
        if feat_dim is None:
            shape = (batch_size, max_nodes)
        else:
            shape = (batch_size, max_nodes, feat_dim)
        out = torch.full(shape, fill, dtype=per_page[0].dtype)
        for row, tensor in enumerate(per_page):
            out[row, : tensor.shape[0]] = tensor
        return out

    return {
        "enc": enc,
        "node_offsets": node_offsets,
        "node_mask": node_mask,
        "field_y": padded("field_y", -100),  # -100 marks padding for the field loss
        "boundary_y": padded("boundary_y", 0),
        "tag_id": padded("tag_id", 0),
        "parent_tag_id": padded("parent_tag_id", 0),
        "num_feats": padded("num_feats", 0.0, len(STRUCT_COLS_NUM)),
        "bool_feats": padded("bool_feats", 0.0, len(STRUCT_COLS_BOOL)),
    }

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import torch.nn as nn
from transformers import AutoModel

class DOMAwareEventExtractor(nn.Module):
    """Node-level tagger over the nodes of a page.

    Each node's text is encoded by a pretrained text encoder (first-token hidden
    state), projected to `d_model`, and summed with tag / parent-tag embeddings
    and linear projections of the numeric and boolean structural features. The
    per-page node sequence then goes through a Transformer encoder, followed by
    two linear heads: field-label logits and a single boundary logit per node.

    NOTE(review): no explicit positional encoding is added over the node axis;
    node order only enters through the structural features — confirm intended.
    """

    def __init__(
        self,
        text_model_name: str,
        num_field_labels: int,
        tag_vocab_size: int,
        parent_tag_vocab_size: int,
        d_model: int = 256,
        nhead: int = 8,
        num_layers: int = 4,
        dropout: float = 0.1
    ):
        super().__init__()

        # Pretrained text encoder and its projection into the node space.
        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        hidden_size = self.text_encoder.config.hidden_size
        self.text_proj = nn.Linear(hidden_size, d_model)

        # Categorical structural features.
        self.tag_emb = nn.Embedding(tag_vocab_size, d_model)
        self.parent_tag_emb = nn.Embedding(parent_tag_vocab_size, d_model)

        # Numeric / boolean structural features.
        self.num_proj = nn.Linear(len(STRUCT_COLS_NUM), d_model)
        self.bool_proj = nn.Linear(len(STRUCT_COLS_BOOL), d_model)

        self.layernorm = nn.LayerNorm(d_model)

        # Transformer over the node sequence of each page.
        node_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dropout=dropout, batch_first=True
        )
        self.node_encoder = nn.TransformerEncoder(node_layer, num_layers=num_layers)

        # Output heads.
        self.field_head = nn.Linear(d_model, num_field_labels)
        self.boundary_head = nn.Linear(d_model, 1)

    def forward(self, enc, node_offsets, node_mask, tag_id, parent_tag_id, num_feats, bool_feats):
        """Return (field_logits [B, N, C], boundary_logits [B, N]).

        `enc` holds the tokenized texts of all nodes in the batch, concatenated;
        `node_offsets[i]` is the (start, end) slice of page i in that list and
        `node_mask` is True on real (non-padding) nodes.
        """
        encoded = self.text_encoder(**enc)
        first_token = encoded.last_hidden_state[:, 0, :]   # [total_nodes, text_dim]
        node_text = self.text_proj(first_token)            # [total_nodes, d_model]

        # Scatter the flat node vectors back into a zero-padded [B, N, d_model] grid.
        batch_size, max_nodes = node_mask.shape
        packed = node_text.new_zeros((batch_size, max_nodes, node_text.shape[-1]))
        for row, (start, stop) in enumerate(node_offsets):
            packed[row, : stop - start, :] = node_text[start:stop]

        # Fuse text and structural signals by summation, then normalize.
        x = packed + self.tag_emb(tag_id)
        x = x + self.parent_tag_emb(parent_tag_id)
        x = x + self.num_proj(num_feats)
        x = x + self.bool_proj(bool_feats)
        x = self.layernorm(x)

        # Padding nodes are excluded as attention keys.
        x = self.node_encoder(x, src_key_padding_mask=~node_mask)

        field_logits = self.field_head(x)
        boundary_logits = self.boundary_head(x).squeeze(-1)
        return field_logits, boundary_logits


In [7]:
import torch.nn.functional as F

class FocalLossWithLogits(nn.Module):
    """Binary focal loss operating on logits.

    Per element: alpha_t * (1 - p_t) ** gamma * BCE(logit, target), where p_t is
    the predicted probability of the true class and alpha_t is `alpha` for
    positive targets (target == 1) and `1 - alpha` otherwise.

    Args:
        alpha: weight for the positive class (0..1). Often 0.25.
        gamma: focusing parameter. Often 2.0.
        reduction: 'mean', 'sum' or 'none' (None is accepted as 'none').

    Raises:
        ValueError: if `reduction` is not one of the supported values. Without
            this check a misspelled reduction would silently return the
            unreduced per-element tensor.
    """

    _REDUCTIONS = ("mean", "sum", "none")

    def __init__(self, alpha=0.25, gamma=2.0, reduction="mean"):
        super().__init__()
        if reduction is None:
            reduction = "none"
        if reduction not in self._REDUCTIONS:
            raise ValueError(
                f"reduction must be one of {self._REDUCTIONS}, got {reduction!r}"
            )
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, logits, targets):
        """Return the focal loss of `logits` against binary `targets` (same shape)."""
        targets = targets.float()
        bce = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
        p = torch.sigmoid(logits)

        is_pos = targets == 1
        pt = torch.where(is_pos, p, 1 - p)  # probability assigned to the true class
        alpha_t = torch.where(
            is_pos,
            torch.full_like(p, self.alpha),
            torch.full_like(p, 1 - self.alpha),
        )
        # Down-weight easy examples: (1 - pt) ** gamma shrinks as pt approaches 1.
        loss = alpha_t * (1 - pt).pow(self.gamma) * bce

        if self.reduction == "mean":
            return loss.mean()
        if self.reduction == "sum":
            return loss.sum()
        return loss

In [8]:
from collections import Counter

# All class-imbalance statistics come from the TRAINING split only, so nothing
# about the validation / test pages leaks into the loss configuration.

# Boundary pos_weight: negatives per positive among training nodes.
pos = float(train_df["start_event"].sum())
neg = float(len(train_df) - pos)
pos_weight = torch.tensor([neg / (pos + 1e-6)], dtype=torch.float32).to(device)
boundary_loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# Field class weights: inverse frequency of each label in the training split.
label_counts = Counter(train_df["label"])
total = sum(label_counts.values())

# Labels that never occur in training cannot be learned. They keep a count
# floor of 1 (i.e. the largest possible weight), so report them explicitly
# rather than letting that happen silently.
missing_labels = [label for label in LABELS if label_counts.get(label, 0) == 0]
if missing_labels:
    print("Labels absent from the training split:", missing_labels)

weights = torch.tensor(
    [total / max(label_counts.get(label, 0), 1) for label in LABELS],
    dtype=torch.float32,
).to(device)

# ignore_index=-100 matches the padding value used for field targets.
field_loss_fn = torch.nn.CrossEntropyLoss(
    weight=weights,
    ignore_index=-100
)


In [9]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader

# Seed PyTorch's global RNG so PyTorch-side randomness in this cell and in the
# training loop (e.g. initialization of the new layers) is repeatable.
torch.manual_seed(42)

train_dataset = PageDataset(train_df)
val_dataset   = PageDataset(val_df)
test_dataset  = PageDataset(test_df)

# Two pages per batch; only the training loader is shuffled.
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True,  collate_fn=collate_fn)
val_loader   = DataLoader(val_dataset,   batch_size=2, shuffle=False, collate_fn=collate_fn)
test_loader  = DataLoader(test_dataset,  batch_size=2, shuffle=False, collate_fn=collate_fn)

model = DOMAwareEventExtractor(
    text_model_name=MODEL_NAME,
    num_field_labels=len(LABELS),
    tag_vocab_size=len(TAG_VOCAB),
    parent_tag_vocab_size=len(PARENT_TAG_VOCAB),
    d_model=256,
    nhead=8,
    num_layers=4
).to(device)

# Make sure the field loss (including its class-weight tensor) lives on the
# same device as the model. A failure here should surface, not be swallowed.
field_loss_fn = field_loss_fn.to(device)

# Boundary loss used for training and evaluation: focal loss. This replaces
# any boundary_loss_fn defined earlier in the notebook.
boundary_loss_fn = FocalLossWithLogits(alpha=0.85, gamma=2.0).to(device)

optimizer = optim.AdamW(model.parameters(), lr=2e-5)


@torch.no_grad()
def find_best_threshold(loader):
    """Pick the boundary-probability threshold with the highest node-level F1.

    Runs the module-level `model` in eval mode over `loader`, collects sigmoid
    boundary probabilities and true boundary labels for real (unmasked) nodes,
    and scans 19 evenly spaced thresholds from 0.05 to 0.95. On ties the lowest
    threshold wins.

    Returns:
        (best_threshold, best_f1)
    """
    model.eval()
    prob_chunks, true_chunks = [], []

    for batch in loader:
        inputs = {name: tensor.to(device) for name, tensor in batch["enc"].items()}
        mask = batch["node_mask"].to(device).bool()

        _, boundary_logits = model(
            enc=inputs,
            node_offsets=batch["node_offsets"],
            node_mask=mask,
            tag_id=batch["tag_id"].to(device),
            parent_tag_id=batch["parent_tag_id"].to(device),
            num_feats=batch["num_feats"].to(device),
            bool_feats=batch["bool_feats"].to(device)
        )

        # Keep only real nodes; padding would distort the counts below.
        prob_chunks.append(torch.sigmoid(boundary_logits[mask]).detach().cpu().numpy())
        true_chunks.append(batch["boundary_y"][mask.cpu()].numpy().astype(int))

    probs = np.concatenate(prob_chunks)
    true = np.concatenate(true_chunks)

    is_pos = true == 1
    is_neg = true == 0

    best_th, best_f1 = 0.5, -1.0
    for th in np.linspace(0.05, 0.95, 19):
        flagged = probs >= th
        tp = (flagged & is_pos).sum()
        fp = (flagged & is_neg).sum()
        fn = (~flagged & is_pos).sum()

        # Small epsilons keep the ratios defined when a count is zero.
        precision = tp / (tp + fp + 1e-9)
        recall    = tp / (tp + fn + 1e-9)
        f1        = 2 * precision * recall / (precision + recall + 1e-9)

        if f1 > best_f1:
            best_th, best_f1 = th, f1

    return best_th, best_f1

def run_epoch(loader, training: bool = True, boundary_weight: float = 3.0):
    """Run one pass over `loader` and return the mean per-batch loss.

    Uses the module-level `model`, `optimizer`, `field_loss_fn`,
    `boundary_loss_fn` and `device`.

    Args:
        loader: DataLoader yielding batches produced by `collate_fn`.
        training: if True, the model is put in train mode and the optimizer is
            stepped after every batch; otherwise the model is in eval mode and
            gradients are disabled.
        boundary_weight: multiplier on the boundary loss in the total
            `field_loss + boundary_weight * boundary_loss`. Defaults to 3.0,
            the same default `eval_on_loader` uses.

    Returns:
        Sum of the batch losses divided by the number of batches (0.0 for an
        empty loader).
    """
    model.train(training)
    total_loss = 0.0

    for batch in loader:
        enc = {k: v.to(device) for k, v in batch["enc"].items()}
        node_mask = batch["node_mask"].to(device).bool()  # boolean mask for indexing

        tag_id = batch["tag_id"].to(device)
        parent_tag_id = batch["parent_tag_id"].to(device)
        num_feats = batch["num_feats"].to(device)
        bool_feats = batch["bool_feats"].to(device)

        field_y = batch["field_y"].to(device)
        boundary_y = batch["boundary_y"].to(device)

        with torch.set_grad_enabled(training):
            field_logits, boundary_logits = model(
                enc=enc,
                node_offsets=batch["node_offsets"],
                node_mask=node_mask,
                tag_id=tag_id,
                parent_tag_id=parent_tag_id,
                num_feats=num_feats,
                bool_feats=bool_feats
            )

            # Both losses are computed on real nodes only (padding masked out).
            field_loss = field_loss_fn(
                field_logits[node_mask],     # [num_real_nodes, C]
                field_y[node_mask]           # [num_real_nodes]
            )
            boundary_loss = boundary_loss_fn(
                boundary_logits[node_mask],  # [num_real_nodes]
                boundary_y[node_mask].float()
            )

            loss = field_loss + boundary_weight * boundary_loss

            if training:
                optimizer.zero_grad(set_to_none=True)
                loss.backward()
                # Clip the global gradient norm to 1.0 before stepping.
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

        total_loss += loss.detach().item()

    return total_loss / max(1, len(loader))


# Train for 25 epochs; after each one, report train / validation loss and the
# boundary threshold that maximizes validation F1.
for epoch in range(25):
    train_loss = run_epoch(train_loader, training=True)
    val_loss = run_epoch(val_loader, training=False)
    best_th, best_val_f1 = find_best_threshold(val_loader)

    summary = "\n".join([
        f"Epoch {epoch+1}",
        f"Train loss: {train_loss:.4f}",
        f"Val loss:   {val_loss:.4f}",
    ])
    print(summary)
    print("Best boundary threshold:", best_th, "Best val F1:", best_val_f1)

Loading weights: 100%|██████████| 100/100 [00:00<00:00, 418.94it/s, Materializing param=transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
  output = torch._nested_tensor_from_mask(


Epoch 1
Train loss: 3.1650
Val loss:   2.8848
Best boundary threshold: 0.05 Best val F1: 0.04651162786127636
Epoch 2
Train loss: 2.6886
Val loss:   2.7773
Best boundary threshold: 0.05 Best val F1: 0.04651162786127636
Epoch 3
Train loss: 2.6608
Val loss:   2.6095
Best boundary threshold: 0.05 Best val F1: 0.04651162786127636
Epoch 4
Train loss: 2.6013
Val loss:   2.5364
Best boundary threshold: 0.3 Best val F1: 0.04719764007163181
Epoch 5
Train loss: 2.4817
Val loss:   2.4671
Best boundary threshold: 0.35 Best val F1: 0.046920821068274256
Epoch 6
Train loss: 2.4041
Val loss:   2.4285
Best boundary threshold: 0.39999999999999997 Best val F1: 0.04733727806001191
Epoch 7
Train loss: 2.3314
Val loss:   2.3617
Best boundary threshold: 0.39999999999999997 Best val F1: 0.048632218797239495
Epoch 8
Train loss: 2.2350
Val loss:   2.2605
Best boundary threshold: 0.39999999999999997 Best val F1: 0.057347670194781666
Epoch 9
Train loss: 2.1469
Val loss:   2.2026
Best boundary threshold: 0.35 Best 

In [10]:
import numpy as np
from sklearn.metrics import classification_report, precision_recall_fscore_support

@torch.no_grad()
def eval_on_loader(loader, threshold=0.5, boundary_weight=3.0):
    """Evaluate the module-level `model` on `loader` and print node-level metrics.

    Prints a per-label classification report for the field head and binary
    precision / recall / F1 for the boundary head, where a node is predicted as
    a boundary when its sigmoid probability is >= `threshold`. The loss uses the
    same form as training: `field_loss + boundary_weight * boundary_loss`.

    Returns:
        (mean per-batch loss, (precision, recall, f1)) for the boundary head.
    """
    model.eval()

    loss_sum = 0.0
    field_true, field_pred = [], []
    bound_true, bound_pred = [], []

    for batch in loader:
        node_mask = batch["node_mask"].to(device).bool()
        field_y = batch["field_y"].to(device)
        boundary_y = batch["boundary_y"].to(device)

        field_logits, boundary_logits = model(
            enc={k: v.to(device) for k, v in batch["enc"].items()},
            node_offsets=batch["node_offsets"],
            node_mask=node_mask,
            tag_id=batch["tag_id"].to(device),
            parent_tag_id=batch["parent_tag_id"].to(device),
            num_feats=batch["num_feats"].to(device),
            bool_feats=batch["bool_feats"].to(device)
        )

        # Loss on real nodes only, combined the same way as in training.
        field_loss = field_loss_fn(field_logits[node_mask], field_y[node_mask])
        boundary_loss = boundary_loss_fn(
            boundary_logits[node_mask], boundary_y[node_mask].float()
        )
        loss_sum += (field_loss + boundary_weight * boundary_loss).item()

        # Field predictions: argmax over labels, kept where a real label exists.
        labelled = node_mask & (field_y != -100)
        predicted_field = torch.argmax(field_logits, dim=-1)
        field_true.extend(field_y[labelled].detach().cpu().tolist())
        field_pred.extend(predicted_field[labelled].detach().cpu().tolist())

        # Boundary predictions: threshold the sigmoid probability.
        predicted_bound = (torch.sigmoid(boundary_logits) >= threshold).long()
        bound_true.extend(boundary_y[node_mask].detach().cpu().long().tolist())
        bound_pred.extend(predicted_bound[node_mask].detach().cpu().tolist())

    avg_loss = loss_sum / max(1, len(loader))

    label_ids = list(range(len(LABELS)))
    print("=== Field Label Metrics (node-level) ===")
    print(classification_report(
        field_true,
        field_pred,
        labels=label_ids,
        target_names=[id2label[i] for i in label_ids],
        digits=4,
        zero_division=0
    ))

    p, r, f1, _ = precision_recall_fscore_support(
        bound_true, bound_pred, average="binary", zero_division=0
    )
    print("=== Boundary Metrics (node-level) ===")
    print(f"threshold={threshold}")
    print(f"Precision: {p:.4f}  Recall: {r:.4f}  F1: {f1:.4f}")
    print(f"\nLoss: {avg_loss:.4f}")

    return avg_loss, (p, r, f1)

# 1) Pick the boundary threshold on the validation split.
best_th, best_val_f1 = find_best_threshold(val_loader)
print("Using best_th from val:", best_th, "best_val_f1:", best_val_f1)

# 2) Evaluate the test split with that threshold (not the 0.5 default).
# Keep the returned metrics under real names: binding them to `_` would discard
# them and shadow IPython's last-output variable.
test_loss, (test_precision, test_recall, test_f1) = eval_on_loader(
    test_loader, threshold=best_th, boundary_weight=3.0
)

Using best_th from val: 0.39999999999999997 best_val_f1: 0.43243243207012416


=== Field Label Metrics (node-level) ===
              precision    recall  f1-score   support

        Date     0.4121    0.6071    0.4910       112
    DateTime     0.0000    0.0000    0.0000         0
 Description     0.2083    0.1667    0.1852        30
     EndTime     0.0000    0.0000    0.0000         0
    Location     0.8780    0.8182    0.8471        44
        Name     0.0000    0.0000    0.0000         0
    NameLink     0.1020    0.1667    0.1266        30
NameLocation     0.0000    0.0000    0.0000         0
       Other     0.9350    0.7554    0.8357       552
StartEndTime     0.5116    0.4231    0.4632        52
   StartTime     0.0000    0.0000    0.0000         0
        Time     0.0000    0.0000    0.0000         0
TimeLocation     0.0000    0.0000    0.0000         0

    accuracy                         0.6744       820
   macro avg     0.2344    0.2259    0.2268       820
weighted avg     0.7766    0.6744    0.7158       820

=== Boundary Metrics (node-level) ===
