# Compare label classification solutions (relevant==1)

This notebook compares **two label models** on the **same train/val split** (relevant-only):

A) **Hierarchical + Description-aware Cross-Encoder**
- Stage A: TF‑IDF + LinearSVC → `group_id`
- Stage B: Cross‑Encoder (text + label description) → scores labels *within the predicted group*
- Report: Top‑1 / Top‑5 accuracy (and optional oracle-group upper bound)

B) **Hierarchical Transformer (shared encoder + group-specific heads)**
- Stage A: TF‑IDF + LinearSVC → `group_id` (same as A for fairness)
- Stage B: Transformer encoder + **per-group head** → predicts label within predicted group
- Report: Top‑1 / Top‑5 accuracy

> Notes:
> - Cross‑encoder usually wins on very fine distinctions but is slower at inference.
> - Hierarchical transformer is faster at inference but can be slightly less accurate on ultra-similar labels.


In [None]:
# =========================
# CONFIG (EDIT THESE)
# =========================
# Every cell below reads these module-level constants; edit them here rather than in place.
from pathlib import Path

DATASET_CSV = Path("dataset.csv")   # must include text, demand_id, relevant/relevance, group_id (or mergeable)
LABELS_CSV  = Path("labels.csv")    # demand_id, description, (optional) group_id

# Column names looked up in the CSV files.
TEXT_COL   = "text"
DEMAND_COL = "demand_id"
GROUP_COL  = "group_id"
# The first of these names found in the dataset is used as the relevance flag.
REL_COL_CANDIDATES = ["relevant", "relevance"]

SEED = 42          # seeds the train/val split, the NumPy generator and the HF Trainer
TEST_SIZE = 0.30   # passed as test_size to train_test_split (validation share of relevant rows)
TOP_K = 5          # k used for the Top-K accuracy reported for both models

# Stage A (group model)
TFIDF_NGRAM = (1,3)  # ngram_range handed to TfidfVectorizer

# Cross-Encoder (A)
CE_MODEL = "microsoft/deberta-v3-base"
CE_MAX_LEN = 256
CE_EPOCHS = 2
CE_BS = 8
CE_LR = 2e-5
CE_NEG_PER_POS = 3  # negatives requested per positive pair; capped by the group's other labels

# Hierarchical Transformer (B)
HT_MODEL = "microsoft/deberta-v3-base"
HT_MAX_LEN = 256
HT_EPOCHS = 3
HT_BS = 8
HT_LR = 2e-5
HT_WEIGHT_DECAY = 0.01  # weight_decay of the AdamW optimizer in the custom training loop
USE_CLASS_WEIGHTS = True  # per-group class weights to help long tail


In [None]:
import time
import json
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report

import torch
import torch.nn as nn
import torch.nn.functional as F

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModel,
    TrainingArguments,
    Trainer,
)

# Widen pandas' display limits so wide frames print without column elision.
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)

# Use CUDA when torch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Shared NumPy generator seeded from SEED; make_pairs draws its negative samples from it,
# so the order in which cells call it affects the sampled negatives.
RNG = np.random.default_rng(SEED)

print("Device:", device)


## 1) Load + merge + keep relevant==1

In [None]:
df = pd.read_csv(DATASET_CSV)
labels = pd.read_csv(LABELS_CSV)

# The relevance flag may be stored under any configured name; the first one present wins.
rel_col = next((name for name in REL_COL_CANDIDATES if name in df.columns), None)
if rel_col is None:
    raise ValueError(f"Missing relevance column. Tried: {REL_COL_CANDIDATES}. Have: {list(df.columns)}")

# Fail fast on the first required column that is absent (dataset first, then labels file).
for frame, required, message in (
    (df, (TEXT_COL, DEMAND_COL), "Dataset missing column: "),
    (labels, (DEMAND_COL, "description"), "Labels file missing column: "),
):
    for c in required:
        if c not in frame.columns:
            raise ValueError(f"{message}{c}")

# Missing relevance values count as "not relevant"; ids are compared as strings on both sides.
df[rel_col] = df[rel_col].fillna(0).astype(int)
df[DEMAND_COL] = df[DEMAND_COL].astype(str)
labels[DEMAND_COL] = labels[DEMAND_COL].astype(str)

# Keep one labels row per demand_id (first occurrence wins) for the merges below.
# A repeated demand_id would otherwise fan out the left merge and duplicate dataset rows,
# and those duplicates could then fall on both sides of the train/val split.
# `labels` itself is left untouched for later cells.
labels_unique = labels.drop_duplicates(subset=[DEMAND_COL], keep="first")

# bring group_id if missing
if GROUP_COL not in df.columns and GROUP_COL in labels.columns:
    df = df.merge(
        labels_unique[[DEMAND_COL, GROUP_COL]],
        on=DEMAND_COL,
        how="left",
        validate="many_to_one",
    )
if GROUP_COL not in df.columns:
    raise ValueError(f"Missing {GROUP_COL}. Provide in dataset.csv or labels.csv and set GROUP_COL.")

# merge descriptions (many dataset rows -> exactly one labels row)
df = df.merge(
    labels_unique[[DEMAND_COL, "description"]],
    on=DEMAND_COL,
    how="left",
    validate="many_to_one",
)

# Relevant rows only, and only those with text, label, group and description all present.
df_rel = df[df[rel_col] == 1].dropna(subset=[TEXT_COL, DEMAND_COL, GROUP_COL, "description"]).copy()
df_rel[GROUP_COL] = df_rel[GROUP_COL].astype(str)

print("Relevant rows:", len(df_rel))
print("Unique labels:", df_rel[DEMAND_COL].nunique(), "Unique groups:", df_rel[GROUP_COL].nunique())
df_rel.head(2)


## 2) Single shared train/val split (stratify by group)

In [None]:
# Stratify on the group column only when there is more than one group to balance.
stratify_on = df_rel[GROUP_COL] if df_rel[GROUP_COL].nunique() > 1 else None

# One split shared by both models so their metrics come from the same partition.
train_df, val_df = train_test_split(
    df_rel, test_size=TEST_SIZE, random_state=SEED, stratify=stratify_on,
)

print("Train:", train_df.shape, "Val:", val_df.shape)


## 3) Stage A — Group classifier (shared for both solutions)

In [None]:
t0 = time.time()

# TF-IDF features feeding a class-balanced linear SVM that predicts the group.
group_pipe = Pipeline([
    ("tfidf", TfidfVectorizer(min_df=2, max_df=0.95, ngram_range=TFIDF_NGRAM)),
    ("clf", LinearSVC(class_weight="balanced", max_iter=8000)),
])

train_text = train_df[TEXT_COL].astype(str)
val_text = val_df[TEXT_COL].astype(str)

# With an integer `cv`, CalibratedClassifierCV clones `group_pipe` and fits the clones on
# its own folds, so fitting `group_pipe` beforehand would only repeat the training work.
# `group_pipe` is therefore handed over unfitted; use `group_cal` for every prediction.
group_cal = CalibratedClassifierCV(group_pipe, method="sigmoid", cv=3)
group_cal.fit(train_text, train_df[GROUP_COL])

pred_groups = group_cal.predict(val_text)
print(classification_report(val_df[GROUP_COL], pred_groups, zero_division=0))
print("Stage A trained in %.1fs" % (time.time() - t0))


## 4) Prepare label candidates per group (demand_id + description)

In [None]:
# The group of each label is taken from the relevant dataset rows. labels.csv may carry its
# own group column; it is dropped first, otherwise the merge would emit suffixed
# `<group>_x` / `<group>_y` columns and the dropna on GROUP_COL below would raise KeyError.
labels_rel = labels.drop(columns=[GROUP_COL], errors="ignore").merge(
    df_rel[[DEMAND_COL, GROUP_COL]].drop_duplicates(), on=DEMAND_COL, how="inner"
)
labels_rel = labels_rel.dropna(subset=[GROUP_COL, "description"]).copy()
labels_rel[GROUP_COL] = labels_rel[GROUP_COL].astype(str)

# group -> [(demand_id, description), ...], keeping the first description seen for a
# demand_id inside each group.
candidate_rows = labels_rel.drop_duplicates(subset=[GROUP_COL, DEMAND_COL], keep="first")
group_to_labels = {
    str(g): list(zip(sub[DEMAND_COL].astype(str).tolist(), sub["description"].astype(str).tolist()))
    for g, sub in candidate_rows.groupby(GROUP_COL)
}

print("Groups with candidates:", len(group_to_labels))
print("Example group sizes:", sorted([(g, len(v)) for g, v in group_to_labels.items()], key=lambda x: -x[1])[:5])


# A) Cross‑Encoder (text + description)

We train a binary scorer on pairs:
- positive: (text, correct description)
- negative: (text, other description within same group)


In [None]:
def make_pairs(df_part: pd.DataFrame, neg_per_pos: int) -> pd.DataFrame:
    """Expand labelled rows into (text, description) pairs for the binary scorer.

    Each input row yields one positive pair (labels=1) built from its own
    description, followed by up to ``neg_per_pos`` negative pairs (labels=0)
    whose descriptions are drawn without replacement, via the shared ``RNG``,
    from the other labels of the same group in ``group_to_labels``.
    Rows whose group has no other label contribute only the positive pair.
    """

    def _pair(text, description, label, group, demand):
        return {"text": text, "description": description, "labels": label, "group": group, "true_demand": demand}

    records = []
    for _, row in df_part.iterrows():
        text, demand, group = str(row[TEXT_COL]), str(row[DEMAND_COL]), str(row[GROUP_COL])
        records.append(_pair(text, str(row["description"]), 1, group, demand))

        # Descriptions of every other label in the same group are the negative pool.
        other_descs = [desc for did, desc in group_to_labels.get(group, []) if did != demand]
        if other_descs:
            n_neg = min(neg_per_pos, len(other_descs))
            picked = RNG.choice(len(other_descs), size=n_neg, replace=False)
            records.extend(
                _pair(text, str(other_descs[int(j)]), 0, group, demand)
                for j in np.atleast_1d(picked)
            )
    return pd.DataFrame(records)

# Build the pair tables for both splits. The train call runs first, so it consumes the
# shared RNG stream before the validation call does.
ce_train_pairs = make_pairs(train_df, CE_NEG_PER_POS)
ce_val_pairs = make_pairs(val_df, CE_NEG_PER_POS)

# "pos rate" is the mean of the 0/1 `labels` column, i.e. the share of positive pairs.
print("CE pairs train:", ce_train_pairs.shape, "pos rate:", ce_train_pairs["labels"].mean())
print("CE pairs val  :", ce_val_pairs.shape, "pos rate:", ce_val_pairs["labels"].mean())


In [None]:
# Train cross-encoder
t0 = time.time()

ce_tokenizer = AutoTokenizer.from_pretrained(CE_MODEL)

def ce_tok(batch):
    """Tokenize a batch of (text, description) pairs as two-segment inputs.

    Every example is truncated and padded to exactly ``CE_MAX_LEN`` tokens.
    """
    tokenizer_kwargs = {
        "truncation": True,
        "padding": "max_length",
        "max_length": CE_MAX_LEN,
    }
    return ce_tokenizer(batch["text"], batch["description"], **tokenizer_kwargs)

# Only the three columns the model needs are converted; `group` / `true_demand` stay in the
# pandas pair tables.
ce_train_ds = Dataset.from_pandas(ce_train_pairs[["text","description","labels"]], preserve_index=False).map(ce_tok, batched=True)
ce_val_ds   = Dataset.from_pandas(ce_val_pairs[["text","description","labels"]], preserve_index=False).map(ce_tok, batched=True)

# Expose just the tensors fed to the model; other tokenizer outputs are not passed to the Trainer.
ce_train_ds.set_format(type="torch", columns=["input_ids","attention_mask","labels"])
ce_val_ds.set_format(type="torch", columns=["input_ids","attention_mask","labels"])

# Two output classes: index 1 is the "description matches the text" class read back at
# scoring time in ce_score_probs.
ce_model = AutoModelForSequenceClassification.from_pretrained(CE_MODEL, num_labels=2).to(device)

# Evaluate and checkpoint once per epoch, then reload the checkpoint with the lowest eval loss.
# NOTE(review): the checkpoint is selected on the same validation pairs whose rows are later
# used to report Top-K, so the reported CE numbers may be slightly optimistic.
# NOTE(review): newer transformers releases appear to rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
ce_args = TrainingArguments(
    output_dir="ce_compare_out",
    learning_rate=CE_LR,
    per_device_train_batch_size=CE_BS,
    per_device_eval_batch_size=CE_BS,
    num_train_epochs=CE_EPOCHS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    report_to="none",
    seed=SEED,
)

ce_trainer = Trainer(model=ce_model, args=ce_args, train_dataset=ce_train_ds, eval_dataset=ce_val_ds)
ce_trainer.train()

# t0 was started before the tokenizer was loaded, so this covers tokenization and training.
print("Cross-encoder trained in %.1fs" % (time.time() - t0))


In [None]:
# Evaluate cross-encoder demand_id Top-1 / Top-K on validation, using predicted group (fair)
@torch.no_grad()
def ce_score_probs(text: str, descriptions: list[str], batch_size: int = 64) -> np.ndarray:
    """Score one text against candidate label descriptions with the cross-encoder.

    Args:
        text: The input text to classify.
        descriptions: Candidate label descriptions; one score is produced per entry.
        batch_size: Maximum number of (text, description) pairs per forward pass.
            Bounds memory use when a group has many candidate labels.

    Returns:
        1-D array of class-1 ("match") probabilities aligned with ``descriptions``.
        Empty when ``descriptions`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if not descriptions:
        return np.zeros(0, dtype=np.float32)

    # Make scoring independent of whatever mode the last training/eval step left behind;
    # dropout must be off for deterministic scores.
    ce_model.eval()

    chunks = []
    for start in range(0, len(descriptions), batch_size):
        descs = descriptions[start:start + batch_size]
        batch = ce_tokenizer(
            [text] * len(descs),
            descs,
            padding=True,
            truncation=True,
            max_length=CE_MAX_LEN,
            return_tensors="pt",
        )
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        logits = ce_model(**batch).logits
        chunks.append(F.softmax(logits, dim=-1)[:, 1].detach().cpu().numpy())
    return np.concatenate(chunks)

def ce_predict_topk(text: str, group: str, k: int) -> list[str]:
    """Return up to ``k`` demand ids of ``group`` ranked by cross-encoder score.

    Candidates come from ``group_to_labels``; an unknown or empty group yields ``[]``.
    The highest-scoring label comes first.
    """
    candidates = group_to_labels.get(str(group), [])
    if not candidates:
        return []
    demand_ids, descs = (list(column) for column in zip(*candidates))
    ranking = np.argsort(-ce_score_probs(text, descs))
    return [demand_ids[i] for i in ranking[:k]]

# Stage A decides which group's candidates the cross-encoder gets to rank.
group_probs = group_cal.predict_proba(val_df[TEXT_COL].astype(str))
group_classes = group_cal.classes_
val_top1_group = group_classes[group_probs.argmax(axis=1)]

y_true = val_df[DEMAND_COL].astype(str).tolist()
texts = val_df[TEXT_COL].astype(str).tolist()

ce_top1 = ce_topk = valid = 0

for text, true_lab, g in zip(texts, y_true, val_top1_group):
    preds = ce_predict_topk(text, str(g), TOP_K)
    if preds:
        # Rows without any prediction are left out of the denominator.
        valid += 1
        ce_top1 += int(preds[0] == true_lab)
        ce_topk += int(true_lab in preds)

denominator = max(1, valid)
ce_top1_acc = ce_top1 / denominator
ce_topk_acc = ce_topk / denominator

print("CE evaluated rows:", valid, "/", len(val_df))
print("CE Top-1:", ce_top1_acc)
print(f"CE Top-{TOP_K}:", ce_topk_acc)


# B) Hierarchical Transformer (shared encoder + per-group head)

We train a transformer that predicts the fine label **within the group**.
To keep it correct and simple, we:
- encode labels *per group* (local indices)
- use **per-example loss** with the correct group head
- at inference, we use the **predicted group** from Stage A (same as CE for fairness).


In [None]:
# Per-group label vocabularies are derived from the training split only, so validation
# labels cannot leak into the heads' output spaces.
train_groups = train_df[GROUP_COL].astype(str).tolist()

# group -> alphabetically sorted demand ids seen in train for that group
group_label_list = {
    str(g): sorted(set(sub[DEMAND_COL].astype(str)))
    for g, sub in train_df.groupby(GROUP_COL)
}

# group -> {demand_id: local index}, and group -> number of labels (head width)
group_label_to_idx = {g: dict(zip(labs, range(len(labs)))) for g, labs in group_label_list.items()}
group_num_labels = {g: len(labs) for g, labs in group_label_list.items()}

# Validation rows whose label never occurs in train (within the same group) have no output
# slot in the group's head, so they are excluded from the HT evaluation set.
def is_seen(row):
    """Return True when the row's (group, demand id) pair exists in the train-derived maps."""
    group_key, label = str(row[GROUP_COL]), str(row[DEMAND_COL])
    return label in group_label_to_idx.get(group_key, ())

seen_mask = val_df.apply(is_seen, axis=1)
val_seen_df = val_df[seen_mask].copy()

print("Train groups:", len(group_label_list))
print("Val rows (all):", len(val_df), "Val rows (label seen in train):", len(val_seen_df))


In [None]:
# Prepare HF datasets for hierarchical transformer training (train only)
ht_tokenizer = AutoTokenizer.from_pretrained(HT_MODEL)

def ht_tok(batch):
    """Tokenize a batch of texts, truncated and padded to exactly ``HT_MAX_LEN`` tokens."""
    tokenizer_kwargs = {"truncation": True, "padding": "max_length", "max_length": HT_MAX_LEN}
    return ht_tokenizer(batch[TEXT_COL], **tokenizer_kwargs)

def to_examples(frame: pd.DataFrame) -> pd.DataFrame:
    """Project a split onto the columns the hierarchical transformer consumes.

    Args:
        frame: Rows with ``TEXT_COL``, ``GROUP_COL`` and ``DEMAND_COL``. Every
            (group, demand id) pair must exist in ``group_label_to_idx``.

    Returns:
        A frame with ``TEXT_COL``, ``group`` (group id as string) and
        ``label_local`` (int64 index of the label inside its group's head).

    Raises:
        KeyError: If a row's group or demand id is missing from ``group_label_to_idx``.
    """
    out = frame[[TEXT_COL, GROUP_COL, DEMAND_COL]].copy()
    out[GROUP_COL] = out[GROUP_COL].astype(str)
    out[DEMAND_COL] = out[DEMAND_COL].astype(str)
    out["group"] = out[GROUP_COL]
    # A plain lookup over the two columns instead of DataFrame.apply(axis=1): it avoids
    # building a Series per row, and it still yields an (empty) int64 column when the frame
    # has no rows, where apply would hand back a DataFrame and break the assignment.
    out["label_local"] = np.asarray(
        [group_label_to_idx[g][d] for g, d in zip(out[GROUP_COL], out[DEMAND_COL])],
        dtype=np.int64,
    )
    return out[[TEXT_COL, "group", "label_local"]]

# Validation examples come from val_seen_df, so every label has a local index in its group.
ht_train_ex = to_examples(train_df)
ht_val_ex = to_examples(val_seen_df)

ht_train_ds = Dataset.from_pandas(ht_train_ex, preserve_index=False).map(ht_tok, batched=True)
ht_val_ds   = Dataset.from_pandas(ht_val_ex, preserve_index=False).map(ht_tok, batched=True)

# Only tensor columns are exposed in torch format; the string `group` column is carried
# separately (see build_torch_dataset) because it cannot be turned into a tensor here.
ht_train_ds.set_format(type="torch", columns=["input_ids","attention_mask","label_local"])
ht_val_ds.set_format(type="torch", columns=["input_ids","attention_mask","label_local"])


In [None]:
# Model: shared encoder + per-group head, with per-example loss
class HierTransformer(nn.Module):
    def __init__(self, base_model_name: str, group_num_labels: dict[str,int], class_weights: dict[str, torch.Tensor] | None = None):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(base_model_name)
        hidden = self.encoder.config.hidden_size
        self.group_num_labels = group_num_labels
        self.heads = nn.ModuleDict({g: nn.Linear(hidden, n) for g, n in group_num_labels.items()})
        self.class_weights = class_weights or {}

    def forward(self, input_ids, attention_mask, groups, labels=None):
        enc = self.encoder(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0]  # CLS
        # logits per example computed with its group head
        logits_list = []
        loss = None

        if labels is not None:
            losses = []

        for i in range(enc.size(0)):
            g = groups[i]
            head = self.heads[g]
            logits_i = head(enc[i])  # [n_labels_in_group]
            logits_list.append(logits_i)

            if labels is not None:
                w = self.class_weights.get(g, None)
                loss_fn = nn.CrossEntropyLoss(weight=w.to(logits_i.device) if w is not None else None)
                losses.append(loss_fn(logits_i.unsqueeze(0), labels[i].unsqueeze(0)))

        if labels is not None:
            loss = torch.stack(losses).mean()

        return {"loss": loss, "logits_list": logits_list}


# Optional per-group class weights: inverse label frequency within the group, rescaled so
# the weights of a group average to 1.
class_w = None
if USE_CLASS_WEIGHTS:
    class_w = {}
    for g, sub in ht_train_ex.groupby("group"):
        # sort_index() orders the counts by local label index, matching the head's outputs.
        counts = sub["label_local"].value_counts().sort_index()
        inv_freq = (counts.sum() / (counts + 1e-9)).values.astype(np.float32)
        class_w[str(g)] = torch.tensor(inv_freq / inv_freq.mean(), dtype=torch.float32)

ht_model = HierTransformer(HT_MODEL, group_num_labels, class_weights=class_w).to(device)


In [None]:
# Custom trainer: we need to pass groups and compute loss
class HTTrainer(Trainer):
    """Placeholder Trainer subclass; its loss hook always raises.

    The per-example string groups are not part of the tensor batch, so the
    model cannot be called from here.
    """

    def compute_loss(self, model, inputs, return_outputs=False):
        """Remove ``label_local`` from ``inputs`` and raise ``RuntimeError``."""
        # The label column is still popped before raising, so a batch without it
        # fails with KeyError first.
        inputs.pop("label_local")
        raise RuntimeError("This cell is a placeholder; see next cell for the working dataloader approach.")


## Important note (implementation detail)

HF `Trainer` doesn't natively handle per-example *string* groups cleanly.

To keep the notebook **working and production-realistic**, we train the hierarchical transformer using a small custom
training loop (still simple), and then evaluate Top‑K.

This avoids fragile Trainer hacks and is easier to port into Azure ML.


In [None]:
from torch.utils.data import DataLoader

# Build torch datasets with groups stored alongside tensors
def build_torch_dataset(ds: Dataset, groups: list[str]):
    """Zip the dataset's tensor columns with the per-row group strings.

    Returns a list of ``(input_ids, attention_mask, label_local, group)`` tuples,
    one per row, in dataset order. ``groups`` must have the same length as ``ds``.
    """
    assert len(ds) == len(groups)
    columns = (ds[name] for name in ("input_ids", "attention_mask", "label_local"))
    return list(zip(*columns, groups))

# Group strings are taken from the same frames the HF datasets were built from, so they
# line up row for row with ht_train_ds / ht_val_ds.
train_groups_list = ht_train_ex["group"].astype(str).tolist()
val_groups_list   = ht_val_ex["group"].astype(str).tolist()

# Lists of (input_ids, attention_mask, label_local, group) tuples consumed by the DataLoaders.
train_torch = build_torch_dataset(ht_train_ds, train_groups_list)
val_torch   = build_torch_dataset(ht_val_ds, val_groups_list)

def collate(batch):
    """Stack per-example tensors into batch tensors and keep groups as a plain list.

    ``batch`` is a sequence of ``(input_ids, attention_mask, label, group)`` tuples.
    """

    def _column(pos):
        return [example[pos] for example in batch]

    return {
        "input_ids": torch.stack(_column(0)),
        "attention_mask": torch.stack(_column(1)),
        "labels": torch.stack(_column(2)),
        "groups": _column(3),
    }

# Training batches are reshuffled each epoch; validation order is kept fixed.
train_loader = DataLoader(train_torch, batch_size=HT_BS, shuffle=True, collate_fn=collate)
val_loader   = DataLoader(val_torch, batch_size=HT_BS, shuffle=False, collate_fn=collate)

# Optimizer
# A single AdamW instance over every parameter of ht_model (encoder and all group heads).
optim = torch.optim.AdamW(ht_model.parameters(), lr=HT_LR, weight_decay=HT_WEIGHT_DECAY)

def train_one_epoch():
    """Run one optimisation pass over ``train_loader``.

    Returns the mean of the per-batch losses (0.0 when the loader is empty).
    """
    ht_model.train()
    batch_losses = []
    for batch in train_loader:
        optim.zero_grad()
        out = ht_model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            groups=batch["groups"],
            labels=batch["labels"].to(device),
        )
        loss = out["loss"]
        loss.backward()
        optim.step()
        batch_losses.append(float(loss.detach().cpu()))
    return sum(batch_losses) / max(1, len(batch_losses))

@torch.no_grad()
def eval_loss():
    """Return the mean per-batch loss over ``val_loader`` in eval mode (0.0 when empty)."""
    ht_model.eval()
    batch_losses = [
        float(
            ht_model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                groups=batch["groups"],
                labels=batch["labels"].to(device),
            )["loss"].detach().cpu()
        )
        for batch in val_loader
    ]
    return sum(batch_losses) / max(1, len(batch_losses))

t0 = time.time()
best = 1e9
best_state = None

for epoch in range(1, HT_EPOCHS+1):
    tr = train_one_epoch()
    va = eval_loss()
    print(f"Epoch {epoch}: train_loss={tr:.4f} val_loss={va:.4f}")
    if not (va < best):
        continue
    # New best validation loss: keep a detached CPU copy of every tensor in the state dict.
    best = va
    best_state = {name: tensor.detach().cpu().clone() for name, tensor in ht_model.state_dict().items()}

# Restore the weights of the best epoch, if any epoch improved on the initial bound.
if best_state is not None:
    ht_model.load_state_dict(best_state)

print(f"Hierarchical transformer trained in {time.time() - t0:.1f}s")


In [None]:
# Evaluate hierarchical transformer demand_id Top-1 / Top-K on validation using predicted group (same as CE fairness)

@torch.no_grad()
def ht_predict_topk(text: str, group: str, k: int) -> list[str]:
    """Return up to ``k`` demand ids of ``group`` ranked by the group's head.

    Args:
        text: Input text to classify.
        group: Group whose head and label list are used.
        k: Maximum number of labels to return.

    Returns:
        Demand ids ordered from most to least probable; ``[]`` when ``group``
        has no entry in ``group_label_list``.
    """
    group = str(group)
    if group not in group_label_list:
        return []
    labs = group_label_list[group]

    # Force inference mode so dropout is off regardless of what ran last on the model.
    ht_model.eval()

    # tokenize
    batch = ht_tokenizer(text, truncation=True, padding="max_length", max_length=HT_MAX_LEN, return_tensors="pt")
    # Feed the encoder the same two inputs the training forward pass uses.
    enc = ht_model.encoder(
        input_ids=batch["input_ids"].to(device),
        attention_mask=batch["attention_mask"].to(device),
    ).last_hidden_state[:, 0]
    logits = ht_model.heads[group](enc[0])  # [n_labels]
    probs = F.softmax(logits, dim=-1).detach().cpu().numpy()
    order = np.argsort(-probs)[:k]
    return [labs[i] for i in order]

# Stage A predictions for exactly the rows being scored (val_seen_df aligns with ht_val_ex).
group_probs_seen = group_cal.predict_proba(val_seen_df[TEXT_COL].astype(str))
group_classes = group_cal.classes_
seen_top1_group = group_classes[group_probs_seen.argmax(axis=1)]

y_true_seen = val_seen_df[DEMAND_COL].astype(str).tolist()
texts_seen = val_seen_df[TEXT_COL].astype(str).tolist()

ht_top1 = ht_topk = valid = 0
for text, true_lab, g in zip(texts_seen, y_true_seen, seen_top1_group):
    preds = ht_predict_topk(text, str(g), TOP_K)
    if preds:
        # Rows without any prediction are left out of the denominator.
        valid += 1
        ht_top1 += int(preds[0] == true_lab)
        ht_topk += int(true_lab in preds)

denominator = max(1,valid)
ht_top1_acc = ht_top1 / denominator
ht_topk_acc = ht_topk / denominator

print("HT evaluated rows:", valid, "/", len(val_seen_df))
print("HT Top-1:", ht_top1_acc)
print(f"HT Top-{TOP_K}:", ht_topk_acc)


## 5) Summary comparison table

In [None]:
# `valid` is reused as the counter of both evaluation loops, so by now it holds the
# hierarchical-transformer count only. Recompute the cross-encoder count instead of
# reporting the HT number on both rows: for TOP_K >= 1 the CE loop skips a row exactly
# when its predicted group has no candidate labels.
ce_eval_rows = sum(1 for g in val_top1_group if group_to_labels.get(str(g)))
ht_eval_rows = valid

summary = pd.DataFrame([
    {"model": "Cross-Encoder (desc-aware) + group", "top1": ce_top1_acc, f"top{TOP_K}": ce_topk_acc, "eval_rows": ce_eval_rows},
    {"model": "Hierarchical Transformer (group heads) + group", "top1": ht_top1_acc, f"top{TOP_K}": ht_topk_acc, "eval_rows": ht_eval_rows},
])

summary


## Notes on fairness

- Both models use the **same Stage A group predictor** and the **same val split**.
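- For the hierarchical transformer we also require the val label to exist in train within its group (`val_seen_df`),
  otherwise it is literally un-predictable. The cross-encoder can still score unseen labels if it has descriptions,
  so its metric is computed on the full `val_df`, while the HT metric uses only the conservative seen subset.
  The two rows of the summary table are therefore not computed over identical rows; for a strictly like-for-like
  comparison, re-run the cross-encoder evaluation on `val_seen_df` as well.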
