# Improvements: Top-N groups + hard-negative mining (Cross-Encoder)

This notebook upgrades the **hierarchical + description-aware cross-encoder** solution by adding:

1) **Top-N groups** shortlist (e.g., N=3) instead of top-1 group  
2) **Hard-negative mining** from model confusions within the true group (cross-group mining from the top-N shortlist is a possible extension; it is not implemented in this notebook)

You get:
- Baseline evaluation (top-1 group, random negatives)
- Improved evaluation (top-N groups)
- Fine-tuned cross-encoder with mined hard negatives
- Final comparison table

## Inputs
- `dataset.csv`: columns `text`, `demand_id`, `relevant`/`relevance`, `group_id`
- `labels.csv`: columns `demand_id`, `description` (and optionally `group_id`)



In [None]:
# =========================
# CONFIG (EDIT THESE)
# =========================
from pathlib import Path

# Input files
DATASET_CSV = Path("dataset.csv")
LABELS_CSV  = Path("labels.csv")

# Column names in the input files
TEXT_COL   = "text"
DEMAND_COL = "demand_id"
GROUP_COL  = "group_id"
REL_COL_CANDIDATES = ["relevant", "relevance"]  # first name found in the dataset is used

SEED = 42
TEST_SIZE = 0.30

# Stage A: Group classifier
TFIDF_NGRAM = (1, 3)
TOP_N_GROUPS = 3          # <-- improvement: evaluate with top-3 groups
TOP_K_LABELS = 5          # report top-5

# Cross-Encoder backbone (English)
CE_MODEL = "microsoft/deberta-v3-base"
CE_MAX_LEN = 256
CE_EPOCHS_BASE = 2
CE_EPOCHS_HARD = 1        # extra fine-tune epochs on hard negatives
CE_BS = 8
CE_LR = 2e-5

# Pairing
NEG_PER_POS_RANDOM = 3    # baseline random negatives within group
HARD_NEG_PER_POS = 2      # mined hard negatives per positive

# Mining settings
# "train_sample": mine from a sample of the training split (default).
# "val": mine from the validation split. WARNING: the mined pairs (including the
# validation positives) are added to the fine-tuning data, and the fine-tuned model is
# then evaluated on that same validation split, so "val" leaks evaluation data into
# training and makes the after-mining metrics optimistic.
MINE_FROM = "train_sample"
MAX_CANDIDATES_PER_GROUP_FOR_MINING = 128  # speed cap: sample candidate labels per group when mining (None for all)



In [None]:
import time
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report

import torch
import torch.nn.functional as F
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Pandas display limits used when frames are rendered in cell outputs.
pd.set_option("display.width", 140)
pd.set_option("display.max_columns", 200)

# One shared NumPy generator, seeded from the config; negative sampling and
# candidate sub-sampling later in the notebook draw from it.
RNG = np.random.default_rng(seed=SEED)

# Run the cross-encoder on the GPU when torch reports one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)


## 1) Load + merge + keep relevant==1

In [None]:
df = pd.read_csv(DATASET_CSV)
labels = pd.read_csv(LABELS_CSV)

# Detect the relevance column: the first candidate name present in the dataset wins.
rel_col = next((c for c in REL_COL_CANDIDATES if c in df.columns), None)
if rel_col is None:
    raise ValueError(f"Missing relevance column. Tried: {REL_COL_CANDIDATES}. Have: {list(df.columns)}")

# Fail early on missing required columns.
for c in [TEXT_COL, DEMAND_COL]:
    if c not in df.columns:
        raise ValueError(f"Dataset missing column: {c}")
for c in [DEMAND_COL, "description"]:
    if c not in labels.columns:
        raise ValueError(f"Labels file missing column: {c}")

# Missing relevance counts as "not relevant"; demand ids are compared as strings on both sides.
df[rel_col] = df[rel_col].fillna(0).astype(int)
df[DEMAND_COL] = df[DEMAND_COL].astype(str)
labels[DEMAND_COL] = labels[DEMAND_COL].astype(str)

# Join against one row per demand_id. A repeated demand_id in labels.csv would otherwise
# multiply the matching dataset rows in the left merges below. The first occurrence is
# kept, matching the keep="first" de-duplication used when the candidate lists are built.
labels_unique = labels.drop_duplicates(subset=[DEMAND_COL], keep="first")

# bring group_id if missing
if GROUP_COL not in df.columns and GROUP_COL in labels.columns:
    df = df.merge(labels_unique[[DEMAND_COL, GROUP_COL]], on=DEMAND_COL, how="left")
if GROUP_COL not in df.columns:
    raise ValueError(f"Missing {GROUP_COL}. Provide in dataset.csv or labels.csv and set GROUP_COL.")

# merge descriptions into df
# labels.csv is the source of descriptions. A same-named column already present in the
# dataset is dropped first; otherwise the merge would emit description_x/description_y
# and the dropna below would raise KeyError.
df = df.drop(columns=["description"], errors="ignore")
df = df.merge(labels_unique[[DEMAND_COL, "description"]], on=DEMAND_COL, how="left")

# Keep only relevant rows that have every field the later stages need.
df_rel = df[df[rel_col] == 1].dropna(subset=[TEXT_COL, DEMAND_COL, GROUP_COL, "description"]).copy()
df_rel[GROUP_COL] = df_rel[GROUP_COL].astype(str)

print("Relevant rows:", len(df_rel))
print("Unique labels:", df_rel[DEMAND_COL].nunique(), "Unique groups:", df_rel[GROUP_COL].nunique())
df_rel.head(2)


## 2) Shared train/val split (stratify by group)

In [None]:
# Stratify on the group column when more than one group exists; with a single
# group no stratification labels are passed.
_stratify_on = df_rel[GROUP_COL] if df_rel[GROUP_COL].nunique() > 1 else None

train_df, val_df = train_test_split(
    df_rel,
    stratify=_stratify_on,
    random_state=SEED,
    test_size=TEST_SIZE,
)

print("Train:", train_df.shape, "Val:", val_df.shape)


## 3) Stage A — Group classifier (shared)

In [None]:
# Stage A: predict the group of a text from word n-gram TF-IDF features with a linear SVM,
# then wrap it in a sigmoid calibrator so that predict_proba is available for top-N ranking.
t0 = time.time()

# TF-IDF ignores terms seen in fewer than 2 documents or in more than 95% of documents.
group_pipe = Pipeline([
    ("tfidf", TfidfVectorizer(min_df=2, max_df=0.95, ngram_range=TFIDF_NGRAM)),
    ("clf", LinearSVC(class_weight="balanced", max_iter=8000)),
])
# NOTE(review): with cv=3 the calibrator below presumably refits clones of this pipeline,
# which would make this standalone fit redundant for group_cal — confirm before removing.
group_pipe.fit(train_df[TEXT_COL].astype(str), train_df[GROUP_COL])

# Calibrated wrapper used everywhere else in the notebook (predict / predict_proba / classes_).
# NOTE(review): 3-fold calibration presumably needs at least 3 training rows per group — confirm.
group_cal = CalibratedClassifierCV(group_pipe, method="sigmoid", cv=3)
group_cal.fit(train_df[TEXT_COL].astype(str), train_df[GROUP_COL])

# Top-1 group quality on the validation split.
pred_groups = group_cal.predict(val_df[TEXT_COL].astype(str))
print(classification_report(val_df[GROUP_COL], pred_groups, zero_division=0))
print("Stage A trained in %.1fs" % (time.time() - t0))


## 4) Build group -> candidate labels (demand_id, description)

In [None]:
# Candidate catalogue: for every group, the (demand_id, description) pairs of the labels
# that occur in at least one relevant row of that group.
#
# Only demand_id/description are taken from `labels`. If labels.csv also carries the group
# column, merging the full frame would yield suffixed group columns (<group>_x / <group>_y)
# and the dropna on GROUP_COL below would raise KeyError.
labels_rel = labels[[DEMAND_COL, "description"]].merge(
    df_rel[[DEMAND_COL, GROUP_COL]].drop_duplicates(),
    on=DEMAND_COL,
    how="inner",
)
labels_rel = labels_rel.dropna(subset=[GROUP_COL, "description"]).copy()
labels_rel[GROUP_COL] = labels_rel[GROUP_COL].astype(str)

group_to_labels = {}
for g, sub in labels_rel.groupby(GROUP_COL):
    # dedupe: one description per demand_id within a group (first occurrence wins)
    sub = sub.drop_duplicates(subset=[DEMAND_COL], keep="first")
    group_to_labels[str(g)] = list(zip(sub[DEMAND_COL].astype(str).tolist(), sub["description"].astype(str).tolist()))

print("Groups with candidates:", len(group_to_labels))
print("Example group sizes:", sorted([(g, len(v)) for g, v in group_to_labels.items()], key=lambda x: -x[1])[:5])


## 5) Cross-Encoder baseline training (random negatives within group)

In [None]:
def make_random_pairs(df_part: pd.DataFrame, neg_per_pos: int) -> pd.DataFrame:
    """Build cross-encoder pairs with random within-group negatives.

    Every row of ``df_part`` yields one positive pair (its text with its own
    description, ``labels == 1``) followed by up to ``neg_per_pos`` negative pairs
    (``labels == 0``). Negative descriptions are drawn without replacement, using the
    module-level ``RNG``, from the other labels of the row's group in
    ``group_to_labels``. Rows whose group has no other label get only the positive.

    Args:
        df_part: Frame with the text, demand, group and ``description`` columns.
        neg_per_pos: Maximum number of random negatives per positive.

    Returns:
        Frame with columns ``text``, ``description``, ``labels``, ``group`` and
        ``true_demand``. The columns are present even when ``df_part`` is empty, so
        downstream column access does not fail.
    """
    columns = ["text", "description", "labels", "group", "true_demand"]
    rows = []
    for _, r in df_part.iterrows():
        text = str(r[TEXT_COL])
        demand = str(r[DEMAND_COL])
        group = str(r[GROUP_COL])

        rows.append({"text": text, "description": str(r["description"]), "labels": 1, "group": group, "true_demand": demand})

        # Descriptions of every other label in the same group.
        neg_descs = [desc for d, desc in group_to_labels.get(group, []) if d != demand]
        if not neg_descs:
            continue

        take = min(neg_per_pos, len(neg_descs))
        # With an integer `size`, RNG.choice already returns a 1-D index array.
        for i in RNG.choice(len(neg_descs), size=take, replace=False):
            rows.append({"text": text, "description": str(neg_descs[int(i)]), "labels": 0, "group": group, "true_demand": demand})

    return pd.DataFrame(rows, columns=columns)

# Train pairs are built before validation pairs; both draw from the same shared RNG,
# so this order determines which negatives each split receives.
ce_train_pairs = make_random_pairs(train_df, neg_per_pos=NEG_PER_POS_RANDOM)
ce_val_pairs = make_random_pairs(val_df, neg_per_pos=NEG_PER_POS_RANDOM)

# Shape and share of positive pairs for each split.
print("Train pairs:", ce_train_pairs.shape, "pos rate:", ce_train_pairs["labels"].mean())
print("Val pairs  :", ce_val_pairs.shape, "pos rate:", ce_val_pairs["labels"].mean())


In [None]:
# Tokenizer for the backbone named by CE_MODEL; used both to tokenize training pairs
# and to encode (text, description) pairs at scoring time.
ce_tokenizer = AutoTokenizer.from_pretrained(CE_MODEL)

def ce_tok(batch):
    """Tokenize a batch of (text, description) pairs for the cross-encoder.

    ``batch`` must provide ``"text"`` and ``"description"`` entries; each text is
    encoded together with its description. Sequences are truncated and padded to
    exactly ``CE_MAX_LEN`` tokens.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": CE_MAX_LEN,
    }
    return ce_tokenizer(batch["text"], batch["description"], **encode_options)

def to_hf_ds(pairs_df: pd.DataFrame) -> Dataset:
    """Convert a pairs frame into a tokenized, torch-formatted dataset.

    Only the ``text``, ``description`` and ``labels`` columns of ``pairs_df`` are used.
    After tokenization with ``ce_tok`` the dataset exposes just ``input_ids``,
    ``attention_mask`` and ``labels`` as torch tensors.
    """
    pair_columns = pairs_df[["text", "description", "labels"]]
    tokenized = Dataset.from_pandas(pair_columns, preserve_index=False).map(ce_tok, batched=True)
    tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return tokenized

# Tokenized train/validation pair datasets for the baseline cross-encoder.
ce_train_ds = to_hf_ds(ce_train_pairs)
ce_val_ds = to_hf_ds(ce_val_pairs)

# Binary head: class 1 = "description matches the text", class 0 = "does not match"
# (the pair builders label positives 1 and negatives 0).
ce_model = AutoModelForSequenceClassification.from_pretrained(CE_MODEL, num_labels=2).to(device)

# Evaluate and checkpoint once per epoch, and keep the checkpoint with the lowest
# validation loss at the end of training.
# NOTE(review): newer transformers releases reportedly rename `evaluation_strategy`
# to `eval_strategy` — confirm against the installed version.
ce_args = TrainingArguments(
    output_dir="ce_improve_out",
    learning_rate=CE_LR,
    per_device_train_batch_size=CE_BS,
    per_device_eval_batch_size=CE_BS,
    num_train_epochs=CE_EPOCHS_BASE,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is available
    logging_steps=50,
    report_to="none",
    seed=SEED,
)

ce_trainer = Trainer(model=ce_model, args=ce_args, train_dataset=ce_train_ds, eval_dataset=ce_val_ds)
t0 = time.time()
ce_trainer.train()
print("Baseline CE trained in %.1fs" % (time.time() - t0))


## 6) Evaluation helper: Top-N groups → score union of candidate labels

In [None]:
@torch.no_grad()
def ce_score_probs(text: str, descriptions: list[str], batch_size: int = 64) -> np.ndarray:
    """Score one text against candidate descriptions with the cross-encoder.

    The model is switched to eval mode for the duration of the call, so that layers
    which behave differently in training (e.g. dropout) do not make the scores
    stochastic, and its previous mode is restored afterwards. Candidates are scored in
    chunks of at most ``batch_size`` pairs so that large candidate sets do not have to
    fit into a single forward pass.

    Args:
        text: The query text; it is paired with every description.
        descriptions: Candidate label descriptions.
        batch_size: Maximum number of (text, description) pairs per forward pass.

    Returns:
        1-D array holding, for each description in input order, the softmax
        probability of class index 1. Empty when ``descriptions`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if len(descriptions) == 0:
        # Nothing to score; avoid handing an empty batch to the tokenizer/model.
        return np.zeros(0, dtype=np.float32)

    was_training = ce_model.training
    ce_model.eval()
    try:
        chunks = []
        for start in range(0, len(descriptions), batch_size):
            chunk = list(descriptions[start:start + batch_size])
            batch = ce_tokenizer(
                [text] * len(chunk), chunk,
                padding=True, truncation=True, max_length=CE_MAX_LEN, return_tensors="pt",
            )
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = ce_model(**batch).logits
            chunks.append(F.softmax(logits, dim=-1)[:, 1].cpu().numpy())
    finally:
        # Leave the model in whatever mode the caller had it in.
        ce_model.train(was_training)
    return np.concatenate(chunks)

def top_n_groups_for_text(texts: pd.Series, n: int) -> np.ndarray:
    """Return the ``n`` most probable groups for every text.

    Uses the calibrated group classifier ``group_cal``. The result has one row per
    text, holding group labels ordered from most to least probable.
    """
    group_probs = group_cal.predict_proba(texts.astype(str))
    # Column indices sorted by descending probability, per row.
    ranked_columns = np.argsort(-group_probs, axis=1)
    return group_cal.classes_[ranked_columns[:, :n]]

def predict_topk_union(text: str, groups: list[str], k: int) -> list[str]:
    """Rank the pooled candidate labels of several groups for one text.

    Candidates of all ``groups`` are pooled from ``group_to_labels``. A demand id that
    appears more than once keeps its first description. The pooled descriptions are
    scored with ``ce_score_probs`` and the demand ids of the ``k`` best scores are
    returned, best first. Returns an empty list when no group has candidates.
    """
    pooled = [pair for g in groups for pair in group_to_labels.get(str(g), [])]
    if not pooled:
        return []

    # Insertion-ordered de-duplication: the first description seen for an id wins.
    first_description = {}
    for did, desc in pooled:
        first_description.setdefault(did, desc)
    demand_ids = list(first_description)
    descs = list(first_description.values())

    scores = ce_score_probs(text, descs)
    best = np.argsort(-scores)[:k]
    return [demand_ids[i] for i in best]

def evaluate_topk(df_part: pd.DataFrame, top_n_groups: int, top_k_labels: int):
    """Measure top-1 and top-k label accuracy with a top-N group shortlist.

    For every row, the ``top_n_groups`` most probable groups are taken from the group
    classifier, their candidates are ranked by the cross-encoder, and the true demand
    id is compared against the first and the first ``top_k_labels`` predictions.
    Rows for which no candidate could be ranked are skipped and do not count towards
    the denominators; ``eval_rows`` reports how many rows were actually scored.

    Returns:
        Dict with ``eval_rows``, ``top1`` and ``top<k>`` (k = ``top_k_labels``).
    """
    shortlist = top_n_groups_for_text(df_part[TEXT_COL], top_n_groups)
    gold = df_part[DEMAND_COL].astype(str).tolist()
    texts = df_part[TEXT_COL].astype(str).tolist()

    hits_at_1 = 0
    hits_at_k = 0
    scored = 0
    for text, true_lab, g_row in zip(texts, gold, shortlist):
        preds = predict_topk_union(text, list(g_row), top_k_labels)
        if preds:
            scored += 1
            hits_at_1 += int(preds[0] == true_lab)
            hits_at_k += int(true_lab in preds)

    denominator = max(1, scored)  # avoids division by zero when nothing was scored
    return {
        "eval_rows": scored,
        "top1": hits_at_1 / denominator,
        f"top{top_k_labels}": hits_at_k / denominator,
    }

# Baseline cross-encoder on the validation split: candidates from the single best
# group versus candidates pooled from the TOP_N_GROUPS best groups.
baseline_top1group = evaluate_topk(val_df, top_k_labels=TOP_K_LABELS, top_n_groups=1)
baseline_topNgroups = evaluate_topk(val_df, top_k_labels=TOP_K_LABELS, top_n_groups=TOP_N_GROUPS)

print("Baseline CE + top-1 group:", baseline_top1group)
print(f"Baseline CE + top-{TOP_N_GROUPS} groups:", baseline_topNgroups)


## 7) Hard-negative mining (within-group), then extra fine-tune

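Mining strategy (simple and effective):

For each (text, true_label) in the mining set:
- score all candidate labels in its **true group** (or a sampled subset, for speed)
- take the highest-scoring **incorrect** labels as **hard negatives**
- add those pairs to the training data and fine-tune for 1 more epoch

**Caution:** if the mining set is the validation split, the fine-tuned model has trained on the very rows it is re-evaluated on in section 8, so the post-mining metrics are optimistic. Mine from training data (`MINE_FROM = "train_sample"`) for an unbiased comparison.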

This targets the main pain point: *ultra-similar labels within the same group*.


In [None]:
def mine_hard_negatives(df_part: pd.DataFrame, hard_neg_per_pos: int) -> pd.DataFrame:
    """Mine within-group hard negatives from the current cross-encoder.

    For every row of ``df_part`` the candidate labels of the row's *true* group are
    scored with ``ce_score_probs``; the highest-scoring candidates whose demand id
    differs from the true one become hard negatives. Groups larger than
    ``MAX_CANDIDATES_PER_GROUP_FOR_MINING`` are sub-sampled with the shared ``RNG``
    for speed. Rows whose group has no candidates are skipped.

    Each mined row contributes its positive pair (text with its own description,
    ``labels == 1``) followed by at most ``hard_neg_per_pos`` negative pairs
    (``labels == 0``). Because the positives are returned as well, callers should not
    mine from a split they later evaluate on.

    Args:
        df_part: Frame with the text, demand, group and ``description`` columns.
        hard_neg_per_pos: Maximum number of hard negatives per positive; values
            <= 0 yield positives only.

    Returns:
        Frame with columns ``text``, ``description`` and ``labels``. The columns are
        present even when nothing was mined, so downstream column access does not fail.
    """
    columns = ["text", "description", "labels"]
    n_hard = max(0, hard_neg_per_pos)
    rows = []
    for _, r in df_part.iterrows():
        text = str(r[TEXT_COL])
        true_demand = str(r[DEMAND_COL])
        group = str(r[GROUP_COL])
        cand = group_to_labels.get(group, [])
        if not cand:
            continue

        # optional sampling for speed on big groups
        if MAX_CANDIDATES_PER_GROUP_FOR_MINING and len(cand) > MAX_CANDIDATES_PER_GROUP_FOR_MINING:
            idx = RNG.choice(len(cand), size=MAX_CANDIDATES_PER_GROUP_FOR_MINING, replace=False)
            cand = [cand[int(i)] for i in idx]

        demand_ids = [d for d, _ in cand]
        descs = [desc for _, desc in cand]

        scores = ce_score_probs(text, descs)

        # Incorrect candidates ordered from most to least confusable; keep the top n_hard.
        # Slicing (rather than append-then-check) also makes n_hard == 0 yield no negatives.
        ranked_wrong = [descs[int(i)] for i in np.argsort(-scores) if demand_ids[int(i)] != true_demand]

        # Positive first (description taken from the row itself), then its hard negatives.
        rows.append({"text": text, "description": str(r["description"]), "labels": 1})
        rows.extend({"text": text, "description": str(desc_neg), "labels": 0} for desc_neg in ranked_wrong[:n_hard])

    return pd.DataFrame(rows, columns=columns)

# Choose the rows to mine from. "val" selects the validation split; any other value
# selects up to 2000 rows sampled from the training split.
# NOTE: the mined pairs are added to the fine-tuning data, so mining from val_df means
# the model is later evaluated on rows it has been trained on.
if MINE_FROM == "val":
    mine_src = val_df
else:
    mine_src = train_df.sample(min(len(train_df), 2000), random_state=SEED)

hard_pairs = mine_hard_negatives(mine_src, HARD_NEG_PER_POS)
print("Hard mined pairs:", hard_pairs.shape, "pos rate:", hard_pairs["labels"].mean())
hard_pairs.head(3)


In [None]:
# Fine-tune on union(train_pairs + hard_pairs)
# (Keep it small: 1 epoch is typically enough)

# Baseline training pairs (only the three model columns) followed by the mined pairs.
ft_pairs = pd.concat([ce_train_pairs[["text","description","labels"]], hard_pairs], ignore_index=True)

ft_train_ds = to_hf_ds(ft_pairs)

# reuse same model/tokenizer, continue training
# No evaluation and no checkpointing during this short continuation run.
# NOTE(review): since no evaluation pass runs here, nothing in this cell appears to
# switch ce_model back to eval mode after training — confirm that the scoring code
# sets eval mode itself before the model is used for inference.
ce_args_hard = TrainingArguments(
    output_dir="ce_improve_out_hard",
    learning_rate=CE_LR,
    per_device_train_batch_size=CE_BS,
    per_device_eval_batch_size=CE_BS,
    num_train_epochs=CE_EPOCHS_HARD,
    evaluation_strategy="no",
    save_strategy="no",
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    report_to="none",
    seed=SEED,
)

# The same ce_model object is trained further, so the baseline weights are overwritten in place.
ce_trainer_hard = Trainer(model=ce_model, args=ce_args_hard, train_dataset=ft_train_ds)
t0 = time.time()
ce_trainer_hard.train()
print("Hard-negative fine-tune done in %.1fs" % (time.time() - t0))


## 8) Re-evaluate after hard-negative fine-tune

In [None]:
# Same two evaluations as for the baseline, now with the fine-tuned cross-encoder
# (ce_model was updated in place by the hard-negative fine-tune).
hard_top1group = evaluate_topk(val_df, top_k_labels=TOP_K_LABELS, top_n_groups=1)
hard_topNgroups = evaluate_topk(val_df, top_k_labels=TOP_K_LABELS, top_n_groups=TOP_N_GROUPS)

print("After hard-neg CE + top-1 group:", hard_top1group)
print(f"After hard-neg CE + top-{TOP_N_GROUPS} groups:", hard_topNgroups)


## 9) Summary table

In [None]:
# One row per evaluated variant: the variant name followed by its metric columns.
summary = pd.DataFrame([
    {"variant": variant_name, **metrics}
    for variant_name, metrics in (
        ("baseline_CE_top1group", baseline_top1group),
        (f"baseline_CE_top{TOP_N_GROUPS}groups", baseline_topNgroups),
        ("hardneg_CE_top1group", hard_top1group),
        (f"hardneg_CE_top{TOP_N_GROUPS}groups", hard_topNgroups),
    )
])
summary


## 10) Save artifacts

This saves:
- group model (calibrated)
- cross-encoder model + tokenizer
- group_to_labels mapping

Use the final fine-tuned model for inference.


In [None]:
import json, joblib
from pathlib import Path

# Everything needed for inference is written below this directory.
OUT_DIR = Path("demand_ce_improved_artifacts")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Cross-encoder weights and tokenizer share one sub-directory.
ce_tokenizer.save_pretrained(OUT_DIR / "cross_encoder")
ce_model.save_pretrained(OUT_DIR / "cross_encoder")

# Calibrated group classifier (Stage A).
joblib.dump(group_cal, OUT_DIR / "group_model_calibrated.joblib")

# Candidate catalogue as JSON: group -> list of {"demand_id", "description"} records.
mapping = {
    g: [dict(demand_id=did, description=desc) for did, desc in pairs]
    for g, pairs in group_to_labels.items()
}
(OUT_DIR / "group_to_labels.json").write_text(json.dumps(mapping, indent=2), encoding="utf-8")

print("Saved to:", OUT_DIR.resolve())
