# Demand label classification (200+ labels) — best practical solution

Recommended approach when the task looks like this:
- many (200+) similar, hierarchical labels
- long-tail imbalance
- label `description` exists
- English text

✅ Pipeline:
1) (separate) Relevance filter -> keep `relevant==1`
2) Group classifier (TF-IDF + LinearSVC) to predict coarse group
3) Description-aware **cross-encoder** to score labels within the predicted group

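This notebook implements steps (2) and (3) and reports end-to-end Top-1 / Top-K accuracy (K = `TOP_K`, 5 by default) on the validation split: each validation text is scored only against the labels of the group predicted in step (2).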

---

## Inputs
- `dataset.csv` with columns: `text`, `demand_id`, `relevant`/`relevance`, and `group_id` (or mergeable from labels file)
- `labels.csv` with columns: `demand_id`, `description` (and optionally `group_id`)

---

## Notes
- Cross-encoder training is heavier; GPU recommended. It can still run on CPU for a smoke-test.


In [None]:
# =========================
# CONFIG (EDIT THESE)
# =========================
from pathlib import Path

# Input files: the labelled dataset and the label catalogue with descriptions.
DATASET_CSV = Path("dataset.csv")
LABELS_CSV  = Path("labels.csv")

# Column names expected in the CSV files.
TEXT_COL = "text"
DEMAND_COL = "demand_id"
GROUP_COL = "group_id"
# The relevance flag is read from the first of these column names found in the dataset.
REL_COL_CANDIDATES = ["relevant", "relevance"]

# SEED feeds the train/val split, the negative-sampling generator and the Trainer.
SEED = 42
# Fraction of the relevant rows held out for validation.
TEST_SIZE = 0.30

# Cross-encoder backbone (English)
CROSS_ENCODER_MODEL = "microsoft/deberta-v3-base"
MAX_LEN = 256  # max token length of a tokenized (text, description) pair
EPOCHS = 2
BATCH_SIZE = 8  # per-device batch size for both training and evaluation
LR = 2e-5

NEG_PER_POS = 3  # negatives sampled per positive pair
TOP_K = 5  # K used for the Top-K accuracy report


In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report

import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Widen pandas' display limits so the frames previewed by later cells are not cut off.
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)

# Shared NumPy generator seeded from SEED; make_pairs() draws its negative samples from it.
RNG = np.random.default_rng(SEED)


## 1) Load data + detect relevance column + merge descriptions

In [None]:
def _normalize_ids(ids: pd.Series) -> pd.Series:
    """Return ``ids`` as strings while keeping missing values missing.

    A plain ``astype(str)`` has two problems for join keys:

    * NaN becomes the literal string ``"nan"``, which later ``dropna`` calls
      can no longer detect.
    * An integer id column that contains blanks is parsed by pandas as float,
      so ``12`` is rendered as ``"12.0"`` and never matches ``"12"`` coming
      from the other file.
    """
    if pd.api.types.is_float_dtype(ids):
        present = ids.dropna()
        if (present % 1 == 0).all():
            # Integral floats: go through nullable Int64 so 12.0 -> "12".
            ids = ids.astype("Int64")
    return ids.astype(str).where(ids.notna(), np.nan)


df = pd.read_csv(DATASET_CSV)
labels = pd.read_csv(LABELS_CSV)

# Detect the relevance column: first candidate name present in the dataset.
rel_col = next((c for c in REL_COL_CANDIDATES if c in df.columns), None)
if rel_col is None:
    raise ValueError(f"Missing relevance column. Tried: {REL_COL_CANDIDATES}. Have: {list(df.columns)}")

# Required columns.
for c in [TEXT_COL, DEMAND_COL]:
    if c not in df.columns:
        raise ValueError(f"Dataset missing required column: {c}")

for c in [DEMAND_COL, "description"]:
    if c not in labels.columns:
        raise ValueError(f"Labels file missing required column: {c}")

# A missing relevance flag is treated as "not relevant".
df[rel_col] = df[rel_col].fillna(0).astype(int)
df[DEMAND_COL] = _normalize_ids(df[DEMAND_COL])
labels[DEMAND_COL] = _normalize_ids(labels[DEMAND_COL])

# Keep exactly one labels row per demand_id; otherwise the left-merges below
# would silently duplicate dataset rows.
n_label_rows = len(labels)
labels = (
    labels.dropna(subset=[DEMAND_COL])
    .drop_duplicates(subset=[DEMAND_COL], keep="first")
    .reset_index(drop=True)
)
if len(labels) != n_label_rows:
    print(f"Labels file: dropped {n_label_rows - len(labels)} row(s) with a missing or duplicated {DEMAND_COL}")

# Add group_id if needed (from labels).
if GROUP_COL not in df.columns and GROUP_COL in labels.columns:
    df = df.merge(labels[[DEMAND_COL, GROUP_COL]], on=DEMAND_COL, how="left", validate="many_to_one")

if GROUP_COL not in df.columns:
    raise ValueError(f"Missing {GROUP_COL} in dataset. Provide it in dataset.csv or labels.csv (and set GROUP_COL).")

# Merge descriptions into the dataset. The labels file is the source of truth:
# a pre-existing "description" column would otherwise come back from the merge
# as description_x / description_y and break later column lookups.
if "description" in df.columns:
    df = df.drop(columns=["description"])
df = df.merge(labels[[DEMAND_COL, "description"]], on=DEMAND_COL, how="left", validate="many_to_one")

print("Dataset shape:", df.shape)
print("Relevant distribution:", df[rel_col].value_counts().to_dict())
print("Unique demand labels:", df[DEMAND_COL].nunique())
print("Unique groups:", df[GROUP_COL].nunique())
df.head(3)


## 2) Keep only relevant==1 for label classification

In [None]:
# Keep the rows flagged relevant that also have every field the later stages read.
is_relevant = df[rel_col] == 1
required_cols = [TEXT_COL, DEMAND_COL, GROUP_COL, "description"]
df_rel = df.loc[is_relevant].dropna(subset=required_cols).copy()

print("Relevant-only rows:", len(df_rel))
print("Unique demand labels (relevant):", df_rel[DEMAND_COL].nunique())
print("Unique groups (relevant):", df_rel[GROUP_COL].nunique())
df_rel.head(3)


## 3) Train/val split (stratify by group)

In [None]:
# A stratified split needs at least two rows per group; with a long-tailed label
# set some groups have a single row and train_test_split would raise. Those rows
# cannot be split anyway, so they are sent to the training side and only the
# remaining rows are split (stratified by group).
group_sizes = df_rel[GROUP_COL].value_counts()
singleton_groups = group_sizes[group_sizes < 2].index
is_singleton = df_rel[GROUP_COL].isin(singleton_groups)
splittable = df_rel[~is_singleton]

train_df, val_df = train_test_split(
    splittable,
    test_size=TEST_SIZE,
    random_state=SEED,
    stratify=splittable[GROUP_COL] if splittable[GROUP_COL].nunique() > 1 else None,
)

if is_singleton.any():
    train_df = pd.concat([train_df, df_rel[is_singleton]])
    print(f"{int(is_singleton.sum())} row(s) from single-row groups were assigned to train only")

print("Train:", train_df.shape, "Val:", val_df.shape)
print("Train groups:", train_df[GROUP_COL].nunique(), "Val groups:", val_df[GROUP_COL].nunique())


## 4) Stage A — Group classifier (TF-IDF + LinearSVC) + calibration

In [None]:
# Template pipeline. It is intentionally not fitted on its own:
# CalibratedClassifierCV clones and refits it on every CV fold, so a separate
# group_pipe.fit(...) would only repeat the work. group_cal is the fitted model.
group_pipe = Pipeline([
    ("tfidf", TfidfVectorizer(min_df=2, max_df=0.95, ngram_range=(1, 3))),
    ("clf", LinearSVC(class_weight="balanced", max_iter=8000)),
])

train_text = train_df[TEXT_COL].astype(str)
train_group = train_df[GROUP_COL].astype(str)

# Calibration uses k-fold CV and scikit-learn requires at least k rows for every
# class. Groups with a single training row cannot take part at all, so they are
# left out of the group classifier (with a notice) instead of failing the fit.
group_sizes = train_group.value_counts()
too_small = group_sizes[group_sizes < 2].index
if len(too_small) > 0:
    keep = ~train_group.isin(too_small)
    print(f"Group classifier: skipping {len(too_small)} group(s) with a single training row: {sorted(too_small)}")
    train_text, train_group = train_text[keep], train_group[keep]
    group_sizes = train_group.value_counts()
if group_sizes.empty:
    raise ValueError("No group has at least 2 training rows; cannot fit the group classifier.")

# Use 3 folds when every group allows it, otherwise as many as the rarest group has rows.
n_folds = int(min(3, group_sizes.min()))

group_cal = CalibratedClassifierCV(group_pipe, method="sigmoid", cv=n_folds)
group_cal.fit(train_text, train_group)

pred_groups = group_cal.predict(val_df[TEXT_COL].astype(str))
print(classification_report(val_df[GROUP_COL].astype(str), pred_groups, zero_division=0))


## 5) Build group -> candidate labels (demand_id, description)

In [None]:
# Take only id + description from the labels file. If labels also carries its own
# group column, merging it against df_rel's group column would produce
# "<group>_x" / "<group>_y" and the dropna below would fail with a KeyError.
label_desc = labels[[DEMAND_COL, "description"]]
demand_groups = df_rel[[DEMAND_COL, GROUP_COL]].drop_duplicates()

labels_rel = label_desc.merge(demand_groups, on=DEMAND_COL, how="inner")
labels_rel = labels_rel.dropna(subset=[GROUP_COL, "description"])
# One candidate per (group, demand_id); the first description seen wins.
labels_rel = labels_rel.drop_duplicates(subset=[GROUP_COL, DEMAND_COL], keep="first")

# group (as str) -> list of (demand_id, description) candidates within that group
group_to_labels = {
    str(g): list(zip(sub[DEMAND_COL].astype(str), sub["description"].astype(str)))
    for g, sub in labels_rel.groupby(GROUP_COL)
}

print("Example group sizes (first 10):")
for g, pairs in list(group_to_labels.items())[:10]:
    print(g, "labels:", len(pairs))


## 6) Build cross-encoder training pairs (positive + negatives within same group)

- Positive: (text, correct description) -> 1
- Negatives: (text, other descriptions in same group) -> 0

This targets the "very similar labels" problem directly.


In [None]:
def make_pairs(df_part: pd.DataFrame, neg_per_pos: int) -> pd.DataFrame:
    """Build (text, description) training pairs for the cross-encoder.

    For every row of ``df_part`` one positive pair (the row's own description,
    ``labels == 1``) is emitted, followed by up to ``neg_per_pos`` negative
    pairs (``labels == 0``) whose descriptions are sampled without replacement
    from the other demand labels of the same group in ``group_to_labels``.

    Args:
        df_part: Frame with the TEXT_COL, DEMAND_COL, GROUP_COL and
            "description" columns.
        neg_per_pos: Maximum number of negatives per positive; values <= 0
            produce positives only.

    Returns:
        A frame with columns ``text``, ``description``, ``labels``,
        ``group_id`` and ``true_demand``. The columns are present even when
        ``df_part`` is empty.
    """
    columns = ["text", "description", "labels", "group_id", "true_demand"]
    n_neg = max(int(neg_per_pos), 0)
    rows = []

    fields = zip(df_part[TEXT_COL], df_part[DEMAND_COL], df_part[GROUP_COL], df_part["description"])
    for raw_text, raw_demand, raw_group, raw_desc in fields:
        text, demand, group, desc_pos = str(raw_text), str(raw_demand), str(raw_group), str(raw_desc)

        rows.append({"text": text, "description": desc_pos, "labels": 1, "group_id": group, "true_demand": demand})

        # Negatives come from sibling labels in the same group. A sibling whose
        # description is identical to the positive one is skipped: it would give
        # the model the same (text, description) pair with both labels.
        neg_pool = [
            desc
            for d, desc in group_to_labels.get(group, [])
            if d != demand and str(desc) != desc_pos
        ]
        if not neg_pool:
            continue

        take = min(n_neg, len(neg_pool))
        for i in RNG.choice(len(neg_pool), size=take, replace=False):
            rows.append({"text": text, "description": str(neg_pool[int(i)]), "labels": 0, "group_id": group, "true_demand": demand})

    # Explicit columns keep the schema stable when no rows were produced.
    return pd.DataFrame(rows, columns=columns)

# Build the pair sets. Both calls draw negatives from the shared RNG, so the
# train-then-val order matters for reproducing the same samples.
pair_train = make_pairs(train_df, NEG_PER_POS)
pair_val = make_pairs(val_df, NEG_PER_POS)

# "pos rate" is the mean of the 0/1 labels, i.e. the share of positive pairs.
print("Pair-train:", pair_train.shape, "pos rate:", pair_train["labels"].mean())
print("Pair-val  :", pair_val.shape, "pos rate:", pair_val["labels"].mean())
pair_train.head(3)


## 7) Train cross-encoder (transformers Trainer)

In [None]:
import inspect

from transformers import DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained(CROSS_ENCODER_MODEL)


def tok_fn(batch):
    """Tokenize a batch of (text, description) pairs, truncated to MAX_LEN.

    No padding is applied here; the data collator pads each batch to its own
    longest sequence, which avoids running every pair at the full MAX_LEN.
    """
    return tokenizer(
        batch["text"],
        batch["description"],
        truncation=True,
        max_length=MAX_LEN,
    )


# Drop the raw string columns after tokenization and keep everything the
# tokenizer emits (including token_type_ids for backbones that use them), so
# training sees the same inputs the model gets from the tokenizer at inference.
pair_cols = ["text", "description", "labels"]
raw_text_cols = ["text", "description"]
ds_train = Dataset.from_pandas(pair_train[pair_cols], preserve_index=False).map(
    tok_fn, batched=True, remove_columns=raw_text_cols
)
ds_val = Dataset.from_pandas(pair_val[pair_cols], preserve_index=False).map(
    tok_fn, batched=True, remove_columns=raw_text_cols
)

model = AutoModelForSequenceClassification.from_pretrained(CROSS_ENCODER_MODEL, num_labels=2)

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases; use whichever name the installed version accepts.
_ta_params = inspect.signature(TrainingArguments).parameters
eval_strategy_key = "eval_strategy" if "eval_strategy" in _ta_params else "evaluation_strategy"

training_args = TrainingArguments(
    output_dir="ce_out",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    report_to="none",
    seed=SEED,
    **{eval_strategy_key: "epoch"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    # Pads every batch to its longest sequence and converts it to tensors.
    data_collator=DataCollatorWithPadding(tokenizer),
)

trainer.train()


## 8) Evaluate demand_id prediction (Top-1 / Top-K)

In [None]:
import torch.nn.functional as F

@torch.no_grad()
def score_pairs(text: str, descriptions: list[str], batch_size: int = 32) -> np.ndarray:
    """Score one text against several label descriptions with the cross-encoder.

    Args:
        text: The input text.
        descriptions: Candidate label descriptions to pair with ``text``.
        batch_size: Maximum number of pairs sent through the model at once, so
            that a group with many labels does not exhaust memory.

    Returns:
        A 1-D array with, for each description, the softmax probability of
        class 1 ("match"). Empty when ``descriptions`` is empty.
    """
    if not descriptions:
        return np.empty(0, dtype=np.float32)

    model = trainer.model
    if model.training:
        # Dropout must be off for deterministic scores.
        model.eval()

    step = max(1, int(batch_size))
    scores = []
    for start in range(0, len(descriptions), step):
        chunk = descriptions[start:start + step]
        batch = tokenizer(
            [text] * len(chunk),
            chunk,
            padding=True,
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt",
        )
        batch = {k: v.to(model.device) for k, v in batch.items()}
        logits = model(**batch).logits
        # Softmax in float32 regardless of the precision the model ran in.
        scores.append(F.softmax(logits.float(), dim=-1)[:, 1].cpu().numpy())
    return np.concatenate(scores)

def predict_topk_for_row(text: str, group: str, k: int = 5):
    """Return up to ``k`` demand ids of ``group``, best cross-encoder score first.

    Candidates are looked up in ``group_to_labels``; a group without
    candidates yields an empty list.
    """
    candidates = group_to_labels.get(str(group), [])
    if len(candidates) == 0:
        return []

    demand_ids, descriptions = zip(*candidates)
    # Negating the scores makes argsort rank from highest to lowest.
    ranking = np.argsort(-score_pairs(text, list(descriptions)))
    return [demand_ids[idx] for idx in ranking[:k]]

# Stage A at evaluation time: the most probable group for every validation text.
val_texts = val_df[TEXT_COL].astype(str)
group_probs = group_cal.predict_proba(val_texts)
group_classes = group_cal.classes_
top1_groups = group_classes[group_probs.argmax(axis=1)]

y_true = val_df[DEMAND_COL].astype(str).tolist()
texts = val_texts.tolist()

# Counters: exact hits, hits anywhere in the top K, and rows that had candidates.
top1 = topk = valid = 0

for row_text, gold, pred_group in zip(texts, y_true, top1_groups):
    ranked = predict_topk_for_row(row_text, str(pred_group), k=TOP_K)
    if ranked:
        valid += 1
        top1 += int(ranked[0] == gold)
        topk += int(gold in ranked)

print("Evaluated rows (had group candidates):", valid, "out of", len(val_df))
print(f"Top-1 accuracy: {top1/max(1,valid):.4f}")
print(f"Top-{TOP_K} accuracy: {topk/max(1,valid):.4f}")


## 9) Save artifacts for inference

In [None]:
import json, joblib
from pathlib import Path

OUT_DIR = Path("demand_classifier_artifacts")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Calibrated group classifier.
joblib.dump(group_cal, OUT_DIR / "group_model_calibrated.joblib")

# Cross-encoder weights and tokenizer go into the same folder.
cross_encoder_dir = OUT_DIR / "cross_encoder"
trainer.model.save_pretrained(cross_encoder_dir)
tokenizer.save_pretrained(cross_encoder_dir)

# Candidate labels per group, as JSON-friendly records.
mapping = {
    group: [dict(demand_id=did, description=desc) for did, desc in members]
    for group, members in group_to_labels.items()
}
mapping_json = json.dumps(mapping, indent=2)
(OUT_DIR / "group_to_labels.json").write_text(mapping_json, encoding="utf-8")

print("Saved to:", OUT_DIR.resolve())
