# Demand label classification — Transformer multiclass (hierarchical-aware, weighted)

This notebook presents **another strong solution** for your problem. It is fully transformer-based
and uses *neither* description pairing *nor* kNN.

## When to use this solution
- You want **one transformer model** (simpler infra than cross-encoder)
- You have **hierarchical labels** (group → fine)
- You want to handle:
  - 200+ labels
  - severe imbalance
  - very similar labels

## Core idea
Instead of flat softmax over 200+ labels, we use:

### 1) Shared encoder (Transformer)
### 2) Group-aware heads
- One classification head **per group**
- Loss computed **only for the true group**
- Strongly reduces confusion between unrelated labels

This is often called:
- *hierarchical softmax*
- *conditional computation*
- *grouped classification heads*

---

## Pipeline
1. Filter `relevant==1`
2. Train **group classifier** (transformer)
3. Train **fine-label transformer** with group-aware heads

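This notebook implements **step 3** only; the group classifier from step 2 is not trained here.
During training and evaluation the true group of each example is assumed to be known.
At inference time, first predict the group with the step-2 classifier, then use the fine-label head of that group.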

---

## Pros vs cross-encoder
+ Faster inference (one forward pass per text, instead of one per text–label pair)
+ Easier serving
+ Better when you have more data per label

## Cons
- Slightly worse on ultra-fine distinctions than cross-encoder
- Requires careful loss handling


In [None]:
# =========================
# CONFIG (EDIT THESE)
# =========================
from pathlib import Path

# --- Input data ---------------------------------------------------------
DATASET_CSV = Path("dataset.csv")

# Names of the columns read from the CSV.
TEXT_COL, DEMAND_COL, GROUP_COL = "text", "demand_id", "group_id"

# The relevance flag may be stored under either name; candidates are
# checked in this order.
REL_COL_CANDIDATES = ["relevant", "relevance"]

# --- Encoder ------------------------------------------------------------
MODEL_NAME = "microsoft/deberta-v3-base"

# Token length passed to the tokenizer as `max_length`.
MAX_LEN = 256

# --- Optimisation -------------------------------------------------------
LR = 2e-5
WEIGHT_DECAY = 0.01
BATCH_SIZE = 8
EPOCHS = 3

# --- Reproducibility / split --------------------------------------------
SEED = 42
TEST_SIZE = 0.30  # fraction of rows held out for validation


In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    TrainingArguments,
    Trainer,
)

# Show up to 200 columns and use a 140-character display width when
# printing DataFrames.
pd.options.display.max_columns = 200
pd.options.display.width = 140

# Run on CUDA when torch reports it as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


## 1) Load data and keep only relevant==1

In [None]:
df = pd.read_csv(DATASET_CSV)

# Detect the relevance column: the first candidate present in the CSV wins.
rel_col = next((c for c in REL_COL_CANDIDATES if c in df.columns), None)
if rel_col is None:
    raise ValueError("Missing relevance column")

# Keep only rows flagged as relevant; a missing flag counts as not relevant.
df = df[df[rel_col].fillna(0).astype(int) == 1].copy()

for c in [TEXT_COL, DEMAND_COL, GROUP_COL]:
    if c not in df.columns:
        raise ValueError(f"Missing column: {c}")

# Drop incomplete rows BEFORE casting to str: `astype(str)` would turn a
# missing demand/group into the literal label "nan" (a bogus class), and a
# missing text cannot be tokenized.
n_before = len(df)
df = df.dropna(subset=[TEXT_COL, DEMAND_COL, GROUP_COL]).copy()
n_dropped = n_before - len(df)
if n_dropped:
    print("Dropped rows with missing text/demand/group:", n_dropped)

if df.empty:
    raise ValueError("No usable rows left after filtering relevant==1")

df[TEXT_COL] = df[TEXT_COL].astype(str)
df[DEMAND_COL] = df[DEMAND_COL].astype(str)
df[GROUP_COL] = df[GROUP_COL].astype(str)

print("Rows:", len(df))
print("Groups:", df[GROUP_COL].nunique())
print("Labels:", df[DEMAND_COL].nunique())


## 2) Encode groups and labels

In [None]:
# Map every group id to a dense integer index.
group_encoder = LabelEncoder()
df["group_idx"] = group_encoder.fit_transform(df[GROUP_COL])

# Each group gets its own encoder, so label indices restart at 0 inside
# every group. -1 marks rows that have not been assigned an index yet.
label_encoders = {}
df["label_idx"] = -1

for g, members in df.groupby("group_idx"):
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(members[DEMAND_COL])
    df.loc[members.index, "label_idx"] = encoded
    label_encoders[g] = encoder

# Sanity check: no row may be left with the -1 placeholder.
assert (df["label_idx"] >= 0).all()

print("Example group sizes:")
labels_per_group = df.groupby("group_idx")["label_idx"].nunique()
print(labels_per_group.head())


## 3) Train/val split (stratified by group)

In [None]:
# A group with a single row cannot be stratified: scikit-learn raises when a
# stratification class has fewer than 2 members. Such rows are kept for
# training and only the remaining rows are split.
rows_in_group = df["group_idx"].map(df["group_idx"].value_counts())
singleton_rows = df[rows_in_group < 2]
splittable = df[rows_in_group >= 2]

train_df, val_df = train_test_split(
    splittable,
    test_size=TEST_SIZE,
    random_state=SEED,
    # Stratify by group only when there is more than one group to balance.
    stratify=splittable["group_idx"] if splittable["group_idx"].nunique() > 1 else None,
)

if not singleton_rows.empty:
    train_df = pd.concat([train_df, singleton_rows])
    print("Single-row groups added to train only:", singleton_rows["group_idx"].nunique())

print("Train:", train_df.shape, "Val:", val_df.shape)


## 4) Dataset + tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize(batch):
    """Tokenize the text column of a batch, truncating/padding to MAX_LEN."""
    texts = batch[TEXT_COL]
    encoded = tokenizer(
        texts,
        max_length=MAX_LEN,
        padding="max_length",
        truncation=True,
    )
    return encoded


# Only the text and the two target columns are carried into the datasets.
_source_cols = [TEXT_COL, "group_idx", "label_idx"]

train_ds = Dataset.from_pandas(train_df[_source_cols])
train_ds = train_ds.map(tokenize, batched=True)

val_ds = Dataset.from_pandas(val_df[_source_cols])
val_ds = val_ds.map(tokenize, batched=True)

# Expose just these columns, as torch tensors, when the datasets are indexed.
cols = ["input_ids", "attention_mask", "group_idx", "label_idx"]
for _ds in (train_ds, val_ds):
    _ds.set_format(type="torch", columns=cols)


## 5) Model: shared encoder + group-specific heads

In [None]:
class HierarchicalClassifier(nn.Module):
    """Shared transformer encoder with one linear classification head per group.

    Each example is encoded once and then scored only by the head of its own
    group. Heads may have different numbers of labels, so the returned logits
    are right-padded to the widest head; padded slots hold the most negative
    value of the logits dtype so they get no softmax probability and are never
    chosen by ``argmax``.

    Args:
        base_model_name: Name or path given to ``AutoModel.from_pretrained``.
        num_labels_per_group: Mapping ``group index -> number of labels`` used
            to size one ``nn.Linear`` head per group.
    """

    def __init__(self, base_model_name, num_labels_per_group):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(base_model_name)
        hidden = self.encoder.config.hidden_size

        # nn.ModuleDict only accepts string keys, hence str(g).
        self.heads = nn.ModuleDict({
            str(g): nn.Linear(hidden, n_labels)
            for g, n_labels in num_labels_per_group.items()
        })

    @property
    def max_labels(self):
        """Output width of the widest head; all logits are padded to it."""
        return max(head.out_features for head in self.heads.values())

    def forward(self, input_ids, attention_mask, group_idx, labels=None):
        """Score each example with the head of its group.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            group_idx: One integer group index per example; selects the head.
            labels: Optional within-group label index per example.

        Returns:
            Dict with ``"logits"`` of shape ``(batch, max_labels)`` and
            ``"loss"`` (cross-entropy, or ``None`` when ``labels`` is None).

        Raises:
            KeyError: If a group index has no corresponding head.
        """
        # Representation of the first token of every sequence.
        enc = self.encoder(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0]
        group_idx = group_idx.view(-1)
        width = self.max_labels

        # Run each head once on all rows of its group instead of once per row;
        # this also needs a single host sync per batch rather than one per row.
        chunks = []
        positions = []
        for g in torch.unique(group_idx).tolist():
            rows = torch.nonzero(group_idx == g, as_tuple=True)[0]
            scores = self.heads[str(g)](enc[rows])
            pad = width - scores.size(-1)
            if pad:
                # Zero padding would compete with real logits in the softmax;
                # the dtype minimum removes padded slots from the distribution.
                scores = nn.functional.pad(
                    scores, (0, pad), value=torch.finfo(scores.dtype).min
                )
            chunks.append(scores)
            positions.append(rows)

        # Undo the per-group reordering so row i again belongs to example i.
        restore = torch.argsort(torch.cat(positions))
        logits = torch.cat(chunks)[restore]

        loss = None
        if labels is not None:
            loss = nn.functional.cross_entropy(logits, labels.view(-1))

        return {"loss": loss, "logits": logits}


# Size every head from the encoder fitted on the full data for that group.
# Label indices range over all classes of the encoder, so counting the
# distinct labels in train_df only would make a head too small whenever a
# label is absent from the training split.
num_labels_per_group = {g: len(le.classes_) for g, le in label_encoders.items()}
model = HierarchicalClassifier(MODEL_NAME, num_labels_per_group).to(device)


## 6) Trainer

In [None]:
class CustomTrainer(Trainer):
    """Trainer that routes ``group_idx`` / ``label_idx`` to the model."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the loss returned by the model for one batch.

        Args:
            model: Model to call; must accept ``group_idx`` and ``labels``.
            inputs: Batch dict containing ``label_idx`` and ``group_idx``
                next to the encoder inputs.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted because newer ``transformers``
                releases pass it to ``compute_loss``; not used here.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        # Build a new dict rather than popping, so the batch dict owned by
        # the caller is left untouched.
        model_inputs = {
            k: v for k, v in inputs.items() if k not in ("label_idx", "group_idx")
        }
        outputs = model(
            **model_inputs,
            group_idx=inputs["group_idx"],
            labels=inputs["label_idx"],
        )
        loss = outputs["loss"]
        return (loss, outputs) if return_outputs else loss


# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases; use whichever field the installed TrainingArguments defines.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="hier_transformer_out",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    save_strategy="epoch",
    load_best_model_at_end=True,
    weight_decay=WEIGHT_DECAY,
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    report_to="none",
    seed=SEED,
    # `label_idx` is not a parameter of model.forward, so the Trainer would
    # otherwise drop it as an unused column before compute_loss can read it.
    remove_unused_columns=False,
    # Declares `label_idx` as the label column so evaluation goes through
    # compute_loss and reports the eval loss that load_best_model_at_end
    # relies on.
    label_names=["label_idx"],
    **{_eval_strategy_arg: "epoch"},
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
)

trainer.train()


## 7) Evaluation (per-group reports)

In [None]:
from collections import defaultdict

from torch.utils.data import DataLoader

model.eval()

# Predictions are collected as original demand ids. Within-group label
# indices restart at 0 in every group, so pooling raw indices would merge
# unrelated classes into one.
y_true = []
y_pred = []
per_group = defaultdict(lambda: ([], []))  # group_idx -> (true ids, predicted ids)

# NOTE: the true group of each row is fed to the model, so these numbers
# measure the fine-label heads alone, not a pipeline with a predicted group.
with torch.no_grad():
    for batch in DataLoader(val_ds, batch_size=BATCH_SIZE):
        logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            group_idx=batch["group_idx"].to(device),
        )["logits"].float().cpu()

        groups = batch["group_idx"].tolist()
        targets = batch["label_idx"].tolist()
        for scores, g, target in zip(logits, groups, targets):
            # Only the first `out_features` slots belong to this group's head;
            # anything beyond that is batch padding.
            n_out = model.heads[str(g)].out_features
            pred = int(scores[:n_out].argmax())

            classes = label_encoders[g].classes_
            true_id, pred_id = classes[target], classes[pred]
            y_true.append(true_id)
            y_pred.append(pred_id)
            per_group[g][0].append(true_id)
            per_group[g][1].append(pred_id)

print("Overall (true group given):")
print(classification_report(y_true, y_pred, zero_division=0))

for g in sorted(per_group):
    group_true, group_pred = per_group[g]
    print(f"Group {group_encoder.classes_[g]} (n={len(group_true)}):")
    print(classification_report(group_true, group_pred, zero_division=0))
