# Full end-to-end NLP pipeline (one notebook)

**Steps**
1. Merge train+test → drop excluded `demand_id` → resplit 70/30 (single-sample labels stay in train)
2. Relevance model comparison: TF‑IDF + (LogReg, LinearSVC, optional LightGBM) + PR curves + best selection
3. Label classification (relevant==1):
   - Group classifier (TF‑IDF + LinearSVC calibrated)
   - Model A: Description-aware Cross‑Encoder + Top‑N groups + Hard Negatives
   - Model B: Hierarchical Transformer (shared encoder + per-group head)
   - Compare + pick best + save artifacts

> Notes: This notebook is written to be **runnable** and **debuggable** (prints progress, saves intermediate files).

In [None]:
# =========================
# PARAMETERS (Papermill / AML)
# =========================
# If you run this notebook locally, you can edit these.
# If you run it on Azure ML, the command job uses Papermill to inject values.

TRAIN_CSV  = "train.csv"
TEST_CSV   = "test.csv"
LABELS_CSV = "labels.csv"

# Core columns
TEXT_COL   = "text"
DEMAND_COL = "demand_id"
GROUP_COL  = "group_id"
REL_COL_CANDIDATES = ["relevant", "relevance"]  # 0/1

# Filtering
EXCLUDE_COL = "exclude"  # labels.csv: 1 => drop

# Splits / eval
SEED = 42
TEST_SIZE_RESPLIT = 0.30      # 70/30
TEST_SIZE_RELEVANCE = 0.30    # relevance model eval split
TEST_SIZE_LABELS = 0.20       # label-model validation split (relevant==1 rows)
PRECISION_TARGET = 0.90       # relevance threshold is chosen for recall at this precision

# NOTE(review): every value below is referenced by later cells but was never
# defined anywhere in the notebook (NameError). The defaults are conservative
# starting points -- tune them, or override them through Papermill.

# Candidate generation / ranking
TOP_N_GROUPS = 3              # groups kept per text when building label candidates
TOP_K_LABELS = 5              # labels returned per text (top-K accuracy)

# Model A: cross-encoder
CE_MODEL = "distilroberta-base"
CE_MAX_LEN = 256
CE_LR = 2e-5
CE_BATCH_SIZE = 16
CE_EPOCHS_BASE = 2            # epochs on random negatives
CE_EPOCHS_HARD = 1            # extra epochs after hard-negative mining
CE_NEG_RANDOM = 3             # random negatives per positive
CE_NEG_HARD = 3               # hard negatives per positive
MAX_CANDIDATES_PER_GROUP_MINING = 200  # None => score every candidate while mining

# Model B: hierarchical transformer
HT_MODEL = "distilroberta-base"
HT_MAX_LEN = 256
HT_BATCH_SIZE = 16
HT_LR = 2e-5
HT_WEIGHT_DECAY = 0.01
HT_EPOCHS = 3
HT_USE_CLASS_WEIGHTS = True


In [None]:
# -------------------------
# Runtime paths / outputs
# -------------------------
from pathlib import Path
import os

TRAIN_CSV  = Path(TRAIN_CSV)
TEST_CSV   = Path(TEST_CSV)
LABELS_CSV = Path(LABELS_CSV)

# Output root. Precedence:
#   1. an OUTPUT_DIR already present in the notebook namespace (e.g. injected by
#      Papermill with `-p OUTPUT_DIR ...`), which used to be silently overwritten;
#   2. the OUTPUT_DIR environment variable;
#   3. ./outputs (Azure ML command jobs upload this folder automatically).
_output_dir_param = globals().get("OUTPUT_DIR")
OUTPUT_DIR = Path(_output_dir_param or os.getenv("OUTPUT_DIR", "outputs"))
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# All later cells save into ARTIFACTS_DIR, which was never defined (NameError).
# Keep it under OUTPUT_DIR so the artifacts are uploaded with the job outputs.
ARTIFACTS_DIR = OUTPUT_DIR / "artifacts"
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

print("TRAIN_CSV:", TRAIN_CSV.resolve())
print("TEST_CSV :", TEST_CSV.resolve())
print("LABELS_CSV:", LABELS_CSV.resolve())
print("OUTPUT_DIR:", OUTPUT_DIR.resolve())
print("ARTIFACTS_DIR:", ARTIFACTS_DIR.resolve())


In [None]:
# Imports
import os, json, time
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import precision_recall_curve, average_precision_score, auc

# Seed NumPy's global RNG with the notebook-wide SEED.
np.random.seed(SEED)

# LightGBM is optional: if importing it fails for any reason, the flag is set
# to False and the tfidf_lgbm candidate is skipped later on.
try:
    import lightgbm as lgb
    HAS_LGBM = True
except Exception:
    HAS_LGBM = False

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModel,
    TrainingArguments,
    Trainer,
)

# Use the GPU when one is visible, otherwise fall back to CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Only NumPy was seeded so far; seed torch as well so that freshly initialised
# classification heads and DataLoader shuffling are repeatable for a given SEED.
torch.manual_seed(SEED)
print("Device:", DEVICE, "| LightGBM:", HAS_LGBM)


## 1) Step 00 — Filter excluded labels & resplit 70/30

In [None]:
# Read the three input tables.
train_raw = pd.read_csv(TRAIN_CSV)
test_raw  = pd.read_csv(TEST_CSV)
labels_df = pd.read_csv(LABELS_CSV)

# The relevance column is the first candidate name present in the train file.
rel_col = next((name for name in REL_COL_CANDIDATES if name in train_raw.columns), None)
if rel_col is None:
    raise ValueError(f"Could not find relevance column. Tried: {REL_COL_CANDIDATES}. Found: {list(train_raw.columns)}")

# Cast the id columns to str so the id comparisons below are string-to-string.
for df_ in (train_raw, test_raw):
    id_columns = [DEMAND_COL]
    if GROUP_COL in df_.columns:
        id_columns.append(GROUP_COL)
    for id_column in id_columns:
        df_[id_column] = df_[id_column].astype(str)

labels_df[DEMAND_COL] = labels_df[DEMAND_COL].astype(str)

# Stack train and test into one frame; the split is redone further down.
merged = pd.concat([train_raw, test_raw], ignore_index=True)
print("Merged rows:", len(merged))

# Drop every row whose demand_id is flagged (== 1) in labels.csv.
is_flagged = labels_df[EXCLUDE_COL] == 1
excluded = set(labels_df.loc[is_flagged, DEMAND_COL].astype(str))
before = len(merged)
keep_rows = ~merged[DEMAND_COL].isin(excluded)
merged = merged[keep_rows].copy()
print("Excluded labels:", len(excluded), "| Rows removed:", before - len(merged))
print("Rows after filter:", len(merged), "| Labels:", merged[DEMAND_COL].nunique())

# A label with a single row cannot be stratified, so those rows always go to train.
counts = merged[DEMAND_COL].value_counts()
single_labels = set(counts[counts == 1].index)

in_single = merged[DEMAND_COL].isin(single_labels)
df_single = merged[in_single].copy()
df_multi  = merged[~in_single].copy()

if df_multi[DEMAND_COL].nunique() > 1:
    # Stratified split of the multi-row labels; single-row labels are appended to train.
    train_multi, test_multi = train_test_split(
        df_multi,
        test_size=TEST_SIZE_RESPLIT,
        random_state=SEED,
        stratify=df_multi[DEMAND_COL],
    )
    train_clean = pd.concat([train_multi, df_single], ignore_index=True)
    test_clean  = test_multi.copy()
else:
    # At most one multi-row label: everything stays in train and test is an
    # empty frame with the same columns.
    train_clean = merged.copy()
    test_clean = merged.iloc[0:0].copy()
    print("Only one label overall → keeping all rows in train, empty test.")

print("Train_clean:", len(train_clean), "rows | labels:", train_clean[DEMAND_COL].nunique())
print("Test_clean :", len(test_clean),  "rows | labels:", test_clean[DEMAND_COL].nunique())

# Write both cleaned splits as CSV (without the index) into ARTIFACTS_DIR.
train_clean_path = ARTIFACTS_DIR / "train_clean.csv"
test_clean_path  = ARTIFACTS_DIR / "test_clean.csv"
for frame, destination in ((train_clean, train_clean_path), (test_clean, test_clean_path)):
    frame.to_csv(destination, index=False)
print("Saved:", train_clean_path, test_clean_path)


## 2) Relevance model — compare TF‑IDF + (LogReg, LinearSVC, LightGBM)

In [None]:
# Relevance data comes from train_clean only; a missing relevance flag counts as 0.
df_rel = train_clean.copy()
df_rel[rel_col] = df_rel[rel_col].fillna(0).astype(int)

X = df_rel[TEXT_COL].astype(str)
y = df_rel[rel_col].astype(int)

# Stratify only when both classes are present.
if y.nunique() > 1:
    _rel_stratify = y
else:
    _rel_stratify = None

X_tr, X_va, y_tr, y_va = train_test_split(
    X,
    y,
    test_size=TEST_SIZE_RELEVANCE,
    random_state=SEED,
    stratify=_rel_stratify,
)

def recall_at_precision(y_true, scores, precision_target=0.90):
    """Return the best recall reachable at or above a precision target.

    Args:
        y_true: Binary ground-truth labels.
        scores: Classifier scores (probabilities or decision values).
        precision_target: Minimum precision an operating point must reach.

    Returns:
        ``(recall, threshold)`` for the qualifying operating point with the
        highest recall, or ``(0.0, None)`` when no point with non-zero recall
        reaches the target.
    """
    p, r, thr = precision_recall_curve(y_true, scores)
    best = 0.0
    best_thr = None
    # precision[i] / recall[i] describe predictions with score >= thresholds[i];
    # only the final (precision=1, recall=0) point has no threshold. zip() stops
    # at len(thr), which drops exactly that point. The previous code paired
    # point i with thr[i-1], returning a threshold one step too low whose real
    # precision could be below the target, and it never considered point 0.
    for prec, rec, cutoff in zip(p, r, thr):
        if prec >= precision_target and rec > best:
            best = rec
            best_thr = cutoff
    return best, best_thr

def fit_eval_model(name, estimator, use_proba=True):
    """Fit a TF-IDF + ``estimator`` pipeline and score it on the validation split.

    Trains on the notebook-level ``X_tr`` / ``y_tr`` and evaluates on
    ``X_va`` / ``y_va``. Scores come from ``predict_proba`` (positive class)
    when the estimator has it and ``use_proba`` is true, else from
    ``decision_function``, else from hard predictions cast to float.

    Returns:
        ``(fitted_pipeline, metrics)`` where ``metrics`` holds the model name,
        PR-AUC, recall at ``PRECISION_TARGET`` and the matching threshold.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1,3), min_df=2, max_df=0.95)
    pipe = Pipeline([("tfidf", vectorizer), ("clf", estimator)])
    pipe.fit(X_tr, y_tr)

    wants_proba = use_proba and hasattr(estimator, "predict_proba")
    if wants_proba:
        scores = pipe.predict_proba(X_va)[:, 1]
    elif hasattr(estimator, "decision_function"):
        scores = pipe.decision_function(X_va)
    else:
        # Last resort: hard 0/1 predictions used as scores.
        scores = pipe.predict(X_va).astype(float)

    pr_auc = average_precision_score(y_va, scores)
    rec, thr = recall_at_precision(y_va, scores, precision_target=PRECISION_TARGET)
    summary = {
        "model": name,
        "pr_auc": pr_auc,
        f"recall@p>={PRECISION_TARGET}": rec,
        "threshold": thr,
    }
    return pipe, summary

# Per-model metric dicts, and fitted models keyed by model name.
models = []
artifacts = {}

# LogReg
logreg = LogisticRegression(max_iter=5000, class_weight="balanced", n_jobs=None)
pipe_lr, m_lr = fit_eval_model("tfidf_logreg", logreg, use_proba=True)
models.append(m_lr)
artifacts[m_lr["model"]] = pipe_lr

# LinearSVC (calibrated => proba)
svc = LinearSVC(class_weight="balanced", max_iter=8000)
pipe_svc = Pipeline([("tfidf", TfidfVectorizer(ngram_range=(1,3), min_df=2, max_df=0.95)), ("clf", svc)])
# No separate pipe_svc.fit(): with an integer cv, CalibratedClassifierCV clones
# the pipeline and refits it per fold, so a pre-fit would be discarded work.
cal_svc = CalibratedClassifierCV(pipe_svc, method="sigmoid", cv=3)
cal_svc.fit(X_tr, y_tr)
scores = cal_svc.predict_proba(X_va)[:,1]
m_svc = {"model":"tfidf_linearsvc_cal", "pr_auc": average_precision_score(y_va, scores)}
rec, thr = recall_at_precision(y_va, scores, precision_target=PRECISION_TARGET)
m_svc[f"recall@p>={PRECISION_TARGET}"] = rec
m_svc["threshold"] = thr
models.append(m_svc)
artifacts[m_svc["model"]] = cal_svc

# LightGBM (optional)
if HAS_LGBM:
    lgbm = lgb.LGBMClassifier(
        n_estimators=600,
        learning_rate=0.05,
        num_leaves=64,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        class_weight=None,  # we'll use scale_pos_weight
        random_state=SEED,
        n_jobs=-1,
    )
    # Class imbalance is handled with scale_pos_weight = negatives / positives.
    pos = int((y_tr == 1).sum())
    neg = int((y_tr == 0).sum())
    lgbm.set_params(scale_pos_weight=(neg/pos) if pos > 0 else 1.0)
    pipe_lgbm, m_lgbm = fit_eval_model("tfidf_lgbm", lgbm, use_proba=True)
    models.append(m_lgbm)
    artifacts[m_lgbm["model"]] = pipe_lgbm
else:
    print("LightGBM not installed. Skipping tfidf_lgbm. (Notebook still fully works.)")

# Best model first: PR-AUC, then recall at the precision target.
results_rel = pd.DataFrame(models).sort_values(["pr_auc", f"recall@p>={PRECISION_TARGET}"], ascending=False)
results_rel


In [None]:
# Plot PR curves for compared relevance models
plt.figure(figsize=(8,5))

def pr_curve_from_model(model_obj):
    """Return ``(recall, precision, average_precision)`` on the validation split.

    Uses positive-class probabilities when the model exposes ``predict_proba``
    and ``decision_function`` scores otherwise.
    """
    use_proba = hasattr(model_obj, "predict_proba")
    val_scores = model_obj.predict_proba(X_va)[:,1] if use_proba else model_obj.decision_function(X_va)
    precision, recall, _ = precision_recall_curve(y_va, val_scores)
    return recall, precision, average_precision_score(y_va, val_scores)

# One curve per fitted relevance model.
for name, mdl in artifacts.items():
    r, p, ap = pr_curve_from_model(mdl)
    plt.plot(r, p, label=f"{name} (AP={ap:.3f})")

plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Relevance: Precision-Recall")
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()


In [None]:
# Select best relevance model and save
_best_rel_row = results_rel.iloc[0]
best_rel = _best_rel_row["model"]
best_rel_model = artifacts[best_rel]

# recall_at_precision returns None when no operating point reaches the target,
# but pandas stores that as NaN once other rows hold floats. An `is not None`
# check lets NaN through, so test with pd.isna (covers both None and NaN).
_best_rel_thr = _best_rel_row["threshold"]
best_rel_threshold = 0.0 if pd.isna(_best_rel_thr) else float(_best_rel_thr)

joblib.dump(
    {"model": best_rel_model, "threshold": best_rel_threshold, "rel_col": rel_col},
    ARTIFACTS_DIR / "relevance_model.joblib"
)
print("Best relevance model:", best_rel, "| threshold:", best_rel_threshold)
print("Saved:", ARTIFACTS_DIR / "relevance_model.joblib")


## 3) Label classification (relevant==1) — train group model + two label models

In [None]:
# Use relevant-only rows for label classification
df_labels = train_clean.copy()
df_labels[rel_col] = df_labels[rel_col].fillna(0).astype(int)
df_labels = df_labels[df_labels[rel_col] == 1].copy()

# Merge label descriptions (needed for cross-encoder)
if "description" not in labels_df.columns:
    # If labels.csv does not include descriptions, you must add them
    raise ValueError("labels.csv must contain 'description' column for cross-encoder solution.")
labels_map = labels_df[[DEMAND_COL, "description"]].copy()
labels_map[DEMAND_COL] = labels_map[DEMAND_COL].astype(str)
# One description per demand_id: a demand_id repeated in labels.csv would
# otherwise duplicate every matching training row in the merge below. The
# candidate table built from the same file keeps the first row as well.
labels_map = labels_map.drop_duplicates(subset=[DEMAND_COL], keep="first")

df_labels[DEMAND_COL] = df_labels[DEMAND_COL].astype(str)
df_labels[GROUP_COL]  = df_labels[GROUP_COL].astype(str)

df_labels = df_labels.merge(labels_map[[DEMAND_COL, "description"]], on=DEMAND_COL, how="left")
# Rows without text, ids or a description cannot be used by the label models.
df_labels = df_labels.dropna(subset=[TEXT_COL, DEMAND_COL, GROUP_COL, "description"]).copy()

print("Labeling rows (relevant==1):", len(df_labels))
print("Unique demand_id:", df_labels[DEMAND_COL].nunique(), "| groups:", df_labels[GROUP_COL].nunique())

# Hold out a validation split, stratified by group when more than one group exists.
train_lab, val_lab = train_test_split(
    df_labels, test_size=TEST_SIZE_LABELS, random_state=SEED,
    stratify=df_labels[GROUP_COL] if df_labels[GROUP_COL].nunique() > 1 else None
)
print("Train_lab:", train_lab.shape, "Val_lab:", val_lab.shape)


In [None]:
# Stage A for labeling: Group model (TF-IDF + LinearSVC calibrated)
group_pipe = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1,3), min_df=2, max_df=0.95)),
    ("clf", LinearSVC(class_weight="balanced", max_iter=8000)),
])

# No separate group_pipe.fit(): with an integer cv, CalibratedClassifierCV
# clones the pipeline and refits it per fold, so a pre-fit would be discarded.
group_model = CalibratedClassifierCV(group_pipe, method="sigmoid", cv=3)
group_model.fit(train_lab[TEXT_COL].astype(str), train_lab[GROUP_COL].astype(str))

joblib.dump(group_model, ARTIFACTS_DIR / "group_model.joblib")
print("Saved:", ARTIFACTS_DIR / "group_model.joblib")


In [None]:
# Build group -> label candidates: list of (demand_id, description)
labels_rel = labels_df.copy()
labels_rel[DEMAND_COL] = labels_rel[DEMAND_COL].astype(str)
if GROUP_COL not in labels_rel.columns:
    # labels.csv has no group column: borrow the demand_id -> group pairs seen in df_labels.
    _demand_groups = df_labels[[DEMAND_COL, GROUP_COL]].drop_duplicates()
    labels_rel = labels_rel.merge(_demand_groups, on=DEMAND_COL, how="left")
labels_rel = labels_rel.dropna(subset=[GROUP_COL, "description"]).copy()
labels_rel[GROUP_COL] = labels_rel[GROUP_COL].astype(str)
# Keep a single (first) row per demand_id.
labels_rel = labels_rel.drop_duplicates(subset=[DEMAND_COL], keep="first")


def _candidate_pairs(group_rows):
    """Return the (demand_id, description) tuples for one group's rows."""
    demand_ids = group_rows[DEMAND_COL].tolist()
    descriptions = group_rows["description"].astype(str).tolist()
    return list(zip(demand_ids, descriptions))


group_to_labels = {g: _candidate_pairs(sub) for g, sub in labels_rel.groupby(GROUP_COL)}
print("Groups with candidates:", len(group_to_labels))
print("Example:", list(group_to_labels.items())[:1])


### 3A) Model A — Description-aware Cross‑Encoder + Top‑N groups + Hard Negatives

In [None]:
# Shared generator for negative sampling (random pairs and the mining cap).
RNG = np.random.default_rng(SEED)

def make_random_pairs(df_part: pd.DataFrame, neg_per_pos: int) -> pd.DataFrame:
    """Build (text, description) training pairs with random in-group negatives.

    Every row yields one positive pair (its own description, ``labels`` = 1)
    and up to ``neg_per_pos`` negatives (``labels`` = 0) sampled without
    replacement from the other labels of the row's group in
    ``group_to_labels``. Rows whose group has no other label yield only the
    positive pair.
    """
    def _pair(text, description, label, group, demand):
        return {"text": text, "description": description, "labels": label, "group": group, "true_demand": demand}

    pairs = []
    for _, rec in df_part.iterrows():
        text = str(rec[TEXT_COL])
        demand = str(rec[DEMAND_COL])
        group = str(rec[GROUP_COL])
        pairs.append(_pair(text, str(rec["description"]), 1, group, demand))

        # Descriptions of every other label in the same group.
        other_descs = [desc for did, desc in group_to_labels.get(group, []) if did != demand]
        if not other_descs:
            continue
        n_take = min(neg_per_pos, len(other_descs))
        picked = RNG.choice(len(other_descs), size=n_take, replace=False)
        pairs.extend(
            _pair(text, str(other_descs[int(j)]), 0, group, demand)
            for j in np.atleast_1d(picked)
        )
    return pd.DataFrame(pairs)

# Random-negative pairs for the cross-encoder: train and validation are built
# separately from their own splits, both with CE_NEG_RANDOM negatives per positive.
ce_train_pairs = make_random_pairs(train_lab, CE_NEG_RANDOM)
ce_val_pairs   = make_random_pairs(val_lab, CE_NEG_RANDOM)

# Shape and share of positive pairs, as a sanity check on class balance.
print("CE train pairs:", ce_train_pairs.shape, "pos rate:", ce_train_pairs["labels"].mean())
print("CE val pairs  :", ce_val_pairs.shape, "pos rate:", ce_val_pairs["labels"].mean())


In [None]:
# Train Cross-Encoder
ce_tokenizer = AutoTokenizer.from_pretrained(CE_MODEL)

def ce_tok(batch):
    """Tokenize a batch of (text, description) pairs, padded/truncated to CE_MAX_LEN."""
    texts = batch["text"]
    descriptions = batch["description"]
    return ce_tokenizer(texts, descriptions, truncation=True, padding="max_length", max_length=CE_MAX_LEN)

def to_hfds(pairs_df: pd.DataFrame) -> Dataset:
    """Convert a pairs frame into a tokenized, torch-formatted HF dataset.

    Args:
        pairs_df: Frame with ``text``, ``description`` and ``labels`` columns.

    Returns:
        Dataset exposing the tokenizer outputs plus ``labels`` as torch tensors.
    """
    ds = Dataset.from_pandas(pairs_df[["text","description","labels"]], preserve_index=False)
    ds = ds.map(ce_tok, batched=True)
    # Keep token_type_ids when the tokenizer emits them. Inference passes the
    # full tokenizer output to the model, so dropping the segment ids here
    # would train the pair encoder on different inputs than it is scored with.
    wanted = ("input_ids", "token_type_ids", "attention_mask", "labels")
    ds.set_format(type="torch", columns=[col for col in wanted if col in ds.column_names])
    return ds

# Tokenized train / validation pair datasets.
ce_train_ds = to_hfds(ce_train_pairs)
ce_val_ds   = to_hfds(ce_val_pairs)

# Binary (match / no match) sequence-pair classifier.
ce_model = AutoModelForSequenceClassification.from_pretrained(CE_MODEL, num_labels=2)
ce_model = ce_model.to(DEVICE)

# Evaluate and checkpoint every epoch; reload the lowest-eval-loss checkpoint at the end.
_ce_base_kwargs = dict(
    output_dir=str(ARTIFACTS_DIR / "ce_out"),
    learning_rate=CE_LR,
    per_device_train_batch_size=CE_BATCH_SIZE,
    per_device_eval_batch_size=CE_BATCH_SIZE,
    num_train_epochs=CE_EPOCHS_BASE,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    report_to="none",
    seed=SEED,
)
ce_args = TrainingArguments(**_ce_base_kwargs)

ce_trainer = Trainer(
    model=ce_model,
    args=ce_args,
    train_dataset=ce_train_ds,
    eval_dataset=ce_val_ds,
)
t0 = time.time()
ce_trainer.train()
print("CE base training seconds:", round(time.time() - t0, 1))


In [None]:
@torch.no_grad()
def ce_score_probs(text: str, descriptions: list[str], batch_size: int = 64) -> np.ndarray:
    """Score one text against candidate descriptions with the cross-encoder.

    Args:
        text: Input text.
        descriptions: Candidate label descriptions.
        batch_size: Maximum number of (text, description) pairs per forward pass.

    Returns:
        1-D array with the positive-class probability for each description, in
        input order; an empty array when ``descriptions`` is empty.
    """
    if not descriptions:
        return np.empty(0, dtype=np.float32)

    # A Trainer run without an evaluation pass leaves the model in train mode;
    # switch dropout off explicitly so scores are deterministic.
    ce_model.eval()

    chunks = []
    # Score in chunks so a group with many candidate labels cannot exhaust memory.
    for start in range(0, len(descriptions), batch_size):
        chunk = descriptions[start:start + batch_size]
        batch = ce_tokenizer(
            [text] * len(chunk), chunk,
            padding=True, truncation=True, max_length=CE_MAX_LEN, return_tensors="pt",
        )
        batch = {name: tensor.to(DEVICE) for name, tensor in batch.items()}
        logits = ce_model(**batch).logits
        chunks.append(F.softmax(logits, dim=-1)[:, 1].detach().cpu().numpy())
    return np.concatenate(chunks)

def topn_groups(texts: pd.Series, n: int) -> tuple[np.ndarray, np.ndarray]:
    """Return the ``n`` most probable groups per text, best first.

    Args:
        texts: Texts to classify with ``group_model``.
        n: Number of groups to keep per text.

    Returns:
        ``(groups, probs)``: two arrays with one row per text, holding the
        selected group labels and their calibrated probabilities. (The old
        annotation declared a single array although a pair was always returned.)
    """
    probs = group_model.predict_proba(texts.astype(str))
    top_idx = np.argsort(-probs, axis=1)[:, :n]
    return group_model.classes_[top_idx], np.take_along_axis(probs, top_idx, axis=1)

def ce_predict_topk_union(text: str, groups: list[str], k: int) -> list[str]:
    """Rank the union of the given groups' candidate labels for ``text``.

    Candidates are collected group by group; a demand_id seen more than once
    keeps its first description. Returns up to ``k`` demand_ids ordered by
    cross-encoder score (highest first), or ``[]`` when there are no candidates.
    """
    # Dicts keep insertion order, so this preserves first-seen candidate order.
    desc_by_demand = {}
    for g in groups:
        for did, desc in group_to_labels.get(str(g), []):
            desc_by_demand.setdefault(did, desc)
    if not desc_by_demand:
        return []

    dids = list(desc_by_demand)
    scores = ce_score_probs(text, list(desc_by_demand.values()))
    best_first = np.argsort(-scores)[:k]
    return [dids[i] for i in best_first]

def eval_ce(df_part: pd.DataFrame, top_n_groups: int, top_k: int):
    """Top-1 / top-k accuracy of the cross-encoder over ``df_part``.

    Candidates for each row come from its ``top_n_groups`` predicted groups.
    Rows with no candidates are left out of both the numerator and the
    denominator; ``eval_rows`` reports how many rows were actually scored.
    """
    groups_topn, _ = topn_groups(df_part[TEXT_COL], top_n_groups)
    gold = df_part[DEMAND_COL].astype(str).tolist()
    docs = df_part[TEXT_COL].astype(str).tolist()

    scored = 0
    hits_at_1 = 0
    hits_at_k = 0
    for doc, truth, row_groups in zip(docs, gold, groups_topn):
        ranked = ce_predict_topk_union(doc, list(row_groups), top_k)
        if ranked:
            scored += 1
            hits_at_1 += int(ranked[0] == truth)
            hits_at_k += int(truth in ranked)

    denom = max(1, scored)  # avoid 0/0 when nothing was scored
    return {"eval_rows": scored, "top1": hits_at_1/denom, f"top{top_k}": hits_at_k/denom}

# Evaluate the base cross-encoder on the validation split twice: with candidates
# from the single best predicted group, and from the TOP_N_GROUPS best groups.
ce_metrics_base_top1 = eval_ce(val_lab, top_n_groups=1, top_k=TOP_K_LABELS)
ce_metrics_base_topN = eval_ce(val_lab, top_n_groups=TOP_N_GROUPS, top_k=TOP_K_LABELS)
print("CE base top-1 group:", ce_metrics_base_top1)
print(f"CE base top-{TOP_N_GROUPS} groups:", ce_metrics_base_topN)


In [None]:
# Hard-negative mining + short fine-tune
def mine_hard_pairs(df_part: pd.DataFrame, hard_neg_per_pos: int) -> pd.DataFrame:
    """Build pairs whose negatives are the wrong labels the model scores highest.

    For each row, the candidates of its true group are scored with the current
    cross-encoder (randomly capped at ``MAX_CANDIDATES_PER_GROUP_MINING`` when
    that is set). The row contributes one positive pair plus its highest-scoring
    incorrect candidates as negatives. Rows whose group has no candidates are
    skipped entirely.
    """
    mined = []
    for _, rec in df_part.iterrows():
        text = str(rec[TEXT_COL])
        true_demand = str(rec[DEMAND_COL])
        candidates = group_to_labels.get(str(rec[GROUP_COL]), [])
        if not candidates:
            continue

        # Optional random cap on the number of scored candidates, for speed.
        cap = MAX_CANDIDATES_PER_GROUP_MINING
        if cap is not None and len(candidates) > cap:
            keep = RNG.choice(len(candidates), size=cap, replace=False)
            candidates = [candidates[int(j)] for j in keep]

        dids = [did for did, _ in candidates]
        descs = [desc for _, desc in candidates]
        ranking = np.argsort(-ce_score_probs(text, descs))

        # Positive pair: the row's own description.
        mined.append({"text": text, "description": str(rec["description"]), "labels": 1})

        # Walk candidates from highest score down, collecting incorrect ones.
        n_taken = 0
        for pos in ranking:
            pos = int(pos)
            if dids[pos] == true_demand:
                continue
            mined.append({"text": text, "description": str(descs[pos]), "labels": 0})
            n_taken += 1
            if n_taken >= hard_neg_per_pos:
                break
    return pd.DataFrame(mined)

# Mine hard negatives from the TRAINING split. Mining them from val_lab put
# validation texts (with their true descriptions) into the fine-tuning data,
# so the "hard" metrics computed on val_lab below were measured on examples the
# model had been trained on.
hard_pairs = mine_hard_pairs(train_lab, CE_NEG_HARD)
print("Hard pairs:", hard_pairs.shape, "pos rate:", hard_pairs["labels"].mean())

# Fine-tune on the original random-negative pairs plus the mined hard pairs.
ft_pairs = pd.concat([ce_train_pairs[["text","description","labels"]], hard_pairs], ignore_index=True)
ft_ds = to_hfds(ft_pairs)

# Short continuation run: no evaluation and no checkpoints.
ce_args_hard = TrainingArguments(
    output_dir=str(ARTIFACTS_DIR / "ce_out_hard"),
    learning_rate=CE_LR,
    per_device_train_batch_size=CE_BATCH_SIZE,
    num_train_epochs=CE_EPOCHS_HARD,
    evaluation_strategy="no",
    save_strategy="no",
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    report_to="none",
    seed=SEED,
)
ce_trainer_hard = Trainer(model=ce_model, args=ce_args_hard, train_dataset=ft_ds)
t0 = time.time()
ce_trainer_hard.train()
print("CE hard fine-tune seconds:", round(time.time() - t0, 1))

# val_lab is untouched by mining and fine-tuning, so these metrics are comparable
# with the base cross-encoder metrics.
ce_metrics_hard_top1 = eval_ce(val_lab, top_n_groups=1, top_k=TOP_K_LABELS)
ce_metrics_hard_topN = eval_ce(val_lab, top_n_groups=TOP_N_GROUPS, top_k=TOP_K_LABELS)
print("CE hard top-1 group:", ce_metrics_hard_top1)
print(f"CE hard top-{TOP_N_GROUPS} groups:", ce_metrics_hard_topN)


### 3B) Model B — Hierarchical Transformer (shared encoder + per-group head)

In [None]:
# Build per-group label list FROM TRAIN ONLY (avoid leakage)
train_groups = train_lab[GROUP_COL].astype(str)

# group -> sorted list of the demand_ids seen for that group in train
group_label_list = {
    str(g): sorted(sub[DEMAND_COL].astype(str).unique().tolist())
    for g, sub in train_lab.groupby(GROUP_COL)
}

# group -> {demand_id: position within the group's head}, and group -> head size
group_label_to_idx = {}
group_num_labels = {}
for _group, _labs in group_label_list.items():
    group_label_to_idx[_group] = {lab: i for i, lab in enumerate(_labs)}
    group_num_labels[_group] = len(_labs)

# Keep only validation rows whose label exists in train within the same group
def label_seen(row):
    """Return True when the row's (group, demand_id) pair occurs in the train split."""
    known_labels = group_label_to_idx.get(str(row[GROUP_COL]), ())
    return str(row[DEMAND_COL]) in known_labels

val_seen = val_lab[val_lab.apply(label_seen, axis=1)].copy()
print("Val rows:", len(val_lab), "| val rows with label seen in train:", len(val_seen))


In [None]:
ht_tokenizer = AutoTokenizer.from_pretrained(HT_MODEL)

def ht_tok(batch):
    """Tokenize a batch of texts, padded/truncated to HT_MAX_LEN."""
    texts = batch[TEXT_COL]
    return ht_tokenizer(texts, truncation=True, padding="max_length", max_length=HT_MAX_LEN)

def to_examples(df_part: pd.DataFrame) -> pd.DataFrame:
    """Reduce a labelled frame to the columns the hierarchical model trains on.

    Returns a frame with the text, the group (as ``group``) and
    ``label_local``: the label's position inside its group's label list, i.e.
    the target index for that group's head. Every (group, demand_id) pair must
    exist in ``group_label_to_idx``; an unknown pair raises ``KeyError``.
    """
    def _local_index(row):
        return group_label_to_idx[str(row[GROUP_COL])][str(row[DEMAND_COL])]

    ex = df_part[[TEXT_COL, GROUP_COL, DEMAND_COL]].copy()
    for id_column in (GROUP_COL, DEMAND_COL):
        ex[id_column] = ex[id_column].astype(str)
    ex["group"] = ex[GROUP_COL]
    ex["label_local"] = ex.apply(_local_index, axis=1)
    return ex[[TEXT_COL, "group", "label_local"]]

# Example frames: all train rows, and the validation rows whose label was seen in train.
ht_train_ex = to_examples(train_lab)
ht_val_ex   = to_examples(val_seen)

# Tokenize both splits.
train_ds = Dataset.from_pandas(ht_train_ex, preserve_index=False).map(ht_tok, batched=True)
val_ds   = Dataset.from_pandas(ht_val_ex, preserve_index=False).map(ht_tok, batched=True)

# Expose only the model inputs and the local label as torch tensors.
for _split_ds in (train_ds, val_ds):
    _split_ds.set_format(type="torch", columns=["input_ids","attention_mask","label_local"])

# Group names stay plain Python strings and are carried alongside the tensors.
train_groups_list = ht_train_ex["group"].tolist()
val_groups_list   = ht_val_ex["group"].tolist()

def build_torch_dataset(ds: Dataset, groups: list[str]):
    """Zip a formatted dataset with its group names.

    Returns a list of ``(input_ids, attention_mask, label_local, group)``
    tuples, one per example, as consumed by ``collate``.
    """
    columns = [ds[name] for name in ("input_ids", "attention_mask", "label_local")]
    return list(zip(*columns, groups))

train_torch = build_torch_dataset(train_ds, train_groups_list)
val_torch   = build_torch_dataset(val_ds, val_groups_list)

def collate(batch):
    """Stack per-example tuples into one batch dict.

    ``batch`` is a list of ``(input_ids, attention_mask, label, group)``
    tuples. The three tensor fields are stacked along a new leading dimension;
    group names are returned as a plain list in the same order.
    """
    input_ids, attention_masks, labels, groups = zip(*batch)
    return {
        "input_ids": torch.stack(list(input_ids)),
        "attention_mask": torch.stack(list(attention_masks)),
        "labels": torch.stack(list(labels)),
        "groups": list(groups),
    }

# Batch loaders over the tuple lists; only the training loader shuffles.
train_loader = DataLoader(train_torch, batch_size=HT_BATCH_SIZE, shuffle=True, collate_fn=collate)
val_loader   = DataLoader(val_torch, batch_size=HT_BATCH_SIZE, shuffle=False, collate_fn=collate)


In [None]:
class HierTransformer(nn.Module):
    def __init__(self, base_model_name: str, group_num_labels: dict[str,int], class_weights: dict[str, torch.Tensor] | None = None):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(base_model_name)
        hidden = self.encoder.config.hidden_size
        self.heads = nn.ModuleDict({g: nn.Linear(hidden, n) for g, n in group_num_labels.items()})
        self.class_weights = class_weights or {}

    def forward(self, input_ids, attention_mask, groups, labels=None):
        enc = self.encoder(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0]
        losses = []
        logits_per_example = []

        for i in range(enc.size(0)):
            g = groups[i]
            logits = self.heads[g](enc[i])  # [n_labels_in_group]
            logits_per_example.append(logits)

            if labels is not None:
                w = self.class_weights.get(g, None)
                loss_fn = nn.CrossEntropyLoss(weight=w.to(logits.device) if w is not None else None)
                losses.append(loss_fn(logits.unsqueeze(0), labels[i].unsqueeze(0)))

        loss = torch.stack(losses).mean() if losses else None
        return loss, logits_per_example

# Optional per-group class weights: inverse label frequency within each group,
# rescaled so the weights of a group average to 1.
class_w = {}
if HT_USE_CLASS_WEIGHTS:
    for g, sub in ht_train_ex.groupby("group"):
        counts = sub["label_local"].value_counts().sort_index()
        inverse_freq = counts.sum() / (counts + 1e-9)
        w = inverse_freq.values.astype(np.float32)
        w = w / w.mean()
        class_w[str(g)] = torch.tensor(w, dtype=torch.float32)

_ht_class_weights = class_w if HT_USE_CLASS_WEIGHTS else None
ht_model = HierTransformer(HT_MODEL, group_num_labels, class_weights=_ht_class_weights).to(DEVICE)
optim = torch.optim.AdamW(
    ht_model.parameters(),
    lr=HT_LR,
    weight_decay=HT_WEIGHT_DECAY,
)

def train_epoch():
    """Run one optimisation pass over ``train_loader``; return the mean batch loss."""
    ht_model.train()
    batch_losses = []
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        targets = batch["labels"].to(DEVICE)
        loss, _ = ht_model(input_ids, attention_mask, batch["groups"], targets)
        loss.backward()
        optim.step()
        batch_losses.append(float(loss.detach().cpu()))
    # max(1, ...) keeps an empty loader from dividing by zero.
    return sum(batch_losses) / max(1, len(batch_losses))

@torch.no_grad()
def eval_epoch():
    """Return the mean batch loss over ``val_loader`` with the model in eval mode."""
    ht_model.eval()
    batch_losses = []
    for batch in val_loader:
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        targets = batch["labels"].to(DEVICE)
        loss, _ = ht_model(input_ids, attention_mask, batch["groups"], targets)
        batch_losses.append(float(loss.detach().cpu()))
    # max(1, ...) keeps an empty loader from dividing by zero.
    return sum(batch_losses) / max(1, len(batch_losses))

# Train for HT_EPOCHS epochs and keep the weights of the epoch with the lowest
# validation loss.
best = 1e18          # lowest validation loss seen so far
best_state = None    # CPU copy of the state dict at that epoch
t0 = time.time()
for e in range(1, HT_EPOCHS+1):
    tr = train_epoch()
    va = eval_epoch()
    print(f"HT epoch {e}: train_loss={tr:.4f} val_loss={va:.4f}")
    if va < best:
        best = va
        # Clone to CPU so later optimiser steps cannot modify the snapshot.
        best_state = {k: v.detach().cpu().clone() for k, v in ht_model.state_dict().items()}

# Restore the best snapshot (skipped when no epoch improved on the initial value).
if best_state is not None:
    ht_model.load_state_dict(best_state)

print("HT training seconds:", round(time.time() - t0, 1))


In [None]:
@torch.no_grad()
def ht_predict_topk_union(text: str, groups: list[str], group_probs: list[float], k: int) -> list[str]:
    """Rank labels across several candidate groups with the hierarchical model.

    Each label is scored as ``P(group) * P(label | group)``, where the second
    factor is the softmax of that group's head. Groups without a head are
    skipped.

    Args:
        text: Input text.
        groups: Candidate group names.
        group_probs: Probability of each candidate group, aligned with ``groups``.
        k: Number of labels to return.

    Returns:
        Up to ``k`` distinct demand_ids, best score first; ``[]`` when none of
        the groups has a head.
    """
    token = ht_tokenizer(text, truncation=True, padding="max_length", max_length=HT_MAX_LEN, return_tensors="pt")
    token = {name: tensor.to(DEVICE) for name, tensor in token.items()}
    enc = ht_model.encoder(**token).last_hidden_state[:, 0][0]

    # Candidates kept per group before merging. This was a fixed 10, which
    # capped the result below k whenever k > 10; behaviour for k <= 10 is unchanged.
    per_group = max(10, k)

    cand = []
    for g, pg in zip(groups, group_probs):
        g = str(g)
        if g not in group_label_list:
            continue
        logits = ht_model.heads[g](enc)
        probs = F.softmax(logits, dim=-1).detach().cpu().numpy()
        labs = group_label_list[g]
        top_local = np.argsort(-probs)[: min(per_group, len(labs))]
        for i in top_local:
            cand.append((labs[int(i)], float(pg) * float(probs[int(i)])))

    if not cand:
        return []
    # Sort globally and keep the first (best-scoring) occurrence of each label.
    cand.sort(key=lambda x: -x[1])
    seen = set()
    out = []
    for lab, _ in cand:
        if lab in seen:
            continue
        seen.add(lab)
        out.append(lab)
        if len(out) >= k:
            break
    return out

def eval_ht(df_part: pd.DataFrame, top_n_groups: int, top_k: int) -> dict:
    """Top-1 / top-k accuracy of the hierarchical model over ``df_part``.

    Candidates for each row come from its ``top_n_groups`` predicted groups.
    Rows with no prediction are left out of both the numerator and the
    denominator; ``eval_rows`` reports how many rows were actually scored.
    """
    groups_topn, probs_topn = topn_groups(df_part[TEXT_COL], top_n_groups)
    gold = df_part[DEMAND_COL].astype(str).tolist()
    docs = df_part[TEXT_COL].astype(str).tolist()

    scored = 0
    hits_at_1 = 0
    hits_at_k = 0
    for doc, truth, row_groups, row_probs in zip(docs, gold, groups_topn, probs_topn):
        ranked = ht_predict_topk_union(doc, list(row_groups), list(row_probs), top_k)
        if ranked:
            scored += 1
            hits_at_1 += int(ranked[0] == truth)
            hits_at_k += int(truth in ranked)

    denom = max(1, scored)  # avoid 0/0 when nothing was scored
    return {"eval_rows": scored, "top1": hits_at_1/denom, f"top{top_k}": hits_at_k/denom}

# Evaluate the hierarchical model on val_seen (validation rows whose label was
# seen in train), with candidates from the best group and from the TOP_N_GROUPS
# best groups.
ht_metrics_top1 = eval_ht(val_seen, top_n_groups=1, top_k=TOP_K_LABELS)
ht_metrics_topN = eval_ht(val_seen, top_n_groups=TOP_N_GROUPS, top_k=TOP_K_LABELS)

print("HT top-1 group:", ht_metrics_top1)
print(f"HT top-{TOP_N_GROUPS} groups:", ht_metrics_topN)


## 4) Compare label models + choose best + save artifacts

In [None]:
# One row per (model, group-candidate setting), in a fixed order.
# NOTE(review): the CE rows are evaluated on val_lab while the HT rows use
# val_seen, so compare them together with their eval_rows counts.
rows = [
    {"model": "CE_base_top1group", **ce_metrics_base_top1},
    {"model": f"CE_base_top{TOP_N_GROUPS}groups", **ce_metrics_base_topN},
    {"model": "CE_hard_top1group", **ce_metrics_hard_top1},
    {"model": f"CE_hard_top{TOP_N_GROUPS}groups", **ce_metrics_hard_topN},
    {"model": "HT_top1group", **ht_metrics_top1},
    {"model": f"HT_top{TOP_N_GROUPS}groups", **ht_metrics_topN},
]

metrics = pd.DataFrame(rows)
metrics_path = ARTIFACTS_DIR / "comparison_metrics.csv"
metrics.to_csv(metrics_path, index=False)
metrics


In [None]:
# Visualization: one bar chart for Top-1 and one for Top-K accuracy.
_charts = (
    ("top1", "Top-1 accuracy", "Label model comparison — Top-1"),
    (f"top{TOP_K_LABELS}", f"Top-{TOP_K_LABELS} accuracy", f"Label model comparison — Top-{TOP_K_LABELS}"),
)
for _metric_col, _y_label, _chart_title in _charts:
    plt.figure(figsize=(10,4))
    plt.bar(metrics["model"], metrics[_metric_col])
    plt.xticks(rotation=45, ha="right")
    plt.ylabel(_y_label)
    plt.title(_chart_title)
    plt.grid(True, axis="y", alpha=0.3)
    plt.tight_layout()
    plt.show()


In [None]:
# Rank label-model variants by Top-K accuracy, breaking ties on Top-1.
_ranking_cols = [f"top{TOP_K_LABELS}", "top1"]
metrics_sorted = metrics.sort_values(_ranking_cols, ascending=False)
best_label_variant = metrics_sorted.iloc[0]["model"]
print("Best label variant:", best_label_variant)

# The cross-encoder (model + tokenizer) is saved regardless of which variant won.
ce_save_dir = ARTIFACTS_DIR / "cross_encoder"
ce_save_dir.mkdir(parents=True, exist_ok=True)
for _component in (ce_model, ce_tokenizer):
    _component.save_pretrained(ce_save_dir)

# Candidate mapping: group -> list of {"demand_id", "description"} records.
_group_to_labels_payload = {
    g: [{"demand_id": d, "description": desc} for d, desc in pairs]
    for g, pairs in group_to_labels.items()
}
(ARTIFACTS_DIR / "group_to_labels.json").write_text(
    json.dumps(_group_to_labels_payload, indent=2),
    encoding="utf-8"
)

# Hierarchical transformer: weights, plus the metadata needed to rebuild its heads.
torch.save(ht_model.state_dict(), ARTIFACTS_DIR / "hierarchical_transformer.pt")
meta = {
    "ht_model": HT_MODEL,
    "group_label_list": group_label_list,
    "ht_max_len": HT_MAX_LEN,
}
(ARTIFACTS_DIR / "hierarchical_transformer_meta.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")

print("Saved label artifacts to:", ARTIFACTS_DIR.resolve())


## 5) What you deploy

- **Relevance**: `artifacts/relevance_model.joblib` (model + threshold)
- **Group model**: `artifacts/group_model.joblib`
- **Label model**: usually `artifacts/cross_encoder/` + `group_to_labels.json` + `TOP_N_GROUPS`
  - For speed: use `hierarchical_transformer.pt` + meta + group model.


## (Optional) Run this notebook on Azure ML (v2 SDK) — compute + environment + command job

In [None]:
# This cell is meant to be executed *locally* (e.g., on your laptop/Jupyter).
# It submits an Azure ML command job that runs THIS notebook on a remote compute cluster via Papermill.
# When the job runs remotely, AZUREML_RUN_ID is set, so we skip submission logic to avoid recursion.

import os

if os.getenv("AZUREML_RUN_ID"):
    print("Running inside Azure ML job; skipping job-submission cell.")
else:
    # 1) Install deps (run once per local environment)
    # %pip install -U azure-ai-ml azure-identity papermill

    import tempfile
    from pathlib import Path

    from azure.identity import DefaultAzureCredential
    from azure.ai.ml import MLClient, command, Input
    from azure.ai.ml.entities import AmlCompute, Environment
    from azure.ai.ml.constants import AssetTypes

    # 2) Auth + workspace
    # Preferred: config.json in the same folder (created by `az ml folder attach -w <ws> -g <rg>`),
    # or set env vars: AZURE_SUBSCRIPTION_ID, AZURE_RESOURCE_GROUP, AZURE_WORKSPACE_NAME
    try:
        ml_client = MLClient.from_config(credential=DefaultAzureCredential(exclude_shared_token_cache_credential=True))
        print("Loaded workspace from config.json")
    except Exception:
        # No usable config.json: fall back to explicit env vars (KeyError if one is missing).
        sub_id = os.environ["AZURE_SUBSCRIPTION_ID"]
        rg     = os.environ["AZURE_RESOURCE_GROUP"]
        ws     = os.environ["AZURE_WORKSPACE_NAME"]
        ml_client = MLClient(DefaultAzureCredential(exclude_shared_token_cache_credential=True), sub_id, rg, ws)
        print("Loaded workspace from env vars")

    # 3) Compute: reuse the named cluster, or create it when the lookup fails.
    compute_name = os.getenv("AML_COMPUTE_NAME", "cpu-cluster")
    try:
        ml_client.compute.get(compute_name)
        print(f"Compute '{compute_name}' exists.")
    except Exception:
        cpu_cluster = AmlCompute(
            name=compute_name,
            type="amlcompute",
            size=os.getenv("AML_VM_SIZE", "STANDARD_D4_V3"),
            min_instances=int(os.getenv("AML_MIN_INSTANCES", "0")),
            max_instances=int(os.getenv("AML_MAX_INSTANCES", "2")),
            idle_time_before_scale_down=120,
        )
        ml_client.compute.begin_create_or_update(cpu_cluster).result()
        print(f"Created compute '{compute_name}'.")

    # 4) Environment (conda)
    env_name = os.getenv("AML_ENV_NAME", "nlp-pipeline-env")
    env_version = os.getenv("AML_ENV_VERSION", "1")
    # ipykernel is required: Papermill needs an installed Jupyter kernel to
    # execute the notebook, and none of the other packages provides one.
    conda_yaml = """name: nlp-pipeline
channels:
  - conda-forge
dependencies:
  - python=3.10
  - pip
  - pip:
      - pandas==2.2.2
      - numpy==1.26.4
      - scikit-learn==1.5.1
      - matplotlib==3.9.0
      - joblib==1.4.2
      - torch==2.3.1
      - transformers==4.44.2
      - datasets==2.20.0
      - accelerate==0.33.0
      - sentence-transformers==3.0.1
      - lightgbm==4.5.0
      - papermill==2.6.0
      - ipykernel==6.29.5
"""

    # Environment(conda_file=...) expects a path to a conda YAML file, not the
    # YAML text itself, so write the spec to a temporary file and pass its path.
    conda_path = Path(tempfile.mkdtemp(prefix="aml_env_")) / "conda.yaml"
    conda_path.write_text(conda_yaml, encoding="utf-8")

    env = Environment(
        name=env_name,
        version=env_version,
        description="Env for full_pipeline_end_to_end.ipynb (Azure ML v2)",
        conda_file=str(conda_path),
        image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
    )
    env = ml_client.environments.create_or_update(env)
    print("Registered env:", env.name, env.version)

    # 5) Submit a command job that executes the notebook with Papermill
    # Inputs are auto-uploaded from local paths.
    job = command(
        code=".",  # folder that contains this notebook and the CSVs
        command=(
            "papermill full_pipeline_end_to_end.ipynb outputs/executed.ipynb "
            "-p TRAIN_CSV ${{inputs.train_csv}} "
            "-p TEST_CSV ${{inputs.test_csv}} "
            "-p LABELS_CSV ${{inputs.labels_csv}} "
            "-p OUTPUT_DIR outputs"
        ),
        inputs={
            "train_csv": Input(type=AssetTypes.URI_FILE, path=str(TRAIN_CSV)),
            "test_csv":  Input(type=AssetTypes.URI_FILE, path=str(TEST_CSV)),
            "labels_csv":Input(type=AssetTypes.URI_FILE, path=str(LABELS_CSV)),
        },
        environment=f"{env.name}:{env.version}",
        compute=compute_name,
        display_name="nlp-full-pipeline-notebook",
        experiment_name=os.getenv("AML_EXPERIMENT_NAME", "nlp_full_pipeline"),
    )

    returned_job = ml_client.jobs.create_or_update(job)
    print("Submitted job:", returned_job.name)
    print("Studio URL:", returned_job.studio_url)
