## 1. Load pipeline & metadata
We’ll load:
- `models/baseline_pipeline.joblib` (TF-IDF + One-vs-Rest Logistic Regression)
- `models/baseline_meta.json` (has `label_cols`)

If the `models/` folder is elsewhere, adjust the path below.


In [3]:
from pathlib import Path
import json, joblib

# Default artifact directory, relative to the notebook's working directory.
MODELS_DIR = Path("models")

# Placeholder for relocated artifacts: if the pipeline file is not in the
# default directory, re-point MODELS_DIR here before the paths are derived.
pipeline_in_default_dir = (MODELS_DIR / "baseline_pipeline.joblib").exists()
if not pipeline_in_default_dir:
    # Candidate locations (uncomment/adjust the one that applies):
    # MODELS_DIR = Path.cwd().parent / "ui" / "models"
    # MODELS_DIR = Path.cwd().parent / "models"
    pass

PIPE_PATH = MODELS_DIR / "baseline_pipeline.joblib"
META_PATH = MODELS_DIR / "baseline_meta.json"

# Fail early, with the offending path in the message, if either artifact is missing.
assert PIPE_PATH.exists(), f"Pipeline not found at {PIPE_PATH}"
assert META_PATH.exists(), f"Metadata not found at {META_PATH}"

# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
pipe = joblib.load(PIPE_PATH)

meta = json.loads(META_PATH.read_text(encoding="utf-8"))

# The label names are stored under "label_cols" in the metadata file.
LABELS = meta.get("label_cols")
assert isinstance(LABELS, list) and LABELS, "meta['label_cols'] missing or empty."

# Objects without a `steps` attribute simply report an empty step list.
step_names = [step_name for step_name, _step in getattr(pipe, "steps", [])]
print("Loaded pipeline steps:", step_names)
print("Labels:", LABELS)


Loaded pipeline steps: ['tfidf', 'clf']
Labels: ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


## 2. Extract vectorizer and per-label models
From the pipeline:
- `tfidf = pipe.named_steps["tfidf"]`
- `ovr = pipe.named_steps["clf"]` → `ovr.estimators_` is a list of classifiers, one per label.

We’ll map `LABELS[i] → ovr.estimators_[i]` and build a `models` dict (label → fitted per-label estimator).


In [4]:
# Step 1: the fitted TF-IDF vectorizer.
assert "tfidf" in pipe.named_steps, "Pipeline has no 'tfidf' step."
vectorizer = pipe.named_steps["tfidf"]

# Step 2: the One-vs-Rest wrapper that holds one classifier per label.
assert "clf" in pipe.named_steps, "Pipeline has no 'clf' step (expected OneVsRestClassifier)."
ovr = pipe.named_steps["clf"]

# `estimators_` is what this cell reads below; its absence means there is
# nothing to explain.
assert hasattr(ovr, "estimators_"), "ovr.estimators_ not found — was the pipeline fitted?"

estimators = ovr.estimators_
assert len(estimators) == len(LABELS), f"Estimator count {len(estimators)} != labels {len(LABELS)}."

# Pair labels with estimators positionally: LABELS[i] -> estimators[i].
models = dict(zip(LABELS, estimators))

# Sanity print: vectorizer class plus the classes of the first three per-label models.
print("Vectorizer:", type(vectorizer).__name__)
first_model_types = {lbl: type(models[lbl]).__name__ for lbl in list(models)[:3]}
print("Per-label models:", first_model_types, "... total:", len(models))


Vectorizer: TfidfVectorizer
Per-label models: {'toxic': 'LogisticRegression', 'severe_toxic': 'LogisticRegression', 'obscene': 'LogisticRegression'} ... total: 6


## 3. Single-label linear explainer
For a chosen `label`, we compute:
- `X = vectorizer.transform([text])`
- `z = w·X + b` (logit) from the per-label estimator
- `p = 1 / (1 + exp(-z))`
- token-level contributions: `contrib[token] = w[token] × X[token]`

We’ll sum contributions per token string (a safeguard: uni-grams and bi-grams are separate vocabulary entries, so `idiot` and `an idiot` are reported separately) and return:
- `prob`, `logit`, `bias`, and a dict `token_contributions`.


In [5]:
import numpy as np
import re

def _logit_from_estimator(X, est):
    """
    Return the scalar logit z of a binary estimator for a single-row X.

    Uses `decision_function` when the estimator has one; otherwise derives the
    logit from the positive-class column of `predict_proba`.
    Raises: ValueError if the estimator offers neither method.
    """
    if hasattr(est, "decision_function"):
        # The score may come back as shape (1,) or (1, 1); flatten and take the first entry.
        scores = np.asarray(est.decision_function(X))
        return float(scores.ravel()[0])

    if not hasattr(est, "predict_proba"):
        raise ValueError("Estimator lacks decision_function/predict_proba")

    positive = est.predict_proba(X)[:, 1][0]
    # Clip away exact 0/1 so the log-odds below stay finite.
    clipped = float(np.clip(positive, 1e-9, 1 - 1e-9))
    return float(np.log(clipped / (1 - clipped)))

def _unpack_linear_params(model):
    """Return (w, b) from an object exposing `coef_` and, optionally, `intercept_`."""
    w = model.coef_.ravel()
    # `intercept_` is not guaranteed to be a 1-D array: it may be a plain float
    # or a 0-d numpy scalar (seen on some fits without an intercept — confirm
    # for your estimator), or be missing altogether. Normalise to 1-D before
    # indexing so none of those shapes raises.
    flat = np.atleast_1d(getattr(model, "intercept_", 0.0)).ravel()
    b = float(flat[0]) if flat.size else 0.0
    return w, b


def _weights_intercept(est):
    """
    Get (w, b) on linear models; unwrap common wrappers if needed.

    Looks for `coef_` on `est` itself first, then on the object stored under
    `base_estimator`, `estimator` or `classifier` (first match wins).
    Returns: (w, b) where w is the flattened `coef_` and b is the first
             intercept entry as a float (0.0 when no intercept is available).
    Raises: ValueError if no `coef_` can be found.
    """
    # Direct linear model
    if hasattr(est, "coef_"):
        return _unpack_linear_params(est)
    # Common wrapper attributes that may hold the underlying linear model
    for attr in ("base_estimator", "estimator", "classifier"):
        inner = getattr(est, attr, None)
        if inner is not None and hasattr(inner, "coef_"):
            return _unpack_linear_params(inner)
    raise ValueError("Cannot access linear weights (coef_). Is the base model linear?")

def linear_token_contribs(text: str, label: str, *, vectorizer, models):
    """
    Explain one label's score for `text` as per-token logit contributions.

    Each feature active in the vectorized text contributes
    weight[feature] * value[feature]; contributions are keyed by the token
    string that the feature index maps to in `vectorizer.vocabulary_`.
    Returns: dict with prob, logit, bias, token_contributions (dict token->float).
    """
    # Same preprocessing/tokenization as the fitted vectorizer applies.
    row = vectorizer.transform([text])

    estimator = models[label]
    logit = _logit_from_estimator(row, estimator)
    prob = 1.0 / (1.0 + np.exp(-logit))
    weights, bias = _weights_intercept(estimator)

    # Stored (active) feature indices and their values for this single row.
    csr_row = row.tocsr()
    active = csr_row.indices
    per_feature = weights[active] * csr_row.data  # logit-space contributions

    # Invert token -> index so each active index can be named.
    vocab = vectorizer.vocabulary_
    index_to_token = dict(zip(vocab.values(), vocab.keys()))
    names = [index_to_token[index] for index in active]

    # Sum per token string, in case the same token appears more than once.
    token_contributions = {}
    for name, value in zip(names, per_feature):
        token_contributions[name] = token_contributions.get(name, 0.0) + float(value)

    return {
        "prob": float(prob),
        "logit": float(logit),
        "bias": float(bias),
        "token_contributions": token_contributions,
    }


## 4. Validate on a sample
We’ll run the explainer on one short comment for a chosen label and inspect the top contributors by absolute value.


In [6]:
# One short comment and the label to explain (first label by default; change as needed).
sample_text = "You are an idiot. This is the worst comment ever."
focus_label = LABELS[0]

exp = linear_token_contribs(
    sample_text,
    focus_label,
    vectorizer=vectorizer,
    models=models,
)

# Rank tokens by the magnitude of their contribution, largest first.
top = list(exp["token_contributions"].items())
top.sort(key=lambda pair: abs(pair[1]), reverse=True)

print(dict(label=focus_label, prob=round(exp["prob"], 4), logit=round(exp["logit"], 4)))
print("Top contributors (logit space):")
for token, contribution in top[:12]:
    print(f"{token:20s}  {contribution:+.4f}")


{'label': 'toxic', 'prob': 0.9999, 'logit': 8.8362}
Top contributors (logit space):
idiot                 +4.2091
an idiot              +1.2153
worst                 +0.9780
you                   +0.8191
the worst             +0.7471
are an                +0.6704
ever                  +0.6434
you are               +0.6361
are                   +0.3395
this is               -0.1512
is                    +0.1357
the                   -0.1315


## 5. What-if analysis
We’ll remove or replace a token (case-insensitive, whole-word) and see how the probability for the focused label changes.


In [7]:
import re
import numpy as np

def what_if_remove(text: str, token: str, label: str, *, vectorizer, models):
    """
    Blank out every whole-word, case-insensitive occurrence of `token` and
    report how the probability for `label` changes.
    Returns: dict with action, token, before, after, delta (after - before),
             edited_text.
    """
    pat = rf"\b{re.escape(token)}\b"
    # Substitute a space rather than nothing so neighbouring words are not glued together.
    edited = re.compile(pat, flags=re.IGNORECASE).sub(" ", text)

    def _prob(candidate):
        result = linear_token_contribs(candidate, label, vectorizer=vectorizer, models=models)
        return result["prob"]

    before = _prob(text)
    after = _prob(edited)
    return {
        "action": "remove",
        "token": token,
        "before": float(before),
        "after": float(after),
        "delta": float(after - before),
        "edited_text": edited,
    }

def what_if_replace(text: str, token: str, replacement: str, label: str, *, vectorizer, models):
    """
    Replace every whole-word, case-insensitive occurrence of `token` with
    `replacement` and report how the probability for `label` changes.

    `replacement` is inserted verbatim: backslashes and group references such
    as "\\1" or "\\g<0>" are treated as plain text, not as a regex template.
    Returns: dict with action, token, replacement, before, after,
             delta (after - before), edited_text.
    """
    pat = rf"\b{re.escape(token)}\b"
    # Passing a callable as `repl` bypasses re.sub's template processing, so a
    # replacement containing "\" can neither raise re.error nor be expanded.
    edited = re.sub(pat, lambda _match: replacement, text, flags=re.IGNORECASE)
    before = linear_token_contribs(text,   label, vectorizer=vectorizer, models=models)["prob"]
    after  = linear_token_contribs(edited, label, vectorizer=vectorizer, models=models)["prob"]
    return {
        "action": "replace",
        "token": token,
        "replacement": replacement,
        "before": float(before),
        "after":  float(after),
        "delta":  float(after - before),
        "edited_text": edited,
    }


In [8]:
# Demo: drop "idiot" from the sample text and compare the focused label's
# probability before vs. after. Left as a bare expression so its value is the cell's result.
what_if_remove(sample_text, "idiot", focus_label, vectorizer=vectorizer, models=models)


{'action': 'remove',
 'token': 'idiot',
 'before': 0.9998546402762276,
 'after': 0.981243896211523,
 'delta': -0.018610744064704643,
 'edited_text': 'You are an  . This is the worst comment ever.'}

## 6. Explain all labels + compact JSON
We’ll compute:
- Per-label probabilities for the text.
- One focused label’s token contributions.
- A compact payload suitable for the UI to download or display.


In [9]:
def explain_all_labels(text: str, *, vectorizer, models, labels):
    """
    Compute the probability of every label in `labels` for `text`.

    The text is vectorized once and the same row is scored by each per-label
    model; only the logit is needed here, so token contributions are not
    computed.
    Returns: dict label -> probability (float), in the order of `labels`.
    """
    # The vectorized row does not depend on the label, so build it outside the loop.
    X = vectorizer.transform([text])
    probs = {}
    for lbl in labels:
        z = _logit_from_estimator(X, models[lbl])
        # Same sigmoid as linear_token_contribs uses, so values match its "prob".
        probs[lbl] = float(1.0 / (1.0 + np.exp(-z)))
    return probs

def build_explanation_payload(text: str, label: str, *, vectorizer, models, labels, threshold: float = 0.5):
    """
    Assemble a dict describing the prediction for `text`.

    Combines the probabilities of all `labels` with the bias, logit and token
    contributions of the focused `label`; `predicted` is True when the focused
    label's probability is at least `threshold`.
    Returns: dict with text, label, threshold, probabilities, explanation, predicted.
    """
    focused = linear_token_contribs(text, label, vectorizer=vectorizer, models=models)
    threshold_value = float(threshold)
    all_probs = explain_all_labels(text, vectorizer=vectorizer, models=models, labels=labels)

    explanation = {
        "bias": float(focused["bias"]),
        "logit": float(focused["logit"]),
        # dict token -> logit contribution
        "token_contributions": focused["token_contributions"],
    }
    is_positive = bool(focused["prob"] >= threshold)

    return {
        "text": text,
        "label": label,
        "threshold": threshold_value,
        "probabilities": all_probs,
        "explanation": explanation,
        "predicted": is_positive,
    }


In [10]:
import json

payload = build_explanation_payload(
    sample_text,
    focus_label,
    vectorizer=vectorizer,
    models=models,
    labels=LABELS,
    threshold=0.5,
)

# Preview only: print the first 800 characters of the pretty-printed JSON.
serialized = json.dumps(payload, indent=2)
print(serialized[:800], "...")


{
  "text": "You are an idiot. This is the worst comment ever.",
  "label": "toxic",
  "threshold": 0.5,
  "probabilities": {
    "toxic": 0.9998546402762276,
    "severe_toxic": 0.11633880980412105,
    "obscene": 0.899354640984587,
    "threat": 0.013199508069555322,
    "insult": 0.9997739805767635,
    "identity_hate": 0.24543510433230759
  },
  "explanation": {
    "bias": -1.4071821295020228,
    "logit": 8.836153663888966,
    "token_contributions": {
      "an": 0.04634766088388386,
      "an idiot": 1.2153161457573238,
      "are": 0.33954640046836143,
      "are an": 0.6704142675447535,
      "comment": 0.027014049151402125,
      "ever": 0.6434071197386951,
      "idiot": 4.209143713050065,
      "idiot this": 0.026189527321537114,
      "is": 0.1356997950192003,
      "is the": ...
