In [1]:
# Cell 1 — Setup for explainability
%run ./00_config.ipynb

import os, torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 1) Restore the best model and its tokenizer from the directory named in the config.
model_dir = cfg.paths.model_dir
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Move the model to the chosen device, then put it in evaluation mode.
model.to(device)
model.eval()

print("‚úÖ Model loaded for explainability")
print("Device:", device)
print("Labels:", list(cfg.labels))

# 2) Ensure SHAP is available
try:
    import shap
    print("SHAP version:", shap.__version__)
except ImportError:
    # Only a failed import justifies an install attempt. Any other exception
    # raised while importing SHAP is left to propagate instead of being masked
    # by a pip call.
    print("SHAP not found. Installing…")
    import importlib, subprocess, sys
    # Install with the interpreter that is running this kernel.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "shap"])
    # The package was installed while this process was running, so the import
    # system's finder caches must be refreshed before importing it.
    importlib.invalidate_caches()
    import shap
    print("SHAP version (after install):", shap.__version__)


Python version: 3.13.7 (tags/v3.13.7:bcee1c3, Aug 14 2025, 14:15:11) [MSC v.1944 64 bit (AMD64)]
CUDA available: False
Running on CPU
../../data/train_data.csv
microsoft/mdeberta-v3-base
‚úÖ Config loaded and random seed set to: 42
üìÇ Model directory: ../models/best
üìÇ Reports directory: ../reports
‚úÖ Folder setup complete.
‚úÖ Found: ..\..\data\train_data.csv
‚úÖ Found: ..\..\data\test_data.csv

All required data files are present and accessible.
‚úÖ Configuration snapshot saved at:
../reports\config_snapshot.json


  from .autonotebook import tqdm as notebook_tqdm


‚úÖ Model loaded for explainability
Device: cpu
Labels: ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
SHAP version: 0.49.1


In [3]:
# Cell 2 — SHAP text explainer for one label

import shap
import numpy as np
import os

# 1) Choose which label to explain (you can change this).
#    The name must appear in cfg.labels; list.index() raises ValueError otherwise.
label_to_explain = "toxic"   # e.g., "insult", "obscene", etc.
# Position of the chosen label in cfg.labels; later code uses it to select the
# matching column of the model's per-label probabilities.
label_idx = list(cfg.labels).index(label_to_explain)
print("Explaining label:", label_to_explain, "‚Üí index", label_idx)

# 2) Small list of background texts, originally described as a way to speed up SHAP on CPU.
#    NOTE(review): no visible cell passes `background_texts` to the masker or the
#    explainer — confirm whether it is still needed.
background_texts = [
    "Have a nice day.",
    "I disagree with your point.",
    "This is helpful, thank you.",
    "Please keep the discussion civil.",
    "I don't think that's correct."
]

# 3) Scoring function handed to SHAP: texts in, probability of the chosen label out
def f_label(texts):
    """Return the sigmoid probability of the selected label for each text.

    Args:
        texts: A single string, or an iterable of strings. A lone string is
            treated as a batch of one.

    Returns:
        NumPy array holding, for every input text, column ``label_idx`` of the
        sigmoid-activated model logits.
    """
    batch = [texts] if isinstance(texts, str) else list(texts)
    # Sequence length is capped at 128 tokens or the configured training length,
    # whichever is smaller.
    token_cap = min(128, cfg.train.max_len)
    with torch.no_grad():
        encoded = tokenizer(
            batch,
            truncation=True,
            padding=True,
            max_length=token_cap,
            return_tensors="pt",
        )
        encoded = encoded.to(model.device)
        logits = model(**encoded).logits
        label_probs = torch.sigmoid(logits).detach().cpu().numpy()
    # Keep only the column that belongs to the label being explained.
    return label_probs[:, label_idx]

# 4) Build the SHAP explainer for text input.
#    The masker is constructed from the model's tokenizer; SHAP uses it to split
#    and mask the text. The explainer then scores the masked variants with f_label.
masker = shap.maskers.Text(tokenizer=tokenizer)
explainer = shap.Explainer(f_label, masker=masker)

print("‚úÖ SHAP explainer ready for label:", label_to_explain)


Explaining label: toxic ‚Üí index 0

‚úÖ SHAP explainer ready for label: toxic


In [9]:
# Custom inline SHAP text visualization (red = negative ↓, blue = positive ↑)
from IPython.display import HTML, display
import numpy as np, html, os, torch

def render_shap_text_inline(exp, *, label_name="", title="SHAP Text Explanation", save_path=None, prob=None):
    """Render one SHAP text explanation as colour-coded HTML, inline and optionally on disk.

    Each token is wrapped in a ``<span>`` whose background is blue for a
    non-negative attribution and red for a negative one. The opacity grows with
    the attribution's magnitude relative to the largest magnitude in ``exp``.

    Args:
        exp: Explanation for a single text. Only ``exp.data`` (the tokens) and
            ``exp.values`` (one attribution per token) are read.
        label_name: Label shown in the header; HTML-escaped before insertion.
        title: Page title and heading; HTML-escaped before insertion.
        save_path: When truthy, the HTML document is also written to this path
            as UTF-8. Missing parent directories are created; a bare filename
            is written to the current working directory.
        prob: Optional predicted probability, shown with three decimals.

    Returns:
        None. The document is displayed inline and, if requested, saved.
    """
    tokens = list(exp.data)
    values = np.array(exp.values, dtype=float)
    # Scale colours by the largest attribution. Fall back to 1.0 when every
    # value is zero (or there are none) so the scale is never zero.
    max_abs = float(np.max(np.abs(values))) if np.any(values) else 1.0

    def color_for(v):
        # blue for positive (push up), red for negative (pull down)
        alpha = min(abs(v) / (max_abs + 1e-8), 1.0)
        if v >= 0:
            return f"background-color: rgba(0,120,255,{0.15 + 0.6*alpha});"
        return f"background-color: rgba(255,0,0,{0.15 + 0.6*alpha});"

    # Tokens come from arbitrary text, so they are escaped before being embedded.
    spans = [
        f'<span style="padding:2px 4px; margin:1px; border-radius:4px; {color_for(val)}">'
        f'{html.escape(str(tok))}</span>'
        for tok, val in zip(tokens, values)
    ]

    legend = """
    <div style="font:14px/1.4 system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial;">
      <div style="margin-bottom:8px;"><b>Legend</b> — blue: pushes <i>up</i> (positive), red: pulls <i>down</i> (negative); intensity ∝ |SHAP|</div>
      <div style="display:flex; gap:12px; align-items:center; margin-bottom:12px;">
        <span style="background:rgba(0,120,255,0.35); padding:4px 8px; border-radius:4px;">positive ↑</span>
        <span style="background:rgba(255,0,0,0.35); padding:4px 8px; border-radius:4px;">negative ↓</span>
      </div>
    </div>
    """

    prob_line = f'<div style="color:#555; margin-bottom:4px;">Predicted P({html.escape(label_name)}): <b>{prob:.3f}</b></div>' if prob is not None else ""

    html_doc = f"""<!doctype html>
<html>
<head><meta charset="utf-8" /><title>{html.escape(title)}</title></head>
<body style="font:16px/1.6 system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial; padding:24px;">
  <h2 style="margin:0 0 8px;">{html.escape(title)}</h2>
  <div style="color:#555; margin-bottom:4px;">Label: <b>{html.escape(label_name)}</b></div>
  {prob_line}
  <div style="color:#777; margin-bottom:16px;">Max |SHAP|: {max_abs:.4f}</div>
  {legend}
  <div style="border:1px solid #eee; padding:12px; border-radius:8px;">
    {' '.join(spans)}
  </div>
</body>
</html>"""

    # inline display
    display(HTML(html_doc))

    # optional save
    if save_path:
        parent_dir = os.path.dirname(save_path)
        # A bare filename has an empty dirname, and os.makedirs("") raises,
        # so only create the parent when there is one.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(save_path, "w", encoding="utf-8") as f:
            f.write(html_doc)
        print("✅ Saved inline HTML to:", save_path)

# ---- Use it for the current sample (uses variables from previous cells) ----
# `text_to_explain`, `idx` and `exp` are not defined by any cell shown in this
# notebook, so on a fresh kernel this cell cannot run. Fail early with a message
# that says what is missing instead of a bare NameError halfway through.
# NOTE(review): `exp` is assumed to be the explanation of `text_to_explain` — confirm.
_required_names = ("text_to_explain", "idx", "exp")
_missing_names = [name for name in _required_names if name not in globals()]
if _missing_names:
    raise NameError(
        "Run the cell that selects a sample and computes its SHAP explanation first; "
        f"undefined: {', '.join(_missing_names)}"
    )

# Probability for the same text/label, computed with f_label so the number shown
# comes from the same function (same tokenizer settings) that SHAP explains.
p = float(f_label([text_to_explain])[0])

out_html = os.path.join(cfg.paths.reports_dir, "figs", f"shap_inline_{label_to_explain}_sample{idx}.html")
render_shap_text_inline(exp, label_name=label_to_explain, save_path=out_html, prob=p)


‚úÖ Saved inline HTML to: ../reports\figs\shap_inline_toxic_sample1.html


In [10]:
# Cell 4 — Batch SHAP gallery
# Requires: tokenizer, model, explainer, label_to_explain, label_idx, render_shap_text_inline

import os, torch, numpy as np, pandas as pd
from IPython.display import HTML, display

# Output folder for this label's gallery, under <reports_dir>/figs.
gallery_dir = os.path.join(cfg.paths.reports_dir, "figs", f"shap_gallery_{label_to_explain}")
os.makedirs(gallery_dir, exist_ok=True)

# ---------- choose inputs ----------
USE_TOP_FROM_VAL = 'val_df' in globals()  # if val_df is loaded from 01_data, use it
TOP_K = 8                                  # how many examples to export
MAX_SCAN = 300                             # scan first N val rows for speed (CPU friendly)

# Both branches score their texts with f_label, the function the explainer was
# built from, so the displayed probabilities match what SHAP explains and the
# tokenise -> sigmoid -> column-select logic exists in one place only.
if USE_TOP_FROM_VAL:
    # Work on a copy so the scratch "__prob__" column never reaches val_df.
    scan_df = val_df.head(MAX_SCAN).copy()
    model.eval()
    probs_all = f_label(scan_df["comment"].tolist())
    scan_df["__prob__"] = probs_all
    # Keep the TOP_K comments with the highest probability for the chosen label.
    chosen = (
        scan_df.sort_values("__prob__", ascending=False)
        .head(TOP_K)[["comment", "__prob__"]]
        .reset_index(drop=True)
    )
    texts = chosen["comment"].tolist()
    probs_for_texts = chosen["__prob__"].to_numpy()
    source_note = f"Top {TOP_K} from val_df (out of first {MAX_SCAN} rows) by P({label_to_explain})."
else:
    # Fallback when no validation frame is loaded: a few hand-written examples.
    texts = [
        "I completely disagree with you but let's keep it respectful.",
        "You are an absolute idiot and a disgrace.",
        "Have a nice day!",
        "This is the dumbest thing I've read.",
        "Thanks for the clarification, appreciate it."
    ][:TOP_K]
    # compute probabilities for display
    probs_for_texts = f_label(texts)
    source_note = "Custom sample texts."

# Imported here so this cell does not depend on an earlier cell having imported it.
import html

# ---------- render & save each ----------
# NOTE: this loop rebinds the notebook-level names `exp` and `p`, which the
# single-sample cell above also uses.
index_rows = []
for i, (txt, p) in enumerate(zip(texts, probs_for_texts), start=1):
    exps = explainer([txt])
    exp = exps[0]
    out_path = os.path.join(gallery_dir, f"shap_{label_to_explain}_{i:02d}.html")
    render_shap_text_inline(exp, label_name=label_to_explain, save_path=out_path, prob=float(p))
    # Store the raw, truncated text; it is HTML-escaped when the index is written.
    index_rows.append((i, out_path, float(p), txt[:90] + ("…" if len(txt) > 90 else "")))

# ---------- write simple gallery index ----------
index_html = os.path.join(gallery_dir, "index.html")
safe_label = html.escape(label_to_explain)
# The texts are arbitrary comments, so everything interpolated into markup is
# escaped; unescaped "<" or "&" would otherwise break or inject into the page.
html_rows = "\n".join(
    f'<tr><td style="padding:6px 10px;">{i:02d}</td>'
    f'<td style="padding:6px 10px;"><a href="{html.escape(os.path.basename(path))}" target="_blank">view</a></td>'
    f'<td style="padding:6px 10px;">{p:.3f}</td>'
    f'<td style="padding:6px 10px;">{html.escape(txt)}</td></tr>'
    for (i, path, p, txt) in index_rows
)
with open(index_html, "w", encoding="utf-8") as f:
    f.write(f"""<!doctype html>
<html><head><meta charset="utf-8"><title>SHAP Gallery — {safe_label}</title></head>
<body style="font:15px/1.6 system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial; padding:20px;">
<h2 style="margin-top:0;">SHAP Gallery — Label: {safe_label}</h2>
<div style="color:#666; margin-bottom:12px;">{html.escape(source_note)}</div>
<table style="border-collapse:collapse; border:1px solid #eee;">
<thead><tr style="background:#fafafa;">
<th style="padding:6px 10px; text-align:left;">#</th>
<th style="padding:6px 10px; text-align:left;">Explanation</th>
<th style="padding:6px 10px; text-align:left;">P({safe_label})</th>
<th style="padding:6px 10px; text-align:left;">Text (truncated)</th>
</tr></thead>
<tbody>
{html_rows}
</tbody></table>
</body></html>""")

print("✅ Wrote gallery to:", gallery_dir)
print("Open:", index_html)
# Use forward slashes in the link so the href does not contain the OS-specific
# separator (backslashes on Windows).
index_href = html.escape(index_html.replace(os.sep, "/"))
display(HTML(f'<a href="{index_href}" target="_blank">Open SHAP gallery</a>'))


PartitionExplainer explainer: 2it [00:13, 13.38s/it]               


‚úÖ Saved inline HTML to: ../reports\figs\shap_gallery_toxic\shap_toxic_01.html


                                       

‚úÖ Saved inline HTML to: ../reports\figs\shap_gallery_toxic\shap_toxic_02.html


‚úÖ Saved inline HTML to: ../reports\figs\shap_gallery_toxic\shap_toxic_03.html


                                       

‚úÖ Saved inline HTML to: ../reports\figs\shap_gallery_toxic\shap_toxic_04.html


‚úÖ Saved inline HTML to: ../reports\figs\shap_gallery_toxic\shap_toxic_05.html
‚úÖ Wrote gallery to: ../reports\figs\shap_gallery_toxic
Open: ../reports\figs\shap_gallery_toxic\index.html
