In [None]:
# Remove the pandas that is currently installed, then install the newest release from PyPI.
# NOTE(review): no version is pinned, so the installed version depends on when this cell runs.
!pip uninstall -y pandas
!pip install pandas --upgrade

Found existing installation: pandas 2.3.3
Uninstalling pandas-2.3.3:
  Successfully uninstalled pandas-2.3.3
Collecting pandas
  Using cached pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
Using cached pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.4 MB)
Installing collected packages: pandas
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.3 which is incompatible.
cudf-cu12 25.6.0 requires pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.3 which is incompatible.
dask-cudf-cu12 25.6.0 requires pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.3 which is incompatible.[0m[31m
[0mSuccessfully installed pandas-2.3.3


In [None]:
# Terminate the current kernel process by sending signal 9 to its own PID.
# NOTE(review): presumably a forced runtime restart so the reinstalled pandas is
# imported fresh afterwards — confirm. `sys` is imported but not used in this cell.
import os, sys
os.kill(os.getpid(), 9)


In [None]:
# ==========================================
# 🧩 STEP 0: Install dependencies (only needed on the first run)
# ==========================================
# NOTE(review): package versions are not pinned here.
!pip install shap transformers datasets torch pandas tqdm --quiet


In [None]:
# Import the libraries used by the analysis.
import pandas as pd
import shap
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Report which pandas version this kernel actually imported.
print("✅ pandas version:", pd.__version__)

✅ pandas version: 2.3.3


In [None]:
import torch
import shap
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import glob
import os

# ✅ 1️⃣ Load the model (binary toxic classifier)
# Tokenizer and sequence-classification model are both loaded from the same checkpoint name.
model_name = "SkolkovoInstitute/roberta_toxicity_classifier"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Put the model in evaluation mode; it is only used for inference below.
model.eval()

# ✅ 2️⃣ Define the prediction function
def predict_toxic(texts):
    """Score texts with the classifier and return the class-1 logit for each.

    Args:
        texts: A single string, or a list / tuple / ``np.ndarray`` / other
            iterable of items. Every item is converted with ``str()`` before
            tokenization.

    Returns:
        ``np.ndarray`` of shape ``(len(texts), 1)`` holding the raw logit at
        index 1 of the model output for each text (no softmax is applied).
        An empty input yields an empty ``(0, 1)`` array.

    Raises:
        ValueError: If ``texts`` is neither a string nor iterable.
    """
    if isinstance(texts, str):
        texts = [texts]
    elif not isinstance(texts, list):
        try:
            texts = list(texts)
        except TypeError as err:
            # Keep the original cause attached for easier debugging
            raise ValueError("Unsupported input type for predict_toxic") from err

    texts = [str(text) for text in texts]
    if not texts:
        # Avoid handing the tokenizer an empty batch; there is nothing to score
        return np.empty((0, 1), dtype=np.float32)

    enc = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=256)
    # Keep the inputs on the model's device so this also works if the model
    # has been moved off the CPU (the result is brought back with .cpu() below)
    enc = enc.to(model.device)
    with torch.no_grad():
        logits = model(**enc).logits
    # Return class-1 logit (more linear, larger-magnitude signal for SHAP)
    return logits[:, 1].unsqueeze(1).cpu().numpy()

# ✅ 3️⃣ Define the SHAP masker & explainer
# By default the HuggingFace tokenizer produces subword tokens (BPE). If you want
# to run SHAP at word-level (one word == one token, split by whitespace) we need
# to provide a small wrapper that mimics the minimal tokenizer interface SHAP
# expects (callable returning a dict with 'input_ids' and a convert_tokens_to_string method).

# Set to True to use word-level tokens (split on whitespace). False to use model tokenizer (subword).
USE_WORD_LEVEL = True
# Set to True to run a deterministic leave-one-out (L1O) per-token impact instead of SHAP.
# L1O guarantees every token occurrence is measured but is more expensive (one model call per token).
# When this is True the SHAP explainer is still constructed but is not called.
USE_LEAVE_ONE_OUT = True

class WordTokenizer:
    """Whitespace tokenizer exposing the small HF-style surface used with shap.maskers.Text.

    Each whitespace-delimited word is treated as exactly one token, so
    attributions are reported per word rather than per subword piece.
    """

    def __call__(self, text):
        """Split ``text`` (coerced with ``str()`` if needed) on whitespace.

        Returns a dict whose ``'input_ids'`` entry is the list of word strings.
        """
        as_string = text if isinstance(text, str) else str(text)
        # The words themselves stand in for ids. Per the original author's note,
        # SHAP looks at tokenizer("")['input_ids'] for prefix/suffix info, and
        # string tokens are sufficient for the Text masker.
        return {"input_ids": as_string.split()}

    def convert_tokens_to_string(self, tokens):
        """Join tokens back into one space-separated string."""
        return str.join(" ", tokens)

    def decode(self, ids):
        """Join ids (word strings here) back into one space-separated string."""
        return str.join(" ", ids)


# Word-level masking uses the whitespace WordTokenizer; otherwise the model's own
# (subword) tokenizer drives the Text masker.
masker = shap.maskers.Text(WordTokenizer() if USE_WORD_LEVEL else tokenizer)

# Explainer that evaluates masked texts by calling predict_toxic.
explainer = shap.Explainer(predict_toxic, masker)

# ✅ 4️⃣ Collect the paths of all subreddit CSV files
# Paths are relative to the notebook's working directory; add more entries to process more files.
csv_files = [
    "final_version.csv"
]

# ✅ 5️⃣ Process each file in turn
# Fixed column order of the output table. Passing it to the DataFrame constructor keeps
# the table well-formed (and sortable) even when no token rows were produced.
output_columns = [
    "Sentence_ID",
    "Sentence_Order",
    "Text",
    "Token",
    "Token_Position",
    "SHAP_Impact_on_Toxic",
    "Interpretation",
]


def _impact_row(sent_id, sentence_order, text, token, position, impact):
    """Build one output record for a token, clipping its impact to [-1, 1]."""
    # clip to [-1, 1] for consistency with earlier output
    clipped = float(np.clip(impact, -1.0, 1.0))
    return {
        "Sentence_ID": sent_id,
        "Sentence_Order": sentence_order,
        "Text": text,
        "Token": token,
        "Token_Position": position,
        "SHAP_Impact_on_Toxic": clipped,
        "Interpretation": "↑ increase toxic" if clipped > 0 else "↓ decrease toxic",
    }


for file_path in csv_files:
    subreddit_name = os.path.splitext(os.path.basename(file_path))[0]
    print(f"\n🚀 Processing {subreddit_name} ...")

    df = pd.read_csv(file_path)
    # Missing comments are NaN after read_csv; turn them into "" so they are skipped
    # below instead of being scored as the literal word "nan".
    df["body"] = df["body_nopunct_apostrophe"].fillna("")
    # prepare texts and original ids (use 'id' or 'Sentence_ID' if present, else index)
    max_rows = 300  #### adjust the number of sentences; target == 300
    # Use the NaN-filled column (the original read the unfilled one, leaving "body" unused)
    texts_to_explain = df["body"].astype(str).tolist()[:max_rows]
    if 'id' in df.columns:
        orig_ids = df['id'].tolist()[:max_rows]
    elif 'Sentence_ID' in df.columns:
        orig_ids = df['Sentence_ID'].tolist()[:max_rows]
    else:
        orig_ids = (df.index + 1).tolist()[:max_rows]

    print(f"🧩 Explaining {len(texts_to_explain)} comments from {subreddit_name}...")

    rows = []

    if USE_LEAVE_ONE_OUT:
        # Deterministic leave-one-out: for each sentence and each word occurrence, remove the word
        # and measure change in model logit (orig_logit - masked_logit)
        print("⚙️ Running leave-one-out per-token (this may be slow)...")
        for i, text in enumerate(texts_to_explain):
            # split on whitespace to get words (preserves apostrophes if present)
            tokens = text.split()
            if not tokens:
                continue
            sent_id = orig_ids[i]
            # Baseline uses the same single-space join as the masked variants, so the
            # difference reflects only the removed word and not whitespace normalization.
            # predict_toxic returns shape (1, 1); index [0, 0] to get a true scalar
            # (float() on a 1-element 1-D array is deprecated in recent NumPy).
            orig_logit = float(predict_toxic([" ".join(tokens)])[0, 0])
            for pos, token in enumerate(tokens):
                masked_text = " ".join(tokens[:pos] + tokens[pos + 1:])
                masked_logit = float(predict_toxic([masked_text])[0, 0])
                rows.append(
                    _impact_row(sent_id, i + 1, text, token, pos + 1, orig_logit - masked_logit)
                )
    else:
        # Blank comments are skipped here as well, mirroring the leave-one-out branch;
        # the original index is kept so Sentence_Order and the id lookup stay aligned.
        kept = [(i, text) for i, text in enumerate(texts_to_explain) if text.strip()]
        if kept:
            shap_values = explainer([text for _, text in kept])
            # For every sentence, record an entry for each token occurrence (with position)
            for k, (i, text) in enumerate(kept):
                toks = np.array(shap_values.data[k])
                vals = np.array(shap_values.values[k]).flatten()
                sent_id = orig_ids[i]
                for pos, (t, v) in enumerate(zip(toks, vals), start=1):
                    if isinstance(t, str) and t.strip() not in ["[CLS]", "[SEP]", "[PAD]", ""]:
                        rows.append(_impact_row(sent_id, i + 1, text, t, pos, v))

    # Explicit columns: an empty `rows` list would otherwise give a column-less frame
    # and make sort_values raise KeyError.
    df_out = pd.DataFrame(rows, columns=output_columns)
    df_out = df_out.sort_values(["Sentence_ID", "SHAP_Impact_on_Toxic"], ascending=[True, False])

    out_name = f"toxic_shap_table_{subreddit_name}.csv"
    df_out.to_csv(out_name, index=False, encoding="utf-8-sig")
    print(f"💾 Saved as {out_name}")

print("\n✅ All subreddit files processed successfully!")

Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  orig_logit = float(predict_toxic([text])[0])
  masked_logit = float(predict_toxic([masked_text])[0])



🚀 Processing final_version ...
🧩 Explaining 300 comments from final_version...
⚙️ Running leave-one-out per-token (this may be slow)...
💾 Saved as toxic_shap_table_final_version.csv

✅ All subreddit files processed successfully!
