-
Notifications
You must be signed in to change notification settings - Fork 771
FEAT: Add DecodingTrust Toxicity dataset loader #1821
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,268 @@ | ||
| # Copyright (c) Microsoft Corporation. | ||
| # Licensed under the MIT license. | ||
|
|
||
| import logging | ||
| from typing import Any, Literal, cast | ||
|
|
||
| from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import ( | ||
| _RemoteDatasetLoader, | ||
| ) | ||
| from pyrit.models import SeedDataset, SeedPrompt | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
| # Pinned commit SHA of AI-secure/DecodingTrust `main` (2024-09-16). | ||
| # Pinning prevents silent upstream changes from altering the prompt set. | ||
| _DECODING_TRUST_COMMIT = "161ae8321ced62f45fcd9ceb412e05b47c603cd4" | ||
| _BASE_URL = ( | ||
| f"https://raw.githubusercontent.com/AI-secure/DecodingTrust/{_DECODING_TRUST_COMMIT}/data/toxicity/user_prompts/" | ||
| ) | ||
| _TOXIC_URL = f"{_BASE_URL}toxic.jsonl" | ||
| _NONTOXIC_URL = f"{_BASE_URL}nontoxic.jsonl" | ||
|
|
||
| # Perspective API score keys (as published in the DecodingTrust JSONL). Each key | ||
| # is reused verbatim as the harm_categories string exposed on a SeedPrompt. | ||
| _PERSPECTIVE_SCORE_KEYS: tuple[str, ...] = ( | ||
| "toxicity", | ||
| "severe_toxicity", | ||
| "profanity", | ||
| "sexually_explicit", | ||
| "identity_attack", | ||
| "threat", | ||
| "insult", | ||
| "flirtation", | ||
| ) | ||
|
|
||
|
|
||
| class _DecodingTrustToxicityDataset(_RemoteDatasetLoader): | ||
| """ | ||
| Loader for the Toxicity perspective of the DecodingTrust benchmark. | ||
|
|
||
| DecodingTrust [1] evaluates LLM trustworthiness across eight perspectives; | ||
| the Toxicity perspective publishes ~2.4k prompts sampled from | ||
| RealToxicityPrompts [2] and re-scored with the Perspective API. Each record | ||
| contains a plain-text prompt fragment, eight Perspective-API scores | ||
| (toxicity, severe_toxicity, profanity, sexually_explicit, identity_attack, | ||
| threat, insult, flirtation) and a ``challenging`` flag for the | ||
| high-difficulty subset. | ||
|
|
||
| The published data lives in two files at | ||
| ``AI-secure/DecodingTrust/data/toxicity/user_prompts/`` — ``toxic.jsonl`` | ||
| (1196 records, high toxicity) and ``nontoxic.jsonl`` (1200 records, low | ||
| toxicity). The loader fetches the source files at runtime from | ||
| ``raw.githubusercontent.com`` (no redistribution) at a pinned commit SHA. | ||
|
|
||
| Per-prompt ``harm_categories`` are derived from the Perspective scores at | ||
| construction time using ``toxicity_threshold``: any score | ||
| ``>= toxicity_threshold`` adds the corresponding category. This avoids | ||
| guessing where the source provides no label. | ||
|
|
||
| References: | ||
| - Wang et al., 2023. *DecodingTrust: A Comprehensive Assessment of | ||
| Trustworthiness in GPT Models.* https://arxiv.org/abs/2306.11698 | ||
| - Gehman et al., 2020. *RealToxicityPrompts: Evaluating Neural Toxic | ||
| Degeneration in Language Models.* https://arxiv.org/abs/2009.11462 | ||
| - https://github.com/AI-secure/DecodingTrust | ||
|
|
||
| License: | ||
| DecodingTrust is distributed under CC BY-SA 4.0. PyRIT fetches the | ||
| prompts at runtime and does not redistribute them. Full attribution to | ||
| the DecodingTrust and RealToxicityPrompts authors is recorded on every | ||
| ``SeedPrompt`` produced. | ||
|
|
||
| Warning: | ||
| The toxic subset contains profanity, sexual content, identity attacks | ||
| and other content that may be offensive. Use only for authorised | ||
| red-teaming and safety research. | ||
| """ | ||
|
|
||
| # Class-level metadata picked up by _RemoteDatasetLoader._parse_metadata. | ||
| # See pyrit/datasets/seed_datasets/seed_metadata.py for the schema. | ||
| modalities: list[str] = ["text"] | ||
| size: str = "large" # 1196 (default toxic) – 2396 (subset="all") | ||
| tags: set[str] = {"default", "safety", "toxicity"} | ||
|
|
||
| _AUTHORS: tuple[str, ...] = ( | ||
| # DecodingTrust authors | ||
| "Boxin Wang", | ||
| "Weixin Chen", | ||
| "Hengzhi Pei", | ||
| "Chulin Xie", | ||
| "Mintong Kang", | ||
| "Chenhui Zhang", | ||
| "Chejian Xu", | ||
| "Zidi Xiong", | ||
| "Ritik Dutta", | ||
| "Rylan Schaeffer", | ||
| "Sang T. Truong", | ||
| "Simran Arora", | ||
| "Mantas Mazeika", | ||
| "Dan Hendrycks", | ||
| "Zinan Lin", | ||
| "Yu Cheng", | ||
| "Sanmi Koyejo", | ||
| "Dawn Song", | ||
| "Bo Li", | ||
| # RealToxicityPrompts authors (DecodingTrust subsamples from RTP) | ||
| "Samuel Gehman", | ||
| "Suchin Gururangan", | ||
| "Maarten Sap", | ||
| "Yejin Choi", | ||
| "Noah A. Smith", | ||
| ) | ||
|
|
||
| _GROUPS: tuple[str, ...] = ( | ||
| "University of Illinois Urbana-Champaign", | ||
| "Stanford University", | ||
| "University of California, Berkeley", | ||
| "Center for AI Safety", | ||
| "Microsoft Research", | ||
| "Allen Institute for Artificial Intelligence", | ||
| "University of Washington", | ||
| "Paul G. Allen School of Computer Science & Engineering", | ||
| ) | ||
|
|
||
| _DESCRIPTION = ( | ||
| "Toxicity perspective of the DecodingTrust benchmark (Wang et al., 2023). " | ||
| "Prompts are sampled from RealToxicityPrompts (Gehman et al., 2020) and " | ||
| "re-scored with the Perspective API. Each record carries eight Perspective " | ||
| "scores (toxicity, severe_toxicity, profanity, sexually_explicit, " | ||
| "identity_attack, threat, insult, flirtation) and a `challenging` flag. " | ||
| "Per-prompt harm_categories are derived from these scores at load time." | ||
| ) | ||
|
|
||
| def __init__( | ||
| self, | ||
| *, | ||
| subset: Literal["toxic", "nontoxic", "all"] = "toxic", | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
| challenging_only: bool = False, | ||
| toxicity_threshold: float = 0.5, | ||
| ) -> None: | ||
| """ | ||
| Initialize the DecodingTrust Toxicity dataset loader. | ||
|
|
||
| Args: | ||
| subset: Which subset of the toxicity prompts to load. ``"toxic"`` | ||
| (default) loads the ~1.2k high-toxicity prompts, ``"nontoxic"`` | ||
| loads the ~1.2k low-toxicity prompts, ``"all"`` loads both. | ||
| The default matches the red-teaming use case | ||
| (cf. microsoft/PyRIT#1798). | ||
| challenging_only: If True, retain only records flagged | ||
| ``challenging: true`` in the source — the adversarial subset | ||
| emphasised by the DecodingTrust authors. | ||
| toxicity_threshold: Perspective score at or above which a category | ||
| is added to a SeedPrompt's ``harm_categories``. Applied | ||
| independently to each of the eight Perspective scores. | ||
| """ | ||
| self.subset: Literal["toxic", "nontoxic", "all"] = subset | ||
| self.challenging_only = challenging_only | ||
| self.toxicity_threshold = toxicity_threshold | ||
|
|
||
@property
def dataset_name(self) -> str:
    """Identifier used as ``dataset_name`` on the SeedDataset and SeedPrompts this loader builds."""
    name = "decoding_trust_toxicity"
    return name
|
|
||
async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
    """
    Download the selected DecodingTrust Toxicity file(s) and wrap them in a SeedDataset.

    ``self.subset`` decides which JSONL files are requested: ``"toxic"`` and
    ``"nontoxic"`` each select one file, ``"all"`` selects both (toxic first).
    Every fetched item is paired with the URL it came from before being handed
    to ``_records_to_seed_prompts``.

    Args:
        cache: Forwarded to ``_fetch_from_url`` as its ``cache`` argument. Defaults to True.

    Returns:
        SeedDataset: A SeedDataset whose seeds are the selected toxicity prompts.

    Raises:
        ValueError: If the source JSONL contains items that are not dicts
            (raised by ``_records_to_seed_prompts``).
    """
    logger.info(f"Loading DecodingTrust Toxicity subset={self.subset!r} from {_BASE_URL}")

    # Table of (url, subsets that include it); order fixes toxic-before-nontoxic.
    url_table = (
        (_TOXIC_URL, ("toxic", "all")),
        (_NONTOXIC_URL, ("nontoxic", "all")),
    )
    selected_urls: list[str] = [url for url, subsets in url_table if self.subset in subsets]

    tagged_records: list[tuple[str, dict[str, Any]]] = []
    for source_url in selected_urls:
        payload = self._fetch_from_url(source=source_url, source_type="public_url", cache=cache)
        # Items are not validated here; the converter rejects non-dict entries.
        for entry in cast("list[Any]", payload):
            tagged_records.append((source_url, entry))

    prompts = self._records_to_seed_prompts(records=tagged_records)
    logger.info(f"Loaded {len(prompts)} prompts from DecodingTrust Toxicity")
    return SeedDataset(seeds=prompts, dataset_name=self.dataset_name)
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
|
|
||
| def _records_to_seed_prompts(self, *, records: list[tuple[str, dict[str, Any]]]) -> list[SeedPrompt]: | ||
| """ | ||
| Convert raw JSONL records into SeedPrompt instances. | ||
|
|
||
| Args: | ||
| records: List of ``(source_url, record_dict)`` pairs straight from the | ||
| JSONL files. The ``source_url`` becomes the per-prompt ``source``. | ||
|
|
||
| Returns: | ||
| List of SeedPrompt objects, one per record that passes filters and | ||
| yields a non-empty ``prompt.text``. | ||
|
|
||
| Raises: | ||
| ValueError: If any record is not a dict (i.e. the source is malformed). | ||
| """ | ||
| seed_prompts: list[SeedPrompt] = [] | ||
| for source_url, item in records: | ||
| if not isinstance(item, dict): | ||
| raise ValueError( | ||
| f"Expected DecodingTrust toxicity records to be JSON objects, got {type(item).__name__}: {item!r}" | ||
| ) | ||
|
|
||
| if self.challenging_only and not item.get("challenging"): | ||
| continue | ||
|
|
||
| prompt_obj = item.get("prompt") or {} | ||
| if not isinstance(prompt_obj, dict): | ||
| logger.warning(f"Skipping record with non-dict 'prompt' field (type={type(prompt_obj).__name__})") | ||
| continue | ||
|
|
||
| text = prompt_obj.get("text") | ||
| if not isinstance(text, str) or not text: | ||
| logger.warning("Skipping record with missing or empty 'prompt.text'") | ||
| continue | ||
|
|
||
| seed_prompts.append( | ||
| SeedPrompt( | ||
| value=text, | ||
| data_type="text", | ||
| dataset_name=self.dataset_name, | ||
| harm_categories=self._derive_harm_categories(prompt_scores=prompt_obj), | ||
| description=self._DESCRIPTION, | ||
| source=source_url, | ||
| authors=list(self._AUTHORS), | ||
| groups=list(self._GROUPS), | ||
| ) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

The [inline code lost in extraction] … The schema is [inline code lost in extraction]. Suggested `metadata` argument for the `SeedPrompt(...)` call above:

```python
metadata={
    "challenging": bool(item.get("challenging", False)),
    **{
        key: str(prompt_obj[key])
        for key in _PERSPECTIVE_SCORE_KEYS
        if isinstance(prompt_obj.get(key), (int, float))
    },
}
```

A paired test asserting that one score and the flag round-trip would round it out. |
||
| ) | ||
| return seed_prompts | ||
|
|
||
def _derive_harm_categories(self, *, prompt_scores: dict[str, Any]) -> list[str]:
    """
    Map Perspective API scores on a single prompt to a list of harm categories.

    Each key in ``_PERSPECTIVE_SCORE_KEYS`` whose value is a real number
    ``>= self.toxicity_threshold`` contributes its key as a harm category.
    Missing or non-numeric scores are silently skipped.

    Booleans are treated as non-numeric: ``bool`` is a subclass of ``int``,
    so a stray JSON ``true`` would otherwise compare as ``1`` and be tagged
    as a harm category for any threshold ``<= 1``.

    Args:
        prompt_scores: The ``prompt`` sub-dict from a DecodingTrust toxicity
            record, containing the Perspective scores plus ``text``.

    Returns:
        Sorted list of harm category strings. Sorted for deterministic
        output (tests, snapshots).
    """
    threshold = self.toxicity_threshold
    categories: list[str] = []
    for key in _PERSPECTIVE_SCORE_KEYS:
        score = prompt_scores.get(key)
        # Reject bool explicitly before the numeric check (bool is an int subclass).
        if isinstance(score, bool) or not isinstance(score, (int, float)):
            continue
        if score >= threshold:
            categories.append(key)
    return sorted(categories)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Two doc gaps from `datasets.instructions.md`:

- **Docstring cite-key.** `@wang2023decodingtrust` already exists in `doc/references.bib` and `doc/bibliography.md`, so this block should use the project form `Reference: [@wang2023decodingtrust]` instead of raw arxiv URLs.
- **`1_loading_datasets`.** New loaders need to be added to the prose paragraph at the top of `doc/code/datasets/1_loading_datasets.py` (alphabetically, between `DarkBench` and `Do Anything Now`) and mirrored in `1_loading_datasets.ipynb`. Inline edits to both files are fine for this trivial change.