Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions pyrit/datasets/seed_datasets/remote/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@
from pyrit.datasets.seed_datasets.remote.darkbench_dataset import (
_DarkBenchDataset,
) # noqa: F401
from pyrit.datasets.seed_datasets.remote.decoding_trust_toxicity_dataset import (
_DecodingTrustToxicityDataset,
) # noqa: F401
from pyrit.datasets.seed_datasets.remote.equitymedqa_dataset import (
_EquityMedQADataset,
) # noqa: F401
Expand Down Expand Up @@ -161,6 +164,7 @@
"ComicJailbreakTemplateConfig",
"_DangerousQADataset",
"_DarkBenchDataset",
"_DecodingTrustToxicityDataset",
"_EquityMedQADataset",
"_ForbiddenQuestionsDataset",
"_HarmBenchDataset",
Expand Down
268 changes: 268 additions & 0 deletions pyrit/datasets/seed_datasets/remote/decoding_trust_toxicity_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import logging
from typing import Any, Literal, cast

from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import (
_RemoteDatasetLoader,
)
from pyrit.models import SeedDataset, SeedPrompt

logger = logging.getLogger(__name__)


# Pinned commit SHA of AI-secure/DecodingTrust `main` (2024-09-16).
# Pinning prevents silent upstream changes from altering the prompt set.
_DECODING_TRUST_COMMIT = "161ae8321ced62f45fcd9ceb412e05b47c603cd4"
# Raw-content URL of the toxicity user-prompt directory at the pinned commit.
# It ends with "/" so that a file name can be appended directly.
_BASE_URL = (
f"https://raw.githubusercontent.com/AI-secure/DecodingTrust/{_DECODING_TRUST_COMMIT}/data/toxicity/user_prompts/"
)
# One JSONL file per subset; the loader fetches one or both depending on `subset`.
_TOXIC_URL = f"{_BASE_URL}toxic.jsonl"
_NONTOXIC_URL = f"{_BASE_URL}nontoxic.jsonl"

# Perspective API score keys (as published in the DecodingTrust JSONL).
# This is a plain tuple of key names, not a mapping: each key is looked up in a
# record's `prompt` dict and, when its score reaches the threshold, the key
# itself is used verbatim as the harm_categories string on the SeedPrompt.
_PERSPECTIVE_SCORE_KEYS: tuple[str, ...] = (
"toxicity",
"severe_toxicity",
"profanity",
"sexually_explicit",
"identity_attack",
"threat",
"insult",
"flirtation",
)


class _DecodingTrustToxicityDataset(_RemoteDatasetLoader):
"""
Loader for the Toxicity perspective of the DecodingTrust benchmark.

DecodingTrust [1] evaluates LLM trustworthiness across eight perspectives;
the Toxicity perspective publishes ~2.4k prompts sampled from
RealToxicityPrompts [2] and re-scored with the Perspective API. Each record
contains a plain-text prompt fragment, eight Perspective-API scores
(toxicity, severe_toxicity, profanity, sexually_explicit, identity_attack,
threat, insult, flirtation) and a ``challenging`` flag for the
high-difficulty subset.

The published data lives in two files at
``AI-secure/DecodingTrust/data/toxicity/user_prompts/`` — ``toxic.jsonl``
(1196 records, high toxicity) and ``nontoxic.jsonl`` (1200 records, low
toxicity). The loader fetches the source files at runtime from
``raw.githubusercontent.com`` (no redistribution) at a pinned commit SHA.

Per-prompt ``harm_categories`` are derived from the Perspective scores at
construction time using ``toxicity_threshold``: any score
``>= toxicity_threshold`` adds the corresponding category. This avoids
guessing where the source provides no label.

References:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Two doc gaps from datasets.instructions.md:

Docstring cite-key. @wang2023decodingtrust already exists in doc/references.bib and doc/bibliography.md, so this block should use the project form Reference: [@wang2023decodingtrust] instead of raw arxiv URLs.

1_loading_datasets. New loaders need to be added to the prose paragraph at the top of doc/code/datasets/1_loading_datasets.py (alphabetically, between DarkBench and Do Anything Now) and mirrored in 1_loading_datasets.ipynb. Inline edits to both files are fine for this trivial change.

- Wang et al., 2023. *DecodingTrust: A Comprehensive Assessment of
Trustworthiness in GPT Models.* https://arxiv.org/abs/2306.11698
- Gehman et al., 2020. *RealToxicityPrompts: Evaluating Neural Toxic
Degeneration in Language Models.* https://arxiv.org/abs/2009.11462
- https://github.com/AI-secure/DecodingTrust

License:
DecodingTrust is distributed under CC BY-SA 4.0. PyRIT fetches the
prompts at runtime and does not redistribute them. Full attribution to
the DecodingTrust and RealToxicityPrompts authors is recorded on every
``SeedPrompt`` produced.

Warning:
The toxic subset contains profanity, sexual content, identity attacks
and other content that may be offensive. Use only for authorised
red-teaming and safety research.
"""

# Class-level metadata picked up by _RemoteDatasetLoader._parse_metadata.
# See pyrit/datasets/seed_datasets/seed_metadata.py for the schema.
modalities: list[str] = ["text"]
size: str = "large" # 1196 (default toxic) – 2396 (subset="all")
tags: set[str] = {"default", "safety", "toxicity"}

# Attribution list copied (as a fresh list) into the `authors` field of every
# SeedPrompt this loader builds. Covers both papers because the DecodingTrust
# prompts are a subsample of RealToxicityPrompts.
_AUTHORS: tuple[str, ...] = (
# DecodingTrust authors
"Boxin Wang",
"Weixin Chen",
"Hengzhi Pei",
"Chulin Xie",
"Mintong Kang",
"Chenhui Zhang",
"Chejian Xu",
"Zidi Xiong",
"Ritik Dutta",
"Rylan Schaeffer",
"Sang T. Truong",
"Simran Arora",
"Mantas Mazeika",
"Dan Hendrycks",
"Zinan Lin",
"Yu Cheng",
"Sanmi Koyejo",
"Dawn Song",
"Bo Li",
# RealToxicityPrompts authors (DecodingTrust subsamples from RTP)
"Samuel Gehman",
"Suchin Gururangan",
"Maarten Sap",
"Yejin Choi",
"Noah A. Smith",
)

# Organisation names copied into the `groups` field of every SeedPrompt.
# NOTE(review): presumably the affiliations of the authors listed above —
# confirm against the two papers.
_GROUPS: tuple[str, ...] = (
"University of Illinois Urbana-Champaign",
"Stanford University",
"University of California, Berkeley",
"Center for AI Safety",
"Microsoft Research",
"Allen Institute for Artificial Intelligence",
"University of Washington",
"Paul G. Allen School of Computer Science & Engineering",
)

# Shared human-readable description attached to every SeedPrompt
# (the same text for all records, regardless of subset).
_DESCRIPTION = (
"Toxicity perspective of the DecodingTrust benchmark (Wang et al., 2023). "
"Prompts are sampled from RealToxicityPrompts (Gehman et al., 2020) and "
"re-scored with the Perspective API. Each record carries eight Perspective "
"scores (toxicity, severe_toxicity, profanity, sexually_explicit, "
"identity_attack, threat, insult, flirtation) and a `challenging` flag. "
"Per-prompt harm_categories are derived from these scores at load time."
)

def __init__(
self,
*,
subset: Literal["toxic", "nontoxic", "all"] = "toxic",
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

datasets.instructions.md requires filter axes to be module-level Enum rather than Literal[...]. See VLGuardSubset or PromptIntelSeverity for the pattern. Could you define a DecodingTrustToxicitySubset(Enum), accept it here, validate it with self._validate_enum(...), and re-export from remote/__init__.py?

challenging_only: bool = False,
toxicity_threshold: float = 0.5,
) -> None:
"""
Initialize the DecodingTrust Toxicity dataset loader.

Args:
subset: Which subset of the toxicity prompts to load. ``"toxic"``
(default) loads the ~1.2k high-toxicity prompts, ``"nontoxic"``
loads the ~1.2k low-toxicity prompts, ``"all"`` loads both.
The default matches the red-teaming use case
(cf. microsoft/PyRIT#1798).
challenging_only: If True, retain only records flagged
``challenging: true`` in the source — the adversarial subset
emphasised by the DecodingTrust authors.
toxicity_threshold: Perspective score at or above which a category
is added to a SeedPrompt's ``harm_categories``. Applied
independently to each of the eight Perspective scores.
"""
self.subset: Literal["toxic", "nontoxic", "all"] = subset
self.challenging_only = challenging_only
self.toxicity_threshold = toxicity_threshold

@property
def dataset_name(self) -> str:
    """
    Identify this dataset.

    Returns:
        str: The fixed identifier ``"decoding_trust_toxicity"``; the loader
        stamps it on the SeedDataset and on each SeedPrompt it creates.
    """
    return "decoding_trust_toxicity"

async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
    """
    Fetch the DecodingTrust Toxicity prompts and return them as a SeedDataset.

    The JSONL file(s) selected by ``self.subset`` are retrieved through
    ``self._fetch_from_url``, converted to ``SeedPrompt`` objects by
    ``self._records_to_seed_prompts`` and wrapped in a ``SeedDataset``.

    Args:
        cache: Whether to cache the fetched JSONL files locally. Defaults to True.

    Returns:
        SeedDataset: A SeedDataset whose seeds are the selected toxicity prompts.

    Raises:
        ValueError: If the source JSONL contains items that are not dicts, or
            if no prompt is left once the ``subset`` / ``challenging_only``
            filters have been applied.
    """
    logger.info(f"Loading DecodingTrust Toxicity subset={self.subset!r} from {_BASE_URL}")

    urls: list[str] = []
    if self.subset in ("toxic", "all"):
        urls.append(_TOXIC_URL)
    if self.subset in ("nontoxic", "all"):
        urls.append(_NONTOXIC_URL)

    # Keep the originating URL next to each record so every SeedPrompt can
    # report the exact file it was read from.
    records: list[tuple[str, dict[str, Any]]] = []
    for url in urls:
        raw = self._fetch_from_url(source=url, source_type="public_url", cache=cache)
        records.extend((url, item) for item in cast("list[Any]", raw))

    seed_prompts = self._records_to_seed_prompts(records=records)

    # Fail loudly, with an actionable message, when the filters removed every
    # record (e.g. challenging_only=True on a subset without flagged prompts,
    # or an unrecognised subset value that selected no file at all).
    if not seed_prompts:
        raise ValueError("SeedDataset cannot be empty. Check your filter criteria.")

    logger.info(f"Loaded {len(seed_prompts)} prompts from DecodingTrust Toxicity")
    return SeedDataset(seeds=seed_prompts, dataset_name=self.dataset_name)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

datasets.instructions.md requires loaders to raise when filters leave zero seeds, with the standard message ValueError("SeedDataset cannot be empty. Check your filter criteria."). Today challenging_only=True against a subset that has no challenging records returns an empty SeedDataset silently, which is hard to debug downstream. Could you add the check after _records_to_seed_prompts and a paired test covering the case?


def _records_to_seed_prompts(self, *, records: list[tuple[str, dict[str, Any]]]) -> list[SeedPrompt]:
"""
Convert raw JSONL records into SeedPrompt instances.

Args:
records: List of ``(source_url, record_dict)`` pairs straight from the
JSONL files. The ``source_url`` becomes the per-prompt ``source``.

Returns:
List of SeedPrompt objects, one per record that passes filters and
yields a non-empty ``prompt.text``.

Raises:
ValueError: If any record is not a dict (i.e. the source is malformed).
"""
seed_prompts: list[SeedPrompt] = []
for source_url, item in records:
if not isinstance(item, dict):
raise ValueError(
f"Expected DecodingTrust toxicity records to be JSON objects, got {type(item).__name__}: {item!r}"
)

if self.challenging_only and not item.get("challenging"):
continue

prompt_obj = item.get("prompt") or {}
if not isinstance(prompt_obj, dict):
logger.warning(f"Skipping record with non-dict 'prompt' field (type={type(prompt_obj).__name__})")
continue

text = prompt_obj.get("text")
if not isinstance(text, str) or not text:
logger.warning("Skipping record with missing or empty 'prompt.text'")
continue

seed_prompts.append(
SeedPrompt(
value=text,
data_type="text",
dataset_name=self.dataset_name,
harm_categories=self._derive_harm_categories(prompt_scores=prompt_obj),
description=self._DESCRIPTION,
source=source_url,
authors=list(self._AUTHORS),
groups=list(self._GROUPS),
)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The challenging flag and the 8 Perspective scores get read here but not stored on the SeedPrompt. datasets.instructions.md asks for per-row source fields to land in metadata={...} so they're persisted to memory, queryable via _get_seed_metadata_conditions, and flow into MessagePiece.prompt_metadata when the seed reaches a target. For DT specifically those annotations are what distinguishes this dataset from raw RealToxicityPrompts, so it's worth carrying them through.

The schema is dict[str, Union[str, int]], so floats need stringifying (see _ToxicChatDataset for the precedent). One shape that fits:

metadata={
    "challenging": bool(item.get("challenging", False)),
    **{
        key: str(prompt_obj[key])
        for key in _PERSPECTIVE_SCORE_KEYS
        if isinstance(prompt_obj.get(key), (int, float))
    },
}

A paired test asserting one score and the flag round-trip would round it out.

)
return seed_prompts

def _derive_harm_categories(self, *, prompt_scores: dict[str, Any]) -> list[str]:
    """
    Turn the Perspective API scores of one prompt into harm categories.

    Every key in ``_PERSPECTIVE_SCORE_KEYS`` is looked up in ``prompt_scores``;
    the key becomes a harm category when its value is numeric and
    ``>= self.toxicity_threshold``. Keys that are absent or hold a
    non-numeric value contribute nothing and raise no error.

    Args:
        prompt_scores: The ``prompt`` sub-dict from a DecodingTrust toxicity
            record, containing the eight Perspective scores plus ``text``.

    Returns:
        The qualifying category names in ascending alphabetical order, so the
        output is deterministic (tests, snapshots).
    """

    def _reaches_threshold(value: Any) -> bool:
        # Numeric check first: short-circuits before comparing None/str values.
        return isinstance(value, (int, float)) and value >= self.toxicity_threshold

    return sorted(
        name for name in _PERSPECTIVE_SCORE_KEYS if _reaches_threshold(prompt_scores.get(name))
    )
Loading