Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/bibliography.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@ All academic papers, research blogs, and technical reports referenced throughout
:::{dropdown} Citation Keys
:class: hidden-citations

[@aakanksha2024multilingual; @adversaai2023universal; @andriushchenko2024tense; @anthropic2024manyshot; @aqrawi2024singleturncrescendo; @bethany2024mathprompt; @bhardwaj2023harmfulqa; @bhardwaj2024homer; @bryan2025agentictaxonomy; @bullwinkel2025airtlessons; @bullwinkel2025repeng; @bullwinkel2026trigger; @chao2023pair; @chao2024jailbreakbench; @cui2024orbench; @darkbench2025; @derczynski2024garak; @ding2023wolf; @embracethered2024unicode; @embracethered2025sneakybits; @ghosh2025aegis; @gupta2024walledeval; @haider2024phi3safety; @han2024medsafetybench; @hines2024spotlighting; @ji2023beavertails; @ji2024pkusaferlhf; @jiang2025sosbench; @jones2025computeruse; @kingma2014adam; @li2024saladbench; @li2024wmdp; @lin2023toxicchat; @liu2024flipattack; @lopez2024pyrit; @lv2024codechameleon; @mazeika2023tdc; @mazeika2024harmbench; @mckee2024transparency; @mehrotra2023tap; @microsoft2024skeletonkey; @palaskar2025vlsu; @pfohl2024equitymedqa; @promptfoo2025ccp; @robustintelligence2024bypass; @roccia2024promptintel; @rottger2023xstest; @rottger2025msts; @russinovich2024crescendo; @russinovich2025price; @scheuerman2025transphobia; @shaikh2022second; @shayegani2025computeruse; @shen2023donotanything; @sheshadri2024lat; @stok2023ansi; @tan2026comicjailbreak; @tang2025multilingual; @tedeschi2024alert; @vantaylor2024socialbias; @vidgen2023simplesafetytests; @vidgen2024ailuminate; @wang2023decodingtrust; @wang2023donotanswer; @wei2023jailbroken; @xie2024sorrybench; @yu2023gptfuzzer; @yuan2023cipherchat; @zeng2024persuasion; @zhang2024cbtbench; @zou2023gcg]
[@aakanksha2024multilingual; @adversaai2023universal; @andriushchenko2024tense; @anthropic2024manyshot; @aqrawi2024singleturncrescendo; @bethany2024mathprompt; @bhardwaj2023harmfulqa; @bhardwaj2024homer; @brahman2024coconot; @bryan2025agentictaxonomy; @bullwinkel2025airtlessons; @bullwinkel2025repeng; @bullwinkel2026trigger; @chao2023pair; @chao2024jailbreakbench; @cui2024orbench; @darkbench2025; @derczynski2024garak; @ding2023wolf; @embracethered2024unicode; @embracethered2025sneakybits; @ghosh2025aegis; @gupta2024walledeval; @haider2024phi3safety; @han2024medsafetybench; @hines2024spotlighting; @ji2023beavertails; @ji2024pkusaferlhf; @jiang2025sosbench; @jones2025computeruse; @kingma2014adam; @li2024saladbench; @li2024wmdp; @lin2023toxicchat; @liu2024flipattack; @lopez2024pyrit; @lv2024codechameleon; @mazeika2023tdc; @mazeika2024harmbench; @mckee2024transparency; @mehrotra2023tap; @microsoft2024skeletonkey; @palaskar2025vlsu; @pfohl2024equitymedqa; @promptfoo2025ccp; @robustintelligence2024bypass; @roccia2024promptintel; @rottger2023xstest; @rottger2025msts; @russinovich2024crescendo; @russinovich2025price; @scheuerman2025transphobia; @shaikh2022second; @shayegani2025computeruse; @shen2023donotanything; @sheshadri2024lat; @stok2023ansi; @tan2026comicjailbreak; @tang2025multilingual; @tedeschi2024alert; @vantaylor2024socialbias; @vidgen2023simplesafetytests; @vidgen2024ailuminate; @wang2023decodingtrust; @wang2023donotanswer; @wei2023jailbroken; @xie2024sorrybench; @yu2023gptfuzzer; @yuan2023cipherchat; @zeng2024persuasion; @zhang2024cbtbench; @zou2023gcg]

:::
3 changes: 3 additions & 0 deletions doc/code/datasets/1_loading_datasets.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
"BeaverTails [@ji2023beavertails],\n",
"CBT-Bench [@zhang2024cbtbench],\n",
"CategoricalHarmfulQA (CatQA) [@bhardwaj2024homer],\n",
"CoCoNot [@brahman2024coconot],\n",
"DarkBench [@darkbench2025],\n",
"Do Anything Now [@shen2023donotanything],\n",
"Do-Not-Answer [@wang2023donotanswer],\n",
Expand Down Expand Up @@ -79,6 +80,8 @@
" 'categorical_harmful_qa',\n",
" 'cbt_bench',\n",
" 'ccp_sensitive_prompts',\n",
" 'coconot_contrast',\n",
" 'coconot_refusal',\n",
" 'comic_jailbreak',\n",
" 'dangerous_qa',\n",
" 'dark_bench',\n",
Expand Down
1 change: 1 addition & 0 deletions doc/code/datasets/1_loading_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
# BeaverTails [@ji2023beavertails],
# CBT-Bench [@zhang2024cbtbench],
# CategoricalHarmfulQA (CatQA) [@bhardwaj2024homer],
# CoCoNot [@brahman2024coconot],
# DarkBench [@darkbench2025],
# Do Anything Now [@shen2023donotanything],
# Do-Not-Answer [@wang2023donotanswer],
Expand Down
8 changes: 8 additions & 0 deletions doc/references.bib
Original file line number Diff line number Diff line change
Expand Up @@ -610,3 +610,11 @@ @misc{embracethered2025sneakybits
url = {https://embracethered.com/blog/posts/2025/sneaky-bits-and-ascii-smuggler/},
note = {Embrace The Red Blog},
}

@article{brahman2024coconot,
title = {The Art of Saying No: Contextual Noncompliance in Language Models},
author = {Faeze Brahman and Sachin Kumar and Vidhisha Balachandran and Pradeep Dasigi and Valentina Pyatkin and Abhilasha Ravichander and Sarah Wiegreffe and Nouha Dziri and Khyathi Chandu and Jack Hessel and Yulia Tsvetkov and Noah A. Smith and Yejin Choi and Hannaneh Hajishirzi},
journal = {arXiv preprint arXiv:2407.12043},
year = {2024},
url = {https://arxiv.org/abs/2407.12043},
}
10 changes: 10 additions & 0 deletions pyrit/datasets/seed_datasets/remote/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@
from pyrit.datasets.seed_datasets.remote.ccp_sensitive_prompts_dataset import (
_CCPSensitivePromptsDataset,
) # noqa: F401
from pyrit.datasets.seed_datasets.remote.coconot_dataset import (
CoCoNotCategory,
CoCoNotSplit,
_CoCoNotContrastDataset,
_CoCoNotRefusalDataset,
) # noqa: F401
from pyrit.datasets.seed_datasets.remote.comic_jailbreak_dataset import (
COMIC_JAILBREAK_TEMPLATES,
ComicJailbreakTemplateConfig,
Expand Down Expand Up @@ -142,6 +148,8 @@
) # noqa: F401

__all__ = [
"CoCoNotCategory",
"CoCoNotSplit",
"HiXSTestLanguage",
"PromptIntelCategory",
"PromptIntelSeverity",
Expand All @@ -156,6 +164,8 @@
"_CBTBenchDataset",
"_CCPSensitivePromptsDataset",
"_CategoricalHarmfulQADataset",
"_CoCoNotContrastDataset",
"_CoCoNotRefusalDataset",
"_ComicJailbreakDataset",
"COMIC_JAILBREAK_TEMPLATES",
"ComicJailbreakTemplateConfig",
Expand Down
289 changes: 289 additions & 0 deletions pyrit/datasets/seed_datasets/remote/coconot_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,289 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import logging
from enum import Enum

from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import (
_RemoteDatasetLoader,
)
from pyrit.models import SeedDataset, SeedObjective

logger = logging.getLogger(__name__)


# Author list of the CoCoNot paper ("The Art of Saying No: Contextual Noncompliance in
# Language Models"). ``_CoCoNotBaseDataset._row_to_seed`` passes this list as ``authors``
# on every seed it builds. The list object itself is handed over (not a copy), so treat
# it as read-only.
_AUTHORS: list[str] = [
    "Faeze Brahman",
    "Sachin Kumar",
    "Vidhisha Balachandran",
    "Pradeep Dasigi",
    "Valentina Pyatkin",
    "Abhilasha Ravichander",
    "Sarah Wiegreffe",
    "Nouha Dziri",
    "Khyathi Chandu",
    "Jack Hessel",
    "Yulia Tsvetkov",
    "Noah A. Smith",
    "Yejin Choi",
    "Hannaneh Hajishirzi",
]

# Group attribution passed as ``groups`` on every seed built by ``_row_to_seed``.
# Shared by reference in the same way as ``_AUTHORS``.
_GROUPS: list[str] = ["Allen Institute for AI"]


class CoCoNotCategory(Enum):
    """
    Top-level noncompliance categories of the CoCoNot taxonomy (five in total).

    Every member's value is spelled exactly as the upstream HuggingFace dataset spells
    it, so a row's ``category`` field can be compared against ``member.value`` directly
    when filtering rows.
    """

    # Declaration order is significant: iterating the enum yields members in this order.
    INCOMPLETE = "Incomplete requests"
    UNSUPPORTED = "Unsupported requests"
    INDETERMINATE = "Indeterminate requests"
    HUMANIZING = "Humanizing requests"
    SAFETY = "Requests with safety concerns"


class CoCoNotSplit(Enum):
    """
    Splits that exist for the upstream ``original`` config.

    Each member's value is the split name as a plain string.
    """

    TRAIN = "train"
    TEST = "test"


class _CoCoNotBaseDataset(_RemoteDatasetLoader):
    """
    Common machinery behind the two CoCoNot sibling loaders.

    CoCoNot (Contextual Noncompliance) is an evaluation suite for refusal calibration in
    LLMs. Upstream publishes it as two configs, each of which is wrapped by one subclass
    of this base:

    - ``original`` (train+test): prompts the model SHOULD refuse, drawn from 5
      noncompliance categories (incomplete, unsupported, indeterminate, humanizing,
      safety).
    - ``contrast`` (test only): benign look-alike prompts the model SHOULD comply with,
      used to measure over-refusal behavior.

    A concrete subclass supplies ``CONFIG``, ``SPLITS``, ``DEFAULT_DESCRIPTION``,
    ``size``, and a ``dataset_name`` property.

    References:
        - https://huggingface.co/datasets/allenai/coconot
        - https://github.com/allenai/noncompliance
        - [@brahman2024coconot]

    License: ODC-BY 1.0.
    """

    HF_DATASET_NAME: str = "allenai/coconot"

    # Declared here, assigned by each concrete subclass.
    CONFIG: str
    SPLITS: tuple[str, ...]
    DEFAULT_DESCRIPTION: str

    # Class-level discovery metadata; category names are lowercased here, whereas the
    # per-seed harm category keeps the upstream casing (see ``_row_to_seed``).
    harm_categories: list[str] = [member.value.lower() for member in CoCoNotCategory]
    modalities: list[str] = ["text"]
    tags: set[str] = {"safety"}

    def __init__(self, *, categories: list[CoCoNotCategory] | None = None) -> None:
        """
        Store the optional category filter after validating it.

        Args:
            categories (list[CoCoNotCategory] | None): Noncompliance categories to keep.
                ``None`` (default) keeps all 5 categories.

        Raises:
            ValueError: If any value in ``categories`` is not a CoCoNotCategory.
        """
        if categories is not None:
            self._validate_enums(values=categories, enum_cls=CoCoNotCategory, label="categories")
        self._categories = categories

    def _resolved_splits(self) -> tuple[str, ...]:
        """
        Report which splits ``fetch_dataset_async`` should iterate.

        This default simply hands back the class-level ``SPLITS`` tuple. A subclass that
        exposes a user-facing splits filter overrides this method to honor that filter.

        Returns:
            tuple[str, ...]: The split names to load.
        """
        return self.SPLITS

    async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
        """
        Download the configured CoCoNot subset and wrap it in a SeedDataset.

        For each split from ``self._resolved_splits()`` the inherited
        ``_fetch_from_huggingface`` is awaited once; rows whose ``category`` is outside
        the configured filter (if any) are dropped, and the rest become seeds.

        Args:
            cache (bool): Whether to cache the fetched dataset. Defaults to True.

        Returns:
            SeedDataset: SeedDataset of SeedObjectives. Each seed carries per-row
                metadata: ``id``, ``category``, ``subcategory``, ``subset`` (HF config
                name), ``split``, and ``response`` (populated only on ``original.train``
                rows).

        Raises:
            ValueError: If no rows match the category filter.
        """
        source_url = f"https://huggingface.co/datasets/{self.HF_DATASET_NAME}"
        # A missing *or empty* category selection means "keep every row".
        allowed = {member.value for member in self._categories} if self._categories else None

        seeds: list[SeedObjective] = []
        for split in self._resolved_splits():
            logger.info(f"Loading CoCoNot rows (config={self.CONFIG}, split={split})")
            records = await self._fetch_from_huggingface(
                dataset_name=self.HF_DATASET_NAME,
                config=self.CONFIG,
                split=split,
                cache=cache,
            )
            seeds.extend(
                self._row_to_seed(row=record, split=split, source_url=source_url)
                for record in records
                if allowed is None or record.get("category") in allowed
            )

        if len(seeds) == 0:
            raise ValueError("SeedDataset cannot be empty. Check your filter criteria.")

        logger.info(f"Successfully loaded {len(seeds)} objectives from CoCoNot ({self.dataset_name})")
        return SeedDataset(seeds=seeds, dataset_name=self.dataset_name)

    def _row_to_seed(self, *, row: dict, split: str, source_url: str) -> SeedObjective:
        """
        Build a SeedObjective, including its per-row metadata, from one HF row.

        Args:
            row (dict): One row from the HuggingFace dataset.
            split (str): The split this row came from (stored as ``metadata["split"]``).
            source_url (str): Canonical source URL for the dataset.

        Returns:
            SeedObjective: The constructed seed.
        """
        # A missing or ``None`` category collapses to "" so the metadata value is a str.
        row_category = row.get("category") or ""

        seed_metadata: dict[str, str | int] = {
            "id": row.get("id", ""),
            "category": row_category,
            "subcategory": row.get("subcategory", ""),
            "subset": self.CONFIG,
            "split": split,
        }
        # ``response`` is populated in original.train and empty in test splits, so the
        # key is only added when there is something to store.
        if reference_response := row.get("response"):
            seed_metadata["response"] = reference_response

        if row_category:
            seed_harm_categories = [row_category]
        else:
            seed_harm_categories = []

        return SeedObjective(
            value=row["prompt"],
            dataset_name=self.dataset_name,
            harm_categories=seed_harm_categories,
            description=self.DEFAULT_DESCRIPTION,
            source=source_url,
            authors=_AUTHORS,
            groups=_GROUPS,
            metadata=seed_metadata,
        )


class _CoCoNotRefusalDataset(_CoCoNotBaseDataset):
    """
    12,478 prompts (train+test) the model SHOULD refuse.

    Maps to the ``original`` config of ``allenai/coconot``. Combines the ``train`` split
    (11,477 rows, each carrying AI2's reference noncompliant response in
    ``metadata["response"]``) and the ``test`` split (1,001 rows, no reference response).

    Use the ``splits`` constructor argument to restrict to one split.

    Reference: [@brahman2024coconot]
    """

    CONFIG: str = "original"
    SPLITS: tuple[str, ...] = ("train", "test")
    size: str = "huge"
    DEFAULT_DESCRIPTION: str = (
        "CoCoNot refusal-target set — 12,478 prompts the model should NOT comply with, "
        "drawn from 5 noncompliance categories (incomplete, unsupported, indeterminate, "
        "humanizing, safety). Combines the `original.train` (11,477) and `original.test` "
        "(1,001) splits of `allenai/coconot`."
    )

    def __init__(
        self,
        *,
        categories: list[CoCoNotCategory] | None = None,
        splits: list[CoCoNotSplit] | None = None,
    ) -> None:
        """
        Initialize the CoCoNot refusal-target loader.

        Args:
            categories (list[CoCoNotCategory] | None): Subset of noncompliance categories
                to include. ``None`` (default) loads all 5 categories.
            splits (list[CoCoNotSplit] | None): Subset of upstream splits to load. ``None``
                (default) loads both ``train`` (11,477 rows) and ``test`` (1,001 rows).
                A split listed more than once is loaded only once.

        Raises:
            ValueError: If any value in ``categories`` or ``splits`` is the wrong enum
                type.
        """
        super().__init__(categories=categories)
        if splits is not None:
            self._validate_enums(values=splits, enum_cls=CoCoNotSplit, label="splits")
        self._splits = splits

    def _resolved_splits(self) -> tuple[str, ...]:
        """
        Return the splits to load, honoring the user-supplied ``splits`` filter.

        Returns:
            tuple[str, ...]: ``self.SPLITS`` when no filter is set, otherwise the
                user-selected split names in the order they were first passed, with
                repeated entries collapsed.
        """
        if self._splits is None:
            return self.SPLITS
        # ``fetch_dataset_async`` fetches once per returned name, so a repeated split
        # would emit every one of its seeds twice. ``dict.fromkeys`` drops repeats while
        # keeping first-seen order.
        return tuple(dict.fromkeys(member.value for member in self._splits))

    @property
    def dataset_name(self) -> str:
        """Return the dataset name."""
        return "coconot_refusal"


class _CoCoNotContrastDataset(_CoCoNotBaseDataset):
    """
    Loader for the 379 benign look-alike prompts the model SHOULD comply with.

    Backed by the ``contrast.test`` config/split of ``allenai/coconot``. Every prompt
    superficially resembles a refusal-target prompt yet is in fact benign, which makes
    the set a probe for over-refusal behavior.

    Reference: [@brahman2024coconot]
    """

    # Upstream location: a single config with a single split.
    CONFIG: str = "contrast"
    SPLITS: tuple[str, ...] = ("test",)

    # Discovery metadata. ``tags`` is deliberately empty here, replacing the base
    # class's {"safety"} (presumably because these prompts are benign).
    size: str = "medium"
    tags: set[str] = set()

    DEFAULT_DESCRIPTION: str = (
        "CoCoNot contrast set — 379 benign prompts that look superficially similar to "
        "refusal-target prompts but should be complied with. Used to measure "
        "over-refusal behavior. Maps to the `contrast.test` config of "
        "`allenai/coconot`."
    )

    @property
    def dataset_name(self) -> str:
        """Name under which this loader's dataset is registered and reported."""
        return "coconot_contrast"
Loading
Loading