microsoft · romanlutz · May 26, 2026 · May 25, 2026
diff --git a/doc/bibliography.md b/doc/bibliography.md
@@ -5,6 +5,6 @@ All academic papers, research blogs, and technical reports referenced throughout
 :::{dropdown} Citation Keys
 :class: hidden-citations
 
-[@aakanksha2024multilingual; @adversaai2023universal; @andriushchenko2024tense; @anthropic2024manyshot; @aqrawi2024singleturncrescendo; @bethany2024mathprompt; @bhardwaj2023harmfulqa; @bhardwaj2024homer; @bryan2025agentictaxonomy; @bullwinkel2025airtlessons; @bullwinkel2025repeng; @bullwinkel2026trigger; @chao2023pair; @chao2024jailbreakbench; @cui2024orbench; @darkbench2025; @derczynski2024garak; @ding2023wolf; @embracethered2024unicode; @embracethered2025sneakybits; @ghosh2025aegis; @gupta2024walledeval; @haider2024phi3safety; @han2024medsafetybench; @hines2024spotlighting; @ji2023beavertails; @ji2024pkusaferlhf; @jiang2025sosbench; @jones2025computeruse; @kingma2014adam; @li2024saladbench; @li2024wmdp; @lin2023toxicchat; @liu2024flipattack; @lopez2024pyrit; @lv2024codechameleon; @mazeika2023tdc; @mazeika2024harmbench; @mckee2024transparency; @mehrotra2023tap; @microsoft2024skeletonkey; @palaskar2025vlsu; @pfohl2024equitymedqa; @promptfoo2025ccp; @robustintelligence2024bypass; @roccia2024promptintel; @rottger2023xstest; @rottger2025msts; @russinovich2024crescendo; @russinovich2025price; @scheuerman2025transphobia; @shaikh2022second; @shayegani2025computeruse; @shen2023donotanything; @sheshadri2024lat; @stok2023ansi; @tan2026comicjailbreak; @tang2025multilingual; @tedeschi2024alert; @vantaylor2024socialbias; @vidgen2023simplesafetytests; @vidgen2024ailuminate; @wang2023decodingtrust; @wang2023donotanswer; @wei2023jailbroken; @xie2024sorrybench; @yu2023gptfuzzer; @yuan2023cipherchat; @zeng2024persuasion; @zhang2024cbtbench; @zou2023gcg]
+[@aakanksha2024multilingual; @adversaai2023universal; @andriushchenko2024tense; @anthropic2024manyshot; @aqrawi2024singleturncrescendo; @bethany2024mathprompt; @bhardwaj2023harmfulqa; @bhardwaj2024homer; @brahman2024coconot; @bryan2025agentictaxonomy; @bullwinkel2025airtlessons; @bullwinkel2025repeng; @bullwinkel2026trigger; @chao2023pair; @chao2024jailbreakbench; @cui2024orbench; @darkbench2025; @derczynski2024garak; @ding2023wolf; @embracethered2024unicode; @embracethered2025sneakybits; @ghosh2025aegis; @gupta2024walledeval; @haider2024phi3safety; @han2024medsafetybench; @hines2024spotlighting; @ji2023beavertails; @ji2024pkusaferlhf; @jiang2025sosbench; @jones2025computeruse; @kingma2014adam; @li2024saladbench; @li2024wmdp; @lin2023toxicchat; @liu2024flipattack; @lopez2024pyrit; @lv2024codechameleon; @mazeika2023tdc; @mazeika2024harmbench; @mckee2024transparency; @mehrotra2023tap; @microsoft2024skeletonkey; @palaskar2025vlsu; @pfohl2024equitymedqa; @promptfoo2025ccp; @robustintelligence2024bypass; @roccia2024promptintel; @rottger2023xstest; @rottger2025msts; @russinovich2024crescendo; @russinovich2025price; @scheuerman2025transphobia; @shaikh2022second; @shayegani2025computeruse; @shen2023donotanything; @sheshadri2024lat; @stok2023ansi; @tan2026comicjailbreak; @tang2025multilingual; @tedeschi2024alert; @vantaylor2024socialbias; @vidgen2023simplesafetytests; @vidgen2024ailuminate; @wang2023decodingtrust; @wang2023donotanswer; @wei2023jailbroken; @xie2024sorrybench; @yu2023gptfuzzer; @yuan2023cipherchat; @zeng2024persuasion; @zhang2024cbtbench; @zou2023gcg]
 
 :::
diff --git a/doc/code/datasets/1_loading_datasets.ipynb b/doc/code/datasets/1_loading_datasets.ipynb
@@ -19,6 +19,7 @@
     "BeaverTails [@ji2023beavertails],\n",
     "CBT-Bench [@zhang2024cbtbench],\n",
     "CategoricalHarmfulQA (CatQA) [@bhardwaj2024homer],\n",
+    "CoCoNot [@brahman2024coconot],\n",
     "DarkBench [@darkbench2025],\n",
     "Do Anything Now [@shen2023donotanything],\n",
     "Do-Not-Answer [@wang2023donotanswer],\n",
@@ -79,6 +80,8 @@
        " 'categorical_harmful_qa',\n",
        " 'cbt_bench',\n",
        " 'ccp_sensitive_prompts',\n",
+       " 'coconot_contrast',\n",
+       " 'coconot_refusal',\n",
        " 'comic_jailbreak',\n",
        " 'dangerous_qa',\n",
        " 'dark_bench',\n",

diff --git a/doc/code/datasets/1_loading_datasets.py b/doc/code/datasets/1_loading_datasets.py
@@ -23,6 +23,7 @@
 # BeaverTails [@ji2023beavertails],
 # CBT-Bench [@zhang2024cbtbench],
 # CategoricalHarmfulQA (CatQA) [@bhardwaj2024homer],
+# CoCoNot [@brahman2024coconot],
 # DarkBench [@darkbench2025],
 # Do Anything Now [@shen2023donotanything],
 # Do-Not-Answer [@wang2023donotanswer],

diff --git a/doc/references.bib b/doc/references.bib
@@ -610,3 +610,11 @@ @misc{embracethered2025sneakybits
   url       = {https://embracethered.com/blog/posts/2025/sneaky-bits-and-ascii-smuggler/},
   note      = {Embrace The Red Blog},
 }
+
+@article{brahman2024coconot,
+  title     = {The Art of Saying No: Contextual Noncompliance in Language Models},
+  author    = {Faeze Brahman and Sachin Kumar and Vidhisha Balachandran and Pradeep Dasigi and Valentina Pyatkin and Abhilasha Ravichander and Sarah Wiegreffe and Nouha Dziri and Khyathi Chandu and Jack Hessel and Yulia Tsvetkov and Noah A. Smith and Yejin Choi and Hannaneh Hajishirzi},
+  journal   = {arXiv preprint arXiv:2407.12043},
+  year      = {2024},
+  url       = {https://arxiv.org/abs/2407.12043},
+}
diff --git a/pyrit/datasets/seed_datasets/remote/__init__.py b/pyrit/datasets/seed_datasets/remote/__init__.py
@@ -28,6 +28,12 @@
 from pyrit.datasets.seed_datasets.remote.ccp_sensitive_prompts_dataset import (
     _CCPSensitivePromptsDataset,
 )  # noqa: F401
+from pyrit.datasets.seed_datasets.remote.coconot_dataset import (
+    CoCoNotCategory,
+    CoCoNotSplit,
+    _CoCoNotContrastDataset,
+    _CoCoNotRefusalDataset,
+)  # noqa: F401
 from pyrit.datasets.seed_datasets.remote.comic_jailbreak_dataset import (
     COMIC_JAILBREAK_TEMPLATES,
     ComicJailbreakTemplateConfig,
@@ -142,6 +148,8 @@
 )  # noqa: F401
 
 __all__ = [
+    "CoCoNotCategory",
+    "CoCoNotSplit",
     "HiXSTestLanguage",
     "PromptIntelCategory",
     "PromptIntelSeverity",
@@ -156,6 +164,8 @@
     "_CBTBenchDataset",
     "_CCPSensitivePromptsDataset",
     "_CategoricalHarmfulQADataset",
+    "_CoCoNotContrastDataset",
+    "_CoCoNotRefusalDataset",
     "_ComicJailbreakDataset",
     "COMIC_JAILBREAK_TEMPLATES",
     "ComicJailbreakTemplateConfig",

diff --git a/pyrit/datasets/seed_datasets/remote/coconot_dataset.py b/pyrit/datasets/seed_datasets/remote/coconot_dataset.py
@@ -0,0 +1,289 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import logging
+from enum import Enum
+
+from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import (
+    _RemoteDatasetLoader,
+)
+from pyrit.models import SeedDataset, SeedObjective
+
+logger = logging.getLogger(__name__)
+
+
+_AUTHORS: list[str] = [
+    "Faeze Brahman",
+    "Sachin Kumar",
+    "Vidhisha Balachandran",
+    "Pradeep Dasigi",
+    "Valentina Pyatkin",
+    "Abhilasha Ravichander",
+    "Sarah Wiegreffe",
+    "Nouha Dziri",
+    "Khyathi Chandu",
+    "Jack Hessel",
+    "Yulia Tsvetkov",
+    "Noah A. Smith",
+    "Yejin Choi",
+    "Hannaneh Hajishirzi",
+]
+
+_GROUPS: list[str] = ["Allen Institute for AI"]
+
+
+class CoCoNotCategory(Enum):
+    """
+    The five top-level noncompliance categories of the CoCoNot taxonomy.
+
+    Each value is compared verbatim against a row's ``category`` field when filtering,
+    so it keeps the casing used by the upstream HuggingFace dataset.
+    """
+
+    INCOMPLETE = "Incomplete requests"
+    UNSUPPORTED = "Unsupported requests"
+    INDETERMINATE = "Indeterminate requests"
+    HUMANIZING = "Humanizing requests"
+    SAFETY = "Requests with safety concerns"
+
+
+class CoCoNotSplit(Enum):
+    """Splits of the upstream ``original`` config; each value is the split name passed to the fetch call."""
+
+    TRAIN = "train"
+    TEST = "test"
+
+
+class _CoCoNotBaseDataset(_RemoteDatasetLoader):
+    """
+    Shared base for the two CoCoNot sibling loaders.
+
+    CoCoNot (Contextual Noncompliance) is an evaluation suite for refusal calibration in
+    LLMs. The dataset is split across two configs that this base wraps with sibling
+    subclasses:
+
+    - ``original`` (train+test): prompts the model SHOULD refuse, drawn from 5
+      noncompliance categories (incomplete, unsupported, indeterminate, humanizing,
+      safety).
+    - ``contrast`` (test only): benign look-alike prompts the model SHOULD comply with,
+      used to measure over-refusal behavior.
+
+    Subclasses set ``CONFIG``, ``SPLITS``, ``DEFAULT_DESCRIPTION``, ``size``, and a
+    ``dataset_name`` property.
+
+    References:
+        - https://huggingface.co/datasets/allenai/coconot
+        - https://github.com/allenai/noncompliance
+        - [@brahman2024coconot]
+
+    License: ODC-BY 1.0.
+    """
+
+    HF_DATASET_NAME: str = "allenai/coconot"
+
+    CONFIG: str
+    SPLITS: tuple[str, ...]
+    DEFAULT_DESCRIPTION: str
+
+    harm_categories: list[str] = [m.value.lower() for m in CoCoNotCategory]
+    modalities: list[str] = ["text"]
+    tags: set[str] = {"safety"}
+
+    def __init__(self, *, categories: list[CoCoNotCategory] | None = None) -> None:
+        """
+        Initialize the CoCoNot base loader.
+
+        Args:
+            categories (list[CoCoNotCategory] | None): Subset of noncompliance categories
+                to include. ``None`` (default) loads all 5 categories.
+
+        Raises:
+            ValueError: If any value in ``categories`` is not a CoCoNotCategory.
+        """
+        if categories is not None:
+            self._validate_enums(values=categories, enum_cls=CoCoNotCategory, label="categories")
+        self._categories = categories
+
+    def _resolved_splits(self) -> tuple[str, ...]:
+        """
+        Return the splits to iterate when fetching.
+
+        Subclasses with a user-facing splits filter override this to read the filter.
+        Base implementation returns the class-level ``SPLITS`` tuple unchanged.
+
+        Returns:
+            tuple[str, ...]: The split names to load.
+        """
+        return self.SPLITS
+
+    async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
+        """
+        Fetch the CoCoNot subset and return it as a SeedDataset.
+
+        Iterates ``self._resolved_splits()`` and calls the inherited
+        ``_fetch_from_huggingface`` once per split, then filters by
+        ``self._categories`` if set.
+
+        Args:
+            cache (bool): Whether to cache the fetched dataset. Defaults to True.
+
+        Returns:
+            SeedDataset: SeedDataset of SeedObjectives. Each seed carries per-row
+            metadata: ``id``, ``category``, ``subcategory``, ``subset`` (HF config name),
+            ``split``, and ``response`` (populated only on ``original.train`` rows).
+
+        Raises:
+            ValueError: If no rows remain after applying the split and category filters.
+        """
+        wanted_categories = {c.value for c in self._categories} if self._categories else None
+        source_url = f"https://huggingface.co/datasets/{self.HF_DATASET_NAME}"
+        seeds: list[SeedObjective] = []
+
+        for split in self._resolved_splits():
+            logger.info(f"Loading CoCoNot rows (config={self.CONFIG}, split={split})")
+            rows = await self._fetch_from_huggingface(
+                dataset_name=self.HF_DATASET_NAME,
+                config=self.CONFIG,
+                split=split,
+                cache=cache,
+            )
+            for row in rows:
+                category = row.get("category")
+                if wanted_categories is not None and category not in wanted_categories:
+                    continue
+                seeds.append(self._row_to_seed(row=row, split=split, source_url=source_url))
+
+        if not seeds:
+            raise ValueError("SeedDataset cannot be empty. Check your filter criteria.")
+
+        logger.info(f"Successfully loaded {len(seeds)} objectives from CoCoNot ({self.dataset_name})")
+        return SeedDataset(seeds=seeds, dataset_name=self.dataset_name)
+
+    def _row_to_seed(self, *, row: dict, split: str, source_url: str) -> SeedObjective:
+        """
+        Convert one HF row into a SeedObjective with full per-row metadata.
+
+        Args:
+            row (dict): One row from the HuggingFace dataset.
+            split (str): The split this row came from (used as ``metadata["split"]``).
+            source_url (str): Canonical source URL for the dataset.
+
+        Returns:
+            SeedObjective: The constructed seed; a null ``id``/``subcategory`` becomes ``""``.
+        """
+        category = row.get("category") or ""
+        metadata: dict[str, str | int] = {
+            "id": "" if row.get("id") is None else row["id"],
+            "category": category,
+            "subcategory": row.get("subcategory") or "",
+            "subset": self.CONFIG,
+            "split": split,
+        }
+        # Nulls above become "" so metadata never holds None; response is added only when non-empty.
+        response = row.get("response")
+        if response:
+            metadata["response"] = response
+
+        return SeedObjective(
+            value=row["prompt"],
+            dataset_name=self.dataset_name,
+            harm_categories=[category] if category else [],
+            description=self.DEFAULT_DESCRIPTION,
+            source=source_url,
+            authors=_AUTHORS,
+            groups=_GROUPS,
+            metadata=metadata,
+        )
+
+
+class _CoCoNotRefusalDataset(_CoCoNotBaseDataset):
+    """
+    12,478 prompts (train+test) the model SHOULD refuse.
+
+    Maps to the ``original`` config of ``allenai/coconot``. Combines the ``train`` split
+    (11,477 rows, each carrying AI2's reference noncompliant response in
+    ``metadata["response"]``) and the ``test`` split (1,001 rows, no reference response).
+
+    Use the ``splits`` constructor argument to restrict to one split.
+
+    Reference: [@brahman2024coconot]
+    """
+
+    CONFIG: str = "original"
+    SPLITS: tuple[str, ...] = ("train", "test")
+    size: str = "huge"
+    DEFAULT_DESCRIPTION: str = (
+        "CoCoNot refusal-target set — 12,478 prompts the model should NOT comply with, "
+        "drawn from 5 noncompliance categories (incomplete, unsupported, indeterminate, "
+        "humanizing, safety). Combines the `original.train` (11,477) and `original.test` "
+        "(1,001) splits of `allenai/coconot`."
+    )
+
+    def __init__(
+        self,
+        *,
+        categories: list[CoCoNotCategory] | None = None,
+        splits: list[CoCoNotSplit] | None = None,
+    ) -> None:
+        """
+        Initialize the CoCoNot refusal-target loader.
+
+        Args:
+            categories (list[CoCoNotCategory] | None): Subset of noncompliance categories
+                to include. ``None`` (default) loads all 5 categories.
+            splits (list[CoCoNotSplit] | None): Subset of upstream splits to load. ``None``
+                (default) loads both ``train`` (11,477 rows) and ``test`` (1,001 rows).
+
+        Raises:
+            ValueError: If any value in ``categories`` or ``splits`` is the wrong enum
+                type.
+        """
+        super().__init__(categories=categories)
+        if splits is not None:
+            self._validate_enums(values=splits, enum_cls=CoCoNotSplit, label="splits")
+        self._splits = splits
+
+    def _resolved_splits(self) -> tuple[str, ...]:
+        """
+        Return the splits to load, honoring the user-supplied ``splits`` filter.
+
+        Returns:
+            tuple[str, ...]: ``self.SPLITS`` when no filter is set, otherwise the
+            user-selected subset in first-seen order, with repeated splits dropped.
+        """
+        if self._splits is None:
+            return self.SPLITS
+        return tuple(dict.fromkeys(s.value for s in self._splits))
+
+    @property
+    def dataset_name(self) -> str:
+        """Return the dataset name."""
+        return "coconot_refusal"
+
+
+class _CoCoNotContrastDataset(_CoCoNotBaseDataset):
+    """
+    379 look-alike benign prompts the model SHOULD comply with.
+
+    Maps to the ``contrast.test`` config/split of ``allenai/coconot``. Used to measure
+    over-refusal behavior: every prompt superficially resembles a refusal-target
+    prompt but is in fact benign.
+
+    Reference: [@brahman2024coconot]
+    """
+
+    CONFIG: str = "contrast"
+    SPLITS: tuple[str, ...] = ("test",)
+    size: str = "medium"
+    tags: set[str] = set()  # overrides the base class's {"safety"} tag
+    DEFAULT_DESCRIPTION: str = (
+        "CoCoNot contrast set — 379 benign prompts that look superficially similar to "
+        "refusal-target prompts but should be complied with. Used to measure "
+        "over-refusal behavior. Maps to the `contrast.test` config of "
+        "`allenai/coconot`."
+    )
+
+    @property
+    def dataset_name(self) -> str:
+        """Return the dataset name."""
+        return "coconot_contrast"