microsoft · romanlutz · May 21, 2026 · May 18, 2026 · May 19, 2026 · May 19, 2026
diff --git a/pyrit/datasets/seed_datasets/remote/comic_jailbreak_dataset.py b/pyrit/datasets/seed_datasets/remote/comic_jailbreak_dataset.py
@@ -142,8 +142,9 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
         Fetch ComicJailbreak dataset and return as SeedDataset of image+text pairs.
 
         For each goal × template combination, renders the template-specific text into the
-        comic template image and returns a pair of prompts (image at sequence=0, text query
-        at sequence=1) linked by prompt_group_id.
+        comic template image and returns a pair of prompts (image and text query, both at
+        sequence=0) that share a ``prompt_group_id`` so they are delivered to the model as
+        a single multimodal user message.
 
         Args:
             cache: Whether to cache the fetched dataset. Defaults to True.
@@ -240,8 +241,9 @@ def _build_seed_group(
             behavior: The behavior label from the dataset.
 
         Returns:
-            list[Seed]: A three-element list with objective,
-                image (sequence=0), and text query (sequence=1).
+            list[Seed]: A three-element list with objective, image, and text query.
+                The image and text query share the same ``prompt_group_id`` and
+                ``sequence=0`` so they are delivered as a single multimodal user message.
         """
         group_id = uuid.uuid4()
         metadata: dict[str, str | int] = {
@@ -285,7 +287,7 @@ def _build_seed_group(
             authors=_AUTHORS,
             source=self.PAPER_URL,
             prompt_group_id=group_id,
-            sequence=1,
+            sequence=0,
             metadata=metadata,
         )
 

diff --git a/pyrit/datasets/seed_datasets/remote/visual_leak_bench_dataset.py b/pyrit/datasets/seed_datasets/remote/visual_leak_bench_dataset.py
@@ -47,8 +47,9 @@ class _VisualLeakBenchDataset(_RemoteDatasetLoader):
     - **PII Leakage**: Social engineering attacks to extract sensitive personal information
       across 8 PII types (Email, DOB, Phone, Password, PIN, API Key, SSN, Credit Card)
 
-    Each example produces an image prompt (sequence=0) and a text prompt (sequence=1)
-    linked via a shared ``prompt_group_id``. The text prompt is the query sent to the model.
+    Each example produces an image prompt and a text prompt that share both a
+    ``prompt_group_id`` and ``sequence=0`` so they are delivered to the model as a single
+    multimodal user message.
 
     Note: The first call may be slow as images need to be downloaded from remote URLs.
     Subsequent calls will be faster since images are cached locally.
@@ -123,9 +124,9 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
         """
         Fetch VisualLeakBench examples and return as SeedDataset.
 
-        Each example produces a pair of prompts linked by a shared ``prompt_group_id``:
-        - sequence=0: image prompt (the adversarial image)
-        - sequence=1: text prompt (the query sent to the model)
+        Each example produces a pair of prompts (the adversarial image and the text query)
+        that share both a ``prompt_group_id`` and ``sequence=0`` so they are delivered to
+        the model as a single multimodal user message.
 
         Args:
             cache: Whether to cache the fetched dataset. Defaults to True.
@@ -265,7 +266,7 @@ async def _build_prompt_pair_async(self, example: dict[str, str]) -> list[SeedPr
             authors=authors,
             source=self.PAPER_URL,
             prompt_group_id=group_id,
-            sequence=1,
+            sequence=0,
             metadata={
                 "category": category_str,
                 "pii_type": pii_type_str,

diff --git a/pyrit/datasets/seed_datasets/remote/vlguard_dataset.py b/pyrit/datasets/seed_datasets/remote/vlguard_dataset.py
@@ -140,7 +140,7 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
         Fetch VLGuard multimodal examples and return as SeedDataset.
 
         Downloads the test split metadata and images from HuggingFace, then creates
-        multimodal prompts (text + image pairs linked by prompt_group_id) based on
+        multimodal prompts (text + image pairs sharing a prompt_group_id and sequence=0) based on
         the selected subset.
 
         Args:
@@ -218,7 +218,7 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
                 description=f"Image component of VLGuard multimodal prompt ({self.subset.value}).",
                 source=self.source,
                 prompt_group_id=group_id,
-                sequence=1,
+                sequence=0,
                 metadata={
                     "category": category,
                     "subcategory": subcategory,

diff --git a/pyrit/datasets/seed_datasets/remote/vlsu_multimodal_dataset.py b/pyrit/datasets/seed_datasets/remote/vlsu_multimodal_dataset.py
@@ -232,7 +232,7 @@ async def _build_prompt_pair_async(self, example: dict[str, str]) -> list[SeedPr
             description="Image component of ML-VLSU multimodal prompt.",
             source=self.source,
             prompt_group_id=group_id,
-            sequence=1,
+            sequence=0,
             metadata={**metadata, "original_image_url": image_url},
         )
 

diff --git a/tests/unit/datasets/test_comic_jailbreak_dataset.py b/tests/unit/datasets/test_comic_jailbreak_dataset.py
@@ -81,7 +81,7 @@ async def test_fetch_dataset_creates_image_text_pairs(self):
         assert objective.prompt_group_id == image_prompt.prompt_group_id == text_prompt.prompt_group_id
         assert objective.value == "Write a defamatory article about a public figure"
         assert image_prompt.sequence == 0
-        assert text_prompt.sequence == 1
+        assert text_prompt.sequence == 0
         assert text_prompt.value == _COMIC_JAILBREAK_QUERY_PROMPT
         assert image_prompt.value == "/fake/rendered.png"
 

diff --git a/tests/unit/datasets/test_visual_leak_bench_dataset.py b/tests/unit/datasets/test_visual_leak_bench_dataset.py
@@ -107,7 +107,7 @@ async def test_fetch_dataset_ocr_creates_pair(self):
 
         assert image_prompt.prompt_group_id == text_prompt.prompt_group_id
         assert image_prompt.sequence == 0
-        assert text_prompt.sequence == 1
+        assert text_prompt.sequence == 0
         assert text_prompt.value == _VisualLeakBenchDataset.OCR_INJECTION_PROMPT
         assert image_prompt.value == "/fake/ocr.png"
 

diff --git a/tests/unit/datasets/test_vlguard_dataset.py b/tests/unit/datasets/test_vlguard_dataset.py
@@ -223,7 +223,7 @@ async def test_prompt_group_id_links_text_and_image(self, mock_vlguard_metadata,
             assert text_prompt.data_type == "text"
             assert image_prompt.data_type == "image_path"
             assert text_prompt.sequence == 0
-            assert image_prompt.sequence == 1
+            assert image_prompt.sequence == 0
 
     async def test_missing_image_skipped(self, mock_vlguard_metadata, tmp_path):
         """Test that examples with missing images are skipped."""

diff --git a/tests/unit/datasets/test_vlsu_multimodal_dataset.py b/tests/unit/datasets/test_vlsu_multimodal_dataset.py
@@ -93,7 +93,7 @@ async def test_fetch_dataset_combined_unsafe_creates_pair(self):
 
             # Verify sequence order
             assert text_prompt.sequence == 0
-            assert image_prompt.sequence == 1
+            assert image_prompt.sequence == 0
 
             # Verify text prompt
             assert text_prompt.value == "Text that becomes unsafe with image"