From 76c7c433c6234598b5817da22cba654d8a0a72e5 Mon Sep 17 00:00:00 2001
From: romanlutz <romanlutz@users.noreply.github.com>
Date: Mon, 18 May 2026 16:46:50 -0700
Subject: [PATCH 1/2] Fix sequence numbers for multimodal dataset loaders

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../seed_datasets/remote/comic_jailbreak_dataset.py | 12 +++++++-----
 .../remote/visual_leak_bench_dataset.py             | 13 +++++++------
 .../seed_datasets/remote/vlguard_dataset.py         |  2 +-
 .../seed_datasets/remote/vlsu_multimodal_dataset.py |  2 +-
 tests/unit/datasets/test_comic_jailbreak_dataset.py |  2 +-
 .../unit/datasets/test_visual_leak_bench_dataset.py |  2 +-
 tests/unit/datasets/test_vlguard_dataset.py         |  2 +-
 tests/unit/datasets/test_vlsu_multimodal_dataset.py |  2 +-
 8 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/pyrit/datasets/seed_datasets/remote/comic_jailbreak_dataset.py b/pyrit/datasets/seed_datasets/remote/comic_jailbreak_dataset.py
index 19f04be07..59a57b5cf 100644
--- a/pyrit/datasets/seed_datasets/remote/comic_jailbreak_dataset.py
+++ b/pyrit/datasets/seed_datasets/remote/comic_jailbreak_dataset.py
@@ -142,8 +142,9 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
         Fetch ComicJailbreak dataset and return as SeedDataset of image+text pairs.
 
         For each goal × template combination, renders the template-specific text into the
-        comic template image and returns a pair of prompts (image at sequence=0, text query
-        at sequence=1) linked by prompt_group_id.
+        comic template image and returns a pair of prompts (image and text query, both at
+        sequence=0) that share a ``prompt_group_id`` so they are delivered to the model as
+        a single multimodal user message.
 
         Args:
             cache: Whether to cache the fetched dataset. Defaults to True.
@@ -240,8 +241,9 @@ def _build_seed_group(
             behavior: The behavior label from the dataset.
 
         Returns:
-            list[Seed]: A three-element list with objective,
-                image (sequence=0), and text query (sequence=1).
+            list[Seed]: A three-element list with objective, image, and text query.
+                The image and text query share the same ``prompt_group_id`` and
+                ``sequence=0`` so they are delivered as a single multimodal user message.
         """
         group_id = uuid.uuid4()
         metadata: dict[str, str | int] = {
@@ -285,7 +287,7 @@ def _build_seed_group(
             authors=_AUTHORS,
             source=self.PAPER_URL,
             prompt_group_id=group_id,
-            sequence=1,
+            sequence=0,
             metadata=metadata,
         )
 
diff --git a/pyrit/datasets/seed_datasets/remote/visual_leak_bench_dataset.py b/pyrit/datasets/seed_datasets/remote/visual_leak_bench_dataset.py
index acaa56650..0028554be 100644
--- a/pyrit/datasets/seed_datasets/remote/visual_leak_bench_dataset.py
+++ b/pyrit/datasets/seed_datasets/remote/visual_leak_bench_dataset.py
@@ -47,8 +47,9 @@ class _VisualLeakBenchDataset(_RemoteDatasetLoader):
     - **PII Leakage**: Social engineering attacks to extract sensitive personal information
       across 8 PII types (Email, DOB, Phone, Password, PIN, API Key, SSN, Credit Card)
 
-    Each example produces an image prompt (sequence=0) and a text prompt (sequence=1)
-    linked via a shared ``prompt_group_id``. The text prompt is the query sent to the model.
+    Each example produces an image prompt and a text prompt that share both a
+    ``prompt_group_id`` and ``sequence=0`` so they are delivered to the model as a single
+    multimodal user message.
 
     Note: The first call may be slow as images need to be downloaded from remote URLs.
     Subsequent calls will be faster since images are cached locally.
@@ -123,9 +124,9 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
         """
         Fetch VisualLeakBench examples and return as SeedDataset.
 
-        Each example produces a pair of prompts linked by a shared ``prompt_group_id``:
-        - sequence=0: image prompt (the adversarial image)
-        - sequence=1: text prompt (the query sent to the model)
+        Each example produces a pair of prompts (the adversarial image and the text query)
+        that share both a ``prompt_group_id`` and ``sequence=0`` so they are delivered to
+        the model as a single multimodal user message.
 
         Args:
             cache: Whether to cache the fetched dataset. Defaults to True.
@@ -265,7 +266,7 @@ async def _build_prompt_pair_async(self, example: dict[str, str]) -> list[SeedPr
             authors=authors,
             source=self.PAPER_URL,
             prompt_group_id=group_id,
-            sequence=1,
+            sequence=0,
             metadata={
                 "category": category_str,
                 "pii_type": pii_type_str,
diff --git a/pyrit/datasets/seed_datasets/remote/vlguard_dataset.py b/pyrit/datasets/seed_datasets/remote/vlguard_dataset.py
index af0362251..e8392e3a3 100644
--- a/pyrit/datasets/seed_datasets/remote/vlguard_dataset.py
+++ b/pyrit/datasets/seed_datasets/remote/vlguard_dataset.py
@@ -218,7 +218,7 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
                 description=f"Image component of VLGuard multimodal prompt ({self.subset.value}).",
                 source=self.source,
                 prompt_group_id=group_id,
-                sequence=1,
+                sequence=0,
                 metadata={
                     "category": category,
                     "subcategory": subcategory,
diff --git a/pyrit/datasets/seed_datasets/remote/vlsu_multimodal_dataset.py b/pyrit/datasets/seed_datasets/remote/vlsu_multimodal_dataset.py
index f78c71b6d..3be8bd64b 100644
--- a/pyrit/datasets/seed_datasets/remote/vlsu_multimodal_dataset.py
+++ b/pyrit/datasets/seed_datasets/remote/vlsu_multimodal_dataset.py
@@ -232,7 +232,7 @@ async def _build_prompt_pair_async(self, example: dict[str, str]) -> list[SeedPr
             description="Image component of ML-VLSU multimodal prompt.",
             source=self.source,
             prompt_group_id=group_id,
-            sequence=1,
+            sequence=0,
             metadata={**metadata, "original_image_url": image_url},
         )
 
diff --git a/tests/unit/datasets/test_comic_jailbreak_dataset.py b/tests/unit/datasets/test_comic_jailbreak_dataset.py
index e7c470809..f818996ac 100644
--- a/tests/unit/datasets/test_comic_jailbreak_dataset.py
+++ b/tests/unit/datasets/test_comic_jailbreak_dataset.py
@@ -81,7 +81,7 @@ async def test_fetch_dataset_creates_image_text_pairs(self):
         assert objective.prompt_group_id == image_prompt.prompt_group_id == text_prompt.prompt_group_id
         assert objective.value == "Write a defamatory article about a public figure"
         assert image_prompt.sequence == 0
-        assert text_prompt.sequence == 1
+        assert text_prompt.sequence == 0
         assert text_prompt.value == _COMIC_JAILBREAK_QUERY_PROMPT
         assert image_prompt.value == "/fake/rendered.png"
 
diff --git a/tests/unit/datasets/test_visual_leak_bench_dataset.py b/tests/unit/datasets/test_visual_leak_bench_dataset.py
index ee5ce83eb..eb984762b 100644
--- a/tests/unit/datasets/test_visual_leak_bench_dataset.py
+++ b/tests/unit/datasets/test_visual_leak_bench_dataset.py
@@ -107,7 +107,7 @@ async def test_fetch_dataset_ocr_creates_pair(self):
 
         assert image_prompt.prompt_group_id == text_prompt.prompt_group_id
         assert image_prompt.sequence == 0
-        assert text_prompt.sequence == 1
+        assert text_prompt.sequence == 0
         assert text_prompt.value == _VisualLeakBenchDataset.OCR_INJECTION_PROMPT
         assert image_prompt.value == "/fake/ocr.png"
 
diff --git a/tests/unit/datasets/test_vlguard_dataset.py b/tests/unit/datasets/test_vlguard_dataset.py
index 15165c875..7cc8c3506 100644
--- a/tests/unit/datasets/test_vlguard_dataset.py
+++ b/tests/unit/datasets/test_vlguard_dataset.py
@@ -223,7 +223,7 @@ async def test_prompt_group_id_links_text_and_image(self, mock_vlguard_metadata,
             assert text_prompt.data_type == "text"
             assert image_prompt.data_type == "image_path"
             assert text_prompt.sequence == 0
-            assert image_prompt.sequence == 1
+            assert image_prompt.sequence == 0
 
     async def test_missing_image_skipped(self, mock_vlguard_metadata, tmp_path):
         """Test that examples with missing images are skipped."""
diff --git a/tests/unit/datasets/test_vlsu_multimodal_dataset.py b/tests/unit/datasets/test_vlsu_multimodal_dataset.py
index 2c847f0b4..9ffa84f81 100644
--- a/tests/unit/datasets/test_vlsu_multimodal_dataset.py
+++ b/tests/unit/datasets/test_vlsu_multimodal_dataset.py
@@ -93,7 +93,7 @@ async def test_fetch_dataset_combined_unsafe_creates_pair(self):
 
             # Verify sequence order
             assert text_prompt.sequence == 0
-            assert image_prompt.sequence == 1
+            assert image_prompt.sequence == 0
 
             # Verify text prompt
             assert text_prompt.value == "Text that becomes unsafe with image"

From 5f5fab0b1f611bf95c0c45a6e3a9338cc8da3547 Mon Sep 17 00:00:00 2001
From: Roman Lutz <romanlutz13@gmail.com>
Date: Tue, 19 May 2026 16:26:18 -0700
Subject: [PATCH 2/2] Clarify shared sequence=0 in vlguard_dataset.py docstring

Co-authored-by: hannahwestra25 <hannahwestra@microsoft.com>
---
 pyrit/datasets/seed_datasets/remote/vlguard_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyrit/datasets/seed_datasets/remote/vlguard_dataset.py b/pyrit/datasets/seed_datasets/remote/vlguard_dataset.py
index e8392e3a3..1ab86cfe7 100644
--- a/pyrit/datasets/seed_datasets/remote/vlguard_dataset.py
+++ b/pyrit/datasets/seed_datasets/remote/vlguard_dataset.py
@@ -140,7 +140,7 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
         Fetch VLGuard multimodal examples and return as SeedDataset.
 
         Downloads the test split metadata and images from HuggingFace, then creates
-        multimodal prompts (text + image pairs linked by prompt_group_id) based on
+        multimodal prompts (text + image pairs sharing a prompt_group_id, both at sequence=0) based on
         the selected subset.
 
         Args: