## DocVQA Data Preparation for `granite-vision-3.2` Fine Tuning

- Load 2000 entries from `lmms-lab/DocVQA`
- Generate prompt injection metadata using few-shot prompting with `gpt-4o`
- Draw the generated prompt injections onto the images using a drawing pipeline

In [8]:
import os
import random
from pprint import pprint

import matplotlib.pyplot as plt
import pandas as pd
from datasets import load_dataset
from dotenv import load_dotenv
from openai import AzureOpenAI
from PIL import Image, ImageDraw, ImageFont
from tqdm import tqdm

# Load variables from a local .env file (if one exists) into the process
# environment so the Azure OpenAI settings read further down can find them.
load_dotenv()

True

### Setup and Initialize Model

In [3]:
# Azure OpenAI connection settings, read from the process environment.
# A missing API version or endpoint falls back to an empty string; a missing
# API key is left as None.
AZURE_OPENAI_API_VERSION = os.environ.get("AZURE_OPENAI_API_VERSION") or ""
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT") or ""
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")

# Model name and deployment name; both are currently the same string.
model_name = deployment = "gpt-4o"

In [3]:
# Instantiate the Azure OpenAI client from the connection settings defined above.
client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
)

### Load Dataset

In [6]:
# Open the DocVQA validation split in streaming mode, then restrict the stream
# to its first 2000 examples.
docvqa_stream = load_dataset(
    "lmms-lab/DocVQA",
    name="DocVQA",
    split="validation",
    streaming=True,
)
subset_iter = docvqa_stream.take(2000)

In [None]:
# Materialize the streamed subset into an in-memory list, then pretty-print
# the first record to show which fields each example carries.
records = [record for record in subset_iter]
pprint(records[0])

{'answers': ['0.28'],
 'data_split': 'val',
 'docId': 14465,
 'image': <PIL.PngImagePlugin.PngImageFile image mode=L size=2257x1764 at 0x725C493D9FF0>,
 'question': 'What is the ‘actual’ value per 1000, during the year 1975?',
 'questionId': '49153',
 'question_types': ['figure/diagram'],
 'ucsf_document_id': 'pybv0228',
 'ucsf_document_page_no': '81'}


In [None]:
# Preview a random handful of the loaded DocVQA records in an image grid.
# `records` is a plain Python list of dicts, so sampling goes through the
# `random` module; the DataFrame-only `.sample()` / `.iterrows()` calls do not
# exist on a list.
n_samples = 15
n_cols = 3
sample_seed = 42  # fixed seed so Restart & Run All shows the same preview

# Never request more samples than were loaded (random.sample raises ValueError otherwise).
samples = random.Random(sample_seed).sample(records, min(n_samples, len(records)))
n_rows = max(1, (len(samples) + n_cols - 1) // n_cols)  # ceiling division, at least one row

fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(4 * n_cols, 4 * n_rows), squeeze=False
)
for ax, record in zip(axes.flat, samples):
    # DocVQA pages can be single-channel (mode "L", as in the record printed
    # earlier); without an explicit grayscale colormap matplotlib would render
    # them in false colour. The cmap is ignored for RGB(A) images.
    ax.imshow(record["image"], cmap="gray")
    # DocVQA records carry no "injection_technique" field; label each panel
    # with its question type(s) instead.
    ax.set_title(", ".join(record["question_types"]), fontsize=10)
for ax in axes.flat:
    ax.axis("off")  # also blanks any unused trailing cells in the grid
fig.suptitle("DocVQA Samples (validation split)\n\n", fontsize=14)
fig.tight_layout()
plt.show()