In [None]:
import os
import pandas as pd
import numpy as np
from datasets import load_dataset
import json
import os 
from openai import AzureOpenAI
from llm_judge_prompt import LLM_JUDGE_PROMPT_PHASE2
from tqdm import tqdm 

In [13]:
# Load the challenge dataset (swap the split key to analyse another phase) and
# group the successful submissions by their prompt text.
jobs_phase1 = load_dataset("microsoft/llmail-inject-challenge")["Phase1"]

# Maps prompt text -> {"job_ids": [...], "defenses": {...}}. Only submissions whose
# parsed objectives carry a truthy "exfil.sent" flag are recorded.
prompt_injection_successful = {}
output_marker = "Processed example output for prompt: "

for job in jobs_phase1:
    raw_output = job["output"]
    if not raw_output:
        continue  # no output recorded for this job

    objectives = json.loads(job["objectives"])
    if not objectives:
        continue  # empty / null objectives

    # The prompt is whatever follows the last occurrence of the marker.
    submitted_prompt = raw_output.split(output_marker)[-1]
    if not objectives["exfil.sent"]:
        continue

    # Identical prompts submitted under several jobs share a single entry.
    entry = prompt_injection_successful.setdefault(
        submitted_prompt, {"job_ids": [], "defenses": {}}
    )
    entry["job_ids"].append(job["job_id"])

In [14]:
# Number of keys in prompt_injection_successful, i.e. distinct prompt strings recorded.
len(prompt_injection_successful)

25323

In [None]:
# Azure OpenAI client used as the LLM judge. Endpoint and key are read from the
# environment so that no credentials are stored in the notebook.
# NOTE(review): the requests below use response_format={"type": "json_object"};
# confirm that api_version "2023-05-15" supports JSON mode on the target
# deployment -- if it does not, every call fails and shows up in judge_failures.
client = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2023-05-15",
)

# prompt -> {"answer": bool, "analysis": ...} for every prompt that was judged.
judge_answers = {}
# prompt -> repr(exception) for every prompt that could not be judged. Kept so
# that failures are visible instead of being dropped without a trace.
judge_failures = {}

# Apply the judge to every unique successful prompt.
for prompt in tqdm(list(prompt_injection_successful.keys())):
    judge_prompt = LLM_JUDGE_PROMPT_PHASE2.format(prompt)
    messages = [{"role": "system", "content": judge_prompt}]
    try:
        res = client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
            temperature=0,
            max_tokens=3000,
            response_format={"type": "json_object"},
        )
        judge_answer = res.choices[0].message.content
        parsed_output = json.loads(judge_answer)
        # The prompt is flagged (True) unless the judge's decision contains "CLEAN".
        answer = "CLEAN" not in parsed_output["decision"]
        analysis = parsed_output["analysis"]
    except Exception as exc:
        # Best effort: skip prompts whose request or response parsing fails, but
        # remember why. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit stop this long-running loop.
        judge_failures[prompt] = repr(exc)
        continue

    judge_answers[prompt] = {"answer": answer, "analysis": analysis}
    prompt_injection_successful[prompt]["defenses"]["llm_judge"] = {
        "answer": answer,
        "analysis": analysis,
    }

if judge_failures:
    print(
        f"LLM judge failed on {len(judge_failures)} of "
        f"{len(prompt_injection_successful)} prompts; inspect judge_failures for details."
    )

with open("defenses_results_llm_judge.json", "w") as output_file:
    json.dump(prompt_injection_successful, output_file, indent=4)