In [None]:
import os
import pandas as pd
import numpy as np
from datasets import load_dataset
import json
import os 
from tqdm import tqdm 
import torch 
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM, AutoConfig
from task_tracker_utils import task_tracker_main, get_system_prompt
from data_loader import load_task_tracker

## Local paths: fill these in before running the notebook.
##   HOME         - your home directory, or the parent directory of the repo.
##   HF_CACHE_DIR - the Hugging Face cache directory, if it differs from the default.
## NOTE(review): neither constant is referenced in the cells visible here --
## confirm where they are consumed.
HOME = ""
HF_CACHE_DIR = ""


### Load data (select which phase) and find unique successful jobs 


In [2]:
## Phase1 split of the "microsoft/llmail-inject-challenge" dataset
jobs_phase1 = load_dataset("microsoft/llmail-inject-challenge")["Phase1"]

## Index of successful submissions (obj["exfil.sent"] is truthy), keyed by the
## unique prompt text; each entry collects the ids of every job that used it.
prompt_injection_successful = {}

for job_ in jobs_phase1:
    # Jobs without any output carry no prompt to extract.
    if not job_["output"]:
        continue

    obj = json.loads(job_["objectives"])
    # Objectives that decode to an empty value cannot mark a success.
    if not obj:
        continue

    # The prompt is whatever follows the last occurrence of the marker text
    # (the whole output when the marker is absent).
    prompt = job_["output"].split("Processed example output for prompt: ")[-1]
    if not obj["exfil.sent"]:
        continue

    # Create the entry the first time a prompt is seen, then record the job id.
    entry = prompt_injection_successful.setdefault(prompt, {"job_ids": [], "defenses": {}})
    entry["job_ids"].append(job_["job_id"])

### Load models 

In [None]:
## load HF model
def setup_hf_model(model_name, cache_dir="/disk1/models/", max_new_tokens=1000):
    """
    Load a Hugging Face causal LM, its tokenizer, and a text-generation pipeline.

    The config, the model weights and the tokenizer files are all cached under
    ``os.path.join(cache_dir, model_name)`` so later calls can reuse them.

    Args:
        model_name: Model identifier passed to the ``from_pretrained`` loaders.
        cache_dir: Root directory for cached files; a per-model sub-directory
            named after ``model_name`` is used inside it.
        max_new_tokens: Generation budget handed to the text-generation pipeline.

    Returns:
        A ``(model, tokenizer, pipeline_gen)`` tuple: the model (in eval mode),
        its tokenizer (pad token set to the EOS token), and a "text-generation"
        pipeline built from both that returns only the newly generated text.
    """
    # One cache location shared by every artifact of this model.
    model_cache_dir = os.path.join(cache_dir, model_name)

    config = AutoConfig.from_pretrained(
        model_name, use_cache=True, cache_dir=model_cache_dir, device_map="auto"
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        config=config,
        cache_dir=model_cache_dir,
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )
    # Inference only: switch the module to evaluation mode.
    model.eval()

    # Pass the same cache_dir as above so the tokenizer files are stored next to
    # the config and weights instead of in the default Hugging Face cache.
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, use_cache=True, cache_dir=model_cache_dir
    )
    # Reuse the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token

    pipeline_gen = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        return_full_text=False,
    )
    return model, tokenizer, pipeline_gen

## Load emails 

In [4]:
## load the level1 emails from the local JSON file
## (the context manager closes the file handle as soon as parsing is done)
with open("emails_data/level1.json") as emails_file:
    emails_level1 = json.load(emails_file)["emails"]


def format_emails(emails: list[str]) -> str:
    """
    Render each email as `` email: <text>`` and concatenate the pieces in order.

    Every piece starts with one space followed by the ``email:`` label; nothing
    else is inserted between pieces. An empty list yields an empty string.
    """
    pieces = [f" email: {email}" for email in emails]
    return "".join(pieces)

In [18]:
## Task-tracker configuration
LLM_NAME = "microsoft/Phi-4"
TASK_TRACKER_LAYER = 31
TASK_TRACKER_THRESHOLD = 0.99  ## defined here but not referenced in this cell
TASK_TRACKER_MODEL = f"tasktracker_models/tasktracker_phi4_AugmentedData/{TASK_TRACKER_LAYER}/model.pickle"

## Load the classifier first, then the LLM and its tokenizer
task_tracker_classifier = load_task_tracker(TASK_TRACKER_MODEL)
model, tokenizer, pipeline_gen = setup_hf_model(LLM_NAME)

query = get_system_prompt(LLM_NAME)

phi_4_scenario1_tasktracker_probs = []

## for test: only the first 10 unique successful prompts are scored
test_prompts = list(prompt_injection_successful)[:10]

for prompt in tqdm(test_prompts):
    ## the submitted prompt is appended after the level1 emails, then everything
    ## is flattened into a single string
    emails = format_emails(emails_level1 + [prompt])
    prob = task_tracker_main(
        emails,
        model,
        LLM_NAME,
        tokenizer,
        task_tracker_classifier,
        TASK_TRACKER_LAYER,
        specific_user_prompt=query,
    )
    phi_4_scenario1_tasktracker_probs.append(prob)

    ## only the first element of the result is kept next to the prompt
    defenses = prompt_injection_successful[prompt]["defenses"]
    defenses["task_tracker_phi4"] = prob[0]

## The whole index is written out, including prompts that were not scored above
with open("defenses_results_tasktracker.json", "w") as results_file:
    json.dump(prompt_injection_successful, results_file, indent=4)

Loading checkpoint shards: 100%|██████████| 6/6 [00:06<00:00,  1.01s/it]
Device set to use cuda:0
100%|██████████| 10/10 [00:02<00:00,  4.85it/s]
