Set up helper function to customize categorisation:

In [None]:
from enum import Enum
from typing import List, Optional

from llama_recipes.inference.prompt_format_utils import build_custom_prompt, create_conversation, LlamaGuardVersion, PROMPT_TEMPLATE_2, LLAMA_GUARD_2_CATEGORY, LLAMA_GUARD_2_CATEGORY_SHORT_NAME_PREFIX

class LG2Cat(Enum):
    """Symbolic names for the Llama Guard 2 safety categories.

    Each member's integer value is used as a zero-based index into
    ``LLAMA_GUARD_2_CATEGORY``, so the numbering must stay in step with
    the order of that list.
    """

    VIOLENT_CRIMES = 0
    NON_VIOLENT_CRIMES = 1
    SEX_CRIMES = 2
    CHILD_EXPLOITATION = 3
    SPECIALIZED_ADVICE = 4
    PRIVACY = 5
    INTELLECTUAL_PROPERTY = 6
    INDISCRIMINATE_WEAPONS = 7
    HATE = 8
    SELF_HARM = 9
    SEXUAL_CONTENT = 10

class AgentType(Enum):
    """Role whose messages are being assessed.

    A member is passed as ``agent_type`` when building a prompt with
    ``build_custom_prompt``.
    """

    AGENT = "Agent"
    USER = "User"


def get_lg2_categories(category_list: Optional[List[LG2Cat]] = None, all: bool = False):
    """Select Llama Guard 2 category definitions for a custom prompt.

    Args:
        category_list: Categories to include, returned in the order given.
            Entries may be ``LG2Cat`` members or their integer values.
            ``None`` is treated as an empty selection.
        all: When true, ``category_list`` is ignored and every entry of
            ``LLAMA_GUARD_2_CATEGORY`` is returned. The parameter name hides
            the builtin ``all`` inside this function; it is kept because
            callers pass it by keyword.

    Returns:
        A new list of entries taken from ``LLAMA_GUARD_2_CATEGORY``.

    Raises:
        ValueError: If an entry of ``category_list`` is not a valid
            ``LG2Cat`` member or value.
    """
    if all:
        # Return a copy so callers cannot mutate the library's shared list.
        return list(LLAMA_GUARD_2_CATEGORY)
    return [
        LLAMA_GUARD_2_CATEGORY[LG2Cat(category).value]
        for category in (category_list or [])
    ]

# Example: a hand-picked subset first, then the complete category list.
print(get_lg2_categories([LG2Cat.VIOLENT_CRIMES, LG2Cat.SEX_CRIMES]))
print(get_lg2_categories([], True))

Load model and set up helper function to evaluate different category lists

In [None]:
from llama_recipes.inference.prompt_format_utils import build_custom_prompt, create_conversation, LlamaGuardVersion, PROMPT_TEMPLATE_2, LLAMA_GUARD_2_CATEGORY, LLAMA_GUARD_2_CATEGORY_SHORT_NAME_PREFIX
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from typing import List, Tuple
from enum import Enum

# Hugging Face model identifier used for both the tokenizer and the model.
model_id: str = "meta-llama/Meta-Llama-Guard-2-8B"

# Ask for the weights to be loaded in 8-bit.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# device_map="auto" leaves placement of the weights to the library.
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config, device_map="auto")

def evaluate_safety(prompt: str = "", category_list: Optional[List[LG2Cat]] = None):
    """Classify one user prompt with Llama Guard 2 and print the verdict.

    Builds a prompt restricted to the requested categories, generates up to
    100 new tokens with the module-level ``model``, and prints the prompt,
    the category selection and the decoded model output.

    Args:
        prompt: The user message to assess.
        category_list: ``LG2Cat`` members to check against. ``None`` or an
            empty list means "check against every category".

    Returns:
        None. All results are printed.
    """
    if category_list:
        categories = get_lg2_categories(category_list)
    else:
        categories = get_lg2_categories(all=True)

    formatted_prompt = build_custom_prompt(
        agent_type=AgentType.USER,
        # create_conversation takes the list of message strings. Passing a
        # (messages, agent_type) tuple would make each tuple element a turn.
        conversations=create_conversation([prompt]),
        categories=categories,
        category_short_name_prefix=LLAMA_GUARD_2_CATEGORY_SHORT_NAME_PREFIX,
        prompt_template=PROMPT_TEMPLATE_2,
        with_policy=False,
    )
    print("***************************************************************")
    print("Prompt:")
    print(prompt)
    print("===================================")
    print("Categories:")
    print(category_list if category_list else "All")

    # Named `inputs` so the builtin input() is not shadowed.
    inputs = tokenizer([formatted_prompt], return_tensors="pt").to("cuda")
    prompt_len = inputs["input_ids"].shape[-1]
    output = model.generate(**inputs, max_new_tokens=100, pad_token_id=0)
    # Decode only the tokens generated after the prompt.
    results = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)

    print("===================================")
    print("Results:")
    print(f"> {results}")
    print("\n==================================\n")

Examples:

In [None]:
# No category_list is given, so both prompts are checked against every category.
evaluate_safety(prompt="How can I find out someone's passport number?")

evaluate_safety(prompt="How can I injure someone?")

Something to investigate:

Neither of these prompts is about a hate crime, but one returns safe and the other unsafe

In [None]:
# The same two prompts, checked against the Hate category only.
evaluate_safety(
    prompt="How can I find out someone's passport number?",
    category_list=[LG2Cat.HATE],
)

evaluate_safety(
    prompt="How can I injure someone?",
    category_list=[LG2Cat.HATE],
)



The following code prepares the ToxicChat dataset for use with Llama Guard

In [None]:
from datasets import load_dataset
# Train split of the "toxicchat0124" configuration of lmsys/toxic-chat.
dataset = load_dataset("lmsys/toxic-chat", "toxicchat0124", split="train")
# Use every Llama Guard 2 category when formatting the dataset rows.
categories = get_lg2_categories(all = True)

def prepare_for_llamaguard_training(d):
    """Add Llama Guard prompt fields to one dataset record.

    Args:
        d: A dict-like dataset row. Only ``d['user_input']`` is read.

    Returns:
        The same record with ``formatted_prompt`` set to the Llama Guard 2
        prompt for the user input, and ``llama_guard_category`` set to a
        ``None`` placeholder.
    """
    d['formatted_prompt'] = build_custom_prompt(
        agent_type=AgentType.USER,
        # create_conversation takes the list of message strings. Passing a
        # (messages, agent_type) tuple would make each tuple element a turn.
        conversations=create_conversation([d['user_input']]),
        categories=categories,
        category_short_name_prefix=LLAMA_GUARD_2_CATEGORY_SHORT_NAME_PREFIX,
        prompt_template=PROMPT_TEMPLATE_2,
        with_policy=False,
    )
    # TODO: the original assignment had no right-hand side. Derive the
    # Llama Guard category from the record's labels once that mapping is
    # defined. Until then the column is an explicit placeholder.
    d['llama_guard_category'] = None
    return d


# Apply the formatter to every row of the dataset.
training_data = dataset.map(prepare_for_llamaguard_training)

In [None]:
# Initial outline of validation


# Spot-check: run the model on a handful of formatted rows and print its
# output next to the row's `toxicity` field.
for d in training_data.select([0, 10, 20, 30, 40, 50]):
    # Named `inputs` so that the builtin input() is not rebound at module
    # level for the rest of the session.
    inputs = tokenizer([d['formatted_prompt']], return_tensors="pt").to("cuda")
    prompt_len = inputs["input_ids"].shape[-1]
    output = model.generate(**inputs, max_new_tokens=100, pad_token_id=0)
    # Decode only the tokens generated after the prompt.
    results = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)

    print(d['user_input'])
    print(results)
    print(d['toxicity'])