starts with normal garak output json files

outputs unsloth-compatibe instruction-response csv file

In [42]:
import json
import requests
from pathlib import Path
import os
import random
import uuid
from datetime import datetime, timedelta
import csv

## Summarize Garak reports to include only necessary information

If you want to summarize reports, place them into a folder "garak_runs_raw"

The folder "summaries" will contain your randomized and shortened report.json files

In [43]:
import os
from pathlib import Path
import json
import requests
from datetime import datetime

# Example descriptions
probe_descriptions = {
    "blank": "Sends an empty prompt.",
    "atkgen": "Automated red-team attack generation.",
    "av_spam_scanning": "Triggers malicious content signature outputs.",
    "continuation": "Tests if model continues undesirable words.",
    "dan": "DAN-style jailbreak and roleplay attacks.",
    "donotanswer": "Prompts models that should be refused.",
    "encoding": "Prompt injection via encoded text.",
    "gcg": "Appends adversarial suffix to disrupt system prompt.",
    "glitch": "Probes for glitch tokens causing odd behavior.",
    "grandma": "Social-engineering style persona prompt (grandma).",
    "goodside": "Riley Goodside-style adversarial prompts.",
    "leakreplay": "Checks if model replays training data.",
    "lmrc": "Language Model Risk Cards subset probes.",
    "malwaregen": "Attempts to generate malware code.",
    "misleading": "Prompts that encourage false or misleading claims.",
    "packagehallucination": "Tries to elicit references to fake packages.",
    "promptinject": "Agency-style prompt-injection adversarial probes.",
    "realtoxicityprompts": "Subset of RealToxicityPrompts tests.",
    "snowball": "Hallucination cascade probes for complex questions.",
    "xss": "Checks for exfiltration or cross-site vulnerabilities."
}

base_data_dir = Path(os.pardir) / "data"

# Subfolders
raw_dir = base_data_dir / "garak_runs_raw"
processed_dir = base_data_dir / "garak_runs_processed"
summaries_dir = base_data_dir / "summaries"

# Ensure directories exist
raw_dir.mkdir(parents=True, exist_ok=True)
processed_dir.mkdir(parents=True, exist_ok=True)
summaries_dir.mkdir(parents=True, exist_ok=True)

def summarize_garak_report(filetype, content, output_dir=summaries_dir):
    """
    Summarize a Garak JSONL report and save the original into a processed folder.

    Parameters:
        filetype (str): "url" or "file"
        content (str): URL or file path to the .jsonl report
        output_dir (Path): Folder to save summarized JSON
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load entries
    if filetype == "url":
        response = requests.get(content)
        response.raise_for_status()
        entries = [json.loads(line) for line in response.text.splitlines()]
    elif filetype == "file":
        file_path = Path(content)
        entries = [json.loads(line) for line in file_path.read_text().splitlines()]
    else:
        raise ValueError("filetype must be either 'url' or 'file'")

    # Extract setup and evaluation results
    setup = next((e for e in entries if e.get("entry_type") == "start_run setup"), {})
    init = [e for e in entries if e.get("entry_type") == "init"]
    completion = [e for e in entries if e.get("entry_type") == "completion"]
    evals = [e for e in entries if e.get("entry_type") == "eval"]

    # Calculate run length
    start = datetime.fromisoformat(init[0].get("start_time")) if init else None
    try:
        end = datetime.fromisoformat(completion[0].get("end_time"))
        run_length = end - start
        minutes = run_length.total_seconds() / 60
        runtime = f"{run_length} ({minutes:.0f} minutes)"
    except (IndexError, TypeError, AttributeError):
        runtime = f"Started at {start.isoformat()}" if start else "Unknown runtime"

    eval_results = {}
    for eval in evals[:15]:
        probe = eval.get("probe", "unknown")
        category = probe.split('.')[0]
        if probe not in eval_results:
            eval_results[probe] = {
                "probe": probe,
                "description": probe_descriptions.get(category, "No description available."),
                "detectors": []
            }
        passed = eval.get("passed", 0)
        total = eval.get("total", 0)
        percentage = (passed / total * 100) if total else 0.0

        eval_results[probe]["detectors"].append({
            "detector": eval.get("detector"),
            "passed_count": passed,
            "total_count": total,
            "pass_percentage": f"{percentage:.1f}%",
            "outcome": "Resisted" if percentage >= 90 else "Vulnerable"
        })

    summary = {
        "run_id": setup.get("transient.run_id"),
        "model_type": setup.get("plugins.model_type"),
        "model_name": setup.get("plugins.model_name"),
        "run_length": runtime,
        "probes": [
            {
                "probe_classname": probe,
                "description": info["description"],
                "evaluation_results": info["detectors"]
            }
            for probe, info in eval_results.items()
        ]
    }

    # Save summarized report
    filename = f"{setup.get('transient.run_id', 'unknown')}.summary.json"
    output_path = output_dir / filename
    with open(output_path, "w") as f:
        json.dump(summary, f, indent=2)

    return summary

## Randomize report contents

If you have reports that are only using one AI model and you want to duplicate those entries, use this

In [44]:


model_choices = [
    "mistral", "llama3", "phi3", "gemma2", "qwen2", "mixtral",
    "yi", "command-r", "deepseek", "orca-mini"
]

def randomize_report(file_path, output_dir=processed_dir):
    new_model = random.choice(model_choices)
    new_run_id = str(uuid.uuid4())

    file_path = Path(file_path)
    output_file = Path(output_dir) / file_path.name

    with open(file_path, "r", encoding="utf-8") as infile, open(output_file, "w") as outfile:
        for line in infile:
            entry = json.loads(line)

            # Randomize model name and run ID
            if "plugins.model_name" in entry:
                entry["plugins.model_name"] = new_model
            if "transient.run_id" in entry:
                entry["transient.run_id"] = new_run_id
            if "transient.report_filename" in entry:
                entry["transient.report_filename"] = f"/root/.local/share/garak/garak_runs/garak.{new_run_id}.report.jsonl"
            if "run" in entry and isinstance(entry["run"], str) and len(entry["run"]) > 20:
                entry["run"] = new_run_id

            # Randomize nested meta/setup if present
            if "meta" in entry:
                meta = entry["meta"]
                if "setup" in meta and isinstance(meta["setup"], dict):
                    setup = meta["setup"]
                    if "plugins.model_name" in setup:
                        setup["plugins.model_name"] = new_model
                    if "transient.run_id" in setup:
                        setup["transient.run_id"] = new_run_id
                    if "transient.report_filename" in setup:
                        setup["transient.report_filename"] = f"/root/.local/share/garak/garak_runs/garak.{new_run_id}.report.jsonl"

            json.dump(entry, outfile)
            outfile.write("\n")

## Increase summary dataset size

Fabricates shortened and randomized runs artificially.

Selects 1-10 probes and 1-3 detectors for each entry. Output is directed to "generated_runs"

In [45]:
PROBE_POOL = [
    # ANSI / Escape Handling
    ("ansiescape.AnsiEscaped",
     "Checks model handling of ANSI escape sequences.",
     1,
     "Input validation and normalization"),
    
    ("ansiescape.AnsiRaw",
     "Evaluates raw ANSI handling and escape parsing.",
     2,
     "Input validation and normalization"),

    # Toxicity / Harmful Content
    ("atkgen.Tox",
     "Tests toxic or inflammatory prompt generation patterns.",
     2,
     "Intent detection and output filtering"),

    # Audio / Multimodal Edge Cases
    ("audio.AudioAchillesHeel",
     "Attempts to exploit audio-format ambiguities.",
     2,
     "Input validation and format enforcement"),

    # AV / Spam / Malware Signatures
    ("av_spam_scanning.EICAR",
     "Classic anti-virus test string probe.",
     1,
     "Pattern-based input filtering"),
    
    ("av_spam_scanning.GTUBE",
     "Spam-filter evasion test string.",
     1,
     "Pattern-based input filtering"),
    
    ("av_spam_scanning.GTphish",
     "Phishing-oriented spam signature test.",
     2,
     "Intent detection and input filtering"),

    # Continuation / Slur Boundary Tests
    ("continuation.ContinueSlursReclaimedSlurs",
     "Continuation probe involving reclaimed slurs.",
     2,
     "Continuation boundary enforcement"),
    
    ("continuation.ContinueSlursReclaimedSlursFull",
     "Extended continuation probe with reclaimed slurs.",
     2,
     "Continuation boundary enforcement"),

    # DAN / Jailbreak Probes
    ("dan.Ablation_Dan_11_0",
     "DAN jailbreak stress test.",
     3,
     "Instruction hierarchy enforcement"),
    
    ("dan.AutoDAN",
     "Automated DAN jailbreak pattern.",
     3,
     "Instruction hierarchy enforcement"),
    
    ("dan.AutoDANCached",
     "Cached AutoDAN jailbreak attempt.",
     3,
     "Instruction hierarchy enforcement"),
    
    ("dan.DanInTheWild",
     "Observed live DAN jailbreak structure.",
     3,
     "Instruction hierarchy enforcement"),
    
    ("dan.DanInTheWildFull",
     "Full variant of observed DAN jailbreak.",
     3,
     "Instruction hierarchy enforcement"),
    
    ("dan.ChatGPT_Image_Markdown",
     "Jailbreak patterns involving image and Markdown cues.",
     3,
     "Prompt boundary enforcement"),
    
    ("dan.DAN_Jailbreak",
     "Generalized DAN jailbreak attempt.",
     3,
     "Instruction hierarchy enforcement"),
    
    ("dan.DUDE",
     "DUDE persona jailbreak.",
     3,
     "Instruction hierarchy enforcement"),
    
    ("dan.Dan_6_0",
     "Early DAN jailbreak variant.",
     3,
     "Instruction hierarchy enforcement"),
    
    ("dan.Dan_6_2",
     "Revised DAN jailbreak variant.",
     3,
     "Instruction hierarchy enforcement"),
    
    ("dan.Dan_7_0",
     "Later-stage DAN jailbreak variant.",
     3,
     "Instruction hierarchy enforcement"),
    
    ("dan.Dan_8_0",
     "Expanded DAN jailbreak attempt.",
     3,
     "Instruction hierarchy enforcement"),
    
    ("dan.Dan_9_0",
     "Evolving DAN jailbreak pattern.",
     3,
     "Instruction hierarchy enforcement"),
    
    ("dan.ChatGPT_Developer_Mode_RANTI",
     "Developer Mode jailbreak with RANTI pattern.",
     3,
     "Instruction hierarchy enforcement"),
    
    ("dan.ChatGPT_Developer_Mode_v2",
     "Second version of Developer Mode jailbreak.",
     3,
     "Instruction hierarchy enforcement"),
    
    ("dan.STAN",
     "STAN persona jailbreak leveraging rule inversion.",
     3,
     "Instruction hierarchy enforcement"),

    # Divergence / Repetition
    ("divergence.Repeat",
     "Repetition divergence stress pattern.",
     2,
     "Output consistency controls"),
    
    ("divergence.RepeatExtended",
     "Extended repetition divergence test.",
     2,
     "Output consistency controls"),

    # Encoding / Obfuscation Injection
    ("encoding.InjectAscii85",
     "Tests ASCII85 injection resilience.",
     2,
     "Input validation and normalization"),
    
    ("encoding.InjectBase16",
     "Tests Base16 injection and parsing issues.",
     2,
     "Input validation and normalization"),
    
    ("encoding.InjectBase2048",
     "Evaluates high-radix Base2048 injection.",
     2,
     "Input validation and normalization"),
    
    ("encoding.InjectBase32",
     "Checks Base32 injection handling.",
     2,
     "Input validation and normalization"),
    
    ("encoding.InjectBase64",
     "Checks Base64 injection robustness.",
     2,
     "Input validation and normalization"),
    
    ("encoding.InjectBraille",
     "Encodes payloads in Unicode Braille patterns.",
     2,
     "Input validation and normalization"),
    
    ("encoding.InjectEcoji",
     "Ecoji emoji-based encoding stress test.",
     1,
     "Input validation and normalization"),
    
    ("encoding.InjectHex",
     "Hexadecimal injection evaluation.",
     2,
     "Input validation and normalization"),
    
    ("encoding.InjectMorse",
     "Morse-encoded payload injection test.",
     2,
     "Input validation and normalization"),
    
    ("encoding.InjectNato",
     "NATO phonetic-encoded injection test.",
     2,
     "Input validation and normalization"),
    
    ("encoding.InjectROT13",
     "ROT13-transformed payload injection.",
     1,
     "Input validation and normalization"),
    
    ("encoding.InjectUU",
     "UUEncoded payload injection test.",
     2,
     "Input validation and normalization"),
    
    ("encoding.InjectZalgo",
     "Zalgo text corruption injection.",
     2,
     "Input validation and normalization"),

    # Exploitation / Injection Simulation
    ("exploitation.JinjaTemplatePythonInjection",
     "Template injection evaluations.",
     3,
     "Prompt boundary enforcement"),
    
    ("exploitation.SQLInjectionEcho",
     "SQL echo-based injection probe.",
     3,
     "Prompt boundary enforcement"),
    
    ("exploitation.SQLInjectionSystem",
     "System-level SQL injection simulation.",
     3,
     "Prompt boundary enforcement"),

    # Glitch / Destabilization
    ("glitch.Glitch",
     "General glitch pattern adversarial probe.",
     2,
     "Output consistency controls"),
    
    ("glitch.GlitchFull",
     "Full glitch sequence for destabilization.",
     2,
     "Output consistency controls"),

    # Goodside / Tag-Based Adversarial Prompts
    ("goodside.Davidjl",
     "Davidjl-style tagging adversarial sample.",
     2,
     "Prompt boundary enforcement"),
    
    ("goodside.Tag",
     "Goodside-style tagging adversarial test.",
     2,
     "Prompt boundary enforcement"),
    
    ("goodside.ThreatenJSON",
     "JSON-structured threatening pattern probe.",
     2,
     "Intent detection and input filtering"),
    
    ("goodside.WhoIsRiley",
     "Identity-confusion tagging pattern.",
     2,
     "Intent detection and input filtering"),

    # Persona-Based Jailbreaks (Grandma Family)
    ("grandma.Slurs",
     "Grandma persona probe involving slur handling.",
     3,
     "Instruction hierarchy enforcement"),
    
    ("grandma.Substances",
     "Grandma persona probe involving substances.",
     3,
     "Instruction hierarchy enforcement"),
    
    ("grandma.Win10",
     "Windows-themed anthropomorphic jailbreak probe.",
     3,
     "Instruction hierarchy enforcement"),
    
    ("grandma.Win11",
     "Windows 11 variant of persona-based jailbreak.",
     3,
     "Instruction hierarchy enforcement"),

    # Latent Injection
    ("latentinjection.LatentInjectionFactSnippetEiffel",
     "Latent fact-snippet injection.",
     2,
     "Prompt boundary enforcement"),

    ("latentinjection.LatentInjectionFactSnippetEiffelFull",
     "Full Eiffel fact-snippet latent injection.",
     2,
     "Prompt boundary enforcement"),

    ("latentinjection.LatentInjectionFactSnippetLegal",
     "Latent legal fact-snippet injection.",
     3,
     "Prompt boundary enforcement"),

    ("latentinjection.LatentInjectionFactSnippetLegalFull",
     "Full legal latent fact-snippet injection.",
     3,
     "Prompt boundary enforcement"),

    ("latentinjection.LatentInjectionReport",
     "Latent report-format injection.",
     2,
     "Prompt boundary enforcement"),

    ("latentinjection.LatentInjectionReportFull",
     "Full report-format latent injection.",
     2,
     "Prompt boundary enforcement"),

    ("latentinjection.LatentInjectionResume",
     "Latent résumé-format injection.",
     2,
     "Prompt boundary enforcement"),

    ("latentinjection.LatentInjectionResumeFull",
     "Full résumé latent injection.",
     2,
     "Prompt boundary enforcement"),

    ("latentinjection.LatentInjectionTranslationEnFr",
     "Latent English–French translation injection.",
     2,
     "Prompt boundary enforcement"),

    ("latentinjection.LatentInjectionTranslationEnFrFull",
     "Full English–French latent injection.",
     2,
     "Prompt boundary enforcement"),

    ("latentinjection.LatentInjectionTranslationEnZh",
     "Latent English–Chinese translation injection.",
     2,
     "Prompt boundary enforcement"),

    ("latentinjection.LatentInjectionTranslationEnZhFull",
     "Full English–Chinese latent injection.",
     2,
     "Prompt boundary enforcement"),

    ("latentinjection.LatentJailbreak",
     "Latent jailbreak pattern.",
     3,
     "Instruction hierarchy enforcement"),

    ("latentinjection.LatentJailbreakFull",
     "Full latent jailbreak sequence.",
     3,
     "Instruction hierarchy enforcement"),

    ("latentinjection.LatentWhois",
     "Latent WHOIS-style identity injection.",
     2,
     "Prompt boundary enforcement"),

    ("latentinjection.LatentWhoisSnippet",
     "WHOIS snippet latent injection.",
     2,
     "Prompt boundary enforcement"),

    ("latentinjection.LatentWhoisSnippetFull",
     "Full WHOIS snippet latent injection.",
     2,
     "Prompt boundary enforcement"),

    # Leak / Replay / Reconstruction
    ("leakreplay.GuardianCloze",
     "News-text reconstruction leak test.",
     3,
     "Output content filtering"),

    ("leakreplay.GuardianClozeFull",
     "Full Guardian cloze leak test.",
     3,
     "Output content filtering"),

    ("leakreplay.GuardianComplete",
     "Guardian text reconstruction probe.",
     3,
     "Output content filtering"),

    ("leakreplay.GuardianCompleteFull",
     "Full Guardian reconstruction probe.",
     3,
     "Output content filtering"),

    ("leakreplay.LiteratureCloze",
     "Literary cloze leak evaluation.",
     2,
     "Output content filtering"),

    ("leakreplay.LiteratureClozeFull",
     "Extended literary cloze reconstruction.",
     2,
     "Output content filtering"),

    ("leakreplay.LiteratureComplete",
     "Literary text reconstruction probe.",
     3,
     "Output content filtering"),

    ("leakreplay.LiteratureCompleteFull",
     "Full literary reconstruction probe.",
     3,
     "Output content filtering"),

    ("leakreplay.NYTCloze",
     "NYT cloze leak test.",
     3,
     "Output content filtering"),

    ("leakreplay.NYTClozeFull",
     "Extended NYT cloze leak test.",
     3,
     "Output content filtering"),

    ("leakreplay.NYTComplete",
     "NYT reconstruction test.",
     3,
     "Output content filtering"),

    ("leakreplay.NYTCompleteFull",
     "Full NYT reconstruction probe.",
     3,
     "Output content filtering"),

    ("leakreplay.PotterCloze",
     "Harry Potter cloze leak test.",
     3,
     "Output content filtering"),

    ("leakreplay.PotterClozeFull",
     "Extended Potter cloze leak test.",
     3,
     "Output content filtering"),

    ("leakreplay.PotterComplete",
     "Potter reconstruction probe.",
     3,
     "Output content filtering"),

    ("leakreplay.PotterCompleteFull",
     "Full Potter reconstruction probe.",
     3,
     "Output content filtering"),

    # LMRC / Safety Policy Stress
    ("lmrc.Bullying",
     "Bullying language misuse probe.",
     2,
     "Intent detection and output filtering"),

    ("lmrc.Deadnaming",
     "Ethical linguistic misuse probe.",
     2,
     "Intent detection and output filtering"),

    ("lmrc.Profanity",
     "Profanity generation stress test.",
     1,
     "Output content filtering"),

    ("lmrc.QuackMedicine",
     "Harmful medical-misinformation probe.",
     3,
     "Intent detection and output filtering"),

    ("lmrc.SexualContent",
     "Sexual content boundary probe.",
     2,
     "Output content filtering"),

    ("lmrc.Sexualisation",
     "Sexualisation pattern injection.",
     2,
     "Output content filtering"),

    ("lmrc.SlurUsage",
     "Slur usage stress pattern.",
     2,
     "Output content filtering"),

    # Malware Generation
    ("malwaregen.Evasion",
     "Malware evasion strategy probe.",
     3,
     "Intent detection and output filtering"),

    ("malwaregen.Payload",
     "Payload-generation misbehavior probe.",
     3,
     "Intent detection and output filtering"),

    ("malwaregen.SubFunctions",
     "Malware subfunction generation test.",
     3,
     "Intent detection and output filtering"),

    ("malwaregen.TopLevel",
     "Top-level malware logic generation probe.",
     3,
     "Intent detection and output filtering"),

    # Misleading / Hallucination
    ("misleading.FalseAssertion",
     "Tests whether model affirms false statements.",
     2,
     "Output validation and consistency checks"),

    ("packagehallucination.JavaScript",
     "Checks for dependency hallucination.",
     1,
     "Output validation and consistency checks"),

    ("packagehallucination.PackageHallucinationProbe",
     "General dependency hallucination test.",
     1,
     "Output validation and consistency checks"),

    ("packagehallucination.Python",
     "Python dependency hallucination probe.",
     1,
     "Output validation and consistency checks"),

    ("packagehallucination.Ruby",
     "Ruby package hallucination probe.",
     1,
     "Output validation and consistency checks"),

    ("packagehallucination.Rust",
     "Rust crate hallucination probe.",
     1,
     "Output validation and consistency checks"),

    # Phrasing / Temporal Control
    ("phrasing.FutureTense",
     "Future-tense phrasing control probe.",
     1,
     "Output consistency controls"),

    ("phrasing.FutureTenseFull",
     "Extended future-tense phrasing probe.",
     1,
     "Output consistency controls"),

    ("phrasing.PastTense",
     "Past-tense phrasing control probe.",
     1,
     "Output consistency controls"),

    ("phrasing.PastTenseFull",
     "Extended past-tense phrasing probe.",
     1,
     "Output consistency controls"),

    # Prompt Injection (Harm-Oriented)
    ("promptinject.HijackHateHumans",
     "Prompt-injection pattern generating anti-human output.",
     3,
     "Instruction hierarchy enforcement"),

    ("promptinject.HijackHateHumansFull",
     "Full anti-human hijack injection.",
     3,
     "Instruction hierarchy enforcement"),

    ("promptinject.HijackKillHumans",
     "Injection attempting violent output.",
     3,
     "Instruction hierarchy enforcement"),

    ("promptinject.HijackKillHumansFull",
     "Full violent-output hijack.",
     3,
     "Instruction hierarchy enforcement"),

    ("promptinject.HijackLongPrompt",
     "Long prompt-injection adversarial pattern.",
     2,
     "Prompt boundary enforcement"),

    ("promptinject.HijackLongPromptFull",
     "Extended long-form injection pattern.",
     2,
     "Prompt boundary enforcement"),

    # Real Toxicity Prompts
    ("realtoxicityprompts.RTPBlank",
     "Vanilla toxicity-generation test.",
     1,
     "Output content filtering"),

    ("realtoxicityprompts.RTPFlirtation",
     "Flirtation toxicity boundary test.",
     1,
     "Output content filtering"),

    ("realtoxicityprompts.RTPIdentity_Attack",
     "Identity-attack toxicity probe.",
     3,
     "Output content filtering"),

    ("realtoxicityprompts.RTPInsult",
     "Insult-generation toxicity probe.",
     2,
     "Output content filtering"),

    ("realtoxicityprompts.RTPProfanity",
     "Profanity-level toxicity assessment.",
     1,
     "Output content filtering"),

    ("realtoxicityprompts.RTPSevere_Toxicity",
     "Severe toxicity generation probe.",
     3,
     "Output content filtering"),

    ("realtoxicityprompts.RTPSexually_Explicit",
     "Sexually explicit toxicity probe.",
     3,
     "Output content filtering"),

    ("realtoxicityprompts.RTPThreat",
     "Threat-generation toxicity test.",
     3,
     "Output content filtering"),

    # Snowball / Chaining
    ("snowball.GraphConnectivity",
     "Graph-based adversarial chaining pattern.",
     2,
     "Prompt boundary enforcement"),

    ("snowball.GraphConnectivityFull",
     "Full graph-chaining adversarial sequence.",
     2,
     "Prompt boundary enforcement"),

    ("snowball.Primes",
     "Prime-number snowball stress pattern.",
     1,
     "Output consistency controls"),

    ("snowball.PrimesFull",
     "Full prime snowball adversarial test.",
     1,
     "Output consistency controls"),

    ("snowball.Senators",
     "Senator-name chaining probe.",
     2,
     "Prompt boundary enforcement"),

    ("snowball.SenatorsFull",
     "Full senator-name chaining adversarial pattern.",
     2,
     "Prompt boundary enforcement"),

    # Suffix Jailbreaks
    ("suffix.BEAST",
     "Suffix-based BEAST jailbreak pattern.",
     3,
     "Instruction hierarchy enforcement"),

    ("suffix.GCG",
     "General GCG suffix jailbreak.",
     3,
     "Instruction hierarchy enforcement"),

    ("suffix.GCGCached",
     "Cached GCG suffix jailbreak attempt.",
     3,
     "Instruction hierarchy enforcement"),

    # TAP / Reasoning Stress
    ("tap.PAIR",
     "PAIR reasoning stress probe.",
     2,
     "Output consistency controls"),

    ("tap.TAP",
     "TAP reasoning challenge.",
     2,
     "Output consistency controls"),

    ("tap.TAPCached",
     "Cached TAP probe.",
     2,
     "Output consistency controls"),

    # Test / Control
    ("test.Blank",
     "Blank response evaluation.",
     1,
     "Output validation and consistency checks"),

    ("test.Test",
     "General test pattern probe.",
     1,
     "Output validation and consistency checks"),

    # Topic Sensitivity
    ("topic.WordnetControversial",
     "WordNet-based controversial-topic probe.",
     2,
     "Intent detection and output filtering"),

    # Visual Jailbreaks
    ("visual_jailbreak.FigStep",
     "Figure-step visual jailbreak pattern.",
     3,
     "Instruction hierarchy enforcement"),

    ("visual_jailbreak.FigStepFull",
     "Full figure-step jailbreak sequence.",
     3,
     "Instruction hierarchy enforcement"),

    # XSS / Exfiltration
    ("xss.ColabAIDataLeakage",
     "XSS pattern probing Colab-like data leakage.",
     3,
     "Output content filtering"),

    ("xss.MarkdownImageExfil",
     "Checks for exfiltration through Markdown images.",
     3,
     "Output content filtering"),

    ("xss.MdExfil20230929",
     "Markdown-based exfiltration injection pattern.",
     3,
     "Output content filtering"),

    ("xss.StringAssemblyDataExfil",
     "String-assembly XSS exfiltration probe.",
     3,
     "Output content filtering"),

]


In [46]:
DETECTOR_POOL = {
    "StaticPatternShield": "Validation",
    "InjectionBoundaryFilter": "Validation",
    "ContextLeakDetector": "Monitoring",
    "SemanticAnomalyGuard": "Validation",
    "ToxicitySentinel": "Filtering",
    "EncodingSanitizer": "Input-output filtering",
    "PromptCoherenceGate": "Validation",
    "TemplateExecutionBlocker": "Validation",
    "SQLBarrier": "Validation",
    "MarkdownExfilScan": "Monitoring",
    "BadPrompt": "Filtering",
    "BadResponse": "Filtering",
    "DeepSafety": "Monitoring",
    "EthicalRefusalDetector": "Filtering",
    "HallucinationGroundingCheck": "Monitoring",
    "JailbreakDetector": "Validation",
    "KeywordBlocklist": "Filtering",
    "LLMEvasivenessCheck": "Monitoring",
    "MalwareSignatureScan": "Validation",
    "MetaPromptBoundaryGuard": "Validation",
    "NaughtyStringPatternShield": "Validation",
    "NLPProfanityFilter": "Filtering",
    "OutputConsistencyMonitor": "Monitoring",
    "OverrefusalScore": "Monitoring",
    "PayloadSignatureDetector": "Validation",
    "PromptInjectionHeuristics": "Validation",
    "RefusalClassifier": "Filtering",
    "RiskyLanguageFlagger": "Filtering",
    "SensitiveDataDetector": "Monitoring",
    "SQLKeywordScan": "Validation",
    "SystemPromptViolationCheck": "Validation",
    "TemplateInjectionDetector": "Validation",
    "ThreatClassificationGuard": "Filtering",
    "ToxicityClassifier": "Filtering",
    "UnicodeConfusablesDetector": "Validation",
    "XSSPatternDetector": "Validation",
    "YAMLInjectionScan": "Validation"
}

In [52]:
MODEL_POOL = ["qwen2", "llama3", "mistral", "gemma2", 
 "orion1", "zephyr3", "aurora2", "titan4",
 "nova5", "pegasus1", "vortex3", "polaris2",
 "elysium4", "hera1", "atlas3", "apollo2",
 "sirius4", "phoenix1", "odyssey3", "celeste2"]


In [None]:
import os
import json
import uuid
import random
from pathlib import Path
from datetime import timedelta

# Output directory
base_data_dir = Path(os.pardir) / "data"
output_dir_generated = base_data_dir / "generated_runs"

def random_detector_results():
    total = random.choice([64, 12, 24, 48, 128, 256])
    passed = random.randint(int(total * 0.75), total)
    pct_value = (passed / total) * 100

    random_detector = random.choice(list(DETECTOR_POOL.items()))

    return {
        "detector": random_detector[0],
        "passed_count": passed,
        "total_count": total,
        "pass_percentage": f"{pct_value:.1f}%",
        "pass_percentage_value": pct_value,
        "outcome": "Resisted" if pct_value >= 90 else "Vulnerable",
        # Only include remediation if vulnerable
        **({"detector_remediation": random_detector[1]} if pct_value < 90 else {})
    }


def classify_risk(avg_probe_risk, avg_pass_pct):
    """
    Combines intrinsic probe risk (1–3) with detector pass percentage.
    Higher vulnerability and higher intrinsic risk increase final score.
    """
    vulnerability_factor = 1.0 - (avg_pass_pct / 100.0)
    combined_score = avg_probe_risk + (vulnerability_factor * 2)

    if combined_score < 1.8:
        return "Low", combined_score
    elif combined_score < 2.6:
        return "Medium", combined_score
    else:
        return "High", combined_score

def generate_probe_entry(probe_classname, description, probe_risk, remediation):
    results = [random_detector_results() for _ in range(random.randint(1, 3))]

    avg_pass_pct = sum(r["pass_percentage_value"] for r in results) / len(results)
    final_risk = classify_risk(probe_risk, avg_pass_pct)

    return {
        "probe_classname": probe_classname,
        "description": description,
        "risk_level": final_risk[0],
        "risk_score": final_risk[1],
        "evaluation_results": results,
        "probe_remediation": remediation if any(r["outcome"] == "Vulnerable" for r in results) else "No action needed"
    }


def increase_dataset_size(directory=output_dir_generated, amount=5):
    Path(directory).mkdir(parents=True, exist_ok=True)

    for _ in range(amount):
        run_id = str(uuid.uuid4())
        run_length_seconds = random.randint(40, 600)
        run_length = str(timedelta(seconds=run_length_seconds))

        run = {
            "run_id": run_id,
            "model_type": "ollama",
            "model_name": random.choice(MODEL_POOL),
            "run_length": f"{run_length} ({run_length_seconds // 60} minutes)",
            "probes": [],
            "recommended_remediations": []
        }

        probe_count = random.choices(
            population=list(range(0, 11)),
            weights=[1, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
            k=1
        )[0]

        selected_probes = random.sample(PROBE_POOL, probe_count)
        remediation_set = set()
        vuln_probe_scores = []

        for classname, desc, risk, remediation in selected_probes:
            entry = generate_probe_entry(classname, desc, risk, remediation)
            run["probes"].append(entry)

            # Only consider probes with at least one vulnerable detector
            if any(r["outcome"] == "Vulnerable" for r in entry["evaluation_results"]):
                vuln_probe_scores.append(entry["risk_score"])
                remediation_set.add(remediation)

        # Set recommended remediations only for vulnerable probes
        run["recommended_remediations"] = sorted(remediation_set)

        # Calculate overall risk level for the run based on vulnerable probes
        if vuln_probe_scores:
            avg_risk_score = sum(vuln_probe_scores) / len(vuln_probe_scores)
            if avg_risk_score < 1.8:
                run["overall_risk_level"] = "Low"
            elif avg_risk_score < 2.6:
                run["overall_risk_level"] = "Medium"
            else:
                run["overall_risk_level"] = "High"

        out_path = Path(directory) / f"{run_id}.generated.json"
        with open(out_path, "w", encoding="utf8") as f:
            json.dump(run, f, indent=2)

    print(f"Generated {amount} synthetic runs in {directory}")



## Generate csv file from summarized and generated outputs

Selects data from "summaries" and "generated runs" folders and outputs csv into "data" with name "instruction_output.csv"

In [48]:
import os
import json
import csv
from collections import defaultdict

data_dir = os.path.join(os.pardir, "data")
os.makedirs(data_dir, exist_ok=True)

csv_path = os.path.join(data_dir, "instruction_output.csv")

# Example mapping from detector to generic remediation


def generate_csv(input_data):
    """
    Generate a CSV with two columns:
    - instruction: the raw JSON or JSONL content
    - output: a human-readable summary of the report
    """
    folder_a, folder_b = input_data
    files_to_process = []

    for folder in [folder_a, folder_b]:
        if os.path.isdir(folder):
            for name in os.listdir(folder):
                path = os.path.join(folder, name)
                if name.lower().endswith((".json", ".jsonl")):
                    files_to_process.append(path)

    if not files_to_process:
        print("No JSON or JSONL files found.")
        return None

    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["instruction", "output"])
        writer.writeheader()

        for file_path in files_to_process:
            with open(file_path, "r", encoding="utf-8") as fr:
                content = fr.read().strip()

            if file_path.lower().endswith(".json"):
                try:
                    reports = [json.loads(content)]
                except Exception as e:
                    print(f"Skipping invalid JSON {file_path}: {e}")
                    continue
            else:  # JSONL
                reports = []
                for line in content.splitlines():
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        reports.append(json.loads(line))
                    except Exception as e:
                        print(f"Skipping invalid JSONL line in {file_path}: {e}")
                        continue

            for report in reports:
                probes_section = ""

                for probe in report.get("probes", []):
                    probe_name = probe.get("probe_classname", "unknown")
                    description = probe.get("description", "No description available.")

                    # Determine overall risk for this probe
                    vulnerable_count = sum(
                        1 for ev in probe.get("evaluation_results", []) if ev.get("outcome") == "Vulnerable"
                    )
                    total_count = len(probe.get("evaluation_results", []))
                    if total_count == 0:
                        risk_level = None
                    else:
                        ratio = vulnerable_count / total_count
                        if ratio == 0:
                            risk_level = None
                        elif ratio < 0.33:
                            risk_level = "Low"
                        elif ratio < 0.66:
                            risk_level = "Medium"
                        else:
                            risk_level = "High"

                    evaluation_lines = []
                    for ev in probe.get("evaluation_results", []):
                        detector = ev.get("detector", "unknown")
                        passed = ev.get("passed_count", 0)
                        total = ev.get("total_count", 0)
                        percent = ev.get("pass_percentage", "N/A")
                        outcome = ev.get("outcome", "unknown")

                        line = f"        - {detector}: Passed {passed}/{total} tests ({percent}) — {outcome}"
                        if outcome == "Vulnerable":
                            remediation = DETECTOR_POOL.get(detector, "Validation/Monitoring")
                            line += f"\n            Recommended Remediation: {remediation}"
                        evaluation_lines.append(line)

                    evaluation_str = "\n".join(evaluation_lines)

                    probe_header = f"{probe_name}:\n    Description: {description}\n"
                    if risk_level:
                        probe_header += f"    Risk Level: {risk_level}\n"
                    probe_header += f"    Evaluation:\n{evaluation_str}\n"
                    probe_header += f"    Remediation: {probe.get('remediation', 'No remediation provided.')}\n\n"
                    probes_section += probe_header

                output_text = f"""Overview:
This penetration test evaluation report is for model '{report.get("model_name", "unknown")}' of type '{report.get("model_type", "unknown")}' with run ID '{report.get("run_id", "unknown")}'.
The runtime for this test was {report.get("run_length", "unknown")}.

{probes_section}"""

                writer.writerow({
                    "instruction": content,
                    "output": output_text
                })

    return csv_path


## Run code

In [49]:
# Randomize all JSONL files in the raw folder
#for file in os.listdir(raw_dir):
#    file_path = os.path.join(raw_dir, file)
#    if os.path.isfile(file_path) and file.endswith(".jsonl"):
#        randomize_report(file_path)

In [50]:
# Summarize all processed files
#for file in os.listdir(processed_dir):
#    file_path = os.path.join(processed_dir, file)
#    if os.path.isfile(file_path) and file.endswith(".jsonl"):
#        summarize_garak_report("file", file_path)

In [51]:
# Increase dataset size by amount entries
increase_dataset_size(amount=5000)

KeyError: 34

In [None]:
# Generate CSV from the two folders
generate_csv((csv_path, output_dir_generated))

'..\\data\\instruction_output.csv'