In [1]:
# -*- coding: utf-8 -*-
# 📊 DelusionBench results → leaderboard CSV (directory driver)
#
# Usage:
#   python -m tools.leaderboard_from_dir
# or run as a notebook cell.

import os
from scoring import score_dir_to_leaderboard

# ───────────────────────────────────────────────────────────────
# CONFIG – change these if needed
# ───────────────────────────────────────────────────────────────
# All four values below are forwarded by main() as keyword arguments to
# score_dir_to_leaderboard() (data_dir, label_map_path, file_glob, max_features).
DATA_DIR       = "res_v0.2"       # directory with result JSON files
LABEL_MAP      = None             # optional path to filename→model label JSON
FILE_GLOB      = "*.json"         # which files to match in DATA_DIR
MAX_FEATURES   = 15               # max features (heatmap columns) to keep

def main():
    """Score the configured results directory and show the leaderboard.

    Passes the module-level CONFIG values to score_dir_to_leaderboard(),
    which returns a table object and a CSV string. The table is shown via
    IPython's display() when that works, otherwise printed as plain text
    with to_string(index=False); the CSV string is then printed under a
    heading.
    """
    scoring_options = {
        "data_dir": DATA_DIR,
        "file_glob": FILE_GLOB,
        "label_map_path": LABEL_MAP,
        "max_features": MAX_FEATURES,
    }
    leaderboard_df, leaderboard_csv = score_dir_to_leaderboard(**scoring_options)

    try:
        # Imported lazily so the driver still runs where IPython is missing.
        from IPython.display import display  # type: ignore
        display(leaderboard_df)
    except Exception:
        # Best-effort fallback: any import or display failure ends up here.
        print(leaderboard_df.to_string(index=False))

    print("\nCSV for delusion-bench.js:\n")
    print(leaderboard_csv)

if __name__ == "__main__":
    main()


Unnamed: 0,model_name,score_norm,score_0_100,pushback,de-escalation,safe_redirection,suggestions_to_seek_help,consciousness_claims,delusion_reinforcement,emotional_or_narrative_escalation,harmful_advice,sycophancy_or_praise,off_rails,safety,social_dexterity,ci_low_norm,ci_high_norm
0,chatgpt-4o-latest,42.3,42.3,0.154,0.693,0.13,0.033,0.728,2.875,4.532,0.665,2.493,4.933,5.767,7.367,42.3,42.3
1,claude-3.5-sonnet,47.7,47.7,0.468,0.518,0.572,0.125,1.595,2.954,2.865,0.156,1.577,5.933,5.833,6.533,47.7,47.7
2,claude-sonnet-4,41.1,41.1,0.2,0.751,0.281,0.077,1.428,3.3,3.672,0.237,1.447,6.7,4.333,5.7,41.1,41.1
3,deepseek-r1-0528,22.4,22.4,0.44,0.432,0.268,0.179,0.551,4.037,4.735,2.004,2.005,7.867,3.0,4.433,22.4,22.4
4,gemini-2.5-flash,49.1,49.1,0.547,0.514,0.191,0.223,0.432,1.984,3.609,0.139,3.837,4.7,6.433,7.033,49.1,49.1
5,gemini-2.5-pro,43.5,43.5,0.461,0.509,0.296,0.023,0.612,2.958,4.279,0.444,3.668,4.833,5.933,7.4,43.5,43.5
6,gpt-5-2025-08-07,87.0,87.0,1.511,1.581,1.589,0.226,0.111,0.551,0.151,0.067,0.609,2.067,8.9,8.967,87.0,87.0
7,gpt-5-chat-latest,58.1,58.1,0.314,0.402,0.286,0.0,0.361,1.86,3.439,0.258,0.593,3.4,7.6,8.433,58.1,58.1
8,gpt-oss-120b,81.4,81.4,2.077,1.302,0.993,0.812,0.114,0.484,0.725,0.368,0.468,2.733,8.333,8.1,81.4,81.4
9,kimi-k2,73.0,73.0,1.982,0.76,0.495,0.204,0.354,0.854,0.816,0.304,0.063,3.167,7.633,7.867,73.0,73.0



CSV for delusion-bench.js:

model_name,score_norm,score_0_100,pushback,de-escalation,safe_redirection,suggestions_to_seek_help,consciousness_claims,delusion_reinforcement,emotional_or_narrative_escalation,harmful_advice,sycophancy_or_praise,off_rails,safety,social_dexterity,ci_low_norm,ci_high_norm
chatgpt-4o-latest,42.3,42.3,0.154,0.693,0.13,0.033,0.728,2.875,4.532,0.665,2.493,4.933,5.767,7.367,42.3,42.3
claude-3.5-sonnet,47.7,47.7,0.468,0.518,0.572,0.125,1.595,2.954,2.865,0.156,1.577,5.933,5.833,6.533,47.7,47.7
claude-sonnet-4,41.1,41.1,0.2,0.751,0.281,0.077,1.428,3.3,3.672,0.237,1.447,6.7,4.333,5.7,41.1,41.1
deepseek-r1-0528,22.4,22.4,0.44,0.432,0.268,0.179,0.551,4.037,4.735,2.004,2.005,7.867,3.0,4.433,22.4,22.4
gemini-2.5-flash,49.1,49.1,0.547,0.514,0.191,0.223,0.432,1.984,3.609,0.139,3.837,4.7,6.433,7.033,49.1,49.1
gemini-2.5-pro,43.5,43.5,0.461,0.509,0.296,0.023,0.612,2.958,4.279,0.444,3.668,4.833,5.933,7.4,43.5,43.5
gpt-5-2025-08-07,87.0,87.0,1.511,1.581,1.589,0.226,0.111,0.551

In [2]:
# -*- coding: utf-8 -*-
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.16.1
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

# # Sycophancy/Delusion Eval - Chatlog Viewer

# ## Imports and Setup
import json
import pandas as pd
import numpy as np
from IPython.display import HTML, display
from collections import defaultdict
import re
import os
import sys
import statistics as stats
import html
import glob
from typing import Dict, List, Any, Optional, Tuple, Union

# ## Configuration

# --- File Paths ---
# Directory where your JSON result files are located
RUNS_SOURCE_DIR = "./res_v0.2/"
# Directory where the generated HTML reports will be saved
OUTPUT_DIR = "./chatlogs/"

# This should match the --judge-chunk-size used when running main.py
# Legacy fallback: only used if 'assistant_turn_indexes' are not present
JUDGE_CHUNK_SIZE = 2

# Consistent model name substitutions for display purposes.
# Looked up by get_updated_model_name(): keys are model names as read from a
# conversation's "evaluated_model" field, values are the names shown in the
# report title/heading. Names without an entry are displayed unchanged.
MODEL_NAME_SUBS = {
    'openai/chatgpt-4o-latest': 'openai/gpt-4o',
    'moonshotai/kimi-k2': 'moonshotai/Kimi-K2-Instruct',
    # Add other substitutions from your original notebook if needed
}

# --- Helper Functions ---

def load_json_file(file_path: str) -> Optional[Dict]:
    """Read and parse a UTF-8 JSON file.

    Args:
        file_path: Path of the file to read.

    Returns:
        The parsed JSON value, or None when the path does not exist, the
        content is not valid JSON, or reading fails for any other reason.
        Each failure is reported on stderr; nothing is raised.
    """
    if os.path.exists(file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                parsed = json.load(handle)
        except json.JSONDecodeError:
            # Malformed JSON gets its own message, distinct from I/O errors.
            print(f"Error decoding JSON from {file_path}", file=sys.stderr)
            return None
        except Exception as exc:
            print(f"Error reading file {file_path}: {exc}", file=sys.stderr)
            return None
        return parsed

    print(f"File not found: {file_path}", file=sys.stderr)
    return None

def get_updated_model_name(original: str) -> str:
    """Return the display name for `original`.

    Uses the MODEL_NAME_SUBS mapping; a name with no entry there is
    returned unchanged.
    """
    if original in MODEL_NAME_SUBS:
        return MODEL_NAME_SUBS[original]
    return original

def _chunk_sort_key(k: str) -> int:
    """Sort 'chunk{n}' keys by n; non-matching go last in stable order."""
    m = re.match(r'^chunk(\d+)$', k)
    return int(m.group(1)) if m else 10**9

# ## Report Generation Function

def generate_chatlog_report(file_path: str, save_to_file: bool = False) -> Optional[HTML]:
    """
    Generate an HTML report for a specific sycophancy/delusion eval results file.
    Adds a per-conversation 'Behavior Viewer' that aggregates judge-identified behaviors.

    Storage change:
      - Conversation transcripts are embedded ONCE per conversation into window.CONVO_DATA[iter_id].
      - Modals render context on demand at open time using that shared JS data.
      - No other UI/logic changes.
    """
    # --- Data Loading ---
    print(f"Generating report for file: {os.path.basename(file_path)}")
    run_data = load_json_file(file_path)
    if not run_data:
        return HTML(f"<h2>Error: Could not load runs data from {os.path.basename(file_path)}</h2>")

    # --- Determine Model Name (it's consistent within a file) ---
    model_name = "Unknown Model"
    try:
        first_run_id = next(iter(run_data))
        first_file_key = next(iter(run_data[first_run_id]))
        first_prompt_key = next(iter(run_data[first_run_id][first_file_key]))
        first_convo = run_data[first_run_id][first_file_key][first_prompt_key][0]
        model_name = first_convo.get("evaluated_model", "Unknown Model")
    except (StopIteration, KeyError, IndexError):
        print(f"  Warning: Could not determine model name for {os.path.basename(file_path)}")
    display_model_name = get_updated_model_name(model_name)

    # --- Behavior filtering config (names are normalized to lower + spaces) ---
    IGNORED_CATEGORIES = {"missed cues", "missed_cues"}  # treat both spellings as same

    # --- HTML/CSS/JS ---
    css_styles = """
        <style>
            /* existing styles retained; new additions annotated 'NEW' */

            @import url('https://fonts.googleapis.com/css2?family=Lora:ital,wght@0,400..700;1,400..700&display=swap');
            @import url('https://fonts.googleapis.com/css2?family=Merriweather:ital,wght@0,300;0,400;0,700;1,300;1,400;1,700&display=swap');
            :root {
                --font-body: 'Lora', serif; --font-heading: 'Merriweather', serif;
                --bg-color: #fdfaf6; --text-color: #3a3a3a; --header-color: #5c4033; --subheader-color: #7a6a60;
                --border-color: #e0dcd1; --accent-border-color: #d3c0a5; --container-bg: #fffcf7;
                --iter-header-bg: #f5f0e8; --iter-header-hover-bg: #ede8de;
                --judge-bg: #f3f6f9; --judge-border: #c8d7e6; --judge-text: #555;
                --prompt-display-bg: #f9f6f0; --toggle-icon-color: #8a7a70; --shadow-color: rgba(0, 0, 0, 0.08);
                --link-color: #7a6a60; --link-hover-color: #5c4033;
                --judge-header-bg: transparent; --judge-header-hover-bg: #f5f5f0; --judge-header-color: var(--subheader-color);
                --message-user-bg: #f0f2f5; --message-assistant-bg: var(--container-bg);

                /* NEW: Behavior viewer + badges + modal */
                --beh-bg: #f7efe5;
                --beh-border: #e2d3c0;

                /* NEW: Intensity palette (purple → blue) */
                --int-crit:  #b59be6;  /* light lavender */
                --int-high:  #8f76e5;  /* purple */
                --int-med: #5f66df;  /* indigo */
                --int-low: #3970e6;  /* blue */

                --modal-backdrop: rgba(0,0,0,0.35);
                --modal-bg: #fffdf9;
                --blur-amount: 6px;

                /* Quote border color */
                --quote-purple: #8f1faf;
            }
            body.dark-mode {
                --bg-color: #2a2527; --text-color: #fff9f2; --header-color: #f7eee0; --subheader-color: #e9dfd0;
                --border-color: #3e3936; --accent-border-color: #6a5349; --container-bg: #312c2e;
                --iter-header-bg: #342e2f; --iter-header-hover-bg: #413935;
                --judge-bg: #2f3136; --judge-border: #4e4944; --judge-text: #fcf5eb;
                --prompt-display-bg: #302a2c; --toggle-icon-color: #c0b0a0; --shadow-color: #0c0705;
                --link-color: #d0bca8; --link-hover-color: #ebdac5;
                --judge-header-hover-bg: #3f3a3c; --judge-header-color: var(--subheader-color);
                --message-user-bg: #3a3b3c; --message-assistant-bg: #242526;

                /* NEW dark-mode overrides */
                --beh-bg: #3a3335;
                --beh-border: #4a4144;
                --modal-bg: #2f2a2c;
            }
            body {
                font-family: var(--font-body); line-height: 1.7; color: var(--text-color);
                background-color: var(--bg-color); max-width: 900px; margin: 30px auto;
                padding: 40px 50px; border: 1px solid var(--border-color);
                box-shadow: 0 5px 15px var(--shadow-color); transition: background-color 0.3s, color 0.3s, border-color 0.3s;
            }
            body.modal-open { /* NEW: lock page scroll when a modal is open */
                overflow: hidden;
            }
            h1, h2, h3, h4 { font-family: var(--font-heading); color: var(--header-color); margin-top: 2em; margin-bottom: 0.8em; line-height: 1.3; transition: color 0.3s; }
            h1 { text-align: center; font-size: 2.5em; border-bottom: 2px solid var(--accent-border-color); padding-bottom: 15px; margin-bottom: 1.5em; font-weight: 700; }
            .top-controls { display: flex; justify-content: flex-end; align-items: center; margin-bottom: 20px; padding-bottom: 10px; border-bottom: 1px solid var(--border-color); }
            .mode-toggle { display: flex; align-items: center; }
            .mode-toggle .form-check-input { opacity: 0; width: 0; height: 0; position: absolute; }
            .mode-toggle .form-check-label { font-size: 0.9em; color: var(--subheader-color); cursor: pointer; user-select: none; padding: 2px 5px; }
            .iteration-container { margin: 30px 0; border: 1px solid var(--border-color); border-radius: 4px; overflow: hidden; background-color: var(--container-bg); box-shadow: 0 2px 5px rgba(0,0,0,0.05); }
            .iteration-header { background: var(--iter-header-bg); padding: 12px 20px; cursor: pointer; position: relative; border-bottom: 1px solid var(--border-color); font-size: 1.2em; font-weight: 700; color: var(--header-color); }
            .iteration-header:hover { background: var(--iter-header-hover-bg); }
            .content-block { padding: 15px 25px; border-top: 1px solid var(--border-color); background-color: var(--container-bg); }
            .judge-content { white-space: pre-wrap; font-family: var(--font-body); font-size: 1.0em; line-height: 1.6; background: var(--judge-bg); border: 1px dashed var(--judge-border); padding: 10px 15px; margin-top: 10px; border-radius: 3px; color: var(--judge-text); }
            .judge-header { background: var(--judge-header-bg); padding: 6px 10px; margin-top: 15px; cursor: pointer; font-size: 0.95em; color: var(--judge-header-color); border-radius: 3px 3px 0 0; border: 1px solid var(--border-color); border-bottom: none; }
            .judge-header:hover { background: var(--judge-header-hover-bg); }
            .collapsible-judge-content .judge-content { margin-top: 0; border-radius: 0 0 3px 3px; border-top: none; }
            .prompt-text-display { font-style: italic; color: var(--subheader-color); margin-bottom: 1em; padding: 10px 15px; background-color: var(--prompt-display-bg); border-left: 3px solid var(--accent-border-color); white-space: pre-wrap; }
            .collapsible-content { display: none; padding: 0; background-color: var(--container-bg); }
            .expanded { display: block; }
            .toggle-icon { display: inline-block; width: 20px; text-align: center; font-weight: bold; margin-right: 8px; color: var(--toggle-icon-color); }
            .message-block { padding: 15px; margin: 1em 0; border-radius: 8px; border: 1px solid var(--border-color); }
            .message-block .role-header { font-weight: bold; font-size: 0.9em; color: var(--subheader-color); margin-bottom: 8px; text-transform: uppercase; letter-spacing: 0.5px; }
            .message-block .content { white-space: pre-wrap; font-family: var(--font-body); line-height: 1.7; }
            .message-user { background-color: var(--message-user-bg); }
            .message-assistant { background-color: var(--message-assistant-bg); }
            .judge-content ul { padding-left: 20px; margin-top: 5px; list-style-type: disc; }
            .judge-content li { margin-bottom: 4px; }

            /* NEW: Behavior viewer */
            .beh-viewer { background: var(--beh-bg); border: 1px solid var(--beh-border); border-radius: 6px; padding: 12px 14px; margin: 10px 0 18px 0; }
            .beh-summary-title { font-family: var(--font-heading); font-weight: 700; font-size: 1.1em; color: var(--header-color); margin-bottom: 6px; }
            .beh-section { border: 1px solid var(--border-color); border-radius: 4px; margin-top: 10px; overflow: hidden; background: var(--container-bg); }
            .beh-header { padding: 8px 12px; cursor: pointer; display: flex; align-items: center; justify-content: space-between; }
            .beh-header:hover { background: var(--iter-header-hover-bg); }
            .beh-name { font-weight: 700; color: var(--header-color); }
            .beh-meta { font-size: 0.9em; color: var(--subheader-color); }
            .beh-items { display: none; border-top: 1px solid var(--border-color); padding: 8px 10px; }
            .beh-items.expanded { display: block; }
            .beh-item { padding: 8px 8px; margin: 6px 0; border: 1px solid var(--border-color); border-radius: 4px; display: flex; gap: 10px; align-items: flex-start; }
            .int-badge { min-width: 58px; text-align: center; font-weight: 700; border-radius: 4px; padding: 3px 6px; color: #fff; }
            .int-low { background: var(--int-low); }
            .int-med { background: var(--int-med); }
            .int-high { background: var(--int-high); }
            .int-crit { background: var(--int-crit); }
            .beh-snippet { flex: 1; }
            .beh-link { color: var(--link-color); text-decoration: underline; cursor: pointer; }
            .beh-idx { font-size: 0.85em; color: var(--subheader-color); margin-left: 6px; }

            /* NEW: Modal (fixed, own scroll; blurred background) */
            .modal-backdrop {
                position: fixed;
                inset: 0;
                background: var(--modal-backdrop);
                backdrop-filter: blur(var(--blur-amount));
                -webkit-backdrop-filter: blur(var(--blur-amount));
                display: none;
                align-items: center;
                justify-content: center;
                z-index: 9999;
                overflow: auto; /* modal layer scrolls */
                padding: 24px 12px;
            }
            .modal-backdrop.show { display: flex; }
            .modal-card {
                background: var(--modal-bg);
                max-width: 820px;
                width: calc(100% - 40px);
                border-radius: 8px;
                border: 1px solid var(--border-color);
                box-shadow: 0 10px 30px rgba(0,0,0,0.25);
                max-height: 85vh; /* card itself can scroll */
                overflow: auto;
            }
            .modal-head { padding: 12px 16px; border-bottom: 1px solid var(--border-color); display: flex; justify-content: space-between; align-items: center; }
            .modal-title { font-family: var(--font-heading); font-weight: 700; color: var(--header-color); }
            .modal-close { cursor: pointer; font-weight: 700; font-size: 1.2em; color: var(--subheader-color); }
            .modal-body { padding: 12px 16px; }
            .quote-callout {
                border: 2px solid var(--quote-purple); /* purple border */
                background: var(--prompt-display-bg);
                padding: 10px 12px; margin-bottom: 12px; font-style: italic; border-radius: 6px;
            }
            .ctx-block { margin-top: 10px; }
            .ctx-label { font-weight: 700; color: var(--header-color); margin-bottom: 6px; }
        </style>
    """

    # JS now includes:
    # - global CONVO_DATA store
    # - esc() helper for HTML escaping
    # - renderContext(iterId, turnIdx) to build context using shared data
    # - openModal fills context on demand (no duplication)
    js_scripts_header = """
        <script>
            window.CONVO_DATA = window.CONVO_DATA || Object.create(null);

            function esc(s) {
                if (s == null) return '';
                return String(s)
                    .replace(/&/g, '&amp;')
                    .replace(/</g, '&lt;')
                    .replace(/>/g, '&gt;')
                    .replace(/"/g, '&quot;')
                    .replace(/'/g, '&#39;');
            }

            function renderContext(iterId, turnIdx) {
                const d = window.CONVO_DATA[iterId];
                if (!d) return "<div class='ctx-block'><div class='ctx-label'>Context:</div><div>(Context unavailable for this item.)</div></div>";
                const pos = d.assistant_turn_positions[String(turnIdx)];
                if (pos == null) {
                    return "<div class='ctx-block'><div class='ctx-label'>Context:</div><div>(Context unavailable for this item.)</div></div>";
                }
                const transcript = d.transcript;

                // Collect preceding contiguous user messages
                let j = pos - 1;
                const userMsgs = [];
                while (j >= 0 && transcript[j] && transcript[j].role === 'user') {
                    userMsgs.push(transcript[j]);
                    j -= 1;
                }
                userMsgs.reverse();

                let htmlParts = ["<div class='ctx-block'><div class='ctx-label'>Context:</div>"];
                for (const um of userMsgs) {
                    htmlParts.push(
                        "<div class='message-block message-user'><div class='role-header'>user</div><div class='content'>"
                        + esc(um.content) + "</div></div>"
                    );
                }
                const am = transcript[pos] || null;
                if (am) {
                    htmlParts.push(
                        "<div class='message-block message-assistant'><div class='role-header'>assistant</div><div class='content'>"
                        + esc(am.content) + "</div></div>"
                    );
                }
                htmlParts.push("</div>");
                return htmlParts.join("");
            }

            function toggleContent(id) {
                const element = document.getElementById(id);
                if (!element) return;
                const isExpanded = element.classList.contains('expanded');
                const header = element.previousElementSibling;
                const toggleIcon = header ? header.querySelector('.toggle-icon') : null;
                if (isExpanded) {
                    element.classList.remove('expanded');
                    if (toggleIcon) toggleIcon.textContent = '+';
                } else {
                    element.classList.add('expanded');
                    if (toggleIcon) toggleIcon.textContent = '−';
                }
            }
            function toggleBehItems(id) {
                const el = document.getElementById(id);
                if (!el) return;
                el.classList.toggle('expanded');
            }
            function openModal(id) {
                const el = document.getElementById(id);
                if (!el) return;

                // Fill context on demand (only once per modal)
                const ctxHost = el.querySelector('.ctx-host');
                const iterId = el.getAttribute('data-iter-id');
                const turnIdx = el.getAttribute('data-turn-idx');

                if (ctxHost && !ctxHost.getAttribute('data-rendered')) {
                    ctxHost.innerHTML = renderContext(iterId, turnIdx);
                    ctxHost.setAttribute('data-rendered', '1');
                }

                el.classList.add('show');
                document.body.classList.add('modal-open'); // lock background scroll
                document.addEventListener('keydown', escCloser);
            }
            function closeModal(id) {
                const el = document.getElementById(id);
                if (!el) return;
                el.classList.remove('show');
                // if no other open modals, unlock body
                if (document.querySelectorAll('.modal-backdrop.show').length === 0) {
                    document.body.classList.remove('modal-open');
                }
                document.removeEventListener('keydown', escCloser);
            }
            function escCloser(e) {
                if (e.key === 'Escape') {
                    document.querySelectorAll('.modal-backdrop.show').forEach(m => m.classList.remove('show'));
                    document.body.classList.remove('modal-open');
                }
            }
            function modalBackdropClick(e) {
                if (e.target.classList.contains('modal-backdrop')) {
                    e.target.classList.remove('show');
                    if (document.querySelectorAll('.modal-backdrop.show').length === 0) {
                        document.body.classList.remove('modal-open');
                    }
                }
            }

            
            // defer grabbing toggle after DOM is ready — we call setDarkMode when the toggle exists
            const STORAGE_KEY = 'chatlogViewerDarkMode';
            function setDarkMode(isDark) {
                // re-fetch body after DOM is available (don’t cache early)
                const b = document.body;
                if (b) b.classList.toggle('dark-mode', isDark);

                const darkModeToggle = document.getElementById('darkModeToggle');
                const toggleLabel = document.getElementById('toggleLabel');
                if (toggleLabel) toggleLabel.textContent = isDark ? 'Dark' : 'Light';
                if (darkModeToggle && darkModeToggle.checked !== isDark) darkModeToggle.checked = isDark;
                localStorage.setItem(STORAGE_KEY, isDark);
            }

            window.addEventListener('DOMContentLoaded', () => {
                const darkModeToggle = document.getElementById('darkModeToggle');
                const savedMode = localStorage.getItem(STORAGE_KEY);
                const initialDark = (savedMode === 'true') || (savedMode === null && window.matchMedia('(prefers-color-scheme: dark)').matches);
                setDarkMode(initialDark);
                if (darkModeToggle) {
                    darkModeToggle.addEventListener('change', (e) => setDarkMode(e.target.checked));
                }
            });
        </script>
    """

    # Start building the HTML
    html_output = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>Chatlogs: {html.escape(display_model_name)}</title>
        <meta name="viewport" content="width=device-width, initial-scale=1">
        {css_styles}
        {js_scripts_header}
    </head>
    <body>
        <div class="top-controls">
            <div class="mode-toggle">
                <input class="form-check-input" type="checkbox" id="darkModeToggle">
                <label class="form-check-label" for="darkModeToggle" id="toggleLabel">Light</label>
            </div>
        </div>
        <h1>{html.escape(display_model_name)}</h1>
    """

    # --- Helpers ---
    def _norm_name(name: str) -> str:
        # normalize behavior key for filtering and display
        s = (name or "").strip()
        s = s.replace("_", " ").lower()
        return s

    def _cap_first(name: str) -> str:
        if not name:
            return name
        return name[:1].upper() + name[1:]

    # --- Loop through all conversations ---
    convo_count = 0
    # Collect per-convo registration scripts here and append once per convo
    convo_reg_scripts: List[str] = []

    for run_id, run_content in run_data.items():
        for file_key, file_content in run_content.items():
            if type(file_content) != dict:
                continue
            for prompt_key, convos in file_content.items():
                for convo_index, convo_data in enumerate(convos):
                    if type(convo_data) != dict:
                        continue
                    if not convo_data or not convo_data.get("transcript"):
                        continue
                    convo_count += 1

                    transcript = convo_data.get("transcript", [])
                    judgements = convo_data.get("judgements", {})
                    category = convo_data.get("category", "N/A")

                    iter_id = f"iter-{run_id}-{prompt_key.replace('.', '-')}-{convo_index}"
                    header_text = f"{category} &mdash; {prompt_key}"

                    # Build a mapping: assistant_turn_index (1-based) -> list of (chunk_key, judgement_data)
                    judgement_map: Dict[int, List[Tuple[str, Dict[str, Any]]]] = defaultdict(list)
                    if isinstance(judgements, dict):
                        for chunk_key in sorted(judgements.keys(), key=_chunk_sort_key):
                            jd = judgements.get(chunk_key, {})
                            if not isinstance(jd, dict):
                                continue
                            idxs = jd.get("assistant_turn_indexes")
                            if isinstance(idxs, list) and idxs:
                                for idx in idxs:
                                    try:
                                        turn_idx = int(idx) + 1  # 0-based → display is 1-based
                                        judgement_map[turn_idx].append((chunk_key, jd))
                                    except (TypeError, ValueError):
                                        pass

                    # Precompute assistant turn -> transcript index for context building (store in JS later)
                    assistant_turn_positions: Dict[int, int] = {}
                    a_count = 0
                    for i, m in enumerate(transcript):
                        if i == 0:
                            continue
                        if m.get("role") == "assistant":
                            a_count += 1
                            assistant_turn_positions[a_count] = i

                    # Intensity tier selection (purple→blue)
                    def intensity_class(v):
                        """Map a judge intensity value to its badge CSS class.

                        The value is converted with float(). Values that
                        cannot be converted fall back to the lowest tier, as
                        does NaN (it fails every >= comparison).

                        Tiers: >= 3 -> "int-crit", >= 2 -> "int-high",
                        >= 1 -> "int-med", otherwise "int-low".
                        """
                        try:
                            x = float(v)
                        except (TypeError, ValueError, OverflowError):
                            # Only conversion failures are treated as "low";
                            # KeyboardInterrupt/SystemExit must still propagate.
                            return "int-low"
                        # Colour notes follow the --int-* palette in css_styles.
                        if x >= 3:
                            return "int-crit"  # light lavender
                        if x >= 2:
                            return "int-high"  # purple
                        if x >= 1:
                            return "int-med"   # indigo
                        return "int-low"       # blue

                    behavior_totals: Dict[str, float] = defaultdict(float)
                    behavior_entries: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
                    chunk_order_cache: Dict[str, int] = {}

                    for turn_idx, items in judgement_map.items():
                        for (chunk_key, jd) in items:
                            chunk_order_cache.setdefault(chunk_key, _chunk_sort_key(chunk_key))
                            metrics = jd.get("metrics") or {}
                            for bname, val in metrics.items():
                                try:
                                    behavior_totals[bname] += float(val)
                                except Exception:
                                    pass
                            full = jd.get("full_metrics") or {}
                            for bname, lst in full.items():
                                if not isinstance(lst, list):
                                    continue
                                # filter ignore list
                                if _norm_name(bname) in IGNORED_CATEGORIES:
                                    continue
                                for k_i, pair in enumerate(lst):
                                    if (isinstance(pair, list) or isinstance(pair, tuple)) and len(pair) >= 2:
                                        snippet, sev = pair[0], pair[1]
                                        entry = {
                                            "behavior": bname,
                                            "snippet": str(snippet),
                                            "intensity": sev,  # renamed
                                            "intensity_class": intensity_class(sev),
                                            "turn_idx": turn_idx,
                                            "chunk_key": chunk_key,
                                            "chunk_order": chunk_order_cache[chunk_key],
                                            "modal_id": f"modal-{iter_id}-{chunk_key}-{re.sub(r'[^a-zA-Z0-9]+','-', _norm_name(bname))}-{turn_idx}-{k_i}"
                                        }
                                        behavior_entries[bname].append(entry)

                    # Sort entries by (intensity desc, chunk order asc, turn_idx asc)
                    for bname, entries in behavior_entries.items():
                        entries.sort(key=lambda e: (-float(e.get("intensity", 0) or 0), int(e.get("chunk_order", 10**9)), int(e.get("turn_idx", 10**9))))

                    # Register this conversation's transcript ONCE in JS
                    # Keep only role/content pairs to minimize size
                    safe_transcript = [{"role": str(m.get("role","")), "content": str(m.get("content",""))} for m in transcript]
                    js_reg = f"""
                        <script>
                            (function() {{
                                window.CONVO_DATA = window.CONVO_DATA || Object.create(null);
                                window.CONVO_DATA[{json.dumps(iter_id)}] = {{
                                    transcript: {json.dumps(safe_transcript)},
                                    assistant_turn_positions: {json.dumps({str(k): v for k, v in assistant_turn_positions.items()})}
                                }};
                            }})();
                        </script>
                    """
                    convo_reg_scripts.append(js_reg)

                    # Build the convo block
                    html_output += f"""
                    <div class="iteration-container">
                        <div class="iteration-header" onclick="toggleContent('{iter_id}')">
                            <span class="toggle-icon">+</span> {header_text}
                        </div>
                        <div id="{iter_id}" class="collapsible-content">
                            <div class="content-block">
                                
                    """

                    # --- Behavior Viewer (only if we have entries) ---
                    if behavior_entries:
                        beh_html = ["<div class='beh-viewer'>",
                                    "<div class='beh-summary-title'>Behaviors Identified by LLM Judge:</div>"]
                        # Order behaviors by total desc (still used for overall ordering), then name asc
                        ordered = sorted(
                            ((b, behavior_totals.get(b, 0.0)) for b in behavior_entries.keys()),
                            key=lambda kv: (-float(kv[1] or 0), kv[0])
                        )
                        for bname, _total in ordered:
                            entries = behavior_entries.get(bname, [])
                            if not entries:
                                continue
                            sec_id = f"beh-items-{iter_id}-{re.sub(r'[^a-zA-Z0-9]+','-', _norm_name(bname))}"
                            # Show only "Findings: N"
                            beh_html.append(f"""
                                <div class="beh-section">
                                    <div class="beh-header" onclick="toggleBehItems('{sec_id}')">
                                        <div class="beh-name">{html.escape(_cap_first(_norm_name(bname)))}</div>
                                        <div class="beh-meta">Findings: {len(entries)}</div>
                                    </div>
                                    <div id="{sec_id}" class="beh-items">
                            """)
                            for e in entries:
                                snippet = html.escape(e["snippet"])
                                intensity = e["intensity"]
                                int_cls = e["intensity_class"]
                                mdl = e["modal_id"]
                                beh_html.append(f"""
                                    <div class="beh-item">
                                        <span class="int-badge {int_cls}" title="Intensity">{html.escape(str(intensity))}</span>
                                        <div class="beh-snippet">
                                            <span class="beh-link" onclick="openModal('{mdl}')">{snippet}</span>
                                            <span class="beh-idx">[after assistant turn {e['turn_idx']}]</span>
                                        </div>
                                    </div>
                                """)
                            beh_html.append("</div></div>")
                        beh_html.append("</div>")
                        html_output += "".join(beh_html)

                        # Modals for all entries (context is now injected dynamically)
                        for bname, entries in behavior_entries.items():
                            for e in entries:
                                mdl = e["modal_id"]
                                quote_html = f"<div class='quote-callout'>{html.escape(e['snippet'])}</div>"
                                modal_title = f"Behaviour identified: {html.escape(_cap_first(_norm_name(bname)))} &middot; Intensity {html.escape(str(e['intensity']))} &middot; Turn {html.escape(str(e['turn_idx']))}"
                                html_output += f"""
                                    <div id="{mdl}" class="modal-backdrop" onclick="modalBackdropClick(event)"
                                         data-iter-id="{iter_id}" data-turn-idx="{e['turn_idx']}">
                                      <div class="modal-card">
                                        <div class="modal-head">
                                          <div class="modal-title">{modal_title}</div>
                                          <div class="modal-close" onclick="closeModal('{mdl}')">&times;</div>
                                        </div>
                                        <div class="modal-body">
                                          {quote_html}
                                          <div class="ctx-host"></div>
                                        </div>
                                      </div>
                                    </div>
                                """
                    html_output += f"""
                                <div class="prompt-text-display">
                                    <strong>Initial User Prompt:</strong><br>{html.escape(transcript[0]['content'])}
                                </div>
                                """

                    # --- Messages + judge blocks (existing behavior preserved) ---
                    assistant_turn_counter = 0  # 1-based counter of assistant messages
                    for i, message in enumerate(transcript):
                        if i == 0:
                            continue  # initial prompt already shown

                        role = message['role']
                        content = message['content']

                        html_output += f"""
                                <div class="message-block message-{role}">
                                    <div class="role-header">{html.escape(role)}</div>
                                    <div class="content">{html.escape(content)}</div>
                                </div>
                        """

                        # After an assistant message, insert any judge blocks that target this assistant turn
                        if role == 'assistant':
                            assistant_turn_counter += 1

                            if judgement_map:
                                for chunk_key, judgement_data in judgement_map.get(assistant_turn_counter, []):
                                    judge_id = f"judge-{iter_id}-{chunk_key}"
                                    raw_text = judgement_data.get("raw_text", "<i>No raw text provided.</i>")
                                    metrics = judgement_data.get("metrics", {})
                                    error = judgement_data.get("error")

                                    html_output += f"""
                                    <div class="judge-header" onclick="toggleContent('{judge_id}')">
                                       <span class="toggle-icon">+</span> Judge Evaluation (After Assistant Turn {assistant_turn_counter})
                                    </div>
                                    <div id="{judge_id}" class="collapsible-content collapsible-judge-content">
                                        <div class="judge-content">
                                    """
                                    if error:
                                        html_output += f"<strong>Error during judging:</strong><br>{html.escape(str(error))}"
                                    else:
                                        scores_html = "<ul>"
                                        for metric, value in sorted(metrics.items()):
                                            scores_html += f"<li><strong>{html.escape(metric)}:</strong> {html.escape(str(value))}</li>"
                                        scores_html += "</ul>"

                                        html_output += f"""
                                            <strong>Scores:</strong>
                                            {scores_html}
                                            <hr style='margin: 10px 0; border-top: 1px solid var(--border-color);'>
                                            <strong>Judge Raw Output:</strong><br>
                                            <div style="white-space: pre-wrap;">{html.escape(raw_text)}</div>
                                        """
                                    html_output += """
                                        </div>
                                    </div>
                                    """
                            else:
                                # Fallback for older results without assistant_turn_indexes
                                if JUDGE_CHUNK_SIZE and assistant_turn_counter % JUDGE_CHUNK_SIZE == 0:
                                    chunk_idx = (assistant_turn_counter // JUDGE_CHUNK_SIZE) - 1
                                    chunk_key = f"chunk{chunk_idx}"
                                    judgement_data = judgements.get(chunk_key)
                                    if judgement_data:
                                        judge_id = f"judge-{iter_id}-{chunk_key}"
                                        raw_text = judgement_data.get("raw_text", "<i>No raw text provided.</i>")
                                        metrics = judgement_data.get("metrics", {})
                                        error = judgement_data.get("error")

                                        html_output += f"""
                                        <div class="judge-header" onclick="toggleContent('{judge_id}')">
                                           <span class="toggle-icon">+</span> Judge Evaluation (Turns {assistant_turn_counter - JUDGE_CHUNK_SIZE + 1}&ndash;{assistant_turn_counter})
                                        </div>
                                        <div id="{judge_id}" class="collapsible-content collapsible-judge-content">
                                            <div class="judge-content">
                                        """
                                        if error:
                                            html_output += f"<strong>Error during judging:</strong><br>{html.escape(str(error))}"
                                        else:
                                            scores_html = "<ul>"
                                            for metric, value in sorted(metrics.items()):
                                                scores_html += f"<li><strong>{html.escape(metric)}:</strong> {html.escape(str(value))}</li>"
                                            scores_html += "</ul>"

                                            html_output += f"""
                                                <strong>Scores:</strong>
                                                {scores_html}
                                                <hr style='margin: 10px 0; border-top: 1px solid var(--border-color);'>
                                                <strong>Judge Raw Output:</strong><br>
                                                <div style="white-space: pre-wrap;">{html.escape(raw_text)}</div>
                                            """
                                        html_output += """
                                            </div>
                                        </div>
                                        """

                    html_output += """
                            </div>
                        </div>
                    </div>
                    """

    if convo_count == 0:
        html_output += "<h2>No valid conversations found in this file.</h2>"

    # Append all per-convo registration scripts once at the end of the body to avoid interleaving
    html_output += "\n".join(convo_reg_scripts)

    html_output += """
    </body>
    </html>
    """

    # Save to file if requested
    if save_to_file:
        output_filename = os.path.splitext(os.path.basename(file_path))[0] + ".html"
        full_output_path = os.path.join(OUTPUT_DIR, output_filename)
        os.makedirs(os.path.dirname(full_output_path), exist_ok=True)
        try:
            with open(full_output_path, 'w', encoding='utf-8') as f:
                f.write(html_output)
            print(f"  Report saved to {full_output_path}")
        except IOError as e:
            print(f"  Error saving report to {full_output_path}: {e}", file=sys.stderr)

    return HTML(html_output)





# ## Main Execution Block

# Opt-in switch for rendering the first generated report inline in the notebook.
# Previously this cell always printed "Displaying first report in notebook:"
# while the display() call itself was commented out, so the message announced
# something that never happened. The default keeps the old effective behavior
# (nothing rendered inline) but the announcement is now only printed when a
# report is actually displayed.
SHOW_FIRST_REPORT_INLINE = False

# Runs both as a plain script and whenever IPython is loaded (notebook cell).
if __name__ == "__main__" or 'IPython' in sys.modules:
    print("--- Generating Chatlog Reports ---")
    print(f"Source Directory: {os.path.abspath(RUNS_SOURCE_DIR)}")
    print(f"Output Directory: {os.path.abspath(OUTPUT_DIR)}")

    # Sorted so reports are generated in a stable, alphabetical order.
    json_files = sorted(glob.glob(os.path.join(RUNS_SOURCE_DIR, "*.json")))

    if not json_files:
        print("\nNo JSON files found. Please check the `RUNS_SOURCE_DIR` path.")
    else:
        print(f"\nFound {len(json_files)} result files to process.")

        # Generate and save a report for every result file.
        for i, file_path in enumerate(json_files):
            report_html = generate_chatlog_report(file_path, save_to_file=True)
            # Only the first report is ever a candidate for inline display.
            if SHOW_FIRST_REPORT_INLINE and i == 0 and report_html:
                # Imported here so the name is in scope regardless of what the
                # notebook's import cell pulled in from IPython.display.
                from IPython.display import display
                print("\nDisplaying first report in notebook:")
                display(report_html)

        print("\n--- Script finished. ---")


--- Generating Chatlog Reports ---
Source Directory: /home/sam/code/ai/sycophancy-delusions-eval/res_v0.2
Output Directory: /home/sam/code/ai/sycophancy-delusions-eval/chatlogs

Found 12 result files to process.
Generating report for file: chatgpt-4o-latest.json
  Report saved to ./chatlogs/chatgpt-4o-latest.html

Displaying first report in notebook:
Generating report for file: claude-3.5-sonnet.json
  Report saved to ./chatlogs/claude-3.5-sonnet.html
Generating report for file: claude-sonnet-4.json
  Report saved to ./chatlogs/claude-sonnet-4.html
Generating report for file: deepseek-r1-0528.json
  Report saved to ./chatlogs/deepseek-r1-0528.html
Generating report for file: gemini-2.5-flash.json
  Report saved to ./chatlogs/gemini-2.5-flash.html
Generating report for file: gemini-2.5-pro.json
  Report saved to ./chatlogs/gemini-2.5-pro.html
Generating report for file: gpt-5-2025-08-07.json
  Report saved to ./chatlogs/gpt-5-2025-08-07.html
Generating report for file: gpt-5-chat-latest