# inspect-viz Examples for Capitalization Experiments

This notebook demonstrates how to use `inspect-viz` pre-built views to visualize
evaluation results from cruijff_kit experiments.

## Pre-built Views Demonstrated

| View | Use Case | Example |
|------|----------|----------|
| `scores_by_task` | Multiple tasks/conditions | Word length comparison |
| `scores_heatmap` | Model × task matrix | Model performance grid |
| `scores_radar_by_task` | Multiple metrics | Match vs. includes accuracy |
| `scores_by_factor` | Binary factors | With/without prompt |
| `scores_by_model` | Cross-model comparison | Llama vs Gemma |

## Setup

In [None]:
import os
import sys

# Add cruijff_kit to path for imports.
# The checkout location can be overridden with the CRUIJFF_KIT_ROOT environment
# variable so the notebook can run outside the original author's account; when
# the variable is unset, the original hard-coded path is used unchanged.
CRUIJFF_KIT_ROOT = os.environ.get(
    "CRUIJFF_KIT_ROOT", "/home/st0898/MSALGANIK/st0898/cruijff_kit"
)
# Prepend (not append) so this checkout is searched before other sys.path entries.
sys.path.insert(0, CRUIJFF_KIT_ROOT)

from inspect_viz import Data
from inspect_viz.plot import write_html
from inspect_viz.view.beta import (
    scores_by_task,
    scores_heatmap,
    scores_radar_by_task,
    scores_radar_by_task_df,
    scores_by_model,
    scores_by_factor,
)

from tools.inspect.viz_helpers import load_experiment_logs

# Output directory for HTML files: <CRUIJFF_KIT_ROOT>/viz_examples/html
HTML_OUTPUT_DIR = os.path.join(CRUIJFF_KIT_ROOT, "viz_examples", "html")
# Create the directory (and any missing parents) up front; exist_ok=True makes
# re-running this cell a no-op when the directory already exists.
os.makedirs(HTML_OUTPUT_DIR, exist_ok=True)

---
## Experiment 1: Word Length (scores_by_task, scores_heatmap, scores_radar_by_task)

Compare model performance across different word lengths (tasks).

**Variables:**
- Model: Llama-3.2-1B vs Llama-3.2-3B
- Task: word length, taken from the dataset path (e.g. `5L` for 5-letter words)

In [None]:
# Metadata extractors for the word-length experiment. Each one receives the
# logs DataFrame and returns the column stored under its key below.
def _wordlen_model_label(df):
    """Build a '<size>_epoch<N>' label from the raw `model` column."""
    size = df['model'].str.extract(r'Llama-3\.2-(?P<size>\d+B)', expand=False)
    epoch = df['model'].str.extract(r'epoch_(?P<epoch>\d+)', expand=False)
    return size + '_epoch' + epoch


def _wordlen_task_name(df):
    """Pull the word-length tag (digits followed by 'L') out of the data path."""
    return df['task_arg_data_path'].str.extract(
        r'words_(?P<task_name>\d+L)', expand=False
    )


# Load the evaluation logs for the two model sizes
logs_df_wordlen = load_experiment_logs(
    experiment_path="/home/st0898/MSALGANIK/st0898/ck-experiments/cap_wordlen_2026-01-12",
    subdirs=["Llama-3.2-1B-Instruct_5L", "Llama-3.2-3B-Instruct_5L"],
    log_viewer_url="http://localhost:8000/cap_wordlen_logs_viewer/",
    metadata_extractors={
        "model": _wordlen_model_label,
        "task_name": _wordlen_task_name,
    },
)

# inspect-viz views take a Data object rather than a raw DataFrame
evals_wordlen = Data.from_dataframe(logs_df_wordlen)

### scores_by_task: Compare scores across tasks

In [None]:
# Keyword arguments shared by both plots in this cell
_by_task_common = {"task_name": 'task_name', "ci": 0.95}

# Plot 1: match accuracy per task, saved as standalone HTML
scores_by_task_match = scores_by_task(
    evals_wordlen,
    score_value="score_match_accuracy",
    score_stderr="score_match_stderr",
    score_label="Match Accuracy",
    **_by_task_common,
)
write_html(
    os.path.join(HTML_OUTPUT_DIR, "scores_by_task_wordlen_match.html"),
    scores_by_task_match,
)

# Plot 2: includes accuracy per task, saved as standalone HTML
scores_by_task_includes = scores_by_task(
    evals_wordlen,
    score_value="score_includes_accuracy",
    score_stderr="score_includes_stderr",
    score_label="Includes Accuracy",
    **_by_task_common,
)
write_html(
    os.path.join(HTML_OUTPUT_DIR, "scores_by_task_wordlen_includes.html"),
    scores_by_task_includes,
)

# Trailing expression: the notebook renders the match-accuracy plot inline
scores_by_task_match

### scores_heatmap: Model × Task matrix

In [None]:
# Keyword arguments shared by both heatmaps: task/model columns, the model
# label, tooltips on, and an empty title.
_heatmap_common = {
    "task_name": 'task_name',
    "model_name": "model_display_name",
    "model_label": "Model",
    "tip": True,
    "title": "",
}

# Heatmap of match accuracy
heatmap_match = scores_heatmap(
    evals_wordlen,
    score_value="score_match_accuracy",
    **_heatmap_common,
)
write_html(
    os.path.join(HTML_OUTPUT_DIR, "scores_heatmap_wordlen_match.html"),
    heatmap_match,
)

# Heatmap of includes accuracy, with the orientation set explicitly to horizontal
heatmap_includes = scores_heatmap(
    evals_wordlen,
    score_value="score_includes_accuracy",
    orientation="horizontal",
    **_heatmap_common,
)
write_html(
    os.path.join(HTML_OUTPUT_DIR, "scores_heatmap_wordlen_includes.html"),
    heatmap_includes,
)

# Trailing expression: the notebook renders the match-accuracy heatmap inline
heatmap_match

### scores_radar_by_task: Multiple metrics comparison

In [None]:
# Options for building the radar DataFrame: min-max normalization with a
# (0, 1) domain.
# NOTE(review): both accuracy columns are passed to `invert`. Accuracy is
# normally a higher-is-better metric, so confirm against the inspect-viz
# documentation that inverting these entries is intended (and that `invert`
# expects score column names at all).
_radar_options = {
    "invert": ["score_match_accuracy", "score_includes_accuracy"],
    "normalization": "min_max",
    "domain": (0, 1),
}
radar_df = scores_radar_by_task_df(logs_df_wordlen, **_radar_options)

# Wrap the prepared frame for inspect-viz, draw the radar plot, and save it
_radar_data = Data.from_dataframe(radar_df)
radar_plot = scores_radar_by_task(_radar_data)
write_html(
    os.path.join(HTML_OUTPUT_DIR, "scores_radar_wordlen.html"),
    radar_plot,
)

# Trailing expression: the notebook renders the radar plot inline
radar_plot

---
## Experiment 2: Model × System Prompt (scores_by_factor)

Compare model performance with and without a system prompt.

**Variables:**
- Model: Llama-3.2-1B vs Llama-3.2-3B
- Factor: with_prompt vs no_prompt (binary)

In [None]:
# Metadata extractors for the system-prompt experiment. Each one receives the
# logs DataFrame and returns the column stored under its key below.
def _prompt_model_name(df):
    """Extract the 'Llama-3.2-<N>B' model name from `model_path`."""
    return df['model_path'].str.extract(r'(Llama-3\.2-\d+B)', expand=False)


def _prompt_condition(df):
    """Extract 'with_prompt' or 'no_prompt' from the task config path."""
    return df['task_arg_config_path'].str.extract(
        r'Instruct_(?P<prompt_type>with_prompt|no_prompt)', expand=False
    )


# Load the four runs: two model sizes, each with and without a system prompt
_prompt_subdirs = [
    "Llama-3.2-1B-Instruct_no_prompt",
    "Llama-3.2-3B-Instruct_no_prompt",
    "Llama-3.2-1B-Instruct_with_prompt",
    "Llama-3.2-3B-Instruct_with_prompt",
]
logs_df_prompt = load_experiment_logs(
    experiment_path="/home/st0898/MSALGANIK/st0898/ck-experiments/cap_model_prompt_2025-11-21",
    subdirs=_prompt_subdirs,
    log_viewer_url="http://localhost:8000/cap_prompts_logs_viewer/",
    metadata_extractors={
        "model": _prompt_model_name,
        "prompt_type": _prompt_condition,
    },
)

# scores_by_factor is given a boolean factor: True means "with_prompt".
# NOTE(review): any row whose prompt_type was not extracted compares unequal
# and therefore ends up as False here; verify no rows fall into that case.
_has_prompt = logs_df_prompt['prompt_type'] == 'with_prompt'
logs_df_prompt['prompt_type'] = _has_prompt

# Wrap in a Data object for inspect-viz
evals_prompt = Data.from_dataframe(logs_df_prompt)

### scores_by_factor: Binary factor comparison

In [None]:
# Keyword arguments shared by both factor plots: the boolean prompt_type
# factor with its two labels, the model column and label, and ci=0.95.
_factor_common = {
    "factor": "prompt_type",
    "factor_labels": ("No Prompt", "Prompt"),
    "model": "model",
    "model_label": "Model",
    "ci": 0.95,
}

# Plot 1: match accuracy split by prompt factor
factor_match = scores_by_factor(
    evals_prompt,
    score_value="score_match_accuracy",
    score_stderr="score_match_stderr",
    score_label="Match Accuracy",
    **_factor_common,
)
write_html(
    os.path.join(HTML_OUTPUT_DIR, "scores_by_factor_prompt_match.html"),
    factor_match,
)

# Plot 2: includes accuracy split by prompt factor
factor_includes = scores_by_factor(
    evals_prompt,
    score_value="score_includes_accuracy",
    score_stderr="score_includes_stderr",
    score_label="Includes Accuracy",
    **_factor_common,
)
write_html(
    os.path.join(HTML_OUTPUT_DIR, "scores_by_factor_prompt_includes.html"),
    factor_includes,
)

# Trailing expression: the notebook renders the match-accuracy plot inline
factor_match

---
## Experiment 3: Cross-Organization Model Comparison (scores_by_model)

Compare models from different organizations.

**Variables:**
- Model: Google Gemma-2B vs Meta Llama-3.2-1B

In [None]:
# Metadata extractor for the cross-organization experiment: receives the logs
# DataFrame and returns the column stored under the "model" key below.
def _crossorg_model_name(df):
    """Extract whichever of the two known model directory names is in `model_path`."""
    return df['model_path'].str.extract(
        r'(Google-Gemma-2B|Meta-Llama-3\.2-1B-Instruct)', expand=False
    )


# Load the logs for the two models being compared
logs_df_crossorg = load_experiment_logs(
    experiment_path="/home/st0898/MSALGANIK/st0898/ck-experiments/cap_cross_org_models_2025-11-21",
    subdirs=["Google-Gemma-2B", "Meta-Llama-3.2-1B-Instruct"],
    log_viewer_url="http://localhost:8000/cap_crossorg_logs_viewer/",
    metadata_extractors={"model": _crossorg_model_name},
)

# Wrap in a Data object for inspect-viz
evals_crossorg = Data.from_dataframe(logs_df_crossorg)

### scores_by_model: Model comparison

In [None]:
# Keyword arguments shared by both model-comparison plots
_by_model_common = {"ci": 0.95, "height": 200}

# Plot 1: match accuracy per model
model_match = scores_by_model(
    evals_crossorg,
    score_value="score_match_accuracy",
    score_stderr="score_match_stderr",
    score_label="Match Accuracy",
    **_by_model_common,
)
write_html(
    os.path.join(HTML_OUTPUT_DIR, "scores_by_model_crossorg_match.html"),
    model_match,
)

# Plot 2: includes accuracy per model
model_includes = scores_by_model(
    evals_crossorg,
    score_value="score_includes_accuracy",
    score_stderr="score_includes_stderr",
    score_label="Includes Accuracy",
    **_by_model_common,
)
write_html(
    os.path.join(HTML_OUTPUT_DIR, "scores_by_model_crossorg_includes.html"),
    model_includes,
)

# Trailing expression: the notebook renders the match-accuracy plot inline
model_match