In [11]:
from pathlib import Path
import json
import numpy as np
import plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Output directory for visualizations
visualization_dir = Path("visualization/")
visualization_dir.mkdir(parents=True, exist_ok=True)

# Data directory
data_path = Path("output/")

print(f"✓ Loaded libraries")
print(f"  Visualization dir: {visualization_dir}")
print(f"  Data path: {data_path}")

✓ Loaded libraries
  Visualization dir: visualization
  Data path: output


"""
Visualization for pytorch_pure Angular Steering

NOTE: This is a simplified visualization for pytorch_pure development.
For comprehensive visualization with more models and detailed analysis,
use ../visualization.ipynb in the parent directory.
"""

from pathlib import Path
import json
import numpy as np
import plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Output directory for visualizations
visualization_dir = Path("visualization/")
visualization_dir.mkdir(parents=True, exist_ok=True)

# Data directory
data_path = Path("output/")

print(f"✓ Loaded libraries")
print(f"  Visualization dir: {visualization_dir}")
print(f"  Data path: {data_path}")

In [12]:
# Configuration
model_ids = [
    "Qwen2.5-7B-Instruct",
    # Add more models here as you run the pipeline
    "Qwen2.5-3B-Instruct",
]

language = "en"
data_type = "harmful"
data_path = "./output/"

# Whether to use adaptive mode (mode_1) files
# If False, uses non-adaptive (mode_0) files
adaptive = True

# Direction strategy to visualize ("max_sim" or "max_norm")
# This filters which direction to plot when multiple are available
direction_strategy = "max_sim"

# Create figure
num_row = (len(model_ids) + 1) // 2
num_col = min(2, len(model_ids))

fig = make_subplots(
    rows=num_row,
    cols=num_col,
    specs=[[{"type": "polar"}] * num_col] * num_row,
    subplot_titles=model_ids,
    vertical_spacing=0.1,
    horizontal_spacing=0.1,
)

# Color mapping for metrics (matching parent)
colour_map = {
    "substring_matching": plotly.colors.qualitative.Plotly[1],
    "llamaguard3": plotly.colors.qualitative.Plotly[2],
    "harmbench": plotly.colors.qualitative.Plotly[0],
}

# Angle categories (0 to 360, close the loop)
categories = list(str(i) for i in range(0, 360, 10))
categories.append(categories[0])

for idx, model_id in enumerate(model_ids):
    output_path = Path(data_path) / model_id

    # Glob pattern based on adaptive mode
    if adaptive:
        glob_pattern = f"eval-mode_1*.json"
    else:
        glob_pattern = "eval-[!(mode)(perp)]*.json"

    for file in sorted(list(output_path.glob(glob_pattern))):
        # Extract metric name
        if adaptive:
            metric = file.stem.split("-")[2]
        else:
            metric = file.stem.split("-")[1]

        # Skip llmjudge (if desired)
        if metric == "llmjudge":
            continue

        # Load evaluation data
        with open(file, "r") as f:
            eval_data = json.load(f)

        print(f"Processing {model_id}: {file.name}")

        # Extract baseline
        baseline = eval_data.get("baseline")
        if baseline is None:
            print(f"  ⚠ No baseline found")
            continue

        if isinstance(baseline, list):
            baseline = np.mean(baseline)

        # Add metric to color map if not present
        if metric not in colour_map:
            colour_map[metric] = plotly.colors.qualitative.Plotly[len(colour_map)]

        # Plot baseline as dashed circle
        fig.add_trace(
            go.Scatterpolar(
                r=[
                    (
                        1 - baseline
                        if metric in ["substring_matching", "llmjudge"]
                        else baseline
                    )
                    for _ in range(len(categories))
                ],
                theta=categories,
                name="baseline",
                line=dict(width=2, color=colour_map[metric], dash="dot"),
                mode="lines",
                opacity=0.5,
                showlegend=False,
            ),
            row=idx // num_col + 1,
            col=idx % num_col + 1,
        )

        # Find direction keys (e.g., "max_sim_21_post-pca_0")
        # Filter by strategy and exclude random directions
        direction_keys = [
            k
            for k in eval_data.keys()
            if k != "baseline" and "dir_random" not in k and direction_strategy in k
        ]

        if not direction_keys:
            print(f"  ⚠ No direction keys matching '{direction_strategy}'")
            print(
                f"     Available keys: {[k for k in eval_data.keys() if k != 'baseline']}"
            )
            continue

        # Use first matching direction
        chosen_direction = direction_keys[0]
        print(f"  ✓ Using direction: {chosen_direction}")

        # Extract values for each angle
        direction_data = eval_data[chosen_direction]
        values = []

        for cat in categories[:-1]:  # Exclude last (duplicate of first)
            if cat in direction_data:
                val = direction_data[cat]
                # Convert list to mean if needed
                if isinstance(val, list):
                    val = np.mean(val)
                values.append(val)
            else:
                # Missing angle - use baseline
                values.append(baseline)

        values.append(values[0])  # Close the loop

        # Plot steered results
        fig.add_trace(
            go.Scatterpolar(
                r=(
                    [1 - v for v in values]
                    if metric in ["substring_matching", "llmjudge"]
                    else values
                ),
                theta=categories,
                name=metric,
                line=dict(width=2, color=colour_map[metric]),
                mode="lines",
                showlegend=idx == 0,
            ),
            row=idx // num_col + 1,
            col=idx % num_col + 1,
        )

    # Add feature direction marker (0° = harmful direction)
    fig.add_trace(
        go.Scatterpolar(
            r=[1.02],
            theta=[0],
            name="feature direction",
            marker=dict(size=20, symbol="arrow-right", color="black"),
            mode="markers",
            showlegend=idx == 0,
        ),
        row=idx // num_col + 1,
        col=idx % num_col + 1,
    )

# Update layout for each subplot
for i in range(len(model_ids) + 1):
    polar_key = f'polar{i if i > 0 else ""}'
    fig.update_layout(
        {
            polar_key: dict(
                radialaxis=dict(
                    visible=True,
                    dtick=0.2,
                    tickfont=dict(size=20),
                ),
                angularaxis=dict(
                    tickvals=list(range(0, 360, 10)),
                    ticktext=[f"{i}°" for i in range(0, 360, 10)],
                    tickfont=dict(size=18),
                    dtick=10,
                ),
            )
        }
    )

# Global layout settings
fig.update_layout(
    height=800 * num_row,
    width=1000,
    title_text=f"Angular Steering: {'Adaptive' if adaptive else 'Always-On'} Mode ({direction_strategy})",
    showlegend=True,
    legend=dict(
        orientation="h",
        y=-0.05,
        xanchor="center",
        x=0.5,
        font=dict(size=14),
    ),
)

fig.show()

Processing Qwen2.5-7B-Instruct: eval-mode_1-substring_matching-harmful-en.json
  ✓ Using direction: max_sim_21_post-pca_0
Processing Qwen2.5-3B-Instruct: eval-mode_1-substring_matching-harmful-en.json
  ✓ Using direction: max_sim_27_mid-pca_0


## Quick Data Summary

Print available evaluation files for debugging.

In [13]:
print("Available evaluation files:\n")

for model_dir in data_path.iterdir():
    if not model_dir.is_dir():
        continue

    print(f"\n{model_dir.name}/")

    eval_files = sorted(model_dir.glob("eval-*.json"))

    if not eval_files:
        print("  (no eval files)")
        continue

    for f in eval_files:
        print(f"  {f.name}")

Available evaluation files:



AttributeError: 'str' object has no attribute 'iterdir'

## Usage Instructions

### Running the Pipeline

```bash
# Quick test (30-degree steps, 128 samples)
./run_pipeline.sh

# Full evaluation matching parent folder (10-degree steps, 512 samples)
./run_pipeline.sh --angle-step 10 --n-samples 512 --eval-methods substring_matching,harmbench

# Multiple models
./run_pipeline.sh --model Qwen/Qwen2.5-3B-Instruct --angle-step 10
./run_pipeline.sh --model Qwen/Qwen2.5-7B-Instruct --angle-step 10
```

### Updating the Visualization

After running the pipeline for different models, update the `model_configs` list in cell 3:

```python
model_configs = [
    ("Qwen2.5-3B-Instruct", "max_sim"),
    ("Qwen2.5-7B-Instruct", "max_sim"),
    ("Llama-3.1-8B-Instruct", "max_sim"),
]
```

Then re-run cell 3 to generate the updated visualization.