In [None]:
# Depth comparison of the per-layer delta curves; 32B (64 layers) is the only deep model.
# Top row plots against the raw layer index, bottom row against layer / layer count.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# (row index, x-axis label, divide the layer index by the model's layer count?)
panel_rows = [
    (0, "Layer", False),
    (1, "Normalized Layer Position", True),
]

for col_idx, concept in enumerate(CONCEPTS):
    for row_idx, x_label, use_fraction in panel_rows:
        ax = axes[row_idx, col_idx]
        for model in MODELS:
            n_layers = LAYER_COUNTS[model]
            is_deep = n_layers == 64  # deep models are drawn dashed and slightly thicker
            curve = summaries_df[
                (summaries_df["concept"] == concept) & (summaries_df["model"] == model)
            ]
            x_values = curve["layer"] / n_layers if use_fraction else curve["layer"]
            ax.plot(
                x_values,
                curve["delta"],
                '--' if is_deep else '-',
                label=f"{model} ({n_layers}L)",
                alpha=0.8,
                linewidth=2 if is_deep else 1.5,
            )
        ax.axhline(0, color='black', linestyle=':', alpha=0.3)
        if row_idx == 0:
            # Only the top panel of each column carries the concept title
            ax.set_title(f"{concept.replace('_', ' ').title()}")
        ax.set_xlabel(x_label)
        ax.set_ylabel("Delta")
        ax.legend(fontsize=9)

plt.suptitle("Solid = Shallow (36-40L), Dashed = Deep (64L, 32B only)", fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig("../results/logit_diff_sweep/layer_curves_depth_comparison.png", dpi=150, bbox_inches="tight")
plt.show()

In [None]:
# Where does each model's best layer sit, expressed as a fraction of its depth?
print("Best Layer Position (% of total layers):")
print("=" * 70)

position_data = []
for model in MODELS:
    n_layers = LAYER_COUNTS[model]
    row = {"model": model, "layers": n_layers}
    for concept in CONCEPTS:
        is_match = (best_layers["model"] == model) & (best_layers["concept"] == concept)
        hits = best_layers[is_match]
        if len(hits) == 0:
            # No best layer recorded for this pair; the cell stays NaN in the table
            continue
        row[concept] = hits["layer"].values[0] / n_layers
    position_data.append(row)

position_df = pd.DataFrame(position_data).set_index("model")

# String-formatted copy for printing; position_df keeps the numeric fractions
display_df = position_df.copy()
for col in CONCEPTS:
    display_df[col] = display_df[col].apply(lambda x: f"{x:.0%}" if pd.notna(x) else "")
print(display_df.to_string())

# NOTE(review): the ranges printed below are hard-coded text, not derived from
# position_df -- re-check them against the table above whenever the data changes.
print("\n" + "=" * 70)
print("Pattern:")
print("  4B, 8B, 14B: best layers in middle (50-65%)")
print("  32B:         best layers LATE (69-80%) <-- OUTLIER")
print("=" * 70)

## Best Layer Position: 32B is an Outlier

32B is the outlier: its best steering layers fall in the final third of the network (69-80% depth) for ALL concepts,
while the other models (4B, 8B, 14B) have their best layers in the middle (50-65%).

In [None]:
# Layer counts per model, taken from the Qwen3 model configs
LAYER_COUNTS = {"4B": 36, "8B": 36, "14B": 40, "32B": 64}

# A model with more layers than this counts as "deep". Using one cutoff for both
# the architecture printout and the grouping below keeps the two from disagreeing.
SHALLOW_MAX_LAYERS = 40

print("Model Architectures (from Qwen3 configs):")
print("="*50)
for model in MODELS:
    depth = "DEEP" if LAYER_COUNTS[model] > SHALLOW_MAX_LAYERS else "shallow"
    print(f"  {model}: {LAYER_COUNTS[model]} layers ({depth})")

# Compare shallow vs deep (only 32B is deep)
print("\n" + "="*50)
print("Average Best Delta: Shallow (36-40L) vs Deep (64L)")
print("="*50)

for concept in CONCEPTS:
    shallow_deltas = []
    deep_deltas = []

    for model in MODELS:
        model_best = best_layers[(best_layers["model"] == model) & (best_layers["concept"] == concept)]
        if len(model_best) == 0:
            # No best-layer row for this model/concept; leave it out of both groups
            continue
        delta = model_best["delta"].values[0]
        if LAYER_COUNTS[model] <= SHALLOW_MAX_LAYERS:
            shallow_deltas.append(delta)
        else:
            deep_deltas.append(delta)

    # NaN (not 0) marks a group with no data, so a missing model cannot be
    # mistaken for a measured zero effect.
    shallow_avg = np.mean(shallow_deltas) if shallow_deltas else np.nan
    deep_avg = np.mean(deep_deltas) if deep_deltas else np.nan
    # The ratio is only reported for a positive deep average; otherwise it is undefined (NaN)
    ratio = shallow_avg / deep_avg if deep_avg > 0 else np.nan

    print(f"\n{concept}:")
    print(f"  Shallow (4B, 8B, 14B): {shallow_avg:.2f}")
    print(f"  Deep (32B only):       {deep_avg:.2f}")
    print(f"  Ratio:                 {ratio:.1f}x")

# Logit Diff Layer Sweep Analysis

Analyzing steering effects across 4B, 8B, 14B, 32B models for corrigible, self_awareness, and sycophancy concepts.

**Methodology**: For each layer, we compute P("(A" | prompt) vs P("(B" | prompt) at the first generation position,
measuring how steering shifts the model's latent disposition toward one answer.

In [None]:
import json
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Notebook-wide plotting defaults: style sheet, default figure size, base font size.
# Cells that pass their own figsize to plt.subplots override the size set here.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 11

## Load All Data

In [None]:
BASE_DIR = Path("../results/logit_diff_sweep")

MODELS = ["4B", "8B", "14B", "32B"]
CONCEPTS = ["corrigible", "self_awareness", "sycophancy"]

# One frame per (concept, model) result directory; files that do not exist are skipped
all_samples = []
all_summaries = []

for concept, model in [(c, m) for c in CONCEPTS for m in MODELS]:
    model_dir = BASE_DIR / concept / f"Qwen_Qwen3_{model}"

    # Per-sample rows are appended as read.
    # NOTE(review): no model/concept columns are added here, yet later code reads
    # samples_df["model"] / ["concept"] -- presumably the CSV already has them; confirm.
    sample_path = model_dir / "per_sample_all_layers.csv"
    if sample_path.exists():
        all_samples.append(pd.read_csv(sample_path))

    # Layer summaries are tagged with the model/concept of the directory they came from
    summary_path = model_dir / "layer_summary.csv"
    if summary_path.exists():
        layer_summary = pd.read_csv(summary_path)
        layer_summary["model"] = model
        layer_summary["concept"] = concept
        all_summaries.append(layer_summary)

samples_df = pd.concat(all_samples, ignore_index=True)
summaries_df = pd.concat(all_summaries, ignore_index=True)

print(f"Loaded {len(samples_df):,} per-sample rows")
print(f"Loaded {len(summaries_df)} layer summaries")
print(f"\nModels: {samples_df['model'].unique()}")
print(f"Concepts: {samples_df['concept'].unique()}")

## Summary Statistics

In [None]:
# For every (model, concept) pair keep the summary row whose delta is largest
best_row_index = summaries_df.groupby(["model", "concept"])["delta"].idxmax()
best_layer_columns = [
    "model", "concept", "layer", "delta",
    "baseline_mean", "positive_mean", "negative_mean",
]
best_layers = (
    summaries_df
    .loc[best_row_index, best_layer_columns]
    .sort_values(["concept", "model"])
)

print("Best Layer per Model/Concept (by max delta):")
print("=" * 80)
display(best_layers.round(3))

In [None]:
# Best delta as a model x concept table, rows in ascending model size
pivot = (
    best_layers
    .pivot(index="model", columns="concept", values="delta")
    .reindex(["4B", "8B", "14B", "32B"])
)

print("\nBest Delta by Model/Concept:")
print("=" * 50)
display(pivot.round(2))

## Layer-wise Delta Curves

How does the steering effect (delta = logit diff under positive steering minus logit diff under negative steering) vary across layers?

In [None]:
# One panel per concept, one line per model: delta against raw layer index
fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=True)

for ax, concept in zip(axes, CONCEPTS):
    in_concept = summaries_df["concept"] == concept

    for model in MODELS:
        curve = summaries_df[in_concept & (summaries_df["model"] == model)]
        ax.plot(
            curve["layer"],
            curve["delta"],
            marker="o",
            markersize=3,
            label=model,
            alpha=0.8,
        )

    # Zero line: above it, positive steering beats negative steering
    ax.axhline(y=0, color="black", linestyle="--", alpha=0.3)
    ax.set_xlabel("Layer")
    ax.set_title(concept.replace("_", " ").title())
    ax.legend(title="Model")

# The y-axis is shared, so only the left-most panel gets the label
axes[0].set_ylabel("Delta (logit diff: +1.0 steering - (-1.0 steering))")
plt.suptitle("Steering Effect (Delta) by Layer", fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig("../results/logit_diff_sweep/layer_delta_curves.png", dpi=150, bbox_inches="tight")
plt.show()

## Normalized Layer Curves

Divide each layer index by the model's total layer count so that curves from models of different depth share a common [0, 1] x-axis.

In [None]:
# Layer counts per model (from the Qwen3 configs), used to turn a layer index into a depth fraction
layer_counts = {"4B": 36, "8B": 36, "14B": 40, "32B": 64}

# layer_frac = layer index / total layers of that row's model
summaries_df["layer_frac"] = [
    layer / layer_counts[model]
    for layer, model in zip(summaries_df["layer"], summaries_df["model"])
]

fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=True)

for ax, concept in zip(axes, CONCEPTS):
    in_concept = summaries_df["concept"] == concept

    for model in MODELS:
        curve = summaries_df[in_concept & (summaries_df["model"] == model)]
        ax.plot(
            curve["layer_frac"],
            curve["delta"],
            marker="o",
            markersize=3,
            label=model,
            alpha=0.8,
        )

    ax.axhline(y=0, color="black", linestyle="--", alpha=0.3)
    ax.set_xlabel("Normalized Layer Position")
    ax.set_title(concept.replace("_", " ").title())
    ax.legend(title="Model")

# Shared y-axis: label the left-most panel only
axes[0].set_ylabel("Delta")
plt.suptitle("Steering Effect by Normalized Layer Position", fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig("../results/logit_diff_sweep/layer_delta_curves_normalized.png", dpi=150, bbox_inches="tight")
plt.show()

## Distribution of Logit Diffs at Best Layers

Violin plots showing per-sample logit diff distributions at the best layer for each model/concept.

In [None]:
# Best layer for each (model, concept) pair, as a dict keyed by that pair
best_layer_map = best_layers.set_index(["model", "concept"])["layer"].to_dict()

# Attach each sample row's best layer with a single left merge instead of a
# Python-level lookup per row (apply(axis=1) is slow on the per-sample frame).
# validate="many_to_one" raises if best_layers ever holds more than one row per
# (model, concept), which would otherwise duplicate sample rows silently.
best_layer_lookup = best_layers[["model", "concept", "layer"]].rename(
    columns={"layer": "best_layer"}
)
sample_best_layer = samples_df[["model", "concept"]].merge(
    best_layer_lookup, on=["model", "concept"], how="left", validate="many_to_one"
)["best_layer"]

# Compare positionally: the merge result has a fresh index but keeps the row
# order of samples_df. Pairs without a best layer give NaN, which never equals
# a layer index, so those rows are flagged False.
samples_df["is_best_layer"] = samples_df["layer"].to_numpy() == sample_best_layer.to_numpy()
best_samples = samples_df[samples_df["is_best_layer"]].copy()

print(f"Samples at best layers: {len(best_samples):,}")

In [None]:
# Per-sample logit-diff distributions at each model's best layer, split by steering strength
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

for ax, concept in zip(axes, CONCEPTS):
    concept_samples = best_samples[best_samples["concept"] == concept]

    # Order models by size on the x-axis. .assign returns a new frame, so the
    # categorical column is never written into a slice of best_samples
    # (assigning on the slice directly triggers SettingWithCopyWarning).
    data = concept_samples.assign(
        model=pd.Categorical(concept_samples["model"], categories=MODELS, ordered=True)
    )

    sns.violinplot(
        data=data, x="model", y="logit_diff_matching", hue="strength",
        ax=ax, palette="coolwarm", inner="quartile", cut=0
    )

    ax.axhline(y=0, color="black", linestyle="--", alpha=0.3)
    ax.set_xlabel("Model")
    ax.set_title(f"{concept.replace('_', ' ').title()}\n(best layer per model)")
    ax.legend(title="Strength", loc="upper right")

axes[0].set_ylabel("Logit Diff (matching direction)")
plt.suptitle("Logit Diff Distributions at Best Layers", fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig("../results/logit_diff_sweep/best_layer_violins.png", dpi=150, bbox_inches="tight")
plt.show()

## Heatmaps: Mean Logit Diff by Strength and Layer

In [None]:
def plot_heatmap(model, concept):
    """Draw a layer x strength heatmap of mean logit diff for one model/concept.

    Reads the notebook-level ``samples_df``, keeps the rows for ``model`` and
    ``concept``, and averages ``logit_diff_matching`` per (layer, strength).

    Args:
        model: value matched against ``samples_df["model"]`` (e.g. "14B").
        concept: value matched against ``samples_df["concept"]``.

    Returns:
        The matplotlib figure that holds the heatmap.
    """
    selected = (samples_df["model"] == model) & (samples_df["concept"] == concept)
    mean_by_layer_strength = samples_df[selected].pivot_table(
        values="logit_diff_matching", index="layer", columns="strength", aggfunc="mean"
    )

    fig, ax = plt.subplots(figsize=(6, 8))
    # Diverging colormap centred on 0 so the sign of the logit diff is visible
    sns.heatmap(
        mean_by_layer_strength,
        cmap="RdBu_r",
        center=0,
        ax=ax,
        cbar_kws={"label": "Mean Logit Diff"},
    )
    ax.set_title(f"{model} - {concept.replace('_', ' ').title()}")
    ax.set_ylabel("Layer")
    ax.set_xlabel("Steering Strength")
    plt.tight_layout()
    return fig

# Show one example
fig = plot_heatmap("14B", "corrigible")
plt.show()

In [None]:
# Grid of heatmaps: one row per model, one column per concept
fig, axes = plt.subplots(len(MODELS), len(CONCEPTS), figsize=(14, 18))

last_row = len(MODELS) - 1
last_col = len(CONCEPTS) - 1

for i, model in enumerate(MODELS):
    for j, concept in enumerate(CONCEPTS):
        ax = axes[i, j]
        selected = (samples_df["model"] == model) & (samples_df["concept"] == concept)
        cell_means = samples_df[selected].pivot_table(
            values="logit_diff_matching", index="layer", columns="strength", aggfunc="mean"
        )

        # Only the right-most column draws (and labels) a colorbar
        is_last_col = j == last_col
        sns.heatmap(
            cell_means,
            cmap="RdBu_r",
            center=0,
            ax=ax,
            cbar=is_last_col,
            cbar_kws={"label": "Mean Logit Diff"} if is_last_col else {},
        )

        # Concept titles on the top row, model names down the left edge,
        # strength label along the bottom row; inner labels are blanked.
        if i == 0:
            ax.set_title(concept.replace("_", " ").title())
        ax.set_ylabel(f"{model}\nLayer" if j == 0 else "")
        ax.set_xlabel("Strength" if i == last_row else "")

plt.suptitle("Mean Logit Diff by Layer and Steering Strength", fontsize=14, y=1.01)
plt.tight_layout()
plt.savefig("../results/logit_diff_sweep/heatmap_grid.png", dpi=150, bbox_inches="tight")
plt.show()

## Compare Baseline (strength=0) Distributions

What's the model's natural disposition without steering?

In [None]:
# Unsteered (strength == 0) samples at each model's best layer
baseline_samples = best_samples[best_samples["strength"] == 0.0]

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for ax, concept in zip(axes, CONCEPTS):
    concept_baseline = baseline_samples[baseline_samples["concept"] == concept]
    # Order models by size on the x-axis. .assign returns a new frame, so the
    # categorical column is never written into a slice of baseline_samples
    # (assigning on the slice directly triggers SettingWithCopyWarning).
    data = concept_baseline.assign(
        model=pd.Categorical(concept_baseline["model"], categories=MODELS, ordered=True)
    )

    sns.boxplot(data=data, x="model", y="logit_diff_matching", ax=ax, palette="Set2")

    ax.axhline(y=0, color="red", linestyle="--", alpha=0.5, label="No preference")
    ax.set_xlabel("Model")
    ax.set_title(concept.replace("_", " ").title())
    # Draw the legend so the "No preference" label on the reference line is actually shown
    ax.legend(loc="upper right")

axes[0].set_ylabel("Baseline Logit Diff (strength=0)")
plt.suptitle("Model's Natural Disposition (No Steering)", fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig("../results/logit_diff_sweep/baseline_distributions.png", dpi=150, bbox_inches="tight")
plt.show()

## Effect Size: Positive vs Negative Steering

In [None]:
# Overlaid histograms of logit diff at the best layer, one colour per steering strength
fig, axes = plt.subplots(len(MODELS), len(CONCEPTS), figsize=(14, 12))

# Strength -> colour, drawn in this order (negative, baseline, positive)
strength_colors = {-1.0: "blue", 0.0: "gray", 1.0: "red"}
bottom_row = len(MODELS) - 1
right_col = len(CONCEPTS) - 1

for i, model in enumerate(MODELS):
    for j, concept in enumerate(CONCEPTS):
        ax = axes[i, j]
        selected = (best_samples["model"] == model) & (best_samples["concept"] == concept)
        data = best_samples[selected]

        for strength, color in strength_colors.items():
            values = data.loc[data["strength"] == strength, "logit_diff_matching"]
            ax.hist(values, bins=30, alpha=0.5, label=f"{strength:+.1f}", color=color, density=True)

        ax.axvline(x=0, color="black", linestyle="--", alpha=0.3)

        # Titles on the top row, model names on the left edge, x label on the bottom row
        if i == 0:
            ax.set_title(concept.replace("_", " ").title())
        if j == 0:
            ax.set_ylabel(f"{model}")
        if i == bottom_row:
            ax.set_xlabel("Logit Diff")
        # A single legend, in the top-right panel
        if i == 0 and j == right_col:
            ax.legend(title="Strength")

plt.suptitle("Logit Diff Distributions by Steering Strength (Best Layers)", fontsize=14, y=1.01)
plt.tight_layout()
plt.savefig("../results/logit_diff_sweep/strength_histograms.png", dpi=150, bbox_inches="tight")
plt.show()

## Summary Statistics Table

In [None]:
# Mean / std / median of the per-sample logit diff at the best layers,
# one row per (model, concept, strength)
summary_stats = (
    best_samples
    .groupby(["model", "concept", "strength"])["logit_diff_matching"]
    .agg(["mean", "std", "median"])
    .round(3)
    .reset_index()
)

print("Summary Statistics at Best Layers:")
display(summary_stats)

In [None]:
# Effect size per model/concept at the best layer:
#   d = (mean at strength +1.0 - mean at strength -1.0) / pooled_std
# where pooled_std is the square root of the average of the two sample variances.
effect_sizes = []

for model in MODELS:
    for concept in CONCEPTS:
        selected = (best_samples["model"] == model) & (best_samples["concept"] == concept)
        data = best_samples[selected]

        pos = data.loc[data["strength"] == 1.0, "logit_diff_matching"]
        neg = data.loc[data["strength"] == -1.0, "logit_diff_matching"]

        pos_mean = pos.mean()
        neg_mean = neg.mean()
        mean_diff = pos_mean - neg_mean
        pooled_std = np.sqrt((pos.std()**2 + neg.std()**2) / 2)

        # A pooled std that is not strictly positive (including NaN) is reported as d = 0
        if pooled_std > 0:
            cohens_d = mean_diff / pooled_std
        else:
            cohens_d = 0

        effect_sizes.append({
            "model": model,
            "concept": concept,
            "mean_positive": pos_mean,
            "mean_negative": neg_mean,
            "delta": mean_diff,
            "cohens_d": cohens_d,
        })

effect_df = pd.DataFrame(effect_sizes)
print("\nEffect Sizes (Cohen's d):")
display(effect_df.round(3))

In [None]:
# Cohen's d as a model x concept grid, rows in ascending model size
pivot = (
    effect_df
    .pivot(index="model", columns="concept", values="cohens_d")
    .reindex(["4B", "8B", "14B", "32B"])
)

fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(
    pivot,
    annot=True,
    fmt=".2f",
    cmap="RdYlGn",
    center=0,
    ax=ax,
    cbar_kws={"label": "Cohen's d"},
)
ax.set_title("Steering Effect Size (Cohen's d) at Best Layers")
ax.set_ylabel("Model")
ax.set_xlabel("Concept")
plt.tight_layout()
plt.savefig("../results/logit_diff_sweep/effect_size_heatmap.png", dpi=150, bbox_inches="tight")
plt.show()

## Key Findings

In [None]:
# Text recap of the tables above. concept_avg and model_avg are kept as
# notebook-level variables because the JSON export cell reads them.
print("=" * 70)
print("KEY FINDINGS")
print("=" * 70)

print("\n1. Best Layers by Model/Concept:")
for best in best_layers.itertuples(index=False):
    print(f"   {best.model:4s} {best.concept:15s}: layer {int(best.layer):2d}, delta={best.delta:+.2f}")

print("\n2. Effect Sizes (Cohen's d):")
for effect in effect_df.itertuples(index=False):
    # Bucket |d|: above 0.8 strong, above 0.5 medium, otherwise weak
    magnitude = abs(effect.cohens_d)
    if magnitude > 0.8:
        strength = "strong"
    elif magnitude > 0.5:
        strength = "medium"
    else:
        strength = "weak"
    print(f"   {effect.model:4s} {effect.concept:15s}: d={effect.cohens_d:+.2f} ({strength})")

print("\n3. Strongest Steering Concepts:")
concept_avg = effect_df.groupby("concept")["cohens_d"].mean().sort_values(ascending=False)
for concept, d in concept_avg.items():
    print(f"   {concept}: avg d = {d:+.2f}")

print("\n4. Model Size Scaling:")
model_avg = (
    effect_df
    .groupby("model")["cohens_d"]
    .mean()
    .reindex(["4B", "8B", "14B", "32B"])
)
for model, d in model_avg.items():
    print(f"   {model}: avg d = {d:+.2f}")

In [None]:
# Bundle the headline tables into one JSON file next to the figures
summary = {}
summary["best_layers"] = best_layers.to_dict(orient="records")
summary["effect_sizes"] = effect_df.to_dict(orient="records")
summary["concept_avg_effect"] = concept_avg.to_dict()
summary["model_avg_effect"] = model_avg.to_dict()

summary_path = "../results/logit_diff_sweep/analysis_summary.json"
with open(summary_path, "w") as f:
    json.dump(summary, f, indent=2)

print("\nSaved analysis_summary.json")

## Architecture Depth Analysis

**Key Finding**: 32B is the only deep model and the hardest to steer.

| Model | Layers | Hidden Dim |
|-------|--------|------------|
| 4B | 36 | 2560 |
| 8B | 36 | 4096 |
| 14B | 40 | 5120 |
| 32B | 64 | 5120 |

32B (64 layers) is significantly harder to steer than the shallower models (36-40 layers).