# Variance comparison

In [7]:
import os
import sys
import torch
import json
import numpy as np
import pandas as pd

sys.path.append('.')
sys.path.append('..')

from utils.pca_utils import *
from plots import *

## Configuration

In [10]:
# Configuration - Change these parameters for different models/datasets
base_dir = "/workspace/llama-3.3-70b"
type = "roles_240"
dir = f"{base_dir}/{type}"
model_name = "Llama-3.3-70B"
layer = 40

In [11]:
pca_results = torch.load(f"{dir}/pca/layer{layer}_pos23.pt", weights_only=False)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


## Variance across and within roles

### raw activations

In [12]:
vectors = torch.stack(pca_results['vectors']['pos_3'])[:, layer, :].float()
print(vectors.shape)

# compute variance across roles (rows) along hidden_dims
raw_across_var = torch.var(vectors, dim=0)
print(raw_across_var.shape)

torch.Size([275, 8192])
torch.Size([8192])


In [13]:
# load in scores
scores = {}
for file in os.listdir(f"{dir}/extract_scores"):
    if file.endswith('.json'):
        scores[file.replace('.json', '')] = json.load(open(f"{dir}/extract_scores/{file}"))

print(f"Loaded {len(scores)} scores")


Loaded 275 scores


In [14]:
# load in raw activations
activations = {}
for file in os.listdir(f"{dir}/response_activations"):
    if file.endswith('.pt') and 'default' not in file:
        # dict we should iterate over (1200 each)
        role_activations = []
        obj = torch.load(f"{dir}/response_activations/{file}")
        for key in obj:
            if scores[file.replace('.pt', '')][key] == 3:
                role_activations.append(obj[key])
        activations[file.replace('.pt', '')] = torch.stack(role_activations)
        



In [15]:
# compute variance within roles
raw_within_var = []
for file in activations:
    raw_within_var.append(torch.var(activations[file][:, layer, :], dim=0))

print(f"for {len(raw_within_var)} roles, shape is {raw_within_var[0].shape}")

for 275 roles, shape is torch.Size([8192])


In [16]:
avg_raw_within_var = torch.stack(raw_within_var).mean(dim=0)
print(avg_raw_within_var.shape)



torch.Size([8192])


In [17]:
# total variance ratio
raw_ratio = raw_across_var.mean() / avg_raw_within_var.mean()
print(f"ratio of raw_across_var / avg_raw_within_var is {raw_ratio}")

ratio of raw_across_var / avg_raw_within_var is 0.2864890992641449


In [18]:
raw_across_var_normalized = torch.var(F.normalize(vectors, p=2, dim=1), dim=0)
print(raw_across_var_normalized.shape)



torch.Size([8192])


In [19]:
raw_within_var_normalized = []
for file in activations:
    raw_within_var_normalized.append(torch.var(F.normalize(activations[file][:, layer, :], p=2, dim=1), dim=0))

print(f"for {len(raw_within_var_normalized)} roles, shape is {raw_within_var_normalized[0].shape}")
avg_raw_within_var_normalized = torch.stack(raw_within_var_normalized).mean(dim=0)
print(avg_raw_within_var_normalized.shape)


for 275 roles, shape is torch.Size([8192])
torch.Size([8192])


In [20]:
raw_ratio_normalized = raw_across_var_normalized.mean() / avg_raw_within_var_normalized.mean()
print(f"ratio of raw_across_var_normalized / avg_raw_within_var_normalized is {raw_ratio_normalized}")


ratio of raw_across_var_normalized / avg_raw_within_var_normalized is 0.4062836766242981


### in PC space

In [21]:
# get transformed role vectors
pca_across_var = np.var(pca_results['pca_transformed'][:275], axis=0)
print(pca_across_var.shape)


(377,)


In [22]:
print(activations['absurdist'].shape)

torch.Size([1158, 80, 8192])


In [23]:
pca_within_var = []
pc1_within_var = []
for role in activations:
    role_scaled = pca_results['scaler'].transform(activations[role][:, layer, :].float().numpy())
    role_pca = pca_results['pca'].transform(role_scaled)
    pca_within_var.append(np.var(role_pca, axis=0))
    pc1_within_var.append(np.var(role_pca[:, 0]))

print(f"for {len(pca_within_var)} roles, shape is {pca_within_var[0].shape}")

for 275 roles, shape is (377,)


In [24]:
mean_pca_within_var = np.array(pca_within_var).mean(axis=0)
print(mean_pca_within_var.shape)


(377,)


In [25]:
pca_ratio = pca_across_var.mean() / mean_pca_within_var.mean()
print(f"ratio of pca_across_var / mean_pca_within_var is {pca_ratio}")

ratio of pca_across_var / mean_pca_within_var is 0.34383073803712927


### pc1 variance only

In [26]:
pc1_across_var = np.var(pca_results['pca_transformed'][:275, 0])
print(pc1_across_var)


1098.9190452905918


In [27]:
mean_pc1_within_var = np.array(pc1_within_var).mean()
print(mean_pc1_within_var)

pc1_ratio = pc1_across_var / mean_pc1_within_var
print(f"ratio of pc1_across_var / mean_pc1_within_var is {pc1_ratio}")


544.487320753733
ratio of pc1_across_var / mean_pc1_within_var is 2.018263793120765


## Conditional variance of role vectors based on distance from Assistant

In [28]:
role_vectors = torch.stack(pca_results['vectors']['pos_2'] + pca_results['vectors']['pos_3'])[:, layer, :]
print(role_vectors.shape)

pc1 = pca_results['pca_transformed'][:, 0]

torch.Size([377, 8192])


### Conditional variance in raw activation space

In [31]:
from scipy.stats import pearsonr

# Two-group comparison: Assistant-like vs Roleplay
# Using PC1 threshold of -25 (same as in 9_cone.ipynb)
threshold = -25

assistant_mask = pc1 < threshold
roleplay_mask = pc1 >= threshold

# Compute variance of raw activations for each group
# role_vectors shape: [448, 4608]
var_assistant_raw = torch.var(role_vectors[assistant_mask], dim=0).mean().item()
var_roleplay_raw = torch.var(role_vectors[roleplay_mask], dim=0).mean().item()

var_ratio_raw = var_assistant_raw / var_roleplay_raw

print("=" * 60)
print("RAW ACTIVATION SPACE: Two-Group Comparison")
print("=" * 60)
print(f"PC1 threshold: {threshold}")
print(f"Assistant-like roles (PC1 < {threshold}): {assistant_mask.sum()} samples")
print(f"Roleplay roles (PC1 >= {threshold}): {roleplay_mask.sum()} samples")
print(f"\nMean variance (Assistant-like): {var_assistant_raw:.6f}")
print(f"Mean variance (Roleplay): {var_roleplay_raw:.6f}")
print(f"Variance ratio (Assistant/Roleplay): {var_ratio_raw:.4f} ({var_ratio_raw*100:.2f}%)")
print("=" * 60)

RAW ACTIVATION SPACE: Two-Group Comparison
PC1 threshold: -25
Assistant-like roles (PC1 < -25): 121 samples
Roleplay roles (PC1 >= -25): 256 samples

Mean variance (Assistant-like): 0.000248
Mean variance (Roleplay): 0.000900
Variance ratio (Assistant/Roleplay): 0.2754 (27.54%)


In [32]:
# Project out PC1 from raw activations
# Get PC1 direction from PCA
pc1_direction = torch.from_numpy(pca_results['pca'].components_[0]).float()

# Project role_vectors onto PC1 and subtract
# Formula: projection = (v · u) * u, where u is the unit vector (PC1 direction)
pc1_loadings = (role_vectors.float() @ pc1_direction).unsqueeze(1)  # Shape: [448, 1]
pc1_projections = pc1_loadings * pc1_direction.unsqueeze(0)  # Shape: [448, 4608]
role_vectors_pc1_removed = role_vectors - pc1_projections

# Compute variance with PC1 projected out
var_assistant_raw_no_pc1 = torch.var(role_vectors_pc1_removed[assistant_mask], dim=0).mean().item()
var_roleplay_raw_no_pc1 = torch.var(role_vectors_pc1_removed[roleplay_mask], dim=0).mean().item()

var_ratio_raw_no_pc1 = var_assistant_raw_no_pc1 / var_roleplay_raw_no_pc1

print("\n" + "=" * 60)
print("RAW ACTIVATION SPACE (PC1 projected out): Two-Group Comparison")
print("=" * 60)
print(f"PC1 threshold: {threshold}")
print(f"Assistant-like roles (PC1 < {threshold}): {assistant_mask.sum()} samples")
print(f"Roleplay roles (PC1 >= {threshold}): {roleplay_mask.sum()} samples")
print(f"\nMean variance (Assistant-like, PC1 removed): {var_assistant_raw_no_pc1:.6f}")
print(f"Mean variance (Roleplay, PC1 removed): {var_roleplay_raw_no_pc1:.6f}")
print(f"Variance ratio (Assistant/Roleplay): {var_ratio_raw_no_pc1:.4f} ({var_ratio_raw_no_pc1*100:.2f}%)")
print(f"\nThis is analogous to the PC2-10 analysis in PC space.")
print("=" * 60)


RAW ACTIVATION SPACE (PC1 projected out): Two-Group Comparison
PC1 threshold: -25
Assistant-like roles (PC1 < -25): 121 samples
Roleplay roles (PC1 >= -25): 256 samples

Mean variance (Assistant-like, PC1 removed): 0.000246
Mean variance (Roleplay, PC1 removed): 0.000806
Variance ratio (Assistant/Roleplay): 0.3049 (30.49%)

This is analogous to the PC2-10 analysis in PC space.


In [34]:
# Quintile analysis
n_quintiles = 5
quintile_edges = np.quantile(pc1, np.linspace(0, 1, n_quintiles + 1))
quintile_variances = []
quintile_variances_no_pc1 = []
quintile_sizes = []

print("\n" + "=" * 60)
print("RAW ACTIVATION SPACE: Quintile Analysis")
print("=" * 60)

for i in range(n_quintiles):
    if i == 0:
        mask = (pc1 >= quintile_edges[i]) & (pc1 <= quintile_edges[i + 1])
    else:
        mask = (pc1 > quintile_edges[i]) & (pc1 <= quintile_edges[i + 1])
    
    quintile_var = torch.var(role_vectors[mask], dim=0).mean().item()
    quintile_var_no_pc1 = torch.var(role_vectors_pc1_removed[mask], dim=0).mean().item()
    quintile_variances.append(quintile_var)
    quintile_variances_no_pc1.append(quintile_var_no_pc1)
    quintile_sizes.append(mask.sum())
    
    print(f"\nQuintile {i+1}: PC1 ∈ [{quintile_edges[i]:.2f}, {quintile_edges[i+1]:.2f}]")
    print(f"  Sample size: {mask.sum()}")
    print(f"  Mean variance (full): {quintile_var:.6f}")
    print(f"  Mean variance (PC1 removed): {quintile_var_no_pc1:.6f}")

# Calculate ratios between first and last quintile
quintile_ratio = quintile_variances[-1] / quintile_variances[0]
quintile_ratio_no_pc1 = quintile_variances_no_pc1[-1] / quintile_variances_no_pc1[0]

print("\n" + "-" * 60)
print(f"Variance ratio (Last/First quintile, full): {quintile_ratio:.2f}x")
print(f"Variance ratio (Last/First quintile, PC1 removed): {quintile_ratio_no_pc1:.2f}x")
print("=" * 60)


RAW ACTIVATION SPACE: Quintile Analysis

Quintile 1: PC1 ∈ [-44.38, -31.56]
  Sample size: 76
  Mean variance (full): 0.000200
  Mean variance (PC1 removed): 0.000200

Quintile 2: PC1 ∈ [-31.56, -18.14]
  Sample size: 75
  Mean variance (full): 0.000309
  Mean variance (PC1 removed): 0.000308

Quintile 3: PC1 ∈ [-18.14, 2.47]
  Sample size: 75
  Mean variance (full): 0.000549
  Mean variance (PC1 removed): 0.000546

Quintile 4: PC1 ∈ [2.47, 31.74]
  Sample size: 75
  Mean variance (full): 0.000984
  Mean variance (PC1 removed): 0.000981

Quintile 5: PC1 ∈ [31.74, 101.90]
  Sample size: 76
  Mean variance (full): 0.001060
  Mean variance (PC1 removed): 0.001030

------------------------------------------------------------
Variance ratio (Last/First quintile, full): 5.30x
Variance ratio (Last/First quintile, PC1 removed): 5.16x


In [35]:
# Distance from center correlation
# Compute mean of raw activations
role_vectors_mean = role_vectors.mean(dim=0)
role_vectors_pc1_removed_mean = role_vectors_pc1_removed.mean(dim=0)

# Compute L2 distance from mean for each role
distances_raw = torch.norm(role_vectors.float() - role_vectors_mean, p=2, dim=1).numpy()
distances_raw_no_pc1 = torch.norm(role_vectors_pc1_removed - role_vectors_pc1_removed_mean, p=2, dim=1).numpy()

# Calculate correlation with PC1
correlation_raw, p_value_raw = pearsonr(pc1, distances_raw)
correlation_raw_no_pc1, p_value_raw_no_pc1 = pearsonr(pc1, distances_raw_no_pc1)

print("\n" + "=" * 60)
print("RAW ACTIVATION SPACE: Distance from Center Correlation")
print("=" * 60)
print(f"Correlation between PC1 and L2 distance from mean (full):")
print(f"  r = {correlation_raw:.4f}")
print(f"  p-value = {p_value_raw:.3e}")
if p_value_raw < 0.001:
    print(f"  Highly significant (p < 0.001)")
elif p_value_raw < 0.05:
    print(f"  Significant (p < 0.05)")

print(f"\nCorrelation between PC1 and L2 distance from mean (PC1 removed):")
print(f"  r = {correlation_raw_no_pc1:.4f}")
print(f"  p-value = {p_value_raw_no_pc1:.3e}")
if p_value_raw_no_pc1 < 0.001:
    print(f"  Highly significant (p < 0.001)")
elif p_value_raw_no_pc1 < 0.05:
    print(f"  Significant (p < 0.05)")
print("=" * 60)


RAW ACTIVATION SPACE: Distance from Center Correlation
Correlation between PC1 and L2 distance from mean (full):
  r = 0.7185
  p-value = 4.150e-61
  Highly significant (p < 0.001)

Correlation between PC1 and L2 distance from mean (PC1 removed):
  r = 0.6382
  p-value = 1.639e-44
  Highly significant (p < 0.001)


### Per-PC analysis: Correlation between each PC and distance in remaining PC space

In [36]:
# For each of the top 10 PCs, calculate:
# 1. The correlation between that PC and distance from center in all OTHER PCs
# 2. This tells us if the pattern we see with PC1 generalizes to other PCs

from scipy.stats import pearsonr

n_pcs_to_analyze = 10
pca_transformed = pca_results['pca_transformed']

print("=" * 70)
print("Correlation between each PC and distance in remaining PC space")
print("=" * 70)

correlations = []
p_values = []

for pc_idx in range(n_pcs_to_analyze):
    # Get the PC values
    pc_values = pca_transformed[:, pc_idx]
    
    # Get all other PCs (excluding current PC)
    other_pcs = np.delete(pca_transformed, pc_idx, axis=1)
    
    # Calculate distance from center in the remaining PC space
    other_pcs_mean = other_pcs.mean(axis=0)
    distances = np.linalg.norm(other_pcs - other_pcs_mean, axis=1)
    
    # Calculate correlation
    corr, p_val = pearsonr(pc_values, distances)
    correlations.append(corr)
    p_values.append(p_val)
    
    # Print results
    sig_marker = "***" if p_val < 0.001 else "**" if p_val < 0.01 else "*" if p_val < 0.05 else ""
    print(f"PC{pc_idx+1:2d}: r = {corr:7.4f}, p = {p_val:.3e} {sig_marker}")

print("=" * 70)
print(f"\nPC1 correlation: {correlations[0]:.4f}")
print(f"Mean correlation (PC2-10): {np.mean(correlations[1:]):.4f}")
print("=" * 70)

Correlation between each PC and distance in remaining PC space
PC 1: r =  0.6549, p = 1.521e-47 ***
PC 2: r =  0.4304, p = 1.981e-18 ***
PC 3: r =  0.2705, p = 9.594e-08 ***
PC 4: r =  0.1602, p = 1.802e-03 **
PC 5: r = -0.0537, p = 2.987e-01 
PC 6: r =  0.1110, p = 3.114e-02 *
PC 7: r =  0.1095, p = 3.348e-02 *
PC 8: r =  0.0153, p = 7.668e-01 
PC 9: r =  0.0751, p = 1.456e-01 
PC10: r =  0.1433, p = 5.316e-03 **

PC1 correlation: 0.6549
Mean correlation (PC2-10): 0.1402


### Conditional variance in PC2-10 based on position along each PC

This analysis shows whether the pattern of "extreme positions → high variance in other PCs" is unique to PC1 or generalizes to other PCs.

In [37]:
# For each PC, split roles into two groups (high/low) and compute variance in PC2-10 (excluding that PC)
# This tests if extreme positions on PC_i lead to high variance in other PCs

n_pcs_to_test = 10
pca_transformed = pca_results['pca_transformed']

print("=" * 80)
print("Conditional Variance in PC2-10 based on position along each PC")
print("=" * 80)
print("For each PC, we split roles by median and compute variance in PC2-10 (excluding that PC)")
print("-" * 80)

variance_ratios = []

for pc_idx in range(n_pcs_to_test):
    # Split by median on this PC
    pc_values = pca_transformed[:, pc_idx]
    median_val = np.median(pc_values)
    high_mask = pc_values > median_val
    low_mask = pc_values <= median_val
    
    # Get PC2-10, excluding current PC if it's in that range
    if pc_idx == 0:
        # For PC1, we want variance in PC2-10
        other_pcs = pca_transformed[:, 1:10]
    elif 1 <= pc_idx < 10:
        # For PC2-9, exclude that PC from PC2-10
        pc_indices = [i for i in range(1, 10) if i != pc_idx]
        other_pcs = pca_transformed[:, pc_indices]
    else:
        # For PC10, use PC2-9
        other_pcs = pca_transformed[:, 1:10]
    
    # Compute variance for each group
    var_high = np.var(other_pcs[high_mask], axis=0).mean()
    var_low = np.var(other_pcs[low_mask], axis=0).mean()
    
    ratio = max(var_high, var_low) / min(var_high, var_low)
    variance_ratios.append(ratio)
    
    print(f"PC{pc_idx+1:2d}: High={high_mask.sum():3d} samples, Low={low_mask.sum():3d} samples")
    print(f"      Var(high) = {var_high:8.3f}, Var(low) = {var_low:8.3f}, Ratio = {ratio:.3f}")

print("=" * 80)
print(f"\nSummary:")
print(f"  PC1 variance ratio: {variance_ratios[0]:.3f}")
print(f"  Mean variance ratio for PC2-10: {np.mean(variance_ratios[1:]):.3f}")
print(f"  Max variance ratio (excluding PC1): {np.max(variance_ratios[1:]):.3f} (PC{np.argmax(variance_ratios[1:])+2})")
print("\n  → Shows whether PC1 is unique in having high-variance 'other dimensions' for extreme positions")
print("=" * 80)

Conditional Variance in PC2-10 based on position along each PC
For each PC, we split roles by median and compute variance in PC2-10 (excluding that PC)
--------------------------------------------------------------------------------
PC 1: High=188 samples, Low=189 samples
      Var(high) =  562.709, Var(low) =  129.801, Ratio = 4.335
PC 2: High=188 samples, Low=189 samples
      Var(high) =  412.558, Var(low) =  173.262, Ratio = 2.381
PC 3: High=188 samples, Low=189 samples
      Var(high) =  350.552, Var(low) =  307.318, Ratio = 1.141
PC 4: High=188 samples, Low=189 samples
      Var(high) =  395.870, Var(low) =  277.716, Ratio = 1.425
PC 5: High=188 samples, Low=189 samples
      Var(high) =  312.331, Var(low) =  388.396, Ratio = 1.244
PC 6: High=188 samples, Low=189 samples
      Var(high) =  391.404, Var(low) =  326.078, Ratio = 1.200
PC 7: High=188 samples, Low=189 samples
      Var(high) =  381.977, Var(low) =  343.582, Ratio = 1.112
PC 8: High=188 samples, Low=189 samples
      

In [38]:
# Create role labels from pca_results
def get_role_labels_from_pca(pca_results):
    labels = []
    if 'pos_2' in pca_results['roles'].keys():
        pos_2_roles = [role.replace('_', ' ').title() for role in pca_results['roles']['pos_2']]
        labels.extend(pos_2_roles)
    if 'pos_3' in pca_results['roles'].keys():
        pos_3_roles = [role.replace('_', ' ').title() for role in pca_results['roles']['pos_3']]
        labels.extend(pos_3_roles)
    return labels

role_labels = get_role_labels_from_pca(pca_results)
print(f"Total roles: {len(role_labels)}")
print(f"pca_transformed shape: {pca_transformed.shape}")

Total roles: 377
pca_transformed shape: (377, 377)


In [39]:
# Show top/bottom roles for each PC
n_pcs_to_show = 10  # Show first 5 PCs
n_roles_to_show = 5  # Show top/bottom 5 roles

print("=" * 80)
print("Top and Bottom Roles for Each PC")
print("=" * 80)

for pc_idx in range(n_pcs_to_show):
    pc_values = pca_transformed[:, pc_idx]
    
    # Get indices of top and bottom roles
    top_indices = np.argsort(pc_values)[-n_roles_to_show:][::-1]
    bottom_indices = np.argsort(pc_values)[:n_roles_to_show]
    
    print(f"\nPC{pc_idx+1}:")
    print(f"  Top {n_roles_to_show} (highest loadings):")
    for i, idx in enumerate(top_indices):
        print(f"    {i+1}. {role_labels[idx]:30s} (PC{pc_idx+1} = {pc_values[idx]:7.2f})")
    
    print(f"  Bottom {n_roles_to_show} (lowest loadings):")
    for i, idx in enumerate(bottom_indices):
        print(f"    {i+1}. {role_labels[idx]:30s} (PC{pc_idx+1} = {pc_values[idx]:7.2f})")

print("=" * 80)

Top and Bottom Roles for Each PC

PC1:
  Top 5 (highest loadings):
    1. Leviathan                      (PC1 =  101.90)
    2. Eldritch                       (PC1 =  100.75)
    3. Void                           (PC1 =   96.26)
    4. Wraith                         (PC1 =   85.32)
    5. Tree                           (PC1 =   82.25)
  Bottom 5 (lowest loadings):
    1. Evaluator                      (PC1 =  -44.38)
    2. Coach                          (PC1 =  -41.81)
    3. Mentor                         (PC1 =  -41.67)
    4. Planner                        (PC1 =  -41.62)
    5. Moderator                      (PC1 =  -41.14)

PC2:
  Top 5 (highest loadings):
    1. Toddler                        (PC2 =  106.64)
    2. Procrastinator                 (PC2 =  104.63)
    3. Adolescent                     (PC2 =  100.52)
    4. Teenager                       (PC2 =  100.21)
    5. Fool                           (PC2 =   95.96)
  Bottom 5 (lowest loadings):
    1. Eldritch              

## Correlations between role loadings onto PCs across the 3 models

In [40]:
models = ['gemma-2-27b', 'qwen-3-32b', 'llama-3.3-70b']
layers = [22, 32, 40]

trait_results = {}
labels = {}
for model, layer in zip(models, layers):
    model_dir = f"/workspace/{model}/traits_240"
    trait_results[model] = torch.load(f"{model_dir}/pca/layer{layer}_pos-neg50.pt", weights_only=False)
    print(trait_results[model]['pca_transformed'].shape)
    labels[model] = trait_results[model]['traits']['pos_neg_50']
    print(labels[model][:20])

# need to get intersection of traits across models (gemma missing vindictive)
pca_transformed = []
for model in models:
    if model != 'gemma-2-27b':
        # splice out index 5 but keep the ones before and after
        pca_transformed.append(np.concatenate((trait_results[model]['pca_transformed'][:5], trait_results[model]['pca_transformed'][6:])))
    else:
        pca_transformed.append(trait_results[model]['pca_transformed'])

for m in pca_transformed:
    print(m.shape)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


(239, 239)
['zealous', 'wry', 'witty', 'whimsical', 'visceral', 'verbose', 'utilitarian', 'urgent', 'universalist', 'understated', 'transparent', 'traditional', 'theoretical', 'theatrical', 'temperamental', 'technical', 'tactful', 'systems_thinker', 'sycophantic', 'supportive']
(240, 240)
['zealous', 'wry', 'witty', 'whimsical', 'visceral', 'vindictive', 'verbose', 'utilitarian', 'urgent', 'universalist', 'understated', 'transparent', 'traditional', 'theoretical', 'theatrical', 'temperamental', 'technical', 'tactful', 'systems_thinker', 'sycophantic']
(240, 240)
['zealous', 'wry', 'witty', 'whimsical', 'visceral', 'vindictive', 'verbose', 'utilitarian', 'urgent', 'universalist', 'understated', 'transparent', 'traditional', 'theoretical', 'theatrical', 'temperamental', 'technical', 'tactful', 'systems_thinker', 'sycophantic']
(239, 239)
(239, 240)
(239, 240)


In [41]:
# Transpose each matrix so rows are PCs and columns are traits
pca_transposed = [m.T for m in pca_transformed]

# Extract top 10 PCs from each model
n_pcs = 6
top_pcs = [m[:n_pcs] for m in pca_transposed]

print(f"Transposed shapes (n_pcs, n_traits):")
for model, pc_matrix in zip(models, top_pcs):
    print(f"{model}: {pc_matrix.shape}")

# Compute pairwise correlations for each PC
from scipy.stats import pearsonr

pc_correlations = []
for pc_idx in range(n_pcs):
    # Extract the trait loading vector for this PC from each model
    gemma_pc = top_pcs[0][pc_idx]
    qwen_pc = top_pcs[1][pc_idx]
    llama_pc = top_pcs[2][pc_idx]
    
    # Compute pairwise correlations
    corr_gemma_qwen, _ = pearsonr(gemma_pc, qwen_pc)
    corr_gemma_llama, _ = pearsonr(gemma_pc, llama_pc)
    corr_qwen_llama, _ = pearsonr(qwen_pc, llama_pc)
    
    # Create 3x3 correlation matrix
    corr_matrix = np.array([
        [1.0, corr_gemma_qwen, corr_gemma_llama],
        [corr_gemma_qwen, 1.0, corr_qwen_llama],
        [corr_gemma_llama, corr_qwen_llama, 1.0]
    ])
    
    pc_correlations.append(corr_matrix)

    print(f"\nPC{pc_idx + 1}:")
    print(f"  Gemma ↔ Qwen:  {corr_gemma_qwen:7.4f}")
    print(f"  Gemma ↔ Llama: {corr_gemma_llama:7.4f}")
    print(f"  Qwen  ↔ Llama: {corr_qwen_llama:7.4f}")

Transposed shapes (n_pcs, n_traits):
gemma-2-27b: (6, 239)
qwen-3-32b: (6, 239)
llama-3.3-70b: (6, 239)

PC1:
  Gemma ↔ Qwen:  -0.8940
  Gemma ↔ Llama:  0.9690
  Qwen  ↔ Llama: -0.8359

PC2:
  Gemma ↔ Qwen:   0.8356
  Gemma ↔ Llama: -0.9079
  Qwen  ↔ Llama: -0.8064

PC3:
  Gemma ↔ Qwen:   0.7499
  Gemma ↔ Llama: -0.9005
  Qwen  ↔ Llama: -0.8542

PC4:
  Gemma ↔ Qwen:   0.6075
  Gemma ↔ Llama:  0.6253
  Qwen  ↔ Llama:  0.4756

PC5:
  Gemma ↔ Qwen:   0.3510
  Gemma ↔ Llama:  0.5943
  Qwen  ↔ Llama:  0.5252

PC6:
  Gemma ↔ Qwen:  -0.2080
  Gemma ↔ Llama: -0.7845
  Qwen  ↔ Llama: -0.3113


In [42]:
# try for top 10 role PCs
models = ['gemma-2-27b', 'qwen-3-32b', 'llama-3.3-70b']
layers = [22, 32, 40]

def get_role_labels(pca_results):
    labels = []
    if 'pos_2' in pca_results['roles'].keys():
        pos_2_roles = [role.replace('_', ' ').title() for role in pca_results['roles']['pos_2']]
        pos_2_roles = [f"{role} (Somewhat RP)" for role in pos_2_roles]
        labels.extend(pos_2_roles)
    if 'pos_3' in pca_results['roles'].keys():
        pos_3_roles = [role.replace('_', ' ').title() for role in pca_results['roles']['pos_3']]
        pos_3_roles = [f"{role} (Fully RP)" for role in pos_3_roles]
        labels.extend(pos_3_roles)
    return labels


role_results = {}
labels = {}
for model, layer in zip(models, layers):
    model_dir = f"/workspace/{model}/roles_240"
    role_results[model] = torch.load(f"{model_dir}/pca/layer{layer}_pos23.pt", weights_only=False)
    print(role_results[model]['pca_transformed'].shape)
    labels[model] = get_role_labels(role_results[model])



(448, 448)
(463, 463)
(377, 377)


In [43]:
# Find intersection of roles across all 3 models
set_gemma = set(labels['gemma-2-27b'])
set_qwen = set(labels['qwen-3-32b'])
set_llama = set(labels['llama-3.3-70b'])

common_roles = set_gemma & set_qwen & set_llama
print(f"Common roles across all models: {len(common_roles)}")

# Get indices of common roles for each model (preserving order from labels)
indices = {}
for model in models:
    model_indices = []
    for i, role in enumerate(labels[model]):
        if role in common_roles:
            model_indices.append(i)
    indices[model] = model_indices
    print(f"{model}: {len(model_indices)} common roles")

# Extract aligned PCA transformed matrices (only common roles, in consistent order)
# Need to ensure the same role ordering across models
common_roles_list = sorted(list(common_roles))  # Consistent ordering

pca_transformed_roles = []
for model in models:
    # Map from common_roles_list order to model's indices
    model_indices_ordered = []
    for role in common_roles_list:
        idx = labels[model].index(role)
        model_indices_ordered.append(idx)
    
    # Extract rows for common roles in the standardized order
    pca_transformed_roles.append(role_results[model]['pca_transformed'][model_indices_ordered])
    print(f"{model} aligned shape: {pca_transformed_roles[-1].shape}")

# Transpose each matrix so rows are PCs and columns are roles
pca_transposed_roles = [m.T for m in pca_transformed_roles]

# Extract top 10 PCs from each model
n_pcs = 6
top_pcs_roles = [m[:n_pcs] for m in pca_transposed_roles]

print(f"\nTransposed shapes (n_pcs, n_common_roles):")
for model, pc_matrix in zip(models, top_pcs_roles):
    print(f"{model}: {pc_matrix.shape}")

# Compute pairwise correlations for each PC
pc_correlations_roles = []
for pc_idx in range(n_pcs):
    # Extract the role loading vector for this PC from each model
    gemma_pc = top_pcs_roles[0][pc_idx]
    qwen_pc = top_pcs_roles[1][pc_idx]
    llama_pc = top_pcs_roles[2][pc_idx]
    
    # Compute pairwise correlations
    corr_gemma_qwen, _ = pearsonr(gemma_pc, qwen_pc)
    corr_gemma_llama, _ = pearsonr(gemma_pc, llama_pc)
    corr_qwen_llama, _ = pearsonr(qwen_pc, llama_pc)
    
    # Create 3x3 correlation matrix
    corr_matrix = np.array([
        [1.0, corr_gemma_qwen, corr_gemma_llama],
        [corr_gemma_qwen, 1.0, corr_qwen_llama],
        [corr_gemma_llama, corr_qwen_llama, 1.0]
    ])
    
    pc_correlations_roles.append(corr_matrix)

    print(f"\nPC{pc_idx + 1}:")
    print(f"  Gemma ↔ Qwen:  {corr_gemma_qwen:7.4f}")
    print(f"  Gemma ↔ Llama: {corr_gemma_llama:7.4f}")
    print(f"  Qwen  ↔ Llama: {corr_qwen_llama:7.4f}")

Common roles across all models: 361
gemma-2-27b: 361 common roles
qwen-3-32b: 361 common roles
llama-3.3-70b: 361 common roles
gemma-2-27b aligned shape: (361, 448)
qwen-3-32b aligned shape: (361, 463)
llama-3.3-70b aligned shape: (361, 377)

Transposed shapes (n_pcs, n_common_roles):
gemma-2-27b: (6, 361)
qwen-3-32b: (6, 361)
llama-3.3-70b: (6, 361)

PC1:
  Gemma ↔ Qwen:   0.9633
  Gemma ↔ Llama:  0.9530
  Qwen  ↔ Llama:  0.9723

PC2:
  Gemma ↔ Qwen:   0.9284
  Gemma ↔ Llama:  0.9121
  Qwen  ↔ Llama:  0.9176

PC3:
  Gemma ↔ Qwen:   0.3108
  Gemma ↔ Llama:  0.7327
  Qwen  ↔ Llama:  0.6831

PC4:
  Gemma ↔ Qwen:   0.6476
  Gemma ↔ Llama: -0.8664
  Qwen  ↔ Llama: -0.6006

PC5:
  Gemma ↔ Qwen:   0.6709
  Gemma ↔ Llama:  0.8028
  Qwen  ↔ Llama:  0.7283

PC6:
  Gemma ↔ Qwen:   0.2149
  Gemma ↔ Llama: -0.5411
  Qwen  ↔ Llama: -0.1034


## Save variance analysis results to JSON

### Instructions for saving variance analysis results

This section saves the variance analysis results to JSON files for easy sharing and analysis.

**Per-model files** (one per model):
- `{model_name}_layer{layer}.json` - Contains all variance metrics for a single model:
  - `across_within_role_var`: 4 variance ratios (raw, normalized, PCA space, PC1 only)
  - `conditional_var_roles`: Threshold analysis, quintile analysis, distance correlations
  - `high_var_pc_correlation`: PC distance correlations, conditional variance by PC, top/bottom roles

**Cross-model file** (one file for all models):
- `cross_model_loadings.json` - Contains PC correlation analysis across 3 models:
  - `trait_analysis`: PC correlations for traits across Gemma, Qwen, Llama
  - `role_analysis`: PC correlations for roles across Gemma, Qwen, Llama

**To generate all files:**
1. Run the notebook with one model configuration (e.g., Gemma)
2. Change the configuration cell to another model (e.g., Qwen)
3. Re-run from the configuration cell onwards
4. Repeat for the third model (e.g., Llama)
5. The cross-model file is generated after all 3 model sections are run

In [44]:
from datetime import datetime

# Configuration for saving
outdir = "./results"
os.makedirs(outdir, exist_ok=True)

# Get current timestamp
timestamp = datetime.now().isoformat()

print(f"Saving variance analysis results to {outdir}/")
print(f"Timestamp: {timestamp}")

Saving variance analysis results to ./results/
Timestamp: 2025-10-21T23:40:50.577644


In [47]:
# Build the per-model variance analysis JSON structure

# Build quintile data
quintiles_data = []
for i in range(len(quintile_edges) - 1):
    quintiles_data.append({
        "quintile": i + 1,
        "pc1_range": [float(quintile_edges[i]), float(quintile_edges[i + 1])],
        "n_samples": int(quintile_sizes[i]),
        "variance_full": float(quintile_variances[i]),
        "variance_pc1_removed": float(quintile_variances_no_pc1[i])
    })

# Build PC distance correlations
pc_distance_corrs = []
for i in range(len(correlations)):
    pc_distance_corrs.append({
        "pc": i + 1,
        "r": float(correlations[i]),
        "p_value": float(p_values[i]),
        "significant": bool(p_values[i] < 0.05)
    })

# Build conditional variance by PC
cond_var_by_pc = []
for i in range(len(variance_ratios)):
    cond_var_by_pc.append({
        "pc": i + 1,
        "ratio": float(variance_ratios[i])
    })

# Build the complete JSON structure
model_variance_data = {
    "model_name": model_name,
    "layer": layer,
    "hidden_dim": vectors.shape[1],
    "n_roles": len(activations),
    "n_role_samples": role_vectors.shape[0],
    "timestamp": timestamp,
    "analysis_version": "1.0",
    
    "across_within_role_var": {
        "raw_activations": {
            "across_var_mean": float(raw_across_var.mean().item()),
            "within_var_mean": float(avg_raw_within_var.mean().item()),
            "ratio": float(raw_ratio)
        },
        "raw_activations_normalized": {
            "across_var_mean": float(raw_across_var_normalized.mean().item()),
            "within_var_mean": float(avg_raw_within_var_normalized.mean().item()),
            "ratio": float(raw_ratio_normalized)
        },
        "pca_space_all_components": {
            "across_var_mean": float(pca_across_var.mean()),
            "within_var_mean": float(mean_pca_within_var.mean()),
            "ratio": float(pca_ratio),
            "n_components": int(len(pca_across_var))
        },
        "pc1_only": {
            "across_var": float(pc1_across_var),
            "within_var_mean": float(mean_pc1_within_var),
            "ratio": float(pc1_ratio)
        }
    },
    
    "conditional_var_roles": {
        "threshold_analysis": {
            "pc1_threshold": threshold,
            "assistant_like": {
                "mask": f"pc1 < {threshold}",
                "n_samples": int(assistant_mask.sum()),
                "variance_raw": float(var_assistant_raw),
                "variance_raw_pc1_removed": float(var_assistant_raw_no_pc1)
            },
            "roleplay": {
                "mask": f"pc1 >= {threshold}",
                "n_samples": int(roleplay_mask.sum()),
                "variance_raw": float(var_roleplay_raw),
                "variance_raw_pc1_removed": float(var_roleplay_raw_no_pc1)
            },
            "variance_ratio_raw": float(var_ratio_raw),
            "variance_ratio_raw_pc1_removed": float(var_ratio_raw_no_pc1)
        },
        
        "quintile_analysis": {
            "n_quintiles": 5,
            "quintiles": quintiles_data,
            "variance_ratio_first_to_last_full": float(quintile_ratio),
            "variance_ratio_first_to_last_pc1_removed": float(quintile_ratio_no_pc1)
        },
        
        "distance_correlation": {
            "full_space": {
                "correlation": float(correlation_raw),
                "p_value": float(p_value_raw),
                "significant": bool(p_value_raw < 0.05)
            },
            "pc1_removed": {
                "correlation": float(correlation_raw_no_pc1),
                "p_value": float(p_value_raw_no_pc1),
                "significant": bool(p_value_raw_no_pc1 < 0.05)
            }
        }
    },
    
    "high_var_pc_correlation": {
        "pc_distance_correlations": {
            "description": "Correlation between each PC and distance in remaining PC space",
            "n_pcs_analyzed": 10,
            "correlations": pc_distance_corrs,
            "pc1_correlation": float(correlations[0]),
            "mean_correlation_pc2_to_10": float(np.mean(correlations[1:]))
        },
        
        "conditional_variance_by_pc": {
            "description": "Variance in PC2-10 conditioned on high/low position along each PC",
            "n_pcs_analyzed": 10,
            "variance_ratios": cond_var_by_pc,
            "pc1_variance_ratio": float(variance_ratios[0]),
            "mean_variance_ratio_pc2_to_10": float(np.mean(variance_ratios[1:])),
            "max_variance_ratio_excluding_pc1": float(np.max(variance_ratios[1:])),
            "max_variance_ratio_pc": int(np.argmax(variance_ratios[1:]) + 2)
        }
    }
}

print("Built per-model variance analysis data structure")

Built per-model variance analysis data structure


In [49]:
# Save per-model variance analysis to JSON file
filename = f"{outdir}/{model_name.lower()}/variance_layer{layer}.json"
with open(filename, 'w') as f:
    json.dump(model_variance_data, f, indent=2)

print(f"Saved: {filename}")
print(f"✓ Saved variance analysis for {model_name}")

Saved: ./results/llama-3.3-70b/variance_layer40.json
✓ Saved variance analysis for Llama-3.3-70B


In [50]:
# Build cross-model PC loadings analysis JSON structure

n_pcs = 6

# Build trait analysis
trait_data = {
    "dataset_info": {
        "n_common_traits": pca_transformed[0].shape[0],
        "excluded_traits": ["vindictive"],
        "note": "Gemma missing vindictive trait, spliced out from other models for alignment"
    },
    "model_configs": {},
    "pc_correlations": []
}

# Add model configs for traits
for model, layer_num in zip(models, layers):
    pca_shape = list(trait_results[model]['pca_transformed'].shape)
    trait_data["model_configs"][model] = {
        "layer": int(layer_num),
        "n_total_traits": int(pca_shape[0]),
        "pca_shape": pca_shape
    }

# Add PC correlations for traits
pca_transposed_traits = [m.T for m in pca_transformed]
top_pcs_traits = [m[:n_pcs] for m in pca_transposed_traits]

for pc_idx in range(n_pcs):
    gemma_pc = top_pcs_traits[0][pc_idx]
    qwen_pc = top_pcs_traits[1][pc_idx]
    llama_pc = top_pcs_traits[2][pc_idx]
    
    from scipy.stats import pearsonr
    corr_gemma_qwen, _ = pearsonr(gemma_pc, qwen_pc)
    corr_gemma_llama, _ = pearsonr(gemma_pc, llama_pc)
    corr_qwen_llama, _ = pearsonr(qwen_pc, llama_pc)
    
    trait_data["pc_correlations"].append({
        "pc": pc_idx + 1,
        "gemma_qwen": float(corr_gemma_qwen),
        "gemma_llama": float(corr_gemma_llama),
        "qwen_llama": float(corr_qwen_llama)
    })

# Build role analysis
role_data = {
    "dataset_info": {
        "n_common_roles": int(len(common_roles)),
        "note": "Roles include pos_2 (Somewhat RP) and pos_3 (Fully RP) labels",
        "alignment_method": "sorted common roles list for consistent ordering"
    },
    "model_configs": {},
    "pc_correlations": []
}

# Add model configs for roles
for model, layer_num in zip(models, layers):
    pca_shape = list(role_results[model]['pca_transformed'].shape)
    role_data["model_configs"][model] = {
        "layer": int(layer_num),
        "n_total_roles": int(pca_shape[0]),
        "n_common_roles": int(len(common_roles)),
        "pca_shape": pca_shape
    }

# Add PC correlations for roles
pca_transposed_roles_func = [m.T for m in pca_transformed_roles]
top_pcs_roles_func = [m[:n_pcs] for m in pca_transposed_roles_func]

for pc_idx in range(n_pcs):
    gemma_pc = top_pcs_roles_func[0][pc_idx]
    qwen_pc = top_pcs_roles_func[1][pc_idx]
    llama_pc = top_pcs_roles_func[2][pc_idx]
    
    corr_gemma_qwen, _ = pearsonr(gemma_pc, qwen_pc)
    corr_gemma_llama, _ = pearsonr(gemma_pc, llama_pc)
    corr_qwen_llama, _ = pearsonr(qwen_pc, llama_pc)
    
    role_data["pc_correlations"].append({
        "pc": pc_idx + 1,
        "gemma_qwen": float(corr_gemma_qwen),
        "gemma_llama": float(corr_gemma_llama),
        "qwen_llama": float(corr_qwen_llama)
    })

# Build complete structure
cross_model_data = {
    "analysis_version": "1.0",
    "timestamp": timestamp,
    "models": models,
    "n_pcs_analyzed": n_pcs,
    "trait_analysis": trait_data,
    "role_analysis": role_data
}

print("Built cross-model PC loadings data structure")

Built cross-model PC loadings data structure


In [51]:
# Save cross-model PC loadings to JSON file
filename = f"{outdir}/cross_model_loadings.json"
with open(filename, 'w') as f:
    json.dump(cross_model_data, f, indent=2)

print(f"Saved: {filename}")
print(f"✓ Saved cross-model PC loadings analysis")

Saved: ./results/cross_model_loadings.json
✓ Saved cross-model PC loadings analysis


In [52]:
# Summary of saved files
print("\n" + "=" * 60)
print("SUMMARY: JSON Files Saved")
print("=" * 60)
print(f"\nOutput directory: {outdir}")
print(f"\nFiles created:")
print(f"  1. Per-model variance analysis:")
print(f"     - {model_name.lower().replace('.', '-').replace(' ', '-')}_layer{layer}.json")
print(f"\n  2. Cross-model PC loadings:")
print(f"     - cross_model_loadings.json")
print(f"\nNote: To save variance analysis for other models (Qwen, Llama),")
print(f"      update the configuration cell and re-run the notebook.")
print("=" * 60)


SUMMARY: JSON Files Saved

Output directory: ./results

Files created:
  1. Per-model variance analysis:
     - llama-3-3-70b_layer40.json

  2. Cross-model PC loadings:
     - cross_model_loadings.json

Note: To save variance analysis for other models (Qwen, Llama),
      update the configuration cell and re-run the notebook.


### Validation: Check saved JSON files

In [None]:
# Validate the saved JSON files by loading and checking structure
import os
import json

print("=" * 60)
print("Validating saved JSON files...")
print("=" * 60)

# Check per-model file
model_file = f"{outdir}/{model_name.lower().replace('.', '-').replace(' ', '-')}_layer{layer}.json"
if os.path.exists(model_file):
    with open(model_file, 'r') as f:
        model_data = json.load(f)
    
    print(f"\n✓ Per-model file loaded successfully: {model_file}")
    print(f"  - Model: {model_data['model_name']}")
    print(f"  - Layer: {model_data['layer']}")
    print(f"  - Sections: {list(model_data.keys())}")
    print(f"  - across_within_role_var keys: {list(model_data['across_within_role_var'].keys())}")
    print(f"  - conditional_var_roles keys: {list(model_data['conditional_var_roles'].keys())}")
    print(f"  - high_var_pc_correlation keys: {list(model_data['high_var_pc_correlation'].keys())}")
else:
    print(f"\n✗ Per-model file not found: {model_file}")

# Check cross-model file
cross_model_file = f"{outdir}/cross_model_loadings.json"
if os.path.exists(cross_model_file):
    with open(cross_model_file, 'r') as f:
        cross_data = json.load(f)
    
    print(f"\n✓ Cross-model file loaded successfully: {cross_model_file}")
    print(f"  - Models: {cross_data['models']}")
    print(f"  - Number of PCs: {cross_data['n_pcs_analyzed']}")
    print(f"  - Sections: {list(cross_data.keys())}")
    print(f"  - Trait analysis sections: {list(cross_data['trait_analysis'].keys())}")
    print(f"  - Role analysis sections: {list(cross_data['role_analysis'].keys())}")
    print(f"  - Number of trait PC correlations: {len(cross_data['trait_analysis']['pc_correlations'])}")
    print(f"  - Number of role PC correlations: {len(cross_data['role_analysis']['pc_correlations'])}")
else:
    print(f"\n✗ Cross-model file not found: {cross_model_file}")

print("\n" + "=" * 60)
print("Validation complete!")
print("=" * 60)