# Variance comparison

In [11]:
import os
import sys
import torch
import json
import numpy as np
import pandas as pd

sys.path.append('.')
sys.path.append('..')

from utils.pca_utils import *
from plots import *

## Configuration

In [15]:
# Configuration - Change these parameters for different models/datasets
base_dir = "/workspace/qwen-3-32b"
type = "roles_240"
dir = f"{base_dir}/{type}"
model_name = "Qwen-3-32B"
layer = 32

## Variance across and within roles

### raw activations

In [16]:
pca_results = torch.load(f"{dir}/pca/layer{layer}_pos23.pt", weights_only=False)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [17]:
vectors = torch.stack(pca_results['vectors']['pos_3'])[:, layer, :].float()
print(vectors.shape)

# compute variance across roles (rows) along hidden_dims
raw_across_var = torch.var(vectors, dim=0)
print(raw_across_var.shape)

torch.Size([275, 5120])
torch.Size([5120])


In [18]:
# load in scores
scores = {}
for file in os.listdir(f"{dir}/extract_scores"):
    if file.endswith('.json'):
        scores[file.replace('.json', '')] = json.load(open(f"{dir}/extract_scores/{file}"))

print(f"Loaded {len(scores)} scores")


Loaded 275 scores


In [21]:
# load in raw activations
activations = {}
for file in os.listdir(f"{dir}/response_activations"):
    if file.endswith('.pt') and 'default' not in file:
        # dict we should iterate over (1200 each)
        role_activations = []
        obj = torch.load(f"{dir}/response_activations/{file}")
        for key in obj:
            if scores[file.replace('.pt', '')][key] == 3:
                role_activations.append(obj[key])
        activations[file.replace('.pt', '')] = torch.stack(role_activations)
        



In [23]:
# compute variance within roles
raw_within_var = []
for file in activations:
    raw_within_var.append(torch.var(activations[file][:, layer, :], dim=0))

print(f"for {len(raw_within_var)} roles, shape is {raw_within_var[0].shape}")

for 275 roles, shape is torch.Size([5120])


In [24]:
avg_raw_within_var = torch.stack(raw_within_var).mean(dim=0)
print(avg_raw_within_var.shape)



torch.Size([5120])


In [25]:
# total variance ratio
raw_ratio = raw_across_var.sum() / avg_raw_within_var.sum()
print(f"ratio of raw_across_var / avg_raw_within_var is {raw_ratio}")

ratio of raw_across_var / avg_raw_within_var is 0.36073175072669983


In [None]:
raw_across_var_normalized = torch.var(F.normalize(vectors, p=2, dim=1), dim=0)
print(raw_across_var_normalized.shape)



torch.Size([5120])


In [28]:
raw_within_var_normalized = []
for file in activations:
    raw_within_var_normalized.append(torch.var(F.normalize(activations[file][:, layer, :], p=2, dim=1), dim=0))

print(f"for {len(raw_within_var_normalized)} roles, shape is {raw_within_var_normalized[0].shape}")
avg_raw_within_var_normalized = torch.stack(raw_within_var_normalized).mean(dim=0)
print(avg_raw_within_var_normalized.shape)


for 275 roles, shape is torch.Size([5120])
torch.Size([5120])


In [29]:
raw_ratio_normalized = raw_across_var_normalized.sum() / avg_raw_within_var_normalized.sum()
print(f"ratio of raw_across_var_normalized / avg_raw_within_var_normalized is {raw_ratio_normalized}")


ratio of raw_across_var_normalized / avg_raw_within_var_normalized is 0.38904350996017456


### in PC space

In [33]:
# get transformed role vectors
pca_across_var = np.var(pca_results['pca_transformed'][:275], axis=0)
print(pca_across_var.shape)


(463,)


In [34]:
print(activations['absurdist'].shape)

torch.Size([1193, 64, 5120])


In [45]:
pca_within_var = []
pc1_within_var = []
for role in activations:
    role_scaled = pca_results['scaler'].transform(activations[role][:, layer, :].float().numpy())
    role_pca = pca_results['pca'].transform(role_scaled)
    pca_within_var.append(np.var(role_pca, axis=0))
    pc1_within_var.append(np.var(role_pca[:, 0]))

print(f"for {len(pca_within_var)} roles, shape is {pca_within_var[0].shape}")

for 275 roles, shape is (463,)


In [36]:
mean_pca_within_var = np.array(pca_within_var).mean(axis=0)
print(mean_pca_within_var.shape)


(463,)


In [38]:
pca_ratio = pca_across_var.sum() / mean_pca_within_var.sum()
print(f"ratio of pca_across_var / mean_pca_within_var is {pca_ratio}")

ratio of pca_across_var / mean_pca_within_var is 0.2151157295518321


### pc1 variance only

In [44]:
pc1_across_var = np.var(pca_results['pca_transformed'][:275, 0])
print(pc1_across_var)


796.3471953638646


In [46]:
mean_pc1_within_var = np.array(pc1_within_var).mean()
print(mean_pc1_within_var)

pc1_ratio = pc1_across_var / mean_pc1_within_var
print(f"ratio of pc1_across_var / mean_pc1_within_var is {pc1_ratio}")






361.803697094426
ratio of pc1_across_var / mean_pc1_within_var is 2.20104770006269
