# PCA on roles

In [1]:
import os
import sys
import torch
import numpy as np
import pandas as pd

sys.path.append('.')
sys.path.append('..')

from utils.pca_utils import L2MeanScaler, MeanScaler, compute_pca, plot_variance_explained
from sklearn.preprocessing import StandardScaler
from plots import plot_pc

## Configuration

In [20]:
# Experiment configuration: edit these values to target a different model or dataset.
# NOTE(review): `type` and `dir` shadow Python builtins; other cells read these
# exact names, so they are kept unchanged here.
model_name = "Qwen-3-32B"  # display name used in plot subtitles and the plot directory
layer = 32  # layer index passed to compute_pca and used to index the default activation
base_dir = "/workspace/qwen-3-32b"
type = "roles_240"  # dataset variant; also the sub-directory name under base_dir
dir = "/".join([base_dir, type])

In [21]:
# Question-set sizing for the selected dataset variant (30 or 240 questions).
# "roles" uses 30 questions with 2 prompt types; "roles_240" and every other
# variant fall back to 240 questions with 1 prompt type, so both names are
# always defined regardless of the configured variant.
if type == "roles":
    n_questions = 30
    n_prompt_types = 2
else:  # roles_240 or other patterns
    n_questions = 240
    n_prompt_types = 1

## Load vectors

In [22]:
# Load every per-role vector file (*.pt) found in the pre_vectors directory.
vector_dir = f"{dir}/pre_vectors"
#ignore = ["infant.pt", "toddler.pt", "caveman.pt"]
ignore = []  # file names (including the ".pt" suffix) to skip

# Map role name (file name without its ".pt" extension) -> loaded object.
# os.listdir returns entries in arbitrary order, so sort them to get a
# reproducible role ordering in everything derived from this dict.
vectors = {}
for file in sorted(os.listdir(vector_dir)):
    if file.endswith(".pt") and file not in ignore:
        # Strip only the trailing extension; str.replace would also remove
        # a ".pt" that happens to occur elsewhere in the file name.
        role_name = file[: -len(".pt")]
        vectors[role_name] = torch.load(os.path.join(vector_dir, file))

print(f"Found {len(vectors.keys())} roles with vectors")

Found 276 roles with vectors


In [23]:
# Load the default vectors file that sits directly under the dataset directory.
# NOTE(review): a later cell loads "default_vectors.pt" from f"{dir}/pre_vectors"
# instead — confirm which location is the intended one.
default_vectors_path = f"{dir}/default_vectors.pt"
default_vectors = torch.load(default_vectors_path)

In [7]:
# Inspect the key layout of one role entry and of the default-vectors object.
graduate_entry = vectors['graduate']
print(graduate_entry.keys())
print(default_vectors.keys())
default_activations = default_vectors['activations']
print(default_activations.keys())

dict_keys(['pos_1', 'pos_3', 'pos_all'])
dict_keys(['activations', 'metadata'])
dict_keys(['pos_1', 'default_1', 'all_1'])


## PCA 

In [24]:
# Group roles by which position vectors they provide. A role that has both a
# 'pos_2' and a 'pos_3' entry appears once in each group.
pos_2_roles = [name for name, entry in vectors.items() if 'pos_2' in entry.keys()]
pos_2_vectors = [vectors[name]['pos_2'] for name in pos_2_roles]
pos_3_roles = [name for name, entry in vectors.items() if 'pos_3' in entry.keys()]
pos_3_vectors = [vectors[name]['pos_3'] for name in pos_3_roles]

for role_group in (pos_2_roles, pos_3_roles):
    print(len(role_group))

# pos_2 entries first, then pos_3 entries; label lists are concatenated in
# this same order wherever they are paired with the PCA rows.
combined_vectors = [*pos_2_vectors, *pos_3_vectors]

188
275


In [25]:
def sorted_by_pc(pca_transformed, pc_index, labels):
    """Return labels ranked by their projection onto one principal component.

    Args:
        pca_transformed: 2-D array-like indexed as ``[:, pc_index]``; one row
            per label.
        pc_index: Zero-based column to read the projections from.
        labels: Sequence of labels, aligned with the rows of
            ``pca_transformed``.

    Returns:
        A ``pandas.DataFrame`` with columns ``"label"`` and ``"projection"``,
        sorted by ``"projection"`` from smallest to largest. The original row
        positions are kept as the index.
    """
    projections = pca_transformed[:, pc_index]
    frame = pd.DataFrame({"label": labels, "projection": projections})
    return frame.sort_values("projection")


In [26]:
# Stack the collected tensors along a new leading axis and cast to float32.
float_stack_vectors = torch.stack(combined_vectors, dim=0).to(torch.float32)
print(float_stack_vectors.shape)


torch.Size([463, 64, 5120])


In [27]:
# float_stack_vectors is (n_samples, n_layers, hidden_dims)
# Fit PCA on the configured layer, using a MeanScaler for preprocessing.
scaler = MeanScaler()

pca_outputs = compute_pca(float_stack_vectors, layer=layer, scaler=scaler)
# compute_pca hands back the scaler as its last element, rebinding `scaler`.
pca_transformed, variance_explained, n_components, pca, scaler = pca_outputs


PCA fitted with 463 components
Cumulative variance for first 5 components: [0.11549975 0.18194795 0.23261741 0.27795842 0.31388858]

PCA Analysis Results:
Elbow point at component: 2
Dimensions for 70% variance: 42
Dimensions for 80% variance: 73
Dimensions for 90% variance: 137
Dimensions for 95% variance: 203


In [28]:
# Rank every role label by its projection on the first component (index 0).
pc_df = sorted_by_pc(pca_transformed, 0, pos_2_roles + pos_3_roles)
ranked_labels = pc_df['label']

# NOTE: the ranking is ascending, so "top" holds the five smallest projections
# and "bottom" the five largest.
top_roles = ranked_labels.iloc[:5].tolist()
bottom_roles = ranked_labels.iloc[-5:].tolist()
print(top_roles)
print(bottom_roles)

['void', 'wraith', 'leviathan', 'demon', 'vampire']
['auditor', 'accountant', 'publisher', 'screener', 'analyst']


In [29]:
# Reverse the sign of the listed principal components (1-based PC numbers) in
# both the fitted component axes and the already-projected data, so the two
# stay consistent. The update is in place: re-running this cell flips them back.
to_flip = [1]
for pc in to_flip:
    component_idx = pc - 1  # 1-based PC number -> 0-based array index
    pca.components_[component_idx] *= -1
    pca_transformed[:, component_idx] *= -1

In [30]:

# Bundle the PCA inputs and outputs into one dict and persist it to disk so
# the plotting cells can reload it.
results = {
    'layer': layer,
    'roles': {
        'pos_2': pos_2_roles,
        'pos_3': pos_3_roles,
    },
    'vectors': {
        'pos_2': pos_2_vectors,
        'pos_3': pos_3_vectors,
    },
    'pca_transformed': pca_transformed,
    'variance_explained': variance_explained,
    'n_components': n_components,
    'pca': pca,
    'scaler': scaler,
}

# Create the output directory on first use, then write the bundle.
pca_dir = f"{dir}/pca"
os.makedirs(pca_dir, exist_ok=True)
torch.save(results, f"{pca_dir}/pre_layer{layer}_mean_pos23.pt")



## Plots

In [31]:
# From here on `type` names the PCA variant used in result file names;
# `dir` was built from the earlier dataset value and is not recomputed.
type = "pos23"

# Plots go under a directory named after the lower-cased, dash-separated model name.
model_slug = model_name.lower().replace(' ', '-')
plot_dir = f"/root/git/plots/{model_slug}/roles"
os.makedirs(plot_dir, exist_ok=True)

# Choose between the "_mean_" result file and the plain one.
normalized = True
pca_path = (
    f"{dir}/pca/pre_layer{layer}_mean_{type}.pt"
    if normalized
    else f"{dir}/pca/pre_layer{layer}_{type}.pt"
)
# NOTE: weights_only=False unpickles arbitrary Python objects; only load files
# that were written by a trusted run of this notebook.
pca_results = torch.load(pca_path, weights_only=False)


In [12]:
# flip PCs if needed (1-based PC numbers)
# NOTE: in this notebook `to_flip` is only read by the disabled loop below, so
# running this cell as-is leaves `pca_results` untouched.
to_flip = [1, 3]

# Disabled: flips the listed components in the loaded results and re-saves them.
# NOTE(review): the save path below ("layer{layer}_normalized_pos23.pt") is not the
# file name the loading cell reads ("pre_layer{layer}_mean_{type}.pt") — confirm
# the intended target before re-enabling.
# for pc in to_flip:
#     pca_results['pca'].components_[pc - 1] *= -1
#     pca_results['pca_transformed'][:, pc - 1] *= -1
# torch.save(pca_results, f"{dir}/pca/layer{layer}_normalized_pos23.pt")

In [32]:
# Load the default vectors (not PCs) from the pre_vectors directory.
# NOTE(review): an earlier cell loads "default_vectors.pt" from f"{dir}" directly
# rather than from pre_vectors — confirm which location is the intended one.
default_vectors_path = f"{dir}/pre_vectors/default_vectors.pt"
default_vectors = torch.load(default_vectors_path)

In [33]:
# also calculate role labels for plotting
def get_role_labels(pca_results):
    """Build human-readable plot labels for the roles in a PCA results dict.

    Role names have underscores replaced by spaces and are title-cased, then
    tagged with the position they came from: ``pos_2`` roles get
    ``"(Somewhat RP)"`` and ``pos_3`` roles get ``"(Fully RP)"``.

    Args:
        pca_results: Mapping whose ``'roles'`` entry maps position keys
            (``'pos_2'``, ``'pos_3'``) to lists of role names. Either key may
            be absent; other keys are ignored.

    Returns:
        A list of label strings: all ``pos_2`` labels first, followed by all
        ``pos_3`` labels, each group in its stored order.
    """
    # Position key -> tag appended to the label, in output order.
    tag_by_position = (
        ('pos_2', "Somewhat RP"),
        ('pos_3', "Fully RP"),
    )
    roles_by_position = pca_results['roles']

    labels = []
    for position, tag in tag_by_position:
        if position not in roles_by_position.keys():
            continue
        for raw_name in roles_by_position[position]:
            display_name = raw_name.replace('_', ' ').title()
            labels.append(f"{display_name} ({tag})")
    return labels

# Build the plot labels for the loaded PCA results (pos_2 labels first, then
# pos_3) and print how many there are as a quick sanity check.
role_labels = get_role_labels(pca_results)
print(len(role_labels))



463


In [34]:
# Take the 'default_1' activation at the configured layer and shape it as a
# single-row float matrix so it can go through the stored scaler and PCA.
default_activation = default_vectors['activations']['default_1']
assistant_layer_activation = default_activation[layer, :].float().reshape(1, -1)

# Apply the scaler and the PCA projection that were saved with the results.
stored_scaler = pca_results['scaler']
asst_scaled = stored_scaler.transform(assistant_layer_activation)
stored_pca = pca_results['pca']
asst_projected = stored_pca.transform(asst_scaled)

# Convert the raw (unscaled) activation tensor to a NumPy array.
assistant_layer_activation = assistant_layer_activation.numpy()


In [35]:
# Subtitle shown on every PC plot; the wording depends on `normalized`.
subtitle = (
    f"{model_name.replace('-', ' ')}, Layer {layer} - Mean-Centered"
    if normalized
    else f"{model_name.replace('-', ' ')}, Layer {layer} - Shared Question Set, Mean-Centered and Scaled Vectors"
)

# Arguments that are identical for every component plot.
shared_plot_args = {
    'pca_results': pca_results,
    'role_labels': role_labels,
    'layer': layer,
    'assistant_activation': default_vectors['activations']['default_1'],
    'assistant_projection': asst_projected[0],
    'title': "PCA on Role-Playing Vectors",
    'subtitle': subtitle,
}

# Show one figure per component for indices 0..5 (the disabled export below
# names the files pc1..pc6).
for i in range(6):
    fig = plot_pc(pc_component=i, **shared_plot_args)
    fig.show()

    # if not normalized:
    #     fig.write_html(f"{plot_dir}/pc{i+1}.html")
    # else:
    #     fig.write_html(f"{plot_dir}/pc{i+1}_mean.html")


In [25]:
# Build the variance-explained figure for the role PCA, show it, and export
# it as HTML into the plot directory.
variance_subtitle = f"{model_name.replace('-', ' ')}, Layer {layer} - {len(role_labels)} Total Components"
var_fig = plot_variance_explained(
    pca_results['variance_explained'],
    title="Variance Explained by Role PC Components",
    subtitle=variance_subtitle,
    max_components=50,
)
var_fig.show()
var_fig.write_html(f"{plot_dir}/variance_explained.html")