# Compare vectors to behavior


In [1]:
import torch
import os
import sys
import numpy as np

# Put the current directory and its parent on the import search path,
# in that order, after whatever entries are already there.
sys.path.extend(['.', '..'])


In [25]:
# Run configuration: which model's saved artifacts to read and which layer to analyse.
model_name = "qwen-3-32b"
layer = 32  # used below both in file names and as an index into the layer axis
base_dir = "/workspace/" + model_name  # directory the saved vector files are loaded from

In [26]:
# Load the saved default vectors and keep only the 'default_1' entry of the
# 'activations' mapping; it is indexed by layer later in the notebook.
default_vectors_path = f"{base_dir}/roles_240/default_vectors.pt"
default_all_layers = torch.load(default_vectors_path)['activations']['default_1']

In [27]:
# Locations of the saved per-layer result files for roles and traits.
role_pca_path = f"{base_dir}/roles_240/pca/layer{layer}_mean_pos23.pt"
trait_pca_path = f"{base_dir}/traits_240/pca/layer{layer}_mean_pos-neg50.pt"

# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects, so these files must come from a trusted source.
role_results = torch.load(role_pca_path, weights_only=False)
trait_results = torch.load(trait_pca_path, weights_only=False)

In [28]:
# Pull the per-role vectors and the matching role names for the two groups
# ('pos_2' and 'pos_3') out of the loaded role results.
pos_2_vectors = role_results['vectors']['pos_2']
pos_3_vectors = role_results['vectors']['pos_3']
pos_2_roles = role_results['roles']['pos_2']
pos_3_roles = role_results['roles']['pos_3']

# Concatenate once (pos_2 first, then pos_3) and reuse the result for stacking,
# so vectors and labels are guaranteed to be assembled in the same order.
combined_vectors = pos_2_vectors + pos_3_vectors
role_labels = pos_2_roles + pos_3_roles

# Fail here, with a readable message, rather than later when the dataframe is
# built from labels and similarities of different lengths.
assert len(combined_vectors) == len(role_labels), (
    f"role vectors ({len(combined_vectors)}) and role labels "
    f"({len(role_labels)}) are misaligned"
)

# Stack into a single float32 tensor with one leading entry per role.
role_vectors = torch.stack(combined_vectors).float()
print(role_vectors.shape)

torch.Size([463, 64, 5120])


In [29]:
# Traits only have one group in the loaded results; read its names and vectors.
trait_group = 'pos_neg_50'
trait_labels = trait_results['traits'][trait_group]

# Stack the per-trait vectors into one float32 tensor (one leading entry per trait).
trait_vectors = torch.stack(trait_results['vectors'][trait_group]).float()

print(trait_vectors.shape)


torch.Size([240, 64, 5120])


In [30]:
import torch.nn.functional as F
import pandas as pd

# Default vector at the chosen layer, converted to float32.
default_vector = default_all_layers[layer].float()

# Unit-normalize the default vector. unsqueeze(0) adds a leading axis so it can
# act as a one-row matrix in the matmuls below.
# assumes default_vector is 1-D of size hidden — TODO confirm
default_norm = F.normalize(default_vector.unsqueeze(0), dim=-1)

# Role vectors at the chosen layer and their cosine similarity to the default.
# In the recorded run role_vectors is [463, 64, 5120], so this slice is [463, 5120].
role_vectors_at_layer = role_vectors[:, layer, :]  # Shape: [n_roles, hidden]
role_vectors_norm = F.normalize(role_vectors_at_layer, dim=-1)
# squeeze(-1) removes only the trailing size-1 axis left by the matmul, so the
# result stays 1-D even with a single role (a bare squeeze() would give a 0-d tensor).
role_similarities = (role_vectors_norm @ default_norm.T).squeeze(-1)  # Shape: [n_roles]

# Trait vectors at the chosen layer and their cosine similarity to the default.
# In the recorded run trait_vectors is [240, 64, 5120], so this slice is [240, 5120].
trait_vectors_at_layer = trait_vectors[:, layer, :]  # Shape: [n_traits, hidden]
trait_vectors_norm = F.normalize(trait_vectors_at_layer, dim=-1)
trait_similarities = (trait_vectors_norm @ default_norm.T).squeeze(-1)  # Shape: [n_traits]

# Group label per role, in the same order the role vectors were concatenated
# (all pos_2 entries first, then all pos_3 entries).
num_pos_2 = len(pos_2_roles)
num_pos_3 = len(pos_3_roles)
role_types = ['pos_2'] * num_pos_2 + ['pos_3'] * num_pos_3

# One row per role: name, kind, group, and similarity to the default vector.
role_df = pd.DataFrame({
    'name': role_labels,
    'label': 'role',
    'role_type': role_types,
    'similarity': role_similarities.numpy()
})

# One row per trait; traits have no pos_2/pos_3 group, so role_type is None.
trait_df = pd.DataFrame({
    'name': trait_labels,
    'label': 'trait',
    'role_type': None,
    'similarity': trait_similarities.numpy()
})

# Roles first, then traits, with a fresh 0..N-1 index.
df = pd.concat([role_df, trait_df], ignore_index=True)


In [31]:
# The 50 pos_3 roles most similar to the default vector, highest similarity first.
(
    df.loc[df['role_type'].eq('pos_3')]
    .sort_values('similarity', ascending=False)
    .head(50)
)

Unnamed: 0,name,label,role_type,similarity
439,assistant,role,pos_3,0.999252
222,teacher,role,pos_3,0.99739
339,interpreter,role,pos_3,0.996735
302,organizer,role,pos_3,0.99638
420,collaborator,role,pos_3,0.996115
223,synthesizer,role,pos_3,0.996034
279,presenter,role,pos_3,0.995923
210,tutor,role,pos_3,0.995884
319,moderator,role,pos_3,0.99585
322,mentor,role,pos_3,0.995819


In [32]:
# The 50 pos_3 roles least similar to the default vector, still listed in
# descending order of similarity (so the lowest value is the last row).
(
    df.loc[df['role_type'].eq('pos_3')]
    .sort_values('similarity', ascending=False)
    .tail(50)
)

Unnamed: 0,name,label,role_type,similarity
212,trickster,role,pos_3,0.950015
351,hermit,role,pos_3,0.949992
227,surfer,role,pos_3,0.949911
269,provocateur,role,pos_3,0.949791
197,whale,role,pos_3,0.949591
348,hoarder,role,pos_3,0.949572
391,dreamer,role,pos_3,0.949171
235,spirit,role,pos_3,0.948857
303,oracle,role,pos_3,0.948512
278,prey,role,pos_3,0.948227


In [33]:
# Save every role and trait row, most similar first, as a CSV without the index.
# to_csv does not create missing parent directories, so make sure the per-model
# results folder exists before writing (no-op if it is already there).
results_dir = f"./results/{model_name}"
os.makedirs(results_dir, exist_ok=True)
df.sort_values(by='similarity', ascending=False).to_csv(f"{results_dir}/vector_similarity.csv", index=False)