# Compare vectors to behavior


In [5]:
import torch
import os
import sys
import numpy as np
import pandas as pd

sys.path.append('.')
sys.path.append('..')


In [1]:
# Experiment configuration: the model whose saved artifacts are analysed and
# the layer index at which vectors are compared.
model_name = "qwen-3-32b"
layer = 32

# Root directory holding this model's artifacts.
base_dir = "/workspace/" + model_name

In [38]:
# Both reference files live in the roles_240 directory under base_dir.
roles_dir = f"{base_dir}/roles_240"

# contrast_vectors is indexed by layer further down in the notebook.
contrast_vectors = torch.load(f"{roles_dir}/contrast_vectors.pt")

# Only the 'default_1' entry under 'activations' is kept; it is likewise
# indexed by layer further down.
default_all_layers = torch.load(f"{roles_dir}/default_vectors.pt")['activations']['default_1']

In [39]:
# Load the saved PCA result bundles for roles and traits at the configured layer.
# The cells below read 'vectors' and 'roles' from role_results, and 'vectors'
# and 'traits' from trait_results.
#
# SECURITY: weights_only=False makes torch.load fall back to full pickle
# deserialization, which can execute arbitrary code embedded in the file.
# Only load files produced by a trusted pipeline.
# NOTE(review): presumably required because the bundles contain non-tensor
# Python objects (the load emits scikit-learn persistence warnings) — confirm.
role_results = torch.load(f"{base_dir}/roles_240/pca/layer{layer}_pos23.pt", weights_only=False)
trait_results = torch.load(f"{base_dir}/traits_240/pca/layer{layer}_pos-neg50.pt", weights_only=False)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [40]:
# Role vectors are stored in two groups keyed 'pos_2' and 'pos_3', each with a
# list of role names under the same key.
# NOTE(review): assumes each group is a Python list of equally-shaped tensors
# (they are concatenated with `+` and passed to torch.stack) — confirm.
pos_2_vectors = role_results['vectors']['pos_2']
pos_3_vectors = role_results['vectors']['pos_3']
pos_2_roles = role_results['roles']['pos_2']
pos_3_roles = role_results['roles']['pos_3']

# Labels are matched to vectors purely by position, so each group must have
# exactly one name per vector; fail loudly here rather than mislabel rows later.
assert len(pos_2_vectors) == len(pos_2_roles), "pos_2 vectors/roles length mismatch"
assert len(pos_3_vectors) == len(pos_3_roles), "pos_3 vectors/roles length mismatch"

# Concatenate once (pos_2 first, then pos_3) and reuse the result for stacking;
# role_labels follows the same order.
combined_vectors = pos_2_vectors + pos_3_vectors
role_vectors = torch.stack(combined_vectors).float()
role_labels = pos_2_roles + pos_3_roles
print(role_vectors.shape)

torch.Size([377, 80, 8192])


In [41]:
# Trait vectors and their names are stored under the same group key.
trait_group = 'pos_neg_50'
trait_labels = trait_results['traits'][trait_group]

# Stack the per-trait tensors into a single float32 tensor.
trait_vectors = torch.stack(trait_results['vectors'][trait_group]).float()

print(trait_vectors.shape)


torch.Size([240, 80, 8192])


In [42]:
import torch.nn.functional as F

# Reference directions at the chosen layer.
# NOTE(review): assumes each is a 1-D tensor of size hidden — confirm.
default_vector = default_all_layers[layer].float()
# The stored contrast vector is negated, so a positive 'contrast_similarity'
# means alignment with the *negative* of the stored direction.
contrast_vector = contrast_vectors[layer].float() * -1

# Unit-normalise the references as [1, hidden] row vectors.
default_norm = F.normalize(default_vector.unsqueeze(0), dim=-1)
contrast_norm = F.normalize(contrast_vector.unsqueeze(0), dim=-1)

# Role vectors at the chosen layer; cosine similarity is the dot product of
# unit vectors. squeeze(-1) drops only the trailing singleton dimension, so the
# result stays 1-D even when there is a single vector.
role_vectors_at_layer = role_vectors[:, layer, :]  # Shape: [n_roles, hidden]
role_vectors_norm = F.normalize(role_vectors_at_layer, dim=-1)
role_similarities = (role_vectors_norm @ default_norm.T).squeeze(-1)  # Shape: [n_roles]
role_similarities_contrast = (role_vectors_norm @ contrast_norm.T).squeeze(-1)  # Shape: [n_roles]

# Same computation for trait vectors.
trait_vectors_at_layer = trait_vectors[:, layer, :]  # Shape: [n_traits, hidden]
trait_vectors_norm = F.normalize(trait_vectors_at_layer, dim=-1)
trait_similarities = (trait_vectors_norm @ default_norm.T).squeeze(-1)  # Shape: [n_traits]
trait_similarities_contrast = (trait_vectors_norm @ contrast_norm.T).squeeze(-1)  # Shape: [n_traits]

# Role type labels follow the concatenation order used for role_labels:
# all pos_2 entries first, then all pos_3 entries.
num_pos_2 = len(pos_2_roles)
num_pos_3 = len(pos_3_roles)
role_types = ['pos_2'] * num_pos_2 + ['pos_3'] * num_pos_3

# One row per role, tagged with label='role'.
role_df = pd.DataFrame({
    'name': role_labels,
    'label': 'role',
    'role_type': role_types,
    'default_similarity': role_similarities.numpy(),
    'contrast_similarity': role_similarities_contrast.numpy()
})

# One row per trait, tagged with label='trait'; traits have no role_type.
trait_df = pd.DataFrame({
    'name': trait_labels,
    'label': 'trait',
    'role_type': None,
    'default_similarity': trait_similarities.numpy(),
    'contrast_similarity': trait_similarities_contrast.numpy()
})

# Combined table: roles first, then traits, with a fresh 0..n-1 index.
df = pd.concat([role_df, trait_df], ignore_index=True)


In [43]:
# The 50 'pos_3' roles with the highest contrast_similarity, highest first.
pos_3_mask = df['role_type'] == 'pos_3'
df.loc[pos_3_mask].sort_values('contrast_similarity', ascending=False).head(50)

Unnamed: 0,name,label,role_type,default_similarity,contrast_similarity
353,assistant,role,pos_3,0.997039,0.293464
143,summarizer,role,pos_3,0.980068,0.283677
254,instructor,role,pos_3,0.989009,0.253344
328,consultant,role,pos_3,0.98955,0.243047
200,planner,role,pos_3,0.981953,0.241452
216,organizer,role,pos_3,0.985661,0.241123
363,analyst,role,pos_3,0.984435,0.238824
174,researcher,role,pos_3,0.985303,0.236318
171,reviewer,role,pos_3,0.984693,0.233218
142,supervisor,role,pos_3,0.984356,0.229611


In [44]:
# The 50 'pos_3' roles with the lowest contrast_similarity (still listed in
# descending order, so the most negative value is last).
pos_3_mask = df['role_type'] == 'pos_3'
df.loc[pos_3_mask].sort_values('contrast_similarity', ascending=False).tail(50)

Unnamed: 0,name,label,role_type,default_similarity,contrast_similarity
156,simulacrum,role,pos_3,0.883287,-0.054747
212,parasite,role,pos_3,0.866285,-0.054758
149,spirit,role,pos_3,0.897585,-0.056028
229,narcissist,role,pos_3,0.747981,-0.060354
167,romantic,role,pos_3,0.887845,-0.061376
112,warrior,role,pos_3,0.883315,-0.065532
154,smuggler,role,pos_3,0.810311,-0.070233
322,criminal,role,pos_3,0.802728,-0.070508
228,narrator,role,pos_3,0.899819,-0.071091
194,predator,role,pos_3,0.867048,-0.074169


In [45]:
# Persist the similarity table, sorted by similarity to the default vector.
# to_csv does not create missing parent directories, so create the per-model
# results directory first (a no-op when it already exists).
results_dir = f"./results/{model_name}"
os.makedirs(results_dir, exist_ok=True)
df.sort_values(by='default_similarity', ascending=False).to_csv(f"{results_dir}/vector_similarity.csv", index=False)

## Behavioral data

In [3]:
import json

# Directory containing one JSON score file per trait; the file stem is used as
# the trait name.
traits_scores_path = f"{base_dir}/traits/default_scores"
trait_scores = {}

# sorted() makes the iteration (and dict insertion) order deterministic;
# os.listdir returns entries in arbitrary order.
for file in sorted(os.listdir(traits_scores_path)):
    # splitext strips only the final extension, whereas str.replace would
    # remove every '.json' occurrence inside the filename.
    trait_name, extension = os.path.splitext(file)
    if extension != '.json':
        continue

    with open(os.path.join(traits_scores_path, file), 'r', encoding='utf-8') as f:
        scores = json.load(f)

    # Keep only numeric entries; anything else (e.g. strings) is ignored.
    # NOTE(review): bool is a subclass of int, so JSON true/false would be
    # counted as 1/0 here — confirm score files never contain booleans.
    numeric_scores = [v for v in scores.values() if isinstance(v, (int, float))]

    if numeric_scores:
        mean_score = np.mean(numeric_scores)
    else:
        # If all values are non-numeric, use NaN
        mean_score = np.nan

    trait_scores[trait_name] = mean_score




In [9]:
# load in csv
df = pd.read_csv(f"./results/{model_name}/vector_similarity.csv")

# Only map trait_scores to rows where label == 'trait'
df['trait_score'] = df.apply(
    lambda row: trait_scores.get(row['name']) if row['label'] == 'trait' else None, 
    axis=1
)

df.sort_values(by='default_similarity', ascending=False).to_csv(f"./results/{model_name}/vector_similarity.csv", index=False)

print(f"Trait rows with scores: {df[df['label'] == 'trait']['trait_score'].notna().sum()}")
print(f"Role rows with scores: {df[df['label'] == 'role']['trait_score'].notna().sum()}")  # Should be 0

Trait rows with scores: 240
Role rows with scores: 0
