# Compare vectors to behavior


In [1]:
import torch
import os
import sys
import numpy as np
import pandas as pd

sys.path.append('.')
sys.path.append('..')


In [51]:
model_name = "llama-3.3-70b"
layer = 40
base_dir = f"/workspace/{model_name}"

## Vector sim

In [28]:
default_all_layers = torch.load(f"{base_dir}/roles_240/default_vectors.pt")['activations']['default_1']
contrast_vectors = torch.load(f"{base_dir}/roles_240/contrast_vectors.pt")

FileNotFoundError: [Errno 2] No such file or directory: '/workspace/qwen-3-32b/roles_240/contrast_vectors.pt'

In [52]:
role_results = torch.load(f"{base_dir}/roles_240/pca/layer{layer}_mean_pos23.pt", weights_only=False)
trait_results = torch.load(f"{base_dir}/traits_240/pca/layer{layer}_mean_pos-neg50.pt", weights_only=False)

In [40]:
pos_2_vectors = role_results['vectors']['pos_2']
pos_3_vectors = role_results['vectors']['pos_3']
pos_2_roles = role_results['roles']['pos_2']
pos_3_roles = role_results['roles']['pos_3']

combined_vectors = pos_2_vectors + pos_3_vectors
role_vectors = torch.stack(pos_2_vectors + pos_3_vectors).float()
role_labels = pos_2_roles + pos_3_roles
print(role_vectors.shape)

torch.Size([377, 80, 8192])


In [53]:
trait_vectors = torch.stack(trait_results['vectors']['pos_neg_50']).float()
trait_labels = trait_results['traits']['pos_neg_50']

print(trait_vectors.shape)


torch.Size([240, 80, 8192])


In [None]:
import torch.nn.functional as F
import pandas as pd

default_vector = default_all_layers[layer].float()
contrast_vector = contrast_vectors[layer].float() * -1

default_norm = F.normalize(default_vector.unsqueeze(0), dim=-1)
contrast_norm = F.normalize(contrast_vector.unsqueeze(0), dim=-1)

# Get role vectors at the specified layer and compute cosine similarity
role_vectors_at_layer = role_vectors[:, layer, :]
role_vectors_norm = F.normalize(role_vectors_at_layer, dim=-1)
role_similarities = (role_vectors_norm @ default_norm.T).squeeze()
role_similarities_contrast = (role_vectors_norm @ contrast_norm.T).squeeze()

# Get trait vectors at the specified layer and compute cosine similarity
trait_vectors_at_layer = trait_vectors[:, layer, :]
trait_vectors_norm = F.normalize(trait_vectors_at_layer, dim=-1)
trait_similarities = (trait_vectors_norm @ default_norm.T).squeeze()
trait_similarities_contrast = (trait_vectors_norm @ contrast_norm.T).squeeze()

# Create role type labels (pos_2 or pos_3)
num_pos_2 = len(pos_2_roles)
num_pos_3 = len(pos_3_roles)
role_types = ['pos_2'] * num_pos_2 + ['pos_3'] * num_pos_3

# Build separate dataframes
role_df = pd.DataFrame({
    'name': role_labels,
    'role_type': role_types,
    'default_similarity': role_similarities.numpy(),
    'contrast_similarity': role_similarities_contrast.numpy()
})

trait_df = pd.DataFrame({
    'name': trait_labels,
    'default_similarity': trait_similarities.numpy(),
    'contrast_similarity': trait_similarities_contrast.numpy()
})

In [43]:
df[df['role_type'] == 'pos_3'].sort_values(by='contrast_similarity', ascending=False).head(50)

Unnamed: 0,name,label,role_type,default_similarity,contrast_similarity
353,assistant,role,pos_3,0.997039,0.293464
143,summarizer,role,pos_3,0.980068,0.283677
254,instructor,role,pos_3,0.989009,0.253344
328,consultant,role,pos_3,0.98955,0.243047
200,planner,role,pos_3,0.981953,0.241452
216,organizer,role,pos_3,0.985661,0.241123
363,analyst,role,pos_3,0.984435,0.238824
174,researcher,role,pos_3,0.985303,0.236318
171,reviewer,role,pos_3,0.984693,0.233218
142,supervisor,role,pos_3,0.984356,0.229611


In [44]:
df[df['role_type'] == 'pos_3'].sort_values(by='contrast_similarity', ascending=False).tail(50)

Unnamed: 0,name,label,role_type,default_similarity,contrast_similarity
156,simulacrum,role,pos_3,0.883287,-0.054747
212,parasite,role,pos_3,0.866285,-0.054758
149,spirit,role,pos_3,0.897585,-0.056028
229,narcissist,role,pos_3,0.747981,-0.060354
167,romantic,role,pos_3,0.887845,-0.061376
112,warrior,role,pos_3,0.883315,-0.065532
154,smuggler,role,pos_3,0.810311,-0.070233
322,criminal,role,pos_3,0.802728,-0.070508
228,narrator,role,pos_3,0.899819,-0.071091
194,predator,role,pos_3,0.867048,-0.074169


In [None]:
role_df.sort_values(by='default_similarity', ascending=False).to_csv(f"./results/{model_name}/role_similarity.csv", index=False)
trait_df.sort_values(by='default_similarity', ascending=False).to_csv(f"./results/{model_name}/trait_similarity.csv", index=False)

## Behavioral data

In [26]:
import json

traits_scores_path = f"{base_dir}/traits/default_scores"
trait_scores = {}

for file in os.listdir(traits_scores_path):
    if file.endswith('.json'):
        trait_name = file.replace('.json', '')
        with open(os.path.join(traits_scores_path, file), 'r') as f:
            scores = json.load(f)

        numeric_scores = [v for v in scores.values() if isinstance(v, (int, float))]
            
        if numeric_scores:
            mean_score = np.mean(numeric_scores)
        else:
            # If all values are non-numeric, use NaN
            mean_score = np.nan

        trait_scores[trait_name] = mean_score




In [None]:
roles_scores_path = f"{base_dir}/roles/default_scores"
role_scores = {}

for file in os.listdir(roles_scores_path):
    if file.endswith('.json'):
        role_name = file.replace('.json', '')
        with open(os.path.join(roles_scores_path, file), 'r') as f:
            scores = json.load(f)

        numeric_scores = [v for v in scores.values() if isinstance(v, (int, float))]

        # Remap: 0->0, 1->0, 2->1, 3->2
        remapped = [0 if s in [0, 1] else (1 if s == 2 else 2) for s in numeric_scores]

        # Count occurrences of each score
        counts = {
            'pos_0': numeric_scores.count(0),
            'pos_1': numeric_scores.count(1),
            'pos_2': numeric_scores.count(2),
            'pos_3': numeric_scores.count(3),
            'mean_score': sum(remapped) / (len(remapped) * 2) if remapped else np.nan
        }

        role_scores[role_name] = counts

# Convert to DataFrame
role_scores_df = pd.DataFrame.from_dict(role_scores, orient='index')
role_scores_df.index.name = 'name'
role_scores_df = role_scores_df.reset_index()

print(role_scores_df.head())

In [29]:
# Load CSVs
role_df = pd.read_csv(f"./results/{model_name}/role_similarity.csv")
trait_df = pd.read_csv(f"./results/{model_name}/trait_similarity.csv")

# Add trait_score to trait_df
trait_df['trait_score'] = trait_df['name'].map(trait_scores)

# Merge role score counts into role_df
role_df = role_df.merge(role_scores_df, on='name', how='left')

# Save updated CSVs
role_df.to_csv(f"./results/{model_name}/role_similarity.csv", index=False)
trait_df.to_csv(f"./results/{model_name}/trait_similarity.csv", index=False)

print(f"Trait rows with scores: {trait_df['trait_score'].notna().sum()}")
print(role_df.head())

Trait rows with scores: 240
           name role_type  default_similarity  contrast_similarity  pos_0  \
0     assistant     pos_3            0.999253            -0.020101      0   
1   interpreter     pos_2            0.998760            -0.054107      0   
2    generalist     pos_2            0.998374            -0.035637      0   
3   facilitator     pos_2            0.998215            -0.037791      0   
4  collaborator     pos_2            0.998125            -0.053676      0   

   pos_1  pos_2  pos_3  
0      0      4    196  
1      5     63    132  
2     39     83     78  
3      3     55    142  
4      1     61    138  


## Correlation Analysis

In [None]:
from scipy.stats import pearsonr, spearmanr
import json

# Load separate CSVs
role_df = pd.read_csv(f"./results/{model_name}/role_similarity.csv")
trait_df = pd.read_csv(f"./results/{model_name}/trait_similarity.csv")
trait_df = trait_df.dropna(subset=['trait_score'])
role_df_with_scores = role_df.dropna(subset=['mean_score'])

print("=" * 60)
print("Correlation: default_similarity vs contrast_similarity")
print("=" * 60)

# Roles only
pearson_roles, p_pearson_roles = pearsonr(role_df['default_similarity'], role_df['contrast_similarity'])
spearman_roles, p_spearman_roles = spearmanr(role_df['default_similarity'], role_df['contrast_similarity'])
print(f"\nRoles (n={len(role_df)}):")
print(f"  Pearson:  r={pearson_roles:.4f}, p={p_pearson_roles:.2e}")
print(f"  Spearman: ρ={spearman_roles:.4f}, p={p_spearman_roles:.2e}")

# Traits only
pearson_traits, p_pearson_traits = pearsonr(trait_df['default_similarity'], trait_df['contrast_similarity'])
spearman_traits, p_spearman_traits = spearmanr(trait_df['default_similarity'], trait_df['contrast_similarity'])
print(f"\nTraits (n={len(trait_df)}):")
print(f"  Pearson:  r={pearson_traits:.4f}, p={p_pearson_traits:.2e}")
print(f"  Spearman: ρ={spearman_traits:.4f}, p={p_spearman_traits:.2e}")

print("\n" + "=" * 60)
print("Correlation: vector similarity vs behavioral score")
print("=" * 60)

# Roles: default_similarity vs mean_score
pearson_default_role, p_default_role = pearsonr(role_df_with_scores['default_similarity'], role_df_with_scores['mean_score'])
spearman_default_role, p_spearman_default_role = spearmanr(role_df_with_scores['default_similarity'], role_df_with_scores['mean_score'])
print(f"\nRoles - default_similarity vs mean_score (n={len(role_df_with_scores)}):")
print(f"  Pearson:  r={pearson_default_role:.4f}, p={p_default_role:.2e}")
print(f"  Spearman: ρ={spearman_default_role:.4f}, p={p_spearman_default_role:.2e}")

# Roles: contrast_similarity vs mean_score
pearson_contrast_role, p_contrast_role = pearsonr(role_df_with_scores['contrast_similarity'], role_df_with_scores['mean_score'])
spearman_contrast_role, p_spearman_contrast_role = spearmanr(role_df_with_scores['contrast_similarity'], role_df_with_scores['mean_score'])
print(f"\nRoles - contrast_similarity vs mean_score (n={len(role_df_with_scores)}):")
print(f"  Pearson:  r={pearson_contrast_role:.4f}, p={p_contrast_role:.2e}")
print(f"  Spearman: ρ={spearman_contrast_role:.4f}, p={p_spearman_contrast_role:.2e}")

# Traits: default_similarity vs trait_score
pearson_default_trait, p_default_trait = pearsonr(trait_df['default_similarity'], trait_df['trait_score'])
spearman_default_trait, p_spearman_default_trait = spearmanr(trait_df['default_similarity'], trait_df['trait_score'])
print(f"\nTraits - default_similarity vs trait_score (n={len(trait_df)}):")
print(f"  Pearson:  r={pearson_default_trait:.4f}, p={p_default_trait:.2e}")
print(f"  Spearman: ρ={spearman_default_trait:.4f}, p={p_spearman_default_trait:.2e}")

# Traits: contrast_similarity vs trait_score
pearson_contrast_trait, p_contrast_trait = pearsonr(trait_df['contrast_similarity'], trait_df['trait_score'])
spearman_contrast_trait, p_spearman_contrast_trait = spearmanr(trait_df['contrast_similarity'], trait_df['trait_score'])
print(f"\nTraits - contrast_similarity vs trait_score (n={len(trait_df)}):")
print(f"  Pearson:  r={pearson_contrast_trait:.4f}, p={p_contrast_trait:.2e}")
print(f"  Spearman: ρ={spearman_contrast_trait:.4f}, p={p_spearman_contrast_trait:.2e}")

In [None]:
# Save to JSON
results = {
    "default_vs_contrast": {
        "roles": {
            "n": len(role_df),
            "pearson": {"r": pearson_roles, "p": p_pearson_roles},
            "spearman": {"rho": spearman_roles, "p": p_spearman_roles}
        },
        "traits": {
            "n": len(trait_df),
            "pearson": {"r": pearson_traits, "p": p_pearson_traits},
            "spearman": {"rho": spearman_traits, "p": p_spearman_traits}
        }
    },
    "vector_similarity_vs_behavioral_score": {
        "roles": {
            "default_similarity": {
                "n": len(role_df_with_scores),
                "pearson": {"r": pearson_default_role, "p": p_default_role},
                "spearman": {"rho": spearman_default_role, "p": p_spearman_default_role}
            },
            "contrast_similarity": {
                "n": len(role_df_with_scores),
                "pearson": {"r": pearson_contrast_role, "p": p_contrast_role},
                "spearman": {"rho": spearman_contrast_role, "p": p_spearman_contrast_role}
            }
        },
        "traits": {
            "default_similarity": {
                "n": len(trait_df),
                "pearson": {"r": pearson_default_trait, "p": p_default_trait},
                "spearman": {"rho": spearman_default_trait, "p": p_spearman_default_trait}
            },
            "contrast_similarity": {
                "n": len(trait_df),
                "pearson": {"r": pearson_contrast_trait, "p": p_contrast_trait},
                "spearman": {"rho": spearman_contrast_trait, "p": p_spearman_contrast_trait}
            }
        }
    }
}

with open(f"./results/{model_name}/correlation_results.json", "w") as f:
    json.dump(results, f, indent=2)

print(f"Saved correlation results to ./results/{model_name}/correlation_results.json")

## Role PC similarity with traits

In [54]:
import torch.nn.functional as F
# Get top 5 role PCs
role_pcs = torch.tensor(role_results['pca'].components_[:5]).float()  # Shape: [5, hidden_dim]

# Get trait vectors at the specified layer
trait_vecs = trait_vectors[:, layer, :].float()  # Shape: [n_traits, hidden_dim]

# Normalize both for cosine similarity
role_pcs_norm = F.normalize(role_pcs, dim=-1)  # Shape: [5, hidden_dim]
trait_vecs_norm = F.normalize(trait_vecs, dim=-1)  # Shape: [n_traits, hidden_dim]

# Compute cosine similarity: [n_traits, 5]
similarities = trait_vecs_norm @ role_pcs_norm.T

# Create dataframe with similarities
pc_sim_df = pd.DataFrame({
    'trait': trait_labels,
    'PC1': similarities[:, 0].numpy(),
    'PC2': similarities[:, 1].numpy(),
    'PC3': similarities[:, 2].numpy(),
    'PC4': similarities[:, 3].numpy(),
    'PC5': similarities[:, 4].numpy(),
})

# Print top 10 and bottom 10 for each PC
for i in range(5):
    pc_col = f'PC{i+1}'
    print(f"\n{'='*60}")
    print(f"{pc_col} - Top 10 traits (most similar)")
    print("="*60)
    print(pc_sim_df.nlargest(10, pc_col)[['trait', pc_col]].to_string(index=False))
    
    print(f"\n{pc_col} - Bottom 10 traits (most dissimilar)")
    print("-"*60)
    print(pc_sim_df.nsmallest(10, pc_col)[['trait', pc_col]].to_string(index=False))


PC1 - Top 10 traits (most similar)
       trait      PC1
   enigmatic 0.762524
    ethereal 0.758917
    romantic 0.743763
    mystical 0.731033
     cryptic 0.722849
   whimsical 0.721664
 spontaneous 0.694479
melodramatic 0.670557
    dramatic 0.652410
   grandiose 0.625320

PC1 - Bottom 10 traits (most dissimilar)
------------------------------------------------------------
       trait       PC1
 transparent -0.657656
     factual -0.596230
    moderate -0.539773
    grounded -0.507743
conciliatory -0.506993
        calm -0.487453
  diplomatic -0.482365
  methodical -0.470943
    reserved -0.450273
  analytical -0.437280

PC2 - Top 10 traits (most similar)
       trait      PC2
    reactive 0.716139
      casual 0.715274
    visceral 0.701328
   impulsive 0.652146
    flippant 0.642449
  irreverent 0.637775
       petty 0.587813
disorganized 0.582317
    neurotic 0.581644
  nonchalant 0.569046

PC2 - Bottom 10 traits (most dissimilar)
----------------------------------------------

In [55]:
# Save the PC similarity dataframe
pc_sim_df.to_csv(f"./results/{model_name}/role_pc_trait_similarity.csv", index=False)
print(f"Saved to ./results/{model_name}/role_pc_trait_similarity.csv")

Saved to ./results/llama-3.3-70b/role_pc_trait_similarity.csv


In [2]:
# load in csv for all 3 models
models = [
    'gemma-2-27b',
    'qwen-3-32b',
    'llama-3.3-70b',
]


In [3]:
# Load PC similarity CSVs for all models
pc_sim_dfs = {}
for model in models:
    pc_sim_dfs[model] = pd.read_csv(f"./results/{model}/role_pc_trait_similarity.csv")

# Print top N traits for qwen PC1-3 and gemma PC2-3
N = 5


for model in models:
    print(f"\n{model}")
    print("="*60)
    for pc in ['PC1', 'PC2', 'PC3']:
        print(f"\n{pc} - Top {N}:")
        print(pc_sim_dfs[model].nlargest(N, pc)[['trait', pc]].to_string(index=False))
        print(f"\n{pc} - Bottom {N}:")
        print(pc_sim_dfs[model].nsmallest(N, pc)[['trait', pc]].to_string(index=False))



gemma-2-27b

PC1 - Top 5:
        trait      PC1
      factual 0.873635
   methodical 0.812940
  transparent 0.794871
     moderate 0.746195
structuralist 0.743175

PC1 - Bottom 5:
    trait       PC1
 romantic -0.877956
enigmatic -0.827663
 ethereal -0.802732
  cryptic -0.794250
obsessive -0.780832

PC2 - Top 5:
   trait      PC2
   stoic 0.705741
 serious 0.671351
detached 0.647201
  formal 0.612947
reserved 0.607707

PC2 - Bottom 5:
          trait       PC2
     gregarious -0.679885
improvisational -0.677909
         casual -0.632634
     irreverent -0.586699
     empathetic -0.582980

PC3 - Top 5:
       trait      PC3
    grounded 0.539665
   practical 0.506593
experiential 0.422742
reductionist 0.399932
       naive 0.396455

PC3 - Bottom 5:
      trait       PC3
   abstract -0.611486
introverted -0.607227
 conceptual -0.587547
big_picture -0.535779
    erudite -0.532346

qwen-3-32b

PC1 - Top 5:
      trait      PC1
 methodical 0.876927
    factual 0.846249
       calm 0.81730

In [18]:
# ============================================================
# CONFIGURATION - Edit this cell to change what gets plotted
# ============================================================

N = 5  # Number of traits to use from each PC

# Define axes: (model, pc, end) where end is 'top' or 'bottom'
# Each tuple becomes one axis on the radial plot
plot_axes = [
    ('qwen-3-32b', 'PC1', 'top'),       # analytical, calm, factual, methodical, transparent
    ('qwen-3-32b', 'PC3', 'bottom'),    # benevolent, forgiving, humanistic, meditative, nurturing
    ('gemma-2-27b', 'PC2', 'top'),   # casual, empathetic, gregarious, improvisational, irreverent
    ('gemma-2-27b', 'PC3', 'bottom'),      # experiential, grounded, naive, practical, reductionist
    ('qwen-3-32b', 'PC2', 'bottom'),    # abstract, eloquent, erudite, introverted, ritualistic
]

# Labels for each axis (customize as needed)
axis_labels = [
    'Assistant-like',
    'Supportive', 
    'Systematic',
    'Pedagogical',
    'Individualistic',
]

In [6]:
# Load behavioral trait scores for all 3 models
import json
behavioral_scores = {}

for model in models:
    model_scores_path = f'/workspace/{model}/traits/default_scores'
    model_data = {}
    
    for file in os.listdir(model_scores_path):
        if file.endswith('.json'):
            trait_name = file.replace('.json', '')
            with open(os.path.join(model_scores_path, file), 'r') as f:
                scores = json.load(f)
            
            numeric_scores = [v for v in scores.values() if isinstance(v, (int, float))]
            if numeric_scores:
                model_data[trait_name] = np.mean(numeric_scores)
            else:
                model_data[trait_name] = np.nan
    
    behavioral_scores[model] = model_data

print(f"Loaded behavioral scores for {len(models)} models")

Loaded behavioral scores for 3 models


In [19]:
import plotly.graph_objects as go

# Get traits for each axis based on configuration
axis_traits = []
for source_model, pc, end in plot_axes:
    df = pc_sim_dfs[source_model]
    if end == 'top':
        traits = df.nlargest(N, pc)['trait'].tolist()
    else:
        traits = df.nsmallest(N, pc)['trait'].tolist()
    axis_traits.append(traits)
    print(f"{source_model} {pc} ({end}): {traits}")

# Calculate mean behavioral score for each axis for each model
plot_data = {model: [] for model in models}

for axis_idx, traits in enumerate(axis_traits):
    for model in models:
        scores = [behavioral_scores[model].get(t, np.nan) for t in traits]
        mean_score = np.nanmean(scores)
        plot_data[model].append(mean_score)

# Create radial plot
fig = go.Figure()

colors = ['rgb(31, 119, 180)', 'rgb(255, 127, 14)', 'rgb(44, 160, 44)']

for idx, model in enumerate(models):
    values = plot_data[model] + [plot_data[model][0]]  # Close the polygon
    
    fig.add_trace(go.Scatterpolar(
        r=values,
        theta=axis_labels + [axis_labels[0]],
        name=model,
        line=dict(color=colors[idx])
    ))

fig.update_polars(
    radialaxis=dict(
        range=[0, 100],
        showticklabels=False
    )
)

fig.update_layout(
    title="Default Behavior Scores for Role PC Traits",
    height=500,
    width=600,
    showlegend=True
)

fig.show()

qwen-3-32b PC1 (top): ['methodical', 'factual', 'calm', 'transparent', 'analytical']
qwen-3-32b PC3 (bottom): ['benevolent', 'forgiving', 'humanistic', 'nurturing', 'meditative']
gemma-2-27b PC2 (top): ['stoic', 'serious', 'detached', 'formal', 'reserved']
gemma-2-27b PC3 (bottom): ['abstract', 'introverted', 'conceptual', 'big_picture', 'erudite']
qwen-3-32b PC2 (bottom): ['introverted', 'ritualistic', 'erudite', 'eloquent', 'abstract']
