# Default behavior


In [13]:
import numpy as np
import json
import os
import torch
import pandas as pd

In [14]:
models = [
    'gemma-2-27b',
    'qwen-3-32b',
    'llama-3.3-70b',
]

scores_path = '/workspace/{model}/traits/default_scores'

In [15]:
# Load scores for all traits across different models
data = {}

for model in models:
    model_scores_path = f'/workspace/{model}/traits/default_scores'
    model_data = {}
    
    # Iterate through all trait files
    for file in os.listdir(model_scores_path):
        if file.endswith('.json'):
            trait_name = file.replace('.json', '')
            
            # Load the JSON file
            with open(os.path.join(model_scores_path, file), 'r') as f:
                scores = json.load(f)
            
            # Filter out non-numeric values (refusals, etc.) and calculate mean
            numeric_scores = [v for v in scores.values() if isinstance(v, (int, float))]
            
            if numeric_scores:
                mean_score = np.mean(numeric_scores)
            else:
                # If all values are non-numeric, use NaN
                mean_score = np.nan
            
            model_data[trait_name] = mean_score
    
    data[model] = model_data

# Create DataFrame with traits as rows and models as columns
df = pd.DataFrame(data)
print(f"Loaded scores for {len(df)} traits across {len(df.columns)} models")
print(f"\nDataFrame shape: {df.shape}")
print(f"\nNumber of NaN values per model:")
print(df.isna().sum())
print(f"\nFirst few rows:")
df.head()

Loaded scores for 240 traits across 3 models

DataFrame shape: (240, 3)

Number of NaN values per model:
gemma-2-27b      0
qwen-3-32b       0
llama-3.3-70b    0
dtype: int64

First few rows:


Unnamed: 0,gemma-2-27b,qwen-3-32b,llama-3.3-70b
absolutist,2.55,10.5,9.55
grandiose,0.25,0.325,1.125
relativist,75.1,69.575,67.0
holistic,88.775,89.925,89.0
goofy,3.8,4.05,3.825


In [16]:
# now get the top 5 traits for the top 5 PC traits for one of the models
import torch

pca_results = torch.load('/workspace/llama-3.3-70b/traits_240/pca/layer40_pos-neg50.pt', weights_only=False)
labels = pca_results['traits']['pos_neg_50']

traits_df = pd.DataFrame({
    'label': labels,
    'pc1': pca_results['pca_transformed'][:, 0],
    'pc2': pca_results['pca_transformed'][:, 1],
    'pc3': pca_results['pca_transformed'][:, 2],
    'pc4': pca_results['pca_transformed'][:, 3],
    'pc5': pca_results['pca_transformed'][:, 4],
})

traits_df.head(5)



Trying to unpickle estimator PCA from version 1.7.0 when using version 1.7.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Trying to unpickle estimator StandardScaler from version 1.7.0 when using version 1.7.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations



Unnamed: 0,label,pc1,pc2,pc3,pc4,pc5
0,zealous,21.55899,-46.148135,-16.24403,-39.555186,24.236954
1,wry,53.599807,6.029477,-9.867787,-2.250747,-30.484981
2,witty,8.627841,-65.2779,12.505692,-33.514593,-23.324574
3,whimsical,24.592243,-59.602846,5.875083,12.541718,16.588861
4,visceral,52.724652,-6.302639,33.776209,-18.275349,3.76979


In [17]:
# Calculate pc{i}_low and pc{i}_high for each PC and each model
pc_stats = {}

for model in models:
    model_stats = {}
    
    for i in range(1, 6):  # PC1 through PC5
        pc_col = f'pc{i}'
        
        # Sort by PC value to get top 5 and bottom 5
        sorted_traits = traits_df.sort_values(pc_col)
        
        # Get bottom 5 (low) and top 5 (high) trait names
        bottom_5_traits = sorted_traits.head(5)['label'].tolist()
        print(f"PC{i}")
        print(f"bottom_5_traits: {bottom_5_traits}")
        top_5_traits = sorted_traits.tail(5)['label'].tolist()
        print(f"top_5_traits: {top_5_traits}")
        # Calculate mean scores for these traits from the model's scores
        # Filter out NaN values in case any trait is missing
        bottom_5_scores = [df.loc[trait, model] for trait in bottom_5_traits if trait in df.index]
        top_5_scores = [df.loc[trait, model] for trait in top_5_traits if trait in df.index]
        
        model_stats[f'pc{i}_low'] = np.mean(bottom_5_scores) if bottom_5_scores else np.nan
        model_stats[f'pc{i}_high'] = np.mean(top_5_scores) if top_5_scores else np.nan
    
    pc_stats[model] = model_stats

# Create DataFrame for easy viewing
pc_stats_df = pd.DataFrame(pc_stats).T
print("PC statistics (mean scores for top 5 and bottom 5 traits):")
pc_stats_df

PC1
bottom_5_traits: ['supportive', 'conscientious', 'nurturing', 'resilient', 'earnest']
top_5_traits: ['bitter', 'cruel', 'hostile', 'nonchalant', 'flippant']
PC2
bottom_5_traits: ['animated', 'charismatic', 'metaphorical', 'dramatic', 'poetic']
top_5_traits: ['rationalist', 'detached', 'understated', 'reserved', 'dispassionate']
PC3
bottom_5_traits: ['bombastic', 'ritualistic', 'meticulous', 'solemn', 'formal']
top_5_traits: ['understated', 'accessible', 'naive', 'chill', 'casual']
PC4
bottom_5_traits: ['sassy', 'animated', 'charismatic', 'decisive', 'problem_solving']
top_5_traits: ['serene', 'cryptic', 'understated', 'enigmatic', 'meditative']
PC5
bottom_5_traits: ['exploratory', 'verbose', 'generous', 'circumspect', 'challenging']
top_5_traits: ['decisive', 'reverent', 'concise', 'confident', 'naive']
PC1
bottom_5_traits: ['supportive', 'conscientious', 'nurturing', 'resilient', 'earnest']
top_5_traits: ['bitter', 'cruel', 'hostile', 'nonchalant', 'flippant']
PC2
bottom_5_traits:

Unnamed: 0,pc1_low,pc1_high,pc2_low,pc2_high,pc3_low,pc3_high,pc4_low,pc4_high,pc5_low,pc5_high
gemma-2-27b,88.615,1.11,34.813,71.335,66.48,42.605,42.533,38.785,79.425,32.205
qwen-3-32b,92.382,0.49,39.587,60.178434,74.615,38.085,52.992,38.765,81.25,41.74
llama-3.3-70b,90.66,0.455,34.995,64.29,73.115,34.815,47.99,36.015,79.19,35.825


In [69]:
# load in each model's default responses projected into llama's trait space

projected = {}

for model_a in models:
    projected[model_a] = {}
    for model_b in models:
        projected_path = f'/workspace/{model_a}/traits/{model_b}/overall_mean.pt'
        projected[model_a][model_b] = torch.load(projected_path, weights_only=False)
        print(f"Top 5 PC Loadings for {model_b} projected into {model_a}'s trait space:")
        print(projected[model_a][model_b]['mean_activation_projected'][:5])
        print("\n")
    print("-"*100)

for model in models:
    projected[model] = torch.load(f'/workspace/llama-3.3-70b/traits/{model}/overall_mean.pt', weights_only=False)


Top 5 PC Loadings for gemma-2-27b projected into gemma-2-27b's trait space:
tensor([-16.7774, -17.5412,  -9.9381,  -7.4385,   4.1118], dtype=torch.float64)


Top 5 PC Loadings for qwen-3-32b projected into gemma-2-27b's trait space:
tensor([-20.0446, -16.0493,  -8.9185,  -6.7037,   5.0574], dtype=torch.float64)


Top 5 PC Loadings for llama-3.3-70b projected into gemma-2-27b's trait space:
tensor([-23.8249, -18.1947,  -8.6698,  -4.9781,   3.3518], dtype=torch.float64)


----------------------------------------------------------------------------------------------------
Top 5 PC Loadings for gemma-2-27b projected into qwen-3-32b's trait space:
tensor([ 33.9885,  -6.3186, -16.7539,  32.7086,  -2.9828], dtype=torch.float64)


Top 5 PC Loadings for qwen-3-32b projected into qwen-3-32b's trait space:
tensor([ 37.2324,   2.5526, -16.6726,  16.1802,  -7.9260], dtype=torch.float64)


Top 5 PC Loadings for llama-3.3-70b projected into qwen-3-32b's trait space:
tensor([ 38.8172,   2.9463, -19.41

In [None]:
print(projected['llama-3.3-70b']['mean_activation'].shape)


torch.Size([80, 8192])


In [62]:
# Calculate axis max values for each PC dimension
# For plot1: pc1_low, pc2_high, pc3_low, pc4_low, pc5_low
# For plot2: pc1_high, pc2_low, pc3_high, pc4_high, pc5_high

plot1_axes = []
plot2_axes = []

for i in range(5):
    pc_min = min(pca_results['pca_transformed'][:, i])
    pc_max = max(pca_results['pca_transformed'][:, i])
    
    # For plot1_contents: ['pc1_low', 'pc2_high', 'pc3_low', 'pc4_low', 'pc5_low']
    if i == 1:  # PC2 uses _high, so use max
        plot1_axes.append(pc_max)
    else:  # PC1, PC3, PC4, PC5 use _low, so use min
        plot1_axes.append(pc_min)
    
    # For plot2_contents: ['pc1_high', 'pc2_low', 'pc3_high', 'pc4_high', 'pc5_high']
    if i == 1:  # PC2 uses _low, so use min
        plot2_axes.append(pc_min)
    else:  # PC1, PC3, PC4, PC5 use _high, so use max
        plot2_axes.append(pc_max)
    
    print(f"PC{i+1}: min={pc_min:.2f}, max={pc_max:.2f}")

print(f"\nplot1_axes: {[f'{x:.2f}' for x in plot1_axes]}")
print(f"plot2_axes: {[f'{x:.2f}' for x in plot2_axes]}")


PC1: min=-100.58, max=86.01
PC2: min=-94.68, max=83.44
PC3: min=-77.38, max=75.33
PC4: min=-67.35, max=72.82
PC5: min=-55.03, max=55.32

plot1_axes: ['-100.58', '83.44', '-77.38', '-67.35', '-55.03']
plot2_axes: ['86.01', '-94.68', '75.33', '72.82', '55.32']


In [68]:
# make two pentagon plots, one for each of the axes

plot1_contents = ['pc1_low', 'pc2_high', 'pc3_low', 'pc4_low', 'pc5_low']
plot2_contents = ['pc1_high', 'pc2_low', 'pc3_high', 'pc4_high', 'pc5_high']
plot1_labels = ['Agreeable', 'Analytical', 'Formal', 'Dynamic', 'Exploratory']
plot2_labels = ['Hostile', 'Expressive', 'Casual', 'Contemplative', 'Decisive']

# Create a figure with two subplots
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create subplot structure with two polar plots
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{'type': 'polar'}] * 2],
    subplot_titles=('Behavior Scores', 'Projected Activations in Trait PC Space')
)

# Define colors for each model
colors = ['rgb(31, 119, 180)', 'rgb(255, 127, 14)', 'rgb(44, 160, 44)']

# Calculate max range for bidirectional axis (find the largest absolute value)
max_range = max(max(plot1_axes), max(plot2_axes))

# Determine which PCs need to be flipped
# We want plot1_axes to always be positive and plot2_axes to always be negative
# For plot1_contents: pc1_low, pc2_high, pc3_low, pc4_low, pc5_low
# pc1_low uses min (negative), so flip it
# pc2_high uses max (positive), so don't flip
# pc3_low uses min (negative), so flip it
# pc4_low uses min (negative), so flip it
# pc5_low uses min (negative), so flip it
flip_sign = [True, False, True, True, True]  # Which PCs to flip

# Add traces for each model
for idx, model in enumerate(models):
    # Plot 1 - left subplot (behavior scores)
    values1 = [pc_stats_df.loc[model, col] for col in plot1_contents]
    values1.append(values1[0])  # Close the pentagon
    
    fig.add_trace(
        go.Scatterpolar(
            r=values1,
            theta=plot1_labels + [plot1_labels[0]],
            name=model,
            line=dict(color=colors[idx]),
            showlegend=True
        ),
        row=1, col=1
    )
    
    # Plot 2 - right subplot (projected activations with bidirectional scaling)
    # Extract PC values from projected tensor (indices 0-4 for PC1-PC5)
    projected_vals = projected[model]['mean_activation_projected'][:5].cpu().numpy()
    
    # Flip signs where needed so plot1 is always positive direction
    projected_vals_flipped = [projected_vals[i] * (-1 if flip_sign[i] else 1) for i in range(5)]
    
    # When we flip, we also need to swap which axis we use for normalization
    # After flipping: positive should use plot1_axes, negative should use plot2_axes
    values2 = []
    for i in range(5):
        val = projected_vals_flipped[i]
        
        # After flipping, the axes are swapped for flipped PCs
        if flip_sign[i]:
            # Flipped: positive uses plot2_axes (which was the min), negative uses plot1_axes (which was the max)
            # But we want positive to use plot1 and negative to use plot2 for display
            # So we need to swap the axes we reference
            pos_axis = plot2_axes[i]  # What was the min is now positive
            neg_axis = plot1_axes[i]  # What was the max is now negative
        else:
            # Not flipped: use axes as-is
            pos_axis = plot1_axes[i]
            neg_axis = plot2_axes[i]
        
        if val >= 0:
            # Scale positive values
            normalized = (val / pos_axis) * max_range if pos_axis != 0 else 0
        else:
            # Scale negative values
            normalized = (val / neg_axis) * max_range if neg_axis != 0 else 0
        values2.append(normalized)
    
    values2.append(values2[0])  # Close the pentagon
    
    fig.add_trace(
        go.Scatterpolar(
            r=values2,
            theta=plot1_labels + [plot1_labels[0]],
            name=model,
            line=dict(color=colors[idx]),
            showlegend=False  # Only show legend once
        ),
        row=1, col=2
    )

# Update left subplot to 0-100 range
fig.update_polars(
    radialaxis=dict(
        range=[0, 100],
        showticklabels=False
    ),
    row=1, col=1
)

# Update right subplot to bidirectional range
fig.update_polars(
    radialaxis=dict(
        range=[-max_range, max_range],
        showticklabels=False
    ),
    row=1, col=2
)

# Update layout
fig.update_layout(
    title={
        'text': "Default Behavior Expressing Traits Corresponding with Top 5 Trait PCs",
        'subtitle': {
        'text': 'Llama 3.3 70B Trait PCs',
    }
    },
    
    height=500,
    width=900,
    showlegend=True
)

fig.show()


In [66]:
# Original plot with just mean behavior scores for plot1
import plotly.graph_objects as go

# Create a single polar plot
fig = go.Figure()

# Define colors for each model
colors = ['rgb(31, 119, 180)', 'rgb(255, 127, 14)', 'rgb(44, 160, 44)']

# Add traces for each model
for idx, model in enumerate(models):
    values1 = [pc_stats_df.loc[model, col] for col in plot1_contents]
    values1.append(values1[0])  # Close the pentagon
    
    fig.add_trace(
        go.Scatterpolar(
            r=values1,
            theta=plot1_labels + [plot1_labels[0]],
            name=model,
            line=dict(color=colors[idx])
        )
    )

# Update polar axes
fig.update_polars(
    radialaxis=dict(
        range=[0, 100],
        showticklabels=False
    )
)

# Update layout
fig.update_layout(
    title={
        'text': "Default Behavior Expressing Traits from Top 5 Trait PCs",
        'subtitle': {
        'text': 'Traits Selected from Llama 3.3 70B\'s PCs',
    }},
    height=500,
    width=600,
    showlegend=True
)

fig.show()
