# Default behavior


In [None]:
import numpy as np
import json
import os
import torch
import pandas as pd

In [None]:
models = [
    'gemma-2-27b',
    'qwen-3-32b',
]

scores_path = '/workspace/{model}/traits/default_scores'

In [5]:
# Load scores for all traits across different models
data = {}

for model in models:
    model_scores_path = f'/workspace/{model}/traits/default_scores'
    model_data = {}
    
    # Iterate through all trait files
    for file in os.listdir(model_scores_path):
        if file.endswith('.json'):
            trait_name = file.replace('.json', '')
            
            # Load the JSON file
            with open(os.path.join(model_scores_path, file), 'r') as f:
                scores = json.load(f)
            
            # Filter out non-numeric values (refusals, etc.) and calculate mean
            numeric_scores = [v for v in scores.values() if isinstance(v, (int, float))]
            
            if numeric_scores:
                mean_score = np.mean(numeric_scores)
            else:
                # If all values are non-numeric, use NaN
                mean_score = np.nan
            
            model_data[trait_name] = mean_score
    
    data[model] = model_data

# Create DataFrame with traits as rows and models as columns
df = pd.DataFrame(data)
print(f"Loaded scores for {len(df)} traits across {len(df.columns)} models")
print(f"\nDataFrame shape: {df.shape}")
print(f"\nNumber of NaN values per model:")
print(df.isna().sum())
print(f"\nFirst few rows:")
df.head()

Loaded scores for 240 traits across 2 models

DataFrame shape: (240, 2)

Number of NaN values per model:
gemma-2-27b    0
qwen-3-32b     0
dtype: int64

First few rows:


Unnamed: 0,gemma-2-27b,qwen-3-32b
absolutist,2.55,10.5
grandiose,0.25,0.325
relativist,75.1,69.575
holistic,88.775,89.925
goofy,3.8,4.05


In [7]:
# now get the top 5 traits for the top 5 PC traits for one of the models
import torch

pca_results = torch.load('/workspace/llama-3.3-70b/traits_240/pca/layer40_pos-neg50.pt', weights_only=False)
labels = pca_results['traits']['pos_neg_50']

traits_df = pd.DataFrame({
    'label': labels,
    'pc1': pca_results['pca_transformed'][:, 0],
    'pc2': pca_results['pca_transformed'][:, 1],
    'pc3': pca_results['pca_transformed'][:, 2],
    'pc4': pca_results['pca_transformed'][:, 3],
    'pc5': pca_results['pca_transformed'][:, 4],
})

traits_df.head(5)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Unnamed: 0,label,pc1,pc2,pc3,pc4,pc5
0,zealous,21.55899,-46.148135,-16.24403,-39.555186,24.236954
1,wry,53.599807,6.029477,-9.867787,-2.250747,-30.484981
2,witty,8.627841,-65.2779,12.505692,-33.514593,-23.324574
3,whimsical,24.592243,-59.602846,5.875083,12.541718,16.588861
4,visceral,52.724652,-6.302639,33.776209,-18.275349,3.76979


In [10]:
# Calculate pc{i}_low and pc{i}_high for each PC and each model
pc_stats = {}

for model in models:
    model_stats = {}
    
    for i in range(1, 6):  # PC1 through PC5
        pc_col = f'pc{i}'
        
        # Sort by PC value to get top 5 and bottom 5
        sorted_traits = traits_df.sort_values(pc_col)
        
        # Get bottom 5 (low) and top 5 (high) trait names
        bottom_5_traits = sorted_traits.head(5)['label'].tolist()
        print(f"PC{i}")
        print(f"bottom_5_traits: {bottom_5_traits}")
        top_5_traits = sorted_traits.tail(5)['label'].tolist()
        print(f"top_5_traits: {top_5_traits}")
        # Calculate mean scores for these traits from the model's scores
        # Filter out NaN values in case any trait is missing
        bottom_5_scores = [df.loc[trait, model] for trait in bottom_5_traits if trait in df.index]
        top_5_scores = [df.loc[trait, model] for trait in top_5_traits if trait in df.index]
        
        model_stats[f'pc{i}_low'] = np.mean(bottom_5_scores) if bottom_5_scores else np.nan
        model_stats[f'pc{i}_high'] = np.mean(top_5_scores) if top_5_scores else np.nan
    
    pc_stats[model] = model_stats

# Create DataFrame for easy viewing
pc_stats_df = pd.DataFrame(pc_stats).T
print("PC statistics (mean scores for top 5 and bottom 5 traits):")
pc_stats_df

PC1
bottom_5_traits: ['supportive', 'conscientious', 'nurturing', 'resilient', 'earnest']
top_5_traits: ['bitter', 'cruel', 'hostile', 'nonchalant', 'flippant']
PC2
bottom_5_traits: ['animated', 'charismatic', 'metaphorical', 'dramatic', 'poetic']
top_5_traits: ['rationalist', 'detached', 'understated', 'reserved', 'dispassionate']
PC3
bottom_5_traits: ['bombastic', 'ritualistic', 'meticulous', 'solemn', 'formal']
top_5_traits: ['understated', 'accessible', 'naive', 'chill', 'casual']
PC4
bottom_5_traits: ['sassy', 'animated', 'charismatic', 'decisive', 'problem_solving']
top_5_traits: ['serene', 'cryptic', 'understated', 'enigmatic', 'meditative']
PC5
bottom_5_traits: ['exploratory', 'verbose', 'generous', 'circumspect', 'challenging']
top_5_traits: ['decisive', 'reverent', 'concise', 'confident', 'naive']
PC1
bottom_5_traits: ['supportive', 'conscientious', 'nurturing', 'resilient', 'earnest']
top_5_traits: ['bitter', 'cruel', 'hostile', 'nonchalant', 'flippant']
PC2
bottom_5_traits:

Unnamed: 0,pc1_low,pc1_high,pc2_low,pc2_high,pc3_low,pc3_high,pc4_low,pc4_high,pc5_low,pc5_high
gemma-2-27b,88.615,1.11,34.813,71.335,66.48,42.605,42.533,38.785,79.425,32.205
qwen-3-32b,92.382,0.49,39.587,60.178434,74.615,38.085,52.992,38.765,81.25,41.74


In [None]:
# make two pentagon plots, one for each of the axes

plot1_contents = ['pc1_low', 'pc2_high', 'pc3_low', 'pc4_low', 'pc5_low']
plot2_contents = ['pc1_high', 'pc2_low', 'pc3_high', 'pc4_high', 'pc5_high']
plot1_labels = ['Warmth', 'Analytical', 'Formal', 'Dynamic', 'Exploratory']
plot2_labels = ['Hostility', 'Expressive', 'Casual', 'Contemplative', 'Decisive']

# Create a figure with two subplots
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create subplot structure with two polar plots
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{'type': 'polar'}] * 2],
    subplot_titles=('Plot 1', 'Plot 2')
)

# Define colors for each model
colors = ['rgb(31, 119, 180)', 'rgb(255, 127, 14)', 'rgb(44, 160, 44)']

# Add traces for each model
for idx, model in enumerate(models):
    # Plot 1 - left subplot
    values1 = [pc_stats_df.loc[model, col] for col in plot1_contents]
    values1.append(values1[0])  # Close the pentagon
    
    fig.add_trace(
        go.Scatterpolar(
            r=values1,
            theta=plot1_labels + [plot1_labels[0]],
            name=model,
            line=dict(color=colors[idx]),
            showlegend=True
        ),
        row=1, col=1
    )
    
    # Plot 2 - right subplot
    values2 = [pc_stats_df.loc[model, col] for col in plot2_contents]
    values2.append(values2[0])  # Close the pentagon
    
    fig.add_trace(
        go.Scatterpolar(
            r=values2,
            theta=plot2_labels + [plot2_labels[0]],
            name=model,
            line=dict(color=colors[idx]),
            showlegend=False  # Only show legend once
        ),
        row=1, col=2
    )

# Update polar axes to have same range and no ticks
fig.update_polars(
    radialaxis=dict(
        range=[0, 100],
        showticklabels=False
    )
)

# Update layout
fig.update_layout(
    title="Model Behavior Comparison on PC Dimensions",
    height=500,
    showlegend=True
)

fig.show()
