# Analyze scores for each trait

In [46]:
import json
import os
import torch
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.subplots as sp
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity


## Score statistics

In [3]:
# Read every per-trait score file from data/extract_scores
score_dir = "/root/git/persona-subspace/traits/data/extract_scores"

# Map trait name (file name without ".json") -> parsed JSON content
scores = {}
json_names = [name for name in os.listdir(score_dir) if name.endswith(".json")]
for json_name in json_names:
    trait_name = json_name.replace(".json", "")
    with open(os.path.join(score_dir, json_name), "r") as handle:
        scores[trait_name] = json.load(handle)

print(f"Found {len(scores.keys())} traits with scores")


Found 240 traits with scores


In [4]:
# Analyze refusals and clean data
#
# For every trait, record which score keys hold the "REFUSAL" marker and build
# a numeric copy of the scores. Refusals are stored as NaN (not 0) so that they
# are excluded by the NaN-aware statistics computed later (np.nanmean and the
# explicit isnan checks) instead of being counted as real zero scores.
refusal_info = {}
scores_clean = {}

for trait, score_obj in scores.items():
    # Keys whose value is the refusal marker, in file order
    refusals = [key for key, value in score_obj.items() if value == "REFUSAL"]

    # Numeric scores; refusals become NaN, everything else is coerced to float
    cleaned_scores = {
        key: (np.nan if value == "REFUSAL" else float(value))
        for key, value in score_obj.items()
    }

    scores_clean[trait] = cleaned_scores
    refusal_info[trait] = {
        "refusals": refusals,
        "refusal_count": len(refusals)
    }

# Show refusal statistics
total_refusals = sum(info["refusal_count"] for info in refusal_info.values())
traits_with_refusals = sum(1 for info in refusal_info.values() if info["refusal_count"] > 0)

print(f"Refusal Statistics:")
print(f"Total refusals across all traits: {total_refusals}")
print(f"Traits with refusals: {traits_with_refusals}")

if total_refusals > 0:
    sorted_refusals = sorted(refusal_info.items(), key=lambda x: x[1]["refusal_count"], reverse=True)
    print(f"\nTop 10 traits with most refusals:")
    for trait, info in sorted_refusals[:10]:
        if info["refusal_count"] > 0:
            print(f"  {trait}: {info['refusal_count']} refusals - {info['refusals']}")

Refusal Statistics:
Total refusals across all traits: 4
Traits with refusals: 4

Top 10 traits with most refusals:
  animated: 1 refusals - ['neg_p1_q7']
  diplomatic: 1 refusals - ['neg_p2_q19']
  generalist: 1 refusals - ['neg_p0_q4']
  utilitarian: 1 refusals - ['default_p2_q13']


In [5]:
# Build one (3, 5, 20) array per trait from the cleaned scores.
# Axis 0: score type (0 = pos, 1 = neg, 2 = default)
# Axis 1: prompt index, axis 2: question index
# Any "<type>_p<prompt>_q<question>" key that is absent stays NaN.
scores_np = {}

for trait, cleaned_scores in scores_clean.items():
    scores_3d = np.full((3, 5, 20), np.nan)

    for type_idx, type_prefix in enumerate(("pos", "neg", "default")):
        for prompt_idx in range(5):
            for question_idx in range(20):
                score_key = f"{type_prefix}_p{prompt_idx}_q{question_idx}"
                if score_key in cleaned_scores:
                    scores_3d[type_idx, prompt_idx, question_idx] = cleaned_scores[score_key]

    scores_np[trait] = scores_3d

print(f"Created numpy arrays for {len(scores_np)} traits")
print(f"Shape of each array: {next(iter(scores_np.values())).shape}")
example_trait = list(scores_np.keys())[0]
print(f"Example (first trait): {example_trait}")
print(f"Pos scores for first 2 prompts, 5 questions:\n{scores_np[example_trait][0, :2, :5]}")
print(f"Neg scores for first 2 prompts, 5 questions:\n{scores_np[example_trait][1, :2, :5]}")

Created numpy arrays for 240 traits
Shape of each array: (3, 5, 20)
Example (first trait): absolutist
Pos scores for first 2 prompts, 5 questions:
[[ 0. 10. 10. 10.  0.]
 [ 0. 10.  0. 10.  0.]]
Neg scores for first 2 prompts, 5 questions:
[[ 0.  0.  0. 10.  0.]
 [ 0. 10.  0.  0.  0.]]


In [6]:
# Calculate simplified statistics for each trait
#
# For every trait, compare the pos scores (index 0) with the neg scores
# (index 1) of its score array, pairing entries that share the same prompt and
# question index. Pairs where either side is NaN are ignored.
stats = {}

for trait, scores_3d in scores_np.items():
    pos_scores = scores_3d[0]
    neg_scores = scores_3d[1]
    score_diff = pos_scores - neg_scores

    # Mean difference between pos and neg across all samples (NaN pairs skipped)
    pos_minus_neg_mean = np.nanmean(score_diff)

    # Only pairs where both scores are present take part in the counts
    valid_pairs = ~(np.isnan(pos_scores) | np.isnan(neg_scores))

    # Pairs with a high pos score and a low neg score
    high_pos_low_neg_count = int(
        np.count_nonzero(valid_pairs & (pos_scores > 50) & (neg_scores < 50))
    )

    # Pairs whose absolute difference is large
    large_diff_count = int(
        np.count_nonzero(valid_pairs & (np.abs(score_diff) > 40))
    )

    stats[trait] = {
        "pos_minus_neg_mean": pos_minus_neg_mean,
        "high_pos_low_neg_count": high_pos_low_neg_count,
        "large_diff_count": large_diff_count
    }

# Show example statistics for first trait
example_trait = list(stats.keys())[0]
print(f"Example statistics for '{example_trait}':")
for key, value in stats[example_trait].items():
    if isinstance(value, float):
        print(f"  {key}: {value:.2f}")
    else:
        print(f"  {key}: {value}")

print(f"\nCalculated statistics for {len(stats)} traits")

# Show summary of counts
high_pos_counts = [s["high_pos_low_neg_count"] for s in stats.values()]
large_diff_counts = [s["large_diff_count"] for s in stats.values()]
print(f"\nHigh pos, low neg count distribution: min={min(high_pos_counts)}, max={max(high_pos_counts)}, mean={np.mean(high_pos_counts):.1f}")
print(f"Large diff count distribution: min={min(large_diff_counts)}, max={max(large_diff_counts)}, mean={np.mean(large_diff_counts):.1f}")

# Export to CSV (create the output directory first so a fresh checkout works)
os.makedirs('./results', exist_ok=True)
stats_df = pd.DataFrame.from_dict(stats, orient='index')
stats_df.index.name = 'trait'
stats_df.to_csv('./results/pos_neg.csv')
print(f"\nExported statistics to pos_neg.csv")
print(f"Shape: {stats_df.shape}")

Example statistics for 'absolutist':
  pos_minus_neg_mean: 21.75
  high_pos_low_neg_count: 25
  large_diff_count: 25

Calculated statistics for 240 traits

High pos, low neg count distribution: min=0, max=100, mean=79.6
Large diff count distribution: min=0, max=100, mean=79.4

Exported statistics to pos_neg.csv
Shape: (240, 3)


In [7]:
# Calculate pos - default statistics similar to pos - neg
#
# For every trait, compare the pos scores (index 0) with the default scores
# (index 2) of its score array, pairing entries that share the same prompt and
# question index. Pairs where either side is NaN are ignored.
pos_default_stats = {}

for trait, scores_3d in scores_np.items():
    pos_scores = scores_3d[0]
    default_scores = scores_3d[2]
    score_diff = pos_scores - default_scores

    # Mean difference between pos and default across all samples (NaN pairs skipped)
    pos_minus_default_mean = np.nanmean(score_diff)

    # Only pairs where both scores are present take part in the counts
    valid_pairs = ~(np.isnan(pos_scores) | np.isnan(default_scores))

    # Pairs with a high pos score and a low default score
    high_pos_low_default_count = int(
        np.count_nonzero(valid_pairs & (pos_scores > 50) & (default_scores < 50))
    )

    # Pairs whose absolute difference is large
    large_diff_count = int(
        np.count_nonzero(valid_pairs & (np.abs(score_diff) > 40))
    )

    pos_default_stats[trait] = {
        "pos_minus_default_mean": pos_minus_default_mean,
        "high_pos_low_default_count": high_pos_low_default_count,
        "large_diff_count": large_diff_count
    }

# Show example statistics for first trait
example_trait = list(pos_default_stats.keys())[0]
print(f"Example pos-default statistics for '{example_trait}':")
for key, value in pos_default_stats[example_trait].items():
    if isinstance(value, float):
        print(f"  {key}: {value:.2f}")
    else:
        print(f"  {key}: {value}")

print(f"\nCalculated pos-default statistics for {len(pos_default_stats)} traits")

# Show summary of counts
high_pos_counts = [s["high_pos_low_default_count"] for s in pos_default_stats.values()]
large_diff_counts = [s["large_diff_count"] for s in pos_default_stats.values()]
print(f"\nHigh pos, low default count distribution: min={min(high_pos_counts)}, max={max(high_pos_counts)}, mean={np.mean(high_pos_counts):.1f}")
print(f"Large diff count distribution: min={min(large_diff_counts)}, max={max(large_diff_counts)}, mean={np.mean(large_diff_counts):.1f}")

# Export to CSV (create the output directory first so this cell can run on its own)
os.makedirs('./results', exist_ok=True)
pos_default_df = pd.DataFrame.from_dict(pos_default_stats, orient='index')
pos_default_df.index.name = 'trait'
pos_default_df.to_csv('./results/pos_default.csv')
print(f"\nExported pos-default statistics to pos_default.csv")
print(f"Shape: {pos_default_df.shape}")

Example pos-default statistics for 'absolutist':
  pos_minus_default_mean: 22.75
  high_pos_low_default_count: 25
  large_diff_count: 25

Calculated pos-default statistics for 240 traits

High pos, low default count distribution: min=0, max=100, mean=47.1
Large diff count distribution: min=0, max=100, mean=46.8

Exported pos-default statistics to pos_default.csv
Shape: (240, 3)


## PCA

In [8]:
# check vectors

# load all vectors from data/vectors
vector_dir = "/root/git/persona-subspace/traits/data/vectors"

# iterate through each .pt file in the directory
# NOTE(review): torch.load unpickles the file contents; only load .pt files
# that come from a trusted source.
vectors = {}
for file in os.listdir(vector_dir):
    if file.endswith(".pt"):
        # map_location="cpu" lets files saved from a GPU session load on any
        # machine; the PCA cells hand these tensors to scikit-learn, which
        # needs them on the CPU anyway.
        vectors[file.replace(".pt", "")] = torch.load(
            os.path.join(vector_dir, file), map_location="cpu"
        )

print(f"Found {len(vectors.keys())} traits with vectors")

Found 240 traits with vectors


In [10]:
# Sanity check: show the shape of one trait's 'pos_neg' entry.
# NOTE(review): axis 1 is later indexed with `layer`, so the first axis here is
# presumably the layer axis and the second the hidden size — confirm.
vectors['zealous']['pos_neg'].shape

torch.Size([46, 4608])

In [39]:
# Layer index used by the PCA cells below
layer = 34

# One list per vector kind, each holding that entry for every trait
# (same iteration order as `vectors`)
pos_neg, pos_neg_50, pos_default, pos_default_50 = (
    [trait_vectors[kind] for trait_vectors in vectors.values()]
    for kind in ('pos_neg', 'pos_neg_50', 'pos_default', 'pos_default_50')
)

print(len(pos_neg))


240


In [32]:
def compute_pca(activation_list, layer):
    """Standardize one layer's activations, fit a full PCA and print a summary.

    Parameters:
    - activation_list: array-like indexed as ``activation_list[:, layer, :]``;
      the slice is passed to ``StandardScaler.fit_transform``, so it must be
      convertible to a 2-D array (rows = samples, columns = features).
    - layer: index selected along axis 1 of ``activation_list``.

    Returns:
    - pca_transformed: the standardized slice projected onto all fitted components
    - variance_explained: ``explained_variance_ratio_`` of the fitted PCA
    - n_components: number of fitted components

    Side effects: prints the component count, the cumulative variance of the
    first five components, an elbow estimate and the number of components
    needed to reach 70/80/90/95% cumulative variance.
    """
    layer_activations = activation_list[:, layer, :]

    # Standardize every feature before PCA
    scaler = StandardScaler()
    scaled_layer_activations = scaler.fit_transform(layer_activations)

    pca = PCA()
    pca_transformed = pca.fit_transform(scaled_layer_activations)

    variance_explained = pca.explained_variance_ratio_
    cumulative_variance = np.cumsum(variance_explained)
    n_components = len(variance_explained)

    print(f"PCA fitted with {n_components} components")
    print(f"Cumulative variance for first 5 components: {cumulative_variance[:5]}")

    # Find elbow using second derivative method
    def find_elbow_point(variance_explained):
        """Return the 0-based elbow index, or None with fewer than 3 components."""
        if len(variance_explained) < 3:
            # The second difference is empty, so there is no curvature to rank
            return None

        # Calculate first and second derivatives
        first_diff = np.diff(variance_explained)
        second_diff = np.diff(first_diff)

        # Find point with maximum second derivative (most curvature)
        return np.argmax(np.abs(second_diff)) + 1  # +1 to account for diff operations

    def dims_for_variance(threshold):
        """Return the smallest component count whose cumulative variance reaches threshold."""
        reached = np.flatnonzero(cumulative_variance >= threshold)
        # If rounding keeps the cumulative sum just below the threshold, all
        # components are needed (a bare argmax on the mask would report 1).
        return int(reached[0]) + 1 if reached.size else n_components

    elbow_point = find_elbow_point(variance_explained)
    dims_70_pca = dims_for_variance(0.70)
    dims_80_pca = dims_for_variance(0.80)
    dims_90_pca = dims_for_variance(0.90)
    dims_95_pca = dims_for_variance(0.95)

    print("\nPCA Analysis Results:")
    # elbow_point is 0-based; report it as a 1-based component number
    elbow_text = elbow_point + 1 if elbow_point is not None else "n/a"
    print(f"Elbow point at component: {elbow_text}")
    print(f"Dimensions for 70% variance: {dims_70_pca}")
    print(f"Dimensions for 80% variance: {dims_80_pca}")
    print(f"Dimensions for 90% variance: {dims_90_pca}")
    print(f"Dimensions for 95% variance: {dims_95_pca}")

    return pca_transformed, variance_explained, n_components 

In [87]:
def _add_leader_label(fig, x_pos, y_label, label, color):
    """Draw a vertical leader line from the point row (y=1.0) to y_label in the
    top panel and place a boxed text label at the end of that line."""
    # Leader line as a separate trace
    fig.add_trace(
        go.Scatter(
            x=[x_pos, x_pos],
            y=[1.0, y_label],
            mode='lines',
            line=dict(color=color, width=1),
            showlegend=False,
            hoverinfo='skip'
        ),
        row=1, col=1
    )

    # Label at the end of the line
    fig.add_annotation(
        x=x_pos,
        y=y_label,
        text=label,
        showarrow=False,
        font=dict(size=10, color=color),
        bgcolor="rgba(255, 255, 255, 0.9)",
        bordercolor=color,
        borderwidth=1,
        row=1, col=1
    )


def plot_pca_cosine_similarity(pca_transformed, trait_labels, pc_component=0, 
                             layer=None, reference_point=None, color_threshold=0.0):
    """
    Create a plot similar to the PC1 Cosine Similarity visualization.

    Top panel: one marker per trait placed along x at its similarity value
    (all markers sit at y=1). Labels appear on hover for every point; the
    10 lowest and 10 highest traits (20 in total) also get a visible label
    with a leader line at staggered heights to avoid overlap. With fewer
    than 20 points, each end gets half of the points so no trait is labelled
    twice. Bottom panel: histogram of the same values.

    Parameters:
    - pca_transformed: PCA-transformed data (n_samples, n_components)
    - trait_labels: List of labels for each data point (one per row)
    - pc_component: Which PC component to use (0-indexed, so PC1 = 0)
    - layer: Layer number for the subtitle (omitted from the subtitle if None)
    - reference_point: Reference point for cosine similarity calculation.
                      If None, the plotted value is the selected PC column
                      divided by the L2 norm of that column, i.e. a rescaled
                      PC score rather than a cosine between vectors.
    - color_threshold: Values below this are drawn red, others blue (default: 0.0)

    Returns:
    - Plotly figure object

    Raises:
    - ValueError: if the number of labels differs from the number of rows
    """
    if len(pca_transformed) != len(trait_labels):
        # zip() below would otherwise silently drop or mislabel points
        raise ValueError(
            f"Got {len(trait_labels)} labels for {len(pca_transformed)} data points"
        )

    # Extract the specified PC component
    pc_values = pca_transformed[:, pc_component]

    # Calculate the plotted similarity values
    if reference_point is None:
        # Rescale the PC scores by the norm of the whole column
        pc_norm = np.linalg.norm(pc_values)
        if pc_norm > 0:
            cosine_sims = pc_values / pc_norm
        else:
            # All-zero component: avoid dividing by zero
            cosine_sims = np.zeros_like(pc_values, dtype=float)
    else:
        # Calculate cosine similarity with a specific reference point
        cosine_sims = cosine_similarity(pca_transformed, reference_point.reshape(1, -1)).flatten()

    # Create colors based on threshold
    colors = ['red' if sim < color_threshold else 'blue' for sim in cosine_sims]

    # Identify extreme traits (up to 10 lowest and 10 highest, never overlapping)
    sorted_indices = np.argsort(cosine_sims)
    n_points = len(sorted_indices)
    n_extreme = min(10, n_points // 2)
    low_extreme_indices = sorted_indices[:n_extreme]
    high_extreme_indices = sorted_indices[n_points - n_extreme:]
    extreme_indices = set(list(low_extreme_indices) + list(high_extreme_indices))

    # Create subplot figure
    fig = sp.make_subplots(
        rows=2, cols=1,
        row_heights=[0.6, 0.4],
        vertical_spacing=0.1,
        subplot_titles=[
            f'PC{pc_component+1} Cosine Similarity',
            'Trait Frequency Distribution'
        ]
    )

    def add_points(point_indices, opacity):
        """Add one marker trace (hover labels only) for the given point indices."""
        if not point_indices:
            return
        fig.add_trace(
            go.Scatter(
                x=[cosine_sims[i] for i in point_indices],
                y=[1] * len(point_indices),
                mode='markers',
                marker=dict(
                    color=[colors[i] for i in point_indices],
                    size=8,
                    opacity=opacity
                ),
                text=[trait_labels[i] for i in point_indices],
                showlegend=False,
                hovertemplate='<b>%{text}</b><br>Cosine Similarity: %{x:.3f}<extra></extra>'
            ),
            row=1, col=1
        )

    # Split points into regular and extreme for different display modes
    regular_points = [i for i in range(n_points) if i not in extreme_indices]
    extreme_points = [i for i in range(n_points) if i in extreme_indices]

    # Regular points first, then the slightly more opaque extreme points
    add_points(regular_points, opacity=0.7)
    add_points(extreme_points, opacity=0.9)

    if extreme_points:
        # Predefined label heights; alternating above/below the point row
        high_positions = [1.6, 1.45, 1.55, 1.35, 1.5, 1.4, 1.65, 1.3, 1.58, 1.42]
        low_positions = [0.4, 0.55, 0.45, 0.65, 0.5, 0.6, 0.35, 0.7, 0.42, 0.58]
        all_y_positions = [
            y_pos for pair in zip(high_positions, low_positions) for y_pos in pair
        ]

        # Low extremes use the first 10 heights, high extremes the last 10
        for offset, group in ((0, low_extreme_indices), (10, high_extreme_indices)):
            for i, idx in enumerate(group):
                _add_leader_label(
                    fig,
                    x_pos=cosine_sims[idx],
                    y_label=all_y_positions[i + offset],
                    label=trait_labels[idx],
                    color=colors[idx],
                )

    # Add vertical line at x=0 for both panels
    for panel_row in (1, 2):
        fig.add_vline(
            x=0,
            line_dash="solid",
            line_color="gray",
            line_width=1,
            opacity=0.7,
            row=panel_row, col=1
        )

    # Bottom panel: Histogram
    fig.add_trace(
        go.Histogram(
            x=cosine_sims,
            nbinsx=30,
            opacity=0.7,
            marker_color='steelblue',
            showlegend=False
        ),
        row=2, col=1
    )

    # Only mention the layer when one was given (avoids "Layer None")
    subtitle_text = "Gemma 2 27B" if layer is None else f"Gemma 2 27B, Layer {layer}"

    # Update layout
    fig.update_layout(
        height=700,
        title=dict(
            text="PCA on Trait Vectors from Mean Response Activations",
            subtitle={
                "text": subtitle_text,
            },
            x=0.5,
            font=dict(size=16)
        ),
        showlegend=False
    )

    # Calculate symmetric range around 0 (not around data center)
    max_abs_value = max(abs(min(cosine_sims)), abs(max(cosine_sims)))
    x_half_width = max_abs_value * 1.1  # Add 10% padding
    if x_half_width == 0:
        # All values are zero: fall back to a non-degenerate axis range
        x_half_width = 1.0

    # Update x-axes with symmetric ranges centered on 0
    fig.update_xaxes(
        row=1, col=1,
        range=[-x_half_width, x_half_width]
    )

    fig.update_xaxes(
        title_text=f"PC{pc_component+1} Cosine Similarity",
        row=2, col=1,
        range=[-x_half_width, x_half_width]
    )

    # Update y-axes
    fig.update_yaxes(
        title_text="",
        showticklabels=False,
        row=1, col=1,
        range=[0.25, 1.75]  # Range for varied label heights
    )

    fig.update_yaxes(
        title_text="Frequency",
        row=2, col=1
    )

    return fig

### pos_neg 

In [20]:
# Keep only traits whose mean (pos - neg) score difference is at least 40,
# then stack their 'pos_neg' vectors into one float tensor for PCA

filtered_pos_neg_traits = [
    trait_name for trait_name in vectors
    if stats[trait_name]['pos_minus_neg_mean'] >= 40
]
filtered_pos_neg = [vectors[trait_name]['pos_neg'] for trait_name in filtered_pos_neg_traits]

print(len(filtered_pos_neg_traits))

filtered_pos_neg = torch.stack(filtered_pos_neg).float()
print(filtered_pos_neg.shape)

208
torch.Size([208, 46, 4608])


In [54]:
# Fit PCA on the filtered pos_neg vectors at the chosen layer.
# Note: the same three result names are reassigned by the later PCA cells.
pca_transformed, variance_explained, n_components = compute_pca(filtered_pos_neg, layer)

PCA fitted with 208 components
Cumulative variance for first 5 components: [0.15301124 0.2713136  0.36042659 0.42452626 0.46193193]

PCA Analysis Results:
Elbow point at component: 5
Dimensions for 70% variance: 18
Dimensions for 80% variance: 33
Dimensions for 90% variance: 65
Dimensions for 95% variance: 100


### pos_neg_50

In [30]:
# Keep only traits with at least 10 large-difference pos/neg pairs,
# then stack their 'pos_neg_50' vectors into one float tensor for PCA
filtered_pos_neg_50_traits = [
    trait_name for trait_name in vectors
    if stats[trait_name]['large_diff_count'] >= 10
]
filtered_pos_neg_50 = [vectors[trait_name]['pos_neg_50'] for trait_name in filtered_pos_neg_50_traits]

print(len(filtered_pos_neg_50_traits))

filtered_pos_neg_50 = torch.stack(filtered_pos_neg_50).float()
print(filtered_pos_neg_50.shape)

235
torch.Size([235, 46, 4608])


In [58]:
# Fit PCA on the filtered pos_neg_50 vectors at the chosen layer.
# This reassigns pca_transformed / variance_explained / n_components, which
# the plotting cell below reads together with filtered_pos_neg_50_traits.
pca_transformed, variance_explained, n_components = compute_pca(filtered_pos_neg_50, layer)

PCA fitted with 235 components
Cumulative variance for first 5 components: [0.14948702 0.26322171 0.34780798 0.41020005 0.4467624 ]

PCA Analysis Results:
Elbow point at component: 5
Dimensions for 70% variance: 20
Dimensions for 80% variance: 38
Dimensions for 90% variance: 76
Dimensions for 95% variance: 117


In [98]:
# 0-indexed principal component to plot (9 -> PC10)
component = 9

# Plot the chosen component for the pos_neg_50 traits; points below the
# threshold are coloured differently (adjust color_threshold as needed)
fig = plot_pca_cosine_similarity(
    pca_transformed,
    filtered_pos_neg_50_traits,
    pc_component=component,
    layer=layer,
    color_threshold=0.0,
)

# Show inline and save an interactive copy named after the 1-indexed component
fig.show()
fig.write_html(f"./results/pc{component+1}.html")

### pos_default

In [43]:
# PCA on all of pos_default ... or some other clustering algorithm?

# Build the stacked tensor straight from `vectors` rather than from the
# `pos_default` list: the old `pos_default = torch.stack(pos_default)` fed the
# already-stacked tensor back into torch.stack when the cell was re-run.
# Reading from `vectors` makes the cell idempotent and gives the same tensor.
pos_default = torch.stack(
    [vector['pos_default'] for vector in vectors.values()]
).float()
print(pos_default.shape)

torch.Size([240, 46, 4608])


In [44]:
# Fit PCA on the unfiltered pos_default vectors at the chosen layer.
# This reassigns pca_transformed / variance_explained / n_components, replacing
# the results of the earlier PCA cells.
pca_transformed, variance_explained, n_components = compute_pca(pos_default, layer)

PCA fitted with 240 components
Cumulative variance for first 5 components: [0.14114245 0.23283618 0.29676924 0.34340508 0.3846963 ]

PCA Analysis Results:
Elbow point at component: 2
Dimensions for 70% variance: 24
Dimensions for 80% variance: 42
Dimensions for 90% variance: 76
Dimensions for 95% variance: 111
