# Analyze scores for each trait

In [1]:
import json
import os
import torch
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.subplots as sp
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from pathlib import Path
from tqdm import tqdm


## Score statistics

In [2]:
# Read every per-trait score file stored under the extract_scores directory
score_dir = "/workspace/traits/extract_scores"

# scores maps a trait name (the file name with ".json" removed) to the parsed JSON content
scores = {}
json_files = [name for name in os.listdir(score_dir) if name.endswith(".json")]
for file in json_files:
    trait_name = file.replace(".json", "")
    with open(os.path.join(score_dir, file), "r") as f:
        scores[trait_name] = json.load(f)

print(f"Found {len(scores.keys())} traits with scores")


Found 240 traits with scores


In [3]:
# Analyze refusals and clean data.
#
# Every score is converted to a float. A "REFUSAL" entry is recorded per trait and
# replaced with NaN (not 0) so that the NaN-aware statistics computed later
# (np.nanmean, np.isnan checks) skip it instead of treating it as a genuine score of 0.
refusal_info = {}
scores_clean = {}

for trait, score_obj in scores.items():
    # Keys (e.g. "neg_p2_q135") whose value is the literal refusal marker
    refusals = [key for key, value in score_obj.items() if value == "REFUSAL"]

    scores_clean[trait] = {
        key: (np.nan if value == "REFUSAL" else float(value))  # Ensure numeric
        for key, value in score_obj.items()
    }
    refusal_info[trait] = {
        "refusals": refusals,
        "refusal_count": len(refusals)
    }

# Show refusal statistics
total_refusals = sum(info["refusal_count"] for info in refusal_info.values())
traits_with_refusals = sum(1 for info in refusal_info.values() if info["refusal_count"] > 0)

print(f"Refusal Statistics:")
print(f"Total refusals across all traits: {total_refusals}")
print(f"Traits with refusals: {traits_with_refusals}")

if total_refusals > 0:
    # Most-refused traits first; traits without refusals are never printed
    sorted_refusals = sorted(refusal_info.items(), key=lambda x: x[1]["refusal_count"], reverse=True)
    print(f"\nTop 10 traits with most refusals:")
    for trait, info in sorted_refusals[:10]:
        if info["refusal_count"] > 0:
            print(f"  {trait}: {info['refusal_count']} refusals - {info['refusals']}")

Refusal Statistics:
Total refusals across all traits: 17
Traits with refusals: 6

Top 10 traits with most refusals:
  diplomatic: 7 refusals - ['neg_p2_q135', 'neg_p2_q141', 'neg_p2_q161', 'neg_p2_q163', 'neg_p3_q75', 'neg_p3_q97', 'neg_p3_q161']
  elitist: 4 refusals - ['pos_p0_q165', 'pos_p0_q189', 'pos_p1_q26', 'pos_p1_q213']
  evil: 2 refusals - ['pos_p2_q58', 'pos_p4_q153']
  generalist: 2 refusals - ['neg_p2_q145', 'neg_p3_q152']
  egalitarian: 1 refusals - ['neg_p2_q238']
  militant: 1 refusals - ['pos_p0_q101']


In [5]:
# Turn each trait's cleaned score dict into a dense array.
# Layout: axis 0 = score type (0: pos, 1: neg, 2: default),
#         axis 1 = prompt index (5), axis 2 = question index (240).
# Entries whose key is absent from the dict stay NaN.
scores_np = {}

for trait, cleaned_scores in scores_clean.items():
    scores_3d = np.full((3, 5, 240), np.nan)

    # Keys follow the pattern "<type>_p<prompt>_q<question>"
    for type_idx, type_name in enumerate(("pos", "neg", "default")):
        for prompt_idx in range(5):
            for question_idx in range(240):
                score_key = f"{type_name}_p{prompt_idx}_q{question_idx}"
                if score_key in cleaned_scores:
                    scores_3d[type_idx, prompt_idx, question_idx] = cleaned_scores[score_key]

    scores_np[trait] = scores_3d

print(f"Created numpy arrays for {len(scores_np)} traits")
print(f"Shape of each array: {next(iter(scores_np.values())).shape}")
print(f"Example (first trait): {list(scores_np.keys())[0]}")
example_trait = next(iter(scores_np))
example_array = scores_np[example_trait]
print(f"Pos scores for first 2 prompts, 5 questions:\n{example_array[0, :2, :5]}")
print(f"Neg scores for first 2 prompts, 5 questions:\n{example_array[1, :2, :5]}")

Created numpy arrays for 240 traits
Shape of each array: (3, 5, 240)
Example (first trait): rationalist
Pos scores for first 2 prompts, 5 questions:
[[ 95.  95.  95.  90. 100.]
 [ 90.  95.  90.  80.  90.]]
Neg scores for first 2 prompts, 5 questions:
[[20. 20. 80. 10.  0.]
 [40. 30. 40. 40. 85.]]


In [6]:
# Calculate simplified pos-vs-neg statistics for each trait
stats = {}

for trait, scores_3d in scores_np.items():
    pos_scores = scores_3d[0]  # shape: (prompts, questions) = (5, 240)
    neg_scores = scores_3d[1]  # same shape as pos_scores

    # Mean difference between pos and neg across all samples (NaN entries are ignored)
    pos_minus_neg_mean = np.nanmean(pos_scores - neg_scores)

    # Only compare pos/neg pairs (same prompt and question index) where both scores exist
    both_present = ~(np.isnan(pos_scores) | np.isnan(neg_scores))
    pos_paired = pos_scores[both_present]
    neg_paired = neg_scores[both_present]

    # Pairs where pos is above 50 and neg is below 50
    high_pos_low_neg_count = int(np.count_nonzero((pos_paired > 50) & (neg_paired < 50)))

    # Pairs whose absolute difference exceeds 40
    large_diff_count = int(np.count_nonzero(np.abs(pos_paired - neg_paired) > 40))

    stats[trait] = {
        "pos_minus_neg_mean": pos_minus_neg_mean,
        "high_pos_low_neg_count": high_pos_low_neg_count,
        "large_diff_count": large_diff_count
    }

# Show example statistics for first trait
example_trait = list(stats.keys())[0]
print(f"Example statistics for '{example_trait}':")
for key, value in stats[example_trait].items():
    if isinstance(value, float):
        print(f"  {key}: {value:.2f}")
    else:
        print(f"  {key}: {value}")

print(f"\nCalculated statistics for {len(stats)} traits")

# Show summary of counts
high_pos_counts = [s["high_pos_low_neg_count"] for s in stats.values()]
large_diff_counts = [s["large_diff_count"] for s in stats.values()]
print(f"\nHigh pos, low neg count distribution: min={min(high_pos_counts)}, max={max(high_pos_counts)}, mean={np.mean(high_pos_counts):.1f}")
print(f"Large diff count distribution: min={min(large_diff_counts)}, max={max(large_diff_counts)}, mean={np.mean(large_diff_counts):.1f}")

# Export to CSV (create the output directory first so a fresh checkout does not fail here)
os.makedirs('./results/pca_240', exist_ok=True)
stats_df = pd.DataFrame.from_dict(stats, orient='index')
stats_df.index.name = 'trait'
stats_df.to_csv('./results/pca_240/pos_neg.csv')
print(f"\nExported statistics to pos_neg.csv")
print(f"Shape: {stats_df.shape}")

Example statistics for 'rationalist':
  pos_minus_neg_mean: 38.94
  high_pos_low_neg_count: 543
  large_diff_count: 537

Calculated statistics for 240 traits

High pos, low neg count distribution: min=1, max=1200, mean=984.8
Large diff count distribution: min=1, max=1200, mean=983.5

Exported statistics to pos_neg.csv
Shape: (240, 3)


In [7]:
# Calculate pos-vs-default statistics, mirroring the pos-vs-neg statistics
pos_default_stats = {}

for trait, scores_3d in scores_np.items():
    pos_scores = scores_3d[0]      # shape: (prompts, questions) = (5, 240)
    default_scores = scores_3d[2]  # same shape as pos_scores

    # Mean difference between pos and default across all samples (NaN entries are ignored)
    pos_minus_default_mean = np.nanmean(pos_scores - default_scores)

    # Only compare pos/default pairs (same prompt and question index) where both scores exist
    both_present = ~(np.isnan(pos_scores) | np.isnan(default_scores))
    pos_paired = pos_scores[both_present]
    default_paired = default_scores[both_present]

    # Pairs where pos is above 50 and default is below 50
    high_pos_low_default_count = int(np.count_nonzero((pos_paired > 50) & (default_paired < 50)))

    # Pairs whose absolute difference exceeds 40
    large_diff_count = int(np.count_nonzero(np.abs(pos_paired - default_paired) > 40))

    pos_default_stats[trait] = {
        "pos_minus_default_mean": pos_minus_default_mean,
        "high_pos_low_default_count": high_pos_low_default_count,
        "large_diff_count": large_diff_count
    }

# Show example statistics for first trait
example_trait = list(pos_default_stats.keys())[0]
print(f"Example pos-default statistics for '{example_trait}':")
for key, value in pos_default_stats[example_trait].items():
    if isinstance(value, float):
        print(f"  {key}: {value:.2f}")
    else:
        print(f"  {key}: {value}")

print(f"\nCalculated pos-default statistics for {len(pos_default_stats)} traits")

# Show summary of counts
high_pos_counts = [s["high_pos_low_default_count"] for s in pos_default_stats.values()]
large_diff_counts = [s["large_diff_count"] for s in pos_default_stats.values()]
print(f"\nHigh pos, low default count distribution: min={min(high_pos_counts)}, max={max(high_pos_counts)}, mean={np.mean(high_pos_counts):.1f}")
print(f"Large diff count distribution: min={min(large_diff_counts)}, max={max(large_diff_counts)}, mean={np.mean(large_diff_counts):.1f}")

# Export to CSV (create the output directory first so a fresh checkout does not fail here)
os.makedirs('./results/pca_240', exist_ok=True)
pos_default_df = pd.DataFrame.from_dict(pos_default_stats, orient='index')
pos_default_df.index.name = 'trait'
pos_default_df.to_csv('./results/pca_240/pos_default.csv')
print(f"\nExported pos-default statistics to pos_default.csv")
print(f"Shape: {pos_default_df.shape}")

Example pos-default statistics for 'rationalist':
  pos_minus_default_mean: 11.95
  high_pos_low_default_count: 74
  large_diff_count: 74

Calculated pos-default statistics for 240 traits

High pos, low default count distribution: min=0, max=1200, mean=699.5
Large diff count distribution: min=0, max=1200, mean=694.8

Exported pos-default statistics to pos_default.csv
Shape: (240, 3)


## PCA

In [8]:
# check vectors

# Load every per-trait vector file (.pt) from the vectors directory
vector_dir = "/workspace/traits/vectors"

# vectors maps a trait name (the file name with ".pt" removed) to the loaded object.
# map_location='cpu' matches the other torch.load calls in this notebook and keeps the
# tensors usable by the NumPy / scikit-learn code below even if they were saved from a GPU.
vectors = {}
for file in os.listdir(vector_dir):
    if file.endswith(".pt"):
        vectors[file.replace(".pt", "")] = torch.load(os.path.join(vector_dir, file), map_location='cpu')

print(f"Found {len(vectors.keys())} traits with vectors")

Found 240 traits with vectors


In [9]:
# Sanity check: display the shape of one trait's 'pos_neg' vector
vectors['zealous']['pos_neg'].shape

torch.Size([46, 4608])

In [10]:
# Layer index used by the PCA and plotting cells below
layer = 34

# One list per vector variant, each following the iteration order of `vectors`
pos_neg, pos_neg_50, pos_default, pos_default_50 = (
    [vector[variant] for vector in vectors.values()]
    for variant in ('pos_neg', 'pos_neg_50', 'pos_default', 'pos_default_50')
)

print(len(pos_neg))


240


In [11]:
def compute_pca(activation_list, layer):
    """
    Standardize the vectors of one layer and fit a PCA with all components on them.

    Parameters:
    - activation_list: array-like indexed as [sample, layer, feature]
    - layer: index selected along the second axis

    Prints the number of components, the cumulative variance of the first five
    components, an elbow estimate, and how many components are needed to reach
    70/80/90/95% cumulative explained variance.

    Returns:
    - (pca_transformed, variance_explained, n_components, pca, scaler), where
      pca and scaler are the fitted PCA and StandardScaler objects
    """
    layer_activations = activation_list[:, layer, :]

    # Standardize every feature, then fit PCA on the standardized data
    scaler = StandardScaler()
    pca = PCA()
    pca_transformed = pca.fit_transform(scaler.fit_transform(layer_activations))

    variance_explained = pca.explained_variance_ratio_
    cumulative_variance = np.cumsum(variance_explained)
    n_components = len(variance_explained)

    print(f"PCA fitted with {n_components} components")
    print(f"Cumulative variance for first 5 components: {cumulative_variance[:5]}")

    # Elbow estimate: position of the largest absolute second difference of the
    # explained-variance curve; +1 re-aligns the index after the two diff operations
    curvature = np.abs(np.diff(variance_explained, n=2))
    elbow_point = np.argmax(curvature) + 1

    # For each threshold: 1-based count of components at which the cumulative variance first reaches it
    dims_needed = [
        (percent, np.argmax(cumulative_variance >= threshold) + 1)
        for percent, threshold in ((70, 0.70), (80, 0.80), (90, 0.90), (95, 0.95))
    ]

    print("\nPCA Analysis Results:")
    print(f"Elbow point at component: {elbow_point + 1}")
    for percent, dims in dims_needed:
        print(f"Dimensions for {percent}% variance: {dims}")

    return pca_transformed, variance_explained, n_components, pca, scaler 

In [12]:
def plot_pca_cosine_similarity(pca_transformed, trait_labels, pc_component=0, 
                             layer=None, reference_point=None, color_threshold=0.0):
    """
    Plot one principal component as a labelled strip chart with a histogram below it.

    Every trait is drawn as a marker at y=1 whose x position is its similarity
    value (see below). All markers show their label on hover; the 10 lowest and
    the 10 highest traits additionally get a visible label connected to the
    marker by a vertical leader line, at staggered heights to reduce overlap.
    The bottom panel is a histogram of the same x values.
    
    Parameters:
    - pca_transformed: PCA-transformed data (n_samples, n_components)
    - trait_labels: List of labels for each data point
    - pc_component: Which PC component to use (0-indexed, so PC1 = 0)
    - layer: Layer number for title
    - reference_point: Reference point for cosine similarity calculation.
                      If None, the x values are the selected PC's values divided
                      by the norm of that column.
    - color_threshold: Points below this value are red, all others blue (default: 0.0)
    
    Returns:
    - Plotly figure object
    """
    
    # Extract the specified PC component
    pc_values = pca_transformed[:, pc_component]
    
    # Calculate cosine similarities
    if reference_point is None:
        # Use the PC component direction as reference: normalized PC values
        cosine_sims = pc_values / np.linalg.norm(pc_values)
    else:
        # Calculate cosine similarity with a specific reference point
        cosine_sims = cosine_similarity(pca_transformed, reference_point.reshape(1, -1)).flatten()
    
    # Create colors based on threshold
    colors = ['red' if sim < color_threshold else 'blue' for sim in cosine_sims]
    
    # Identify extreme traits (10 lowest and 10 highest). With fewer than 20 points the
    # two slices overlap, so points already in the low group are dropped from the high
    # group to avoid drawing their label and leader line twice.
    sorted_indices = np.argsort(cosine_sims)
    low_extreme_indices = sorted_indices[:10].tolist()
    low_extreme_set = set(low_extreme_indices)
    high_extreme_indices = [idx for idx in sorted_indices[-10:].tolist() if idx not in low_extreme_set]
    extreme_indices = low_extreme_set | set(high_extreme_indices)
    
    # Create subplot figure
    fig = sp.make_subplots(
        rows=2, cols=1,
        row_heights=[0.6, 0.4],
        vertical_spacing=0.1,
        subplot_titles=[
            f'PC{pc_component+1} Cosine Similarity',
            'Trait Frequency Distribution'
        ]
    )
    
    # Split points into regular and extreme for different display modes
    regular_x, regular_y, regular_colors, regular_labels = [], [], [], []
    extreme_x, extreme_y, extreme_colors, extreme_labels = [], [], [], []
    
    for i, (sim, color, label) in enumerate(zip(cosine_sims, colors, trait_labels)):
        if i in extreme_indices:
            extreme_x.append(sim)
            extreme_y.append(1)
            extreme_colors.append(color)
            extreme_labels.append(label)
        else:
            regular_x.append(sim)
            regular_y.append(1)
            regular_colors.append(color)
            regular_labels.append(label)
    
    def _add_leader_label(x_pos, y_label, label, color):
        """Draw a vertical leader line from the marker row (y=1) to y_label and a boxed label at its end."""
        fig.add_trace(
            go.Scatter(
                x=[x_pos, x_pos],
                y=[1.0, y_label],
                mode='lines',
                line=dict(color=color, width=1),
                showlegend=False,
                hoverinfo='skip'
            ),
            row=1, col=1
        )
        fig.add_annotation(
            x=x_pos,
            y=y_label,
            text=label,
            showarrow=False,
            font=dict(size=10, color=color),
            bgcolor="rgba(255, 255, 255, 0.9)",
            bordercolor=color,
            borderwidth=1,
            row=1, col=1
        )
    
    # Add regular points (hover labels only)
    if regular_x:
        fig.add_trace(
            go.Scatter(
                x=regular_x,
                y=regular_y,
                mode='markers',
                marker=dict(
                    color=regular_colors,
                    size=8,
                    opacity=0.7
                ),
                text=regular_labels,
                showlegend=False,
                hovertemplate='<b>%{text}</b><br>Cosine Similarity: %{x:.3f}<extra></extra>'
            ),
            row=1, col=1
        )
    
    # Add extreme points with visible labels and leader lines
    if extreme_x:
        fig.add_trace(
            go.Scatter(
                x=extreme_x,
                y=extreme_y,
                mode='markers',
                marker=dict(
                    color=extreme_colors,
                    size=8,
                    opacity=0.9
                ),
                text=extreme_labels,
                showlegend=False,
                hovertemplate='<b>%{text}</b><br>Cosine Similarity: %{x:.3f}<extra></extra>'
            ),
            row=1, col=1
        )
        
        # Create predefined alternating heights with variation
        # High positions with variation
        high_positions = [1.6, 1.45, 1.55, 1.35, 1.5, 1.4, 1.65, 1.3, 1.58, 1.42]
        # Low positions with variation  
        low_positions = [0.4, 0.55, 0.45, 0.65, 0.5, 0.6, 0.35, 0.7, 0.42, 0.58]
        
        # Alternate high-low pattern: 20 label heights in total
        all_y_positions = []
        for high_y, low_y in zip(high_positions, low_positions):
            all_y_positions.extend([high_y, low_y])
        
        # Low extremes use the first 10 heights, high extremes the last 10
        for i, idx in enumerate(low_extreme_indices):
            _add_leader_label(cosine_sims[idx], all_y_positions[i], trait_labels[idx], colors[idx])
        
        for i, idx in enumerate(high_extreme_indices):
            _add_leader_label(cosine_sims[idx], all_y_positions[i + 10], trait_labels[idx], colors[idx])
    
    # Bottom panel: Histogram.
    # This trace must be added before the x=0 line for row 2: add_vline skips
    # subplots that contain no traces (exclude_empty_subplots defaults to True).
    fig.add_trace(
        go.Histogram(
            x=cosine_sims,
            nbinsx=30,
            opacity=0.7,
            marker_color='steelblue',
            showlegend=False
        ),
        row=2, col=1
    )
    
    # Add vertical line at x=0 for both panels
    for panel_row in (1, 2):
        fig.add_vline(
            x=0,
            line_dash="solid",
            line_color="gray",
            line_width=1,
            opacity=0.7,
            row=panel_row, col=1
        )
    
    # Update layout
    fig.update_layout(
        height=700,
        title=dict(
            text="PCA on Trait Vectors from Mean Response Activations",
            subtitle={
                "text": f"Gemma 2 27B, Layer {layer}",
            },
            x=0.5,
            font=dict(size=16)
        ),
        showlegend=False
    )
    
    # Calculate symmetric range around 0 (not around data center)
    max_abs_value = max(abs(min(cosine_sims)), abs(max(cosine_sims)))
    x_half_width = max_abs_value * 1.1  # Add 10% padding
    
    # Update x-axes with symmetric ranges centered on 0
    fig.update_xaxes(
        row=1, col=1,
        range=[-x_half_width, x_half_width]
    )
    
    fig.update_xaxes(
        title_text=f"PC{pc_component+1} Cosine Similarity",
        row=2, col=1,
        range=[-x_half_width, x_half_width]
    )
    
    # Update y-axes
    fig.update_yaxes(
        title_text="",
        showticklabels=False,
        row=1, col=1,
        range=[0.25, 1.75]  # Range for varied label heights
    )
    
    fig.update_yaxes(
        title_text="Frequency",
        row=2, col=1
    )
    
    return fig

In [13]:
def plot_3d_pca(pca_transformed, variance_explained, trait_labels, layer):
    """
    Show a labelled 3D scatter of the first three principal components and save it as HTML.

    Parameters:
    - pca_transformed: PCA-transformed data; columns 0, 1 and 2 are plotted,
      so at least three components are required
    - variance_explained: explained-variance ratios; the first three are shown
      as percentages in the axis titles
    - trait_labels: one label per data point (shown next to the marker and on hover)
    - layer: layer number shown in the subtitle

    The figure is displayed and written to ./results/pca_3d.html; nothing is returned.
    """
    hover_template = ''.join([
        '<b>%{text}</b><br>',
        'PC1: %{x:.3f}<br>',
        'PC2: %{y:.3f}<br>',
        'PC3: %{z:.3f}<br>',
        '<extra></extra>',
    ])

    scatter = go.Scatter3d(
        x=pca_transformed[:, 0],
        y=pca_transformed[:, 1],
        z=pca_transformed[:, 2],
        mode='markers+text',
        text=trait_labels,
        textposition='top center',
        textfont=dict(size=6),
        marker=dict(
            size=3,
            color=['blue'] * len(trait_labels),
            line=dict(width=2, color='black')
        ),
        hovertemplate=hover_template
    )
    fig_3d = go.Figure(data=[scatter])

    # Axis titles carry the share of variance explained by each component
    pc1_title = f'PC1 ({variance_explained[0]*100:.1f}%)'
    pc2_title = f'PC2 ({variance_explained[1]*100:.1f}%)'
    pc3_title = f'PC3 ({variance_explained[2]*100:.1f}%)'

    fig_3d.update_layout(
        title=dict(
            text='Trait Vectors in Principal Component Space',
            subtitle=dict(text=f"Gemma 2 27B, Layer {layer}"),
        ),
        scene=dict(
            xaxis_title=pc1_title,
            yaxis_title=pc2_title,
            zaxis_title=pc3_title
        ),
        width=1000,
        height=800
    )

    fig_3d.show()
    fig_3d.write_html("./results/pca_3d.html")

### pos_neg 

In [20]:
# Keep only the traits whose mean (pos - neg) score gap is at least 40,
# pairing each kept trait name with its 'pos_neg' vector
kept_pos_neg = [
    (trait, vector['pos_neg'])
    for trait, vector in vectors.items()
    if stats[trait]['pos_minus_neg_mean'] >= 40
]
filtered_pos_neg_traits = [trait for trait, _ in kept_pos_neg]
filtered_pos_neg = [tensor for _, tensor in kept_pos_neg]

print(len(filtered_pos_neg_traits))

# Stack into one float tensor with the kept traits along the first axis
filtered_pos_neg = torch.stack(filtered_pos_neg).float()
print(filtered_pos_neg.shape)

208
torch.Size([208, 46, 4608])


In [54]:
# compute_pca returns five values (the fitted PCA and scaler come last), so unpack all of them
pca_transformed, variance_explained, n_components, pca, scaler = compute_pca(filtered_pos_neg, layer)

PCA fitted with 208 components
Cumulative variance for first 5 components: [0.15301124 0.2713136  0.36042659 0.42452626 0.46193193]

PCA Analysis Results:
Elbow point at component: 5
Dimensions for 70% variance: 18
Dimensions for 80% variance: 33
Dimensions for 90% variance: 65
Dimensions for 95% variance: 100


### pos_neg_50

In [15]:
# Select pos_neg_50 vectors for PCA, skipping (and reporting) every trait
# whose large_diff_count is below 120
filtered_pos_neg_50_traits = []
filtered_pos_neg_50 = []

for trait, vector in vectors.items():
    diff_count = stats[trait]['large_diff_count']
    if diff_count < 120:
        print(f"Skipping {trait} because large_diff_count is {diff_count}")
        continue
    filtered_pos_neg_50_traits.append(trait)
    filtered_pos_neg_50.append(vector['pos_neg_50'])

print(len(filtered_pos_neg_50_traits))

# Stack into one float tensor with the kept traits along the first axis
filtered_pos_neg_50 = torch.stack(filtered_pos_neg_50).float()
print(filtered_pos_neg_50.shape)

Skipping vindictive because large_diff_count is 1
239
torch.Size([239, 46, 4608])


In [17]:
# Fit PCA on the filtered pos_neg_50 vectors at `layer`; the fitted PCA and scaler objects are kept as well
pca_transformed, variance_explained, n_components, pca, scaler = compute_pca(filtered_pos_neg_50, layer)

PCA fitted with 239 components
Cumulative variance for first 5 components: [0.19661985 0.33749878 0.45002325 0.52655587 0.57136379]

PCA Analysis Results:
Elbow point at component: 2
Dimensions for 70% variance: 10
Dimensions for 80% variance: 18
Dimensions for 90% variance: 39
Dimensions for 95% variance: 66


In [None]:
# Plot a single principal component and save it as HTML.
# `component` is 0-indexed, so 10 selects PC11 (the file name uses component + 1).
component = 10
fig = plot_pca_cosine_similarity(
    pca_transformed,
    filtered_pos_neg_50_traits,
    pc_component=component,
    layer=layer,
    color_threshold=0.0,  # Adjust this threshold as needed
)
fig.show()
fig.write_html(f"./results/pca_240/pc{component+1}.html")

In [28]:
# 3D scatter of the first three principal components for the filtered pos_neg_50 traits
plot_3d_pca(pca_transformed, variance_explained, filtered_pos_neg_50_traits, layer)

### pos_neg_50 with Mean Assistant

In [34]:
# NOTE(review): project_mean_activation and plot_pca_cosine_similarity_with_mean are
# defined in later cells of this notebook ("Default Assistant Activation" section), and
# plot_3d_pca_with_mean is not defined in any cell above this one. On a fresh kernel
# those definitions must be run before this cell, otherwise it raises NameError.

# Re-run PCA on filtered_pos_neg_50, keeping the fitted PCA and scaler objects under dedicated names
pca_transformed_neg50, variance_explained_neg50, n_components_neg50, pca_neg50, scaler_neg50 = compute_pca(filtered_pos_neg_50, layer)

# Project the mean assistant activation into this PCA space
# (project_mean_activation reads it from data/default_activation.pt)
mean_projected_neg50 = project_mean_activation(pca_neg50, scaler_neg50, layer)

# Create the enhanced cosine similarity plot for PC1
component = 0  # PC1
fig_neg50_with_mean = plot_pca_cosine_similarity_with_mean(
    pca_transformed=pca_transformed_neg50,
    trait_labels=filtered_pos_neg_50_traits,
    mean_projected=mean_projected_neg50,
    pc_component=component,
    layer=layer,
    color_threshold=0.0
)
fig_neg50_with_mean.show()
# File name uses the 1-based component number: pc1_assistant.html
fig_neg50_with_mean.write_html(f"./results/pca_240/pc{component+1}_assistant.html")

# Create the enhanced 3D PCA plot
# NOTE(review): the result is only assigned here; presumably plot_3d_pca_with_mean
# displays the figure itself - confirm against its definition.
fig_3d_neg50_with_mean = plot_3d_pca_with_mean(
    pca_transformed=pca_transformed_neg50,
    variance_explained=variance_explained_neg50,
    trait_labels=filtered_pos_neg_50_traits,
    mean_projected=mean_projected_neg50,
    layer=layer
)



PCA fitted with 239 components
Cumulative variance for first 5 components: [0.19661985 0.33749878 0.45002325 0.52655587 0.57136379]

PCA Analysis Results:
Elbow point at component: 2
Dimensions for 70% variance: 10
Dimensions for 80% variance: 18
Dimensions for 90% variance: 39
Dimensions for 95% variance: 66
Loaded mean default activation with shape: torch.Size([46, 4608])
Extracted layer 34 activation with shape: (1, 4608)
Mean activation projected to PCA space with shape: (1, 239)
First 3 PC coordinates: [-26.963, 6.518, -16.872]



Assistant Position in PCA Space:
  PC1: -26.963
  PC2: 6.518
  PC3: -16.872
  Distance from origin: 32.467

Traits relative to Assistant:
  Closest trait: deferential (distance: 7.543)
  Furthest trait: bombastic (distance: 100.583)
  Mean distance: 51.534
  Std distance: 21.728


In [36]:
# Repeat the mean-assistant plot for the components with 0-based index 1..9 (PC2 to PC10),
# showing each figure and saving it as pc<N>_assistant.html with N = index + 1
for i in range(1, 10):
    component = i
    fig_neg50_with_mean = plot_pca_cosine_similarity_with_mean(
        pca_transformed_neg50,
        filtered_pos_neg_50_traits,
        mean_projected_neg50,
        pc_component=component,
        layer=layer,
        color_threshold=0.0,
    )
    fig_neg50_with_mean.show()
    fig_neg50_with_mean.write_html(f"./results/pca_240/pc{component+1}_assistant.html")

## Default Assistant Activation

In [30]:
def load_and_compute_mean_default_activation():
    """
    Load all trait activations and compute the mean default assistant activation.
    
    This function:
    1. Loads all .pt files from data/response_activations/
    2. Extracts every activation whose key matches default_p{prompt_idx}_q{question_idx},
       for whatever prompt and question indices are present in the file (no hard-coded
       index ranges, so no question is silently left out)
    3. Computes the mean across all traits and all default samples using a running
       float64 sum, so the tensors never have to be stacked in memory at once
    4. Returns the mean as a tensor with the shape of a single activation
       (described as (num_layers, hidden_dim) = (46, 4608) for this project's data)
    5. Saves the result as data/default_activation.pt
    
    Files that fail to load are reported and skipped.
    
    Returns:
        torch.Tensor: Mean default activation tensor, in the dtype of the first
        collected activation when that dtype is floating point
    
    Raises:
        ValueError: if no default activation is found, or if the collected
        activations do not all have the same shape
    """
    
    def _is_default_key(key):
        """Return True for keys of the form default_p<digits>_q<digits>."""
        prefix = "default_p"
        if not isinstance(key, str) or not key.startswith(prefix):
            return False
        prompt_part, separator, question_part = key[len(prefix):].partition("_q")
        return bool(separator) and prompt_part.isdigit() and question_part.isdigit()
    
    # Directory containing response activations
    activations_dir = Path("data/response_activations")
    
    # Get list of all trait files
    trait_files = list(activations_dir.glob("*.pt"))
    print(f"Found {len(trait_files)} trait files to process")
    
    # Running sum over all default activations (float64 to limit rounding error)
    running_sum = None
    result_dtype = None
    sample_count = 0
    
    # Process each trait file
    for trait_file in tqdm(trait_files, desc="Loading trait activations"):
        try:
            # Load the activation dictionary for this trait and pick its default entries
            activations = torch.load(trait_file, map_location='cpu')
            default_tensors = [
                tensor for key, tensor in activations.items() if _is_default_key(key)
            ]
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the others
            print(f"Error loading {trait_file.name}: {e}")
            continue
        
        # Accumulate outside the try block so that a shape mismatch is a hard error
        # instead of being reported as a load failure
        for activation_tensor in default_tensors:
            if running_sum is None:
                running_sum = torch.zeros(activation_tensor.shape, dtype=torch.float64)
                result_dtype = activation_tensor.dtype
            elif activation_tensor.shape != running_sum.shape:
                raise ValueError(
                    f"Default activation in {trait_file.name} has shape "
                    f"{tuple(activation_tensor.shape)}, expected {tuple(running_sum.shape)}"
                )
            running_sum += activation_tensor.to(torch.float64)
            sample_count += 1
    
    print(f"Collected {sample_count} default activation tensors")
    
    if sample_count == 0:
        raise ValueError("No default activations found!")
    
    # Compute mean across all samples
    mean_default_activation = running_sum / sample_count
    if result_dtype.is_floating_point:
        # Return/save in the dtype of the inputs, as a mean over the stacked inputs would
        mean_default_activation = mean_default_activation.to(result_dtype)
    print(f"Mean default activation shape: {mean_default_activation.shape}")
    
    # Save the result
    output_path = "data/default_activation.pt"
    torch.save(mean_default_activation, output_path)
    print(f"Saved mean default activation to {output_path}")
    
    # Print some summary statistics
    print(f"\nSummary statistics:")
    print(f"  Mean activation magnitude: {mean_default_activation.abs().mean():.6f}")
    print(f"  Standard deviation: {mean_default_activation.std():.6f}")
    print(f"  Min value: {mean_default_activation.min():.6f}")
    print(f"  Max value: {mean_default_activation.max():.6f}")
    
    return mean_default_activation



In [31]:
def project_mean_activation(pca, scaler, layer):
    """
    Load the saved mean default activation and map one of its layers into PCA space.

    Parameters:
    - pca: fitted object whose transform() maps scaled data into PCA space
           (e.g. the PCA returned by compute_pca())
    - scaler: fitted object whose transform() applies the preprocessing used
              before PCA (e.g. the StandardScaler returned by compute_pca())
    - layer: row of the saved activation to project

    Returns:
    - 1D array with the PCA coordinates of the selected layer's mean activation
    """
    # The mean default activation is read from a fixed location
    mean_default_activation = torch.load("data/default_activation.pt", map_location='cpu')
    print(f"Loaded mean default activation with shape: {mean_default_activation.shape}")

    # Select the requested layer and turn it into a single-row float NumPy matrix
    layer_row = mean_default_activation[layer, :].float().numpy()
    mean_layer_activation = layer_row.reshape(1, -1)
    print(f"Extracted layer {layer} activation with shape: {mean_layer_activation.shape}")

    # Same pipeline as for the trait vectors: scale first, then project
    mean_projected = pca.transform(scaler.transform(mean_layer_activation))
    print(f"Mean activation projected to PCA space with shape: {mean_projected.shape}")

    pc1, pc2, pc3 = mean_projected[0, 0], mean_projected[0, 1], mean_projected[0, 2]
    print(f"First 3 PC coordinates: [{pc1:.3f}, {pc2:.3f}, {pc3:.3f}]")

    # Drop the leading singleton dimension
    return mean_projected[0]

In [32]:
def _add_point_trace(fig, x_values, marker_colors, labels, opacity):
    """Add one row of hover-labelled markers (all at y=1) to ``fig``."""
    fig.add_trace(
        go.Scatter(
            x=x_values,
            y=[1] * len(x_values),
            mode='markers',
            marker=dict(
                color=marker_colors,
                size=8,
                opacity=opacity
            ),
            text=labels,
            showlegend=False,
            hovertemplate='<b>%{text}</b><br>Cosine Similarity: %{x:.3f}<extra></extra>'
        )
    )


def _add_leader_label(fig, x_pos, y_label, text, color):
    """Draw a vertical leader line from the marker row (y=1.0) to a boxed label at ``y_label``."""
    fig.add_trace(
        go.Scatter(
            x=[x_pos, x_pos],
            y=[1.0, y_label],
            mode='lines',
            line=dict(color=color, width=1),
            showlegend=False,
            hoverinfo='skip'
        )
    )
    fig.add_annotation(
        x=x_pos,
        y=y_label,
        text=text,
        showarrow=False,
        font=dict(size=10, color=color),
        bgcolor="rgba(255, 255, 255, 0.9)",
        bordercolor=color,
        borderwidth=1
    )


def plot_pca_cosine_similarity_with_mean(pca_transformed, trait_labels, mean_projected, 
                                        pc_component=0, layer=None, reference_point=None, 
                                        color_threshold=0.0, use_mean_as_reference=False):
    """
    Create a one-dimensional strip plot of per-trait similarity values with the
    mean assistant activation marked as a dashed vertical line.
    
    The x value of each trait is chosen by the first matching mode:
    1. use_mean_as_reference=True: cosine similarity to ``mean_projected``
    2. reference_point given: cosine similarity to ``reference_point``
    3. otherwise: the selected PC coordinate divided by the L2 norm of that
       coordinate across all traits
    
    The 10 lowest and 10 highest points get visible labels with leader lines;
    all other points are labelled on hover only.
    
    Parameters:
    - pca_transformed: PCA-transformed data (n_samples, n_components)
    - trait_labels: List of labels for each data point
    - mean_projected: Mean assistant activation projected into PCA space (1D array)
    - pc_component: Which PC component to use (0-indexed, so PC1 = 0)
    - layer: Layer number for the subtitle; omitted from the subtitle when None
    - reference_point: Reference point for cosine similarity calculation 
    - color_threshold: Points below this value are red, others blue (default: 0.0)
    - use_mean_as_reference: If True, calculate cosine similarity relative to mean activation
    
    Returns:
    - Plotly figure object
    """
    
    # Extract the specified PC component
    pc_values = pca_transformed[:, pc_component]
    mean_pc_value = mean_projected[pc_component]
    
    # Calculate cosine similarities
    if use_mean_as_reference:
        # Use mean activation as reference point
        cosine_sims = cosine_similarity(pca_transformed, mean_projected.reshape(1, -1)).flatten()
        mean_cosine_sim = 1.0  # Perfect similarity with itself
    elif reference_point is not None:
        # Calculate cosine similarity with a specific reference point
        cosine_sims = cosine_similarity(pca_transformed, reference_point.reshape(1, -1)).flatten()
        mean_cosine_sim = cosine_similarity(mean_projected.reshape(1, -1), reference_point.reshape(1, -1))[0, 0]
    else:
        # Use the PC component direction as reference
        # NOTE(review): the traits are divided by the norm of pc_values alone, while the
        # mean is divided by the norm of pc_values plus the mean itself, so the two are
        # not on the same scale. Left unchanged to keep existing figures reproducible;
        # confirm whether this is intended.
        cosine_sims = pc_values / np.linalg.norm(pc_values)  # Normalized PC values
        mean_cosine_sim = mean_pc_value / np.linalg.norm(np.concatenate([pc_values, [mean_pc_value]]))
    
    # Create colors based on threshold
    colors = ['red' if sim < color_threshold else 'blue' for sim in cosine_sims]
    
    # Identify extreme traits (10 lowest and 10 highest). With fewer than 20 points the
    # two slices overlap, so drop from the high list anything already in the low list;
    # otherwise the same trait would get two leader lines and two labels.
    sorted_indices = np.argsort(cosine_sims)
    low_extreme_indices = [int(idx) for idx in sorted_indices[:10]]
    low_extreme_set = set(low_extreme_indices)
    high_extreme_indices = [
        int(idx) for idx in sorted_indices[-10:] if int(idx) not in low_extreme_set
    ]
    extreme_indices = low_extreme_set | set(high_extreme_indices)
    
    # Create single figure (no subplots)
    fig = go.Figure()
    
    # Split points into regular and extreme for different display modes
    regular_x, regular_colors, regular_labels = [], [], []
    extreme_x, extreme_colors, extreme_labels = [], [], []
    
    for i, (sim, color, label) in enumerate(zip(cosine_sims, colors, trait_labels)):
        if i in extreme_indices:
            extreme_x.append(sim)
            extreme_colors.append(color)
            extreme_labels.append(label)
        else:
            regular_x.append(sim)
            regular_colors.append(color)
            regular_labels.append(label)
    
    # Add regular points (hover labels only)
    if regular_x:
        _add_point_trace(fig, regular_x, regular_colors, regular_labels, opacity=0.7)
    
    # Add extreme points with visible labels and leader lines
    if extreme_x:
        _add_point_trace(fig, extreme_x, extreme_colors, extreme_labels, opacity=0.9)
        
        # Label heights alternate above/below the marker row so neighbouring labels
        # are less likely to collide: [high0, low0, high1, low1, ...]
        high_positions = [1.6, 1.45, 1.55, 1.35, 1.5, 1.4, 1.65, 1.3, 1.58, 1.42]
        low_positions = [0.4, 0.55, 0.45, 0.65, 0.5, 0.6, 0.35, 0.7, 0.42, 0.58]
        all_y_positions = [
            y_pos for pair in zip(high_positions, low_positions) for y_pos in pair
        ]
        
        # Low extremes take the first 10 slots, high extremes the last 10
        for slot, idx in enumerate(low_extreme_indices):
            _add_leader_label(fig, cosine_sims[idx], all_y_positions[slot],
                              trait_labels[idx], colors[idx])
        
        for slot, idx in enumerate(high_extreme_indices):
            _add_leader_label(fig, cosine_sims[idx], all_y_positions[slot + 10],
                              trait_labels[idx], colors[idx])
    
    # Add vertical line at x=0
    fig.add_vline(x=0, line_dash="solid", line_color="gray", line_width=1, opacity=0.7)
    
    # Add black dashed vertical line for assistant position
    fig.add_vline(x=mean_cosine_sim, line_dash="dash", line_color="black", line_width=1, opacity=1.0)
    
    # Add Assistant label at same height as extremes
    assistant_y_position = 1.6  # Same as first high position
    fig.add_annotation(
        x=mean_cosine_sim,
        y=assistant_y_position,
        text="Assistant",
        showarrow=False,
        font=dict(size=10, color="black"),
        bgcolor="rgba(255, 255, 255, 0.9)",
        bordercolor="black",
        borderwidth=1
    )
    
    # Avoid rendering "Layer None" when the caller did not supply a layer
    if layer is None:
        subtitle_text = "Gemma 2 27B"
    else:
        subtitle_text = f"Gemma 2 27B, Layer {layer}"
    
    # Update layout
    fig.update_layout(
        height=500,  # Reduced height since no subplot
        title=dict(
            text=f"PC{pc_component+1} Cosine Similarity with Assistant",
            subtitle={
                "text": subtitle_text,
            },
            x=0.5,
            font=dict(size=16)
        ),
        showlegend=False
    )
    
    # Calculate symmetric range that includes mean
    all_values = list(cosine_sims) + [mean_cosine_sim]
    max_abs_value = max(abs(min(all_values)), abs(max(all_values)))
    x_half_width = max_abs_value * 1.1
    if x_half_width == 0:
        # Every value is exactly 0: fall back to a non-degenerate axis range
        x_half_width = 1.0
    
    # Update x-axis
    fig.update_xaxes(
        title_text=f"PC{pc_component+1} Cosine Similarity",
        range=[-x_half_width, x_half_width]
    )
    
    # Update y-axis
    fig.update_yaxes(
        title_text="",
        showticklabels=False,
        range=[0.25, 1.75]  # Standard range for labels
    )
    
    return fig

In [None]:
def plot_3d_pca_with_mean(pca_transformed, variance_explained, trait_labels, mean_projected, layer,
                          output_path="./results/pca_240/pca_3d_assistant.html"):
    """
    Create a 3D scatter plot of trait vectors in PCA space, including the mean assistant activation.
    
    Besides building the figure, this prints the assistant's first three PC
    coordinates and its Euclidean distances to the traits (in the first three
    PCs), displays the figure, and writes it to ``output_path`` as HTML.
    
    Parameters:
    - pca_transformed: PCA-transformed trait data (n_samples, n_components); the
      first three components are plotted
    - variance_explained: Explained variance ratio from PCA (first three entries
      are shown in the axis titles)
    - trait_labels: List of trait labels
    - mean_projected: Mean assistant activation projected into PCA space
    - layer: Layer number for title
    - output_path: Destination HTML file; missing parent directories are created
    
    Returns:
    - Plotly figure object
    """
    # Plotly hover template shared by both traces (%{x} etc. are filled in by Plotly)
    coordinate_hover = (
        'PC1: %{x:.3f}<br>'
        'PC2: %{y:.3f}<br>'
        'PC3: %{z:.3f}<br>'
        '<extra></extra>'
    )
    
    # Create 3D scatter plot with trait vectors
    fig_3d = go.Figure()
    
    # Add trait vectors
    fig_3d.add_trace(go.Scatter3d(
        x=pca_transformed[:, 0],
        y=pca_transformed[:, 1], 
        z=pca_transformed[:, 2],
        mode='markers+text',
        text=trait_labels,
        textposition='top center',
        textfont=dict(size=6),
        marker=dict(
            size=3,
            color='blue',
            line=dict(width=1, color='darkblue'),
            opacity=0.7
        ),
        showlegend=False,
        hovertemplate='<b>%{text}</b><br>' + coordinate_hover
    ))
    
    # Add mean assistant activation as simple red dot
    fig_3d.add_trace(go.Scatter3d(
        x=[mean_projected[0]],
        y=[mean_projected[1]],
        z=[mean_projected[2]],
        mode='markers+text',
        text=['Assistant'],
        textposition='top center',
        textfont=dict(size=8, color='black'),
        marker=dict(
            size=5,  # 2 sizes bigger than trait dots (3 -> 5)
            color='red',
            opacity=1.0
        ),
        showlegend=False,
        hovertemplate='<b>Assistant</b><br>' + coordinate_hover
    ))
    
    fig_3d.update_layout(
        title={
            "text": 'Trait Vectors in Principal Component Space with Assistant',
            "subtitle": {
                "text": f"Gemma 2 27B, Layer {layer}",
            },
        },
        scene=dict(
            xaxis_title=f'PC1 ({variance_explained[0]*100:.1f}%)',
            yaxis_title=f'PC2 ({variance_explained[1]*100:.1f}%)',
            zaxis_title=f'PC3 ({variance_explained[2]*100:.1f}%)',
            camera=dict(
                eye=dict(x=1.5, y=1.5, z=1.5)
            )
        ),
        width=1000,
        height=800,
        showlegend=False
    )
    
    # Print some statistics about the mean assistant position
    print("\nAssistant Position in PCA Space:")
    print(f"  PC1: {mean_projected[0]:.3f}")
    print(f"  PC2: {mean_projected[1]:.3f}")  
    print(f"  PC3: {mean_projected[2]:.3f}")
    print(f"  Distance from origin: {np.linalg.norm(mean_projected[:3]):.3f}")
    
    # Calculate distances from mean assistant to all traits (first three PCs only)
    distances = np.linalg.norm(pca_transformed[:, :3] - mean_projected[:3], axis=1)
    closest_idx = np.argmin(distances)
    furthest_idx = np.argmax(distances)
    
    print("\nTraits relative to Assistant:")
    print(f"  Closest trait: {trait_labels[closest_idx]} (distance: {distances[closest_idx]:.3f})")
    print(f"  Furthest trait: {trait_labels[furthest_idx]} (distance: {distances[furthest_idx]:.3f})")
    print(f"  Mean distance: {distances.mean():.3f}")
    print(f"  Std distance: {distances.std():.3f}")
    
    fig_3d.show()
    
    # write_html opens the target file directly, so make sure its directory exists
    # first; otherwise a fresh checkout fails with FileNotFoundError.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    fig_3d.write_html(output_path)
    
    return fig_3d