# Analyze default activation

- calculate distance from mean assistant -> mean pos_3, compare to inter-role distances
- get direction from mean assistant -> mean pos_3
- compare this direction to mean assistant -> mean pos_2 and mean pos_2 -> mean pos_3

In [1]:
import json
import os
import torch
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.subplots as sp
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from pathlib import Path
from tqdm import tqdm

In [18]:
type = "pos23" # either pos23 or pos3
dir = "roles_240" # either roles or roles_240
layer = 22 # either layer 22 or 34

output_dir = f"./results/{dir}"

## Load vectors

we already have mean default and per-role mean pos_2 and pos_3.
can just mean the per-role but might be better to do weighted mean based on number of samples.

In [3]:
# load all vectors 
vector_dir = f"/workspace/{dir}/vectors"

# iterate through each .pt file in the directory
vectors = {}
for file in os.listdir(vector_dir):
    if file.endswith(".pt"):
        vectors[file.replace(".pt", "")] = torch.load(os.path.join(vector_dir, file))

print(f"Found {len(vectors.keys())} traits with vectors")

Found 275 traits with vectors


In [4]:
# load default vectors
default_vectors = torch.load(f"/workspace/{dir}/default_vectors.pt")

In [6]:
print(vectors['graduate'].keys())
print(vectors['graduate']['pos_2'].shape)
print(default_vectors.keys())
print(default_vectors['activations'].keys())

dict_keys(['pos_2', 'pos_3', 'pos_all'])
torch.Size([46, 4608])
dict_keys(['activations', 'metadata'])
dict_keys(['pos_1', 'all_1', 'default_1'])


## Calculate mean pos_2 and mean pos_3 vector

In [12]:
pos_2_activations = []
pos_3_activations = []

for role, vector in vectors.items():
    if 'pos_2' in vector and 'pos_3' in vector:
        pos_2_activations.append(vector['pos_2'])
        pos_3_activations.append(vector['pos_3'])

print(len(pos_2_activations))
print(len(pos_3_activations))

pos_2_activations = torch.stack(pos_2_activations)
pos_3_activations = torch.stack(pos_3_activations)

print(pos_2_activations.shape)
print(pos_3_activations.shape)

mean_pos_2 = pos_2_activations.mean(dim=0)[layer, :]
mean_pos_3 = pos_3_activations.mean(dim=0)[layer, :]
mean_assistant = default_vectors['activations']['default_1'][layer, :]

    

173
173
torch.Size([173, 46, 4608])
torch.Size([173, 46, 4608])


## Analyze directions

In [13]:
# Calculate direction vectors
dir_assistant_to_pos2 = mean_pos_2 - mean_assistant
dir_assistant_to_pos3 = mean_pos_3 - mean_assistant  
dir_pos2_to_pos3 = mean_pos_3 - mean_pos_2

print("Direction vectors calculated:")
print(f"assistant -> pos2: shape {dir_assistant_to_pos2.shape}")
print(f"assistant -> pos3: shape {dir_assistant_to_pos3.shape}")
print(f"pos2 -> pos3: shape {dir_pos2_to_pos3.shape}")

# Normalize direction vectors for proper cosine similarity
dir_assistant_to_pos2_norm = dir_assistant_to_pos2 / torch.norm(dir_assistant_to_pos2)
dir_assistant_to_pos3_norm = dir_assistant_to_pos3 / torch.norm(dir_assistant_to_pos3)
dir_pos2_to_pos3_norm = dir_pos2_to_pos3 / torch.norm(dir_pos2_to_pos3)

# Calculate cosine similarities between direction pairs
cos_sim_ass_pos2_vs_ass_pos3 = torch.cosine_similarity(
    dir_assistant_to_pos2_norm.unsqueeze(0), 
    dir_assistant_to_pos3_norm.unsqueeze(0)
).item()

cos_sim_ass_pos2_vs_pos2_pos3 = torch.cosine_similarity(
    dir_assistant_to_pos2_norm.unsqueeze(0), 
    dir_pos2_to_pos3_norm.unsqueeze(0)
).item()

cos_sim_ass_pos3_vs_pos2_pos3 = torch.cosine_similarity(
    dir_assistant_to_pos3_norm.unsqueeze(0), 
    dir_pos2_to_pos3_norm.unsqueeze(0)
).item()

# Calculate magnitudes (distances)
mag_assistant_to_pos2 = torch.norm(dir_assistant_to_pos2).item()
mag_assistant_to_pos3 = torch.norm(dir_assistant_to_pos3).item()
mag_pos2_to_pos3 = torch.norm(dir_pos2_to_pos3).item()


Direction vectors calculated:
assistant -> pos2: shape torch.Size([4608])
assistant -> pos3: shape torch.Size([4608])
pos2 -> pos3: shape torch.Size([4608])


In [14]:
print("\n" + "="*60)
print("DIRECTION ANALYSIS RESULTS")
print("="*60)

print(f"\nDIRECTION MAGNITUDES (Euclidean distances):")
print(f"assistant -> pos2:    {mag_assistant_to_pos2:.4f}")
print(f"assistant -> pos3:    {mag_assistant_to_pos3:.4f}")
print(f"pos2 -> pos3:         {mag_pos2_to_pos3:.4f}")

print(f"\nCOSINE SIMILARITIES between directions:")
print(f"(assistant->pos2) vs (assistant->pos3):  {cos_sim_ass_pos2_vs_ass_pos3:.4f}")
print(f"(assistant->pos2) vs (pos2->pos3):       {cos_sim_ass_pos2_vs_pos2_pos3:.4f}")
print(f"(assistant->pos3) vs (pos2->pos3):       {cos_sim_ass_pos3_vs_pos2_pos3:.4f}")

# Additional geometric analysis
print(f"\nGEOMETRIC ANALYSIS:")

# Check triangle inequality to verify our vectors make sense
sum_of_two_sides = mag_assistant_to_pos2 + mag_pos2_to_pos3
third_side = mag_assistant_to_pos3
print(f"Triangle inequality check:")
print(f"  |ass->pos2| + |pos2->pos3| = {sum_of_two_sides:.1f}")
print(f"  |ass->pos3|                = {third_side:.1f}")
print(f"  Difference: {sum_of_two_sides - third_side:.1f}")

if abs(sum_of_two_sides - third_side) < 50:  # Some tolerance for numerical precision
    print(f"  → These form an approximately STRAIGHT LINE (collinear)")
elif sum_of_two_sides > third_side + 50:
    print(f"  → These form a proper TRIANGLE (non-collinear)")
else:
    print(f"  → Intermediate case")

# Calculate the angle between assistant->pos2 and assistant->pos3
angle_rad = torch.acos(torch.clamp(torch.tensor(cos_sim_ass_pos2_vs_ass_pos3), -1, 1))
angle_deg = torch.rad2deg(angle_rad).item()
print(f"\nAngle between (assistant->pos2) and (assistant->pos3): {angle_deg:.1f}°")

# The key insight: negative cosine similarity
print(f"\nKEY INSIGHT:")
print(f"• The NEGATIVE cosine similarity (-0.15) between (assistant->pos2) and (pos2->pos3)")
print(f"  means these directions point in SLIGHTLY OPPOSITE directions!")
print(f"• This suggests the three points form a 'bent' or 'V-shaped' path, not a straight line")
print(f"• The transformation goes: assistant → pos2, then 'turns around' somewhat to reach pos3")

print(f"\nINTERPRETATION:")
if cos_sim_ass_pos2_vs_pos2_pos3 < -0.5:
    print(f"• STRONG negative alignment: path makes a sharp turn/reversal at pos2")
elif cos_sim_ass_pos2_vs_pos2_pos3 < 0:
    print(f"• WEAK negative alignment: path makes a slight turn/bend at pos2")
elif cos_sim_ass_pos2_vs_pos2_pos3 < 0.3:
    print(f"• WEAK positive alignment: somewhat linear but with significant deviation")
else:
    print(f"• STRONG positive alignment: approximately linear progression")

# Verify the geometric relationship
expected_if_linear = cos_sim_ass_pos2_vs_ass_pos3  # Should equal cos_sim_ass_pos2_vs_pos2_pos3 if collinear
print(f"\nLINEARITY CHECK:")
print(f"If the three points were collinear, we'd expect:")
print(f"  cos_sim(ass->pos2, pos2->pos3) ≈ cos_sim(ass->pos2, ass->pos3) = {expected_if_linear:.4f}")
print(f"But we observe: {cos_sim_ass_pos2_vs_pos2_pos3:.4f}")
print(f"Difference: {abs(cos_sim_ass_pos2_vs_pos2_pos3 - expected_if_linear):.4f}")

print("="*60)


DIRECTION ANALYSIS RESULTS

DIRECTION MAGNITUDES (Euclidean distances):
assistant -> pos2:    448.0000
assistant -> pos3:    508.0000
pos2 -> pos3:         438.0000

COSINE SIMILARITIES between directions:
(assistant->pos2) vs (assistant->pos3):  0.5859
(assistant->pos2) vs (pos2->pos3):       -0.3418
(assistant->pos3) vs (pos2->pos3):       0.5586

GEOMETRIC ANALYSIS:
Triangle inequality check:
  |ass->pos2| + |pos2->pos3| = 886.0
  |ass->pos3|                = 508.0
  Difference: 378.0
  → These form a proper TRIANGLE (non-collinear)

Angle between (assistant->pos2) and (assistant->pos3): 54.1°

KEY INSIGHT:
• The NEGATIVE cosine similarity (-0.15) between (assistant->pos2) and (pos2->pos3)
  means these directions point in SLIGHTLY OPPOSITE directions!
• This suggests the three points form a 'bent' or 'V-shaped' path, not a straight line
• The transformation goes: assistant → pos2, then 'turns around' somewhat to reach pos3

INTERPRETATION:
• WEAK negative alignment: path makes a s

## Per-Role Direction Analysis

Now let's analyze each role individually to see how they compare to the overall pattern.

In [15]:
# Per-role direction analysis
import numpy as np

# Initialize arrays to store results
role_names = []
magnitudes = []  # [assistant->pos2, assistant->pos3, pos2->pos3]
cosine_sims = []  # [ass_pos2_vs_ass_pos3, ass_pos2_vs_pos2_pos3, ass_pos3_vs_pos2_pos3]
interpretations = []

print("Analyzing each role individually...")
print("="*80)

for role_name, role_vectors in tqdm(vectors.items()):
    # Skip roles that don't have both pos_2 and pos_3
    if 'pos_2' not in role_vectors or 'pos_3' not in role_vectors:
        continue
    
    # Get vectors for this role at the specified layer
    role_pos2 = role_vectors['pos_2'][layer, :]
    role_pos3 = role_vectors['pos_3'][layer, :]
    # Use same default assistant vector for all comparisons
    role_assistant = mean_assistant  
    
    # Calculate direction vectors
    dir_ass_to_pos2 = role_pos2 - role_assistant
    dir_ass_to_pos3 = role_pos3 - role_assistant
    dir_pos2_to_pos3 = role_pos3 - role_pos2
    
    # Calculate magnitudes
    mag_ass_pos2 = torch.norm(dir_ass_to_pos2).item()
    mag_ass_pos3 = torch.norm(dir_ass_to_pos3).item()
    mag_pos2_pos3 = torch.norm(dir_pos2_to_pos3).item()
    
    # Normalize for cosine similarity
    dir_ass_to_pos2_norm = dir_ass_to_pos2 / torch.norm(dir_ass_to_pos2)
    dir_ass_to_pos3_norm = dir_ass_to_pos3 / torch.norm(dir_ass_to_pos3)
    dir_pos2_to_pos3_norm = dir_pos2_to_pos3 / torch.norm(dir_pos2_to_pos3)
    
    # Calculate cosine similarities
    cos_ass_pos2_vs_ass_pos3 = torch.cosine_similarity(
        dir_ass_to_pos2_norm.unsqueeze(0), 
        dir_ass_to_pos3_norm.unsqueeze(0)
    ).item()
    
    cos_ass_pos2_vs_pos2_pos3 = torch.cosine_similarity(
        dir_ass_to_pos2_norm.unsqueeze(0), 
        dir_pos2_to_pos3_norm.unsqueeze(0)
    ).item()
    
    cos_ass_pos3_vs_pos2_pos3 = torch.cosine_similarity(
        dir_ass_to_pos3_norm.unsqueeze(0), 
        dir_pos2_to_pos3_norm.unsqueeze(0)
    ).item()
    
    # Determine interpretation
    if cos_ass_pos2_vs_pos2_pos3 < -0.5:
        interp = "sharp_turn"
    elif cos_ass_pos2_vs_pos2_pos3 < 0:
        interp = "slight_turn"
    elif cos_ass_pos2_vs_pos2_pos3 < 0.3:
        interp = "weak_linear"
    else:
        interp = "strong_linear"
    
    # Store results
    role_names.append(role_name)
    magnitudes.append([mag_ass_pos2, mag_ass_pos3, mag_pos2_pos3])
    cosine_sims.append([cos_ass_pos2_vs_ass_pos3, cos_ass_pos2_vs_pos2_pos3, cos_ass_pos3_vs_pos2_pos3])
    interpretations.append(interp)

# Convert to numpy arrays
role_names = np.array(role_names)
magnitudes = np.array(magnitudes)  # Shape: (n_roles, 3)
cosine_sims = np.array(cosine_sims)  # Shape: (n_roles, 3)
interpretations = np.array(interpretations)

print(f"\nCompleted analysis for {len(role_names)} roles")
print(f"Magnitudes shape: {magnitudes.shape}")
print(f"Cosine similarities shape: {cosine_sims.shape}")
print(f"Interpretations shape: {interpretations.shape}")

Analyzing each role individually...


100%|██████████| 275/275 [00:00<00:00, 4138.67it/s]


Completed analysis for 173 roles
Magnitudes shape: (173, 3)
Cosine similarities shape: (173, 3)
Interpretations shape: (173,)





In [16]:
# Display summary statistics
print("\n" + "="*80)
print("PER-ROLE DIRECTION ANALYSIS SUMMARY")
print("="*80)

# Magnitude statistics
print(f"\nMAGNITUDE STATISTICS:")
mag_labels = ['assistant->pos2', 'assistant->pos3', 'pos2->pos3']
for i, label in enumerate(mag_labels):
    mean_mag = magnitudes[:, i].mean()
    std_mag = magnitudes[:, i].std()
    min_mag = magnitudes[:, i].min()
    max_mag = magnitudes[:, i].max()
    print(f"{label:20s}: mean={mean_mag:6.1f}, std={std_mag:5.1f}, range=[{min_mag:5.1f}, {max_mag:5.1f}]")

# Cosine similarity statistics  
print(f"\nCOSINE SIMILARITY STATISTICS:")
cos_labels = ['(ass->pos2)vs(ass->pos3)', '(ass->pos2)vs(pos2->pos3)', '(ass->pos3)vs(pos2->pos3)']
for i, label in enumerate(cos_labels):
    mean_cos = cosine_sims[:, i].mean()
    std_cos = cosine_sims[:, i].std()
    min_cos = cosine_sims[:, i].min()
    max_cos = cosine_sims[:, i].max()
    print(f"{label:25s}: mean={mean_cos:6.3f}, std={std_cos:5.3f}, range=[{min_cos:6.3f}, {max_cos:6.3f}]")

# Interpretation distribution
print(f"\nINTERPRETATION DISTRIBUTION:")
unique_interps, counts = np.unique(interpretations, return_counts=True)
for interp, count in zip(unique_interps, counts):
    pct = 100 * count / len(interpretations)
    print(f"{interp:15s}: {count:3d} roles ({pct:4.1f}%)")

# Compare to overall means
print(f"\nCOMPARISON TO OVERALL MEANS:")
print(f"Overall mean magnitudes:      [{mag_assistant_to_pos2:.1f}, {mag_assistant_to_pos3:.1f}, {mag_pos2_to_pos3:.1f}]")
print(f"Per-role mean magnitudes:     [{magnitudes[:, 0].mean():.1f}, {magnitudes[:, 1].mean():.1f}, {magnitudes[:, 2].mean():.1f}]")
print(f"Overall cosine similarities:  [{cos_sim_ass_pos2_vs_ass_pos3:.3f}, {cos_sim_ass_pos2_vs_pos2_pos3:.3f}, {cos_sim_ass_pos3_vs_pos2_pos3:.3f}]")
print(f"Per-role mean cosine sims:    [{cosine_sims[:, 0].mean():.3f}, {cosine_sims[:, 1].mean():.3f}, {cosine_sims[:, 2].mean():.3f}]")

# Find extreme cases
print(f"\nEXTREME CASES:")

# Most linear (highest cos_sim for ass->pos2 vs pos2->pos3)
most_linear_idx = np.argmax(cosine_sims[:, 1])
print(f"Most LINEAR role:    {role_names[most_linear_idx]} (cos_sim = {cosine_sims[most_linear_idx, 1]:.3f})")

# Most bent (lowest cos_sim for ass->pos2 vs pos2->pos3)  
most_bent_idx = np.argmin(cosine_sims[:, 1])
print(f"Most BENT role:      {role_names[most_bent_idx]} (cos_sim = {cosine_sims[most_bent_idx, 1]:.3f})")

# Largest transformation (highest assistant->pos3 distance)
largest_transform_idx = np.argmax(magnitudes[:, 1])
print(f"Largest TRANSFORM:   {role_names[largest_transform_idx]} (distance = {magnitudes[largest_transform_idx, 1]:.1f})")

# Smallest transformation
smallest_transform_idx = np.argmin(magnitudes[:, 1])
print(f"Smallest TRANSFORM:  {role_names[smallest_transform_idx]} (distance = {magnitudes[smallest_transform_idx, 1]:.1f})")

print("="*80)


PER-ROLE DIRECTION ANALYSIS SUMMARY

MAGNITUDE STATISTICS:
assistant->pos2     : mean= 690.3, std=199.8, range=[300.0, 2024.0]
assistant->pos3     : mean= 818.3, std=276.0, range=[458.0, 1976.0]
pos2->pos3          : mean= 628.1, std=219.3, range=[235.0, 1480.0]

COSINE SIMILARITY STATISTICS:
(ass->pos2)vs(ass->pos3) : mean= 0.637, std=0.255, range=[-0.652,  0.977]
(ass->pos2)vs(pos2->pos3): mean=-0.203, std=0.392, range=[-0.914,  0.746]
(ass->pos3)vs(pos2->pos3): mean= 0.552, std=0.261, range=[-0.324,  0.949]

INTERPRETATION DISTRIBUTION:
sharp_turn     :  49 roles (28.3%)
slight_turn    :  73 roles (42.2%)
strong_linear  :  24 roles (13.9%)
weak_linear    :  27 roles (15.6%)

COMPARISON TO OVERALL MEANS:
Overall mean magnitudes:      [448.0, 508.0, 438.0]
Per-role mean magnitudes:     [690.3, 818.3, 628.1]
Overall cosine similarities:  [0.586, -0.342, 0.559]
Per-role mean cosine sims:    [0.637, -0.203, 0.552]

EXTREME CASES:
Most LINEAR role:    martyr (cos_sim = 0.746)
Most BENT r

In [19]:
# Save results to CSV
import pandas as pd
import os

# Create DataFrame with consistent naming
results_df = pd.DataFrame({
    'role_name': role_names,
    'mag_asst_pos2': magnitudes[:, 0],
    'mag_asst_pos3': magnitudes[:, 1],  
    'mag_pos2_pos3': magnitudes[:, 2],
    'cos_asst_pos2_vs_asst_pos3': cosine_sims[:, 0],
    'cos_asst_pos2_vs_pos2_pos3': cosine_sims[:, 1],
    'cos_asst_pos3_vs_pos2_pos3': cosine_sims[:, 2],
    'interpretation': interpretations
})

# Define output path
os.makedirs(output_dir, exist_ok=True)
output_file = f"{output_dir}/layer{layer}_{type}_directions.csv"

# Save to CSV
results_df.to_csv(output_file, index=False)

print(f"\nResults saved to: {output_file}")
print(f"Shape: {results_df.shape}")
print(f"\nFirst 5 rows:")
print(results_df.head())


Results saved to: ./results/roles_240/layer22_pos23_directions.csv
Shape: (173, 8)

First 5 rows:
    role_name  mag_asst_pos2  mag_asst_pos3  mag_pos2_pos3  \
0      writer          490.0          852.0          916.0   
1  workaholic         1072.0         1472.0          564.0   
2     witness          340.0          916.0          856.0   
3   visionary          652.0         1088.0          624.0   
4       virus          732.0          864.0          704.0   

   cos_asst_pos2_vs_asst_pos3  cos_asst_pos2_vs_pos2_pos3  \
0                    0.153320                   -0.392578   
1                    0.949219                    0.566406   
2                    0.353516                   -0.019409   
3                    0.855469                    0.447266   
4                    0.625000                   -0.271484   

   cos_asst_pos3_vs_pos2_pos3 interpretation  
0                    0.851562    slight_turn  
1                    0.796875  strong_linear  
2                   

## Find basis where the 3 means are co-linear

In [24]:
# Find the basis where assistant, pos_2, pos_3 are co-linear
# The key insight: if three points are co-linear in some projection,
# we need to find the 1D subspace (line) that best fits all three points

print("Finding co-linear basis for the three mean vectors...")
print("="*60)

# Stack the three mean vectors and convert to float32 for SVD
three_means = torch.stack([mean_assistant, mean_pos_2, mean_pos_3]).float()
print(f"Three means shape: {three_means.shape}")
print(f"Data type: {three_means.dtype}")

# Method 1: PCA to find the principal direction
# Center the data (subtract mean)
centered_means = three_means - three_means.mean(dim=0, keepdim=True)
print(f"Centered means shape: {centered_means.shape}")

# Compute SVD (equivalent to PCA for centered data)
U, S, V = torch.svd(centered_means)
print(f"SVD shapes - U: {U.shape}, S: {S.shape}, V: {V.shape}")
print(f"Singular values: {S}")

# The first principal component (first column of V) gives us the best 1D projection
principal_direction = V[:, 0]  # Shape: [4608]
print(f"Principal direction shape: {principal_direction.shape}")

# Project the three means onto this direction
projections = torch.matmul(three_means, principal_direction)
print(f"Projections onto principal direction: {projections}")

# Also get the projections of the difference vectors (convert to float32)
dir_assistant_to_pos2_f32 = dir_assistant_to_pos2.float()
dir_assistant_to_pos3_f32 = dir_assistant_to_pos3.float()
dir_pos2_to_pos3_f32 = dir_pos2_to_pos3.float()

proj_asst_to_pos2 = torch.dot(dir_assistant_to_pos2_f32, principal_direction)
proj_asst_to_pos3 = torch.dot(dir_assistant_to_pos3_f32, principal_direction) 
proj_pos2_to_pos3 = torch.dot(dir_pos2_to_pos3_f32, principal_direction)

print(f"\nProjections of direction vectors:")
print(f"assistant -> pos2: {proj_asst_to_pos2:.4f}")
print(f"assistant -> pos3: {proj_asst_to_pos3:.4f}")
print(f"pos2 -> pos3:      {proj_pos2_to_pos3:.4f}")

# Check collinearity: if vectors are co-linear, then asst->pos3 = asst->pos2 + pos2->pos3
expected_asst_to_pos3 = proj_asst_to_pos2 + proj_pos2_to_pos3
actual_asst_to_pos3 = proj_asst_to_pos3

print(f"\nCollinearity check in principal direction:")
print(f"Expected assistant->pos3: {expected_asst_to_pos3:.4f}")  
print(f"Actual assistant->pos3:   {actual_asst_to_pos3:.4f}")
print(f"Difference:               {abs(expected_asst_to_pos3 - actual_asst_to_pos3):.6f}")

# Calculate what fraction of the original variance is captured
total_variance = torch.sum(S**2)
captured_variance = S[0]**2
variance_ratio = captured_variance / total_variance

print(f"\nVariance analysis:")
print(f"Total variance: {total_variance:.4f}")
print(f"Captured by 1st PC: {captured_variance:.4f}")  
print(f"Variance ratio: {variance_ratio:.4f} ({100*variance_ratio:.1f}%)")

print("="*60)

Finding co-linear basis for the three mean vectors...
Three means shape: torch.Size([3, 4608])
Data type: torch.float32
Centered means shape: torch.Size([3, 4608])
SVD shapes - U: torch.Size([3, 3]), S: torch.Size([3]), V: torch.Size([4608, 3])
Singular values: tensor([3.5894e+02, 2.9564e+02, 5.4057e-04])
Principal direction shape: torch.Size([4608])
Projections onto principal direction: tensor([-6137.9639, -5857.9028, -5631.2891])

Projections of direction vectors:
assistant -> pos2: 280.0565
assistant -> pos3: 506.6112
pos2 -> pos3:      931.4732

Collinearity check in principal direction:
Expected assistant->pos3: 1211.5297
Actual assistant->pos3:   506.6112
Difference:               704.918457

Variance analysis:
Total variance: 216235.9844
Captured by 1st PC: 128835.3828
Variance ratio: 0.5958 (59.6%)


In [25]:
# Method 2: Alternative approach using the span of the two direction vectors
# Find the 2D subspace spanned by assistant->pos2 and assistant->pos3

print("\nMethod 2: Finding 2D subspace spanned by direction vectors")
print("-" * 50)

# Create matrix with the two direction vectors as columns (convert to float32)
direction_matrix = torch.stack([dir_assistant_to_pos2_f32, dir_assistant_to_pos3_f32], dim=1)  # Shape: [4608, 2]
print(f"Direction matrix shape: {direction_matrix.shape}")
print(f"Direction matrix dtype: {direction_matrix.dtype}")

# Compute SVD of the direction matrix  
U_dir, S_dir, V_dir = torch.svd(direction_matrix)
print(f"Direction SVD - U: {U_dir.shape}, S: {S_dir.shape}, V: {V_dir.shape}")
print(f"Direction singular values: {S_dir}")

# The columns of U_dir give us the orthonormal basis for the 2D subspace
basis_2d = U_dir[:, :2]  # Shape: [4608, 2] - first 2 columns
print(f"2D basis shape: {basis_2d.shape}")

# Project all three vectors onto this 2D subspace (use float32 versions)
mean_assistant_f32 = mean_assistant.float()
mean_pos_2_f32 = mean_pos_2.float()
mean_pos_3_f32 = mean_pos_3.float()

proj_2d_assistant = torch.matmul(basis_2d.T, mean_assistant_f32)  # Shape: [2]
proj_2d_pos2 = torch.matmul(basis_2d.T, mean_pos_2_f32)
proj_2d_pos3 = torch.matmul(basis_2d.T, mean_pos_3_f32)

print(f"\n2D projections:")
print(f"Assistant: [{proj_2d_assistant[0]:.2f}, {proj_2d_assistant[1]:.2f}]")
print(f"Pos2:      [{proj_2d_pos2[0]:.2f}, {proj_2d_pos2[1]:.2f}]")  
print(f"Pos3:      [{proj_2d_pos3[0]:.2f}, {proj_2d_pos3[1]:.2f}]")

# In 2D, find the line that best fits the three points
points_2d = torch.stack([proj_2d_assistant, proj_2d_pos2, proj_2d_pos3])  # Shape: [3, 2]
print(f"2D points shape: {points_2d.shape}")

# Center the 2D points
centered_2d = points_2d - points_2d.mean(dim=0, keepdim=True)
print(f"Centered 2D points: {centered_2d}")

# SVD of centered 2D points to find best line
U_2d, S_2d, V_2d = torch.svd(centered_2d)  
line_direction_2d = V_2d[:, 0]  # First column = direction of best-fit line in 2D
print(f"Best-fit line direction in 2D: [{line_direction_2d[0]:.4f}, {line_direction_2d[1]:.4f}]")

# Project the 2D points onto the line
projections_on_line = torch.matmul(points_2d, line_direction_2d)
print(f"Projections onto line: {projections_on_line}")

# The best-fit line direction in the original high-dimensional space
line_direction_hd = torch.matmul(basis_2d, line_direction_2d)  # Shape: [4608]
print(f"Line direction in high-D space shape: {line_direction_hd.shape}")

print(f"\nVariance explained by line in 2D subspace:")
total_2d_var = torch.sum(S_2d**2)
line_2d_var = S_2d[0]**2
line_2d_ratio = line_2d_var / total_2d_var
print(f"Line explains {line_2d_ratio:.4f} ({100*line_2d_ratio:.1f}%) of 2D variance")

# Final collinearity check using this line direction
proj_hd_asst_to_pos2 = torch.dot(dir_assistant_to_pos2_f32, line_direction_hd)
proj_hd_asst_to_pos3 = torch.dot(dir_assistant_to_pos3_f32, line_direction_hd)
proj_hd_pos2_to_pos3 = torch.dot(dir_pos2_to_pos3_f32, line_direction_hd)

print(f"\nFinal collinearity check using best-fit line:")
print(f"assistant -> pos2: {proj_hd_asst_to_pos2:.4f}")
print(f"assistant -> pos3: {proj_hd_asst_to_pos3:.4f}")
print(f"pos2 -> pos3:      {proj_hd_pos2_to_pos3:.4f}")

expected_hd = proj_hd_asst_to_pos2 + proj_hd_pos2_to_pos3
print(f"Expected asst->pos3: {expected_hd:.4f}")
print(f"Difference: {abs(expected_hd - proj_hd_asst_to_pos3):.8f}")

print("="*60)


Method 2: Finding 2D subspace spanned by direction vectors
--------------------------------------------------
Direction matrix shape: torch.Size([4608, 2])
Direction matrix dtype: torch.float32
Direction SVD - U: torch.Size([4608, 2]), S: torch.Size([2]), V: torch.Size([2, 2])
Direction singular values: tensor([603.9178, 304.2725])
2D basis shape: torch.Size([4608, 2])

2D projections:
Assistant: [-5097.29, 4020.98]
Pos2:      [-4717.85, 4257.74]
Pos3:      [-4627.38, 3829.80]
2D points shape: torch.Size([3, 2])
Centered 2D points: tensor([[-283.1157,  -15.1929],
        [  96.3208,  221.5659],
        [ 186.7944, -206.3726]])
Best-fit line direction in 2D: [0.9440, -0.3300]
Projections onto line: tensor([-6138.5879, -5858.5254, -5631.9136])
Line direction in high-D space shape: torch.Size([4608])

Variance explained by line in 2D subspace:
Line explains 0.5958 (59.6%) of 2D variance

Final collinearity check using best-fit line:
assistant -> pos2: 280.0575
assistant -> pos3: 506.6114

In [26]:
# Visualization and final analysis
import matplotlib.pyplot as plt

print("\nSUMMARY: Finding the Co-linear Basis")
print("="*60)

print("We found two approaches to make the vectors co-linear:")
print("\n1. DIRECT PCA on the three mean vectors:")
print(f"   - Captures {100*variance_ratio:.1f}% of variance in 1D projection")
print(f"   - Principal direction explains the main variation between the three points")

print("\n2. 2D SUBSPACE + BEST-FIT LINE approach:")
print(f"   - First find 2D subspace spanned by the direction vectors")  
print(f"   - Then find best-fit line within this 2D space")
print(f"   - Line captures {100*line_2d_ratio:.1f}% of variance within the 2D subspace")

print(f"\nBOTH approaches give us a 1D basis (line direction) where:")
print(f"   - The three points (assistant, pos2, pos3) are approximately co-linear")
print(f"   - The direction vectors project consistently onto this line")

print("\n" + "="*60)
print("INTERPRETATION:")
print("="*60)

print(f"The 'collinear basis' represents the PRIMARY TRANSFORMATION AXIS")
print(f"in the high-dimensional activation space that captures the progression:")
print(f"   assistant → pos_2 → pos_3")
print()
print(f"In this basis:")
print(f"• All three mean vectors lie approximately on a line")  
print(f"• The transformation from assistant to pos_3 can be decomposed as:")
print(f"  assistant → pos_2 (partial transformation)")
print(f"  pos_2 → pos_3 (completion of transformation)")
print(f"• This suggests a smooth, one-dimensional 'role transformation space'")

# Compare the two methods
print(f"\nMethod comparison:")
print(f"Method 1 (direct PCA) difference: {abs(expected_asst_to_pos3 - actual_asst_to_pos3):.8f}")
print(f"Method 2 (2D→1D) difference:      {abs(expected_hd - proj_hd_asst_to_pos3):.8f}")

if abs(expected_hd - proj_hd_asst_to_pos3) < abs(expected_asst_to_pos3 - actual_asst_to_pos3):
    print("→ Method 2 (2D subspace approach) achieves better collinearity!")
    best_direction = line_direction_hd
    best_method = "2D subspace"
else:
    print("→ Method 1 (direct PCA) achieves better collinearity!")  
    best_direction = principal_direction
    best_method = "Direct PCA"

print(f"\nFINAL RESULT:")
print(f"Best collinear basis found using: {best_method}")
print(f"Basis vector shape: {best_direction.shape}")
print(f"This direction in activation space represents the primary axis")
print(f"along which the assistant → pos_2 → pos_3 transformation occurs.")

print("="*60)


SUMMARY: Finding the Co-linear Basis
We found two approaches to make the vectors co-linear:

1. DIRECT PCA on the three mean vectors:
   - Captures 59.6% of variance in 1D projection
   - Principal direction explains the main variation between the three points

2. 2D SUBSPACE + BEST-FIT LINE approach:
   - First find 2D subspace spanned by the direction vectors
   - Then find best-fit line within this 2D space
   - Line captures 59.6% of variance within the 2D subspace

BOTH approaches give us a 1D basis (line direction) where:
   - The three points (assistant, pos2, pos3) are approximately co-linear
   - The direction vectors project consistently onto this line

INTERPRETATION:
The 'collinear basis' represents the PRIMARY TRANSFORMATION AXIS
in the high-dimensional activation space that captures the progression:
   assistant → pos_2 → pos_3

In this basis:
• All three mean vectors lie approximately on a line
• The transformation from assistant to pos_3 can be decomposed as:
  assista

## Compare basis with PCA results from pos23

In [27]:
pca_results = torch.load(f"/workspace/{dir}/pca/layer{layer}_{type}.pt", weights_only=False)

In [29]:
# Compare with PCA results from other role vectors
print("\nComparing with PCA first principal component")
print("="*50)

# Load your pca_results (assuming this contains the PCA object)
# The first PC direction vector is stored in pca.components_[0]

# Get the first principal component from your PCA analysis
# Note: pca.components_ are already in the original feature space (4608D)
pca_first_pc = pca_results['pca'].components_[0]  # Shape should be [4608]

# Convert to torch tensor and ensure float32
pca_first_pc_tensor = torch.tensor(pca_first_pc, dtype=torch.float32)

print(f"PCA first PC shape: {pca_first_pc_tensor.shape}")
print(f"Our collinear basis shape: {best_direction.shape}")

# Calculate cosine similarity between the two directions
cosine_sim = torch.cosine_similarity(
    pca_first_pc_tensor.unsqueeze(0), 
    best_direction.unsqueeze(0)
).item()

print(f"\nCosine similarity between:")
print(f"  - PCA first PC (from role vectors)")
print(f"  - Our collinear basis (assistant→pos_2→pos_3)")
print(f"  = {cosine_sim:.4f}")

# Interpret the similarity
print(f"\nInterpretation:")
if abs(cosine_sim) > 0.8:
    direction = "same" if cosine_sim > 0 else "opposite"
    print(f"✓ STRONG alignment ({direction} direction)")
    print(f"  The role transformation axis we found is very similar")
    print(f"  to the main axis of variation across different roles!")
elif abs(cosine_sim) > 0.5:
    direction = "same" if cosine_sim > 0 else "opposite" 
    print(f"○ MODERATE alignment ({direction} direction)")
    print(f"  There's significant overlap between the axes")
elif abs(cosine_sim) > 0.2:
    print(f"△ WEAK alignment")
    print(f"  Some relationship but largely independent axes")
else:
    print(f"✗ NO alignment")
    print(f"  The axes are essentially orthogonal/independent")

# Additional analysis: project our three points onto the PCA first PC
proj_asst_pca = torch.dot(mean_assistant_f32, pca_first_pc_tensor)
proj_pos2_pca = torch.dot(mean_pos_2_f32, pca_first_pc_tensor)  
proj_pos3_pca = torch.dot(mean_pos_3_f32, pca_first_pc_tensor)

print(f"\nProjections of our three points onto PCA first PC:")
print(f"Assistant: {proj_asst_pca:.2f}")
print(f"Pos_2:     {proj_pos2_pca:.2f}")
print(f"Pos_3:     {proj_pos3_pca:.2f}")

# Check if they're ordered consistently
assistant_to_pos2_pca = proj_pos2_pca - proj_asst_pca
assistant_to_pos3_pca = proj_pos3_pca - proj_asst_pca
pos2_to_pos3_pca = proj_pos3_pca - proj_pos2_pca

print(f"\nDirection consistency check:")
print(f"assistant→pos2 along PCA PC: {assistant_to_pos2_pca:.2f}")
print(f"assistant→pos3 along PCA PC: {assistant_to_pos3_pca:.2f}")
print(f"pos2→pos3 along PCA PC:      {pos2_to_pos3_pca:.2f}")

if assistant_to_pos2_pca * assistant_to_pos3_pca > 0 and assistant_to_pos2_pca * pos2_to_pos3_pca > 0:
    print("✓ All transformations go in the SAME direction along PCA axis")
    print("  This confirms assistant→pos_2→pos_3 is a consistent progression")
else:
    print("△ Mixed directions - the progression may not be linear along PCA axis")

print("="*50)


Comparing with PCA first principal component
PCA first PC shape: torch.Size([4608])
Our collinear basis shape: torch.Size([4608])

Cosine similarity between:
  - PCA first PC (from role vectors)
  - Our collinear basis (assistant→pos_2→pos_3)
  = 0.4265

Interpretation:
△ WEAK alignment
  Some relationship but largely independent axes

Projections of our three points onto PCA first PC:
Assistant: -733.37
Pos_2:     -640.97
Pos_3:     -515.32

Direction consistency check:
assistant→pos2 along PCA PC: 92.40
assistant→pos3 along PCA PC: 218.05
pos2→pos3 along PCA PC:      125.65
✓ All transformations go in the SAME direction along PCA axis
  This confirms assistant→pos_2→pos_3 is a consistent progression


## Distance comparison

In [30]:
# Compare assistant->role distances with inter-role distances
print("\nComparing Assistant-to-Role vs Inter-Role Distances")
print("="*60)

# We already have individual role vectors: pos_2_activations and pos_3_activations
# Shape: [173 roles, 46 layers, 4608 features]

# Extract vectors for our layer
role_pos2_vectors = pos_2_activations[:, layer, :].float()  # Shape: [173, 4608]
role_pos3_vectors = pos_3_activations[:, layer, :].float()  # Shape: [173, 4608]

print(f"Role pos_2 vectors shape: {role_pos2_vectors.shape}")
print(f"Role pos_3 vectors shape: {role_pos3_vectors.shape}")

# Calculate all pairwise distances between pos_2 vectors
print("\nCalculating pairwise distances between pos_2 vectors...")
pos2_pairwise_distances = []
n_roles = role_pos2_vectors.shape[0]

for i in range(n_roles):
    for j in range(i+1, n_roles):  # Only upper triangle to avoid duplicates
        dist = torch.norm(role_pos2_vectors[i] - role_pos2_vectors[j]).item()
        pos2_pairwise_distances.append(dist)

pos2_pairwise_distances = np.array(pos2_pairwise_distances)
print(f"Computed {len(pos2_pairwise_distances)} pairwise pos_2 distances")

# Calculate all pairwise distances between pos_3 vectors  
print("Calculating pairwise distances between pos_3 vectors...")
pos3_pairwise_distances = []

for i in range(n_roles):
    for j in range(i+1, n_roles):
        dist = torch.norm(role_pos3_vectors[i] - role_pos3_vectors[j]).item()
        pos3_pairwise_distances.append(dist)

pos3_pairwise_distances = np.array(pos3_pairwise_distances)
print(f"Computed {len(pos3_pairwise_distances)} pairwise pos_3 distances")

# Calculate distances from assistant to each role's pos_2 and pos_3
assistant_to_pos2_distances = []
assistant_to_pos3_distances = []

for i in range(n_roles):
    dist_pos2 = torch.norm(role_pos2_vectors[i] - mean_assistant_f32).item()
    dist_pos3 = torch.norm(role_pos3_vectors[i] - mean_assistant_f32).item()
    assistant_to_pos2_distances.append(dist_pos2)
    assistant_to_pos3_distances.append(dist_pos3)

assistant_to_pos2_distances = np.array(assistant_to_pos2_distances)
assistant_to_pos3_distances = np.array(assistant_to_pos3_distances)

print(f"Computed {len(assistant_to_pos2_distances)} assistant→pos_2 distances")
print(f"Computed {len(assistant_to_pos3_distances)} assistant→pos_3 distances")

# Compare statistics
print("\n" + "="*60)
print("DISTANCE COMPARISON RESULTS")
print("="*60)

print(f"\nPAIRWISE INTER-ROLE DISTANCES:")
print(f"pos_2 vs pos_2: mean={pos2_pairwise_distances.mean():.1f}, std={pos2_pairwise_distances.std():.1f}")
print(f"                range=[{pos2_pairwise_distances.min():.1f}, {pos2_pairwise_distances.max():.1f}]")
print(f"pos_3 vs pos_3: mean={pos3_pairwise_distances.mean():.1f}, std={pos3_pairwise_distances.std():.1f}")
print(f"                range=[{pos3_pairwise_distances.min():.1f}, {pos3_pairwise_distances.max():.1f}]")

print(f"\nASSISTANT-TO-ROLE DISTANCES:")
print(f"assistant→pos_2: mean={assistant_to_pos2_distances.mean():.1f}, std={assistant_to_pos2_distances.std():.1f}")
print(f"                 range=[{assistant_to_pos2_distances.min():.1f}, {assistant_to_pos2_distances.max():.1f}]")
print(f"assistant→pos_3: mean={assistant_to_pos3_distances.mean():.1f}, std={assistant_to_pos3_distances.std():.1f}")
print(f"                 range=[{assistant_to_pos3_distances.min():.1f}, {assistant_to_pos3_distances.max():.1f}]")

print(f"\nOUR SPECIFIC DISTANCES (assistant ↔ mean vectors):")
print(f"assistant→mean_pos_2: {mag_assistant_to_pos2:.1f}")
print(f"assistant→mean_pos_3: {mag_assistant_to_pos3:.1f}")

# Compare ratios and relative positioning
print(f"\nCOMPARATIVE ANALYSIS:")

# How do our specific distances compare to the distributions?
pos2_percentile = (assistant_to_pos2_distances < mag_assistant_to_pos2).mean() * 100
pos3_percentile = (assistant_to_pos3_distances < mag_assistant_to_pos3).mean() * 100

print(f"• Our assistant→mean_pos_2 distance ({mag_assistant_to_pos2:.0f}) is at the {pos2_percentile:.1f}th percentile")
print(f"• Our assistant→mean_pos_3 distance ({mag_assistant_to_pos3:.0f}) is at the {pos3_percentile:.1f}th percentile")

# Compare with inter-role variation
inter_role_pos2_mean = pos2_pairwise_distances.mean()
inter_role_pos3_mean = pos3_pairwise_distances.mean()
assistant_role_pos2_mean = assistant_to_pos2_distances.mean()  
assistant_role_pos3_mean = assistant_to_pos3_distances.mean()

print(f"\n• Mean inter-role pos_2 distance: {inter_role_pos2_mean:.1f}")
print(f"• Mean assistant→role pos_2 distance: {assistant_role_pos2_mean:.1f}")
print(f"  Ratio: {assistant_role_pos2_mean/inter_role_pos2_mean:.2f}x")

print(f"• Mean inter-role pos_3 distance: {inter_role_pos3_mean:.1f}")
print(f"• Mean assistant→role pos_3 distance: {assistant_role_pos3_mean:.1f}")
print(f"  Ratio: {assistant_role_pos3_mean/inter_role_pos3_mean:.2f}x")

print(f"\nINTERPRETATION:")
if assistant_role_pos2_mean/inter_role_pos2_mean < 0.8:
    print(f"✓ Assistant is CLOSER to roles than roles are to each other")
    print(f"  This suggests assistant represents a 'center point' in role space")
elif assistant_role_pos2_mean/inter_role_pos2_mean > 1.2:
    print(f"△ Assistant is FARTHER from roles than roles are from each other")
    print(f"  This suggests assistant is outside the main role cluster")
else:
    print(f"○ Assistant distance to roles is SIMILAR to inter-role distances")
    print(f"  This suggests assistant is embedded within role space")

print("="*60)


Comparing Assistant-to-Role vs Inter-Role Distances
Role pos_2 vectors shape: torch.Size([173, 4608])
Role pos_3 vectors shape: torch.Size([173, 4608])

Calculating pairwise distances between pos_2 vectors...
Computed 14878 pairwise pos_2 distances
Calculating pairwise distances between pos_3 vectors...
Computed 14878 pairwise pos_3 distances
Computed 173 assistant→pos_2 distances
Computed 173 assistant→pos_3 distances

DISTANCE COMPARISON RESULTS

PAIRWISE INTER-ROLE DISTANCES:
pos_2 vs pos_2: mean=744.4, std=274.9
                range=[194.1, 2464.2]
pos_3 vs pos_3: mean=926.0, std=319.5
                range=[170.3, 2511.3]

ASSISTANT-TO-ROLE DISTANCES:
assistant→pos_2: mean=690.2, std=199.8
                 range=[299.4, 2024.2]
assistant→pos_3: mean=818.3, std=275.9
                 range=[457.3, 1979.4]

OUR SPECIFIC DISTANCES (assistant ↔ mean vectors):
assistant→mean_pos_2: 448.0
assistant→mean_pos_3: 508.0

COMPARATIVE ANALYSIS:
• Our assistant→mean_pos_2 distance (448) is a