## Between Class Scatter (LDA)

In [2]:
import os
import sys
import json
import torch
import numpy as np
import pandas as pd

sys.path.append('.')
sys.path.append('..')

from utils.pca_utils import L2MeanScaler, MeanScaler, compute_pca

## Configuration

In [3]:
# Configuration - change these parameters for different models/datasets.
base_dir = "/workspace/gemma-2-27b"  # root directory for this model's data
# Named `dataset_type` rather than `type`: rebinding the builtin `type` at notebook
# scope makes every later `type(...)` call fail with "'str' object is not callable".
dataset_type = "roles_240"
# NOTE: `dir` also shadows a builtin; the name is kept because later cells read it.
dir = f"{base_dir}/{dataset_type}"
model_name = "Gemma-2-27B"  # shown in plot subtitles
layer = 22  # layer index used to slice activations and to select the PCA file

In [4]:
# Load the saved PCA results for the configured layer; later cells read its
# 'vectors' and 'roles' entries.
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects, which can execute code embedded in the file - only load artifacts
# that come from a trusted source.
pca_results = torch.load(f"{dir}/pca/layer{layer}_pos23.pt", weights_only=False)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [5]:
# Between-class vectors: stack the stored pos_3 vectors, keep only the
# configured layer, and convert the result to float32.
vectors = (
    torch.stack(pca_results['vectors']['pos_3'])[:, layer, :]
    .to(dtype=torch.float32)
)
print(vectors.shape)

torch.Size([275, 4608])


In [6]:
# Load every JSON score file, keyed by its file name without the '.json' part.
scores = {}
scores_dir = f"{dir}/extract_scores"
for file in os.listdir(scores_dir):
    if file.endswith('.json'):
        # The context manager closes each handle as soon as it is parsed,
        # instead of leaving hundreds of open files for the garbage collector.
        with open(f"{scores_dir}/{file}") as score_file:
            scores[file.replace('.json', '')] = json.load(score_file)

print(f"Loaded {len(scores)} scores")


Loaded 275 scores


In [9]:
# Collect raw response activations per role, split by score:
# entries scored 3 go to acts3, entries scored 2 go to acts2.
acts3 = {}
acts2 = {}
acts_dir = f"{dir}/response_activations"
for fname in os.listdir(acts_dir):
    # Only '.pt' files, and never the 'default' ones.
    if not fname.endswith('.pt') or 'default' in fname:
        continue

    role = fname.replace('.pt', '')
    # Each file holds a dict of activations (original note: 1200 entries each).
    loaded = torch.load(f"{acts_dir}/{fname}")

    scored_3 = []
    scored_2 = []
    for key in loaded:
        score = scores[role][key]
        if score == 3:
            scored_3.append(loaded[key])
        elif score == 2:
            scored_2.append(loaded[key])

    # Roles with no entries for a given score are left out of that dict.
    if scored_3:
        acts3[role] = torch.stack(scored_3)
    if scored_2:
        acts2[role] = torch.stack(scored_2)

print(f"Loaded {len(acts3)} roles")


Loaded 275 roles


In [11]:
def _layer_samples(activations, layer):
    """Slice one layer out of a [n_samples, n_layers, N] tensor as a float32 NumPy array."""
    # detach()/cpu() are no-ops for plain CPU tensors, but keep .numpy() from
    # raising on tensors that track gradients or live on an accelerator.
    return activations[:, layer, :].detach().cpu().float().numpy()


def prepare_class_data(acts2, acts3, layer, min_samples=10):
    """
    Combine acts2 and acts3 dictionaries into class-based structure.
    
    Parameters:
    -----------
    acts2 : dict
        Dictionary mapping role names to tensors of shape [n_samples, n_layers, N]
    acts3 : dict
        Dictionary mapping role names to tensors of shape [n_samples, n_layers, N]
    layer : int
        Which layer to extract activations from
    min_samples : int
        Minimum number of samples required to include a class
    
    Returns:
    --------
    class_data : dict
        Dictionary mapping class names (e.g., "graduate_pos_2") to float32 arrays
        of shape [n_samples, N]. All pos_2 classes are inserted before pos_3 ones;
        classes with fewer than `min_samples` rows are dropped.
    """
    class_data = {}

    # One shared loop for both score groups; the suffix distinguishes the classes.
    for suffix, role_acts in (("pos_2", acts2), ("pos_3", acts3)):
        for role, activations in role_acts.items():
            layer_acts = _layer_samples(activations, layer)
            if len(layer_acts) >= min_samples:
                class_data[f"{role}_{suffix}"] = layer_acts

    return class_data

# Build the per-class arrays, then report how many samples each class kept.
class_data = prepare_class_data(acts2, acts3, layer=layer, min_samples=10)
print(f"Number of classes after filtering: {len(class_data)}")
print("Sample counts per class:")
for class_name in sorted(class_data):
    print(f"  {class_name}: {len(class_data[class_name])} samples")

Number of classes after filtering: 494
Sample counts per class:
  aberration_pos_2: 84 samples
  aberration_pos_3: 934 samples
  absurdist_pos_3: 1199 samples
  accountant_pos_2: 417 samples
  accountant_pos_3: 585 samples
  activist_pos_2: 228 samples
  activist_pos_3: 953 samples
  actor_pos_2: 39 samples
  actor_pos_3: 1148 samples
  addict_pos_2: 133 samples
  addict_pos_3: 998 samples
  adolescent_pos_2: 21 samples
  adolescent_pos_3: 1158 samples
  advocate_pos_2: 330 samples
  advocate_pos_3: 818 samples
  alien_pos_3: 1196 samples
  altruist_pos_2: 726 samples
  altruist_pos_3: 463 samples
  amateur_pos_2: 141 samples
  amateur_pos_3: 1023 samples
  ambassador_pos_2: 294 samples
  ambassador_pos_3: 721 samples
  amnesiac_pos_3: 1190 samples
  analyst_pos_2: 355 samples
  analyst_pos_3: 842 samples
  anarchist_pos_2: 45 samples
  anarchist_pos_3: 1154 samples
  ancient_pos_3: 1198 samples
  angel_pos_2: 29 samples
  angel_pos_3: 1168 samples
  anthropologist_pos_2: 105 samples
 

In [12]:
def _scatter_about(X, center, chunk_size=4096):
    """
    Return sum_i (x_i - center)(x_i - center)^T over the rows of X.

    Rows are handled in chunks and accumulated in float64, so the matrix is
    built from a few matrix products rather than one Python-level outer
    product per row.
    """
    n_features = X.shape[1]
    scatter = np.zeros((n_features, n_features))
    for start in range(0, len(X), chunk_size):
        centered = np.asarray(X[start:start + chunk_size], dtype=np.float64) - center
        scatter += centered.T @ centered
    return scatter


def find_role_variance_subspace(class_data, scaler=None, k=None):
    """
    Find subspace maximizing between-class variance using Linear Discriminant Analysis.
    
    Parameters:
    -----------
    class_data : dict
        Dictionary mapping class names to arrays of shape [n_samples, N]
    scaler : sklearn-compatible scaler or None
        Optional scaler (StandardScaler, L2MeanScaler, MeanScaler, etc.).
        It is fitted in place on the stacked data via `fit_transform`.
    k : int, optional
        Number of dimensions in subspace. If None, returns all components.
    
    Returns:
    --------
    projection_matrix : array of shape [N, k]
        Columns are the projection directions (real parts of the eigenvectors
        of pinv(S_T) @ S_B), ordered by descending eigenvalue
    eigenvalues : array
        The matching eigenvalues (real parts), in descending order
    projected_data : dict
        Dictionary mapping class names to projected data of shape [n_samples, k]

    Raises:
    -------
    ValueError
        If `class_data` is empty.
    """
    if not class_data:
        raise ValueError("class_data must contain at least one class")

    # Gather all data and class information
    class_names = list(class_data.keys())
    class_samples = [class_data[name] for name in class_names]
    class_sizes = [len(samples) for samples in class_samples]

    # Concatenate all data
    X = np.vstack(class_samples)  # shape: [total_samples, N]

    # Apply scaler if provided
    if scaler is not None:
        X = scaler.fit_transform(X)
        # Re-slice the scaled matrix so each class is projected in scaled space.
        offsets = np.concatenate(([0], np.cumsum(class_sizes)))
        class_samples = [X[lo:hi] for lo, hi in zip(offsets[:-1], offsets[1:])]

    # Overall mean and per-class means
    mu_total = X.mean(axis=0)
    class_means = np.asarray(
        [samples.mean(axis=0) for samples in class_samples], dtype=np.float64
    )

    # Between-class scatter: sum_c n_c (mu_c - mu)(mu_c - mu)^T
    mean_offsets = class_means - mu_total
    weights = np.asarray(class_sizes, dtype=np.float64)[:, None]
    S_B = (mean_offsets * weights).T @ mean_offsets

    # Total scatter: sum_i (x_i - mu)(x_i - mu)^T
    S_T = _scatter_about(X, mu_total)

    # Solve generalized eigenvalue problem: S_B w = λ S_T w
    # Equivalent to: (S_T^-1 @ S_B) w = λ w; pinv is used because S_T can be singular.
    eigenvalues, eigenvectors = np.linalg.eig(np.linalg.pinv(S_T) @ S_B)

    # The product is not symmetric, so eig may return complex values;
    # keep the real parts and sort by eigenvalue (descending).
    eigenvalues = eigenvalues.real
    eigenvectors = eigenvectors.real
    order = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[order]
    eigenvectors = eigenvectors[:, order]

    # Select top k components
    if k is not None:
        eigenvectors = eigenvectors[:, :k]
        eigenvalues = eigenvalues[:k]

    # Project data onto subspace
    projected_data = {
        name: samples @ eigenvectors
        for name, samples in zip(class_names, class_samples)
    }

    return eigenvectors, eigenvalues, projected_data

In [None]:
# Fit the between-class scatter subspace on the per-class data.
from sklearn.preprocessing import StandardScaler

# Scaler choices:
#   StandardScaler() - standardize to zero mean and unit variance
#   L2MeanScaler()   - imported from utils.pca_utils
#   MeanScaler()     - imported from utils.pca_utils
#   None             - no scaling
scaler = StandardScaler()

# Number of dimensions to keep
k = 10

projection_matrix, eigenvalues, projected_data = find_role_variance_subspace(class_data, scaler=scaler, k=k)

print(f"Projection matrix shape: {projection_matrix.shape}")
print(f"Number of components: {len(eigenvalues)}")

In [None]:
# Print a text summary of the retained components.
print("=" * 60)
print("BETWEEN-CLASS SCATTER ANALYSIS RESULTS")
print("=" * 60)

# Eigenvalue of each retained component
print(f"\nTop {k} eigenvalues:")
for rank, eigval in enumerate(eigenvalues, start=1):
    print(f"  Component {rank}: {eigval:.6f}")

# NOTE: the ratios are normalised by the sum of the eigenvalues held in
# `eigenvalues`, so the cumulative figure always ends at 100% for the
# components that were kept.
total_variance = eigenvalues.sum()
variance_ratios = eigenvalues / total_variance
print("\nVariance ratios (of between-class variance):")
for rank, ratio in enumerate(variance_ratios, start=1):
    print(f"  Component {rank}: {ratio:.4f} ({ratio*100:.2f}%)")

print("\nCumulative variance explained:")
cumsum = np.cumsum(variance_ratios)
for rank, cum in enumerate(cumsum, start=1):
    print(f"  Components 1-{rank}: {cum:.4f} ({cum*100:.2f}%)")

# Shape of every class after projection, in alphabetical order
print("\nProjected data shapes:")
for class_name in sorted(projected_data):
    print(f"  {class_name}: {projected_data[class_name].shape}")

In [None]:
# Map the stored pos_2 / pos_3 role vectors into the LDA subspace.
import sys
sys.path.append('..')
sys.path.append('../roles')
from plots import plot_pc

# Slice the configured layer from each group and convert to float32 NumPy arrays.
pos_2_vectors = torch.stack(pca_results['vectors']['pos_2'])[:, layer, :].to(dtype=torch.float32).numpy()
pos_3_vectors = torch.stack(pca_results['vectors']['pos_3'])[:, layer, :].to(dtype=torch.float32).numpy()

# pos_2 rows first, then pos_3 rows.
combined_vectors = np.concatenate([pos_2_vectors, pos_3_vectors], axis=0)

# Reuse the scaler fitted during the subspace search (when there is one).
combined_vectors_scaled = (
    combined_vectors if scaler is None else scaler.transform(combined_vectors)
)

# Project onto the LDA directions.
combined_vectors_projected = combined_vectors_scaled @ projection_matrix

print(f"Pos_2 vectors: {pos_2_vectors.shape}")
print(f"Pos_3 vectors: {pos_3_vectors.shape}")
print(f"Combined projected shape: {combined_vectors_projected.shape}")
print(f"Expected: ({len(pos_2_vectors) + len(pos_3_vectors)}, {k})")

In [None]:
# Create a results structure compatible with plot_pc
from types import SimpleNamespace

# Create role labels for both pos_2 and pos_3
pos_2_labels = [role.replace('_', ' ').title() + " (Somewhat RP)" for role in pca_results['roles']['pos_2']]
pos_3_labels = [role.replace('_', ' ').title() + " (Fully RP)" for role in pca_results['roles']['pos_3']]
role_labels = pos_2_labels + pos_3_labels

# Create a mock "LDA results" structure compatible with plot_pc
lda_results = {
    'pca_transformed': combined_vectors_projected,  # The projected vectors
    'variance_explained': eigenvalues / eigenvalues.sum(),  # Ratios over the retained eigenvalues
    # Stand-in for a fitted PCA object exposing `components_`. SimpleNamespace is
    # used instead of building a class with `type(...)`, because the configuration
    # cell binds the name `type` to a string, which makes that call fail.
    'pca': SimpleNamespace(
        components_=projection_matrix.T  # LDA components (transposed to match PCA format)
    ),
    'scaler': scaler,
    'roles': {
        'pos_2': pca_results['roles']['pos_2'],
        'pos_3': pca_results['roles']['pos_3']
    },
    'vectors': {
        'pos_2': pca_results['vectors']['pos_2'],
        'pos_3': pca_results['vectors']['pos_3']
    }
}

print(f"Created LDA results structure")
print(f"  Projected data shape: {lda_results['pca_transformed'].shape}")
print(f"  Components shape: {lda_results['pca'].components_.shape}")
print(f"  Number of roles: {len(role_labels)} ({len(pos_2_labels)} pos_2 + {len(pos_3_labels)} pos_3)")

In [None]:
# Draw one plot_pc figure per LDA component, mirroring the PCA notebook.
subtitle = f"{model_name}, Layer {layer} - LDA Between-Class Scatter Analysis"

# Arguments that are the same for every figure.
shared_plot_args = dict(
    pca_results=lda_results,
    role_labels=role_labels,
    layer=layer,
    assistant_activation=None,  # Optional: add if you have default vectors
    assistant_projection=None,  # Optional: add if you have default vectors
    subtitle=subtitle,
    scaled=False,  # Scaling was already applied when building the LDA projection
)

# Plot up to 6 components or k, whichever is smaller.
for component_idx in range(min(6, k)):
    fig = plot_pc(
        pc_component=component_idx,
        title=f"LDA Component {component_idx+1} - Role-Playing Vectors",
        **shared_plot_args,
    )
    fig.show()