## Between Class Scatter (LDA)

In [1]:
import os
import sys
import json
import torch
import numpy as np
import pandas as pd

sys.path.append('.')
sys.path.append('..')

from utils.pca_utils import L2MeanScaler, MeanScaler, compute_pca
from plots import plot_pc

## Configuration

In [22]:
# Configuration - Change these parameters for different models/datasets
# Later cells read these module-level names directly, so edit them here only.
model_name = "llama-3.3-70b"
base_dir = f"/workspace/{model_name}"
# NOTE(review): `type` and `dir` shadow Python builtins for the rest of the
# session. Later cells reference these exact names, so a rename has to be
# applied to every cell at once.
type = "roles_240"
dir = f"{base_dir}/{type}"
# Layer index used by later cells to slice the stored activations.
layer = 40

# Directory that receives the exported HTML plots; created if missing.
out_dir = f"/root/git/plots/{model_name}/roles/lda"
os.makedirs(out_dir, exist_ok=True)

In [23]:
# Scaler variant of the pre-computed LDA file to load; must also be a key of
# `scaler_names` below, which is indexed with it when building the subtitle.
scaler = "mean"
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects, so only point this at files from a trusted source.
lda_results = torch.load(f"{dir}/lda/layer{layer}_{scaler}_pos23.pt", weights_only=False)

# Human-readable description of each scaler option, used in the plot subtitle.
scaler_names = {
    "standard": "Mean-Centered and Divided by Standard Deviation",
    "l2mean": "Mean-Centered and L2-Normalized",
    "mean": "Mean-Centered",
    "none": "Not Scaled"
}
# Subtitle shared by every plot: prettified model name, layer, and scaling.
subtitle = f"{model_name.replace('-', ' ').title()}, Layer {layer} - Activations {scaler_names[scaler]}"

In [24]:
# Sanity check: shape of the projected mean vectors stored in the LDA file.
print(lda_results['mean_vectors_projected'].shape)

(377, 465)


In [25]:
def get_role_labels(pca_results):
    """Build display labels for the roles listed in ``pca_results['roles']``.

    Each role name has its underscores replaced by spaces and is title-cased,
    then tagged with its group: ``pos_2`` roles get "(Somewhat RP)" and
    ``pos_3`` roles get "(Fully RP)". All ``pos_2`` labels precede the
    ``pos_3`` labels; a group that is absent contributes no labels.

    Args:
        pca_results: Mapping with a ``'roles'`` entry that maps group keys
            (``'pos_2'``, ``'pos_3'``) to sequences of role-name strings.

    Returns:
        A flat list of label strings.
    """
    roles_by_group = pca_results['roles']
    tagged_groups = (('pos_2', 'Somewhat RP'), ('pos_3', 'Fully RP'))

    labels = []
    for group, tag in tagged_groups:
        if group not in roles_by_group:
            continue
        labels.extend(
            f"{name.replace('_', ' ').title()} ({tag})"
            for name in roles_by_group[group]
        )
    return labels

In [26]:
# Load PCA results to get original raw vectors
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects, so only load files from a trusted source.
pca_results = torch.load(f"{dir}/pca/layer{layer}_pos23.pt", weights_only=False)

# Define roles to filter out
# Names listed here are dropped by filter_roles further down in this cell;
# an empty list keeps every role.
roles_to_exclude = []

# Filter roles and get indices to keep
def filter_roles(roles_dict, exclude_list):
    """Drop excluded roles and report which flat positions survive.

    Positions are counted across the ``pos_2`` roles followed by the
    ``pos_3`` roles, including the excluded ones, so the returned indices can
    be used to select rows from an array laid out in that same order.

    Args:
        roles_dict: Mapping from group key (``'pos_2'``, ``'pos_3'``) to a
            sequence of role names. Other keys are ignored.
        exclude_list: Role names to remove.

    Returns:
        A ``(filtered, indices_to_keep)`` tuple: ``filtered`` maps each group
        present in ``roles_dict`` to its remaining roles, and
        ``indices_to_keep`` lists the flat positions of those roles.
    """
    kept_roles = {}
    kept_indices = []
    offset = 0  # flat position of the first role in the current group

    for group in ('pos_2', 'pos_3'):
        if group not in roles_dict:
            continue
        group_roles = list(roles_dict[group])
        survivors = [
            (offset + position, name)
            for position, name in enumerate(group_roles)
            if name not in exclude_list
        ]
        kept_roles[group] = [name for _, name in survivors]
        kept_indices.extend(index for index, _ in survivors)
        offset += len(group_roles)

    return kept_roles, kept_indices

# Apply the exclusion list to the role names stored with the LDA results.
filtered_roles, indices_to_keep = filter_roles(lda_results['roles'], roles_to_exclude)

# Report group sizes before and after filtering.
print(
    "Original roles: "
    f"{len(lda_results['roles']['pos_2'])} pos_2 + "
    f"{len(lda_results['roles']['pos_3'])} pos_3 = "
    f"{len(lda_results['roles']['pos_2']) + len(lda_results['roles']['pos_3'])} total"
)
print(
    "Filtered roles: "
    f"{len(filtered_roles['pos_2'])} pos_2 + "
    f"{len(filtered_roles['pos_3'])} pos_3 = "
    f"{len(indices_to_keep)} total"
)
print(f"Excluded: {roles_to_exclude}")

# Raw mean vectors at the selected layer, taken from the PCA file (before any
# transformation), stacked with the pos_2 rows first and the pos_3 rows after.
pos_2_vectors = torch.stack(pca_results['vectors']['pos_2'])[:, layer, :].float().numpy()
pos_3_vectors = torch.stack(pca_results['vectors']['pos_3'])[:, layer, :].float().numpy()
combined_raw_vectors = np.concatenate((pos_2_vectors, pos_3_vectors), axis=0)

# Keep only the rows that survived role filtering, in both the raw vectors
# and the pre-computed LDA projections.
filtered_projected = lda_results['mean_vectors_projected'][indices_to_keep]
filtered_vectors = combined_raw_vectors[indices_to_keep]

print(f"\nFiltered vectors shape: {filtered_vectors.shape}")
print(f"Filtered projected shape: {filtered_projected.shape}")

# Create a simple object to hold the projection matrix (components)
class LDAComponents:
    """Minimal holder that exposes a projection matrix as ``components_``.

    Args:
        components: The projection matrix to expose. It is stored as given,
            without copying or validation.
    """

    def __init__(self, components):
        self.components_ = components

    def __repr__(self):
        # Show the matrix shape when it has one, so printing a dict that
        # contains this object stays short and informative.
        shape = getattr(self.components_, "shape", None)
        if shape is not None:
            detail = f"shape={tuple(shape)}"
        else:
            detail = f"components_={self.components_!r}"
        return f"{type(self).__name__}({detail})"

# Combine LDA results with original vectors for plotting.
# The dict is later passed to plot_pc as its `pca_results` argument, so the
# LDA data is stored under PCA-style keys.
lda_results_for_plot = {
    'pca_transformed': filtered_projected,  # Pre-computed projections (filtered)
    'variance_explained': lda_results['variance_explained'],
    'pca': LDAComponents(lda_results['projection_matrix']),  # Projection matrix [activation_dim, n_components]
    'scaler': lda_results['scaler'],
    'roles': filtered_roles,  # Use filtered roles
    'vectors': filtered_vectors,  # Raw activation vectors (filtered) [n_samples, activation_dim]
}

# Generate filtered role labels (get_role_labels only reads the 'roles' entry).
role_labels = get_role_labels(lda_results_for_plot)

# Summary of what will be plotted. The first message is a plain string
# because it has no placeholders.
print("\nLDA results prepared for plotting")
print(f"  Raw vectors shape: {lda_results_for_plot['vectors'].shape}")
print(f"  Projected shape: {lda_results_for_plot['pca_transformed'].shape}")
print(f"  Projection matrix shape: {lda_results_for_plot['pca'].components_.shape}")
print(f"  Variance explained: {len(lda_results_for_plot['variance_explained'])} components")
print(f"  Number of role labels: {len(role_labels)}")

Original roles: 102 pos_2 + 275 pos_3 = 377 total
Filtered roles: 102 pos_2 + 275 pos_3 = 377 total
Excluded: []

Filtered vectors shape: (377, 8192)
Filtered projected shape: (377, 465)

LDA results prepared for plotting
  Raw vectors shape: (377, 8192)
  Projected shape: (377, 465)
  Projection matrix shape: (8192, 465)
  Variance explained: 465 components
  Number of role labels: 377



Trying to unpickle estimator PCA from version 1.7.0 when using version 1.7.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Trying to unpickle estimator StandardScaler from version 1.7.0 when using version 1.7.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations



In [27]:
# Load the stored default activations from the dataset directory.
# NOTE(review): unlike the other torch.load calls in this notebook, this one
# does not pass weights_only; the effective default depends on the installed
# torch version — confirm this is intended.
default_vectors = torch.load(f"{dir}/default_vectors.pt")
# Take the 'default_1' activation at the selected layer as a float tensor
# with a single row.
assistant_layer_activation = default_vectors['activations']['default_1'][layer, :].float().reshape(1, -1)

# Apply the scaler stored with the LDA results, then project the scaled row
# onto the LDA components.
asst_scaled = lda_results_for_plot['scaler'].transform(assistant_layer_activation)
asst_projected = asst_scaled @ lda_results_for_plot['pca'].components_

# Rebind to a NumPy array only after the projection above has used the
# tensor; the plotting cell passes this name to plot_pc.
assistant_layer_activation = assistant_layer_activation.numpy()

In [28]:
# Sanity check: shape of the projected assistant activation.
print(asst_projected.shape)

(1, 465)


In [29]:
# Plot the LDA components using plot_pc
# This shows both cosine similarity and projection

# Plot up to 6 components (fewer if the LDA result has fewer).
n_components_to_plot = min(6, len(lda_results['variance_explained']))

for i in range(n_components_to_plot):
    fig = plot_pc(
        pca_results=lda_results_for_plot,  # Use the combined structure
        role_labels=role_labels,
        pc_component=i,
        assistant_activation=assistant_layer_activation,  # Optional: add raw assistant activation
        assistant_projection=asst_projected.squeeze(),
        title="LDA on Role-Playing Activations",  # plain string: no placeholders
        subtitle=subtitle,
        scaled=True  # Apply the scaler in the plotting function
    )
    fig.show()
    # One standalone HTML file per component, named by index and scaler.
    fig.write_html(f"{out_dir}/ld{i}_{scaler}.html")