# PCA on traits

In [1]:
import sys
import os
import torch
import pandas as pd
sys.path.append('.')
sys.path.append('..')

from utils.pca_utils import *
from plots import *

## Configuration

In [7]:
# Configuration - change these parameters for different models/datasets.
# NOTE(review): `type` and `dir` shadow Python builtins; the names are kept
# because later cells in this notebook read them.
model_name = "Gemma-2-27B"
layer = 22

base_dir = "/workspace/gemma-2-27b"
type = "traits_240"
# Run directory for this trait set: <base_dir>/<type>_base
dir = "/".join([base_dir, type + "_base"])

## Load vectors

In [3]:
# Load every saved trait vector file (*.pt) from the run's vectors directory.
vector_dir = f"{dir}/vectors"

# os.listdir returns entries in arbitrary order. Sorting the names makes the
# insertion order of `vectors` (and anything that iterates it afterwards)
# reproducible across machines and reruns.
vectors = {}
for file in sorted(os.listdir(vector_dir)):
    trait_name, extension = os.path.splitext(file)
    if extension == ".pt":
        # splitext strips only the trailing ".pt"; str.replace(".pt", "")
        # would also remove a ".pt" appearing elsewhere in the file name.
        vectors[trait_name] = torch.load(os.path.join(vector_dir, file))

print(f"Found {len(vectors)} traits with vectors")

Found 240 traits with vectors


In [4]:
# Load the default vectors file stored under the roles_240_base directory.
default_vectors_path = f"{base_dir}/roles_240_base/default_vectors.pt"
default_vectors = torch.load(default_vectors_path)

In [5]:
# Inspect the key layout of one trait entry and of the default vectors file.
zealous_entry = vectors['zealous']
print(zealous_entry.keys())

print(default_vectors.keys())

default_activations = default_vectors['activations']
print(default_activations.keys())

dict_keys(['pos_neg', 'pos_neg_50', 'pos_default', 'pos_default_50', 'pos_70', 'pos_40_70'])
dict_keys(['activations', 'metadata'])
dict_keys(['pos_1', 'default_1', 'all_1'])


## PCA

In [9]:
# Read the pos_neg statistics table for this model / trait set, indexed by
# the 'trait' column.
results_subdir = model_name.lower()
stats_path = f"./results/{results_subdir}/{type}/pos_neg.csv"
stats = pd.read_csv(stats_path, index_col='trait')

# Spot-check a single trait's large_diff_count value.
zealous_stats = stats.loc['zealous']
print(zealous_stats['large_diff_count'])

1185.0


In [10]:
# Collect the pos_neg_50 vector of every trait whose large_diff_count in
# `stats` is at least 10; traits below that threshold are reported and skipped.
filtered_pos_neg_50_traits = []
filtered_pos_neg_50 = []

for trait_name, trait_vectors in vectors.items():
    diff_count = stats.loc[trait_name]['large_diff_count']
    # `not (x >= 10)` rather than `x < 10` so a NaN count is skipped too,
    # exactly as a failed `>= 10` check would.
    if not diff_count >= 10:
        print(f"Skipping {trait_name} because large_diff_count is {diff_count}")
        continue
    filtered_pos_neg_50_traits.append(trait_name)
    filtered_pos_neg_50.append(trait_vectors['pos_neg_50'])

print(len(filtered_pos_neg_50_traits))

Skipping vindictive because large_diff_count is 1.0
239


In [11]:
# Stack the kept trait vectors along a new leading dimension and cast the
# result to float32.
stacked_vectors = torch.stack(filtered_pos_neg_50)
float_stack_vectors = stacked_vectors.to(torch.float32)
print(float_stack_vectors.shape)


torch.Size([239, 46, 4608])


In [12]:
# Fit PCA on the stacked trait vectors for the configured layer.
# compute_pca is not defined in this notebook; presumably it comes from the
# utils.pca_utils star import -- TODO confirm.
pca_outputs = compute_pca(float_stack_vectors, layer)
pca_transformed, variance_explained, n_components, pca, scaler = pca_outputs

PCA fitted with 239 components
Cumulative variance for first 5 components: [0.28675105 0.43554298 0.53176326 0.60356735 0.6427829 ]

PCA Analysis Results:
Elbow point at component: 2
Dimensions for 70% variance: 7
Dimensions for 80% variance: 13
Dimensions for 90% variance: 28
Dimensions for 95% variance: 52


In [9]:
# Bundle the PCA outputs together with the traits and vectors they were fit
# on, then save the bundle under the run's pca directory.
results = {
    'layer': layer,
    'traits': {'pos_neg_50': filtered_pos_neg_50_traits},
    'vectors': {'pos_neg_50': filtered_pos_neg_50},
    'pca_transformed': pca_transformed,
    'variance_explained': variance_explained,
    'n_components': n_components,
    'pca': pca,
    'scaler': scaler,
}

pca_dir = f"{dir}/pca"
os.makedirs(pca_dir, exist_ok=True)
output_path = f"{pca_dir}/layer{layer}_pos-neg50.pt"
torch.save(results, output_path)

In [8]:
# Run and save PCA with normalized vectors: rescale each vector to unit L2
# norm along the last dimension.
# torch.nn.functional is reached through the explicit `torch` import; a bare
# `F` is not imported in the notebook's import cell and would only resolve
# through a star import or leftover kernel state.
normalized_vectors = torch.nn.functional.normalize(float_stack_vectors, p=2, dim=-1)

# Sanity check: after normalization the norms should have mean ~1 and std ~0.
norms = normalized_vectors.norm(p=2, dim=-1)
print(norms.mean().item(), norms.std().item())

1.0000009536743164 3.5813638987747254e-07


In [9]:
# Refit PCA on the unit-normalized vectors, passing scaler=False to compute_pca.
# The outputs are bound to the same names as the earlier un-normalized fit,
# so those earlier values are replaced from this point on.
normalized_pca_outputs = compute_pca(normalized_vectors, layer, scaler=False)
pca_transformed, variance_explained, n_components, pca, scaler = normalized_pca_outputs

PCA fitted with 240 components
Cumulative variance for first 5 components: [0.23673316 0.37392849 0.4664077  0.54179005 0.60332086]

PCA Analysis Results:
Elbow point at component: 2
Dimensions for 70% variance: 9
Dimensions for 80% variance: 16
Dimensions for 90% variance: 35
Dimensions for 95% variance: 62


In [13]:
# Bundle the current PCA outputs and save them under the "normalized" file name.
# NOTE(review): 'vectors' stores filtered_pos_neg_50 (the un-normalized list),
# not normalized_vectors -- confirm that is what downstream readers expect.
results = {
    'layer': layer,
    'traits': {'pos_neg_50': filtered_pos_neg_50_traits},
    'vectors': {'pos_neg_50': filtered_pos_neg_50},
    'pca_transformed': pca_transformed,
    'variance_explained': variance_explained,
    'n_components': n_components,
    'pca': pca,
    'scaler': scaler,
}

pca_dir = f"{dir}/pca"
os.makedirs(pca_dir, exist_ok=True)
output_path = f"{pca_dir}/layer{layer}_normalized_pos-neg50.pt"
torch.save(results, output_path)

## Plots

In [14]:
# Directory that receives the exported plots.
plot_dir = f"./results/{model_name.lower()}/pca_240_base"
os.makedirs(plot_dir, exist_ok=True)

# Switch between the unit-normalized and the un-normalized saved PCA run.
normalized = False
pca_results_path = (
    f"{dir}/pca/layer{layer}_normalized_pos-neg50.pt"
    if normalized
    else f"{dir}/pca/layer{layer}_pos-neg50.pt"
)
# weights_only=False performs full unpickling, which can execute arbitrary
# code -- only load files produced by a trusted run of this notebook.
pca_results = torch.load(pca_results_path, weights_only=False)


In [15]:
# Load the default vectors used by the plotting cells.
default_vectors_path = f"{base_dir}/roles_240_base/default_vectors.pt"
default_vectors = torch.load(default_vectors_path)

In [16]:
# Get the default activation at the configured layer and project it into PCA
# space, as a single-row float32 matrix.
assistant_layer_tensor = default_vectors['activations']['default_1'][layer, :].float().reshape(1, -1)
if not normalized:
    # Un-normalized run: apply the saved scaler before the PCA projection.
    asst_scaled = pca_results['scaler'].transform(assistant_layer_tensor.numpy())
    asst_projected = pca_results['pca'].transform(asst_scaled)
else:
    # Normalized run: rescale to unit L2 norm, then project. torch.nn.functional
    # is reached through the explicit `torch` import; a bare `F` is not imported
    # in the notebook's import cell.
    asst_normalized = torch.nn.functional.normalize(assistant_layer_tensor, p=2, dim=-1)
    asst_projected = pca_results['pca'].transform(asst_normalized.numpy())

# Keep the layer activation available as a NumPy array under its original name.
assistant_layer_activation = assistant_layer_tensor.numpy()

In [17]:
# Display labels for the saved traits: underscores become spaces, then
# str.capitalize() is applied to each name.
trait_labels = []
for trait_name in pca_results['traits']['pos_neg_50']:
    trait_labels.append(trait_name.replace('_', ' ').capitalize())

In [19]:
# Plot the first five principal components and export each figure as HTML.
# NOTE(review): only the un-normalized subtitle says "Base"; confirm whether
# the normalized subtitle should match.
display_name = model_name.replace('-', ' ')
subtitle = (
    f"{display_name}, Layer {layer} - Shared Question Set, Unit Normalized Vectors"
    if normalized
    else f"{display_name} Base, Layer {layer} - Shared Question Set, Mean-Centered and Scaled Vectors"
)
# Normalized runs get a file-name suffix so the two runs do not collide.
html_suffix = "_normalized" if normalized else ""

for pc_index in range(5):
    fig = plot_pc(
        pca_results=pca_results,
        trait_labels=trait_labels,
        layer=layer,
        pc_component=pc_index,
        assistant_activation=default_vectors['activations']['default_1'],
        assistant_projection=asst_projected[0],
        subtitle=subtitle,
        scaled=not normalized,
    )
    fig.show()
    # File names are 1-based: pc1.html ... pc5.html
    fig.write_html(f"{plot_dir}/pc{pc_index + 1}{html_suffix}.html")

In [8]:
# for i in range(10):
#     fig = plot_pca_cosine_similarity(
#             pca_results=pca_results,
#             trait_labels=trait_labels,
#             pc_component=i,
#             layer=layer,
#             assistant_activation=default_vectors['activations']['default_1'],
#             subtitle=f"{model_name.replace('-', ' ')}, Layer {layer} - Shared Question Set"
#         )
#     fig.show()
#     fig.write_html(f"{plot_dir}/pc{i+1}_cossim.html")

In [9]:
# for i in range(10):
#     fig = plot_pca_projection(
#             pca_results=pca_results,
#             trait_labels=trait_labels,
#             pc_component=i,
#             assistant_activation=asst_projected[0],
#             subtitle=f"{model_name.replace('-', ' ')}, Layer {layer} - Shared Question Set"
#         )
#     fig.show()
#     fig.write_html(f"{plot_dir}/pc{i+1}_projection.html")

In [10]:
# 3D PCA plot of the trait vectors together with the projected default activation.
fig_3d = plot_3d_pca(
    pca_results['pca_transformed'],
    pca_results['variance_explained'],
    trait_labels,
    assistant_projection=asst_projected[0],
    title="Trait Vectors in 3D Principal Component Space",
    subtitle=f"{model_name.replace('-', ' ')}, Layer {layer} - Shared Question Set"
)
fig_3d.show()

# Suffix the file name for the normalized run, matching the per-component
# exports (pcN.html vs pcN_normalized.html), so a normalized run does not
# overwrite the un-normalized 3D plot. The un-normalized name is unchanged.
pca_3d_suffix = "_normalized" if normalized else ""
fig_3d.write_html(f"{plot_dir}/pca_3d{pca_3d_suffix}.html")