# Compare contrast vectors

In [2]:
import torch
import os
import sys
import numpy as np

sys.path.append('.')
sys.path.append('..')

from utils.pca_utils import *
from plots import plot_pc


In [19]:
# Model whose saved role/default vectors are analysed; also selects the data directory below.
model_name = "qwen-3-32b"
# Layer whose previously saved PCA file is loaded first and that is marked on the similarity plot.
layer = 32
# Number of per-layer PCA result files iterated over in the cells below.
total_layers = 64
# Root directory holding this model's saved vectors and PCA results.
base_dir = f"/workspace/{model_name}"

In [20]:
# Load the previously saved PCA results for the configured layer (nothing is recomputed here).
# NOTE(review): weights_only=False unpickles arbitrary Python objects (the file contains
# scikit-learn estimators), so only load files from a trusted source.
role_results = torch.load(f"{base_dir}/roles_240/pca/layer{layer}_pos23.pt", weights_only=False)


Trying to unpickle estimator PCA from version 1.7.0 when using version 1.7.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Trying to unpickle estimator StandardScaler from version 1.7.0 when using version 1.7.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations



In [21]:
# Pull the per-position role vectors and role names out of the loaded results.
vectors_by_position = role_results['vectors']
roles_by_position = role_results['roles']
pos_2_vectors, pos_3_vectors = vectors_by_position['pos_2'], vectors_by_position['pos_3']
pos_2_roles, pos_3_roles = roles_by_position['pos_2'], roles_by_position['pos_3']

# Join both positions into one sequence, then stack into a single float32 tensor.
combined_vectors = pos_2_vectors + pos_3_vectors
float_stack_vectors = torch.stack(combined_vectors).float()
print(float_stack_vectors.shape)

torch.Size([463, 64, 5120])


In [24]:
# Fit a PCA on every layer of the stacked role vectors (fresh MeanScaler per layer)
# and save one result file per layer next to the existing PCA files.
pca_dir = f"{base_dir}/roles_240/pca"
pc1 = []
for i in range(float_stack_vectors.shape[1]):
    pca_transformed, variance_explained, n_components, pca, scaler = compute_pca(
        float_stack_vectors,
        layer=i,
        scaler=MeanScaler(),
    )

    # Same keys, in the same order, as the files read back by the later cells.
    results = {
        'layer': i,
        'roles': {'pos_2': pos_2_roles, 'pos_3': pos_3_roles},
        'vectors': {'pos_2': pos_2_vectors, 'pos_3': pos_3_vectors},
        'pca_transformed': pca_transformed,
        'variance_explained': variance_explained,
        'n_components': n_components,
        'pca': pca,
        'scaler': scaler,
    }

    # NOTE(review): this adds the scaler's mean to the first component, and the list is
    # rebuilt from the saved files in the following cell — confirm this is still needed.
    pc1.append(pca.components_[0] + scaler.mean)

    torch.save(results, f"{pca_dir}/layer{i}_mean_pos23.pt")

    

PCA fitted with 463 components
Cumulative variance for first 5 components: [0.5090459  0.65688956 0.7349645  0.7690813  0.79451   ]

PCA Analysis Results:
Elbow point at component: 2
Dimensions for 70% variance: 3
Dimensions for 80% variance: 6
Dimensions for 90% variance: 14
Dimensions for 95% variance: 31
PCA fitted with 463 components
Cumulative variance for first 5 components: [0.510143   0.64883673 0.7225447  0.7570902  0.7828112 ]

PCA Analysis Results:
Elbow point at component: 2
Dimensions for 70% variance: 3
Dimensions for 80% variance: 6
Dimensions for 90% variance: 16
Dimensions for 95% variance: 37
PCA fitted with 463 components
Cumulative variance for first 5 components: [0.5115164  0.64522505 0.7160474  0.7503392  0.77595663]

PCA Analysis Results:
Elbow point at component: 2
Dimensions for 70% variance: 3
Dimensions for 80% variance: 7
Dimensions for 90% variance: 17
Dimensions for 95% variance: 39
PCA fitted with 463 components
Cumulative variance for first 5 components

In [25]:
# Read the first principal component back from each saved per-layer PCA file.
pc1 = [
    torch.load(
        f"{base_dir}/roles_240/pca/layer{i}_mean_pos23.pt", weights_only=False
    )['pca'].components_[0]
    for i in range(total_layers)
]

In [26]:
# One row per layer: stack the per-layer PC1 vectors into a single 2-D array.
stacked_pc1 = np.asarray(pc1)
print(stacked_pc1.shape)

(64, 5120)


In [27]:
# Per-layer contrast vector: mean of the pos_3 role vectors minus the default activation.
pos_3_stack = torch.stack(pos_3_vectors)
role_all_layers = pos_3_stack.mean(dim=0)

default_vectors = torch.load(f"{base_dir}/roles_240/default_vectors.pt")
default_all_layers = default_vectors['activations']['default_1']

contrast_vector = role_all_layers - default_all_layers
print(contrast_vector.shape)

# float32 NumPy array used by the cosine-similarity cell.
np_contrast_vector = contrast_vector.float().numpy()




torch.Size([64, 5120])


In [28]:
# Row-wise cosine similarity between each layer's PC1 and that layer's contrast vector.
dot_products = (stacked_pc1 * np_contrast_vector).sum(axis=1)
pc1_norms = np.linalg.norm(stacked_pc1, axis=1)
contrast_norms = np.linalg.norm(np_contrast_vector, axis=1)
per_layer_similarities = dot_products / (pc1_norms * contrast_norms)
print(per_layer_similarities.shape)

# Only the magnitude is reported here; the sign is dropped.
abs_similarities = np.abs(per_layer_similarities)
print(abs_similarities)



(64,)
[0.8713679  0.86184615 0.8628778  0.88251275 0.8776989  0.8872892
 0.889757   0.9056756  0.90460914 0.8873945  0.8992397  0.9101598
 0.91102517 0.8957784  0.89122087 0.8892675  0.89538294 0.8881486
 0.8848059  0.8728626  0.87090135 0.87244505 0.84570557 0.8268829
 0.81682825 0.77523005 0.77641654 0.70484203 0.67770576 0.5977799
 0.599707   0.68441623 0.7102516  0.7143612  0.7199867  0.708858
 0.7198475  0.75353444 0.7841215  0.8202975  0.8310348  0.8453241
 0.8462699  0.85207593 0.87164193 0.85105324 0.8448051  0.85605454
 0.8373903  0.82600075 0.8389436  0.83153564 0.82764393 0.81286526
 0.78648096 0.7862624  0.8010314  0.82069236 0.837611   0.85203534
 0.85781085 0.86298865 0.84383047 0.86788213]


In [29]:
import plotly.graph_objects as go

# Line plot of the absolute PC1-vs-contrast cosine similarity for every layer,
# shown inline and also written out as a standalone HTML file.
similarity_plot_path = f'/root/git/plots/{model_name}/pc1/contrast_vector_cosine_similarity_nomean.html'

fig = go.Figure()

fig.add_trace(go.Scatter(
    # One x position per plotted value, so the axes cannot disagree in length.
    x=list(range(len(abs_similarities))),
    y=abs_similarities,
    mode='lines+markers',
    name='Cosine Similarity'
))

# Mark the layer whose PCA file was loaded at the top of the notebook.
fig.add_vline(x=layer, line_dash="dot", line_color="red", annotation_text=f"PCA originally on Layer {layer}", annotation_position="top right")

fig.update_layout(
    title={
        'text': 'Role PC1 vs Contrast Vector Per-Layer Cosine Similarity',
        'subtitle': {
            'text': f'{model_name.replace("-", " ").title()}',
        }
    },
    xaxis_title='Layer',
    yaxis_title='Absolute Cosine Similarity',
    height=500,
    width=800,
)

fig.show()

# Create the output directory first so write_html does not fail on a fresh machine
# (same pattern as the final save cell of this notebook).
os.makedirs(os.path.dirname(similarity_plot_path), exist_ok=True)
fig.write_html(similarity_plot_path)

In [None]:
def get_role_labels(pca_results):
    """Build display labels for the roles stored in a PCA results dict.

    Role names under ``pca_results['roles']['pos_2']`` are labelled
    "(Somewhat RP)" and those under ``'pos_3'`` "(Fully RP)". Underscores are
    replaced by spaces and the name is title-cased. A missing position key is
    skipped.

    Args:
        pca_results: Mapping with a ``'roles'`` entry that maps position keys
            to iterables of role-name strings.

    Returns:
        List of label strings, all ``pos_2`` labels first, then ``pos_3``.
    """
    suffix_by_position = (
        ('pos_2', 'Somewhat RP'),
        ('pos_3', 'Fully RP'),
    )
    roles_by_position = pca_results['roles']

    labels = []
    for position, suffix in suffix_by_position:
        if position not in roles_by_position.keys():
            continue
        for role in roles_by_position[position]:
            pretty_name = role.replace('_', ' ').title()
            labels.append(f"{pretty_name} ({suffix})")
    return labels

In [None]:
# plot PCs at different layers
#
# Plots every (total_layers // 4)-th layer plus the final layer. The layer count comes
# from `total_layers` in the configuration cell: it has to match the number of saved
# layer{i}_mean_pos23.pt files, so it is deliberately not overridden here.
plot_dir = f"/root/git/plots/{model_name}/pc1"
os.makedirs(plot_dir, exist_ok=True)  # write_html does not create missing directories

layer_step = max(1, total_layers // 4)  # avoid a zero step when total_layers < 4
layers_to_plot = list(range(0, total_layers, layer_step))
if total_layers > 0 and layers_to_plot[-1] != total_layers - 1:
    layers_to_plot.append(total_layers - 1)

for i in layers_to_plot:
    layer_results = torch.load(f"{base_dir}/roles_240/pca/layer{i}_mean_pos23.pt", weights_only=False)
    subtitle = f"{model_name.replace('-', ' ').title()}, Layer {i} - Mean-Centered"

    # Project this layer's default activation with the scaler and PCA stored in the file.
    assistant_layer_activation = default_all_layers[i].float().reshape(1, -1)
    asst_scaled = layer_results['scaler'].transform(assistant_layer_activation)
    asst_projected = layer_results['pca'].transform(asst_scaled)

    fig = plot_pc(
        pca_results=layer_results,
        role_labels=get_role_labels(layer_results),
        layer=i,
        pc_component=0,
        # NOTE(review): passes the all-layer tensor rather than the per-layer slice —
        # confirm this is what plot_pc expects.
        assistant_activation=default_all_layers,
        assistant_projection=asst_projected[0],
        title="PCA on Role-Playing Vectors",
        subtitle=subtitle,
    )
    fig.show()
    fig.write_html(f"{plot_dir}/layer{i}.html")


In [None]:
# save all the PC1 vectors, need to flip them so they align with contrast vector
#
# For each layer, PC1 is negated when its cosine similarity with that layer's contrast
# vector is negative. Loop-local names (`layer_pc1`, `layer_contrast`) are used so the
# `pc1` list and the `contrast_vector` tensor built in earlier cells are not overwritten.

contrast_vectors = torch.load(f"/workspace/{model_name}/capped/configs/multi_contrast_vectors.pt", weights_only=False)


pc1_vectors = []

for i in range(total_layers):
    layer_results = torch.load(f"{base_dir}/roles_240/pca/layer{i}_mean_pos23.pt", weights_only=False)
    layer_pc1 = layer_results['pca'].components_[0]

    layer_contrast = contrast_vectors[i]['vector']

    # Cast both sides to float32 so the comparison does not depend on the stored dtypes.
    similarity = torch.nn.functional.cosine_similarity(
        torch.from_numpy(layer_pc1).float().reshape(1, -1),
        layer_contrast.float().reshape(1, -1),
    )
    if similarity.item() < 0:
        layer_pc1 = -layer_pc1

    pc1_vectors.append(layer_pc1)




In [37]:
# Sanity check: size of the layer-0 PC1 vector once converted to a tensor
# (expected to match the last dimension of the stacked role vectors).
print(torch.from_numpy(pc1_vectors[0]).shape)

torch.Size([4608])


In [38]:
# Package one record per layer (sign-aligned PC1 as a tensor) and save the list to disk.
vectors = [
    {
        'scaler': None,
        'name': f"layer_{layer_idx}/role_pc1_mean_pos23",
        'vector': torch.from_numpy(pc1_vectors[layer_idx]),
        'layer': layer_idx,
    }
    for layer_idx in range(total_layers)
]

outfile = f"{base_dir}/evals/configs/multi_pc1_vectors.pt"
# Make sure the target directory exists before writing.
os.makedirs(os.path.dirname(outfile), exist_ok=True)
torch.save(vectors, outfile)