# Finding other methods for getting role PC1

In [1]:
# let's compare role PC1 to some of the vectors similar to it

import os
import sys
import torch
import numpy as np
import pandas as pd
import torch.nn.functional as F

sys.path.append('.')
sys.path.append('..')


In [2]:
# Configuration: model, layer, and the on-disk directory every later cell reads from.
model_name = "gemma-2-27b"
layer = 22  # used in the PCA file name and to index the layer axis of the activation tensors

base_dir = f"/workspace/{model_name}"
# NOTE(review): `type` and `dir` shadow the Python builtins of the same name for the
# rest of the kernel session. Later cells read `dir` directly, so renaming them
# requires updating every cell that builds a path from it.
type = "roles_240"
dir = f"{base_dir}/{type}"

In [3]:
# Load the saved PCA results for the configured layer.
# NOTE: weights_only=False makes torch.load unpickle arbitrary Python objects (the
# file holds a full object under 'pca', not just tensors), which can execute code
# on load -- only point this at files you trust.
pca_results = torch.load(f"{dir}/pca/layer{layer}_pos23.pt", weights_only=False)

In [4]:
# First principal component: row 0 of the stored PCA object's `components_`.
pc1 = pca_results['pca'].components_[0]
# Stored projections; indexed as [:, pc_index] below.
# Presumably one row per role label, in the same order as the labels -- TODO confirm.
pca_transformed = pca_results['pca_transformed']

In [5]:
def get_role_labels(pca_results):
    """Collect the role labels stored under ``pca_results['roles']``.

    Labels are taken from the 'pos_2' group first and the 'pos_3' group
    second; a group that is absent is skipped. Returns a new list.
    """
    role_groups = pca_results['roles']
    return [
        label
        for group in ('pos_2', 'pos_3')
        if group in role_groups
        for label in role_groups[group]
    ]

def get_trait_labels(pca_results):
    """Collect the trait labels stored under ``pca_results['traits']``.

    Labels are taken from the 'pos_neg' group first and the 'pos_neg_50'
    group second; a group that is absent is skipped. Returns a new list.
    """
    trait_groups = pca_results['traits']
    collected = []
    for group in ('pos_neg', 'pos_neg_50'):
        if group not in trait_groups:
            continue
        collected += list(trait_groups[group])
    return collected

# Role labels in the order get_role_labels produces them ('pos_2' group, then 'pos_3').
labels = get_role_labels(pca_results)
print(len(labels))

448


In [31]:
def sorted_by_pc(pc_index):
    """Return the role labels with their projection on one principal component.

    Reads the notebook-level ``labels``, ``pca_transformed`` and ``pca_results``.

    Args:
        pc_index: Column of ``pca_transformed`` to use (0 for PC1).

    Returns:
        DataFrame with columns ``label``, ``projection`` and ``score`` (the
        role group, 'pos_2' or 'pos_3', that the row came from), sorted by
        ``projection`` in ascending order.
    """
    roles = pca_results['roles']
    # Tag rows group by group, skipping absent groups the same way
    # get_role_labels does when it builds `labels`. Indexing both groups
    # unconditionally raised KeyError when only one of them was present.
    score = []
    for group in ('pos_2', 'pos_3'):
        if group in roles:
            score.extend([group] * len(roles[group]))

    df = pd.DataFrame({
        "label": labels,
        "projection": pca_transformed[:, pc_index],
        "score": score,
    })
    return df.sort_values(by="projection", ascending=True)

In [32]:
# sorted_by_pc sorts ascending, so head(5) picks the five rows with the SMALLEST
# PC1 projection -- "top" here means that end of the axis, not the largest values.
pc_df = sorted_by_pc(0)
top_roles = pc_df['label'].head(5).tolist()

print(top_roles)

['caveman', 'eldritch', 'leviathan', 'void', 'aberration']


In [37]:
# Collect one activation tensor per selected role.
top_role_activations = []
for role in top_roles:
    # Load the saved vectors for this role (weights_only=False unpickles arbitrary
    # objects -- trusted files only).
    obj = torch.load(f"{dir}/vectors/{role}.pt", weights_only=False)
    # NOTE(review): the 'pos_3' entry is always used, even if the label in
    # `top_roles` came from a row of the 'pos_2' group -- confirm this is intended.
    top_role_activations.append(obj['pos_3'])

# Stack, then average over the role axis. The name is rebound from a list to a tensor here.
top_role_activations = torch.stack(top_role_activations) # roles, layers, hidden dim
avg_top_role_activations = torch.mean(top_role_activations, dim=0) # layers, hidden dim


In [39]:
# Cosine similarity between PC1 and the raw (un-contrasted) activations at `layer`.
pc1_torch = torch.tensor(pc1)
# One similarity per role: slice out `layer`, compare along the hidden dimension.
top_role_sims = F.cosine_similarity(top_role_activations[:, layer, :], pc1_torch, dim=1)
# Single similarity for the role-averaged activation at `layer`.
avg_top_role_sims = F.cosine_similarity(avg_top_role_activations[layer, :], pc1_torch, dim=0)

print(top_role_sims, avg_top_role_sims)

tensor([-0.0131, -0.0100, -0.0101, -0.0065, -0.0060], dtype=torch.float64) tensor(-0.0091, dtype=torch.float64)


In [48]:
# Load the default-assistant activations; they are subtracted from the top role
# activations in the next cell.
# (weights_only=False unpickles arbitrary objects -- trusted files only.)
default_acts = torch.load(f"{dir}/default_vectors.pt", weights_only=False)
# Keep only the 'default_1' entry. NOTE: this rebinds `default_acts` from the loaded
# object to that entry, so re-running this line alone fails -- re-run the load too.
default_acts = default_acts['activations']['default_1']

In [49]:
# Contrast vectors: role activation minus the default-assistant activation.
# Assumes `default_acts` broadcasts against both (roles, layers, hidden) and
# (layers, hidden), e.g. has shape (layers, hidden) -- TODO confirm.
contrast_top_role_activations = top_role_activations - default_acts
contrast_avg_top_role_activations = avg_top_role_activations - default_acts

In [51]:
# Same cosine-similarity comparison with PC1 at `layer`, now on the contrast vectors.
# NOTE: this overwrites `top_role_sims` / `avg_top_role_sims` from the raw-activation
# cell, so the earlier values are gone after this cell runs.
top_role_sims = F.cosine_similarity(contrast_top_role_activations[:, layer, :], pc1_torch, dim=1)
avg_top_role_sims = F.cosine_similarity(contrast_avg_top_role_activations[layer, :], pc1_torch, dim=0)

print(top_role_sims, avg_top_role_sims)

tensor([-0.3795, -0.4868, -0.4699, -0.4968, -0.4903], dtype=torch.float64) tensor(-0.5014, dtype=torch.float64)
