# Compare base and instruct vectors

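* Load the instruct-model and base-model vectors saved with the PCA results
* Measure the cosine similarity between each matching pair of vectors
* Print the entries with the highest and lowest similarity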

In [53]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch

In [75]:
# Run configuration read by every later cell.
# NOTE: `type` shadows the Python builtin of the same name for the rest of the
# notebook; the name is kept because the later cells reference it.
type = "traits_240"  # dataset to compare: "roles_240" or "traits_240"
model = "gemma-2-27b"  # appears in the /workspace input paths and the ./results output path
layer = 22  # appears in the input file names and indexes the stacked vectors

# Fail fast: later cells branch on `type` with if/elif only, so an unsupported
# value would otherwise leave their variables undefined (or stale from a previous run).
if type not in ("roles_240", "traits_240"):
    raise ValueError(f"Unsupported type {type!r}; expected 'roles_240' or 'traits_240'")

In [76]:
# Resolve the instruct/base result files for the configured dataset.
if type == "roles_240":
    instruct_path = f"/workspace/{model}/new_{type}/pca/layer{layer}_pos23.pt"
    base_path = f"/workspace/{model}/new_{type}_base/pca/layer{layer}_pos23.pt"
elif type == "traits_240":
    instruct_path = f"/workspace/{model}/{type}/pca/layer{layer}_pos-neg50.pt"
    base_path = f"/workspace/{model}/{type}_base/pca/layer{layer}_pos-neg50.pt"
else:
    # Without this branch an unknown `type` silently skips the load and later
    # cells fail (or reuse stale results) far from the real cause.
    raise ValueError(f"Unsupported type {type!r}; expected 'roles_240' or 'traits_240'")

# NOTE(security): weights_only=False unpickles arbitrary Python objects, so
# these files must come from a trusted source.
instruct_results = torch.load(instruct_path, weights_only=False)
base_results = torch.load(base_path, weights_only=False)


## Compare vectors

In [77]:
# Choose which entries of results["vectors"] hold the vectors for this dataset.
if type == "roles_240":
    vector_keys = ("pos_2", "pos_3")
elif type == "traits_240":
    vector_keys = ("pos_neg_50",)
else:
    # Without this branch an unknown `type` leaves the vectors undefined or stale.
    raise ValueError(f"Unsupported type {type!r}; expected 'roles_240' or 'traits_240'")

# Concatenate the selected groups in key order, stack them into one tensor,
# cast to float32 and keep only index `layer` along the second axis.
# NOTE(review): the second axis is presumably the layer axis — confirm against
# the code that wrote these files.
instruct_vectors = torch.stack(
    [vec for key in vector_keys for vec in instruct_results["vectors"][key]]
).float()[:, layer, :]
base_vectors = torch.stack(
    [vec for key in vector_keys for vec in base_results["vectors"][key]]
).float()[:, layer, :]

print(instruct_vectors.shape)
print(base_vectors.shape)


torch.Size([239, 4608])
torch.Size([239, 4608])


In [78]:
def get_role_labels(pca_results):
    """Return the role labels under 'pos_2' followed by those under 'pos_3'.

    Either key may be absent from ``pca_results['roles']``; a missing key
    simply contributes no labels. A new list is returned on every call.
    """
    roles_by_score = pca_results['roles']
    return [
        role
        for score_key in ('pos_2', 'pos_3')
        if score_key in roles_by_score
        for role in roles_by_score[score_key]
    ]

def get_trait_labels(pca_results):
    """Return the trait labels stored under ``pca_results['traits']['pos_neg_50']``.

    The stored object itself is returned, not a copy.
    """
    trait_groups = pca_results['traits']
    return trait_groups['pos_neg_50']

# Pick the label extractor that matches the configured dataset.
if type == "roles_240":
    get_labels = get_role_labels
elif type == "traits_240":
    get_labels = get_trait_labels
else:
    # Without this branch an unknown `type` leaves `labels` undefined or stale.
    raise ValueError(f"Unsupported type {type!r}; expected 'roles_240' or 'traits_240'")

labels = get_labels(instruct_results)

# The instruct and base vectors are compared row by row, so both result files
# must list their labels in the same order; otherwise each similarity would
# pair two different labels without any error.
# NOTE(review): assumes base_results stores its labels under the same keys as
# instruct_results — confirm.
base_labels = get_labels(base_results)
if list(base_labels) != list(labels):
    raise ValueError("Base and instruct results list their labels in a different order")

print(labels[:5])

['zealous', 'wry', 'witty', 'whimsical', 'visceral']


In [79]:
# cosine_similarity broadcasts its inputs, so a shape mismatch would not
# necessarily raise; check explicitly that the two tensors line up.
if instruct_vectors.shape != base_vectors.shape:
    raise ValueError(
        f"Shape mismatch: instruct {tuple(instruct_vectors.shape)} vs base {tuple(base_vectors.shape)}"
    )

# One similarity per row (instruct vector i vs base vector i).
# detach()/cpu() keep the NumPy conversion working even if the loaded tensors
# track gradients or live on an accelerator.
cos_sims = (
    torch.nn.functional.cosine_similarity(instruct_vectors, base_vectors, dim=-1)
    .detach()
    .cpu()
    .numpy()
)
print(cos_sims.shape)

(239,)


In [80]:
# One row per label with its base-vs-instruct cosine similarity, sorted from
# most to least similar. The original row index is kept, so it still
# identifies each label's position in `labels`.
# The label column is called "roles" for both datasets (including traits);
# the name is kept so the displayed tables and the saved CSV keep their header.
df = pd.DataFrame({
    "roles": labels,
    "cos_sim": cos_sims,
}).sort_values(by="cos_sim", ascending=False)

In [82]:
# Ten entries with the highest cosine similarity (df is sorted in descending order).
df.head(10)

Unnamed: 0,roles,cos_sim
201,conceptual,0.966407
150,ethereal,0.959057
40,romantic,0.958917
129,grounded,0.95891
121,idealistic,0.957538
179,deontological,0.953754
98,melancholic,0.953429
132,goofy,0.953262
237,abstract,0.953132
88,mystical,0.953127


In [83]:
# Ten entries with the lowest cosine similarity (df is sorted in descending order).
df.tail(10)

Unnamed: 0,roles,cos_sim
81,nurturing,0.623705
199,concise,0.611413
118,impulsive,0.605041
133,generous,0.571753
221,avoidant,0.554274
9,understated,0.545085
70,pensive,0.543136
217,blunt,0.537833
160,eloquent,0.502503
164,efficient,0.360117


In [74]:
# Save the sorted similarities for the configured model and dataset.
# DataFrame.to_csv does not create missing directories, so create the target
# directory first; exist_ok keeps re-runs from failing.
output_dir = Path(f"./results/{model}/base_instruct")
output_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(output_dir / f"{type}_cos_sims.csv")

## Compare PCA

In [19]:
# Read the `components_` array of the PCA object stored in each result file.
instruct_pcs, base_pcs = (
    results["pca"].components_ for results in (instruct_results, base_results)
)

# Show both shapes, instruct first, then base.
for pcs in (instruct_pcs, base_pcs):
    print(pcs.shape)



(448, 4608)
(448, 4608)


In [21]:
# Scale every component (row) to unit L2 norm, then take the row-wise dot
# product: entry i is the cosine similarity between instruct PC i and base PC i.
# NOTE(review): the sign of a principal component is usually arbitrary, so a
# negative value may still mean the two axes are aligned — consider comparing
# absolute values; confirm for the PCA implementation used here.
# NOTE(review): this pairs components by rank only; a component that moved to a
# different rank in the other model would look dissimilar — confirm this is intended.
instruct_row_norms = np.linalg.norm(instruct_pcs, axis=-1, keepdims=True)
base_row_norms = np.linalg.norm(base_pcs, axis=-1, keepdims=True)
instruct_pcs_norm = instruct_pcs / instruct_row_norms
base_pcs_norm = base_pcs / base_row_norms
pc_cos_sims = (instruct_pcs_norm * base_pcs_norm).sum(axis=-1)

In [23]:
# Print the similarity for the first 20 components. Slicing instead of
# indexing with range(20) avoids an IndexError when fewer than 20 exist.
for i, pc_cos_sim in enumerate(pc_cos_sims[:20]):
    print(f"PC {i}: {pc_cos_sim:.4f}")



PC 0: 0.8397
PC 1: -0.6778
PC 2: 0.5438
PC 3: 0.6838
PC 4: -0.5785
PC 5: -0.7681
PC 6: -0.6944
PC 7: -0.6840
PC 8: -0.7868
PC 9: -0.7188
PC 10: -0.5295
PC 11: -0.5303
PC 12: -0.3717
PC 13: 0.0359
PC 14: -0.1583
PC 15: 0.5932
PC 16: -0.1106
PC 17: -0.3482
PC 18: 0.5264
PC 19: -0.1726
