# Compare base and instruct vectors

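* Load the per-role (or per-trait) vectors stored alongside the PCA results for the instruct and base models
* Measure the cosine similarity between each instruct vector and its base counterpart
* Print the entries with the highest and lowest similarity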

In [20]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch

In [37]:
# Notebook configuration: which model, which layer, and which dataset to compare.
model = "gemma-2-27b"
layer = 22
# Dataset selector; the cells below branch on "roles_240" vs "traits_240".
# NOTE(review): this name shadows the builtin `type`; it is kept because
# every later cell reads it.
type = "roles_240"

In [45]:
# Load the saved PCA result files for the instruct model and its base
# counterpart. Only the file-name suffix differs between the two datasets.
pca_file_suffixes = {
    "roles_240": "pos23",
    "traits_240": "pos-neg50",
}
if type not in pca_file_suffixes:
    # Fail loudly: otherwise an unrecognised `type` would skip loading and the
    # following cells would either crash with a NameError or silently reuse
    # results left in the kernel by an earlier run.
    raise ValueError(
        f"Unsupported type {type!r}; expected one of {sorted(pca_file_suffixes)}"
    )

pca_filename = f"layer{layer}_{pca_file_suffixes[type]}.pt"
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects, so these files must come from a trusted source.
instruct_results = torch.load(f"/workspace/{model}/{type}/pca/{pca_filename}", weights_only=False)
base_results = torch.load(f"/workspace/{model}/{type}_base/pca/{pca_filename}", weights_only=False)


## Compare vectors

In [46]:
# Build one 2-D matrix of vectors per model (one row per stored vector),
# keeping only the slice at index `layer` of each stacked vector.
def _stack_at_layer(vector_list):
    """Stack a list of tensors, cast to float32, and select index `layer` on dim 1.

    Dim 1 is presumably the layer axis of each stored vector -- TODO confirm
    against the code that produced the PCA files.
    """
    return torch.stack(vector_list).float()[:, layer, :]


if type == "roles_240":
    # Row order is all pos_2 vectors followed by all pos_3 vectors.
    instruct_vectors = _stack_at_layer(
        instruct_results["vectors"]["pos_2"] + instruct_results["vectors"]["pos_3"]
    )
    base_vectors = _stack_at_layer(
        base_results["vectors"]["pos_2"] + base_results["vectors"]["pos_3"]
    )
elif type == "traits_240":
    instruct_vectors = _stack_at_layer(instruct_results["vectors"]["pos_neg_50"])
    base_vectors = _stack_at_layer(base_results["vectors"]["pos_neg_50"])
else:
    # Without this branch an unknown `type` would leave stale (or undefined)
    # matrices in the kernel for the cells below to pick up.
    raise ValueError(
        f"Unsupported type {type!r}; expected 'roles_240' or 'traits_240'"
    )

# The comparison below is row-by-row, so both matrices must line up exactly.
if instruct_vectors.shape != base_vectors.shape:
    raise ValueError(
        "Instruct and base vector matrices differ in shape: "
        f"{tuple(instruct_vectors.shape)} vs {tuple(base_vectors.shape)}"
    )

print(instruct_vectors.shape)
print(base_vectors.shape)


torch.Size([448, 4608])
torch.Size([448, 4608])


In [47]:
def get_role_labels(pca_results):
    """Return the role labels from `pca_results['roles']` as one flat list.

    Labels under 'pos_2' come first, followed by those under 'pos_3'.
    A group that is absent from `pca_results['roles']` is skipped.
    """
    role_groups = pca_results['roles']
    return [
        role
        for group_key in ('pos_2', 'pos_3')
        if group_key in role_groups
        for role in role_groups[group_key]
    ]

def get_trait_labels(pca_results):
    """Return the trait labels stored under `pca_results['traits']['pos_neg_50']`.

    The stored object itself is returned (no copy is made).
    """
    trait_groups = pca_results['traits']
    return trait_groups['pos_neg_50']

# Pick the label extractor that matches the dataset being analysed.
# NOTE(review): labels are read from the instruct results only; the row-wise
# comparison later presumably relies on the base results using the same
# order -- confirm.
if type == "traits_240":
    labels = get_trait_labels(instruct_results)
elif type == "roles_240":
    labels = get_role_labels(instruct_results)
print(labels[:5])

['writer', 'workaholic', 'witness', 'visionary', 'virus']


In [48]:
# Cosine similarity between row i of the instruct matrix and row i of the
# base matrix, taken along the last dimension, then converted to NumPy.
pairwise_similarity = torch.nn.functional.cosine_similarity(
    instruct_vectors,
    base_vectors,
    dim=-1,
)
cos_sims = pairwise_similarity.numpy()
print(cos_sims.shape)

(448,)


In [49]:
# Assemble one row per label with its instruct-vs-base cosine similarity and
# the score group it came from, sorted from most to least similar.
# The label source and score groups depend on the dataset; the original cell
# only handled the roles layout and failed for "traits_240".
if type == "roles_240":
    label_column, group_keys = "roles", ("pos_2", "pos_3")
elif type == "traits_240":
    label_column, group_keys = "traits", ("pos_neg_50",)
else:
    raise ValueError(
        f"Unsupported type {type!r}; expected 'roles_240' or 'traits_240'"
    )

label_groups = instruct_results[label_column]
score_labels = []
for group_key in group_keys:
    # A group missing from the file contributes no rows.
    if group_key in label_groups:
        score_labels.extend([group_key] * len(label_groups[group_key]))

# sort_values returns a new frame, so re-running this cell is idempotent.
df = pd.DataFrame({
    label_column: labels,
    "cos_sim": cos_sims,
    "score": score_labels,
}).sort_values(by="cos_sim", ascending=False)

In [50]:
# First ten rows of df; with the descending cos_sim sort these are the most similar entries.
df.iloc[:10]

Unnamed: 0,roles,cos_sim,score
26,stoic,0.997403,pos_2
172,aberration,0.997396,pos_2
1,workaholic,0.997377,pos_2
281,patient,0.99732,pos_3
177,workaholic,0.997268,pos_3
152,chameleon,0.997085,pos_2
333,hoarder,0.997077,pos_3
20,symbiont,0.997029,pos_2
300,narcissist,0.996957,pos_3
95,jester,0.996933,pos_2


In [51]:
# Last ten rows of df; with the descending cos_sim sort these are the least similar entries.
df.iloc[-10:]

Unnamed: 0,roles,cos_sim,score
222,sommelier,0.99314,pos_3
386,debugger,0.993134,pos_3
258,programmer,0.993133,pos_3
400,conservator,0.993104,pos_3
428,architect,0.993088,pos_3
153,cartographer,0.99308,pos_2
384,designer,0.992952,pos_3
413,cartographer,0.992775,pos_3
162,archaeologist,0.992745,pos_2
58,playwright,0.991342,pos_2


In [52]:
# Save the similarity table as CSV. to_csv does not create missing
# directories, so make sure the output folder exists first.
cos_sims_csv_path = Path(f"./results/{model}/base_instruct/{type}_cos_sims.csv")
cos_sims_csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(cos_sims_csv_path)

## Compare PCA

In [19]:
# Pull the component matrix (`components_`) out of each stored PCA object.
instruct_pcs, base_pcs = (
    results["pca"].components_ for results in (instruct_results, base_results)
)

print(instruct_pcs.shape, base_pcs.shape, sep="\n")



(448, 4608)
(448, 4608)


In [21]:
# Cosine similarity between the i-th instruct component and the i-th base
# component: scale every row to unit length, then take row-wise dot products.
# NOTE(review): if these components come from an SVD/eigendecomposition-based
# PCA, each component's sign is arbitrary, so the magnitude of the similarity
# may be more meaningful than its sign. Components are also matched purely by
# index -- confirm that is the intended pairing.
instruct_row_norms = np.linalg.norm(instruct_pcs, axis=-1, keepdims=True)
base_row_norms = np.linalg.norm(base_pcs, axis=-1, keepdims=True)
instruct_pcs_norm = instruct_pcs / instruct_row_norms
base_pcs_norm = base_pcs / base_row_norms
pc_cos_sims = (instruct_pcs_norm * base_pcs_norm).sum(axis=-1)

In [23]:
# Report the similarity for the leading components. Slicing caps the loop at
# however many components exist, so fewer than 20 no longer raises IndexError.
for i, pc_cos_sim in enumerate(pc_cos_sims[:20]):
    print(f"PC {i}: {pc_cos_sim:.4f}")



PC 0: 0.8397
PC 1: -0.6778
PC 2: 0.5438
PC 3: 0.6838
PC 4: -0.5785
PC 5: -0.7681
PC 6: -0.6944
PC 7: -0.6840
PC 8: -0.7868
PC 9: -0.7188
PC 10: -0.5295
PC 11: -0.5303
PC 12: -0.3717
PC 13: 0.0359
PC 14: -0.1583
PC 15: 0.5932
PC 16: -0.1106
PC 17: -0.3482
PC 18: 0.5264
PC 19: -0.1726
