# Check on vs off policy rollouts

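Comparing Llama vectors generated from Llama's own rollouts with those generated from another model's rollouts (originally Qwen; the configuration below currently points at the Gemma roles).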

* individual role vectors
* default vectors
* per layer contrast vector
* per layer PCA (mean)

Maybe later I'll run some steering experiments.

In [18]:
import torch
import os
import sys
import numpy as np

sys.path.append('.')
sys.path.append('..')

from utils.pca_utils import L2MeanScaler, MeanScaler, compute_pca, plot_variance_explained
from plots import plot_pc

In [2]:
# Run configuration: the model whose vectors are analysed, the role set the
# vectors were extracted on, and the model that role set is named after.
model, target, target_model = "llama-3.3-70b", "gemma_roles", "gemma-2-27b"

# All inputs and outputs of this notebook live under this directory.
base_dir = f"/workspace/{model}/{target}"

In [22]:
# Load one role's vector file as a reference: its 'pos_3' entry supplies the
# expected per-role shape and the number of layers used further down.
vector_dir = f"{base_dir}/vectors"
single_vector = torch.load(f"{vector_dir}/aberration.pt")

# The leading dimension of the 'pos_3' tensor is used as the layer count.
total_layers = single_vector['pos_3'].size(0)

print(single_vector.keys())
print(single_vector['pos_3'].shape)

dict_keys(['pos_0', 'pos_1', 'pos_2', 'pos_3', 'pos_all'])
torch.Size([80, 8192])


In [None]:
# Load every role's vector file from `vector_dir`, keyed by role name.
# `os.listdir` returns entries in arbitrary order, so the names are sorted to
# keep the role order (and everything stacked from it later) reproducible.
vectors = {}
for file in sorted(os.listdir(vector_dir)):
    # splitext removes only the trailing extension, unlike str.replace which
    # would strip every ".pt" occurrence inside the file name.
    role_name, extension = os.path.splitext(file)
    if extension != ".pt":
        continue
    vec = torch.load(os.path.join(vector_dir, file))
    # Every role must match the reference role's 'pos_3' shape.
    assert vec['pos_3'].shape == single_vector['pos_3'].shape, (
        f"{file}: pos_3 shape {tuple(vec['pos_3'].shape)} does not match "
        f"reference {tuple(single_vector['pos_3'].shape)}"
    )
    vectors[role_name] = vec

print(f"Found {len(vectors)} roles with vectors")

Found 275 roles with vectors


In [13]:
# Split the loaded roles by which position key they provide. Role names and
# their vectors are kept in parallel lists, in the iteration order of `vectors`.
pos_2_roles = [role for role, vector in vectors.items() if 'pos_2' in vector]
pos_2_vectors = [vectors[role]['pos_2'] for role in pos_2_roles]
pos_3_roles = [role for role, vector in vectors.items() if 'pos_3' in vector]
pos_3_vectors = [vectors[role]['pos_3'] for role in pos_3_roles]

print(len(pos_2_roles))
print(len(pos_3_roles))

# All pos_2 vectors first, followed by all pos_3 vectors.
combined_vectors = pos_2_vectors + pos_3_vectors

173
275


## Contrast vector

In [None]:
# Contrast vector: the mean of all roles' 'pos_3' vectors minus the
# 'default_1' activations stored in default_vectors.pt.
role_all_layers = torch.mean(torch.stack(pos_3_vectors), dim=0)
default_all_layers = torch.load(
    f"{base_dir}/default_vectors.pt"
)['activations']['default_1']
contrast_vector = role_all_layers - default_all_layers
print(contrast_vector.shape)


torch.Size([80, 8192])


In [31]:
# Persist the contrast vector next to the other outputs of this run.
torch.save(obj=contrast_vector, f=f"{base_dir}/contrast_vectors.pt")

# PCA

In [14]:
# Stack the combined pos_2 + pos_3 vectors into a single tensor and cast it to
# float32 before handing it to the PCA helper.
float_stack_vectors = torch.stack(combined_vectors).to(torch.float32)
print(float_stack_vectors.shape)

torch.Size([448, 80, 8192])


In [15]:
# Output directory for the per-layer PCA results; created if it is missing.
pca_dir = f"{base_dir}/pca"
os.makedirs(pca_dir, exist_ok=True)

# Accumulator for the first principal component of each layer.
pc1 = list()


In [16]:
# Fit one PCA per layer (dimension 1 of `float_stack_vectors`), save each
# layer's results to `pca_dir`, and collect every layer's first component.
#
# The accumulator is reset here so that re-running this cell does not append a
# second set of components to a list left over from a previous run.
pc1 = []
for i in range(float_stack_vectors.shape[1]):
    # A fresh MeanScaler is passed in for every layer; compute_pca hands back
    # the scaler it used, which is stored alongside the fitted PCA.
    pca_transformed, variance_explained, n_components, pca, scaler = compute_pca(
        float_stack_vectors,
        layer=i,
        scaler=MeanScaler(),
    )

    # NOTE(review): 'vectors' holds the complete pos_2 / pos_3 lists (not just
    # this layer's slice), so every per-layer file carries the same full copy.
    # Left unchanged because readers of these files may rely on it — confirm.
    results = {
        'layer': i,
        'roles': {
            'pos_2': pos_2_roles,
            'pos_3': pos_3_roles,
        },
        'vectors': {
            'pos_2': pos_2_vectors,
            'pos_3': pos_3_vectors,
        },
        'pca_transformed': pca_transformed,
        'variance_explained': variance_explained,
        'n_components': n_components,
        'pca': pca,
        'scaler': scaler,
    }

    pc1.append(pca.components_[0])
    torch.save(results, f"{pca_dir}/layer{i}_mean_pos23.pt")


PCA fitted with 448 components
Cumulative variance for first 5 components: [0.78991675 0.96011806 0.9727989  0.9790828  0.9814258 ]

PCA Analysis Results:
Elbow point at component: 2
Dimensions for 70% variance: 1
Dimensions for 80% variance: 2
Dimensions for 90% variance: 2
Dimensions for 95% variance: 2
PCA fitted with 448 components
Cumulative variance for first 5 components: [0.842006   0.95739496 0.9688712  0.9751636  0.97779095]

PCA Analysis Results:
Elbow point at component: 2
Dimensions for 70% variance: 1
Dimensions for 80% variance: 1
Dimensions for 90% variance: 2
Dimensions for 95% variance: 2
PCA fitted with 448 components
Cumulative variance for first 5 components: [0.8495443  0.94538087 0.95967996 0.96840596 0.9718837 ]

PCA Analysis Results:
Elbow point at component: 2
Dimensions for 70% variance: 1
Dimensions for 80% variance: 1
Dimensions for 90% variance: 2
Dimensions for 95% variance: 3
PCA fitted with 448 components
Cumulative variance for first 5 components: [0.9

In [19]:
# Turn the list of per-layer first components into a single tensor with one
# row per collected component.
stacked_pc1 = torch.from_numpy(np.asarray(pc1))
print(stacked_pc1.shape)

torch.Size([80, 8192])


In [20]:
# Persist the stacked first principal components for this run.
torch.save(obj=stacked_pc1, f=f"{base_dir}/pc1_vectors.pt")

## Original vectors

In [24]:
# Root directory of the "original vectors" for the same `model` (the
# `roles_240` run); the cells below read from and write into it.
orig_dir = f"/workspace/{model}/roles_240"


In [25]:
# Rebuild the stacked PC1 tensor for the original run from its saved per-layer
# PCA results, then store it in `orig_dir`.
# NOTE(review): this reuses the names `pc1` / `stacked_pc1` from the PCA
# section above, so after this cell they refer to the original run's values.
pc1 = []
for i in range(total_layers):
    # weights_only=False: the saved results contain pickled non-tensor
    # objects (the fitted PCA), so only load files from a trusted source.
    fitted_pca = torch.load(f"{orig_dir}/pca/layer{i}_mean_pos23.pt", weights_only=False)['pca']
    pc1.append(fitted_pca.components_[0])

stacked_pc1 = torch.from_numpy(np.asarray(pc1))
print(stacked_pc1.shape)
torch.save(obj=stacked_pc1, f=f"{orig_dir}/pc1_vectors.pt")

FileNotFoundError: [Errno 2] No such file or directory: '/workspace/llama-3.3-70b/roles_240/pca/layer0_mean_pos23.pt'

In [28]:
# Contrast vectors: load the saved multi-contrast object and inspect the keys
# of its first entry. weights_only=False unpickles arbitrary Python objects,
# so this file must come from a trusted source.
contrast_file = f"/workspace/{model}/capped/configs/contrast/multi_contrast_vectors.pt"
contrast_obj = torch.load(contrast_file, weights_only=False)
print(contrast_obj[0].keys())


dict_keys(['scaler', 'name', 'vector', 'layer'])


In [29]:
# Pull the 'vector' out of every entry and stack them into one tensor. Rows
# follow the order of `contrast_obj`.
# NOTE(review): presumably one entry per layer, already in layer order —
# confirm against each entry's 'layer' key.
contrasts = [c['vector'] for c in contrast_obj]

stacked_contrasts = torch.stack(contrasts)
print(stacked_contrasts.shape)


torch.Size([80, 8192])


In [32]:
# Persist the stacked contrast vectors alongside the original run's files.
torch.save(obj=stacked_contrasts, f=f"{orig_dir}/contrast_vectors.pt")