# Check on- vs. off-policy rollouts

Comparing Llama vectors generated on Llama with those generated on Qwen:

* individual role vectors
* default vectors
* per layer contrast vector
* per layer PCA (mean)

Maybe later I will run some steering experiments.

In [12]:
import torch
import os
import sys

sys.path.append('.')
sys.path.append('..')

from utils.pca_utils import L2MeanScaler, MeanScaler, compute_pca, plot_variance_explained
from plots import plot_pc

In [2]:
# Run configuration: the model whose workspace is read, plus the role set / target model.
# NOTE(review): the notebook intro mentions qwen, but the target here is gemma —
# confirm which one is intended.
model, target = "llama-3.3-70b", "gemma_roles"
target_model = "gemma-2-27b"  # not referenced by the cells visible in this section

# Root folder that the later cells read from and write to.
base_dir = "/".join(["", "workspace", model, target])

In [None]:
# Load one role's vector file as a reference and inspect its layout.
vector_dir = f"{base_dir}/vectors"
reference_path = f"{vector_dir}/aberration.pt"
single_vector = torch.load(reference_path)

# Show the available keys first, then the shape of the tensor stored under 'pos_3'.
for summary in (single_vector.keys(), single_vector['pos_3'].shape):
    print(summary)

dict_keys(['pos_0', 'pos_1', 'pos_2', 'pos_3', 'pos_all'])
torch.Size([80, 8192])


In [None]:
# Load every role vector file (*.pt) found in vector_dir into a dict keyed by
# role name (the file name without its ".pt" suffix).
vectors = {}

# os.listdir returns entries in arbitrary order; sorting makes the role order —
# and therefore the row order of everything stacked from `vectors` later —
# reproducible across runs and machines.
for file in sorted(os.listdir(vector_dir)):
    if not file.endswith(".pt"):
        continue
    vec = torch.load(os.path.join(vector_dir, file))
    # Every file must match the reference layout, otherwise stacking fails later.
    assert vec['pos_3'].shape == single_vector['pos_3'].shape, (
        f"{file}: pos_3 shape {tuple(vec['pos_3'].shape)} does not match "
        f"reference shape {tuple(single_vector['pos_3'].shape)}"
    )
    # Strip only the trailing suffix; str.replace would also remove any ".pt"
    # that happens to occur inside the role name.
    vectors[file[:-len(".pt")]] = vec

print(f"Found {len(vectors)} roles with vectors")

Found 275 roles with vectors


In [13]:
# For each position key, gather the roles that have a vector under that key and
# the vectors themselves; the two lists for a key stay index-aligned.
by_position = {'pos_2': ([], []), 'pos_3': ([], [])}
for role_name, role_vectors in vectors.items():
    for position, (names, tensors) in by_position.items():
        if position in role_vectors:
            names.append(role_name)
            tensors.append(role_vectors[position])

pos_2_roles, pos_2_vectors = by_position['pos_2']
pos_3_roles, pos_3_vectors = by_position['pos_3']

print(len(pos_2_roles))
print(len(pos_3_roles))

# All pos_2 entries first, followed by all pos_3 entries.
combined_vectors = pos_2_vectors + pos_3_vectors

173
275


## Contrast vector

In [None]:
# Contrast vector = mean over all pos_3 role vectors minus the 'default_1'
# default activation loaded from disk.
role_all_layers = torch.mean(torch.stack(pos_3_vectors), dim=0)

default_activations = torch.load(f"{base_dir}/default_vectors.pt")['activations']
default_all_layers = default_activations['default_1']

contrast_vector = role_all_layers - default_all_layers
print(contrast_vector.shape)


torch.Size([80, 8192])


In [9]:
# Persist the contrast vector under base_dir.
contrast_path = f"{base_dir}/contrast_vector.pt"
torch.save(contrast_vector, contrast_path)

## PCA

In [14]:
# Stack the pos_2 + pos_3 vectors into a single float32 tensor used as PCA input.
float_stack_vectors = torch.stack(combined_vectors).to(torch.float32)
print(float_stack_vectors.shape)

torch.Size([448, 80, 8192])


In [15]:
# Output folder for the per-layer PCA result files (created if missing).
pca_dir = f"{base_dir}/pca"
os.makedirs(pca_dir, exist_ok=True)

# Accumulator for the first principal component of each layer.
pc1 = list()


In [None]:
# Fit one PCA per layer over the stacked pos_2 + pos_3 role vectors, save one
# result file per layer, and collect each layer's first principal component.

# Reset the accumulator here: it was created in a separate cell, so re-running
# this cell alone would otherwise append a second copy of every layer's PC1.
pc1 = []

n_layers = float_stack_vectors.shape[1]
for i in range(n_layers):
    pca_transformed, variance_explained, n_components, pca, scaler = compute_pca(
        float_stack_vectors,
        layer=i,
        scaler=MeanScaler(),  # fresh scaler for every layer
    )

    # NOTE(review): 'vectors' holds the full all-layer lists, so every per-layer
    # file stores the same complete set of vectors again. Left unchanged to keep
    # the saved schema stable; consider saving only the layer-i slice (or
    # dropping the key) if disk usage matters.
    results = {
        'layer': i,
        'roles': {
            'pos_2': pos_2_roles,
            'pos_3': pos_3_roles,
        },
        'vectors': {
            'pos_2': pos_2_vectors,
            'pos_3': pos_3_vectors,
        },
        'pca_transformed': pca_transformed,
        'variance_explained': variance_explained,
        'n_components': n_components,
        'pca': pca,
        'scaler': scaler,  # the scaler returned by compute_pca
    }

    pc1.append(pca.components_[0])
    torch.save(results, f"{pca_dir}/layer{i}_mean_pos23.pt")
