# Check on vs off policy rollouts

comparing llama vectors generated on llama with those generated on qwen

* individual role vectors
* default vectors
* per layer contrast vector
* per layer PCA (mean)

maybe later i run some steering experiments

In [79]:
import torch
import os
import sys
import numpy as np

sys.path.append('.')
sys.path.append('..')

from utils.pca_utils import L2MeanScaler, MeanScaler, compute_pca, plot_variance_explained
from plots import plot_pc

In [105]:
model = "llama-3.3-70b"
target = "gemma_roles"
target_model = "gemma-2-27b"

base_dir = f"/workspace/{model}/{target}"

In [81]:
vector_dir = f"{base_dir}/vectors"
single_vector = torch.load(f"{vector_dir}/aberration.pt")
print(single_vector.keys())
print(single_vector['pos_3'].shape)

total_layers = single_vector['pos_3'].shape[0]

dict_keys(['pos_0', 'pos_1', 'pos_3', 'pos_all'])
torch.Size([80, 8192])


In [82]:
vectors = {}
for file in os.listdir(vector_dir):
    if file.endswith(".pt"):
        vec = torch.load(os.path.join(vector_dir, file))
        assert vec['pos_3'].shape == single_vector['pos_3'].shape
        vectors[file.replace(".pt", "")] = vec

print(f"Found {len(vectors.keys())} roles with vectors")

Found 275 roles with vectors


In [83]:
pos_2_roles = []
pos_2_vectors = []
pos_3_roles = []
pos_3_vectors = []

# get the vectors keys for pos_2 and pos_3 for each role
for role, vector in vectors.items():
    if 'pos_2' in vector.keys():
        pos_2_roles.append(role)
        pos_2_vectors.append(vector['pos_2'])
    if 'pos_3' in vector.keys():
        pos_3_roles.append(role)
        pos_3_vectors.append(vector['pos_3'])

print(len(pos_2_roles))
print(len(pos_3_roles))

combined_vectors = pos_2_vectors + pos_3_vectors

188
275


## Contrast vector

In [84]:
role_all_layers = torch.stack(pos_3_vectors).mean(dim=0)
default_all_layers = torch.load(f"{base_dir}/default_vectors.pt")['activations']['default_1']
contrast_vector = role_all_layers - default_all_layers
print(contrast_vector.shape)


torch.Size([80, 8192])


In [85]:
torch.save(contrast_vector, f"{base_dir}/contrast_vectors.pt")

## PCA

In [86]:
float_stack_vectors = torch.stack(combined_vectors).float()
print(float_stack_vectors.shape)

torch.Size([463, 80, 8192])


In [87]:
pc1 = []
pca_dir = f"{base_dir}/pca"
os.makedirs(pca_dir, exist_ok=True)


In [88]:
for i in range(float_stack_vectors.shape[1]):
    scaler = MeanScaler()
    pca_transformed, variance_explained, n_components, pca, scaler = compute_pca(
        float_stack_vectors, 
        layer=i, 
        scaler=scaler
    )
    results = {}
    results['layer'] = i
    results['roles'] = {
        'pos_2': pos_2_roles,
        'pos_3': pos_3_roles
    }
    results['vectors'] = {
        'pos_2': pos_2_vectors,
        'pos_3': pos_3_vectors
    }
    results['pca_transformed'] = pca_transformed
    results['variance_explained'] = variance_explained
    results['n_components'] = n_components
    results['pca'] = pca
    results['scaler'] = scaler

    pc1.append(pca.components_[0])
    torch.save(results, f"{pca_dir}/layer{i}_mean_pos23.pt")


PCA fitted with 463 components
Cumulative variance for first 5 components: [0.79536086 0.9164503  0.93833125 0.953572   0.96224475]

PCA Analysis Results:
Elbow point at component: 2
Dimensions for 70% variance: 1
Dimensions for 80% variance: 2
Dimensions for 90% variance: 2
Dimensions for 95% variance: 4
PCA fitted with 463 components
Cumulative variance for first 5 components: [0.8249228  0.91915685 0.9362355  0.9494287  0.95795476]

PCA Analysis Results:
Elbow point at component: 2
Dimensions for 70% variance: 1
Dimensions for 80% variance: 1
Dimensions for 90% variance: 2
Dimensions for 95% variance: 5
PCA fitted with 463 components
Cumulative variance for first 5 components: [0.80648535 0.9014548  0.9222997  0.9362288  0.94680446]

PCA Analysis Results:
Elbow point at component: 2
Dimensions for 70% variance: 1
Dimensions for 80% variance: 1
Dimensions for 90% variance: 2
Dimensions for 95% variance: 6
PCA fitted with 463 components
Cumulative variance for first 5 components: [0.9

In [89]:
stacked_pc1 = torch.from_numpy(np.array(pc1))
print(stacked_pc1.shape)

torch.Size([80, 8192])


In [90]:
torch.save(stacked_pc1, f"{base_dir}/pc1_vectors.pt")

## Original vectors

In [None]:
orig_dir = f"/workspace/{model}/roles_240"


In [None]:
pc1 = []
for i in range(total_layers):
    layer_results = torch.load(f"{orig_dir}/pca/layer{i}_mean_pos23.pt", weights_only=False)
    l_pc1 = layer_results['pca'].components_[0]

    pc1.append(l_pc1)

stacked_pc1 = torch.from_numpy(np.array(pc1))
print(stacked_pc1.shape)
torch.save(stacked_pc1, f"{orig_dir}/pc1_vectors.pt")

In [None]:
# contrast vectors
contrast_file = f"/workspace/{model}/capped/configs/contrast/multi_contrast_vectors.pt"
contrast_obj = torch.load(contrast_file, weights_only=False)
print(contrast_obj[0].keys())


In [None]:
contrasts = []
for c in contrast_obj:
    contrasts.append(c['vector'])

stacked_contrasts = torch.stack(contrasts)
print(stacked_contrasts.shape)


In [None]:
torch.save(stacked_contrasts, f"{orig_dir}/contrast_vectors.pt")

## Comparison

In [106]:
orig_dir = f"/workspace/{model}/roles_240"

In [107]:
# compare original vectors between the two models
# get mean, median, std of cosine sim
on_policy_results = torch.load(f"{orig_dir}/pca/layer40_pos23.pt", weights_only=False)
off_policy_results = torch.load(f"{base_dir}/pca/layer40_mean_pos23.pt", weights_only=False)

# can get vectors from these but need to align



Trying to unpickle estimator PCA from version 1.7.0 when using version 1.7.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Trying to unpickle estimator StandardScaler from version 1.7.0 when using version 1.7.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations



### Find intersection of roles between on_policy and off_policy

In [93]:
# Find intersection of roles between on_policy and off_policy
on_pos_2_roles = set(on_policy_results['roles']['pos_2'])
off_pos_2_roles = set(off_policy_results['roles']['pos_2'])
common_pos_2_roles = sorted(list(on_pos_2_roles & off_pos_2_roles))

on_pos_3_roles = set(on_policy_results['roles']['pos_3'])
off_pos_3_roles = set(off_policy_results['roles']['pos_3'])
common_pos_3_roles = sorted(list(on_pos_3_roles & off_pos_3_roles))

print(f"Common pos_2 roles: {len(common_pos_2_roles)}")
print(f"Common pos_3 roles: {len(common_pos_3_roles)}")
print(f"Total common roles: {len(common_pos_2_roles) + len(common_pos_3_roles)}")

Common pos_2 roles: 92
Common pos_3 roles: 275
Total common roles: 367


In [94]:
# Extract and align vectors for common roles
def extract_aligned_vectors(results, common_pos_2, common_pos_3):
    """Extract vectors for common roles in consistent order."""
    # Get indices for pos_2 common roles
    pos_2_indices = []
    for role in common_pos_2:
        idx = results['roles']['pos_2'].index(role)
        pos_2_indices.append(idx)
    
    # Get indices for pos_3 common roles
    pos_3_indices = []
    for role in common_pos_3:
        idx = results['roles']['pos_3'].index(role)
        pos_3_indices.append(idx)
    
    # Extract vectors (note: vectors are already tensors with shape [n_layers, hidden_dim])
    pos_2_vectors = [results['vectors']['pos_2'][i] for i in pos_2_indices]
    pos_3_vectors = [results['vectors']['pos_3'][i] for i in pos_3_indices]
    
    # Concatenate pos_2 first, then pos_3
    all_vectors = pos_2_vectors + pos_3_vectors
    
    return all_vectors

on_aligned = extract_aligned_vectors(on_policy_results, common_pos_2_roles, common_pos_3_roles)
off_aligned = extract_aligned_vectors(off_policy_results, common_pos_2_roles, common_pos_3_roles)

print(f"Extracted {len(on_aligned)} aligned role vectors from on_policy")
print(f"Extracted {len(off_aligned)} aligned role vectors from off_policy")
print(f"Each vector shape: {on_aligned[0].shape}")

Extracted 367 aligned role vectors from on_policy
Extracted 367 aligned role vectors from off_policy
Each vector shape: torch.Size([80, 8192])


In [95]:
# Generate role labels
def get_role_labels(common_pos_2, common_pos_3):
    """Generate formatted labels for roles."""
    labels = []
    
    # pos_2 roles
    pos_2_labels = [role.replace('_', ' ').title() for role in common_pos_2]
    pos_2_labels = [f"{role} (Somewhat RP)" for role in pos_2_labels]
    labels.extend(pos_2_labels)
    
    # pos_3 roles
    pos_3_labels = [role.replace('_', ' ').title() for role in common_pos_3]
    pos_3_labels = [f"{role} (Fully RP)" for role in pos_3_labels]
    labels.extend(pos_3_labels)
    
    return labels

role_labels = get_role_labels(common_pos_2_roles, common_pos_3_roles)
print(f"Generated {len(role_labels)} role labels")
print(f"First 5 labels: {role_labels[:5]}")

Generated 367 role labels
First 5 labels: ['Addict (Somewhat RP)', 'Advocate (Somewhat RP)', 'Altruist (Somewhat RP)', 'Amateur (Somewhat RP)', 'Assistant (Somewhat RP)']


In [96]:
# Compute per-layer cosine similarity for each role
import torch.nn.functional as F

def compute_per_layer_cosine_similarity(on_vectors, off_vectors):
    """
    Compute cosine similarity between on_policy and off_policy vectors.
    
    Args:
        on_vectors: list of tensors, each with shape (n_layers, hidden_dim)
        off_vectors: list of tensors, each with shape (n_layers, hidden_dim)
    
    Returns:
        tensor of shape (n_roles, n_layers) with cosine similarities
    """
    n_roles = len(on_vectors)
    n_layers = on_vectors[0].shape[0]
    
    cosine_sims = torch.zeros(n_roles, n_layers)
    
    for role_idx in range(n_roles):
        for layer_idx in range(n_layers):
            # Get vectors at this layer
            on_vec = on_vectors[role_idx][layer_idx, :]
            off_vec = off_vectors[role_idx][layer_idx, :]
            
            # Compute cosine similarity
            cos_sim = F.cosine_similarity(on_vec.unsqueeze(0), off_vec.unsqueeze(0))
            cosine_sims[role_idx, layer_idx] = cos_sim.item()
    
    return cosine_sims

cosine_sims = compute_per_layer_cosine_similarity(on_aligned, off_aligned)
print(f"Cosine similarity matrix shape: {cosine_sims.shape}")
print(f"Shape: ({len(role_labels)} roles, {cosine_sims.shape[1]} layers)")

Cosine similarity matrix shape: torch.Size([367, 80])
Shape: (367 roles, 80 layers)


In [97]:
# Calculate statistics
target_layer = 40

print("="*80)
print("COSINE SIMILARITY STATISTICS")
print("="*80)

# Overall statistics
overall_mean = cosine_sims.mean().item()
overall_median = cosine_sims.median().item()
overall_std = cosine_sims.std().item()
overall_min = cosine_sims.min().item()
overall_max = cosine_sims.max().item()

print(f"\nOverall statistics (all roles, all layers):")
print(f"  Mean:   {overall_mean:.4f}")
print(f"  Median: {overall_median:.4f}")
print(f"  Std:    {overall_std:.4f}")
print(f"  Min:    {overall_min:.4f}")
print(f"  Max:    {overall_max:.4f}")

# Target layer statistics
target_layer_sims = cosine_sims[:, target_layer]
target_mean = target_layer_sims.mean().item()
target_median = target_layer_sims.median().item()
target_std = target_layer_sims.std().item()
target_min = target_layer_sims.min().item()
target_max = target_layer_sims.max().item()

print(f"\nTarget layer {target_layer} statistics (across all roles):")
print(f"  Mean:   {target_mean:.4f}")
print(f"  Median: {target_median:.4f}")
print(f"  Std:    {target_std:.4f}")
print(f"  Min:    {target_min:.4f}")
print(f"  Max:    {target_max:.4f}")

# Per-layer statistics (mean across roles at each layer)
per_layer_mean = cosine_sims.mean(dim=0)
per_layer_std = cosine_sims.std(dim=0)

print(f"\nPer-layer statistics (mean ± std across roles):")
print(f"  First 5 layers:")
for i in range(min(5, len(per_layer_mean))):
    print(f"    Layer {i:2d}: {per_layer_mean[i]:.4f} ± {per_layer_std[i]:.4f}")
print(f"  ...")
print(f"  Last 5 layers:")
for i in range(max(0, len(per_layer_mean)-5), len(per_layer_mean)):
    print(f"    Layer {i:2d}: {per_layer_mean[i]:.4f} ± {per_layer_std[i]:.4f}")

# Per-role statistics (mean across layers for each role)
per_role_mean = cosine_sims.mean(dim=1)
per_role_std = cosine_sims.std(dim=1)

# Find roles with highest and lowest mean similarity
sorted_indices = torch.argsort(per_role_mean, descending=True)
print(f"\nTop 5 roles with highest mean similarity across layers:")
for i in range(min(5, len(sorted_indices))):
    idx = sorted_indices[i]
    print(f"  {role_labels[idx]:50s}: {per_role_mean[idx]:.4f} ± {per_role_std[idx]:.4f}")

print(f"\nBottom 5 roles with lowest mean similarity across layers:")
for i in range(max(0, len(sorted_indices)-5), len(sorted_indices)):
    idx = sorted_indices[i]
    print(f"  {role_labels[idx]:50s}: {per_role_mean[idx]:.4f} ± {per_role_std[idx]:.4f}")

# Target layer: Top/Bottom roles
sorted_target_indices = torch.argsort(target_layer_sims, descending=True)
print(f"\nLayer {target_layer}: Top 5 roles with highest similarity:")
for i in range(min(5, len(sorted_target_indices))):
    idx = sorted_target_indices[i]
    print(f"  {role_labels[idx]:50s}: {target_layer_sims[idx]:.4f}")

print(f"\nLayer {target_layer}: Bottom 5 roles with lowest similarity:")
for i in range(max(0, len(sorted_target_indices)-5), len(sorted_target_indices)):
    idx = sorted_target_indices[i]
    print(f"  {role_labels[idx]:50s}: {target_layer_sims[idx]:.4f}")

print("="*80)

COSINE SIMILARITY STATISTICS

Overall statistics (all roles, all layers):
  Mean:   0.8852
  Median: 0.8984
  Std:    0.0956
  Min:    0.0840
  Max:    1.0000

Target layer 40 statistics (across all roles):
  Mean:   0.8595
  Median: 0.8594
  Std:    0.0271
  Min:    0.5859
  Max:    0.9258

Per-layer statistics (mean ± std across roles):
  First 5 layers:
    Layer  0: 0.9841 ± 0.0242
    Layer  1: 0.9883 ± 0.0184
    Layer  2: 0.9888 ± 0.0170
    Layer  3: 0.7700 ± 0.2024
    Layer  4: 0.7747 ± 0.1971
  ...
  Last 5 layers:
    Layer 75: 0.8971 ± 0.0105
    Layer 76: 0.8773 ± 0.0107
    Layer 77: 0.8628 ± 0.0119
    Layer 78: 0.8072 ± 0.0179
    Layer 79: 0.4668 ± 0.0945

Top 5 roles with highest mean similarity across layers:
  Devils Advocate (Fully RP)                        : 0.9369 ± 0.0636
  Contrarian (Fully RP)                             : 0.9328 ± 0.0573
  Mystic (Fully RP)                                 : 0.9237 ± 0.0499
  Shaman (Fully RP)                                

In [None]:
# Create heatmap of per-role, per-layer cosine similarities
import plotly.graph_objects as go

# Sort roles by mean similarity across layers (highest to lowest)
per_role_mean = cosine_sims.mean(dim=1)
sorted_indices = torch.argsort(per_role_mean, descending=True)

# Reorder the data
sorted_cosine_sims = cosine_sims[sorted_indices].numpy()
sorted_labels = [role_labels[i] for i in sorted_indices]

# Create heatmap
fig = go.Figure(data=go.Heatmap(
    z=sorted_cosine_sims,
    x=[f"L{i}" for i in range(sorted_cosine_sims.shape[1])],
    y=sorted_labels,
    colorscale='RdBu',
    zmid=0.5,  # Center colormap at 0.5
    colorbar=dict(
        title="Cosine<br>Similarity",
        thickness=15,
        len=0.7
    ),
    hovertemplate='Layer: %{x}<br>Role: %{y}<br>Similarity: %{z:.4f}<extra></extra>'
))

fig.update_layout(
    title={
        'text': f"On-Policy vs Off-Policy Role Vector Similarity<br><sub>{model} on {target_model}: {len(role_labels)} Common Roles Across {sorted_cosine_sims.shape[1]} Layers</sub>",
        'x': 0.5,
        'xanchor': 'center'
    },
    xaxis_title="Layer",
    yaxis_title="Role (sorted by mean similarity)",
    width=1200,
    height=max(800, len(role_labels) * 10),  # Scale height with number of roles
    xaxis=dict(
        tickmode='linear',
        tick0=0,
        dtick=5,  # Show every 5th layer
        side='top'
    ),
    yaxis=dict(
        tickfont=dict(size=8)
    )
)

#fig.show()

### Per-layer cosine similarity for 4 aggregate vector pairs

In [108]:
# Load the 4 pairs of vectors for comparison
# 1. Mean role pos_3 vectors
on_mean_role = torch.stack([v for v in on_policy_results['vectors']['pos_3']]).mean(dim=0)  # Shape: (n_layers, hidden_dim)
off_mean_role = torch.stack([v for v in off_policy_results['vectors']['pos_3']]).mean(dim=0)

# 2. Default vectors
on_default = torch.load(f"{orig_dir}/default_vectors.pt")['activations']['default_1']
off_default = torch.load(f"{base_dir}/default_vectors.pt")['activations']['default_1']

# 3. Contrast vectors
on_contrast = torch.load(f"{orig_dir}/contrast_vectors.pt")
off_contrast = torch.load(f"{base_dir}/contrast_vectors.pt")

# 4. PC1 vectors
on_pc1 = torch.load(f"{orig_dir}/pc1_vectors.pt")
off_pc1 = torch.load(f"{base_dir}/pc1_vectors.pt")

print("Loaded all vector pairs:")
print(f"  Mean role shape: {on_mean_role.shape}")
print(f"  Default shape: {on_default.shape}")
print(f"  Contrast shape: {on_contrast.shape}")
print(f"  PC1 shape: {on_pc1.shape}")

Loaded all vector pairs:
  Mean role shape: torch.Size([80, 8192])
  Default shape: torch.Size([80, 8192])
  Contrast shape: torch.Size([80, 8192])
  PC1 shape: torch.Size([80, 8192])


In [109]:
# Compute per-layer cosine similarity for each pair
def compute_pairwise_cosine_sim(vec1, vec2):
    """Compute per-layer cosine similarity between two multi-layer vectors."""
    n_layers = vec1.shape[0]
    sims = []
    for layer_idx in range(n_layers):
        sim = F.cosine_similarity(
            vec1[layer_idx, :].unsqueeze(0), 
            vec2[layer_idx, :].unsqueeze(0)
        )
        sims.append(sim.item())
    return sims

# Compute similarities for all 4 pairs
mean_role_sims = compute_pairwise_cosine_sim(on_mean_role, off_mean_role)
default_sims = compute_pairwise_cosine_sim(on_default, off_default)
contrast_sims = compute_pairwise_cosine_sim(on_contrast, off_contrast)
pc1_sims = compute_pairwise_cosine_sim(on_pc1, off_pc1)

# absolute value of cosine similarity
mean_role_sims = torch.abs(torch.tensor(mean_role_sims))
default_sims = torch.abs(torch.tensor(default_sims))
contrast_sims = torch.abs(torch.tensor(contrast_sims))
pc1_sims = torch.abs(torch.tensor(pc1_sims))

print(f"Computed cosine similarities across {len(mean_role_sims)} layers for all 4 pairs")

Computed cosine similarities across 80 layers for all 4 pairs


In [None]:
# Create 2x2 subplot figure
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Prepare data
n_layers = len(mean_role_sims)
layer_indices = list(range(n_layers))

# Find global y-axis range across all 4 pairs
all_sims = mean_role_sims + default_sims + contrast_sims + pc1_sims
y_min = min(all_sims)
y_max = max(all_sims)
y_range = [- 0.05,1.1]  # Add padding

# Create subplots
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        'Mean Role Vectors',
        'Default Vectors', 
        'Contrast Vectors',
        'PC1 Vectors'
    ),
    vertical_spacing=0.12,
    horizontal_spacing=0.1
)

fig.update_annotations(
    font=dict(size=14),

)

# Add traces for each pair
# Row 1, Col 1: Mean Role
fig.add_trace(
    go.Scatter(
        x=layer_indices,
        y=mean_role_sims,
        mode='lines+markers',
        name='Mean Role',
        marker=dict(size=4),
        line=dict(color='steelblue')
    ),
    row=1, col=1
)

# Row 1, Col 2: Default
fig.add_trace(
    go.Scatter(
        x=layer_indices,
        y=default_sims,
        mode='lines+markers',
        name='Default',
        marker=dict(size=4),
        line=dict(color='darkgreen')
    ),
    row=1, col=2
)

# Row 2, Col 1: Contrast
fig.add_trace(
    go.Scatter(
        x=layer_indices,
        y=contrast_sims,
        mode='lines+markers',
        name='Contrast',
        marker=dict(size=4),
        line=dict(color='darkorange')
    ),
    row=2, col=1
)

# Row 2, Col 2: PC1
fig.add_trace(
    go.Scatter(
        x=layer_indices,
        y=pc1_sims,
        mode='lines+markers',
        name='PC1',
        marker=dict(size=4),
        line=dict(color='purple')
    ),
    row=2, col=2
)

# Map for getting the right similarity values
sims_map = {
    (1, 1): mean_role_sims,
    (1, 2): default_sims,
    (2, 1): contrast_sims,
    (2, 2): pc1_sims
}

# Add vertical line at target layer for all subplots with annotations
for row in [1, 2]:
    for col in [1, 2]:
        fig.add_vline(
            x=target_layer, 
            line_dash="dot", 
            line_color="red", 
            annotation_text=f"{sims_map[(row, col)][target_layer]:.2f}",
            row=row, col=col

        )
      

# Update axes with shared y-axis range
fig.update_xaxes(title_text="Layer", row=2, col=1)
fig.update_xaxes(title_text="Layer", row=2, col=2)
fig.update_yaxes(title_text="Cosine Similarity", range=y_range, row=1, col=1)
fig.update_yaxes(title_text="Cosine Similarity", range=y_range, row=2, col=1)
fig.update_yaxes(range=y_range, row=1, col=2)
fig.update_yaxes(range=y_range, row=2, col=2)

# Update layout
fig.update_layout(
    title={
        'text': f'Per-Layer Cosine Similarity of Vectors on On-Policy vs Off-Policy Rollouts',
         'subtitle': {
            'text': f'{model.replace("-", " ").title()} Activations on {target_model.replace("-", " ").title()} Rollouts',
        }
    },
    height=600,
    width=800,
    showlegend=False
)


fig.show()