# Role Vector Analysis

This notebook computes role vectors for various metrics using activation summaries and response ratings.

In [1]:
import json
import os
import re
from pathlib import Path

import numpy as np
import pandas as pd
import pickle
from tqdm.notebook import tqdm

print(f"CWD: {os.getcwd()}")

CWD: /home/nate/repos/animacy/results/activations/analysis


In [2]:
# Paths
ACTIVATIONS_DIR = Path("../data/Qwen3-30B-A3B-Instruct-2507")
RATINGS_PATH = Path("../../ratings/data/Qwen3-30B-A3B-Instruct-2507/response_ratings.csv")
OUTPUT_DIR = Path("../../steering/data/Qwen3-30B-A3B-Instruct-2507")

# Ensure output directory exists
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Activations Dir: {ACTIVATIONS_DIR.resolve()}")
print(f"Ratings Path: {RATINGS_PATH.resolve()}")
print(f"Output Dir: {OUTPUT_DIR.resolve()}")

Activations Dir: /home/nate/repos/animacy/results/activations/data/Qwen3-30B-A3B-Instruct-2507
Ratings Path: /home/nate/repos/animacy/results/ratings/data/Qwen3-30B-A3B-Instruct-2507/response_ratings.csv
Output Dir: /home/nate/repos/animacy/results/steering/data/Qwen3-30B-A3B-Instruct-2507


In [3]:
# Load Ratings
if not RATINGS_PATH.exists():
    print(f"ERROR: Ratings file not found at {RATINGS_PATH.resolve()}")
else:
    ratings = pd.read_csv(RATINGS_PATH)
    print(f"Loaded {len(ratings)} ratings.")

    # Identify deviation columns
    deviation_cols = ["assistant_refusal", "role_refusal", "identify_as_assistant", "deny_internal_experience"]
    # Ensure boolean
    for col in deviation_cols:
        if col in ratings.columns:
            ratings[col] = ratings[col].fillna(False).astype(bool)

    ratings["has_deviation"] = ratings[deviation_cols].any(axis=1)
    print(f"Trials with deviations: {ratings['has_deviation'].sum()}")

Loaded 6300 ratings.
Trials with deviations: 617


  ratings[col] = ratings[col].fillna(False).astype(bool)
  ratings[col] = ratings[col].fillna(False).astype(bool)
  ratings[col] = ratings[col].fillna(False).astype(bool)
  ratings[col] = ratings[col].fillna(False).astype(bool)


In [4]:
def load_activations(base_dir, sys_type="with_sys"):
    data = []
    summaries_dir = base_dir / sys_type / "summaries"
    
    print(f"Checking summaries dir: {summaries_dir.resolve()}")
    if not summaries_dir.exists():
        print(f"Warning: {summaries_dir} does not exist.")
        return pd.DataFrame()
        
    files = list(summaries_dir.glob("*.json"))
    print(f"Loading {len(files)} files from {sys_type}...")
    
    for file_path in tqdm(files):
        filename = file_path.name
        try:
            stem = file_path.stem
            parts = stem.split("_")
            
            layer_part = parts[-1]
            layer = int(layer_part.replace("layer", ""))
            sample_idx = int(parts[-2])
            
            entry = {
                "sys_type": sys_type,
                "filename_stem": "_".join(parts),
                "sample_idx": sample_idx,
                "layer": layer,
            }
            
            with open(file_path, "r") as f:
                content = json.load(f)
            
            for key, value in content.items():
                if isinstance(value, list):
                    entry[key] = np.array(value)
            
            data.append(entry)
        except Exception as e:
            print(f"Error parsing {filename}: {e}")
            continue
            
    df = pd.DataFrame(data)
    return df

In [5]:
# Get unique roles and tasks from ratings to help parsing
known_roles = set(ratings["role_name"].dropna().unique())
known_tasks = set(ratings["task_name"].dropna().unique())

print(f"Known roles: {len(known_roles)}")
print(f"Known tasks: {len(known_tasks)}")

def parse_role_task(filename_stem, known_roles, known_tasks):
    # Ensure r is string
    role_map = {str(r).replace(" ", "_"): str(r) for r in known_roles}
    best_role = None
    best_task = None
    
    for r_underscore in role_map:
        if filename_stem.startswith(r_underscore):
            remainder = filename_stem[len(r_underscore):]
            if remainder.startswith("_"):
                task_part = remainder[1:]
                if task_part in known_tasks:
                    if best_role is None or len(r_underscore) > len(best_role.replace(" ", "_")):
                        best_role = role_map[r_underscore]
                        best_task = task_part
    
    if best_role:
        return best_role, best_task
    
    parts = filename_stem.split("_", 1)
    if len(parts) == 2:
        return parts[0], parts[1]
    return filename_stem, ""

Known roles: 125
Known tasks: 10


In [6]:
# Load with_sys activations
df_with_sys = load_activations(ACTIVATIONS_DIR, "with_sys")

if df_with_sys.empty:
    print("ERROR: df_with_sys is empty!")
else:
    # Parse roles and tasks
    parsed = df_with_sys["filename_stem"].apply(lambda x: parse_role_task(x, known_roles, known_tasks))
    df_with_sys["role"] = parsed.apply(lambda x: x[0])
    df_with_sys["task"] = parsed.apply(lambda x: x[1])

    print(f"Loaded {len(df_with_sys)} with_sys activations.")
    print(df_with_sys.head())

Checking summaries dir: /home/nate/repos/animacy/results/activations/data/Qwen3-30B-A3B-Instruct-2507/with_sys/summaries
Loading 25200 files from with_sys...


  0%|          | 0/25200 [00:00<?, ?it/s]

Loaded 25200 with_sys activations.
   sys_type                       filename_stem  sample_idx  layer  \
0  with_sys          shadow_favorites_4_layer26           4     26   
1  with_sys  blouse_meaningful_moment_2_layer38           2     38   
2  with_sys   emperor_meaning_of_life_1_layer38           1     38   
3  with_sys          weed_future_self_3_layer34           3     34   
4  with_sys               sock_dreams_3_layer34           3     34   

                                   avg_system_prompt  \
0  [0.05712890625, 0.640625, -0.018798828125, 0.0...   
1  [0.038818359375, 0.6015625, 0.006500244140625,...   
2  [0.048095703125, 0.55859375, -0.1591796875, 0....   
3  [0.107421875, 0.77734375, 0.2470703125, -0.005...   
4  [0.0810546875, 0.72265625, 0.080078125, -0.046...   

                                     avg_user_prompt  \
0  [0.06982421875, 0.40234375, 0.040771484375, -0...   
1  [-0.10791015625, 0.345703125, -0.197265625, -0...   
2  [-0.1640625, 0.318359375, 0.00216674

In [None]:
# Assistant roles definition
assistant_roles = ["AI", "assistant", "AI assistant", "helpful assistant"]

def get_assistant_baseline(df, metric_col, use_all_trials=True):
    if df.empty:
        return None
    mask = df["role"].isin(assistant_roles)
    assist_df = df[mask]
    
    if assist_df.empty:
        print(f"Warning: No assistant roles found for baseline in {metric_col}.")
        return None
    
    baseline = {}
    layers = assist_df["layer"].unique()
    
    for layer in layers:
        layer_data = assist_df[assist_df["layer"] == layer][metric_col]
        vectors = np.stack(layer_data.values)
        baseline[layer] = np.mean(vectors, axis=0)
        
    return baseline

def compute_and_save_metric(df, metric_col, metric_name, filter_deviations=False, assistant_baseline=None, output_dir=OUTPUT_DIR):
    if df.empty:
        print(f"Skipping {metric_name} because df is empty.")
        return

    if filter_deviations:
        df["sample_idx"] = df["sample_idx"].astype(int)
        ratings["sample_idx"] = ratings["sample_idx"].astype(int)
        
        merged = pd.merge(
            df,
            ratings[["role_name", "task_name", "sample_idx", "has_deviation"]],
            left_on=["role", "task", "sample_idx"],
            right_on=["role_name", "task_name", "sample_idx"],
            how="left"
        )
        merged["has_deviation"] = merged["has_deviation"].fillna(False)
        keep_mask = (~merged["has_deviation"]) | (merged["role"].isin(assistant_roles))
        working_df = merged[keep_mask].copy()
        print(f"Filtered {metric_name}: {len(df)} -> {len(working_df)} rows.")
    else:
        working_df = df.copy()
        
    role_vectors = {}
    roles = working_df["role"].unique()
    layers = working_df["layer"].unique()
    
    for role in tqdm(roles, desc=f"Computing {metric_name}"):
        role_vectors[role] = {}
        role_df = working_df[working_df["role"] == role]
        
        for layer in layers:
            layer_data = role_df[role_df["layer"] == layer][metric_col]
            if layer_data.empty:
                continue
                
            vectors = np.stack(layer_data.values)
            avg_vector = np.mean(vectors, axis=0)
            
            if assistant_baseline and layer in assistant_baseline:
                avg_vector = avg_vector - assistant_baseline[layer]
                
            role_vectors[role][layer] = avg_vector
            
    output_path = output_dir / f"role_vectors_{metric_name}.pkl"
    with open(output_path, "wb") as f:
        pickle.dump(role_vectors, f)
    print(f"Saved {metric_name} to {output_path}")

In [None]:
if not df_with_sys.empty:
    # 1) at_role_period
    metric = "at_role_period"
    baseline = get_assistant_baseline(df_with_sys, metric)
    compute_and_save_metric(df_with_sys, metric, metric, filter_deviations=False, assistant_baseline=baseline)

    # 2) at_role
    metric = "at_role"
    baseline = get_assistant_baseline(df_with_sys, metric)
    compute_and_save_metric(df_with_sys, metric, metric, filter_deviations=False, assistant_baseline=baseline)

    # 3) avg_response (Filtered)
    metric = "avg_response"
    baseline = get_assistant_baseline(df_with_sys, metric)
    compute_and_save_metric(df_with_sys, metric, metric, filter_deviations=True, assistant_baseline=baseline)

    # 4) avg_response_first_10_tokens (Filtered)
    metric = "avg_response_first_10_tokens"
    baseline = get_assistant_baseline(df_with_sys, metric)
    compute_and_save_metric(df_with_sys, metric, metric, filter_deviations=True, assistant_baseline=baseline)

In [None]:
# 5) avg_response (with_sys - without_sys)
df_without_sys = load_activations(ACTIVATIONS_DIR, "without_sys")

if df_without_sys.empty:
    print("ERROR: df_without_sys is empty!")
else:
    parsed_wo = df_without_sys["filename_stem"].apply(lambda x: parse_role_task(x, known_roles, known_tasks))
    df_without_sys["role"] = parsed_wo.apply(lambda x: x[0])
    df_without_sys["task"] = parsed_wo.apply(lambda x: x[1])

    print(f"Loaded {len(df_without_sys)} without_sys activations.")

    if not df_with_sys.empty:
        metric = "avg_response"
        merged_sys = pd.merge(
            df_with_sys[["role", "task", "sample_idx", "layer", metric]],
            df_without_sys[["role", "task", "sample_idx", "layer", metric]],
            on=["role", "task", "sample_idx", "layer"],
            suffixes=("_with", "_without")
        )

        print(f"Merged {len(merged_sys)} pairs for system prompt diff.")

        merged_sys["diff"] = merged_sys[f"{metric}_with"] - merged_sys[f"{metric}_without"]

        role_vectors_diff = {}
        roles = merged_sys["role"].unique()
        layers = merged_sys["layer"].unique()

        for role in tqdm(roles, desc="Computing avg_response_diff"):
            role_vectors_diff[role] = {}
            role_df = merged_sys[merged_sys["role"] == role]
            
            for layer in layers:
                layer_data = role_df[role_df["layer"] == layer]["diff"]
                if layer_data.empty:
                    continue
                    
                vectors = np.stack(layer_data.values)
                avg_vector = np.mean(vectors, axis=0)
                role_vectors_diff[role][layer] = avg_vector

        output_path = OUTPUT_DIR / "role_vectors_avg_response_sys_diff.pkl"
        with open(output_path, "wb") as f:
            pickle.dump(role_vectors_diff, f)
        print(f"Saved avg_response_sys_diff to {output_path}")