# Trajectories of role + trait space

In [54]:
import json
import os
import sys
import torch
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.subplots as sp
import torch.nn.functional as F
import re

sys.path.append('.')
sys.path.append('..')

from utils.inference_utils import *
from utils.probing_utils import *


In [55]:
# load in PCA results
CHAT_MODEL_NAME = "Qwen/Qwen3-32B"
model_readable = "Qwen 3 32B"
model_short = model_readable.replace(" ", "-").lower()
layer = 32


In [3]:
# filename = "auto40"
components = 6

In [4]:
role_dir = "roles_240" 
trait_dir = "traits_240"

acts_input_dir = f"/workspace/{model_short}/dynamics"
plot_output_dir = f"/root/git/plots/{model_short}/trajectory"
os.makedirs(plot_output_dir, exist_ok=True)

## Load and project all conversation activations 

In [5]:
# load in activations
role_results = torch.load(f"/workspace/{model_short}/{role_dir}/pca/layer{layer}_pos23.pt", weights_only=False)
trait_results = torch.load(f"/workspace/{model_short}/{trait_dir}/pca/layer{layer}_pos-neg50.pt", weights_only=False)


In [32]:
def pc_cosine_similarity(mean_acts_per_turn, pca_results, n_pcs=8):
    if isinstance(mean_acts_per_turn, list):
        stacked_acts = torch.stack(mean_acts_per_turn)
    else:
        stacked_acts = mean_acts_per_turn
    normalized_acts = F.normalize(stacked_acts, dim=1)
    normalized_pcs = pca_results['pca'].components_[:n_pcs] / np.linalg.norm(pca_results['pca'].components_[:n_pcs], axis=1, keepdims=True)
    cosine_sims = normalized_acts.float().numpy() @ normalized_pcs.T
    return cosine_sims

def pc_projection(mean_acts_per_turn, pca_results, n_pcs=8):
    if isinstance(mean_acts_per_turn, list):
        stacked_acts = torch.stack(mean_acts_per_turn)
    else:
        stacked_acts = mean_acts_per_turn
    stacked_acts = stacked_acts.float().numpy()
    scaled_acts = pca_results['scaler'].transform(stacked_acts)
    projected_acts = pca_results['pca'].transform(scaled_acts)
    return projected_acts[:, :n_pcs]
    

    

In [7]:
# load every activation file in the directory if the name matches a regex
all_objs = []
for file in os.listdir(acts_input_dir):
    if file.endswith('.pt'):
        obj = torch.load(f"{acts_input_dir}/{file}", weights_only=False, map_location="cpu")
        A = obj['activations'][:, layer, :]
        A = A[1::2]

        role_sims = pc_cosine_similarity(A, role_results, None)
        trait_sims = pc_cosine_similarity(A, trait_results, None)
        print(role_sims.shape)
        print(trait_sims.shape)

        role_projs = pc_projection(A, role_results, None)
        trait_projs = pc_projection(A, trait_results, None)

        obj['role_sims'] = role_sims
        obj['trait_sims'] = trait_sims
        obj['role_projs'] = role_projs
        obj['trait_projs'] = trait_projs

        all_objs.append(obj)

        torch.save(obj, f"{acts_input_dir}/projected/{file}")
# for file in os.listdir(f"{acts_input_dir}/projected"):

#     obj = torch.load(f"{acts_input_dir}/projected/{file}", weights_only=False, map_location="cpu")
#     all_objs.append(obj)



(14, 463)
(14, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(7, 463)
(7, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(8, 463)
(8, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(9, 463)
(9, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(5, 463)
(5, 240)
(11, 463)
(11, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(5, 463)
(5, 240)
(14, 463)
(14, 240)
(10, 463)
(10, 240)
(8, 463)
(8, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(8, 463)
(8, 240)
(13, 463)
(13, 240)
(14, 463)
(14, 240)
(10, 463)
(10, 240)
(14, 463)
(14, 240)
(8, 463)
(8, 240)
(8, 463)
(8, 240)
(9, 463)
(9, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(7, 463)
(7, 240)
(14, 463)
(14, 240)
(5, 463)
(5, 240)
(9, 463)
(9, 240)
(9, 463)
(9, 240)
(13, 463)
(13, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(12, 463

In [8]:
all_role_projs = [obj['role_projs'] for obj in all_objs]
all_trait_projs = [obj['trait_projs'] for obj in all_objs]


In [9]:
sliced_role_projs = [obj[:, :components] for obj in all_role_projs]
sliced_trait_projs = [obj[:, :components] for obj in all_trait_projs]


In [10]:
files = [f"{obj['domain']}_persona{obj['persona_id']}_topic{obj['topic_id']}" for obj in all_objs]
print(files[:10])

['writing_persona9_topic9', 'writing_persona9_topic8', 'writing_persona9_topic7', 'writing_persona9_topic6', 'writing_persona9_topic5', 'writing_persona9_topic4', 'writing_persona9_topic3', 'writing_persona9_topic2', 'writing_persona9_topic19', 'writing_persona9_topic18']


In [11]:
print(obj.keys())

dict_keys(['model', 'auditor_model', 'domain', 'persona_id', 'persona', 'topic_id', 'topic', 'turns', 'conversation', 'activations', 'role_sims', 'trait_sims', 'role_projs', 'trait_projs'])


In [12]:
df = pd.DataFrame()
df['filenames'] = files


In [13]:
# calculate the cumulative path length for each conversation
role_path_lengths = [np.sum(np.linalg.norm(np.diff(obj['role_projs'], axis=0), axis=1)) for obj in all_objs]
trait_path_lengths = [np.sum(np.linalg.norm(np.diff(obj['trait_projs'], axis=0), axis=1)) for obj in all_objs]

df['role_path_length'] = role_path_lengths
df['trait_path_length'] = trait_path_lengths

In [None]:
# load csv from file into df
df = pd.read_csv(f"./path_lengths.csv")

In [14]:
# also get largest single jump
role_max_jump = [np.max(np.linalg.norm(np.diff(obj['role_projs'], axis=0), axis=1)) for obj in all_objs]
trait_max_jump = [np.max(np.linalg.norm(np.diff(obj['trait_projs'], axis=0), axis=1)) for obj in all_objs]
df['role_max_jump'] = role_max_jump
df['trait_max_jump'] = trait_max_jump



In [15]:
# role pc1 biggest jump and distance
role_pc1_path_lengths = [
    np.sum(np.abs(np.diff(obj['role_projs'][:, 0])))  # Only PC1 (column 0)
    for obj in all_objs
]

role_pc1_max_jumps = [
    np.max(np.abs(np.diff(obj['role_projs'][:, 0])))  # Max jump in PC1
    for obj in all_objs
]

df['role_pc1_path_length'] = role_pc1_path_lengths
df['role_pc1_max_jump'] = role_pc1_max_jumps


df.to_csv(f"./stats.csv", index=False)

In [16]:
pc1 = all_objs[files.index("therapy_persona10_topic19")]['role_projs'][:, 0]
print(pc1)

[-15.46315117 -13.0795536    5.49008361   4.20218061   3.55202424
  -7.76324587  -9.99787456  -8.80170644  -7.14331595  12.63595519
  -8.40975517   3.94093019   1.6871792   13.69682304]


## Getting roles with target projections on role PC1

In [56]:
dir = '/workspace/qwen-3-32b/roles_240'
pca_results = torch.load(f'{dir}/pca/layer{layer}_pos23.pt', weights_only=False)
pca_transformed = pca_results['pca_transformed']

In [57]:
def get_role_labels(pca_results):
    labels = []
    if 'pos_2' in pca_results['roles'].keys():
        labels.extend(pca_results['roles']['pos_2'])
    if 'pos_3' in pca_results['roles'].keys():
        labels.extend(pca_results['roles']['pos_3'])
    return labels

def get_trait_labels(pca_results):
    labels = []
    if 'pos_neg' in pca_results['traits'].keys():
        labels.extend(pca_results['traits']['pos_neg'])
    if 'pos_neg_50' in pca_results['traits'].keys():
        labels.extend(pca_results['traits']['pos_neg_50'])
    return labels

labels = get_role_labels(pca_results)
print(len(labels))

463


In [58]:
def sorted_by_pc(pc_index):
    df = pd.DataFrame({
        "label": labels,
        "projection": pca_transformed[:, pc_index],
        "score": ["pos_2"] * len(pca_results['roles']['pos_2']) + ["pos_3"] * len(pca_results['roles']['pos_3']) 
    })
    df_sorted = df.sort_values(by="projection", ascending=True)
    return df_sorted

In [59]:
pc_df = sorted_by_pc(0)
#pc_df.to_csv("/root/git/persona-subspace/dynamics/results/qwen-3-32b/roles_pc1.csv", index=False)

In [20]:
pc_df['projection'].dtype


dtype('float64')

In [65]:
target_roles = []
target_projections = []

for i in range(-38, 102, 26):
    print(i)
    # get the index of the label that is closest to i
    index = np.argmin(np.abs(pc_df['projection'] - i))
    #print the label and the projection
    print(pc_df['label'].iloc[index], pc_df['projection'].iloc[index], pc_df['score'].iloc[index])
    target_roles.append(pc_df['label'].iloc[index])
    target_projections.append(i)
print(target_roles)


-38
validator -37.98302365995255 pos_3
-12
podcaster -11.917165905106012 pos_2
14
visionary 14.000623879678056 pos_3
40
fool 40.05872476576472 pos_3
66
whale 66.41810071794757 pos_3
92
leviathan 92.81104297322707 pos_3
['validator', 'podcaster', 'visionary', 'fool', 'whale', 'leviathan']


In [66]:
target_roles[3] = "narcissist"
print(target_roles)

['validator', 'podcaster', 'visionary', 'narcissist', 'whale', 'leviathan']


In [67]:
# collect all sorted dataframes to calculate average positions
all_sorted_dataframes = []
position_data = {}

# for each target role, load in the activations and project them onto pc1
for role, proj in zip(target_roles, target_projections):
    print(role, proj)

    acts = torch.load(f"/workspace/qwen-3-32b/roles_240/response_activations/{role}.pt", weights_only=False)

    stacked_acts = torch.stack([v[layer, :] for v in acts.values()])
    role_projs = pc_projection(stacked_acts, pca_results, 1)

    role_df = pd.DataFrame({
        "label": acts.keys(),
        "projection": role_projs.squeeze(),
    })

    role_df['distance'] = np.abs(role_df['projection'] - proj)

    # sort the question index by closest to the target projection
    role_df = role_df.sort_values(by='distance')

    # store the sorted dataframe
    all_sorted_dataframes.append(role_df)

    # record position of each label in this sorted dataframe
    for idx, label in enumerate(role_df['label']):
        if label not in position_data:
            position_data[label] = []
        position_data[label].append(idx)

    # print the first 4 rows of the sorted dataframe
    print(role_df.head(4))
    role_df.to_csv(f"/root/git/persona-subspace/dynamics/results/qwen-3-32b/prefills/{role}_pc1.csv", index=False)


validator -38
            label  projection  distance
779    pos_p3_q59  -37.997406  0.002594
594   pos_p2_q114  -37.995072  0.004928
1090  pos_p4_q130  -38.031120  0.031120
1079  pos_p4_q119  -38.047506  0.047506
podcaster -12
           label  projection  distance
424  pos_p1_q184  -12.018051  0.018051
857  pos_p3_q137  -12.032356  0.032356
34    pos_p0_q34  -12.033337  0.033337
362  pos_p1_q122  -11.966280  0.033720
visionary 14
           label  projection  distance
396  pos_p1_q156   14.066310  0.066310
436  pos_p1_q196   13.893611  0.106389
139  pos_p0_q139   14.111792  0.111792
609  pos_p2_q129   13.839211  0.160789
narcissist 40
            label  projection  distance
405   pos_p1_q165   40.005305  0.005305
423   pos_p1_q183   39.980503  0.019497
1153  pos_p4_q193   39.970121  0.029879
421   pos_p1_q181   39.970005  0.029995
whale 66
            label  projection  distance
215   pos_p0_q215   65.923039  0.076961
1071  pos_p4_q111   65.900430  0.099570
638   pos_p2_q158   65.816

In [None]:
# collect distance data for each label per role
distance_data = {}

# for each dataframe, record the distance for each label  
for i, role_df in enumerate(all_sorted_dataframes):
    role = target_roles[i]
    for _, row in role_df.iterrows():
        label = row['label']
        distance = row['distance']
        
        if label not in distance_data:
            distance_data[label] = {}
        distance_data[label][role] = distance

# calculate average position and MSE for each label
average_positions = {}
mse_scores = {}

for label, positions in position_data.items():
    average_positions[label] = np.mean(positions)
    
    # calculate MSE across role distances
    if label in distance_data:
        distances = []
        for role in target_roles:
            if role in distance_data[label]:
                distances.append(distance_data[label][role])
        
        if distances:
            mse_scores[label] = np.mean(np.array(distances) ** 2)
        else:
            mse_scores[label] = np.inf
    else:
        mse_scores[label] = np.inf

# create dataframe with average position, MSE, and distance per role
rows = []
for label in average_positions.keys():
    row_data = {
        "label": label, 
        "average_position": average_positions[label],
        "mse_distance": mse_scores[label]
    }
    
    # add distance for each role
    for role in target_roles:
        if label in distance_data and role in distance_data[label]:
            row_data[f"{role}_distance"] = distance_data[label][role]
        else:
            row_data[f"{role}_distance"] = np.nan
    
    rows.append(row_data)

average_position_df = pd.DataFrame(rows)

# sort by MSE instead of average position
average_position_df = average_position_df.sort_values(by='mse_distance')


print("\nLabels sorted by average position across all dataframes:")
print(average_position_df)

In [69]:
# group by question_id and find minimum distance for each role across prompt_ids
import re



question_data = {}

# parse labels and group by question_id
for label in distance_data.keys():
    # extract prompt_id and question_id from label format pos_p{prompt_id}_q{question_id}
    match = re.match(r'pos_p(\d+)_q(\d+)', label)
    if match:
        prompt_id = int(match.group(1))
        question_id = int(match.group(2))
        
        if question_id not in question_data:
            question_data[question_id] = {}
        
        # store distance data for this prompt_id, question_id combination
        question_data[question_id][prompt_id] = {
            'label': label,
            'distances': distance_data[label]
        }

# for each question_id, find the best prompt_id for each role, then calculate MSE
question_results = []

for question_id, prompt_data in question_data.items():
    role_best_distances = {}
    role_best_prompts = {}
    
    # for each role, find the prompt_id that gives minimum distance for this question_id
    for role in target_roles:
        best_distance = np.inf
        best_prompt_id = None
        
        for prompt_id, data in prompt_data.items():
            if role in data['distances']:
                distance = data['distances'][role]
                if distance < best_distance:
                    best_distance = distance
                    best_prompt_id = prompt_id
        
        if best_prompt_id is not None:
            role_best_distances[role] = best_distance
            role_best_prompts[role] = best_prompt_id
    
    # calculate MSE across the best distances for each role
    if role_best_distances:
        distances = list(role_best_distances.values())
        mse = np.mean(np.array(distances) ** 2)
        
        row_data = {
            'question_id': question_id,
            'mse_distance': mse
        }
        
        # add individual role distances and their best prompt_ids
        for role in target_roles:
            if role in role_best_distances:
                row_data[f"{role}_distance"] = role_best_distances[role]
                row_data[f"{role}_best_prompt"] = role_best_prompts[role]
            else:
                row_data[f"{role}_distance"] = np.nan
                row_data[f"{role}_best_prompt"] = np.nan
        
        question_results.append(row_data)

# create dataframe and sort by MSE
question_df = pd.DataFrame(question_results)
question_df = question_df.sort_values(by='mse_distance')

question_df.to_csv(f"./results/qwen-3-32b/prefills/roles_pc1_questions.csv", index=False)

print("\nQuestions sorted by MSE distance (best prompt per role per question):")
print(question_df.head(20))


Questions sorted by MSE distance (best prompt per role per question):
     question_id  mse_distance  validator_distance  validator_best_prompt  \
35           132      1.590810            0.427428                      2   
137           42      1.715367            2.395581                      0   
86           217      1.804398            0.974830                      2   
64           133      2.058864            0.737193                      0   
100          111      2.079271            1.226899                      4   
29           120      2.478459            0.358391                      4   
52            31      3.133498            0.609123                      0   
139          170      3.680787            2.455120                      0   
9            139      3.814337            0.074946                      4   
129           23      3.928335            2.036972                      0   
11           165      4.089504            0.103096                      0   
162  

In [70]:
# get the best question_id and create jsonl output
import json

# get the best question_id (lowest MSE)
best_question_id = int(question_df.iloc[0]['question_id'])
print(f"Best question_id: {best_question_id}")

# collect data for the best question across all roles and their best prompts
jsonl_data = []

for i, role in enumerate(target_roles):
    best_prompt_col = f"{role}_best_prompt"
    best_prompt_id = question_df.iloc[0][best_prompt_col]
    
    if pd.notna(best_prompt_id):
        best_prompt_id = int(best_prompt_id)
        label = f"pos_p{best_prompt_id}_q{best_question_id}"
        
        # find the projection for this specific label
        acts = torch.load(f"/workspace/qwen-3-32b/roles_240/response_activations/{role}.pt", weights_only=False)
        pc1_projection = pc_projection(acts[label][layer, :].reshape(1, -1), pca_results, 1).squeeze()

        # get the question text
        target_obj = None
        with open(f"/workspace/qwen-3-32b/roles_240/responses/{role}.jsonl", "r") as f:
            for line in f:
                obj = json.loads(line)
                if obj['prompt_index'] == best_prompt_id and obj['question_index'] == best_question_id:
                    target_obj = obj

        # create the jsonl entry
        entry = {
            'id': i,
            'role': role,
            'pc1': float(pc1_projection),
            'question_index': best_question_id,
            'prompt_index': best_prompt_id,
            'label': label,
            'conversation': target_obj['conversation']
        }
        
        jsonl_data.append(entry)
        print(f"Role: {role}, Prompt: {best_prompt_id}, PC1: {pc1_projection:.6f}, Label: {label}")

# save to jsonl file
output_file = f"./results/qwen-3-32b/prefills/role_pc1_prefills_q{best_question_id}.jsonl"
with open(output_file, 'w') as f:
    for entry in jsonl_data:
        f.write(json.dumps(entry) + '\n')

print(f"\nSaved {len(jsonl_data)} entries to {output_file}")
print(f"\nBest question ID {best_question_id} has MSE: {question_df.iloc[0]['mse_distance']:.6f}")

Best question_id: 132
Role: validator, Prompt: 2, PC1: -37.572572, Label: pos_p2_q132
Role: podcaster, Prompt: 0, PC1: -10.806745, Label: pos_p0_q132
Role: visionary, Prompt: 2, PC1: 15.177850, Label: pos_p2_q132
Role: narcissist, Prompt: 4, PC1: 40.991084, Label: pos_p4_q132
Role: whale, Prompt: 2, PC1: 68.348354, Label: pos_p2_q132
Role: leviathan, Prompt: 4, PC1: 92.232296, Label: pos_p4_q132

Saved 6 entries to ./results/qwen-3-32b/prefills/role_pc1_prefills_q132.jsonl

Best question ID 132 has MSE: 1.590810


In [None]:
columns = {}
for i in range(6):
    pc_df = sorted_by_pc(i)
    columns[f"pc{i+1}"] = pc_df["label"].values  # Use .values to get numpy array
    print(f"PC{i+1} first 40: {pc_df['label'].head(40).tolist()}")  # Debug print
    print(f"PC{i+1} last 40: {pc_df['label'].tail(40).tolist()}")  # Debug print

df = pd.DataFrame(columns)
df.to_csv("/root/git/persona-subspace/dynamics/results/qwen-3-32b/traits_sorted.csv", index=False)

## Plot trajectory