# Trajectories in role + trait space

In [1]:
import json
import os
import sys
import torch
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.subplots as sp
import torch.nn.functional as F
import re

sys.path.append('.')
sys.path.append('..')

from utils.inference_utils import *
from utils.probing_utils import *


INFO 09-18 23:45:08 [__init__.py:216] Automatically detected platform cuda.


In [2]:
# Model under study and the layer index used for the PCA files and activations below.
CHAT_MODEL_NAME = "Qwen/Qwen3-32B"
model_readable = "Qwen 3 32B"
# Lower-cased, dash-separated slug used to build workspace paths.
model_short = "-".join(model_readable.lower().split(" "))
layer = 32


In [3]:
# filename = "auto40"
# Number of leading principal components kept when the projections are
# sliced further down (`[:, :components]`).
components = 6

In [4]:
# Sub-directories of the model's workspace folder that hold the role and
# trait PCA results loaded below.
role_dir = "roles_240" 
trait_dir = "traits_240"

# Directory scanned for per-conversation activation files (*.pt).
acts_input_dir = f"/workspace/{model_short}/dynamics"
# Presumably where figures are written (not used in the cells visible here);
# the directory is created up front if it does not exist.
plot_output_dir = f"/root/git/plots/{model_short}/trajectory"
os.makedirs(plot_output_dir, exist_ok=True)

## Load and project all conversation activations 

In [5]:
# Load the saved PCA results for the role and trait spaces at `layer`.
# These objects are later indexed with the keys 'pca' and 'scaler'.
# NOTE(review): weights_only=False unpickles arbitrary Python objects, so
# only load files from a trusted source.
role_results = torch.load(f"/workspace/{model_short}/{role_dir}/pca/layer{layer}_pos23.pt", weights_only=False)
trait_results = torch.load(f"/workspace/{model_short}/{trait_dir}/pca/layer{layer}_pos-neg50.pt", weights_only=False)


In [6]:
def pc_cosine_similarity(mean_acts_per_turn, pca_results, n_pcs=8):
    """Cosine similarity between each activation vector and the leading PCs.

    The activations are used as-is: the 'scaler' stored in ``pca_results`` is
    not applied here (unlike ``pc_projection``).

    Args:
        mean_acts_per_turn: 2-D tensor with one activation vector per row, or
            a list of 1-D tensors that is stacked into such a tensor.
        pca_results: Mapping whose ``'pca'`` entry exposes ``components_``,
            an array with one principal component per row.
        n_pcs: Number of leading components to compare against. ``None``
            uses every component.

    Returns:
        NumPy array of shape ``(n_rows, n_components_used)``.
    """
    if isinstance(mean_acts_per_turn, list):
        stacked_acts = torch.stack(mean_acts_per_turn)
    else:
        stacked_acts = mean_acts_per_turn

    # Upcast *before* normalising: dividing by the norm in fp16/bf16 rounds
    # every coordinate to half precision and the error ends up in the result.
    # detach()/cpu() are no-ops for plain CPU tensors and let GPU or
    # grad-tracking tensors through instead of failing in .numpy().
    acts_f32 = stacked_acts.detach().cpu().float()
    normalized_acts = F.normalize(acts_f32, dim=1).numpy()

    # Slice once and reuse for both the numerator and the row norms.
    components = pca_results['pca'].components_[:n_pcs]
    normalized_pcs = components / np.linalg.norm(components, axis=1, keepdims=True)

    cosine_sims = normalized_acts @ normalized_pcs.T
    return cosine_sims

def pc_projection(mean_acts_per_turn, pca_results, n_pcs=8):
    """Project activations into PCA space and keep the leading coordinates.

    Args:
        mean_acts_per_turn: 2-D tensor (one activation vector per row) or a
            list of 1-D tensors to be stacked.
        pca_results: Mapping with a ``'scaler'`` and a ``'pca'`` entry, each
            providing a ``transform`` method.
        n_pcs: Number of leading columns of the projection to return;
            ``None`` returns all of them.

    Returns:
        The first ``n_pcs`` columns of ``pca.transform(scaler.transform(acts))``.
    """
    acts = (
        torch.stack(mean_acts_per_turn)
        if isinstance(mean_acts_per_turn, list)
        else mean_acts_per_turn
    )
    # The stored scaler is applied first, then the PCA transform.
    scaled = pca_results['scaler'].transform(acts.float().numpy())
    return pca_results['pca'].transform(scaled)[:, :n_pcs]
    

    

In [7]:
# Load every *.pt activation file in `acts_input_dir`, attach cosine
# similarities and PCA projections for the role and trait spaces, and cache
# the augmented object under `<acts_input_dir>/projected/`.
projected_dir = os.path.join(acts_input_dir, "projected")
# torch.save does not create missing parent directories, so make sure the
# cache folder exists before the first write.
os.makedirs(projected_dir, exist_ok=True)

all_objs = []
# NOTE: os.listdir returns entries in arbitrary order; all_objs (and every
# list derived from it) follows that order.
for file in os.listdir(acts_input_dir):
    if file.endswith('.pt'):
        # NOTE(review): weights_only=False unpickles arbitrary objects — only
        # load trusted files.
        obj = torch.load(os.path.join(acts_input_dir, file), weights_only=False, map_location="cpu")
        # Select the activations at `layer` (second axis).
        A = obj['activations'][:, layer, :]
        # Keep every second row starting at index 1 — presumably the
        # assistant turns; TODO confirm against the extraction code.
        A = A[1::2]

        # n_pcs=None keeps all principal components.
        role_sims = pc_cosine_similarity(A, role_results, None)
        trait_sims = pc_cosine_similarity(A, trait_results, None)
        print(role_sims.shape)
        print(trait_sims.shape)

        role_projs = pc_projection(A, role_results, None)
        trait_projs = pc_projection(A, trait_results, None)

        obj['role_sims'] = role_sims
        obj['trait_sims'] = trait_sims
        obj['role_projs'] = role_projs
        obj['trait_projs'] = trait_projs

        all_objs.append(obj)

        torch.save(obj, os.path.join(projected_dir, file))

# Alternative: reload the cached, already-projected objects instead of
# recomputing them.
# for file in os.listdir(f"{acts_input_dir}/projected"):
#     obj = torch.load(f"{acts_input_dir}/projected/{file}", weights_only=False, map_location="cpu")
#     all_objs.append(obj)



(14, 463)
(14, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(7, 463)
(7, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(8, 463)
(8, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(9, 463)
(9, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(5, 463)
(5, 240)
(11, 463)
(11, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(5, 463)
(5, 240)
(14, 463)
(14, 240)
(10, 463)
(10, 240)
(8, 463)
(8, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(8, 463)
(8, 240)
(13, 463)
(13, 240)
(14, 463)
(14, 240)
(10, 463)
(10, 240)
(14, 463)
(14, 240)
(8, 463)
(8, 240)
(8, 463)
(8, 240)
(9, 463)
(9, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(7, 463)
(7, 240)
(14, 463)
(14, 240)
(5, 463)
(5, 240)
(9, 463)
(9, 240)
(9, 463)
(9, 240)
(13, 463)
(13, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(14, 463)
(14, 240)
(12, 463

In [8]:
# Gather the per-conversation projection matrices, in the same order as all_objs.
all_role_projs, all_trait_projs = [], []
for conv in all_objs:
    all_role_projs.append(conv['role_projs'])
    all_trait_projs.append(conv['trait_projs'])


In [9]:
# Keep only the first `components` columns (leading PCs) of every projection.
leading_pcs = slice(None, components)
sliced_role_projs = [projs[:, leading_pcs] for projs in all_role_projs]
sliced_trait_projs = [projs[:, leading_pcs] for projs in all_trait_projs]


In [10]:
# One label per conversation, built from its domain, persona id and topic id.
files = []
for conv in all_objs:
    files.append(f"{conv['domain']}_persona{conv['persona_id']}_topic{conv['topic_id']}")
print(files[:10])

['writing_persona9_topic9', 'writing_persona9_topic8', 'writing_persona9_topic7', 'writing_persona9_topic6', 'writing_persona9_topic5', 'writing_persona9_topic4', 'writing_persona9_topic3', 'writing_persona9_topic2', 'writing_persona9_topic19', 'writing_persona9_topic18']


In [11]:
# Inspect the fields stored for a conversation. Use the last loaded object
# explicitly instead of relying on the loop variable `obj` leaking out of the
# loading cell.
print(all_objs[-1].keys())

dict_keys(['model', 'auditor_model', 'domain', 'persona_id', 'persona', 'topic_id', 'topic', 'turns', 'conversation', 'activations', 'role_sims', 'trait_sims', 'role_projs', 'trait_projs'])


In [12]:
# Summary table with one row per conversation, keyed by its label.
df = pd.DataFrame({'filenames': files})


In [13]:
# Cumulative path length per conversation: the sum of Euclidean distances
# between consecutive rows of the (unsliced) projection matrix.
def _path_length(projs):
    """Return the summed Euclidean step size between consecutive rows."""
    steps = np.diff(projs, axis=0)
    return np.sum(np.linalg.norm(steps, axis=1))

role_path_lengths = [_path_length(conv['role_projs']) for conv in all_objs]
trait_path_lengths = [_path_length(conv['trait_projs']) for conv in all_objs]

df['role_path_length'] = role_path_lengths
df['trait_path_length'] = trait_path_lengths

In [None]:
# load csv from file into df
# NOTE(review): this rebinds `df`, discarding the frame built in the cells
# above. The cell has no execution count, so it was not run in the saved
# session. The column assignments in the following cells need the CSV's row
# count to equal len(all_objs). The notebook writes "./stats.csv" further
# down, not this file — confirm which file is the intended cache.
df = pd.read_csv(f"./path_lengths.csv")

In [14]:
# Largest single step (Euclidean distance between consecutive rows of the
# projection matrix) for each conversation.
def _max_jump(projs):
    """Return the largest Euclidean step between consecutive rows of `projs`."""
    step_sizes = np.linalg.norm(np.diff(projs, axis=0), axis=1)
    # A conversation with a single row has no steps; plain np.max raises on
    # an empty array. Norms are non-negative, so initial=0.0 never changes a
    # non-empty result and yields 0 for the empty case (matching the path
    # length, which sums to 0 there).
    return np.max(step_sizes, initial=0.0)

role_max_jump = [_max_jump(obj['role_projs']) for obj in all_objs]
trait_max_jump = [_max_jump(obj['trait_projs']) for obj in all_objs]
df['role_max_jump'] = role_max_jump
df['trait_max_jump'] = trait_max_jump



In [15]:
# Path length and largest single step along role PC1 only (column 0).
role_pc1_steps = [
    np.abs(np.diff(obj['role_projs'][:, 0]))  # absolute change in PC1 between consecutive rows
    for obj in all_objs
]

role_pc1_path_lengths = [np.sum(steps) for steps in role_pc1_steps]

# initial=0.0 keeps np.max from raising when a conversation has a single row
# (no steps); absolute differences are non-negative, so non-empty results are
# unchanged.
role_pc1_max_jumps = [np.max(steps, initial=0.0) for steps in role_pc1_steps]

df['role_pc1_path_length'] = role_pc1_path_lengths
df['role_pc1_max_jump'] = role_pc1_max_jumps


# Persist the per-conversation statistics next to the notebook.
df.to_csv("./stats.csv", index=False)

In [16]:
# Role-PC1 trajectory (column 0 of the role projections) for one conversation,
# looked up by its label.
example_idx = files.index("therapy_persona10_topic19")
pc1 = all_objs[example_idx]['role_projs'][:, 0]
print(pc1)

[-15.46315117 -13.0795536    5.49008361   4.20218061   3.55202424
  -7.76324587  -9.99787456  -8.80170644  -7.14331595  12.63595519
  -8.40975517   3.94093019   1.6871792   13.69682304]


## Cosine similarity with trait and role PCs

## Plot trajectory