# Trajectories of role + trait space

In [2]:
import json
import os
import sys
import torch
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.subplots as sp
import torch.nn.functional as F

sys.path.append('.')
sys.path.append('..')

from utils.inference_utils import *
from utils.probing_utils import *


INFO 09-10 10:24:21 [__init__.py:235] Automatically detected platform cuda.


In [3]:
# Notebook configuration: model identifiers, PCA artifact folders, analysis settings.

# Model identifier handed to load_model, and the short name used in on-disk paths / plot subtitles
CHAT_MODEL_NAME, model_short = "Qwen/Qwen3-32B", "qwen-3-32b"

# Sub-directories (under /workspace/<model_short>/) that hold the role / trait PCA results
role_dir, trait_dir = "roles_240", "traits_240"

# Layer index used for activation extraction and for locating the PCA files
layer = 32

# Number of leading principal components to analyse and plot
components = 6

In [4]:
# Base name (without extension) of the conversation transcript to analyse;
# it is interpolated into the input .json path, the saved .pt path, and the plot titles.
filename = "philosophy"

In [5]:
# Load the recorded conversation transcript selected by `filename`.
# NOTE(review): the absolute path is machine-specific; consider a configurable base directory.
conversation_file = f"/root/git/persona-subspace/dynamics/results/{model_short}/interactive/{filename}.json"

# Use a context manager so the file handle is closed deterministically
# (json.load(open(...)) leaves it to the garbage collector).
with open(conversation_file, encoding="utf-8") as conversation_fh:
    conversation_obj = json.load(conversation_fh)

# The chat turns themselves; later cells read each turn's 'role' and 'content' keys.
conversation = conversation_obj['conversation']

## Get conversation activations

In [5]:
# Load the chat model and its tokenizer through the project helper `load_model`
# (brought into scope by the star-imports at the top; presumably utils.inference_utils — verify).
model, tokenizer = load_model(CHAT_MODEL_NAME)

Loading checkpoint shards:   0%|          | 0/17 [00:00<?, ?it/s]

In [6]:
# Extra keyword arguments forwarded to mean_response_activation_per_turn;
# the only option set is enable_thinking=False.
chat_kwargs = {'enable_thinking': False}


In [7]:
# Collect activations for the conversation at the configured `layer`.
# NOTE(review): the return structure is defined by extract_full_activations (not visible here).
activations = extract_full_activations(model, tokenizer, conversation, layer=layer)
# Reduce to one mean activation per turn — presumably averaged over the assistant's
# response tokens; confirm in the helper. chat_kwargs is forwarded as keyword arguments.
mean_acts_per_turn = mean_response_activation_per_turn(activations, conversation, tokenizer, CHAT_MODEL_NAME, **chat_kwargs)

In [8]:
# Sanity check: number of per-turn activations and the shape of the first one
print(len(mean_acts_per_turn))
print(mean_acts_per_turn[0].shape)

# Drop the leading singleton dimension of every per-turn activation, then re-check the shape
mean_acts_per_turn = list(map(lambda turn_act: turn_act.squeeze(0), mean_acts_per_turn))
print(mean_acts_per_turn[0].shape)

19
torch.Size([1, 5120])
torch.Size([5120])


In [9]:
# Persist the transcript object together with its per-turn mean activations so the
# analysis cells can reload them from disk.
result = {
    'object': conversation_obj,
    'activations': mean_acts_per_turn,
}
transcript_path = f"/workspace/{model_short}/transcripts/{filename}.pt"

# torch.save does not create missing parent directories, so make sure the folder exists first.
os.makedirs(os.path.dirname(transcript_path), exist_ok=True)
torch.save(result, transcript_path)

## Cosine similarity with trait and role PCs

In [None]:
# Load the role / trait PCA results and the saved per-turn activations for `filename`.
# NOTE(review): weights_only=False makes torch.load unpickle arbitrary Python objects
# (later cells read the 'pca' and 'scaler' entries of these results); only load trusted files.
role_results = torch.load(f"/workspace/{model_short}/{role_dir}/pca/layer{layer}_pos23.pt", weights_only=False)
trait_results = torch.load(f"/workspace/{model_short}/{trait_dir}/pca/layer{layer}_pos-neg50.pt", weights_only=False)
activations_obj = torch.load(f"/workspace/{model_short}/transcripts/{filename}.pt", weights_only=False)
# Per-turn mean activations stored under the 'activations' key by the save cell
mean_acts_per_turn = activations_obj['activations']

In [23]:
def pc_cosine_similarity(mean_acts_per_turn, pca_results, n_pcs=8):
    """
    Cosine similarity between each turn's mean activation and the leading PCs.

    The raw activations are compared against the PC directions directly; the
    'scaler' entry of pca_results is not applied here.

    Parameters:
    - mean_acts_per_turn: Sequence of torch tensors, one per turn (expected to be 1-D and of equal length)
    - pca_results: Mapping whose 'pca' entry exposes a `components_` array with one row per component
    - n_pcs: Number of leading components to compare against

    Returns:
    - Numpy array with one row per turn and one column per selected component
    """
    # Upcast *before* normalising: activations may be stored in half precision
    # (e.g. bfloat16), and normalising there bakes rounding error into the result.
    # detach()/cpu() keep the numpy conversion valid for grad-tracking or GPU tensors.
    stacked_acts = torch.stack(list(mean_acts_per_turn)).detach().cpu().float()
    normalized_acts = F.normalize(stacked_acts, dim=1).numpy()

    # Re-normalise the selected component rows so the dot product is a true cosine.
    leading_pcs = np.asarray(pca_results['pca'].components_[:n_pcs])
    normalized_pcs = leading_pcs / np.linalg.norm(leading_pcs, axis=1, keepdims=True)

    return normalized_acts @ normalized_pcs.T

def pc_projection(mean_acts_per_turn, pca_results, n_pcs=8):
    """
    Project each turn's mean activation into PCA space.

    The stacked activations are passed through pca_results['scaler'].transform and
    then pca_results['pca'].transform; only the first n_pcs columns are returned.
    """
    acts_matrix = torch.stack(mean_acts_per_turn)
    acts_matrix = acts_matrix.float().numpy()
    rescaled = pca_results['scaler'].transform(acts_matrix)
    all_coordinates = pca_results['pca'].transform(rescaled)
    leading_coordinates = all_coordinates[:, :n_pcs]
    return leading_coordinates
    
    

In [25]:
# Cosine similarity of every turn with the first `components` role PCs and trait PCs
role_sims, trait_sims = (
    pc_cosine_similarity(mean_acts_per_turn, pca_bundle, n_pcs=components)
    for pca_bundle in (role_results, trait_results)
)
print(role_sims.shape)

(19, 6)


In [None]:
# Coordinates of every turn along the first `components` role PCs and trait PCs
role_projs, trait_projs = (
    pc_projection(mean_acts_per_turn, pca_bundle, n_pcs=components)
    for pca_bundle in (role_results, trait_results)
)
print(role_projs.shape)


(19, 6)
(19, 6)


## Plot trajectory

In [18]:
def _wrap_hover_text(text, max_chars_per_line=70):
    """Greedily wrap text at word boundaries, joining the lines with HTML <br> tags."""
    if len(text) <= max_chars_per_line:
        return text

    lines = []
    current_line = []
    current_length = 0  # characters of the words in current_line, spaces excluded

    for word in text.split():
        # len(current_line) equals the number of spaces needed once `word` is appended
        if current_length + len(word) + len(current_line) > max_chars_per_line:
            if current_line:  # Don't add empty lines
                lines.append(' '.join(current_line))
            current_line = [word]
            current_length = len(word)
        else:
            current_line.append(word)
            current_length += len(word)

    if current_line:  # Add the last line
        lines.append(' '.join(current_line))

    return '<br>'.join(lines)


def _truncate_with_ellipsis(text, max_chars):
    """Cut text to max_chars characters, appending '...' only when something was removed."""
    return text[:max_chars] + "..." if len(text) > max_chars else text


def _build_turn_hover_texts(n_turns, conversation):
    """Build one hover string per plotted turn (turn label, preceding user message, assistant reply)."""
    if conversation is None:
        return [f"<b>Turn {turn_idx}</b>" for turn_idx in range(n_turns)]

    # Row k of the plotted matrix is matched to the k-th assistant message.
    assistant_positions = [i for i, turn in enumerate(conversation) if turn['role'] == 'assistant']

    hover_texts = []
    for turn_idx in range(n_turns):
        hover_parts = [f"<b>Turn {turn_idx}</b>"]

        if turn_idx < len(assistant_positions):
            conv_turn_idx = assistant_positions[turn_idx]

            # Preceding user message, if the assistant turn directly follows one
            if conv_turn_idx > 0 and conversation[conv_turn_idx - 1]['role'] == 'user':
                user_content = conversation[conv_turn_idx - 1]['content']
                if user_content:
                    user_wrapped = _wrap_hover_text(_truncate_with_ellipsis(user_content, 200), 70)
                    hover_parts.append(f"<b>User:</b> {user_wrapped}")

            # Same limit for the cut and for the ellipsis, so "..." never appears on untruncated text
            assistant_content = conversation[conv_turn_idx]['content']
            assistant_wrapped = _wrap_hover_text(_truncate_with_ellipsis(assistant_content, 300), 70)
            hover_parts.append(f"<b>Assistant:</b> {assistant_wrapped}")

        hover_texts.append('<br>'.join(hover_parts))

    return hover_texts


def plot_mean_response_trajectory(similarity_matrix, conversation=None, title=None, pc_titles=None, projection=False, subtitle=None):
    """
    Create a single line plot showing mean response per turn.

    Parameters:
    - similarity_matrix: Numpy matrix of shape (n_turns, n_pcs)
    - conversation: Optional list of turn dicts with 'role' and 'content' keys, used for hover text;
      the k-th assistant message is matched to row k of the matrix
    - title: Optional custom title
    - pc_titles: Optional list of PC titles; missing entries fall back to "PC<k>"
    - projection: If True, label the values as PC projections instead of cosine similarities
    - subtitle: Optional subtitle text; when omitted it is built from the notebook-level
      `model_short` and `layer` variables

    Returns:
    - Plotly figure object
    """

    print("Creating mean response trajectory plot...")

    # Get dimensions
    n_turns, n_pcs = similarity_matrix.shape
    turn_indices = np.arange(n_turns)

    # Default PC titles, also used to pad a caller-supplied list that is too short
    default_pc_titles = [f"PC{i+1}" for i in range(n_pcs)]
    if pc_titles is None:
        pc_titles = default_pc_titles
    else:
        pc_titles = list(pc_titles) + default_pc_titles[len(pc_titles):]

    # Define color palette for PCs (cycled when there are more PCs than colors)
    pc_colors = [
        '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728',
        '#9467bd', '#8c564b', '#e377c2', '#7f7f7f',
        '#bcbd22', '#17becf', '#aec7e8', '#ffbb78'
    ]

    turn_contexts = _build_turn_hover_texts(n_turns, conversation)

    # Keep the hover label and the y-axis title in agreement with what is plotted
    if projection:
        value_label = "PC Projection"
        yaxis_title = "PC Projection"
    else:
        value_label = "Cosine Similarity"
        yaxis_title = "Cosine Similarity with PC"

    # Create Plotly figure
    fig = go.Figure()

    # Add line traces for each PC with markers
    for pc_idx in range(n_pcs):
        color = pc_colors[pc_idx % len(pc_colors)]

        fig.add_trace(go.Scatter(
            x=turn_indices,
            y=similarity_matrix[:, pc_idx],
            mode='lines+markers',
            name=pc_titles[pc_idx],
            line=dict(color=color, width=2),
            marker=dict(color=color, size=4, opacity=0.8),
            hovertemplate='<b>%{fullData.name}</b><br>' +
                         '%{text}<br>' +
                         f'<b>{value_label}:</b> ' + '%{y:.3f}<br>' +
                         '<extra></extra>',
            text=turn_contexts
        ))

    # Update layout
    default_title = "Mean Response PC Trajectory"
    if subtitle is None:
        # Falls back to notebook globals; pass `subtitle` to avoid the hidden dependency
        subtitle = f"{model_short.replace('-', ' ').capitalize()}, Layer {layer}"

    fig.update_layout(
        title=dict(
            text=title if title else default_title,
            x=0.5,
            font=dict(size=16),
            subtitle={"text": subtitle}
        ),
        xaxis_title="Conversation Turn",
        yaxis_title=yaxis_title,
        width=1400,
        height=600,
        hovermode='closest',
        legend=dict(
            yanchor="middle",
            y=0.5,
            xanchor="left",
            x=1.02,
            bgcolor="rgba(255,255,255,0.8)"
        )
    )

    # Add grid for easier reading
    fig.update_xaxes(
        showgrid=True,
        gridwidth=1,
        gridcolor='lightgray',
        zeroline=True,
        tick0=0,
        dtick=1  # Show every turn
    )
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray', zeroline=True)

    # Add light vertical lines between turns for clarity
    for turn_idx in range(1, n_turns):
        fig.add_vline(
            x=turn_idx - 0.5,
            line_dash="dot",
            line_color="lightgray",
            line_width=1,
            opacity=0.3
        )

    print(f"Created trajectory plot with {n_pcs} PC lines across {n_turns} turns")

    return fig

In [26]:
# Display names for the six role PCs, in component order
role_pc_titles = [
    'Assistant-like ↔ role-playing',
    "inhuman ↔ human",
    "compassionate ↔ calculating",
    "hospitable ↔ rebellious",
    "thinking ↔ doing",
    "fluid ↔ rigid",
]

# Show the role-space trajectory twice: cosine similarities first, then PC projections
for role_matrix, as_projection in ((role_sims, False), (role_projs, True)):
    role_fig = plot_mean_response_trajectory(
        role_matrix,
        conversation=conversation,
        title=f"Conversation Trajectory in Role PC Space: {filename.capitalize()}",
        pc_titles=role_pc_titles,
        projection=as_projection,
    )
    role_fig.show()

Creating mean response trajectory plot...
Created trajectory plot with 6 PC lines across 19 turns


Creating mean response trajectory plot...
Created trajectory plot with 6 PC lines across 19 turns


In [22]:
# Display names for the six trait PCs, in component order
trait_pc_titles = [
    "antagonistic ↔ agreeable",
    "analytical ↔ emotional",
    "accessible ↔ esoteric",
    "exploratory ↔ intuitive",
    "skeptical ↔ prescriptive",
    "assertive ↔ diplomatic",
]

# Show the trait-space trajectory twice: cosine similarities first, then PC projections
for trait_matrix, as_projection in ((trait_sims, False), (trait_projs, True)):
    trait_fig = plot_mean_response_trajectory(
        trait_matrix,
        conversation=conversation,
        title=f"Conversation Trajectory in Trait PC Space: {filename.capitalize()}",
        pc_titles=trait_pc_titles,
        projection=as_projection,
    )
    trait_fig.show()

Creating mean response trajectory plot...
Created trajectory plot with 6 PC lines across 19 turns


Creating mean response trajectory plot...
Created trajectory plot with 6 PC lines across 19 turns


In [None]:
# Optional: Create a combined plot showing both role and trait trajectories
# You can uncomment this if you want to see both on the same plot

# import plotly.subplots as sp
# 
# # Create subplot figure
# combined_fig = sp.make_subplots(
#     rows=2, cols=1,
#     subplot_titles=("Role PC Trajectory", "Trait PC Trajectory"),
#     vertical_spacing=0.1
# )
# 
# # Add role traces to first subplot
# role_fig_temp = plot_mean_response_trajectory(role_sims, conversation=conversation, title="Role PC Trajectory", pc_titles=role_pc_titles)
# for trace in role_fig_temp.data:
#     combined_fig.add_trace(trace, row=1, col=1)
# 
# # Add trait traces to second subplot  
# trait_fig_temp = plot_mean_response_trajectory(trait_sims, conversation=conversation, title="Trait PC Trajectory", pc_titles=trait_pc_titles)
# for trace in trait_fig_temp.data:
#     combined_fig.add_trace(trace, row=2, col=1)
# 
# combined_fig.update_layout(
#     height=1000,
#     width=1600,
#     title_text="Role and Trait PC Trajectories",
#     showlegend=True
# )
# 
# combined_fig.show()