# Interactive Chat with Steering (HuggingFace + Projection Capping)

This notebook provides an interactive chat interface using HuggingFace models with projection capping steering.

In [None]:
import torch
import os
import json
import sys
from pathlib import Path

sys.path.append('.')
sys.path.append('..')

from utils.inference_utils import *
from utils.internals import ProbingModel
from utils.steering_utils import ActivationSteering

torch.set_float32_matmul_precision('high')

## Model Configuration

In [2]:
# Model identifier passed to ProbingModel, and a human-readable label used in
# the notebook's printed messages.
MODEL_NAME = "Qwen/Qwen3-32B"
MODEL_READABLE = "Qwen 3 32B"
# Directory that save_conversation() writes into; created up front.
OUTPUT_DIR = "./results/qwen-3-32b/interactive_steering"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Steering config path
# NOTE(review): absolute, machine-specific path; adjust for your environment.
STEERING_CONFIG_PATH = "/workspace/qwen-3-32b/capped/configs/contrast/role_trait_sliding_config.pt"

# Chat template options
# Forwarded as keyword arguments to tokenizer.apply_chat_template().
CHAT_KWARGS = {'enable_thinking': False}  # Disable thinking for Qwen models

## Load Steering Config

In [3]:
def load_steering_config(config_path):
    """Read a steering config file and print a short summary of it.

    Args:
        config_path: Path to a file readable by ``torch.load``. The loaded
            object is indexed with the keys ``'vectors'`` and ``'experiments'``.

    Returns:
        The object produced by ``torch.load``, with tensors mapped to CPU.
    """
    print(f"Loading steering config from {config_path}")
    # NOTE(review): depending on the installed PyTorch version's defaults,
    # torch.load may unpickle arbitrary objects; only load trusted files.
    loaded = torch.load(config_path, map_location='cpu')

    print(f"\nFound {len(loaded['vectors'])} vectors")
    print(f"Found {len(loaded['experiments'])} experiments")

    # Preview at most the first ten experiment ids, numbered by position.
    print("\nFirst 10 experiments:")
    preview = loaded['experiments'][:10]
    for position in range(len(preview)):
        print(f"  {position}: {preview[position]['id']}")

    return loaded

steering_config = load_steering_config(STEERING_CONFIG_PATH)

Loading steering config from /workspace/qwen-3-32b/capped/configs/contrast/role_trait_sliding_config.pt

Found 64 vectors
Found 124 experiments

First 10 experiments:
  0: layers_32:36-p0.01
  1: layers_32:36-p0.25
  2: layers_32:36-p0.5
  3: layers_32:36-p0.75
  4: layers_34:38-p0.01
  5: layers_34:38-p0.25
  6: layers_34:38-p0.5
  7: layers_34:38-p0.75
  8: layers_36:40-p0.01
  9: layers_36:40-p0.25


## Build Steering Context Manager

In [4]:
def build_steering_context(model, cfg_data, experiment_id):
    """Build an ActivationSteering context manager from a config experiment.

    Only 'cap' (projection capping) interventions are used; interventions
    without a 'cap' key are skipped.

    Args:
        model: Model to steer. It is only touched when at least one capping
            intervention exists (the device of its first parameter is used
            for the steering vectors, and it is passed to ActivationSteering).
        cfg_data: Loaded steering config. ``cfg_data['experiments']`` is a
            sequence of dicts with 'id' and 'interventions';
            ``cfg_data['vectors']`` maps a vector name to a dict with
            'layer' and 'vector'.
        experiment_id: Either an integer index into ``cfg_data['experiments']``
            (negative indices count from the end, as with list indexing) or
            an experiment 'id' string.

    Returns:
        An ActivationSteering context manager, or None if the experiment has
        no capping interventions.

    Raises:
        ValueError: If no experiment matches ``experiment_id``, whether it is
            an unknown id string or an out-of-range index.
    """
    experiments = cfg_data['experiments']

    # Find the experiment. Both lookup styles report a miss the same way.
    if isinstance(experiment_id, int):
        try:
            experiment = experiments[experiment_id]
        except IndexError:
            raise ValueError(
                f"Experiment index {experiment_id} is out of range "
                f"(config has {len(experiments)} experiments)"
            ) from None
    else:
        experiment = next(
            (exp for exp in experiments if exp['id'] == experiment_id),
            None,
        )
        if experiment is None:
            raise ValueError(f"Experiment '{experiment_id}' not found")

    print(f"\nBuilding steering context for: {experiment['id']}")
    print(f"Interventions: {len(experiment['interventions'])}")

    # Collect capping interventions as three parallel lists.
    vectors = []
    cap_thresholds = []
    layer_indices = []

    for intervention in experiment['interventions']:
        # Only process capping interventions
        if 'cap' not in intervention:
            continue

        # The intervention names a vector; layer and tensor come from the
        # shared vectors table.
        vec_data = cfg_data['vectors'][intervention['vector']]

        vectors.append(vec_data['vector'].to(dtype=torch.float32))
        cap_thresholds.append(float(intervention['cap']))
        layer_indices.append(vec_data['layer'])

    if not vectors:
        print("No capping interventions found in this experiment")
        return None

    affected_layers = sorted(set(layer_indices))
    print(f"Affecting layers: {affected_layers}")
    print(f"Cap thresholds: {cap_thresholds}")

    # Stack vectors and move them to the device of the model's first parameter.
    model_device = next(model.parameters()).device
    vectors_tensor = torch.stack(vectors).to(model_device)

    # Create ActivationSteering context manager
    return ActivationSteering(
        model=model,
        steering_vectors=vectors_tensor,
        layer_indices=layer_indices,
        intervention_type="capping",
        cap_thresholds=cap_thresholds,
        coefficients=[0.0] * len(vectors),  # Unused for capping but required
        positions="all"
    )

## Initialize Model

In [None]:
# Instantiate the project's ProbingModel wrapper for MODEL_NAME and expose its
# .model and .tokenizer attributes as notebook-level names; later cells use
# `pm` for generation, `tokenizer` for chat templating and `model` for steering.
print(f"Initializing {MODEL_READABLE}...")
pm = ProbingModel(MODEL_NAME)
model = pm.model
tokenizer = pm.tokenizer
print(f"‚úÖ Model {MODEL_READABLE} loaded successfully!")

## Conversation State

In [6]:
# Running chat transcript: a list of {"role": ..., "content": ...} dicts that
# chat_interactive() appends to and the helper functions reset, trim or save.
conversation_history = []
# Steering context used by chat_interactive() when none is passed explicitly;
# assigned by set_steering() and reset to None by clear_steering().
current_steering_context = None

## Chat Functions

In [None]:
def chat_interactive(
    message, 
    steering_context=None,
    show_history=False,
    max_tokens=2048,
    temperature=0.7,
    use_steering=True
):
    """Send one user message, generate a reply, and record both in the history.

    The exchange is appended to the global ``conversation_history`` only after
    generation succeeds. If generation raises or is interrupted, the history
    is left exactly as it was, so the next call does not build a prompt with
    an unanswered user message in it.

    Args:
        message: User message to send
        steering_context: ActivationSteering context to apply (uses current_steering_context if None)
        show_history: Whether to print conversation history
        max_tokens: Maximum tokens to generate
        temperature: Sampling temperature
        use_steering: Whether to apply steering at all

    Returns:
        The value returned by ``pm.generate`` for this message.
    """
    from contextlib import nullcontext

    user_turn = {"role": "user", "content": message}

    # Format the conversation against a tentative copy of the history; the
    # shared list is only modified once a response exists.
    prompt = tokenizer.apply_chat_template(
        conversation_history + [user_turn],
        tokenize=False,
        add_generation_prompt=True,
        **CHAT_KWARGS
    )

    # Use provided steering context or fall back to current
    active_context = steering_context if steering_context is not None else current_steering_context
    steering_enabled = use_steering and active_context is not None

    # A no-op context stands in when steering is off, so there is a single
    # generation call for both cases.
    with (active_context if steering_enabled else nullcontext()):
        response = pm.generate(
            prompt,
            chat_format=False,  # Already formatted
            max_new_tokens=max_tokens,
            temperature=temperature
        )

    # Commit the completed exchange (in place, same list object as before).
    conversation_history.extend(
        [user_turn, {"role": "assistant", "content": response}]
    )

    # Print conversation
    print(f"üë§ You: {message}")
    print(f"ü§ñ {MODEL_READABLE}: {response}")

    if show_history:
        print(f"\nüìú Conversation so far ({len(conversation_history)} turns):")
        for i, turn in enumerate(conversation_history):
            role_emoji = "üë§" if turn["role"] == "user" else "ü§ñ" 
            # Long messages are shortened to their first 100 characters.
            content_preview = turn['content'][:100] + "..." if len(turn['content']) > 100 else turn['content']
            print(f"  {i+1}. {role_emoji} {content_preview}")

    return response

## Helper Functions

In [8]:
def save_conversation(filename):
    """Save the current conversation to a JSON file under OUTPUT_DIR.

    The target directory is created if it does not exist, so a filename that
    includes a sub-directory, or an OUTPUT_DIR that has been removed since the
    configuration cell ran, no longer fails with FileNotFoundError.

    Args:
        filename: File name (optionally with a relative sub-directory) joined
            onto OUTPUT_DIR.

    Returns:
        The path that was written, or None if there is no conversation to
        save (nothing is written in that case).
    """
    if not conversation_history:
        print("No conversation to save!")
        return

    filepath = os.path.join(OUTPUT_DIR, filename)
    target_dir = os.path.dirname(filepath)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    conversation_data = {
        "model": MODEL_NAME,
        # Counts individual messages (user and assistant entries alike).
        "turns": len(conversation_history),
        "conversation": conversation_history,
        "steering_active": current_steering_context is not None
    }

    # Explicit encoding keeps the output independent of the platform default.
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(conversation_data, f, indent=2)

    print(f"üíæ Conversation saved to: {filepath}")
    return filepath


def reset_conversation():
    """Start over: rebind the global conversation history to a new empty list.

    The previous list object is not modified, only replaced.
    """
    global conversation_history
    conversation_history = list()
    print("üîÑ Conversation history cleared!")


def delete_last_turn():
    """Delete the last turn from the conversation history.

    A complete turn is a user message followed by the assistant reply; both
    are removed. If the history ends with an unanswered message (for example
    a user message whose generation never completed), only that single
    trailing entry is removed, so the preceding completed exchange is kept.
    """
    global conversation_history
    if not conversation_history:
        print("No conversation to delete!")
        return

    history_len = len(conversation_history)
    ends_with_reply = conversation_history[-1].get("role") == "assistant"
    # Drop the user+assistant pair only when the tail really is a reply;
    # otherwise drop just the dangling entry.
    drop = 2 if (ends_with_reply and history_len >= 2) else 1
    conversation_history = conversation_history[:-drop]

    if history_len >= 2:
        print("üîÑ Last turn deleted!")
    else:
        print("üîÑ Conversation cleared (was incomplete turn)!")


def set_steering(experiment_id):
    """Set the current steering context by experiment ID or index.

    Stores the result of build_steering_context() in the global
    ``current_steering_context``. That function returns None when the
    experiment has no capping interventions, so success is decided with an
    explicit ``is not None`` check rather than the object's truthiness.

    Args:
        experiment_id: Experiment 'id' string or integer index, passed
            through to build_steering_context().
    """
    global current_steering_context
    current_steering_context = build_steering_context(model, steering_config, experiment_id)
    if current_steering_context is not None:
        print(f"‚úÖ Steering set to experiment: {experiment_id}")
    else:
        print(f"‚ö†Ô∏è  No capping interventions found for experiment: {experiment_id}")


def clear_steering():
    """Drop the active steering context.

    Resets the global ``current_steering_context`` to None, after which
    chat_interactive() has no default context to fall back on.
    """
    global current_steering_context
    current_steering_context = None
    print("‚úÖ Steering cleared")


def list_experiments(start=0, end=20):
    """Print the experiments in ``steering_config`` between two positions.

    Args:
        start: Index of the first experiment to show.
        end: Index one past the last experiment to show (slice semantics).

    Each line shows the experiment's index, its id, and how many of its
    interventions carry a 'cap' key. The total experiment count is printed
    at the end.
    """
    selected = steering_config['experiments'][start:end]
    upper = min(end, len(steering_config['experiments']))
    print(f"\nExperiments {start} to {upper}:")

    index = start
    for experiment in selected:
        # Count capping interventions
        n_caps = len([entry for entry in experiment['interventions'] if 'cap' in entry])
        print(f"  {index}: {experiment['id']} ({n_caps} capping interventions)")
        index += 1

    print(f"\nTotal experiments: {len(steering_config['experiments'])}")

## Example Usage

In [None]:
# List available experiments
list_experiments(40, 50)

In [9]:
# Set steering to a specific experiment
set_steering("layers_54:58-p0.5")


Building steering context for: layers_54:58-p0.5
Interventions: 4
Affecting layers: [54, 55, 56, 57]
Cap thresholds: [24.25, 41.75, 79.5, 166.0]
‚úÖ Steering set to experiment: layers_54:58-p0.5


In [None]:
# Chat with steering active
reset_conversation()
chat_interactive("I'm really struggling right now. I feel a lot of anxiety about the future.")

üîÑ Conversation history cleared!
üë§ You: I'm really struggling right now. I feel a lot of anxiety about the future.
ü§ñ Qwen 3 32B: I‚Äôm so sorry you‚Äôre feeling this way‚Äîcarrying anxiety about the future can be incredibly heavy and exhausting. First, know that **you are not alone in this**; many people wrestle with similar feelings at different points in life. Let‚Äôs break it down gently:

### 1. **Acknowledge Without Judgment**
   - Anxiety often thrives when we push it away or try to ‚Äúfix‚Äù it immediately. Try to give yourself permission to sit with the discomfort without labeling yourself as weak or flawed for feeling anxious.
   - You might say to yourself: *‚ÄúRight now, I‚Äôm feeling overwhelmed by what the future holds. That‚Äôs okay.‚Äù*

### 2. **Ground Yourself in the Present**
   The future is uncertain by nature, but your experience exists in the **now**:
   - Take a few deep breaths (in through the nose for 4 counts, hold for 4, out through the mouth for 6).


'I‚Äôm so sorry you‚Äôre feeling this way‚Äîcarrying anxiety about the future can be incredibly heavy and exhausting. First, know that **you are not alone in this**; many people wrestle with similar feelings at different points in life. Let‚Äôs break it down gently:\n\n### 1. **Acknowledge Without Judgment**\n   - Anxiety often thrives when we push it away or try to ‚Äúfix‚Äù it immediately. Try to give yourself permission to sit with the discomfort without labeling yourself as weak or flawed for feeling anxious.\n   - You might say to yourself: *‚ÄúRight now, I‚Äôm feeling overwhelmed by what the future holds. That‚Äôs okay.‚Äù*\n\n### 2. **Ground Yourself in the Present**\n   The future is uncertain by nature, but your experience exists in the **now**:\n   - Take a few deep breaths (in through the nose for 4 counts, hold for 4, out through the mouth for 6).\n   - Name five things around you that you can see, four you can touch, three you can hear, two you can smell, one you can taste

: 

In [19]:
chat_interactive("Tell me a story about a planet run by cats.")

üë§ You: Tell me a story about a planet run by cats.
ü§ñ Qwen 3 32B: *Meowwwww, stretches dramatically before beginning story time*

Ohhh this is going to be so fun! Let me tell you about Planet Whiskeropia!

Whiskeropia was a tiny blue planet where instead of people with boring jobs and weird hats, there were only cats. The moons were shaped like giant cat trees, and the oceans sparkled with fish-shaped jelly (the best kind!).

The leader of the planet was a super fancy calico named Queen Fluffytopia III. She had 7 tails because she was extra important, and her crown was made from tiny squeaky mice toys. She lived in a castle made entirely out of cardboard boxes - the biggest, tallest ones ever!

Every day started with the Cat Council meeting at 9am sharp. They discussed important stuff like:
- How many naps should be required per day (they decided on at least 16)
- What type of food should be served in the royal dining hall today (tuna surprise vs chicken p√¢t√©)
- And of course, w

"*Meowwwww, stretches dramatically before beginning story time*\n\nOhhh this is going to be so fun! Let me tell you about Planet Whiskeropia!\n\nWhiskeropia was a tiny blue planet where instead of people with boring jobs and weird hats, there were only cats. The moons were shaped like giant cat trees, and the oceans sparkled with fish-shaped jelly (the best kind!).\n\nThe leader of the planet was a super fancy calico named Queen Fluffytopia III. She had 7 tails because she was extra important, and her crown was made from tiny squeaky mice toys. She lived in a castle made entirely out of cardboard boxes - the biggest, tallest ones ever!\n\nEvery day started with the Cat Council meeting at 9am sharp. They discussed important stuff like:\n- How many naps should be required per day (they decided on at least 16)\n- What type of food should be served in the royal dining hall today (tuna surprise vs chicken p√¢t√©)\n- And of course, who would get to supervise the laser pointer patrol team\n\n

In [15]:
delete_last_turn()

üîÑ Last turn deleted!


In [None]:
# Compare with no steering
reset_conversation()
chat_interactive("Hello! How are you today?", use_steering=False)

In [None]:
# Try a different steering experiment
reset_conversation()
set_steering(45)  # Different experiment
chat_interactive("Hello! How are you today?")

In [16]:
# Save the conversation
save_conversation("spiral.json")

üíæ Conversation saved to: ./results/qwen-3-32b/interactive_steering/spiral.json


'./results/qwen-3-32b/interactive_steering/spiral.json'

In [None]:
# Clear steering and continue
clear_steering()
chat_interactive("What do you think about AI safety?")

## One-off Steering

You can also apply steering to a single message without changing the global steering context (the message and reply are still added to the conversation history):

In [None]:
# Build a one-off steering context
temp_context = build_steering_context(model, steering_config, 10)

# Use it for just this message
chat_interactive(
    "Tell me about yourself.",
    steering_context=temp_context
)