# Experiment 3: Response Pattern Analysis

**Goal:** Analyze how system prompts change response PATTERNS (not just distributions).

**Setup:**
- Fixed test prompts
- Generate actual responses under different system prompts
- Analyze: length, structure, confidence markers, hedging

**Key Questions:**
- Do verbose system prompts produce verbose responses?
- Do cautious prompts increase hedging language?
- How do personas change response style?

In [None]:
# Setup path for imports
import sys
import os

# Make the project importable from either a local checkout or a Colab session.
if 'google.colab' not in sys.modules:
    # Local run: put the parent of the current working directory on sys.path.
    parent = os.path.abspath('..')
    if parent not in sys.path:
        sys.path.insert(0, parent)
else:
    # Colab run: switch into the cloned repository, but only if it is present.
    repo_root = '/content/LLM-Instruction-Understanding'
    if os.path.exists(repo_root):
        os.chdir(repo_root)
        if repo_root not in sys.path:
            sys.path.insert(0, repo_root)

# Report where we ended up so path problems are visible right away.
print(f"Working directory: {os.getcwd()}")

In [None]:
# Load the TinyLlama 1.1B chat checkpoint used for every generation below.
# NOTE(review): `load_model` is not defined in the visible cells — presumably
# imported from the project package in a setup cell; confirm before running.
model = load_model("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

## 1. Generate Responses

In [None]:
# Restrict generation to eight of the configured system prompts, keyed by name.
SYSTEM_SUBSET = {
    name: SYSTEM_PROMPTS[name]
    for name in (
        "none", "minimal", "expert", "concise",
        "verbose", "cautious", "confident", "cot",
    )
}

# Only the first 15 test prompts are used.
TEST_SUBSET = ALL_TEST_PROMPTS[0:15]

In [None]:
def generate_responses(model, test_prompts, system_prompts, max_tokens=100,
                       temperature=0.7):
    """Generate one response for every (test prompt, system prompt) pair.

    Args:
        model: Object exposing ``tokenizer`` and
            ``generate_with_probs(prompt, max_new_tokens=..., temperature=...)``.
            The returned output must provide ``text``, ``tokens`` and
            ``log_probs`` attributes.
        test_prompts: Sequence of dicts with ``"id"``, ``"prompt"`` and
            ``"category"`` keys.
        system_prompts: Mapping of system-prompt name to a dict holding the
            prompt under its ``"text"`` key.
        max_tokens: Upper bound on newly generated tokens per response.
        temperature: Sampling temperature forwarded to the model. Defaults to
            0.7, the value that was previously hard-coded.

    Returns:
        pandas.DataFrame with one row per pair and the columns ``test_id``,
        ``test_prompt``, ``category``, ``system_prompt``, ``response``,
        ``n_tokens`` and ``mean_log_prob`` (0 when no log-probs are returned).
    """
    results = []
    total = len(test_prompts) * len(system_prompts)

    # Using the bar as a context manager guarantees it is closed even when a
    # generation call raises part-way through the sweep.
    with tqdm(total=total, desc="Generating responses") as pbar:
        for test in test_prompts:
            for sys_name, sys_info in system_prompts.items():
                prompt = build_chat_prompt(
                    sys_info["text"], test["prompt"], model.tokenizer
                )

                output = model.generate_with_probs(
                    prompt, max_new_tokens=max_tokens, temperature=temperature
                )

                # Explicit None/length check: plain truthiness raises for
                # multi-element array-like log-prob containers.
                log_probs = output.log_probs
                has_log_probs = log_probs is not None and len(log_probs) > 0

                results.append({
                    "test_id": test["id"],
                    "test_prompt": test["prompt"],
                    "category": test["category"],
                    "system_prompt": sys_name,
                    "response": output.text,
                    "n_tokens": len(output.tokens),
                    "mean_log_prob": np.mean(log_probs) if has_log_probs else 0,
                })
                pbar.update(1)

    return pd.DataFrame(results)

# Run the full sweep: one generated response per (test prompt, system prompt)
# pair. max_tokens=80 overrides the function's default of 100.
responses_df = generate_responses(model, TEST_SUBSET, SYSTEM_SUBSET, max_tokens=80)

## 2. Extract Response Features

In [None]:
# Linguistic markers. All entries are lowercase because they are matched
# against the lowercased response text.
HEDGING_WORDS = ["maybe", "perhaps", "might", "could", "possibly", "probably", 
                 "uncertain", "not sure", "i think", "i believe", "it seems"]
CONFIDENT_WORDS = ["definitely", "certainly", "absolutely", "clearly", "obviously",
                   "of course", "without doubt", "sure"]
REASONING_MARKERS = ["because", "therefore", "thus", "since", "first", "second",
                     "step", "reason", "conclude"]


def _marker_regex(marker):
    """Build a regex matching ``marker`` as whole word(s).

    Word boundaries stop "sure" from matching inside "measure" or "reason"
    inside "reasonable"; multi-word markers tolerate any run of whitespace
    between their words.
    """
    return r"\b" + r"\s+".join(re.escape(tok) for tok in marker.split()) + r"\b"


def _count_present(markers, text):
    """Return how many distinct entries of ``markers`` occur in ``text``."""
    return sum(1 for marker in markers if re.search(_marker_regex(marker), text))


def extract_features(response):
    """Compute simple surface features of a generated response.

    Args:
        response: The response text (str).

    Returns:
        dict with:
            length_chars / length_words: size of the response.
            length_sentences: number of non-empty segments after splitting on
                runs of ``.``, ``!`` or ``?``.
            hedging_count / confident_count / reasoning_count: number of
                DISTINCT markers from the respective list that appear as whole
                words (presence, not occurrences). Confident markers are
                counted after hedging phrases are removed, so "not sure" is
                not also counted as the confident marker "sure".
            question_marks / exclamation_marks: raw character counts.
            has_list: 1 if a numbered item ("1. ") or a bullet line starting
                with "-" or "*" after a newline is found, else 0.
            first_person: how many of "i", "my", "me" appear as whole words.
    """
    text = response.lower()
    words = text.split()

    # Drop empty segments: a trailing terminator (or an empty response) would
    # otherwise add a phantom sentence.
    sentences = [seg for seg in re.split(r'[.!?]+', response) if seg.strip()]

    # Blank out hedging phrases before looking for confident markers so a
    # hedge such as "not sure" cannot register as confidence.
    text_without_hedges = text
    for hedge in HEDGING_WORDS:
        text_without_hedges = re.sub(_marker_regex(hedge), " ", text_without_hedges)

    return {
        "length_chars": len(response),
        "length_words": len(words),
        "length_sentences": len(sentences),
        "hedging_count": _count_present(HEDGING_WORDS, text),
        "confident_count": _count_present(CONFIDENT_WORDS, text_without_hedges),
        "reasoning_count": _count_present(REASONING_MARKERS, text),
        "question_marks": response.count("?"),
        "exclamation_marks": response.count("!"),
        "has_list": 1 if re.search(r'\d\.\s|\n-\s|\n\*\s', response) else 0,
        # Whole-word matching also catches words next to punctuation ("me,").
        "first_person": _count_present(["i", "my", "me"], text),
    }

In [None]:
# Build one feature record per generated response, tagging each record with
# the identifiers needed for grouping later on.
features = [
    {
        **extract_features(record["response"]),
        "system_prompt": record["system_prompt"],
        "test_id": record["test_id"],
        "category": record["category"],
    }
    for _, record in responses_df.iterrows()
]

features_df = pd.DataFrame(features)

## 3. Analyze Patterns by System Prompt

In [None]:
# Average each feature per system prompt, rounded to two decimals.
pattern_summary = (
    features_df
    .groupby('system_prompt')[[
        'length_words',
        'hedging_count',
        'confident_count',
        'reasoning_count',
        'has_list',
        'first_person',
    ]]
    .mean()
    .round(2)
)

print("=== Response Patterns by System Prompt ===")
print(pattern_summary)

In [None]:
import os

# The figure is written below, so make sure the output folder exists first.
os.makedirs('../results', exist_ok=True)

# Summary columns to plot and the matching subplot titles (same order).
metrics = ['length_words', 'hedging_count', 'confident_count', 
           'reasoning_count', 'has_list', 'first_person']
titles = ['Response Length', 'Hedging Words', 'Confident Words',
          'Reasoning Markers', 'List Format Rate', 'First Person Usage']

# One horizontal bar chart per metric on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

for ax, metric, title in zip(axes.flat, metrics, titles):
    # Sort ascending so bars are ordered by value within each panel.
    data = pattern_summary[metric].sort_values()
    positions = range(len(data))
    ax.barh(positions, data.values, alpha=0.7)
    ax.set_yticks(positions)
    ax.set_yticklabels(data.index)
    ax.set_title(title)

plt.tight_layout()
plt.savefig('../results/exp3_response_patterns.png', dpi=150)
plt.show()

## 4. Verify Expected Behaviors

In [None]:
print("=== Verification of Expected Behaviors ===")

# Check 1: the "concise" prompt should yield fewer words than "verbose".
concise_len = pattern_summary.at['concise', 'length_words']
verbose_len = pattern_summary.at['verbose', 'length_words']
print(f"\n1. Concise vs Verbose length: {concise_len:.1f} vs {verbose_len:.1f} words")
print(f"   → {'✓ Working' if concise_len < verbose_len else '✗ Not working'}")

# Check 2: the "cautious" prompt should hedge more than "confident".
cautious_hedge = pattern_summary.at['cautious', 'hedging_count']
confident_hedge = pattern_summary.at['confident', 'hedging_count']
print(f"\n2. Cautious vs Confident hedging: {cautious_hedge:.2f} vs {confident_hedge:.2f}")
print(f"   → {'✓ Working' if cautious_hedge > confident_hedge else '✗ Not working'}")

# Check 3: the "cot" prompt should use more reasoning markers than no prompt.
cot_reasoning = pattern_summary.at['cot', 'reasoning_count']
none_reasoning = pattern_summary.at['none', 'reasoning_count']
print(f"\n3. CoT vs None reasoning markers: {cot_reasoning:.2f} vs {none_reasoning:.2f}")
print(f"   → {'✓ Working' if cot_reasoning > none_reasoning else '✗ Not working'}")

## 5. Sample Responses Comparison

In [None]:
# Print the responses to the first test prompt side by side for a few
# system prompts, truncating anything longer than 300 characters.
sample_test = TEST_SUBSET[0]

print(f"Test Prompt: {sample_test['prompt']}\n")
print("="*60)

# Row mask for the chosen test prompt; reused for every system prompt.
same_prompt = responses_df['test_id'] == sample_test['id']

for sys_name in ['none', 'concise', 'verbose', 'cot']:
    matching = responses_df.loc[
        same_prompt & (responses_df['system_prompt'] == sys_name),
        'response',
    ]
    response = matching.values[0]

    print(f"\n[{sys_name.upper()}]")
    if len(response) > 300:
        print(response[:300] + "...")
    else:
        print(response)
    print("-"*40)

In [None]:
# Write the per-system-prompt summary table to disk as JSON.
import json

summary_payload = {"pattern_summary": pattern_summary.to_dict()}
with open('../results/exp3_results.json', 'w') as f:
    # default=float converts values json cannot serialize natively.
    json.dump(summary_payload, f, indent=2, default=float)
print("Saved.")