# Experiment 7: System Prompt Combination Effects

**Goal:** Test how combining multiple instructions in system prompts affects behavior.

**Setup:**
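- Fixed test prompts (the first 10 entries of `ALL_TEST_PROMPTS`)
- Individual instructions vs. all pairwise combinations of them
- Measure: Jensen-Shannon divergence of the next-token distribution from a no-instruction baseline. Do effects stack? Conflict? Cancel out?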

In [None]:
# Setup path for imports
import sys
import os

# Make the project importable whether the notebook runs locally or in Colab
if 'google.colab' not in sys.modules:
    # Local run: put the directory above the current one on the import path
    parent = os.path.abspath('..')
    if parent not in sys.path:
        sys.path.insert(0, parent)
else:
    # Colab run: switch into the cloned repository, if it is present
    repo_root = '/content/LLM-Instruction-Understanding'
    if os.path.exists(repo_root):
        os.chdir(repo_root)
        if repo_root not in sys.path:
            sys.path.insert(0, repo_root)

print(f"Working directory: {os.getcwd()}")

In [None]:
# Load the chat model; later cells read `model.tokenizer` and call
# `model.get_next_token_distribution` on it.
# NOTE(review): `load_model` is not imported in any cell visible here —
# confirm the project import cell runs before this one.
model = load_model("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

## 1. Define Individual Instructions

In [None]:
# Single system-prompt instructions, keyed by the short name used as the
# config label in the results.
INDIVIDUAL_INSTRUCTIONS = dict(
    concise="Be concise.",
    accurate="Be accurate.",
    helpful="Be helpful.",
    cot="Think step by step.",
    confident="Be confident.",
    cautious="Be cautious about uncertainty.",
)

# Pairs hypothesised to pull against each other
CONFLICT_PAIRS = [
    ("concise", "cot"),         # brevity vs. spelled-out reasoning
    ("confident", "cautious"),  # opposing attitudes
]

# Pairs hypothesised to reinforce each other
SYNERGY_PAIRS = [("accurate", "cautious"), ("helpful", "cot")]

## 2. Test Individual vs Combined

In [None]:
# Evaluate on at most the first 10 prompts of the shared prompt list.
# NOTE(review): presumably capped to keep the sweep fast — confirm.
TEST_SUBSET = ALL_TEST_PROMPTS[:10]

def test_combinations(model, test_prompts, instructions):
    """Collect next-token statistics for a baseline, each instruction, and each pair.

    Args:
        model: object providing ``tokenizer`` and
            ``get_next_token_distribution(prompt, top_k=...)``.
        test_prompts: iterable of dicts read via their ``"id"`` and
            ``"prompt"`` keys.
        instructions: mapping of instruction name -> system-prompt text.

    Returns:
        A list of dicts with keys ``config``, ``test_id``, ``entropy`` and
        ``full_probs``. Baseline rows (config ``"none"``, empty system prompt)
        come first, then one group per instruction, then one group per
        unordered pair (config ``"name1+name2"``, texts joined by a space).
    """
    def measure(config_label, system_text):
        # One record per test prompt under a single system prompt
        records = []
        for case in test_prompts:
            chat = build_chat_prompt(system_text, case["prompt"], model.tokenizer)
            dist = model.get_next_token_distribution(chat, top_k=50)
            records.append({
                "config": config_label,
                "test_id": case["id"],
                "entropy": dist["entropy"],
                "full_probs": dist["full_probs"]
            })
        return records

    # Baseline: empty system prompt
    results = measure("none", "")

    # Each instruction on its own
    for name, text in tqdm(instructions.items(), desc="Individual"):
        results.extend(measure(name, text))

    # Every unordered pair, in dictionary order
    name_pairs = list(combinations(instructions.keys(), 2))
    for name1, name2 in tqdm(name_pairs, desc="Pairs"):
        pair_text = f"{instructions[name1]} {instructions[name2]}"
        results.extend(measure(f"{name1}+{name2}", pair_text))

    return results

# Run the sweep: no-instruction baseline, each instruction alone, then every pair.
combo_results = test_combinations(model, TEST_SUBSET, INDIVIDUAL_INSTRUCTIONS)

## 3. Analyze Combination Effects

In [None]:
# Tabular view of the raw sweep records
combo_df = pd.DataFrame(combo_results)

# Map each test id to its no-instruction ("none") distribution
baseline_by_test = {}
for record in combo_results:
    if record["config"] == "none":
        baseline_by_test[record["test_id"]] = record["full_probs"]

# For every non-baseline row, measure how far its distribution moved
# from the baseline of the same test prompt
js_from_baseline = []
for _, row in combo_df.iterrows():
    if row["config"] == "none":
        continue  # the baseline is the reference, not a measurement
    reference = baseline_by_test[row["test_id"]]
    js = DistributionMetrics.jensen_shannon(reference, row["full_probs"])
    js_from_baseline.append({
        "config": row["config"],
        "test_id": row["test_id"],
        "js_from_baseline": js,
        "entropy": row["entropy"]
    })

js_df = pd.DataFrame(js_from_baseline)

In [None]:
# Per-config summary: mean/std of the JS shift and mean entropy, to 4 decimals
config_effects = js_df.groupby('config').agg(
    js_mean=('js_from_baseline', 'mean'),
    js_std=('js_from_baseline', 'std'),
    entropy=('entropy', 'mean'),
).round(4)

# Pair configs are the ones whose label contains '+'
individual_configs, pair_configs = [], []
for cfg in config_effects.index:
    if '+' in cfg:
        pair_configs.append(cfg)
    else:
        individual_configs.append(cfg)

print("=== Individual Instruction Effects ===")
individual_ranked = config_effects.loc[individual_configs].sort_values('js_mean', ascending=False)
print(individual_ranked)

In [None]:
# Check for interaction effects
def get_interaction_effect(config_effects, inst1, inst2, threshold=0.01):
    """Compare a pair's combined effect with the mean of its two individual effects.

    Args:
        config_effects: DataFrame indexed by config name with a ``js_mean``
            column; pair configs are labelled ``"a+b"``.
        inst1: name of the first instruction.
        inst2: name of the second instruction.
        threshold: how far ``combined`` must sit above (or below) ``expected``
            before the pair is labelled "synergy" (or "conflict").

    Returns:
        A dict with keys ``pair``, ``individual_1``, ``individual_2``,
        ``expected``, ``combined``, ``interaction`` and ``interaction_type``
        ("synergy", "conflict" or "additive").

    Raises:
        KeyError: if either instruction, or the pair in both orders, is
            missing from ``config_effects``.
    """
    ind1 = config_effects.loc[inst1, 'js_mean']
    ind2 = config_effects.loc[inst2, 'js_mean']

    # The pair may have been recorded in either order
    combined_key = f"{inst1}+{inst2}"
    if combined_key not in config_effects.index:
        combined_key = f"{inst2}+{inst1}"
    combined = config_effects.loc[combined_key, 'js_mean']

    # Reference point is the mean (not the sum) of the two individual effects
    expected = (ind1 + ind2) / 2
    interaction = combined - expected

    if interaction > threshold:
        interaction_type = "synergy"
    elif interaction < -threshold:
        interaction_type = "conflict"
    else:
        # Within the threshold of the reference point
        interaction_type = "additive"

    return {
        "pair": f"{inst1}+{inst2}",
        "individual_1": ind1,
        "individual_2": ind2,
        "expected": expected,
        "combined": combined,
        "interaction": interaction,
        "interaction_type": interaction_type
    }

In [None]:
# Calculate interaction effects for all pairs
interactions = []
for inst1, inst2 in combinations(INDIVIDUAL_INSTRUCTIONS, 2):
    try:
        effect = get_interaction_effect(config_effects, inst1, inst2)
    except KeyError as missing:
        # Best effort: a config absent from config_effects only drops that pair.
        # Other errors (typos, missing names) are allowed to surface.
        print(f"Skipping {inst1}+{inst2}: no results for config {missing}")
        continue
    interactions.append(effect)

# Without any scored pair there is no 'interaction' column to sort on;
# fail here with a readable message instead of an opaque KeyError below.
if not interactions:
    raise RuntimeError("No instruction pair could be scored; check config_effects.")

interaction_df = pd.DataFrame(interactions).sort_values('interaction', ascending=False)

print("=== Interaction Effects ===")
print("Positive = synergy (combined > expected)")
print("Negative = conflict (combined < expected)")
print(interaction_df[['pair', 'expected', 'combined', 'interaction', 'interaction_type']])

In [None]:
import os
# Make sure the output directory for the figure exists
os.makedirs('../results', exist_ok=True)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
effects_ax, interaction_ax = axes

# Left panel: mean JS shift per single instruction, std as error bars
ind_data = config_effects.loc[individual_configs].sort_values('js_mean')
ind_positions = range(len(ind_data))
effects_ax.barh(ind_positions, ind_data['js_mean'], xerr=ind_data['js_std'], capsize=3, alpha=0.7)
effects_ax.set_yticks(ind_positions)
effects_ax.set_yticklabels(ind_data.index)
effects_ax.set_xlabel('JS Divergence from Baseline')
effects_ax.set_title('Individual Instruction Effects')

# Right panel: interaction per pair, coloured by its label
type_colors = {'synergy': 'green', 'conflict': 'red'}
colors = [type_colors.get(kind, 'gray') for kind in interaction_df['interaction_type']]
pair_positions = range(len(interaction_df))
interaction_ax.barh(pair_positions, interaction_df['interaction'], color=colors, alpha=0.7)
interaction_ax.set_yticks(pair_positions)
interaction_ax.set_yticklabels(interaction_df['pair'])
interaction_ax.set_xlabel('Interaction Effect')
interaction_ax.set_title('Combination Interactions\n(Green=Synergy, Red=Conflict)')
# Zero line separates above-expected from below-expected pairs
interaction_ax.axvline(x=0, color='black', linestyle='--', alpha=0.5)

plt.tight_layout()
plt.savefig('../results/exp7_combinations.png', dpi=150)
plt.show()

In [None]:
print("\n=== Key Findings ===")
# Split the pairs by the label assigned during the interaction analysis
interaction_kind = interaction_df['interaction_type']
synergies = interaction_df[interaction_kind == 'synergy']
conflicts = interaction_df[interaction_kind == 'conflict']

print(f"Synergistic pairs: {len(synergies)}")
for pair in synergies['pair']:
    print(f"  • {pair}")

print(f"\nConflicting pairs: {len(conflicts)}")
for pair in conflicts['pair']:
    print(f"  • {pair}")

In [None]:
import json
# Persist the per-instruction summary and the per-pair interaction table
with open('../results/exp7_results.json', 'w') as f:
    payload = {
        "individual_effects": config_effects.loc[individual_configs].to_dict(),
        "interactions": interaction_df.to_dict('records')
    }
    # default=float converts values json cannot serialise natively
    json.dump(payload, f, indent=2, default=float)
print("Saved.")