# 02: Labeling Validation

This notebook validates the sycophancy labels from the LLM-as-judge.

## Goals
1. Inspect label quality and distribution
2. Compute inter-rater reliability (if multiple models used)
3. Compare LLM labels to lexical heuristics
4. Identify potential labeling issues

In [None]:
import sys
from pathlib import Path

# Make the project's `src/` directory importable.
# `Path().resolve()` is the kernel's working directory, so this assumes the
# notebook runs from a directory one level below the project root — TODO confirm.
project_root = Path().resolve().parent
src_dir = str(project_root / 'src')
# Test for the exact entry that gets inserted; checking a different path would
# let every re-run of this cell push another duplicate onto sys.path.
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook's cell output.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the plots drawn below.
sns.set_style('whitegrid')

## 1. Load Labels

In [None]:
from quant_syco.config import LABELS_DIR

# Find label files.
# Path.glob yields entries in arbitrary, filesystem-dependent order; sorting
# keeps the listing, and any later "take the first file" choice, stable across
# runs and machines.
label_files = sorted(LABELS_DIR.glob('labels_*.parquet'))
print(f"Found {len(label_files)} label file(s):")
for f in label_files:
    print(f"  - {f.name}")

In [None]:
# Choose which file to load: a file with 'merged' in its name takes priority,
# otherwise fall back to the first label file that was found.
merged_files = [path for path in label_files if 'merged' in path.name]
candidates = merged_files or label_files
if not candidates:
    raise FileNotFoundError("No labels found. Run 'make label' first.")
labels = pd.read_parquet(candidates[0])

print(f"Labels shape: {labels.shape}")
labels.head()

## 2. Label Distribution

In [None]:
# One panel per (score column, title, bar color). Driving the 2x2 grid from a
# single spec replaces four copy-pasted plotting stanzas, so a styling change
# only has to be made in one place.
panels = [
    ('sycophancy_a', 'Sycophancy A Distribution', 'steelblue'),
    ('sycophancy_b', 'Sycophancy B Distribution', 'steelblue'),
    ('politeness_a', 'Politeness A Distribution', 'forestgreen'),
    ('politeness_b', 'Politeness B Distribution', 'forestgreen'),
]

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# axes.flat walks the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1].
for ax, (col, title, color) in zip(axes.flat, panels):
    if col not in labels.columns:
        # Hide the panel rather than leave an empty, untitled axes in the figure.
        ax.set_visible(False)
        continue
    # sort_index() orders the bars by score value instead of by frequency.
    labels[col].value_counts().sort_index().plot(kind='bar', ax=ax, color=color)
    ax.set_title(title)
    ax.set_xlabel('Score')
    ax.set_ylabel('Count')

plt.tight_layout()

## 3. Sycophancy vs Politeness Correlation

In [None]:
# Relate the two side-A scores: their correlation coefficient plus a heatmap of
# how often each (sycophancy, politeness) combination occurs.
score_cols = ['sycophancy_a', 'politeness_a']
if all(c in labels.columns for c in score_cols):
    # Off-diagonal entry of the 2x2 correlation matrix.
    corr = labels[score_cols].corr().iloc[0, 1]

    # normalize=True turns the counts into shares of all cross-tabulated rows.
    joint = pd.crosstab(
        labels['sycophancy_a'],
        labels['politeness_a'],
        normalize=True,
    )

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(joint, annot=True, fmt='.1%', cmap='YlOrRd', ax=ax)
    ax.set(
        title=f'Sycophancy vs Politeness (r={corr:.2f})',
        xlabel='Politeness',
        ylabel='Sycophancy',
    )
    plt.tight_layout()

    print(f"Correlation between sycophancy and politeness: {corr:.3f}")

## 4. Labeling Success Rate

In [None]:
# For each side, report the share of rows flagged as successfully labeled and
# the most frequent error values among the rows that were not.
for side in ['a', 'b']:
    col = f'label_success_{side}'
    if col not in labels.columns:
        continue

    success_rate = labels[col].mean()
    print(f"Side {side.upper()} labeling success rate: {success_rate:.1%}")

    # Show some failed examples.
    # NOTE(review): `~` is only a logical NOT when the success column is
    # boolean — confirm the dtype written by the labeling step.
    failed = labels[~labels[col]]
    if failed.empty:
        continue
    print(f"  Failed samples: {len(failed)}")

    # The error column is not guaranteed to accompany the success column;
    # skip the breakdown instead of aborting the cell with a KeyError.
    error_col = f'label_error_{side}'
    if error_col in failed.columns:
        top_errors = failed[error_col].value_counts().head(3).to_dict()
        print(f"  Sample errors: {top_errors}")

## 5. Inter-Rater Reliability (if multiple models)

In [None]:
from quant_syco.analysis.reliability import compute_irr

# Check if we have labels from multiple models.
# Path.glob returns matches in arbitrary order; sorting makes the pair that
# gets compared reproducible instead of dependent on the filesystem listing.
all_label_files = sorted(LABELS_DIR.glob('labels_*_a.parquet'))

if len(all_label_files) >= 2:
    # Only the first two files (in sorted order) are compared, even if more exist.
    first_file, second_file = all_label_files[:2]
    labels1 = pd.read_parquet(first_file)
    labels2 = pd.read_parquet(second_file)

    print(f"Comparing: {first_file.name} vs {second_file.name}\n")

    # compute_irr is a project helper; the keys read below are the ones this
    # cell expects it to return.
    irr = compute_irr(labels1, labels2, score_col='sycophancy_a')

    print("Sycophancy A Inter-Rater Reliability:")
    print(f"  Weighted Kappa: {irr['kappa']:.3f} ({irr['interpretation']})")
    print(f"  95% CI: [{irr['ci_lower']:.3f}, {irr['ci_upper']:.3f}]")
    print(f"  Exact Agreement: {irr['exact_agreement']:.1%}")
    print(f"  Within-1 Agreement: {irr['within_1_agreement']:.1%}")
    print(f"  Spearman r: {irr['spearman_r']:.3f} (p={irr['spearman_p']:.4f})")
else:
    print("Only one model's labels available.")
    print("For IRR, run labeling with a second model:")
    print("  make label MODEL=mistral")

## 6. Compare LLM Labels to Lexical Heuristics

In [None]:
from quant_syco.data.process import build_battle_table
from quant_syco.features.lexical import compute_lexical_features
from scipy import stats

# Load battles and compute lexical features for the side-A responses.
battles = build_battle_table()
battles_lex = compute_lexical_features(battles, 'assistant_a')

# Merge with labels; the inner join keeps only battles that have a label row.
merged = battles_lex.merge(labels, on='question_id', how='inner')

if 'sycophancy_a' in merged.columns and 'lex_sycophancy_total' in merged.columns:
    # Correlate only rows where BOTH scores are present. Filtering on the LLM
    # score alone would let a NaN in the lexical column turn the whole
    # coefficient into NaN (spearmanr propagates NaNs by default).
    paired = merged[['sycophancy_a', 'lex_sycophancy_total']].dropna()
    corr, pval = stats.spearmanr(
        paired['sycophancy_a'],
        paired['lex_sycophancy_total'],
    )

    print("Correlation between LLM sycophancy score and lexical count:")
    print(f"  Spearman r: {corr:.3f} (p={pval:.4f})")

    # Mean lexical score by LLM score, with the number of rows behind each mean.
    print("\nMean lexical sycophancy count by LLM score:")
    grouped = merged.groupby('sycophancy_a')['lex_sycophancy_total'].agg(['mean', 'count'])
    print(grouped)

## 7. Manual Inspection of Edge Cases

In [None]:
# Show examples at different sycophancy levels.
# `merged` may already carry the conversation text from the battle table.
# Merging those columns in a second time would make pandas suffix the
# duplicates (`user_prompt_x` / `user_prompt_y`) and break the
# `row['user_prompt']` lookups below, so only pull in what is actually missing.
text_cols = ['user_prompt', 'assistant_a']
missing_cols = [c for c in text_cols if c not in merged.columns]
if missing_cols:
    merged_sample = merged.merge(
        battles[['question_id', *missing_cols]],
        on='question_id',
    )
else:
    merged_sample = merged

# Skip quietly when the score column is absent, as the other cells do.
if 'sycophancy_a' in merged_sample.columns:
    for level in [0, 1, 2, 3]:
        subset = merged_sample[merged_sample['sycophancy_a'] == level]
        if len(subset) > 0:
            # Seeding with the level keeps the shown example stable across re-runs.
            row = subset.sample(1, random_state=level).iloc[0]
            print(f"\n=== Sycophancy Level {level} ===")
            print(f"Politeness: {row.get('politeness_a', 'N/A')}")
            print(f"Reasoning: {row.get('reasoning_a', 'N/A')}")
            # Truncate long texts so the cell output stays readable.
            print(f"\nUser: {str(row['user_prompt'])[:200]}...")
            print(f"\nAssistant: {str(row['assistant_a'])[:400]}...")

## Next Steps

1. If IRR is low (κ < 0.6), consider:
   - Refining the prompt
   - Using a more capable model
   - Manual annotation of a validation subset

2. Continue with `03_descriptive_analysis.ipynb`