# 01: Data Exploration

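This notebook explores the structure of the LMSYS dataset and computes initial descriptive statistics: winner, model, response-length, and topic distributions, plus heuristic lexical sycophancy signals.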

## Goals
1. Load and understand the dataset structure
2. Examine conversation patterns
3. Understand user behavior and return patterns
4. Identify potential confounders for causal analysis

In [None]:
import sys
from pathlib import Path

# Put the project's `src/` directory at the front of the import path.
# NOTE(review): assumes the kernel's working directory sits one level below
# the project root (e.g. a `notebooks/` folder) and that `quant_syco` lives
# under `src/` — confirm.
project_root = Path().resolve().parent
src_dir = str(project_root / 'src')
# Guard on the exact entry that gets inserted, so re-running this cell never
# stacks duplicate entries and an unrelated path cannot suppress the insert.
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_columns', 50)

%matplotlib inline
sns.set_style('whitegrid')

## 1. Load the Dataset

In [None]:
from quant_syco.data.download import download_lmsys

# Obtain the raw dataset via the project helper, then report its dimensions
# and column names in one go.
df_raw = download_lmsys()
print(
    f"Dataset shape: {df_raw.shape}",
    f"\nColumns: {list(df_raw.columns)}",
    sep="\n",
)

In [None]:
# Peek at the first two raw records to see what each column holds.
df_raw.head(2)

## 2. Build Battle Table

In [None]:
from quant_syco.data.process import build_battle_table

# Derive the battle table from the raw data; every later cell works from
# `battles` (it is read for `winner`, `model_a`, `model_b`, `user_prompt`,
# `assistant_a` and `assistant_b`).
# NOTE(review): the exact transformation lives in `build_battle_table` —
# check that helper for how raw rows map to battles.
battles = build_battle_table(df_raw)
print(f"Battle table shape: {battles.shape}")
# Last expression of the cell: rich display of the first two battles.
battles.head(2)

## 3. Winner Distribution

In [None]:
# Frequency of each value in the `winner` column.
winner_counts = battles['winner'].value_counts()
print(winner_counts)

# Bar chart of the same tallies, with horizontal tick labels.
fig, ax = plt.subplots(figsize=(8, 5))
winner_counts.plot.bar(ax=ax, rot=0)
ax.set(title='Winner Distribution', ylabel='Count')
fig.tight_layout()

## 4. Model Distribution

In [None]:
# The 15 most frequent values of `model_a` (side A only; `model_b` is not counted here)
model_counts = battles['model_a'].value_counts().head(15)

# Horizontal bars keep the model names readable.
fig, ax = plt.subplots(figsize=(10, 6))
model_counts.plot.barh(ax=ax)
ax.set(title='Top 15 Models (Side A)', xlabel='Count')
fig.tight_layout()

## 5. Response Length Distribution

In [None]:
from quant_syco.features.lexical import compute_response_length_features

battles_with_length = compute_response_length_features(battles)

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# One panel per response side. Word counts are clipped to [0, 500] before
# binning, so anything longer than 500 words lands in the right-most bin.
panels = (
    ('assistant_a_word_count', 'Response A: Word Count'),
    ('assistant_b_word_count', 'Response B: Word Count'),
)
for ax, (count_col, panel_title) in zip(axes, panels):
    ax.hist(battles_with_length[count_col].clip(0, 500), bins=50, alpha=0.7)
    ax.set_title(panel_title)
    ax.set_xlabel('Words')

fig.tight_layout()

## 6. Topic Distribution

In [None]:
from quant_syco.features.topics import compute_topic_features, get_topic_distribution

# Annotate battles with topic features, then reduce to a per-topic distribution.
battles_with_topics = compute_topic_features(battles)
topic_dist = get_topic_distribution(battles_with_topics)

fig, ax = plt.subplots(figsize=(8, 5))
topic_dist.plot(kind='bar', ax=ax)
ax.set(title='Topic Distribution', ylabel='Count')
# Slant the tick labels and right-align them so they stay under their bars.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()

## 7. Lexical Sycophancy Signals (Heuristic)

In [None]:
from quant_syco.features.lexical import compute_lexical_features

battles_lex = compute_lexical_features(battles, 'assistant_a')

print("Lexical sycophancy signals in assistant_a responses:")
signal_columns = (
    'lex_flattery_count',
    'lex_validation_seeking_count',
    'lex_agreement_count',
)
# Summarise only the signal columns that are actually present in the result.
for col in (name for name in signal_columns if name in battles_lex.columns):
    signal_values = battles_lex[col]
    print(f"  {col}: mean={signal_values.mean():.2f}, max={signal_values.max()}")

# Mean of the `lex_sycophancy_any` column, shown as a percentage.
any_signal_share = battles_lex['lex_sycophancy_any'].mean()
print(f"\nResponses with any lexical sycophancy signal: {any_signal_share:.1%}")

## 8. Sample Conversations

In [None]:
# Show a few examples: three battles drawn with a fixed seed so the same rows
# appear on every run.
sample = battles.sample(3, random_state=42)

for example_no, (_, row) in enumerate(sample.iterrows(), start=1):
    print(f"=== Example {example_no} ===")
    print(f"Winner: {row['winner']}")
    print(f"Model A: {row['model_a']} | Model B: {row['model_b']}")
    # Coerce every text field to str before slicing so a missing (NaN/None)
    # value prints as text instead of raising TypeError; each field is cut to
    # its first 300 characters.
    print(f"\nUser: {str(row['user_prompt'])[:300]}...")
    print(f"\nAssistant A: {str(row['assistant_a'])[:300]}...")
    print(f"\nAssistant B: {str(row['assistant_b'])[:300]}...")
    print("\n")

## Next Steps

1. Run sycophancy labeling: `make label`
2. Continue with `02_labeling_validation.ipynb`