# ACSES Pilot Study Analysis

This notebook analyzes the results of the pilot study to compare different LLMs for KBLI code validation.

## Key Questions to Answer:
1. **Success Rate**: For each model, what percentage of calls returned valid JSON vs. errors?
2. **Reasoning Quality**: Manual inspection of reasoning strings from each model
3. **Confidence Calibration**: Does confidence_score align with difficulty?
4. **Consensus**: How often did the 3 runs agree on the is_correct value?

In [None]:
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import os
from typing import Dict, List, Any

# Set up plotting style (matplotlib style sheet first, then the seaborn palette)
plt.style.use('default')
sns.set_palette("husl")
# pandas display options: no limit on displayed columns, cell text capped at 100 characters
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)

## 1. Load and Prepare Data

In [None]:
        def load_pilot_results(filename: str,
                               output_dir: str = os.path.join('..', 'data', 'output')) -> pd.DataFrame:
            """Load pilot study results from a JSONL (preferred) or legacy JSON file.

            The JSONL sibling of ``filename`` is tried first; if it does not exist the
            legacy JSON file is used. Either extension may be passed in.

            Args:
                filename: Result file name, ending in ``.json`` or ``.jsonl``.
                output_dir: Directory that holds the result files.

            Returns:
                One row per result record. An empty DataFrame is returned when no
                file is found or when the file cannot be read.
            """
            # Derive both candidate names from the extension only. A plain
            # str.replace('.json', '.jsonl') would turn 'x.jsonl' into 'x.jsonll'
            # and would also rewrite any '.json' inside the stem.
            stem, ext = os.path.splitext(filename)
            if ext == '.jsonl':
                jsonl_filename, json_filename = filename, stem + '.json'
            else:
                jsonl_filename, json_filename = stem + '.jsonl', filename

            jsonl_filepath = os.path.join(output_dir, jsonl_filename)
            json_filepath = os.path.join(output_dir, json_filename)

            if os.path.exists(jsonl_filepath):
                # Load JSONL format (preferred)
                print(f"📂 Loading JSONL format: {jsonl_filename}")
                data = []
                try:
                    with open(jsonl_filepath, 'r', encoding='utf-8') as f:
                        for line_num, line in enumerate(f, 1):
                            line = line.strip()
                            if not line:
                                continue
                            try:
                                data.append(json.loads(line))
                            except json.JSONDecodeError as e:
                                # A single corrupt record must not discard the whole run
                                print(f"⚠️  Warning: Invalid JSON on line {line_num}: {e}")

                    df = pd.DataFrame(data)
                    print(f"✅ Loaded {len(df)} results from {jsonl_filename}")
                    return df

                except Exception as e:
                    # Best effort: the notebook keeps running on an empty frame
                    print(f"❌ Error loading JSONL file: {e}")
                    return pd.DataFrame()

            if os.path.exists(json_filepath):
                # Load legacy JSON format
                print(f"📂 Loading legacy JSON format: {json_filename}")
                try:
                    with open(json_filepath, 'r', encoding='utf-8') as f:
                        data = json.load(f)

                    df = pd.DataFrame(data)
                    print(f"✅ Loaded {len(df)} results from {json_filename}")
                    return df

                except Exception as e:
                    print(f"❌ Error loading JSON file: {e}")
                    return pd.DataFrame()

            # No files found: list what is available to help pick the right name
            print(f"⚠️  File not found: {json_filename} or {jsonl_filename}")
            print("Available files in output directory:")
            if os.path.exists(output_dir):
                for f in sorted(os.listdir(output_dir)):
                    if f.endswith(('.json', '.jsonl')):
                        print(f"  - {f}")
            return pd.DataFrame()
        
        # Load results for different models (done once; the cell previously repeated
        # this whole section, reading and reporting the same file twice)
        # Update these filenames based on what you've actually generated
        flash_results = load_pilot_results('pilot_results_gemini_1_5_flash_latest.json')
        # pro_results = load_pilot_results('pilot_results_gemini_1_5_pro_latest.json')  # Uncomment when available

        # Display basic info
        if not flash_results.empty:
            print(f"\nFlash results shape: {flash_results.shape}")
            print(f"Columns: {list(flash_results.columns)}")
            # Not every result file is guaranteed to carry timestamps
            if 'timestamp' in flash_results.columns:
                print(f"Date range: {flash_results['timestamp'].min()} to {flash_results['timestamp'].max()}")

## 2. Success Rate Analysis

In [None]:
def analyze_success_rate(df: pd.DataFrame, model_name: str) -> Dict[str, Any]:
    """Analyze success rate and error patterns.

    Args:
        df: One row per API call; must contain a ``success`` column. An
            ``error_type`` column is used when present.
        model_name: Label stored under the ``model`` key of the result.

    Returns:
        A dict with ``model``, ``total_calls``, ``successful_calls``,
        ``success_rate``, ``error_count`` and ``error_types`` (error type ->
        count). Empty dict when ``df`` is empty. All numbers are native Python
        types so the result can be serialized to JSON as numbers.
    """
    if df.empty:
        return {}

    total_calls = len(df)
    # int() strips the NumPy scalar type, which json cannot serialize natively
    successful_calls = int(df['success'].sum())
    success_rate = successful_calls / total_calls

    # Error analysis
    errors = df[df['success'] == False]
    if not errors.empty and 'error_type' in errors.columns:
        error_types = {
            kind: int(count)
            for kind, count in errors['error_type'].value_counts().items()
        }
    else:
        # No failures, or the file does not record an error type
        error_types = {}

    return {
        'model': model_name,
        'total_calls': total_calls,
        'successful_calls': successful_calls,
        'success_rate': success_rate,
        'error_count': len(errors),
        'error_types': error_types,
    }

# Compute the success statistics for each model under study
flash_success = analyze_success_rate(flash_results, 'Gemini 1.5 Flash')
# pro_success = analyze_success_rate(pro_results, 'Gemini 1.5 Pro')  # Uncomment when available

print("📊 SUCCESS RATE ANALYSIS", "=" * 50, sep="\n")

# An empty dict means there was nothing to analyze, so nothing is reported
if flash_success:
    report_lines = [
        f"\n{flash_success['model']}:",
        f"  Total API calls: {flash_success['total_calls']}",
        f"  Successful calls: {flash_success['successful_calls']}",
        f"  Success rate: {flash_success['success_rate']:.1%}",
    ]
    # The error breakdown is only shown when at least one error type was recorded
    if flash_success['error_types']:
        report_lines.append(f"  Error types: {flash_success['error_types']}")
    print("\n".join(report_lines))

In [None]:
# Visualize success rates
if not flash_results.empty:
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # Success rate bar chart
    models = ['Gemini 1.5 Flash']  # Add more models when available
    success_rates = [flash_success['success_rate']]

    axes[0].bar(models, success_rates, color=['skyblue'])
    axes[0].set_ylabel('Success Rate')
    axes[0].set_title('API Call Success Rate by Model')
    # Headroom above 1.0 keeps the label of a ~100% bar inside the axes
    # instead of colliding with the title
    axes[0].set_ylim(0, 1.1)

    # Add percentage labels on bars
    for i, rate in enumerate(success_rates):
        axes[0].text(i, rate + 0.01, f'{rate:.1%}', ha='center', va='bottom')

    # Processing time distribution for successful calls
    successful_flash = flash_results[flash_results['success'] == True]
    processing_times = pd.Series(dtype=float)
    if 'processing_time_seconds' in successful_flash.columns:
        # Missing or non-numeric timings would make the histogram range undefined
        processing_times = pd.to_numeric(
            successful_flash['processing_time_seconds'], errors='coerce'
        ).dropna()

    if not processing_times.empty:
        axes[1].hist(processing_times, bins=20, alpha=0.7,
                    label='Flash', color='skyblue')
        axes[1].set_xlabel('Processing Time (seconds)')
        axes[1].set_ylabel('Frequency')
        axes[1].set_title('Processing Time Distribution')
        axes[1].legend()
    else:
        # Nothing to plot: hide the panel rather than showing an empty frame
        axes[1].set_visible(False)

    plt.tight_layout()
    plt.show()

## 3. Reasoning Quality Analysis

In [None]:
def sample_reasoning_analysis(df: pd.DataFrame, model_name: str, n_samples: int = 10) -> pd.DataFrame:
    """Draw a reproducible sample of successful results for manual reasoning review.

    Args:
        df: Result rows; must contain a ``success`` column.
        model_name: Only used in the message printed when nothing succeeded.
        n_samples: Upper bound on the number of rows drawn.

    Returns:
        The sampled rows restricted to the review columns that exist in ``df``.
        When a ``reasoning`` column is present, ``reasoning_length`` and
        ``mentions_hierarchy`` are added. Empty DataFrame when ``df`` is empty
        or holds no successful rows.
    """
    if df.empty:
        return pd.DataFrame()

    ok_rows = df[df['success'] == True].copy()
    if ok_rows.empty:
        print(f"No successful results found for {model_name}")
        return pd.DataFrame()

    # Fixed seed so the same rows are reviewed on every run
    n_draw = min(n_samples, len(ok_rows))
    drawn = ok_rows.sample(n_draw, random_state=42)

    # Keep only the review columns this result file actually provides
    wanted = ('sample_id', 'assigned_kbli_code', 'is_correct', 'confidence_score',
              'original_text', 'reasoning')
    kept = [name for name in wanted if name in drawn.columns]
    report = drawn[kept].copy()

    if 'reasoning' not in report.columns:
        return report

    # Simple text metrics: length, and whether any hierarchy keyword appears
    reasoning_text = report['reasoning'].astype(str)
    report['reasoning_length'] = reasoning_text.str.len()
    report['mentions_hierarchy'] = reasoning_text.str.contains(
        'section|division|group|class|sub-class', case=False, na=False
    )
    return report

# Sample reasoning for manual inspection
print("🔍 REASONING QUALITY SAMPLES")
print("=" * 50)

if not flash_results.empty:
    flash_reasoning = sample_reasoning_analysis(flash_results, 'Flash', 15)

    if not flash_reasoning.empty:
        print(f"\n📝 Sample reasoning from Gemini 1.5 Flash:")
        # The metric columns only exist when the results carry a 'reasoning' column
        if 'reasoning_length' in flash_reasoning.columns:
            print(f"Average reasoning length: {flash_reasoning['reasoning_length'].mean():.0f} characters")
        if 'mentions_hierarchy' in flash_reasoning.columns:
            print(f"Mentions hierarchy: {flash_reasoning['mentions_hierarchy'].mean():.1%}")

        # Display first few examples, limited to the columns that are available
        overview_cols = [
            col for col in ('assigned_kbli_code', 'is_correct', 'confidence_score',
                            'reasoning_length', 'mentions_hierarchy')
            if col in flash_reasoning.columns
        ]
        display(flash_reasoning[overview_cols].head())

In [None]:
# Display detailed reasoning examples
def _is_missing(value) -> bool:
    """Return True for None and for float NaN (how pandas stores a missing cell)."""
    return value is None or (isinstance(value, float) and pd.isna(value))


def _preview(value, limit: int) -> str:
    """Return `value` as text cut to `limit` characters, or 'N/A' when it is missing."""
    return 'N/A' if _is_missing(value) else str(value)[:limit]


if not flash_results.empty and not flash_reasoning.empty:
    print("📖 DETAILED REASONING EXAMPLES")
    print("=" * 60)

    for i, (idx, row) in enumerate(flash_reasoning.head(3).iterrows()):
        verdict = row.get('is_correct')
        if _is_missing(verdict):
            # NaN is truthy, so a missing verdict must not be reported as 'Correct'
            prediction = 'N/A'
        else:
            prediction = 'Correct' if verdict else 'Incorrect'

        print(f"\nExample {i+1}:")
        print(f"KBLI Code: {row.get('assigned_kbli_code', 'N/A')}")
        print(f"Prediction: {prediction}")
        print(f"Confidence: {row.get('confidence_score', 'N/A')}")
        # _preview avoids a TypeError when the cell is NaN/None instead of text
        print(f"Job Description: {_preview(row.get('original_text', 'N/A'), 100)}...")
        print(f"Reasoning: {_preview(row.get('reasoning', 'N/A'), 300)}...")
        print("-" * 40)

## 4. Confidence Calibration Analysis

In [None]:
def analyze_confidence_calibration(df: pd.DataFrame, model_name: str) -> Dict[str, Any]:
    """Analyze how well confidence scores align with correctness.

    Args:
        df: Result rows; must contain a ``success`` column. ``confidence_score``
            and ``is_correct`` are used when present.
        model_name: Label stored under the ``model`` key of the result.

    Returns:
        A dict with ``model`` and the mean/std/min/max of the numeric confidence
        scores of successful rows. When ``is_correct`` is available it also holds
        ``mean_conf_correct``, ``mean_conf_incorrect`` and ``conf_difference``;
        each is ``None`` when it cannot be computed. Empty dict when there is no
        usable data. All numbers are native Python floats.
    """
    if df.empty:
        return {}

    successful = df[df['success'] == True]
    if successful.empty or 'confidence_score' not in successful.columns:
        return {}

    # Work on a separate Series instead of assigning into the filtered frame,
    # and drop scores that are not numeric
    scores = pd.to_numeric(successful['confidence_score'], errors='coerce')
    valid = scores.notna()
    scores = scores[valid]

    if scores.empty:
        return {}

    # Basic statistics (float() strips the NumPy scalar type for JSON export)
    conf_stats = {
        'model': model_name,
        'mean_confidence': float(scores.mean()),
        'std_confidence': float(scores.std()),
        'min_confidence': float(scores.min()),
        'max_confidence': float(scores.max())
    }

    # Confidence by correctness
    if 'is_correct' in successful.columns:
        verdicts = successful.loc[valid, 'is_correct']
        correct_conf = scores[verdicts == True]
        incorrect_conf = scores[verdicts == False]

        mean_correct = float(correct_conf.mean()) if not correct_conf.empty else None
        mean_incorrect = float(incorrect_conf.mean()) if not incorrect_conf.empty else None
        both_present = mean_correct is not None and mean_incorrect is not None

        conf_stats.update({
            'mean_conf_correct': mean_correct,
            'mean_conf_incorrect': mean_incorrect,
            'conf_difference': (mean_correct - mean_incorrect) if both_present else None
        })

    return conf_stats

# Analyze confidence calibration
flash_conf = analyze_confidence_calibration(flash_results, 'Flash')

print("🎯 CONFIDENCE CALIBRATION ANALYSIS")
print("=" * 50)

if flash_conf:
    print(f"\n{flash_conf['model']} Model:")
    print(f"  Mean confidence: {flash_conf['mean_confidence']:.3f}")
    print(f"  Std confidence: {flash_conf['std_confidence']:.3f}")
    print(f"  Range: {flash_conf['min_confidence']:.3f} - {flash_conf['max_confidence']:.3f}")

    # Each per-correctness value can be None on its own (e.g. no incorrect
    # predictions at all), and None cannot be formatted with ':.3f'
    for label, key in (
        ('Mean confidence (correct)', 'mean_conf_correct'),
        ('Mean confidence (incorrect)', 'mean_conf_incorrect'),
        ('Difference', 'conf_difference'),
    ):
        value = flash_conf.get(key)
        if value is not None:
            print(f"  {label}: {value:.3f}")

In [None]:
# Plot the confidence distribution and its split by correctness
if not flash_results.empty:
    successful_flash = flash_results[flash_results['success'] == True].copy()

    if not successful_flash.empty and 'confidence_score' in successful_flash.columns:
        # Coerce to numbers and discard rows whose confidence is not numeric
        successful_flash['confidence_score'] = pd.to_numeric(
            successful_flash['confidence_score'], errors='coerce'
        )
        successful_flash = successful_flash.dropna(subset=['confidence_score'])

        if not successful_flash.empty:
            fig, axes = plt.subplots(1, 2, figsize=(15, 5))
            mean_conf = successful_flash['confidence_score'].mean()

            # Left panel: overall distribution with the mean marked
            axes[0].hist(successful_flash['confidence_score'], bins=20, alpha=0.7,
                        color='skyblue', edgecolor='black')
            axes[0].set_xlabel('Confidence Score')
            axes[0].set_ylabel('Frequency')
            axes[0].set_title('Confidence Score Distribution')
            axes[0].axvline(mean_conf, color='red', linestyle='--',
                           label=f'Mean: {mean_conf:.3f}')
            axes[0].legend()

            # Right panel: drawn only when both correct and incorrect rows exist
            if 'is_correct' in successful_flash.columns:
                is_correct_col = successful_flash['is_correct']
                correct_conf = successful_flash.loc[is_correct_col == True, 'confidence_score']
                incorrect_conf = successful_flash.loc[is_correct_col == False, 'confidence_score']

                if not (correct_conf.empty or incorrect_conf.empty):
                    axes[1].hist([correct_conf, incorrect_conf], bins=15, alpha=0.7,
                                label=['Correct', 'Incorrect'], color=['green', 'red'])
                    axes[1].set_xlabel('Confidence Score')
                    axes[1].set_ylabel('Frequency')
                    axes[1].set_title('Confidence by Correctness')
                    axes[1].legend()

            plt.tight_layout()
            plt.show()

## 5. Consensus Analysis

In [None]:
def analyze_consensus(df: pd.DataFrame, model_name: str, n_runs: int = 3) -> Any:
    """Analyze how often multiple runs agree on the is_correct value.

    Args:
        df: Result rows; must contain a ``success`` column. Rows are grouped by
            ``sample_id``; ``is_correct``, ``confidence_score`` and
            ``assigned_kbli_code`` are used when present.
        model_name: Label stored under the ``model`` key of the result.
        n_runs: Minimum number of successful runs a sample needs to be included.

    Returns:
        One of three shapes (callers distinguish them with ``isinstance``):
        - ``{}`` when ``df`` is empty, has no successful rows, or has no
          ``sample_id`` column;
        - ``{'model': ..., 'no_data': True}`` when no sample qualifies;
        - ``(consensus_stats, consensus_df)`` otherwise, where ``consensus_df``
          holds one row per sample in order of first appearance.
    """
    if df.empty:
        return {}

    successful = df[df['success'] == True]
    if successful.empty or 'sample_id' not in successful.columns:
        return {}

    has_confidence = 'confidence_score' in successful.columns
    consensus_data = []

    # Without an is_correct column there is nothing to compare between runs
    if 'is_correct' in successful.columns:
        # One pass over the groups instead of re-filtering the frame per sample;
        # sort=False keeps the samples in order of first appearance
        for sample_id, sample_runs in successful.groupby('sample_id', sort=False):
            total_count = len(sample_runs)
            if total_count < n_runs:
                continue  # Skip incomplete samples

            correct_count = int((sample_runs['is_correct'] == True).sum())

            if correct_count == total_count:
                consensus_type = 'unanimous_correct'
            elif correct_count == 0:
                consensus_type = 'unanimous_incorrect'
            elif correct_count > total_count / 2:
                consensus_type = 'majority_correct'
            else:
                # Also covers an exact tie, which can occur with an even run count
                consensus_type = 'majority_incorrect'

            if has_confidence:
                mean_confidence = float(
                    pd.to_numeric(sample_runs['confidence_score'], errors='coerce').mean()
                )
            else:
                mean_confidence = None

            consensus_data.append({
                'sample_id': sample_id,
                'assigned_kbli_code': sample_runs.iloc[0].get('assigned_kbli_code', 'N/A'),
                'correct_count': correct_count,
                'total_count': total_count,
                'consensus_type': consensus_type,
                'mean_confidence': mean_confidence
            })

    if not consensus_data:
        return {'model': model_name, 'no_data': True}

    consensus_df = pd.DataFrame(consensus_data)
    types = consensus_df['consensus_type']

    # Exact label matching: a substring test for 'correct' also matches the
    # '*_incorrect' labels and would always report a 100% correct rate
    is_unanimous = types.isin(['unanimous_correct', 'unanimous_incorrect'])
    is_correct_consensus = types.isin(['unanimous_correct', 'majority_correct'])

    # Calculate statistics (native Python numbers so they export cleanly to JSON)
    consensus_stats = {
        'model': model_name,
        'total_samples': len(consensus_df),
        'consensus_breakdown': {label: int(count) for label, count in types.value_counts().items()},
        'unanimous_rate': float(is_unanimous.mean()),
        'correct_rate': float(is_correct_consensus.mean())
    }

    return consensus_stats, consensus_df

# Run the consensus analysis and report agreement between repeated runs
print("🤝 CONSENSUS ANALYSIS")
print("=" * 50)

if not flash_results.empty:
    flash_consensus = analyze_consensus(flash_results, 'Flash')

    if not isinstance(flash_consensus, tuple):
        # A plain dict means there was nothing to analyze
        if flash_consensus.get('no_data'):
            print(f"\n{flash_consensus['model']} Model: No consensus data available")
    else:
        # A tuple carries the summary statistics and the per-sample table
        flash_stats, flash_consensus_df = flash_consensus
        total_samples = flash_stats['total_samples']

        print(f"\n{flash_stats['model']} Model:")
        print(f"  Total samples analyzed: {total_samples}")
        print(f"  Unanimous agreement rate: {flash_stats['unanimous_rate']:.1%}")
        print(f"  Overall correct rate: {flash_stats['correct_rate']:.1%}")
        print(f"  Consensus breakdown:")
        for label, n_label in flash_stats['consensus_breakdown'].items():
            print(f"    {label}: {n_label} ({n_label/total_samples:.1%})")

In [None]:
# Plot how samples split across consensus types, and the confidence of each type
if not flash_results.empty and 'flash_consensus_df' in locals():
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # One fixed color per consensus type; unknown types fall back to gray
    colors = {'unanimous_correct': 'green', 'unanimous_incorrect': 'red',
             'majority_correct': 'lightgreen', 'majority_incorrect': 'lightcoral'}

    # Left panel: number of samples per consensus type
    consensus_counts = flash_consensus_df['consensus_type'].value_counts()
    count_positions = range(len(consensus_counts))
    axes[0].bar(count_positions, consensus_counts.values,
                color=[colors.get(ct, 'gray') for ct in consensus_counts.index])
    axes[0].set_xticks(count_positions)
    axes[0].set_xticklabels(consensus_counts.index, rotation=45, ha='right')
    axes[0].set_ylabel('Number of Samples')
    axes[0].set_title('Consensus Type Distribution')

    # Right panel: mean confidence per consensus type, when any confidence exists
    has_confidence = ('mean_confidence' in flash_consensus_df.columns
                      and flash_consensus_df['mean_confidence'].notna().any())
    if has_confidence:
        consensus_conf = flash_consensus_df.groupby('consensus_type')['mean_confidence'].mean()
        conf_positions = range(len(consensus_conf))

        axes[1].bar(conf_positions, consensus_conf.values,
                    color=[colors.get(ct, 'gray') for ct in consensus_conf.index])
        axes[1].set_xticks(conf_positions)
        axes[1].set_xticklabels(consensus_conf.index, rotation=45, ha='right')
        axes[1].set_ylabel('Mean Confidence Score')
        axes[1].set_title('Mean Confidence by Consensus Type')

    plt.tight_layout()
    plt.show()

## 6. Summary and Recommendations

In [None]:
# Print the headline numbers followed by rule-based recommendations
print("📋 PILOT STUDY SUMMARY & RECOMMENDATIONS")
print("=" * 60)

if not flash_results.empty:
    print(f"\n🤖 GEMINI 1.5 FLASH RESULTS:")
    if flash_success:
        print(f"  ✓ Success Rate: {flash_success['success_rate']:.1%}")
    if flash_conf:
        print(f"  ✓ Mean Confidence: {flash_conf['mean_confidence']:.3f}")
        # Compare against None: a difference of exactly 0.0 is a real result
        # and must still be reported
        if flash_conf.get('conf_difference') is not None:
            print(f"  ✓ Confidence Calibration: {flash_conf['conf_difference']:.3f} difference")
    if 'flash_stats' in locals():
        print(f"  ✓ Unanimous Agreement: {flash_stats['unanimous_rate']:.1%}")

print("\n💡 RECOMMENDATIONS:")
print("\n1. MODEL SELECTION:")
if flash_success and flash_success['success_rate'] > 0.9:
    print("   ✅ Flash shows good API reliability (>90% success rate)")
elif flash_success:
    print(f"   ⚠️  Flash success rate is {flash_success['success_rate']:.1%} - investigate error patterns")

print("\n2. PROMPT ENGINEERING:")
# 'mentions_hierarchy' only exists when the sampled results had a reasoning column
if ('flash_reasoning' in locals() and not flash_reasoning.empty
        and 'mentions_hierarchy' in flash_reasoning.columns):
    hierarchy_mention_rate = flash_reasoning['mentions_hierarchy'].mean()
    if hierarchy_mention_rate > 0.7:
        print("   ✅ Model effectively uses hierarchical context")
    else:
        print("   ⚠️  Consider emphasizing hierarchy usage in prompts")

print("\n3. QUALITY CONTROL:")
if 'flash_stats' in locals() and flash_stats['unanimous_rate'] > 0.6:
    print("   ✅ Good consensus rate - 3 runs provide reliable validation")
elif 'flash_stats' in locals():
    print("   ⚠️  Low consensus rate - consider increasing runs or adjusting temperature")

print("\n4. NEXT STEPS:")
print("   📝 Run comparison with Gemini 1.5 Pro")
print("   📊 Expand sample size for full validation")
print("   🔧 Fine-tune prompts based on error analysis")
print("   🎯 Implement confidence-based filtering for production")

## 7. Export Results for Further Analysis

In [None]:
# Export summary statistics
def _to_jsonable(value):
    """Convert NumPy scalars to native Python values; fall back to str for anything else."""
    if isinstance(value, np.generic):
        return value.item()
    return str(value)


summary_data = {
    'analysis_timestamp': pd.Timestamp.now().isoformat(),
    'flash_results': flash_success if 'flash_success' in locals() else {},
    'flash_confidence': flash_conf if 'flash_conf' in locals() else {},
    'flash_consensus': flash_stats if 'flash_stats' in locals() else {}
}

# Save summary
export_dir = os.path.join('..', 'data', 'output')
# The directory may not exist yet when no result file has been generated
os.makedirs(export_dir, exist_ok=True)

output_path = os.path.join(export_dir, 'pilot_analysis_summary.json')
with open(output_path, 'w', encoding='utf-8') as f:
    # _to_jsonable keeps counts and rates as JSON numbers; a plain default=str
    # would write NumPy integers as quoted strings
    json.dump(summary_data, f, indent=2, ensure_ascii=False, default=_to_jsonable)

print(f"📊 Analysis summary exported to: {output_path}")

# Export detailed consensus data if available
if 'flash_consensus_df' in locals():
    consensus_path = os.path.join(export_dir, 'consensus_analysis.csv')
    flash_consensus_df.to_csv(consensus_path, index=False)
    print(f"📊 Consensus analysis exported to: {consensus_path}")

print("\n✅ Analysis complete! Ready for next phase of the pilot study.")