# Hypothesis 3: Hallucination Reduction and Time-to-Market Impact
**Implementing hallucination-reducing techniques in LLMs significantly improves (>30%) time to market in new product development.**

In [None]:
# Cell [0] - Enhanced Setup and Imports with Advanced Statistical Testing (FIXED) + Model Filtering
# Purpose: Import all required libraries and configure environment settings for SiFP COSMIC Estimation Analysis with advanced statistical methods
# Dependencies: pandas, numpy, matplotlib, seaborn, scipy, neo4j, scikit-learn, dotenv, pymc, arviz, bayesian-testing
# Breadcrumbs: Setup -> Environment Configuration -> Analysis Preparation -> Advanced Statistical Methods

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import warnings
from dotenv import load_dotenv

# Core statistical and machine learning imports
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# Database connections
from neo4j import GraphDatabase

# Check scipy version for permutation_test availability
import scipy
def parse_numeric_version(version_string):
    """
    Extract the leading numeric components of a dotted version string.

    Pre-release and development suffixes are ignored, so '1.12.0rc1' yields
    [1, 12, 0] and '1.7.0.dev0+abc' yields [1, 7, 0]. A plain int() on every
    dotted component would raise ValueError for such versions.

    Args:
        version_string (str): Version such as scipy.__version__

    Returns:
        list: Integer components, padded with zeros to at least [major, minor]
    """
    components = []
    for part in version_string.split('.'):
        digits = ''
        for char in part:
            if char not in '0123456789':
                break
            digits += char
        if not digits:
            # Non-numeric component (e.g. 'dev0+abc'): the numeric prefix is over
            break
        components.append(int(digits))
        if len(digits) != len(part):
            # Component carries a suffix (e.g. '0rc1'): nothing numeric follows
            break

    # Guarantee that major and minor can always be indexed
    while len(components) < 2:
        components.append(0)
    return components


scipy_version = parse_numeric_version(scipy.__version__)
# True for SciPy 1.7 and newer (same threshold as before, compared as a tuple)
SCIPY_HAS_PERMUTATION_TEST = tuple(scipy_version[:2]) >= (1, 7)

print(f"SciPy version: {scipy.__version__}")
print(f"Permutation test available: {SCIPY_HAS_PERMUTATION_TEST}")

# Advanced statistical testing imports with better error handling
# Every availability flag starts as False so that each name is defined no
# matter which of the optional imports below fails.
ADVANCED_STATS_AVAILABLE = False
PERMUTATION_TEST_AVAILABLE = False
BOOTSTRAP_AVAILABLE = False
BAYESIAN_AVAILABLE = False
BAYESIAN_TESTING_AVAILABLE = False
STATSMODELS_AVAILABLE = False

try:
    # Basic scipy stats (should always be available)
    from scipy.stats import (
        mannwhitneyu, wilcoxon, kruskal, friedmanchisquare,
        chi2_contingency, fisher_exact, pearsonr, spearmanr,
        kendalltau, norm, t as t_dist
    )
except ImportError as e:
    print(f"‚ö† Warning: Some statistical libraries not available: {e}")
else:
    # Attempt the import instead of trusting a version number: a failure here
    # only disables these two functions and no longer skips the optional
    # libraries probed below.
    try:
        from scipy.stats import permutation_test, bootstrap
        PERMUTATION_TEST_AVAILABLE = True
        BOOTSTRAP_AVAILABLE = True
        print("‚úì SciPy permutation_test and bootstrap imported successfully")
    except ImportError:
        print("‚ö† SciPy permutation_test not available (requires SciPy >= 1.7.0)")

    # Bayesian analysis imports
    try:
        import pymc as pm
        import arviz as az
        BAYESIAN_AVAILABLE = True
        print("‚úì PyMC and ArviZ imported successfully")
    except ImportError:
        print("‚ö† PyMC/ArviZ not available")

    # Bayesian hypothesis testing
    try:
        from bayesian_testing.experiments import BinaryDataTest, NormalDataTest
        BAYESIAN_TESTING_AVAILABLE = True
        print("‚úì Bayesian-testing imported successfully")
    except ImportError:
        print("‚ö† bayesian-testing not available")

    # Additional statistical utilities
    try:
        from statsmodels.stats.contingency_tables import mcnemar
        from statsmodels.stats.power import ttest_power, tt_ind_solve_power
        from statsmodels.stats.proportion import proportion_confint, proportions_ztest
        from statsmodels.stats.descriptivestats import describe
        from statsmodels.stats.multitest import multipletests
        STATSMODELS_AVAILABLE = True
        print("‚úì Statsmodels imported successfully")
    except ImportError:
        print("‚ö† Statsmodels not available")

    ADVANCED_STATS_AVAILABLE = True
    print("‚úì Basic advanced statistical libraries loaded successfully")

def custom_permutation_test(sample1, sample2, statistic_func=None, n_resamples=10000, alternative='two-sided', random_state=None):
    """
    Custom implementation of a two-sample permutation test for older SciPy versions

    Two calling conventions are accepted:
        custom_permutation_test(sample1, sample2, statistic_func, ...)
        custom_permutation_test((sample1, sample2), statistic_func, ...)
    The second form mirrors how scipy.stats.permutation_test is called, so the
    function keeps working when it is bound to the name `permutation_test`.

    Args:
        sample1: First sample (or a 2-tuple of samples in the SciPy-style call)
        sample2: Second sample (or the statistic callable in the SciPy-style call)
        statistic_func: Callable taking two samples and returning a scalar
        n_resamples (int): Number of random permutations to draw
        alternative (str): 'two-sided', 'greater', or 'less'
        random_state: None (use NumPy's global RNG), a seed, or an existing
            numpy RandomState / Generator instance

    Returns:
        Object with `statistic`, `pvalue` and `null_distribution` attributes

    Raises:
        TypeError: If no statistic callable was supplied
        ValueError: If `alternative` is not one of the supported values
    """
    # SciPy-style call: data tuple first, statistic second
    if statistic_func is None:
        if not callable(sample2):
            raise TypeError("statistic_func is required")
        statistic_func = sample2
        sample1, sample2 = sample1

    # Validate before doing any resampling work
    if alternative not in ('two-sided', 'greater', 'less'):
        raise ValueError("alternative must be 'two-sided', 'greater', or 'less'")

    # A private RandomState seeded with `random_state` draws the same stream
    # as np.random.seed(random_state) would, without clobbering the global RNG.
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Calculate observed statistic
    observed_stat = statistic_func(sample1, sample2)

    # Combine samples for permutation
    combined = np.concatenate([sample1, sample2])
    n1 = len(sample1)

    # Generate permutation distribution
    perm_stats = []
    for _ in range(n_resamples):
        shuffled = rng.permutation(combined)
        perm_stats.append(statistic_func(shuffled[:n1], shuffled[n1:]))
    perm_stats = np.array(perm_stats)

    # Mark the permutations that are at least as extreme as the observation
    if alternative == 'two-sided':
        as_extreme = np.abs(perm_stats) >= np.abs(observed_stat)
    elif alternative == 'greater':
        as_extreme = perm_stats >= observed_stat
    else:
        as_extreme = perm_stats <= observed_stat

    # Count the observed arrangement itself (+1 in numerator and denominator)
    # so a randomized test can never report an impossible p-value of exactly 0.
    p_value = (np.count_nonzero(as_extreme) + 1) / (n_resamples + 1)

    # Return result object similar to scipy's permutation_test
    class PermutationTestResult:
        def __init__(self, statistic, pvalue, null_distribution):
            self.statistic = statistic
            self.pvalue = pvalue
            self.null_distribution = null_distribution

    return PermutationTestResult(observed_stat, p_value, perm_stats)

def custom_bootstrap_ci(data, statistic_func, n_resamples=10000, confidence_level=0.95, random_state=None):
    """
    Custom implementation of percentile bootstrap confidence intervals

    Args:
        data: 1-D sample. A one-element tuple/list wrapping the sample, as in
            the SciPy call style bootstrap((data,), ...), is unwrapped so the
            function keeps working when it is bound to the name `bootstrap`.
        statistic_func: Callable taking one resampled array, returning a scalar
        n_resamples (int): Number of bootstrap resamples
        confidence_level (float): Coverage of the interval, e.g. 0.95
        random_state: None (use NumPy's global RNG), a seed, or an existing
            numpy RandomState / Generator instance

    Returns:
        Object with `confidence_interval` (a (low, high) named tuple that also
        unpacks and indexes like a plain 2-tuple), `bootstrap_distribution`
        and `standard_error` attributes
    """
    from collections import namedtuple

    # SciPy-style call: the sample arrives wrapped in a one-element sequence
    if isinstance(data, (tuple, list)) and len(data) == 1 and np.ndim(data[0]) >= 1:
        data = data[0]

    # A private RandomState seeded with `random_state` draws the same stream
    # as np.random.seed(random_state) would, without clobbering the global RNG.
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Generate bootstrap samples (resampling with replacement, same size as data)
    sample_size = len(data)
    bootstrap_stats = np.array([
        statistic_func(rng.choice(data, sample_size, replace=True))
        for _ in range(n_resamples)
    ])

    # Calculate percentile confidence interval
    alpha = 1 - confidence_level
    ci_lower = np.percentile(bootstrap_stats, (alpha / 2) * 100)
    ci_upper = np.percentile(bootstrap_stats, (1 - alpha / 2) * 100)

    ConfidenceInterval = namedtuple('ConfidenceInterval', ['low', 'high'])

    # Return result object similar to scipy's bootstrap
    class BootstrapResult:
        def __init__(self, confidence_interval, bootstrap_distribution):
            self.confidence_interval = confidence_interval
            self.bootstrap_distribution = bootstrap_distribution
            # Sample standard deviation of the bootstrap statistics; undefined
            # (NaN) when fewer than two resamples were drawn
            if len(bootstrap_distribution) > 1:
                self.standard_error = np.std(bootstrap_distribution, ddof=1)
            else:
                self.standard_error = float('nan')

    return BootstrapResult(ConfidenceInterval(ci_lower, ci_upper), bootstrap_stats)

def _resolve_target_model_ids():
    """
    Resolve the model IDs named by the RESULTS_ANALYSIS_MODEL_IDS variable.

    RESULTS_ANALYSIS_MODEL_IDS holds a comma-separated list of *environment
    variable names*; each of those variables holds one model ID. Blank entries
    (e.g. from a trailing comma) are skipped and repeated model IDs are kept
    only once, in first-seen order.

    Returns:
        list: Model ID strings (empty when nothing is configured)
    """
    target_model_ids = []
    results_analysis_models = os.getenv('RESULTS_ANALYSIS_MODEL_IDS', '').strip()

    for var_name in (name.strip() for name in results_analysis_models.split(',')):
        if not var_name:
            continue

        model_id = os.getenv(var_name, '').strip()
        if not model_id:
            print(f"‚ö† Warning: {var_name} not found in environment variables")
            continue

        if model_id in target_model_ids:
            # Same model referenced twice: list it once only
            continue

        target_model_ids.append(model_id)
        print(f"‚úì Added target model: {var_name} = {model_id}")

    return target_model_ids


def setup_analysis_environment():
    """
    Configure analysis environment with display options and styling

    Side effects: silences all warnings, sets the matplotlib style, changes
    pandas display options and loads variables from a .env file.

    Returns:
        dict: Configuration parameters for the analysis
    """
    # Suppress warnings for cleaner output
    warnings.filterwarnings('ignore')

    # Configure matplotlib styling. Matplotlib 3.6 renamed the bundled seaborn
    # styles to 'seaborn-v0_8-*'; older releases only know the unprefixed name.
    # If neither is installed the current style is left untouched.
    for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break

    # Configure pandas display settings for legal landscape format
    pd.set_option('display.width', 130)           # Set width threshold for legal landscape
    pd.set_option('display.max_columns', 25)     # Reasonable number of columns
    pd.set_option('display.max_colwidth', 25)    # Compact column width
    pd.set_option('display.precision', 2)        # Only 2 decimal places to save space
    pd.set_option('display.float_format', '{:.2f}'.format)  # Consistent float formatting
    # Note: expand_frame_repr is left at its default to allow natural wrapping at 130 chars

    # Load environment variables (must happen before any os.getenv lookups below)
    load_dotenv()

    # Parse target model IDs from environment
    target_model_ids = _resolve_target_model_ids()

    if not target_model_ids:
        print("‚ö† Warning: No target model IDs found. Will analyze all models.")
        print("   Please check RESULTS_ANALYSIS_MODEL_IDS in .env file")

    # Configuration parameters
    config = {
        'NEO4J_URI': os.getenv('NEO4J_URI'),
        'NEO4J_USER': os.getenv('NEO4J_USER'),
        'NEO4J_PASSWORD': os.getenv('NEO4J_PASSWORD'),
        'NEO4J_PROJECT_NAME': os.getenv('NEO4J_PROJECT_NAME'),
        'TARGET_MODEL_IDS': target_model_ids,  # List of specific model IDs to analyze
        'CONVERSION_FACTOR': 0.957,  # SiFP = 0.957 × UFP (Desharnais)
        'COST_PER_HOUR': 100,  # Industry standard for cost impact calculations

        # Statistical testing configuration
        'ALPHA_LEVEL': 0.05,  # Significance level
        'BOOTSTRAP_SAMPLES': 10000,  # Number of bootstrap samples
        'PERMUTATION_SAMPLES': 10000,  # Number of permutation samples
        'BAYESIAN_SAMPLES': 2000,  # Number of MCMC samples
        'BAYESIAN_CHAINS': 4,  # Number of MCMC chains
        'IMPROVEMENT_THRESHOLD': 0.30,  # 30% improvement threshold
        'POWER_TARGET': 0.80,  # Target statistical power

        # Advanced testing flags
        'ADVANCED_STATS_AVAILABLE': ADVANCED_STATS_AVAILABLE,
        'SCIPY_VERSION': scipy.__version__
    }

    print("‚úì Analysis environment configured successfully")
    print(f"‚úì Project: {config['NEO4J_PROJECT_NAME']}")
    print(f"‚úì Target models for analysis: {len(target_model_ids)}")
    for i, model_id in enumerate(target_model_ids, 1):
        print(f"   {i}. {model_id}")
    print(f"‚úì Advanced statistical methods: {'Available' if ADVANCED_STATS_AVAILABLE else 'Limited'}")

    return config

def initialize_statistical_methods():
    """
    Smoke-test each statistical method family and record which ones work

    Every family is exercised once on a tiny input. A family is marked True
    only when its call completes; any exception is reported and leaves it False.

    Returns:
        dict: Flags keyed by 'permutation_tests', 'bootstrap_tests',
              'bayesian_analysis' and 'power_analysis'
    """
    methods_config = dict.fromkeys(
        ('permutation_tests', 'bootstrap_tests', 'bayesian_analysis', 'power_analysis'),
        False,
    )

    # --- Permutation tests: native SciPy if imported, custom fallback otherwise
    try:
        test_data1 = np.random.normal(0, 1, 10)
        test_data2 = np.random.normal(0, 1, 10)

        def mean_difference(x, y):
            return np.mean(x) - np.mean(y)

        if PERMUTATION_TEST_AVAILABLE:
            permutation_test((test_data1, test_data2), mean_difference,
                             n_resamples=100, random_state=42)
            success_note = "‚úì SciPy permutation_test available and working"
        else:
            custom_permutation_test(test_data1, test_data2, mean_difference,
                                    n_resamples=100, random_state=42)
            success_note = "‚úì Custom permutation_test fallback available and working"

        methods_config['permutation_tests'] = True
        print(success_note)

    except Exception as e:
        print(f"‚ö† Warning: Permutation test functionality issue: {e}")
        methods_config['permutation_tests'] = False

    # --- Bootstrap: reuses test_data1 created in the permutation section
    try:
        if BOOTSTRAP_AVAILABLE:
            bootstrap((test_data1,), np.mean, n_resamples=100,
                      random_state=42)
            success_note = "‚úì SciPy bootstrap available and working"
        else:
            custom_bootstrap_ci(test_data1, np.mean, n_resamples=100,
                                random_state=42)
            success_note = "‚úì Custom bootstrap fallback available and working"

        methods_config['bootstrap_tests'] = True
        print(success_note)

    except Exception as e:
        print(f"‚ö† Warning: Bootstrap functionality issue: {e}")
        methods_config['bootstrap_tests'] = False

    # --- Bayesian analysis: only checks that a trivial model can be declared
    if BAYESIAN_AVAILABLE:
        try:
            with pm.Model():
                pm.Normal('mu', mu=0, sigma=1)
            methods_config['bayesian_analysis'] = True
            print("‚úì Bayesian analysis available and working")
        except Exception as e:
            print(f"‚ö† Warning: Bayesian analysis issue: {e}")
            methods_config['bayesian_analysis'] = False

    # --- Power analysis: one solve call with fixed inputs
    if STATSMODELS_AVAILABLE:
        try:
            tt_ind_solve_power(effect_size=0.5, nobs1=20, alpha=0.05)
            methods_config['power_analysis'] = True
            print("‚úì Power analysis available and working")
        except Exception as e:
            print(f"‚ö† Warning: Power analysis issue: {e}")
            methods_config['power_analysis'] = False

    return methods_config

def setup_bayesian_environment():
    """
    Prepare ArviZ styling and assemble the sampler settings for Bayesian runs

    Returns:
        dict: Sampler settings (target_accept, chains, draws, tune, cores,
              return_inferencedata, random_seed); empty when PyMC/ArviZ were
              not imported or when the setup raised an exception
    """
    # Nothing to configure without the Bayesian stack
    if not BAYESIAN_AVAILABLE:
        return {}

    try:
        # Plot styling for ArviZ figures
        az.style.use('arviz-darkgrid')

        # set_tt_config is a PyMC3-era call; an AttributeError simply means
        # the installed PyMC does not provide it, which is not treated as an error.
        try:
            pm.set_tt_config('floatX', 'float64')
        except AttributeError:
            pass

        # Use at most four worker processes, and at least one
        worker_count = min(4, os.cpu_count() or 1)

        sampler_settings = dict(
            target_accept=0.9,
            chains=4,
            draws=2000,
            tune=1000,
            cores=worker_count,
            return_inferencedata=True,
            random_seed=42,
        )

        print("‚úì Bayesian analysis environment configured")
        return sampler_settings

    except Exception as e:
        print(f"‚ö† Warning: Bayesian environment setup issue: {e}")
        return {}

# Execute setup when cell runs
# Run the three setup steps once, when the cell executes
CONFIG = setup_analysis_environment()
STATISTICAL_CONFIG = initialize_statistical_methods()
BAYESIAN_CONFIG = setup_bayesian_environment()

# Bind the hand-written fallbacks to the SciPy names when SciPy's own
# implementations were not imported
if not PERMUTATION_TEST_AVAILABLE:
    permutation_test = custom_permutation_test
    print("‚úì Custom permutation_test function registered globally")

if not BOOTSTRAP_AVAILABLE:
    bootstrap = custom_bootstrap_ci
    print("‚úì Custom bootstrap function registered globally")

# ---- Configuration summary -------------------------------------------------
print("\n" + "=" * 60)
print("STATISTICAL ANALYSIS CONFIGURATION")
print("=" * 60)
print(f"SciPy Version: {scipy.__version__}")
print("Available Methods:")
print("  ‚úì Basic statistics and visualization")
print("  ‚úì Standard hypothesis testing (t-tests, Mann-Whitney)")

# Resampling methods also report whether the SciPy or the custom version is used
for config_key, label, natively_available in (
    ('permutation_tests', 'Permutation tests', PERMUTATION_TEST_AVAILABLE),
    ('bootstrap_tests', 'Advanced bootstrapping', BOOTSTRAP_AVAILABLE),
):
    if STATISTICAL_CONFIG.get(config_key):
        marker = '‚úì'
        origin = '(scipy native)' if natively_available else '(custom fallback)'
    else:
        marker = '‚úó'
        origin = ''
    print(f"  {marker} {label} {origin}")

for config_key, label in (
    ('bayesian_analysis', 'Bayesian hypothesis testing'),
    ('power_analysis', 'Power analysis'),
):
    marker = '‚úì' if STATISTICAL_CONFIG.get(config_key) else '‚úó'
    print(f"  {marker} {label}")

print("\nConfiguration Parameters:")
print(f"  Alpha level: {CONFIG['ALPHA_LEVEL']}")
print(f"  Improvement threshold: {CONFIG['IMPROVEMENT_THRESHOLD']:.0%}")
print(f"  Bootstrap samples: {CONFIG['BOOTSTRAP_SAMPLES']:,}")
print(f"  Permutation samples: {CONFIG['PERMUTATION_SAMPLES']:,}")
if BAYESIAN_CONFIG:
    print(f"  MCMC samples: {CONFIG['BAYESIAN_SAMPLES']:,}")
    print(f"  MCMC chains: {CONFIG['BAYESIAN_CHAINS']}")

print("\nModel Filtering Configuration:")
if not CONFIG['TARGET_MODEL_IDS']:
    print("  Analysis scope: ALL MODELS (no filtering applied)")
else:
    print(f"  Analysis scope: FILTERED to {len(CONFIG['TARGET_MODEL_IDS'])} specific models")
    for i, model_id in enumerate(CONFIG['TARGET_MODEL_IDS'], 1):
        print(f"    {i}. {model_id}")

# Remind the user which fallbacks are active and how to get the native versions
for natively_available, fallback_name in (
    (PERMUTATION_TEST_AVAILABLE, 'permutation test'),
    (BOOTSTRAP_AVAILABLE, 'bootstrap'),
):
    if not natively_available:
        print(f"\n‚ö† NOTE: Using custom {fallback_name} implementation")
        print("   To use native SciPy implementation: pip install 'scipy>=1.7.0'")

print("\n‚úì Enhanced statistical analysis environment ready with model filtering")

In [None]:
# Cell [1] - Load and Process Java Code Metrics
# Purpose: Load actual implementation metrics from iTrust java.csv for baseline establishment
# Dependencies: pandas, configured environment (Cell 0)
# Breadcrumbs: Setup -> Code Metrics Loading -> Baseline Data Preparation

def load_code_metrics(csv_path='../datasets/iTrust/iTrust/java.csv'):
    """
    Load and process Java code metrics from iTrust dataset

    Only rows whose 'Kind' column equals 'File' are kept, and a derived
    'TotalUnits' column (CountDeclClass + CountDeclMethod) is added.

    Args:
        csv_path (str): Location of the metrics CSV; defaults to the iTrust
            java.csv path used by this notebook

    Returns:
        pd.DataFrame: Processed code metrics with derived calculations

    Raises:
        KeyError: If the CSV lacks any of the required columns
        Exception: Any read error is printed and re-raised unchanged
    """
    # Metrics kept for the SiFP analysis
    metrics_columns = [
        'Name', 'CountLine', 'CountLineCode', 'CountLineComment',
        'CountDeclClass', 'CountDeclMethod', 'CountDeclMethodAll',
        'CountDeclExecutableUnit', 'Cyclomatic', 'MaxCyclomatic'
    ]

    try:
        java_df = pd.read_csv(csv_path)
        print(f"‚úì Loaded java.csv: {java_df.shape[0]} code entities")

        # Fail with an explicit list instead of an error on the first missing column
        missing_columns = [
            column for column in ['Kind'] + metrics_columns
            if column not in java_df.columns
        ]
        if missing_columns:
            raise KeyError(f"java.csv is missing required columns: {missing_columns}")

        # Filter to only File entries (excluding methods, functions, etc.)
        file_df = java_df[java_df['Kind'] == 'File'].copy()
        print(f"  Files: {len(file_df)}")

        # Create analysis-ready dataframe
        code_metrics_df = file_df[metrics_columns].copy()

        # Derived metric: classes plus methods declared in the file
        code_metrics_df['TotalUnits'] = (
            code_metrics_df['CountDeclClass'] +
            code_metrics_df['CountDeclMethod']
        )

        return code_metrics_df

    except Exception as e:
        print(f"Error loading code metrics: {e}")
        raise

# Load and process the code metrics
code_metrics_df = load_code_metrics()

# Show the first rows so the load can be checked by eye
print("\nSample code metrics:")
print(code_metrics_df.head())

# Columns referenced by both the printed summary and the stored summary
loc_per_file_series = code_metrics_df['CountLineCode']
methods_per_file_series = code_metrics_df['CountDeclMethod']
complexity_series = code_metrics_df['Cyclomatic']
file_count = len(code_metrics_df)

# Print headline statistics
print("\nCode Metrics Summary:")
print("=" * 40)
print(f"  Total files: {file_count}")
print(f"  Average lines of code: {loc_per_file_series.mean():.0f}")
print(f"  Average methods per file: {methods_per_file_series.mean():.1f}")
print(f"  Average cyclomatic complexity: {complexity_series.mean():.1f}")
print(f"  Total LOC in codebase: {loc_per_file_series.sum():,}")

# Codebase-wide totals kept for later cells (key order is preserved)
code_summary = dict(
    total_files=file_count,
    total_lines=loc_per_file_series.sum(),
    total_classes=code_metrics_df['CountDeclClass'].sum(),
    total_methods=methods_per_file_series.sum(),
    avg_complexity=complexity_series.mean(),
    total_units=code_metrics_df['TotalUnits'].sum(),
)

print("\n‚úì Code metrics loaded and processed successfully")

In [None]:
# Cell [2] - Connect to Neo4j and Retrieve LLM SiFP Estimates (UPDATED WITH MODEL FILTERING)
# Purpose: Retrieve LLM-generated SiFP estimates for requirements with established ground truth links, filtered by target models
# Dependencies: Neo4j connection, json processing, CONFIG from Cell 0
# Breadcrumbs: Setup -> Code Metrics -> Neo4j Data Retrieval -> LLM Estimates Analysis

# Weights of the SiFP formula applied below: SiFP = 4.6 × UGEP + 7.0 × UGDG
_SIFP_UGEP_WEIGHT = 4.6
_SIFP_UGDG_WEIGHT = 7.0


def _parse_estimation_record(record):
    """
    Turn one estimation query row into a flat dict of counts and SiFP values.

    A missing or null 'ugeps' / 'ugdgs' list counts as zero items, and a null
    requirement text is treated as an empty string, so such rows are kept
    rather than discarded.
    """
    actor_data = json.loads(record['actor_json']) if record['actor_json'] else {}
    final_data = json.loads(record['final_json']) if record['final_json'] else {}

    # Extract UGEP and UGDG counts
    actor_ugep = len(actor_data.get('ugeps') or [])
    actor_ugdg = len(actor_data.get('ugdgs') or [])
    final_ugep = len(final_data.get('ugeps') or [])
    final_ugdg = len(final_data.get('ugdgs') or [])

    content = record['requirement_content'] or ''

    return {
        'requirement_id': record['requirement_id'],
        'requirement_content': content[:100] + '...',
        'model': record['model'],
        'actor_ugep': actor_ugep,
        'actor_ugdg': actor_ugdg,
        'actor_sifp': _SIFP_UGEP_WEIGHT * actor_ugep + _SIFP_UGDG_WEIGHT * actor_ugdg,
        'final_ugep': final_ugep,
        'final_ugdg': final_ugdg,
        'final_sifp': _SIFP_UGEP_WEIGHT * final_ugep + _SIFP_UGDG_WEIGHT * final_ugdg,
        'judge_score': record['judge_score'],
        'confidence': record['confidence']
    }


def _summarize_model_success(llm_estimates_df, ground_truth_requirements):
    """
    Print and return, per model, how many ground-truth requirements it estimated.
    """
    print(f"\nModel Success Rates (Ground Truth Requirements):")
    print("=" * 50)

    total_ground_truth = len(ground_truth_requirements)
    model_success = []
    for model, model_estimates in llm_estimates_df.groupby('model', sort=True):
        successful_reqs = model_estimates['requirement_id'].nunique()
        success_rate = successful_reqs / total_ground_truth * 100

        model_success.append({
            'model': model,
            'successful_estimates': successful_reqs,
            'total_ground_truth': total_ground_truth,
            'success_rate': success_rate
        })

        print(f"  {model}: {successful_reqs}/{total_ground_truth} ({success_rate:.1f}%)")

    return pd.DataFrame(model_success)


def retrieve_llm_estimates():
    """
    Connect to Neo4j and retrieve LLM SiFP estimates for ground truth requirements, filtered by target models

    Reads connection settings, project name and the optional model filter from
    the module-level CONFIG dict.

    Returns:
        tuple: (llm_estimates_df, ground_truth_requirements, model_success_df);
               both DataFrames are empty when no estimate matched

    Raises:
        Exception: Any connection or query error is printed and re-raised
    """
    # Defined before the try block so the finally clause can tell whether a
    # driver was ever created; otherwise a failed connection attempt would be
    # masked by an UnboundLocalError raised during cleanup.
    driver = None
    try:
        # Establish Neo4j connection using configuration
        driver = GraphDatabase.driver(
            CONFIG['NEO4J_URI'],
            auth=(CONFIG['NEO4J_USER'], CONFIG['NEO4J_PASSWORD'])
        )
        print(f"‚úì Connected to Neo4j for project: {CONFIG['NEO4J_PROJECT_NAME']}")

        target_models = CONFIG['TARGET_MODEL_IDS']

        with driver.session() as session:
            # First, identify TARGET requirements with ground truth
            gt_query = """
                MATCH (r1:Requirement {project: $project_name, type: 'TARGET'})
                WHERE EXISTS((r1)-[:GROUND_TRUTH]-()) OR EXISTS(()-[:GROUND_TRUTH]-(r1))
                RETURN DISTINCT r1.id as requirement_id
            """

            gt_result = session.run(gt_query, project_name=CONFIG['NEO4J_PROJECT_NAME'])
            ground_truth_requirements = [record['requirement_id'] for record in gt_result]

            print(f"‚úì Found {len(ground_truth_requirements)} TARGET requirements with ground truth")

            # Only the fixed filter clause is interpolated into the query text;
            # the model IDs themselves travel as a query parameter.
            if target_models:
                model_filter = "AND se.model IN $target_models"
                print(f"‚úì Applying model filter for {len(target_models)} specific models:")
                for i, model_id in enumerate(target_models, 1):
                    print(f"   {i}. {model_id}")
            else:
                model_filter = ""
                print(f"‚úì No model filtering applied - analyzing all models")

            estimation_query = f"""
                MATCH (r1:Requirement {{project: $project_name, type: 'TARGET'}})-[se:SIFP_ESTIMATION]->(r2:Requirement)
                WHERE r1.id IN $ground_truth_reqs
                  AND se.final_estimation IS NOT NULL
                  AND se.is_valid = true
                  {model_filter}
                RETURN DISTINCT r1.id as requirement_id,
                       r1.content as requirement_content,
                       se.model as model,
                       se.actor_analysis as actor_json,
                       se.final_estimation as final_json,
                       se.judge_evaluation as judge_eval_json,
                       se.confidence as confidence,
                       se.judge_confidence as judge_confidence,
                       se.judge_score as judge_score
                ORDER BY r1.id, se.model
            """

            # Prepare query parameters
            query_params = {
                'project_name': CONFIG['NEO4J_PROJECT_NAME'],
                'ground_truth_reqs': ground_truth_requirements
            }
            if target_models:
                query_params['target_models'] = target_models

            result = session.run(estimation_query, **query_params)

            # Process and structure the results; a malformed row is reported
            # and skipped so it cannot abort the whole retrieval
            records = []
            for record in result:
                try:
                    records.append(_parse_estimation_record(record))
                except (KeyError, TypeError, ValueError, AttributeError) as e:
                    print(f"Warning: Error processing record for {record.get('requirement_id', 'unknown')}: {e}")

        # Create estimates DataFrame
        llm_estimates_df = pd.DataFrame(records)

        if llm_estimates_df.empty:
            print("Warning: No LLM estimates found for ground truth requirements with specified models!")
            if target_models:
                print("This could be because:")
                print("  1. The specified models haven't analyzed these requirements")
                print("  2. The model IDs in .env don't match the exact model names in Neo4j")
                print("  3. The requirements don't have valid estimations from these models")
                print(f"\nTarget models specified: {target_models}")
            return pd.DataFrame(), ground_truth_requirements, pd.DataFrame()

        print(f"\n‚úì Retrieved {len(llm_estimates_df)} LLM estimates for ground truth requirements")

        # Verify model filtering worked correctly
        unique_models = sorted(llm_estimates_df['model'].unique())
        print(f"\nModels found in results: {len(unique_models)}")
        for i, model in enumerate(unique_models, 1):
            print(f"  {i}. {model}")

        # Check if any target models are missing
        if target_models:
            missing_models = set(target_models) - set(unique_models)
            if missing_models:
                print(f"\n‚ö† Warning: Target models not found in results:")
                for model in missing_models:
                    print(f"     ‚Ä¢ {model}")
            else:
                print(f"\n‚úì All target models found in results")

        # Calculate model success rates
        model_success_df = _summarize_model_success(llm_estimates_df, ground_truth_requirements)

        # Display sample estimates for verification
        print(f"\nSample LLM estimates (filtered results):")
        display_cols = ['requirement_id', 'model', 'final_sifp', 'judge_score']
        print(llm_estimates_df[display_cols].head(10))

        # Summary statistics
        print(f"\nFiltered Analysis Summary:")
        print(f"  Total estimates: {len(llm_estimates_df)}")
        print(f"  Unique requirements: {llm_estimates_df['requirement_id'].nunique()}")
        print(f"  Unique models: {len(unique_models)}")
        print(f"  Average SiFP per estimate: {llm_estimates_df['final_sifp'].mean():.1f}")
        print(f"  Average judge score: {llm_estimates_df['judge_score'].mean():.2f}")

        return llm_estimates_df, ground_truth_requirements, model_success_df

    except Exception as e:
        print(f"Error retrieving LLM estimates: {e}")
        raise
    finally:
        # Close only a driver that was actually created
        if driver is not None:
            driver.close()
            print("‚úì Neo4j connection closed")

# Execute the retrieval process.
# The returned 3-tuple is unpacked into module-level names; the Cell 3 code
# below reads llm_estimates_df directly as a global.
llm_estimates_df, ground_truth_requirements, model_success_df = retrieve_llm_estimates()

In [None]:
# Cell [3] - Establish Requirements-to-Code Mapping and Feature Analysis
# Purpose: Create mapping between requirements and actual code files for validation baseline
# Dependencies: llm_estimates_df from Cell 2, feature extraction logic
# Breadcrumbs: Setup -> Data Retrieval -> Requirements Mapping -> Feature Analysis

def analyze_requirement_features():
    """
    Group the LLM estimates by a feature name derived from each requirement ID

    Reads the module-level llm_estimates_df and adds a 'feature' column to it
    in place.

    Returns:
        tuple: (feature_requirements_df, feature_mapping); an empty DataFrame
               and an empty dict when there are no estimates
    """

    def extract_feature_from_requirement(req_id):
        """
        Derive a feature/module name from a requirement ID

        Args:
            req_id (str): Requirement identifier

        Returns:
            str: Text before the first '.' when the ID contains 'UC'
                 (UC1.1 -> UC1); otherwise the second '-'-separated part
                 (FR-AUTH-001 -> AUTH); otherwise the ID unchanged
        """
        if 'UC' in req_id:
            return req_id.split('.')[0]
        if '-' in req_id:
            pieces = req_id.split('-')
            if len(pieces) >= 2:
                return pieces[1]
        return req_id

    # Nothing to analyze without estimates
    if llm_estimates_df.empty:
        print("Warning: No LLM estimates available for feature analysis")
        return pd.DataFrame(), {}

    print("Analyzing requirement features and groupings...")
    print("=" * 50)

    # Tag every estimate with its feature (mutates the shared DataFrame)
    llm_estimates_df['feature'] = llm_estimates_df['requirement_id'].map(
        extract_feature_from_requirement
    )

    # One row per feature; named aggregation yields flat column names directly
    feature_requirements = (
        llm_estimates_df.groupby('feature')
        .agg(
            unique_requirements=('requirement_id', 'nunique'),
            avg_sifp=('final_sifp', 'mean'),
            total_sifp=('final_sifp', 'sum'),
            std_sifp=('final_sifp', 'std'),
            models_count=('model', 'nunique'),
        )
        .round(2)
        .sort_values('total_sifp', ascending=False)
    )

    print("Requirements grouped by feature:")
    print(feature_requirements)

    # Headline numbers; after the descending sort the first index entry is
    # the feature with the largest total SiFP
    print(f"\nFeature Analysis Summary:")
    print(f"  Total features identified: {len(feature_requirements)}")
    print(f"  Average requirements per feature: {feature_requirements['unique_requirements'].mean():.1f}")
    print(f"  Average SiFP per feature: {feature_requirements['avg_sifp'].mean():.1f}")
    print(f"  Most complex feature: {feature_requirements.index[0]} ({feature_requirements['total_sifp'].max():.1f} SiFP)")

    # Traceability map: feature -> requirement IDs, their count, and a LOC estimate
    feature_mapping = {}
    for feature_name in feature_requirements.index:
        belongs_to_feature = llm_estimates_df['feature'] == feature_name
        requirement_ids = llm_estimates_df[belongs_to_feature]['requirement_id'].unique()
        feature_mapping[feature_name] = {
            'requirements': list(requirement_ids),
            'count': len(requirement_ids),
            # NOTE(review): 'avg_loc_per_sifp' is not set by setup_analysis_environment
            # in the visible code, so the default of 100 applies unless it is added elsewhere
            'estimated_loc': feature_requirements.loc[feature_name, 'total_sifp'] * CONFIG.get('avg_loc_per_sifp', 100)
        }

    return feature_requirements, feature_mapping

# Execute feature analysis
if llm_estimates_df.empty:
    # No estimates: leave empty placeholders so later cells still find the names
    print("Skipping feature analysis - no LLM estimates available")
    feature_requirements_df = pd.DataFrame()
    feature_mapping = {}
else:
    feature_requirements_df, feature_mapping = analyze_requirement_features()

    # Explain how requirements are related to code in this notebook
    print("\nRequirement-to-Code Mapping Approach:")
    print("=" * 50)
    for approach_note in (
        "‚Ä¢ Using aggregate analysis based on requirement feature groupings",
        "‚Ä¢ Features extracted from requirement IDs using pattern matching",
        "‚Ä¢ In production, explicit traceability links would provide direct mapping",
        "‚Ä¢ Current approach enables statistical validation at feature level",
    ):
        print(approach_note)

In [None]:
# Cell [4] - Calculate Normalized Metrics and Establish Conversion Baselines
# Purpose: Establish normalized relationships between SiFP and code metrics using UFP→SiFP conversion
# Dependencies: code_summary from Cell 1, llm_estimates_df from Cell 2, CONFIG from Cell 0
# Breadcrumbs: Setup -> Data Collection -> Mapping -> Baseline Establishment

def calculate_normalized_metrics():
    """
    Calculate normalized metrics and establish baseline relationships between SiFP and code metrics.

    Reads the module-level ``code_summary``, ``llm_estimates_df``,
    ``ground_truth_requirements`` and ``CONFIG`` objects and prints a report
    while computing. A ratio whose denominator is zero (for example an empty
    code base) is reported as 0 instead of raising ``ZeroDivisionError``.

    Returns:
        tuple: (llm_analysis_df, baseline_metrics, industry_metrics)
            - llm_analysis_df: one row per model with coverage, SiFP totals and
              normalized ratios; empty when there are no LLM estimates or no
              ground truth requirements
            - baseline_metrics: project baseline dict; empty in the same cases
            - industry_metrics: dict with LOC_PER_UFP, LOC_PER_SIFP and SIFP_PER_KLOC
    """

    print("Code Base Summary (Full Codebase):")
    print("=" * 40)
    for key, value in code_summary.items():
        print(f"  {key}: {value:.0f}")

    total_lines = code_summary['total_lines']
    total_files = code_summary['total_files']
    total_classes = code_summary['total_classes']
    total_methods = code_summary['total_methods']

    # Calculate normalized code metrics
    print("\nNormalized Code Metrics (Full Codebase):")
    print("-" * 40)

    # Key normalized metrics; an empty code base yields 0 rather than a crash
    loc_per_file = total_lines / total_files if total_files else 0.0
    methods_per_kloc = (total_methods / total_lines) * 1000 if total_lines else 0.0
    classes_per_kloc = (total_classes / total_lines) * 1000 if total_lines else 0.0

    print(f"  Lines of code per file: {loc_per_file:.1f}")
    print(f"  Methods per KLOC: {methods_per_kloc:.1f}")
    print(f"  Classes per KLOC: {classes_per_kloc:.1f}")

    # Establish industry baselines
    print("\n" + "="*60)
    print("BASELINE CALCULATION APPROACHES")
    print("="*60)

    # Industry standards from research
    INDUSTRY_LOC_PER_UFP = 100  # Typical for Java
    # SiFP = CONVERSION_FACTOR x UFP, so LOC per SiFP = LOC per UFP / CONVERSION_FACTOR
    INDUSTRY_LOC_PER_SIFP = INDUSTRY_LOC_PER_UFP / CONFIG['CONVERSION_FACTOR']

    industry_metrics = {
        'LOC_PER_UFP': INDUSTRY_LOC_PER_UFP,
        'LOC_PER_SIFP': INDUSTRY_LOC_PER_SIFP,
        'SIFP_PER_KLOC': 1000/INDUSTRY_LOC_PER_SIFP
    }

    print(f"\nIndustry Baseline (Research-based):")
    print(f"  Typical LOC per UFP (Java): {INDUSTRY_LOC_PER_UFP}")
    print(f"  Implied LOC per SiFP: {INDUSTRY_LOC_PER_SIFP:.1f}")
    print(f"  SiFP per KLOC: {industry_metrics['SIFP_PER_KLOC']:.2f}")

    # Without LLM estimates or ground truth only the industry baseline is available
    if llm_estimates_df.empty or len(ground_truth_requirements) == 0:
        return pd.DataFrame(), {}, industry_metrics

    print("\n" + "="*60)
    print("LLM SIFP ANALYSIS (Scaled to Estimated Requirements)")
    print("="*60)

    requirement_count = len(ground_truth_requirements)

    # Get unique requirements that were successfully estimated
    estimated_requirements = llm_estimates_df['requirement_id'].unique()
    estimation_coverage = len(estimated_requirements) / requirement_count

    print(f"\nEstimation Coverage:")
    print(f"  Ground truth requirements: {requirement_count}")
    print(f"  Requirements with estimates: {len(estimated_requirements)} ({estimation_coverage:.1%})")

    # Scale code metrics based on estimation coverage
    scaled_code_metrics = {
        'lines': total_lines * estimation_coverage,
        'classes': total_classes * estimation_coverage,
        'methods': total_methods * estimation_coverage
    }

    print(f"\nScaled Code Metrics (for estimated requirements only):")
    print(f"  Estimated lines of code: {scaled_code_metrics['lines']:.0f}")
    print(f"  Estimated classes: {scaled_code_metrics['classes']:.0f}")
    print(f"  Estimated methods: {scaled_code_metrics['methods']:.0f}")

    # Calculate metrics for each LLM model
    llm_analysis = []

    for model in sorted(llm_estimates_df['model'].unique()):
        model_data = llm_estimates_df[llm_estimates_df['model'] == model]

        # Calculate model totals
        total_sifp = model_data['final_sifp'].sum()
        total_ugep = model_data['final_ugep'].sum()
        total_ugdg = model_data['final_ugdg'].sum()
        successful_reqs = model_data['requirement_id'].nunique()

        # The share of the code base attributed to a model follows the share
        # of requirements that this particular model managed to estimate
        model_coverage = successful_reqs / requirement_count
        model_estimated_loc = total_lines * model_coverage

        # Key normalized metrics
        sifp_per_kloc = (total_sifp / model_estimated_loc) * 1000 if model_estimated_loc > 0 else 0
        loc_per_sifp = model_estimated_loc / total_sifp if total_sifp > 0 else 0
        sifp_per_req = total_sifp / successful_reqs if successful_reqs > 0 else 0

        # Calculate equivalent UFP for comparison
        equivalent_ufp = total_sifp / CONFIG['CONVERSION_FACTOR']

        llm_analysis.append({
            'model': model,
            'successful_reqs': successful_reqs,
            'coverage': model_coverage,
            'total_sifp': total_sifp,
            'equivalent_ufp': equivalent_ufp,
            'total_ugep': total_ugep,
            'total_ugdg': total_ugdg,
            'estimated_loc': model_estimated_loc,
            'sifp_per_kloc': sifp_per_kloc,
            'loc_per_sifp': loc_per_sifp,
            'sifp_per_req': sifp_per_req
        })

        print(f"\n{model}:")
        print(f"  Successfully estimated: {successful_reqs}/{requirement_count} requirements ({model_coverage:.1%})")
        print(f"  Total SiFP: {total_sifp:.1f} (equivalent to {equivalent_ufp:.1f} UFP)")
        print(f"  Estimated LOC coverage: {model_estimated_loc:.0f} lines")
        print(f"  SiFP per KLOC: {sifp_per_kloc:.2f}")
        print(f"  LOC per SiFP point: {loc_per_sifp:.1f}")
        print(f"  Deviation from industry baseline: {(loc_per_sifp - INDUSTRY_LOC_PER_SIFP)/INDUSTRY_LOC_PER_SIFP*100:+.1f}%")

    # Create analysis DataFrame
    llm_analysis_df = pd.DataFrame(llm_analysis)

    if llm_analysis_df.empty:
        # No model rows were produced, so there is no project baseline
        return pd.DataFrame(), {}, industry_metrics

    # Project-specific baseline: coverage-weighted average of the per-model ratios.
    # np.average raises ZeroDivisionError when all weights are zero, so fall
    # back to the plain mean in that case.
    coverage_weights = llm_analysis_df['coverage']
    if coverage_weights.sum() > 0:
        weighted_loc_per_sifp = np.average(llm_analysis_df['loc_per_sifp'],
                                           weights=coverage_weights)
    else:
        weighted_loc_per_sifp = llm_analysis_df['loc_per_sifp'].mean()

    baseline_metrics = {
        'project_loc_per_sifp': weighted_loc_per_sifp,
        'industry_loc_per_sifp': INDUSTRY_LOC_PER_SIFP,
        'difference_pct': (weighted_loc_per_sifp - INDUSTRY_LOC_PER_SIFP)/INDUSTRY_LOC_PER_SIFP*100,
        'estimated_requirements': estimated_requirements,
        'scaled_code_metrics': scaled_code_metrics
    }

    print("\n\nBASELINE COMPARISON:")
    print("-" * 40)
    print(f"  Industry baseline LOC/SiFP: {INDUSTRY_LOC_PER_SIFP:.1f}")
    print(f"  Project baseline LOC/SiFP (weighted avg): {weighted_loc_per_sifp:.1f}")
    print(f"  Difference: {baseline_metrics['difference_pct']:+.1f}%")

    return llm_analysis_df, baseline_metrics, industry_metrics

# Execute the normalized metrics calculation.
# The three results are module-level names read by the later analysis cells.
llm_analysis_df, baseline_metrics, industry_metrics = calculate_normalized_metrics()

# Real check mark (U+2713) so the console shows "✓"
print("\n✓ Normalized metrics calculated successfully")

In [None]:
# Cell [5] - Detailed Normalized Performance Analysis
# Purpose: Analyze model accuracy in normalized units (per SiFP point) with quality metrics
# Dependencies: llm_analysis_df and baseline_metrics from Cell 4, llm_estimates_df from Cell 2  
# Breadcrumbs: Setup -> Data Collection -> Baseline Establishment -> Performance Analysis

def analyze_model_performance():
    """
    Produce the per-model accuracy report in normalized (per SiFP point) units

    Reads the module-level llm_estimates_df, llm_analysis_df, baseline_metrics,
    industry_metrics and ground_truth_requirements, and prints the report.

    Returns:
        pd.DataFrame: One row per model with error, quality and rank columns
                      (empty when no LLM data is available)
    """

    # Guard clause: nothing to report without estimates and their per-model summary
    if llm_estimates_df.empty or llm_analysis_df.empty:
        print("Warning: No LLM data available for performance analysis")
        return pd.DataFrame()

    print("Normalized Model Performance Analysis (Per SiFP Point)")
    print("=" * 60)

    # Reference LOC/SiFP: project baseline, else the industry figure, else 100
    reference = baseline_metrics.get('project_loc_per_sifp',
                                     industry_metrics.get('LOC_PER_SIFP', 100))

    print(f"Using baseline: {reference:.1f} LOC per SiFP")
    print("-" * 60)

    def optional_mean(frame, column):
        # Quality columns are optional in the estimates table; report 0 when absent
        return frame[column].mean() if column in frame.columns else 0

    records = []

    for _, summary in llm_analysis_df.iterrows():
        name = summary['model']

        # Signed deviation from the reference, absolute and relative
        deviation = summary['loc_per_sifp'] - reference
        if reference > 0:
            deviation_pct = (deviation / reference) * 100
        else:
            deviation_pct = 0

        # Quality figures come from the raw per-requirement estimates
        estimates = llm_estimates_df[llm_estimates_df['model'] == name]
        confidence = optional_mean(estimates, 'confidence')
        judge = optional_mean(estimates, 'judge_score')
        spread = estimates['final_sifp'].std() if 'final_sifp' in estimates.columns else 0

        # Share of ground truth requirements this model produced an estimate for
        truth_count = len(ground_truth_requirements)
        coverage = summary['successful_reqs'] / truth_count if truth_count > 0 else 0

        records.append({
            'Model': name,
            'Success_Rate': coverage,
            'SiFP_per_KLOC': summary['sifp_per_kloc'],
            'LOC_per_SiFP': summary['loc_per_sifp'],
            'LOC_per_SiFP_Error': deviation,
            'Error_Pct': abs(deviation_pct),
            'Avg_SiFP_per_Req': summary['sifp_per_req'],
            'Std_SiFP': spread,
            'Avg_Confidence': confidence,
            'Avg_Judge_Score': judge
        })

        # Per-model section of the report
        print("\n".join([
            f"\n{name}:",
            f"  Success rate: {coverage:.1%}",
            f"  SiFP per KLOC: {summary['sifp_per_kloc']:.2f}",
            f"  LOC per SiFP point: {summary['loc_per_sifp']:.1f}",
            f"  Error vs baseline: {deviation:+.1f} LOC/SiFP ({deviation_pct:+.1f}%)",
            f"  Average confidence: {confidence:.2%}",
            f"  Average judge score: {judge:.2f}/5",
            f"  SiFP variability (std): {spread:.2f}",
        ]))

    report = pd.DataFrame(records)
    if report.empty:
        return report

    # Smaller relative error ranks first
    report['Accuracy_Rank'] = report['Error_Pct'].rank()

    print("\n\nNormalized Performance Summary:")
    print("=" * 60)
    headline = ['Model', 'Success_Rate', 'LOC_per_SiFP', 'Error_Pct', 'Accuracy_Rank']
    print(report[headline].round(3).to_string(index=False))

    # Highlight the extremes
    most_accurate = report.loc[report['Error_Pct'].idxmin()]
    widest_coverage = report.loc[report['Success_Rate'].idxmax()]

    print(f"\nKey Insights:")
    print(f"  Most accurate model: {most_accurate['Model']} ({most_accurate['Error_Pct']:.1f}% error)")
    print(f"  Best coverage model: {widest_coverage['Model']} ({widest_coverage['Success_Rate']:.1%} success rate)")
    print(f"  Average error across all models: {report['Error_Pct'].mean():.1f}%")

    return report

# Execute performance analysis.
# performance_df is a module-level name read by the effort and visualization cells.
performance_df = analyze_model_performance()

# Real check mark (U+2713) so the console shows "✓"
print("\n✓ Performance analysis completed successfully")

In [None]:
# Cell [6] - Load Desharnais Dataset and Establish UFP→SiFP→Effort Relationships
# Purpose: Load industry benchmark dataset and establish the complete conversion chain for effort estimation
# Dependencies: sklearn LinearRegression, CONFIG from Cell 0, pandas processing
# Breadcrumbs: Setup -> Performance Analysis -> Industry Benchmarks -> Effort Conversion Chain

def load_and_analyze_desharnais(csv_path='../datasets/CostEstimation/Desharnais.csv'):
    """
    Load the Desharnais dataset and establish UFP→SiFP→Effort conversion relationships.

    Args:
        csv_path: Location of the Desharnais CSV file. Defaults to the path
            used by this notebook.

    Returns:
        tuple: (desharnais_df, effort_metrics, effort_model)
            - desharnais_df: the usable projects, with the added columns
              'SiFP_converted' and 'hours_per_sifp'
            - effort_metrics: summary statistics of hours per SiFP plus the
              linear model's coefficient, intercept and R² score
            - effort_model: LinearRegression fitted on SiFP -> effort

    Raises:
        KeyError: if the file has no size column (PointsNonAdjust / UFP) or
            no effort column (Effort / effort).
        ValueError: if no project has usable size and effort values.
        Exception: anything raised while reading or fitting is printed and
            re-raised unchanged.
    """

    def pick_column(columns, candidates, role):
        # Return the first candidate present, or fail with a message that
        # names what was expected and what the file actually contains
        for candidate in candidates:
            if candidate in columns:
                return candidate
        raise KeyError(
            f"No {role} column found: expected one of {list(candidates)}, "
            f"file has {list(columns)}"
        )

    try:
        # Load industry benchmark dataset
        desharnais_df = pd.read_csv(csv_path)
        print(f"✓ Loaded Desharnais dataset: {desharnais_df.shape[0]} projects")

        # Identify column names (handle variations in dataset)
        ufp_column = pick_column(desharnais_df.columns, ('PointsNonAdjust', 'UFP'), 'UFP')
        effort_column = pick_column(desharnais_df.columns, ('Effort', 'effort'), 'effort')

        print(f"Using columns: UFP='{ufp_column}', Effort='{effort_column}'")

        # Keep only projects with a positive size and a known effort: a zero or
        # missing size would give infinite/NaN hours per SiFP, and NaN values
        # make the regression fit fail
        desharnais_df[ufp_column] = pd.to_numeric(desharnais_df[ufp_column], errors='coerce')
        desharnais_df[effort_column] = pd.to_numeric(desharnais_df[effort_column], errors='coerce')
        usable = (desharnais_df[ufp_column] > 0) & desharnais_df[effort_column].notna()
        if not usable.all():
            print(f"Warning: ignoring {int((~usable).sum())} project(s) with missing or non-positive size/effort values")
            desharnais_df = desharnais_df[usable].copy()
        if desharnais_df.empty:
            raise ValueError("Desharnais dataset contains no projects with usable size and effort values")

        # Apply UFP to SiFP conversion using research-validated factor
        print(f"\nApplying UFP→SiFP conversion factor: {CONFIG['CONVERSION_FACTOR']}")
        desharnais_df['SiFP_converted'] = desharnais_df[ufp_column] * CONFIG['CONVERSION_FACTOR']

        # Calculate effort per SiFP metrics
        print("\nDesharnais Normalized Metrics:")
        print("=" * 40)

        # Calculate hours per SiFP point for each project
        desharnais_df['hours_per_sifp'] = desharnais_df[effort_column] / desharnais_df['SiFP_converted']

        # Calculate summary statistics
        hours_per_sifp = desharnais_df['hours_per_sifp']
        effort_metrics = {
            'avg_hours_per_sifp': hours_per_sifp.mean(),
            'median_hours_per_sifp': hours_per_sifp.median(),
            'std_hours_per_sifp': hours_per_sifp.std(),
            'min_hours_per_sifp': hours_per_sifp.min(),
            'max_hours_per_sifp': hours_per_sifp.max()
        }

        print(f"  Average hours per SiFP: {effort_metrics['avg_hours_per_sifp']:.2f}")
        print(f"  Median hours per SiFP: {effort_metrics['median_hours_per_sifp']:.2f}")
        print(f"  Std dev hours per SiFP: {effort_metrics['std_hours_per_sifp']:.2f}")
        print(f"  Range: {effort_metrics['min_hours_per_sifp']:.2f} - {effort_metrics['max_hours_per_sifp']:.2f}")

        # Build linear effort prediction model
        print(f"\nBuilding Linear Effort Model:")
        print("-" * 30)

        # Prepare data for sklearn (2-D feature matrix, 1-D target)
        X = desharnais_df[['SiFP_converted']].values.astype(np.float64)
        y = desharnais_df[effort_column].values.astype(np.float64)

        # Fit linear regression model
        effort_model = LinearRegression()
        effort_model.fit(X, y)

        # Extract model coefficients
        linear_hours_per_sifp = float(effort_model.coef_[0])
        intercept = float(effort_model.intercept_)

        # Calculate model performance on the training data
        y_pred = effort_model.predict(X)
        r2 = float(r2_score(y, y_pred))

        print(f"  Hours per SiFP (coefficient): {linear_hours_per_sifp:.2f}")
        print(f"  Base hours (intercept): {intercept:.2f}")
        print(f"  R² score: {r2:.3f}")

        # Add model metrics to effort_metrics
        effort_metrics.update({
            'linear_hours_per_sifp': linear_hours_per_sifp,
            'intercept': intercept,
            'r2_score': r2
        })

        # Analyze SiFP distribution in industry data
        print(f"\nSiFP Distribution in Desharnais Dataset:")
        print("-" * 40)
        print(f"  Mean SiFP per project: {desharnais_df['SiFP_converted'].mean():.1f}")
        print(f"  Median SiFP per project: {desharnais_df['SiFP_converted'].median():.1f}")
        print(f"  Range: {desharnais_df['SiFP_converted'].min():.1f} - {desharnais_df['SiFP_converted'].max():.1f}")
        print(f"  Total projects: {len(desharnais_df)}")

        return desharnais_df, effort_metrics, effort_model

    except Exception as e:
        # Report the problem in the notebook output, then let the caller see it
        print(f"Error loading Desharnais dataset: {e}")
        raise

# Execute Desharnais analysis.
# desharnais_df, effort_metrics and effort_model are module-level names read by later cells.
desharnais_df, effort_metrics, effort_model = load_and_analyze_desharnais()

# Real check mark (U+2713) so the console shows "✓"
print("\n✓ Desharnais dataset analysis completed successfully")

In [None]:
# Cell [7] - Normalized Effort Impact Analysis with Complete Conversion Chain
# Purpose: Analyze effort impact using UFP→SiFP→Effort conversion chain and calculate cost implications
# Dependencies: performance_df from Cell 5, effort_metrics from Cell 6, CONFIG from Cell 0
# Breadcrumbs: Setup -> Performance Analysis -> Industry Benchmarks -> Effort Impact Analysis

def analyze_effort_impact():
    """
    Analyze effort impact using the complete UFP→SiFP→Effort conversion chain.

    Reads the module-level performance_df, effort_metrics, baseline_metrics,
    industry_metrics, llm_analysis_df and CONFIG, and prints a report.

    For each model the ratio baseline LOC/SiFP : model LOC/SiFP is used as the
    SiFP estimation factor (1 when the model's LOC/SiFP is not positive). A
    factor above 1 means the model produced more SiFP than the baseline
    implies for the same code.

    Returns:
        pd.DataFrame: Effort impact analysis with cost implications, one row
                      per model (empty when required inputs are missing)
    """

    if performance_df.empty or not effort_metrics:
        print("Warning: Missing required data for effort impact analysis")
        return pd.DataFrame()

    print("Normalized Effort Impact Analysis (Per SiFP Point)")
    print("=" * 60)

    # These values are the same for every model, so resolve them once
    baseline_hours_per_sifp = effort_metrics['avg_hours_per_sifp']
    baseline_loc_per_sifp = baseline_metrics.get('project_loc_per_sifp',
                                                 industry_metrics.get('LOC_PER_SIFP', 100))

    # Display conversion chain information
    print(f"\nConversion Chain:")
    print(f"  UFP → SiFP: Factor = {CONFIG['CONVERSION_FACTOR']} (SiFP = {CONFIG['CONVERSION_FACTOR']} × UFP)")
    print(f"  SiFP → Effort: {baseline_hours_per_sifp:.2f} hours/SiFP (from Desharnais)")

    # Only claim a project baseline when one was actually computed
    if 'project_loc_per_sifp' in baseline_metrics:
        print(f"  SiFP → LOC: {baseline_metrics['project_loc_per_sifp']:.1f} LOC/SiFP (project baseline)")

    effort_impact = []

    for _, row in performance_df.iterrows():
        model = row['Model']

        # Get model's LOC per SiFP
        model_loc_per_sifp = row['LOC_per_SiFP']

        # Calculate SiFP estimation accuracy
        # If model estimates fewer LOC per SiFP, it's overestimating SiFP count
        sifp_estimation_factor = baseline_loc_per_sifp / model_loc_per_sifp if model_loc_per_sifp > 0 else 1

        # The effective hours per estimated SiFP
        effective_hours_per_estimated_sifp = baseline_hours_per_sifp / sifp_estimation_factor

        # Calculate percentage error in effort estimation
        effort_error_pct = (sifp_estimation_factor - 1) * 100

        # Totals stay at 0 when the model has no row in llm_analysis_df
        model_total_sifp = actual_sifp = total_effort_error = total_cost_impact = 0

        if not llm_analysis_df.empty:
            model_totals = llm_analysis_df.loc[llm_analysis_df['model'] == model, 'total_sifp']
            if not model_totals.empty:
                model_total_sifp = model_totals.iloc[0]
                actual_sifp = model_total_sifp / sifp_estimation_factor

                # Calculate total effort impact
                estimated_total_effort = model_total_sifp * baseline_hours_per_sifp
                actual_total_effort = actual_sifp * baseline_hours_per_sifp
                total_effort_error = estimated_total_effort - actual_total_effort

                # Calculate cost impact using standard rate
                total_cost_impact = total_effort_error * CONFIG['COST_PER_HOUR']

        effort_impact.append({
            'Model': model,
            'LOC_per_SiFP': model_loc_per_sifp,
            'SiFP_Estimation_Factor': sifp_estimation_factor,
            'Desharnais_Hours_per_SiFP': baseline_hours_per_sifp,
            'Effective_Hours_per_Est_SiFP': effective_hours_per_estimated_sifp,
            'Effort_Error_Pct': effort_error_pct,
            'Model_Total_SiFP': model_total_sifp,
            'Actual_SiFP': actual_sifp,
            'Total_Effort_Error_Hours': total_effort_error,
            'Total_Cost_Impact_USD': total_cost_impact
        })

        # A factor of exactly 1 is neither an over- nor an underestimate
        if sifp_estimation_factor > 1:
            interpretation = "Model overestimates SiFP count"
        elif sifp_estimation_factor < 1:
            interpretation = "Model underestimates SiFP count"
        else:
            interpretation = "Model matches the baseline SiFP count"

        # Display model-specific analysis
        print(f"\n{model}:")
        print(f"  LOC per SiFP: {model_loc_per_sifp:.1f} (baseline: {baseline_loc_per_sifp:.1f})")
        print(f"  SiFP estimation factor: {sifp_estimation_factor:.2f}x")
        print(f"  Interpretation: {interpretation}")
        print(f"  Desharnais baseline: {baseline_hours_per_sifp:.2f} hours per actual SiFP")
        print(f"  Effective hours per estimated SiFP: {effective_hours_per_estimated_sifp:.2f}")
        print(f"  Effort estimation error: {effort_error_pct:+.1f}%")
        if model_total_sifp > 0:
            print(f"  Total SiFP estimated: {model_total_sifp:.0f}")
            print(f"  Actual SiFP (implied): {actual_sifp:.0f}")
            print(f"  Total effort error: {total_effort_error:+.0f} hours (${total_cost_impact:+,.0f})")

    effort_impact_df = pd.DataFrame(effort_impact)

    if not effort_impact_df.empty:
        # Summary statistics
        print("\n\nEffort Impact Summary:")
        print("=" * 40)
        print(f"  Desharnais hours per SiFP: {baseline_hours_per_sifp:.2f}")
        print(f"  Average SiFP estimation factor: {effort_impact_df['SiFP_Estimation_Factor'].mean():.2f}x")
        print(f"  Average effort error: {effort_impact_df['Effort_Error_Pct'].mean():+.1f}%")
        print(f"  Total cost impact range: ${effort_impact_df['Total_Cost_Impact_USD'].min():,.0f} to ${effort_impact_df['Total_Cost_Impact_USD'].max():,.0f}")

        # idxmin needs at least one non-NaN value to pick a winner
        absolute_error = effort_impact_df['Effort_Error_Pct'].abs()
        if absolute_error.notna().any():
            best_model = effort_impact_df.loc[absolute_error.idxmin(), 'Model']
            print(f"  Most accurate effort model: {best_model}")

    return effort_impact_df

# Execute effort impact analysis.
# effort_impact_df is a module-level name read by the visualization and summary cells.
effort_impact_df = analyze_effort_impact()

# Real check mark (U+2713) so the console shows "✓"
print("\n✓ Effort impact analysis completed successfully")

In [None]:
# Cell [8] - Comprehensive Visualization of Normalized Results and Distributions
# Purpose: Create comprehensive visualizations of model performance, accuracy distributions, and cost impacts
# Dependencies: performance_df from Cell 5, effort_impact_df from Cell 7, matplotlib/seaborn from Cell 0
# Breadcrumbs: Setup -> Performance Analysis -> Effort Impact -> Comprehensive Visualization

# Draw the 4x3 dashboard only when both result tables exist and contain rows;
# otherwise report which inputs are missing.
if 'performance_df' in globals() and 'effort_impact_df' in globals() and not performance_df.empty and not effort_impact_df.empty:

    # Get baseline values from previous calculations
    avg_loc_per_sifp = baseline_metrics.get('project_loc_per_sifp', industry_metrics.get('LOC_PER_SIFP', 100))
    desharnais_hours_per_sifp = effort_metrics.get('avg_hours_per_sifp', 10)
    project_name = CONFIG.get('NEO4J_PROJECT_NAME', 'Unknown Project')

    # Bars within +/-20% of the baseline keep their default colour
    low_threshold = avg_loc_per_sifp * 0.8
    high_threshold = avg_loc_per_sifp * 1.2

    # Create a larger figure with more subplots
    fig = plt.figure(figsize=(20, 16))

    # Define grid for subplots: rows 0-1 hold the summary charts, rows 2-3 the histograms
    gs = fig.add_gridspec(4, 3, hspace=0.3, wspace=0.3)

    models = performance_df['Model'].values
    x = np.arange(len(models))
    # Tick labels: last path component of the model id, truncated to 15 characters
    short_labels = [m.split('/')[-1][:15] for m in models]

    # 1. LOC per SiFP Point Comparison
    ax1 = fig.add_subplot(gs[0, :2])
    bars = ax1.bar(x, performance_df['LOC_per_SiFP'], alpha=0.7, color='skyblue')
    ax1.axhline(y=avg_loc_per_sifp, color='red', linestyle='--',
                label=f'Baseline ({avg_loc_per_sifp:.1f} LOC/SiFP)')

    # Color bars based on performance
    for bar, loc_value in zip(bars, performance_df['LOC_per_SiFP']):
        if loc_value < low_threshold:
            bar.set_color('green')
        elif loc_value > high_threshold:
            bar.set_color('red')

    ax1.set_xlabel('Model')
    ax1.set_ylabel('Lines of Code per SiFP Point')
    ax1.set_title('Code Density per SiFP Point by Model')
    ax1.set_xticks(x)
    ax1.set_xticklabels(short_labels, rotation=45, ha='right')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Add success rate and error annotations just above each bar
    for i, (loc_value, success_rate, error) in enumerate(zip(performance_df['LOC_per_SiFP'],
                                                              performance_df['Success_Rate'],
                                                              performance_df['Error_Pct'])):
        ax1.text(i, loc_value + 1,
                 f'{success_rate:.0%}\n±{error:.0f}%', ha='center', va='bottom', fontsize=8)

    # 2. Model Performance Comparison Table
    ax2 = fig.add_subplot(gs[0, 2])
    ax2.axis('tight')
    ax2.axis('off')

    # Create performance summary table
    table_data = [
        [
            row['Model'].split('/')[-1][:20],
            f"{row['Success_Rate']:.0%}",
            f"{row['LOC_per_SiFP']:.1f}",
            f"{row['Error_Pct']:.0f}%"
        ]
        for _, row in performance_df.iterrows()
    ]

    table = ax2.table(cellText=table_data,
                      colLabels=['Model', 'Success', 'LOC/SiFP', 'Error'],
                      cellLoc='center',
                      loc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(9)
    table.scale(1, 1.5)
    ax2.set_title('Model Performance Summary', pad=20)

    # 3. Effort per SiFP Point
    ax3 = fig.add_subplot(gs[1, 0])
    effort_colors = ['green' if hours < desharnais_hours_per_sifp else 'orange'
                     for hours in effort_impact_df['Effective_Hours_per_Est_SiFP']]
    bars = ax3.bar(x, effort_impact_df['Effective_Hours_per_Est_SiFP'],
                   color=effort_colors, alpha=0.7)
    ax3.axhline(y=desharnais_hours_per_sifp, color='red', linestyle='--',
                label=f'Desharnais Baseline ({desharnais_hours_per_sifp:.1f} hrs/SiFP)')
    ax3.set_xlabel('Model')
    ax3.set_ylabel('Effective Hours per Estimated SiFP')
    ax3.set_title('Effort Estimation per SiFP Point')
    ax3.set_xticks(x)
    ax3.set_xticklabels(short_labels, rotation=45, ha='right')
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # 4. SiFP per KLOC
    ax4 = fig.add_subplot(gs[1, 1])
    ax4.bar(x, performance_df['SiFP_per_KLOC'], alpha=0.7, color='coral')
    ax4.set_xlabel('Model')
    ax4.set_ylabel('SiFP per KLOC')
    ax4.set_title('Function Point Density (SiFP per 1000 LOC)')
    ax4.set_xticks(x)
    ax4.set_xticklabels(short_labels, rotation=45, ha='right')
    ax4.grid(True, alpha=0.3)

    # 5. Total Cost Impact
    ax5 = fig.add_subplot(gs[1, 2])
    cost_colors = ['darkgreen' if cost < 0 else 'darkred'
                   for cost in effort_impact_df['Total_Cost_Impact_USD']]
    bars = ax5.bar(x, effort_impact_df['Total_Cost_Impact_USD'],
                   color=cost_colors, alpha=0.7)
    ax5.set_xlabel('Model')
    ax5.set_ylabel('Total Cost Impact ($)')
    ax5.set_title('Total Cost Impact')
    ax5.set_xticks(x)
    ax5.set_xticklabels(short_labels, rotation=45, ha='right')
    ax5.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
    ax5.grid(True, alpha=0.3)

    # Add value labels on bars: above positive bars, below the others
    for i, v in enumerate(effort_impact_df['Total_Cost_Impact_USD']):
        ax5.text(i, v + (1000 if v > 0 else -1000), f'${v:,.0f}',
                 ha='center', va='bottom' if v > 0 else 'top', fontsize=8)

    # 6+. Histograms for each model showing requirement-level accuracy
    max_histograms = 6  # Rows 2-3 of the grid offer 2 x 3 free slots

    for slot, model in enumerate(models[:max_histograms]):
        row_idx = 2 + slot // 3
        col_idx = slot % 3

        ax = fig.add_subplot(gs[row_idx, col_idx])
        short_title = model.split("/")[-1][:20]

        model_data = llm_estimates_df[llm_estimates_df['model'] == model]

        if model_data.empty:
            ax.text(0.5, 0.5, 'No model data', transform=ax.transAxes, ha='center', va='center')
            ax.set_title(f'{short_title}\nNo Data')
            continue

        # Only requirements with a positive SiFP estimate give a finite ratio
        positive_sifp = model_data.loc[model_data['final_sifp'] > 0, 'final_sifp'].to_numpy(dtype=float)

        if positive_sifp.size == 0:
            ax.text(0.5, 0.5, 'No valid data', transform=ax.transAxes, ha='center', va='center')
            ax.set_title(f'{short_title}\nNo Data')
            continue

        # Every requirement of a model is assigned the same share of the code
        # lines attributed to that model, so compute that share once
        estimated_req_count = model_data['requirement_id'].nunique()
        model_coverage = estimated_req_count / len(ground_truth_requirements)
        est_loc_per_req = (code_summary['total_lines'] * model_coverage) / estimated_req_count

        # LOC per SiFP for each requirement
        req_loc_per_sifp = est_loc_per_req / positive_sifp
        model_mean = np.mean(req_loc_per_sifp)

        # Create histogram
        _, bin_edges, patches = ax.hist(req_loc_per_sifp, bins=min(15, len(req_loc_per_sifp)),
                                        alpha=0.7, color='steelblue', edgecolor='black')

        # Color code bins by their left edge (zip stops at the last patch)
        for left_edge, patch in zip(bin_edges, patches):
            if left_edge < low_threshold:
                patch.set_facecolor('green')
            elif left_edge > high_threshold:
                patch.set_facecolor('red')

        # Add baseline and model-mean reference lines
        ax.axvline(x=avg_loc_per_sifp, color='red', linestyle='--', linewidth=2,
                   label=f'Baseline: {avg_loc_per_sifp:.1f}')
        ax.axvline(x=model_mean, color='blue', linestyle='-', linewidth=2,
                   label=f'Model mean: {model_mean:.1f}')

        ax.set_xlabel('LOC per SiFP')
        ax.set_ylabel('# Requirements')
        ax.set_title(f'{short_title}\nAccuracy Distribution')
        ax.legend(fontsize=8)
        ax.grid(True, alpha=0.3)

        # Add statistics text (sample size and population standard deviation)
        ax.text(0.95, 0.95, f'n={len(req_loc_per_sifp)}\nσ={np.std(req_loc_per_sifp):.1f}',
                transform=ax.transAxes, ha='right', va='top', fontsize=8,
                bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    plt.suptitle(f'Comprehensive SiFP Analysis - {project_name}\n'
                 f'All Models Performance and Accuracy Distribution', fontsize=16)
    plt.tight_layout()
    plt.show()

    print("✓ Comprehensive visualization completed successfully")

else:
    print("Cannot create comprehensive visualization - required data not available")
    print("Available variables:")
    if 'performance_df' in globals():
        print(f"  - performance_df: {len(performance_df) if not performance_df.empty else 'empty'}")
    else:
        print("  - performance_df: not defined")

    if 'effort_impact_df' in globals():
        print(f"  - effort_impact_df: {len(effort_impact_df) if not effort_impact_df.empty else 'empty'}")
    else:
        print("  - effort_impact_df: not defined")

    if 'baseline_metrics' in globals():
        print("  - baseline_metrics: available")
    else:
        print("  - baseline_metrics: not defined")

In [None]:
# Cell [10] - Executive Summary with Complete UFP→SiFP→LOC→Effort Analysis
# Purpose: Generate comprehensive executive summary with complete conversion chain analysis and business insights
# Dependencies: All previous analysis results, CONFIG settings, comprehensive metrics from entire workflow
# Breadcrumbs: Setup -> Analysis -> Recommendations -> Executive Summary & Business Impact Report

def format_signed_or_na(value, spec='+.1f', missing='N/A'):
    """Format a number with an explicit sign, falling back to a placeholder.

    Args:
        value: Value to format; may be None or a non-numeric placeholder.
        spec: Format specification applied after converting ``value`` to float.
        missing: Text returned when ``value`` cannot be converted to float.

    Returns:
        str: The formatted number, or ``missing`` for non-numeric input.
    """
    try:
        return format(float(value), spec)
    except (TypeError, ValueError):
        return missing


def model_with_smallest_abs(df, column):
    """Return the 'Model' of the row whose ``column`` value is closest to zero.

    Signed error columns must be ranked by magnitude: a plain ``idxmin`` would
    pick the largest negative error instead of the most accurate model.

    Args:
        df: DataFrame with a 'Model' column and a numeric ``column``.
        column: Name of the signed error column to rank by.

    Returns:
        The 'Model' value of the row with the smallest absolute ``column``.
    """
    return df.loc[df[column].abs().idxmin(), 'Model']


print("EXECUTIVE SUMMARY - NORMALIZED SIFP ANALYSIS")
print("=" * 60)

# Get project name safely
project_name = CONFIG.get('NEO4J_PROJECT_NAME', 'Unknown Project') if 'CONFIG' in globals() else 'Unknown Project'
print(f"Project: {project_name}")
print(f"Analysis Date: {pd.Timestamp.now().strftime('%Y-%m-%d')}")

if 'performance_df' in globals() and 'effort_impact_df' in globals() and not performance_df.empty and not effort_impact_df.empty:
    print(f"\nData Summary:")

    if 'code_metrics_df' in globals():
        print(f"  Total code files in project: {len(code_metrics_df)}")
        print(f"  Total lines of code in project: {code_metrics_df['CountLineCode'].sum():,}")
    else:
        print("  Code metrics: Not available")

    if 'ground_truth_requirements' in globals():
        print(f"  Ground truth requirements: {len(ground_truth_requirements)}")
    else:
        print("  Ground truth requirements: Not available")

    if 'llm_estimates_df' in globals() and not llm_estimates_df.empty:
        estimated_requirements = llm_estimates_df['requirement_id'].unique()
        # The coverage ratio is only defined for a non-empty ground-truth set
        if 'ground_truth_requirements' in globals() and len(ground_truth_requirements) > 0:
            print(f"  Requirements with estimates: {len(estimated_requirements)} ({len(estimated_requirements)/len(ground_truth_requirements):.1%})")
        else:
            print(f"  Requirements with estimates: {len(estimated_requirements)}")

    # Get baseline values safely
    conversion_factor = CONFIG.get('CONVERSION_FACTOR', 0.957) if 'CONFIG' in globals() else 0.957

    # Get industry and project baselines
    industry_loc_per_sifp = industry_metrics.get('LOC_PER_SIFP', 100) if 'industry_metrics' in globals() else 100
    project_loc_per_sifp = baseline_metrics.get('project_loc_per_sifp', industry_loc_per_sifp) if 'baseline_metrics' in globals() else industry_loc_per_sifp
    desharnais_hours_per_sifp = effort_metrics.get('avg_hours_per_sifp', 10) if 'effort_metrics' in globals() else 10

    # Relative gap between project and industry LOC/SiFP, reported twice below
    loc_gap_pct = (project_loc_per_sifp - industry_loc_per_sifp) / industry_loc_per_sifp * 100

    print(f"\nConversion Factors and Baselines:")
    print(f"  UFP ‚Üí SiFP: {conversion_factor} (from Desharnais research)")
    print(f"  SiFP ‚Üí Effort: {desharnais_hours_per_sifp:.2f} hours/SiFP (Desharnais dataset)")
    print(f"  SiFP ‚Üí LOC: {project_loc_per_sifp:.1f} LOC/SiFP (project weighted average)")
    print(f"  Industry baseline: {industry_loc_per_sifp:.1f} LOC/SiFP")
    print(f"  Project vs Industry: {loc_gap_pct:+.1f}%")

    # Detailed performance for each model
    print("\n" + "=" * 80)
    print("DETAILED MODEL PERFORMANCE")
    print("=" * 80)

    # Number models by position so the listing works for any index type
    for position, (_, row) in enumerate(performance_df.iterrows(), start=1):
        model = row['Model']
        effort_matches = effort_impact_df[effort_impact_df['Model'] == model]
        if effort_matches.empty:
            continue
        effort_row = effort_matches.iloc[0]

        # Optional per-model totals from the LLM analysis table
        llm_row = None
        if 'llm_analysis_df' in globals() and not llm_analysis_df.empty:
            llm_matches = llm_analysis_df[llm_analysis_df['model'] == model]
            if not llm_matches.empty:
                llm_row = llm_matches.iloc[0]

        heading = f"{position}. {model}"
        print(f"\n{heading}")
        print("-" * len(heading))

        print(f"\n  Estimation Coverage:")
        print(f"    - Success rate: {row['Success_Rate']:.1%}")
        if llm_row is not None:
            print(f"    - Requirements estimated: {llm_row['successful_reqs']}/{len(ground_truth_requirements) if 'ground_truth_requirements' in globals() else 'unknown'}")

        print(f"\n  SiFP Estimates:")
        if llm_row is not None:
            print(f"    - Total SiFP: {llm_row['total_sifp']:.0f}")
            print(f"    - Equivalent UFP: {llm_row['equivalent_ufp']:.0f}")
            print(f"    - Average SiFP per requirement: {row['Avg_SiFP_per_Req']:.1f}")
        else:
            print(f"    - Average SiFP per requirement: {row.get('Avg_SiFP_per_Req', 'N/A')}")

        print(f"\n  Accuracy Metrics:")
        print(f"    - LOC per SiFP: {row['LOC_per_SiFP']:.1f} (baseline: {project_loc_per_sifp:.1f})")
        # A missing error column prints 'N/A' instead of failing on a float format spec
        print(f"    - Error: {format_signed_or_na(row.get('LOC_per_SiFP_Error'))} LOC/SiFP ({row['Error_Pct']:+.1f}%)")
        print(f"    - SiFP estimation factor: {effort_row['SiFP_Estimation_Factor']:.2f}x")

        print(f"\n  Effort Impact:")
        print(f"    - Desharnais baseline: {desharnais_hours_per_sifp:.2f} hours/SiFP")
        print(f"    - Effort estimation error: {effort_row['Effort_Error_Pct']:+.1f}%")
        print(f"    - Total effort error: {effort_row['Total_Effort_Error_Hours']:+.0f} hours")
        print(f"    - Cost impact: ${effort_row['Total_Cost_Impact_USD']:+,.0f}")

        print(f"\n  Quality Indicators:")
        print(f"    - Average confidence: {row.get('Avg_Confidence', 0):.1%}")
        print(f"    - Average judge score: {row.get('Avg_Judge_Score', 0):.2f}/5")

    # Analysis of the conversion chain
    print("\n" + "=" * 80)
    print("CONVERSION CHAIN ANALYSIS")
    print("=" * 80)

    print(f"\nFor a typical requirement in this project:")
    if 'llm_analysis_df' in globals() and not llm_analysis_df.empty:
        avg_sifp_per_req = llm_analysis_df['sifp_per_req'].mean()
        print(f"  Average SiFP per requirement: {avg_sifp_per_req:.1f}")
        print(f"  Equivalent UFP: {avg_sifp_per_req / conversion_factor:.1f}")
        print(f"  Expected LOC: {avg_sifp_per_req * project_loc_per_sifp:.0f}")
        print(f"  Expected effort: {avg_sifp_per_req * desharnais_hours_per_sifp:.0f} hours")
    else:
        print("  Analysis not available - LLM analysis data missing")

    # Recommendations
    print("\n" + "=" * 80)
    print("RECOMMENDATIONS")
    print("=" * 80)

    # Both frames are non-empty here (outer guard); rank by error magnitude so
    # a large negative error is not mistaken for the most accurate model.
    best_accuracy = model_with_smallest_abs(performance_df, 'Error_Pct')
    print(f"\n1. For most accurate code size estimation: {best_accuracy}")

    best_effort = model_with_smallest_abs(effort_impact_df, 'Effort_Error_Pct')
    print(f"2. For most accurate effort estimation: {best_effort}")

    print(f"3. Use Desharnais baseline of {desharnais_hours_per_sifp:.1f} hours per SiFP for effort planning")
    print(f"4. Apply UFP conversion factor of {conversion_factor} when comparing to UFP-based estimates")
    print(f"5. Consider that this project has {loc_gap_pct:+.1f}% different LOC/SiFP than industry average")

    # Save results with all conversion factors
    try:
        os.makedirs('results', exist_ok=True)

        # Create comprehensive summary
        conversion_summary = pd.DataFrame({
            'Metric': ['UFP‚ÜíSiFP Factor', 'Industry LOC/SiFP', 'Project LOC/SiFP', 'Desharnais Hours/SiFP'],
            'Value': [conversion_factor, industry_loc_per_sifp, project_loc_per_sifp, desharnais_hours_per_sifp]
        })
        conversion_summary.to_csv(f'results/conversion_factors_{project_name}.csv', index=False)

        # Save all other results
        performance_df.to_csv(f'results/normalized_performance_{project_name}.csv', index=False)
        effort_impact_df.to_csv(f'results/normalized_effort_impact_{project_name}.csv', index=False)

        if 'llm_analysis_df' in globals() and not llm_analysis_df.empty:
            llm_analysis_df.to_csv(f'results/normalized_llm_analysis_{project_name}.csv', index=False)

        print(f"\n‚úì Results saved to results/ directory")

    except Exception as e:
        # Saving is best-effort: report the failure but keep the summary usable
        print(f"\nWarning: Could not save results - {e}")

else:
    print("Missing required data for executive summary")
    print("Available data:")

    # Report which of the expected result frames exist and how many rows they hold
    for _df_name in ('performance_df', 'effort_impact_df', 'llm_analysis_df'):
        if _df_name in globals():
            _df = globals()[_df_name]
            print(f"  - {_df_name}: {len(_df) if not _df.empty else 'empty'}")
        else:
            print(f"  - {_df_name}: not available")

print(f"\n‚úì Executive summary completed")

In [None]:
# Cell [11] - Statistical Hypothesis Testing for Estimation Accuracy vs Ground Truth (CORRECTED FP/FN CLASSIFICATION)
# Purpose: Test whether hallucination-reducing techniques improve estimation accuracy by reducing FP/FN rates
# Dependencies: llm_estimates_df, llm_analysis_df, baseline_metrics, industry_metrics, CONFIG from previous cells
# Breadcrumbs: Setup -> Analysis -> Executive Summary -> Statistical Accuracy Testing vs Ground Truth

from scipy.stats import ttest_ind, mannwhitneyu, chi2_contingency, fisher_exact
from scipy import stats
import numpy as np

def perform_accuracy_hypothesis_testing():
    """
    Test whether estimates with above-median judge scores are more accurate.

    Each row of ``llm_estimates_df`` is classified from its model's LOC per
    SiFP (looked up in ``llm_analysis_df``) relative to the project baseline:
    TP when the ratio is within 0.8-1.2, FN above 1.2, FP below 0.8. Rows are
    split into a hallucination-reducing group (judge score above the median)
    and a standard group, and the groups are compared with a chi-square test,
    a one-sided two-proportion z-test, a one-sided Mann-Whitney U test and,
    when either group has fewer than 30 rows, Fisher's exact test.

    Reads the module-level ``llm_estimates_df``, ``baseline_metrics``,
    ``industry_metrics`` and ``CONFIG['ALPHA_LEVEL']``, plus
    ``llm_analysis_df`` when it is defined. Progress and results are printed.

    Returns:
        dict: Keys 'accuracy_df', 'hr_metrics', 'standard_metrics',
        'improvements', 'statistical_tests', 'hypothesis_supported',
        'meets_threshold' and 'ground_truth_baseline'. An empty dict is
        returned when the data is insufficient for a comparison.
    """

    print("STATISTICAL HYPOTHESIS TESTING - ESTIMATION ACCURACY vs GROUND TRUTH")
    print("=" * 70)
    print("HYPOTHESIS: Hallucination-reducing techniques significantly improve")
    print("estimation accuracy by >30% through reduced False Positives and False Negatives")
    print("\nOPERATIONAL DEFINITIONS:")
    print("- Ground Truth: Actual iTrust codebase metrics scaled to estimated requirements")
    print("- True Positive (TP): Estimate within ¬±20% of ground truth")
    print("- False Positive (FP): Overestimate >20% (too few LOC per SiFP = more SiFP estimated)")
    print("- False Negative (FN): Underestimate >20% (too many LOC per SiFP = fewer SiFP estimated)")
    print("- Hallucination-reducing: Models with judge scores above median")

    if llm_estimates_df.empty or not baseline_metrics:
        print("\nWarning: Insufficient data for accuracy testing")
        return {}

    # Establish ground truth baseline from actual codebase
    ground_truth_loc_per_sifp = baseline_metrics.get('project_loc_per_sifp',
                                                     industry_metrics.get('LOC_PER_SIFP', 100))

    # Every accuracy ratio divides by the baseline, so it must be positive
    if not ground_truth_loc_per_sifp > 0:
        print("\nWarning: Ground truth LOC per SiFP must be positive for accuracy testing")
        return {}

    print(f"\nGROUND TRUTH BASELINE:")
    print(f"  Actual codebase LOC per SiFP: {ground_truth_loc_per_sifp:.1f}")
    print(f"  Acceptable accuracy range: ¬±20% ({ground_truth_loc_per_sifp*0.8:.1f} - {ground_truth_loc_per_sifp*1.2:.1f})")

    # Per-model LOC/SiFP is loop-invariant: resolve it once (first row per model)
    if 'llm_analysis_df' in globals() and not llm_analysis_df.empty:
        model_loc_lookup = (llm_analysis_df.drop_duplicates('model')
                            .set_index('model')['loc_per_sifp'].to_dict())
    else:
        model_loc_lookup = {}

    # The grouping threshold is a property of the whole dataset, not of a row.
    # A missing judge_score column is treated like the per-row default of 0.
    if 'judge_score' in llm_estimates_df.columns:
        median_judge_score = llm_estimates_df['judge_score'].median()
    else:
        median_judge_score = 0

    # Calculate accuracy classifications for each estimate
    accuracy_results = []

    for _, row in llm_estimates_df.iterrows():
        model = row['model']
        model_loc_per_sifp = model_loc_lookup.get(model)
        if model_loc_per_sifp is None:
            # No analysis row for this model: the estimate cannot be classified
            continue

        judge_score = row.get('judge_score', 0)
        accuracy_ratio = model_loc_per_sifp / ground_truth_loc_per_sifp

        if 0.8 <= accuracy_ratio <= 1.2:
            classification = 'TP'  # within the accepted band
        elif accuracy_ratio > 1.2:
            # More LOC per SiFP than reality: for fixed LOC, fewer SiFP estimated
            classification = 'FN'  # underestimate
        else:
            # Less LOC per SiFP than reality: for fixed LOC, more SiFP estimated
            classification = 'FP'  # overestimate

        accuracy_results.append({
            'requirement_id': row['requirement_id'],
            'model': model,
            'final_sifp': row['final_sifp'],
            'judge_score': judge_score,
            'model_loc_per_sifp': model_loc_per_sifp,
            'accuracy_ratio': accuracy_ratio,
            'classification': classification,
            'hallucination_reducing': judge_score > median_judge_score,
            'accuracy_error_pct': abs((accuracy_ratio - 1.0) * 100)
        })

    if not accuracy_results:
        print("Warning: No accuracy results to analyze")
        return {}

    accuracy_df = pd.DataFrame(accuracy_results)

    # Group by hallucination-reducing techniques
    is_hr = accuracy_df['hallucination_reducing'].astype(bool)
    hr_group = accuracy_df[is_hr]
    standard_group = accuracy_df[~is_hr]

    print(f"\nACCURACY CLASSIFICATION RESULTS:")
    print(f"  Total estimates analyzed: {len(accuracy_df)}")
    print(f"  Hallucination-reducing estimates: {len(hr_group)}")
    print(f"  Standard estimates: {len(standard_group)}")
    print(f"  Median judge score threshold: {median_judge_score:.2f}")

    if len(hr_group) == 0 or len(standard_group) == 0:
        print("Warning: Insufficient samples in both groups for comparison")
        return {}

    def calculate_accuracy_metrics(group_df, group_name):
        """Print and return TP/FP/FN counts, rates and error stats for a group."""
        total = len(group_df)
        labels = group_df['classification']
        tp_count = int((labels == 'TP').sum())
        fp_count = int((labels == 'FP').sum())
        fn_count = int((labels == 'FN').sum())

        tp_rate = tp_count / total
        fp_rate = fp_count / total
        fn_rate = fn_count / total
        accuracy = tp_rate  # Accuracy = TP / (TP + FP + FN)

        avg_error = group_df['accuracy_error_pct'].mean()
        std_error = group_df['accuracy_error_pct'].std()

        print(f"\n{group_name} Group (n={total}):")
        print(f"  True Positives (¬±20%): {tp_count} ({tp_rate:.2%})")
        print(f"  False Positives (overestimates): {fp_count} ({fp_rate:.2%})")
        print(f"  False Negatives (underestimates): {fn_count} ({fn_rate:.2%})")
        print(f"  Overall Accuracy: {accuracy:.2%}")
        print(f"  Average Error: {avg_error:.1f}% ¬± {std_error:.1f}%")

        return {
            'total': total,
            'tp_count': tp_count, 'fp_count': fp_count, 'fn_count': fn_count,
            'tp_rate': tp_rate, 'fp_rate': fp_rate, 'fn_rate': fn_rate,
            'accuracy': accuracy, 'avg_error': avg_error, 'std_error': std_error,
            'group_data': group_df
        }

    def relative_change_pct(delta, baseline):
        """Return ``delta`` as a percentage of ``baseline``.

        A zero (or undefined) baseline has no meaningful relative change; 0 is
        reported so that it cannot satisfy the >30% threshold by itself.
        """
        return (delta / baseline) * 100 if baseline > 0 else 0

    # Calculate metrics for both groups
    hr_metrics = calculate_accuracy_metrics(hr_group, "Hallucination-Reducing")
    standard_metrics = calculate_accuracy_metrics(standard_group, "Standard")

    # Calculate improvement percentages (all four share the zero-baseline guard)
    accuracy_improvement = relative_change_pct(
        hr_metrics['accuracy'] - standard_metrics['accuracy'], standard_metrics['accuracy'])
    fp_reduction = relative_change_pct(
        standard_metrics['fp_rate'] - hr_metrics['fp_rate'], standard_metrics['fp_rate'])
    fn_reduction = relative_change_pct(
        standard_metrics['fn_rate'] - hr_metrics['fn_rate'], standard_metrics['fn_rate'])
    error_reduction = relative_change_pct(
        standard_metrics['avg_error'] - hr_metrics['avg_error'], standard_metrics['avg_error'])

    best_improvement = max(accuracy_improvement, fp_reduction, fn_reduction, error_reduction)
    meets_threshold = best_improvement > 30

    print(f"\nIMPROVEMENT ANALYSIS:")
    print(f"  Accuracy improvement: {accuracy_improvement:+.1f}%")
    print(f"  False Positive reduction (fewer overestimates): {fp_reduction:+.1f}%")
    print(f"  False Negative reduction (fewer underestimates): {fn_reduction:+.1f}%")
    print(f"  Average error reduction: {error_reduction:+.1f}%")
    print(f"  Meets >30% threshold: {'YES' if meets_threshold else 'NO'}")

    # Business impact interpretation
    print(f"\nBUSINESS IMPACT INTERPRETATION:")
    print(f"  FP reduction (overestimate reduction): Reduces resource waste, over-scoping")
    print(f"  FN reduction (underestimate reduction): Reduces project overruns, missed deadlines")
    print(f"  Combined effect: More accurate project planning and resource allocation")

    statistical_results = {}
    alpha = CONFIG['ALPHA_LEVEL']

    # 1. Chi-square test for accuracy classification differences
    print(f"\nSTATISTICAL TESTS:")
    print("=" * 40)

    try:
        # Create contingency table for chi-square test
        contingency_table = pd.crosstab(accuracy_df['hallucination_reducing'],
                                        accuracy_df['classification'])
        print(f"\nContingency Table:")
        print(contingency_table)

        chi2_stat, chi2_p, dof, expected = chi2_contingency(contingency_table)

        print(f"\n1. Chi-Square Test (Classification Independence):")
        print(f"   H‚ÇÄ: Classification is independent of hallucination-reducing techniques")
        print(f"   H‚ÇÅ: Classification depends on hallucination-reducing techniques")
        print(f"   œá¬≤ statistic: {chi2_stat:.3f}")
        print(f"   p-value: {chi2_p:.6f}")
        print(f"   Significant: {'YES' if chi2_p < alpha else 'NO'}")

        statistical_results['chi_square'] = {
            'statistic': chi2_stat,
            'p_value': chi2_p,
            'significant': chi2_p < alpha,
            'contingency_table': contingency_table
        }

    except Exception as e:
        print(f"Error in chi-square test: {e}")
        statistical_results['chi_square'] = {}

    # 2. Two-proportion z-test for accuracy rates
    try:
        from statsmodels.stats.proportion import proportions_ztest

        counts = np.array([hr_metrics['tp_count'], standard_metrics['tp_count']])
        nobs = np.array([hr_metrics['total'], standard_metrics['total']])

        # One-sided test so the p-value matches the directional H1 printed below
        z_stat, z_p = proportions_ztest(counts, nobs, alternative='larger')

        print(f"\n2. Two-Proportion Z-Test (Accuracy Rates):")
        print(f"   H‚ÇÄ: No difference in accuracy rates between groups")
        print(f"   H‚ÇÅ: Hallucination-reducing group has higher accuracy")
        print(f"   z-statistic: {z_stat:.3f}")
        print(f"   p-value: {z_p:.6f}")
        print(f"   Significant: {'YES' if z_p < alpha else 'NO'}")

        statistical_results['proportion_test'] = {
            'statistic': z_stat,
            'p_value': z_p,
            'significant': z_p < alpha
        }

    except Exception as e:
        print(f"Error in proportion test: {e}")
        statistical_results['proportion_test'] = {}

    # 3. Mann-Whitney U test for error distributions
    try:
        hr_errors = hr_group['accuracy_error_pct'].values
        standard_errors = standard_group['accuracy_error_pct'].values

        u_stat, u_p = mannwhitneyu(hr_errors, standard_errors, alternative='less')  # HR should have lower errors

        print(f"\n3. Mann-Whitney U Test (Error Distributions):")
        print(f"   H‚ÇÄ: No difference in error distributions")
        print(f"   H‚ÇÅ: Hallucination-reducing group has lower errors")
        print(f"   U-statistic: {u_stat:.3f}")
        print(f"   p-value: {u_p:.6f}")
        print(f"   Significant: {'YES' if u_p < alpha else 'NO'}")

        # Calculate effect size (rank-biserial correlation)
        n1, n2 = len(hr_errors), len(standard_errors)
        effect_size = 1 - (2 * u_stat) / (n1 * n2)

        print(f"   Effect size (rank-biserial): {effect_size:.3f}")

        statistical_results['mann_whitney'] = {
            'statistic': u_stat,
            'p_value': u_p,
            'significant': u_p < alpha,
            'effect_size': effect_size
        }

    except Exception as e:
        print(f"Error in Mann-Whitney test: {e}")
        statistical_results['mann_whitney'] = {}

    # 4. Fisher's Exact Test for small samples (if applicable)
    if len(hr_group) < 30 or len(standard_group) < 30:
        try:
            # Create 2x2 table for accurate vs inaccurate estimates
            hr_accurate = hr_metrics['tp_count']
            hr_inaccurate = hr_metrics['fp_count'] + hr_metrics['fn_count']
            standard_accurate = standard_metrics['tp_count']
            standard_inaccurate = standard_metrics['fp_count'] + standard_metrics['fn_count']

            table_2x2 = [[hr_accurate, hr_inaccurate],
                         [standard_accurate, standard_inaccurate]]

            odds_ratio, fisher_p = fisher_exact(table_2x2)

            print(f"\n4. Fisher's Exact Test (Small Sample Correction):")
            print(f"   2x2 Table: HR [{hr_accurate}, {hr_inaccurate}] vs Standard [{standard_accurate}, {standard_inaccurate}]")
            print(f"   Odds ratio: {odds_ratio:.3f}")
            print(f"   p-value: {fisher_p:.6f}")
            print(f"   Significant: {'YES' if fisher_p < alpha else 'NO'}")

            statistical_results['fisher_exact'] = {
                'odds_ratio': odds_ratio,
                'p_value': fisher_p,
                'significant': fisher_p < alpha
            }

        except Exception as e:
            print(f"Error in Fisher's exact test: {e}")
            statistical_results['fisher_exact'] = {}

    # Overall hypothesis testing conclusion
    print(f"\nHYPOTHESIS TESTING CONCLUSION:")
    print("=" * 50)

    # Failed tests are stored as empty dicts: they count neither as run nor as significant
    significant_tests = sum(1 for test in statistical_results.values()
                            if test.get('significant', False))
    total_tests = sum(1 for test in statistical_results.values() if test)

    print(f"\n‚úì Magnitude Test: {'PASS' if meets_threshold else 'FAIL'}")
    print(f"  - Required: >30% improvement in any metric")
    print(f"  - Best observed: {best_improvement:.1f}%")

    print(f"\n‚úì Statistical Significance: {'PASS' if significant_tests > 0 else 'FAIL'}")
    print(f"  - Significant tests: {significant_tests}/{total_tests}")
    print(f"  - Alpha level: {alpha}")

    hypothesis_supported = meets_threshold and significant_tests > 0

    print(f"\nüéØ FINAL VERDICT: {'HYPOTHESIS SUPPORTED' if hypothesis_supported else 'HYPOTHESIS NOT SUPPORTED'}")

    if not hypothesis_supported:
        print(f"\nRECOMMENDATIONS FOR FUTURE RESEARCH:")
        print(f"  1. Collect more estimation samples (current: {len(accuracy_df)})")
        print(f"  2. Refine ground truth establishment methodology")
        print(f"  3. Adjust accuracy thresholds based on industry standards")
        print(f"  4. Implement longitudinal study design")
        print(f"  5. Add direct time-to-market measurements")

    print(f"\nCORRECTED CLASSIFICATION VERIFICATION:")
    print(f"  ‚úì FP (False Positive) = Overestimate = More SiFP than reality")
    print(f"  ‚úì FN (False Negative) = Underestimate = Fewer SiFP than reality")
    print(f"  ‚úì Business Impact: FP ‚Üí resource waste, FN ‚Üí project overruns")

    # Store comprehensive results
    accuracy_analysis_results = {
        'accuracy_df': accuracy_df,
        'hr_metrics': hr_metrics,
        'standard_metrics': standard_metrics,
        'improvements': {
            'accuracy': accuracy_improvement,
            'fp_reduction': fp_reduction,
            'fn_reduction': fn_reduction,
            'error_reduction': error_reduction
        },
        'statistical_tests': statistical_results,
        'hypothesis_supported': hypothesis_supported,
        'meets_threshold': meets_threshold,
        'ground_truth_baseline': ground_truth_loc_per_sifp
    }

    return accuracy_analysis_results

# Run the accuracy hypothesis tests only when LLM estimates have been loaded;
# otherwise leave an empty result so later cells can detect the missing data.
if 'llm_estimates_df' not in globals() or llm_estimates_df.empty:
    print("Cannot perform accuracy-based hypothesis testing - LLM estimates not available")
    accuracy_test_results = {}
else:
    accuracy_test_results = perform_accuracy_hypothesis_testing()
    print(f"\n‚úì Accuracy-based hypothesis testing completed successfully")

In [None]:
# Cell [12] - Advanced Permutation Testing for Estimation Accuracy vs Ground Truth
# Purpose: Implement robust permutation tests for estimation accuracy without distributional assumptions
# Dependencies: accuracy_test_results from Cell 11, STATISTICAL_CONFIG from Cell 0
# Breadcrumbs: Setup -> Analysis -> Accuracy Testing -> Advanced Permutation Testing for Accuracy

def perform_accuracy_permutation_testing():
    """
    Perform comprehensive permutation testing for estimation accuracy vs ground truth
    Tests whether hallucination-reducing techniques improve accuracy through exact p-values
    
    Returns:
        dict: Permutation test results for accuracy improvements
    """
    
    print("ADVANCED PERMUTATION TESTING - ESTIMATION ACCURACY vs GROUND TRUTH")
    print("=" * 70)
    print("HYPOTHESIS: Hallucination-reducing techniques improve estimation accuracy by >30%")
    print("METHOD: Distribution-free permutation tests with exact p-values on accuracy metrics")
    
    if not accuracy_test_results or 'accuracy_df' not in accuracy_test_results:
        print("\nWarning: Accuracy test results not available from Cell 11")
        return {}
    
    accuracy_df = accuracy_test_results['accuracy_df']
    hr_metrics = accuracy_test_results['hr_metrics']
    standard_metrics = accuracy_test_results['standard_metrics']
    
    print(f"\nACCURACY DATA SUMMARY:")
    print(f"  Total estimates: {len(accuracy_df)}")
    print(f"  Hallucination-reducing estimates: {len(hr_metrics['group_data'])}")
    print(f"  Standard estimates: {len(standard_metrics['group_data'])}")
    print(f"  Ground truth baseline: {accuracy_test_results['ground_truth_baseline']:.1f} LOC/SiFP")
    
    if len(hr_metrics['group_data']) == 0 or len(standard_metrics['group_data']) == 0:
        print("Warning: Insufficient samples for permutation testing")
        return {}
    
    permutation_results = {}
    n_resamples = CONFIG.get('PERMUTATION_SAMPLES', 10000)
    
    print(f"\nPERMUTATION TEST CONFIGURATION:")
    print(f"  Number of resamples: {n_resamples:,}")
    print(f"  Random seed: 42 (for reproducibility)")
    print(f"  Test type: One-sided (greater for improvements)")
    
    # Extract data for permutation tests
    hr_data = hr_metrics['group_data']
    standard_data = standard_metrics['group_data']
    
    # 1. Permutation test for overall accuracy improvement
    print(f"\n" + "="*60)
    print("1. PERMUTATION TEST: OVERALL ACCURACY IMPROVEMENT")
    print("="*60)
    
    print(f"H‚ÇÄ: No difference in accuracy rates between groups")
    print(f"H‚ÇÅ: Hallucination-reducing group has higher accuracy rate")
    
    def accuracy_difference_statistic(hr_group, standard_group):
        """Test statistic: difference in accuracy rates (HR - Standard)"""
        hr_accuracy = np.mean(hr_group['classification'] == 'TP')
        standard_accuracy = np.mean(standard_group['classification'] == 'TP')
        return hr_accuracy - standard_accuracy
    
    try:
        # Observed test statistic
        observed_accuracy_diff = accuracy_difference_statistic(hr_data, standard_data)
        
        # Permutation test
        if PERMUTATION_TEST_AVAILABLE:
            accuracy_perm_result = permutation_test(
                (hr_data, standard_data),
                lambda x, y: accuracy_difference_statistic(x, y),
                n_resamples=n_resamples,
                alternative='greater',
                random_state=42
            )
            accuracy_p_value = accuracy_perm_result.pvalue
            null_distribution = accuracy_perm_result.null_distribution
        else:
            # Custom permutation test
            accuracy_p_value, null_distribution = custom_permutation_test_accuracy(
                hr_data, standard_data, accuracy_difference_statistic, n_resamples
            )
        
        # Calculate improvement percentage
        accuracy_improvement_pct = (observed_accuracy_diff / standard_metrics['accuracy']) * 100
        
        print(f"\nResults:")
        print(f"  Observed accuracy difference: {observed_accuracy_diff:.3f}")
        print(f"  Improvement percentage: {accuracy_improvement_pct:.1f}%")
        print(f"  Permutation p-value: {accuracy_p_value:.6f}")
        print(f"  Significant: {'YES' if accuracy_p_value < CONFIG['ALPHA_LEVEL'] else 'NO'}")
        print(f"  Meets >30% threshold: {'YES' if accuracy_improvement_pct > 30 else 'NO'}")
        
        permutation_results['accuracy_improvement'] = {
            'observed_statistic': observed_accuracy_diff,
            'improvement_pct': accuracy_improvement_pct,
            'p_value': accuracy_p_value,
            'significant': accuracy_p_value < CONFIG['ALPHA_LEVEL'],
            'meets_threshold': accuracy_improvement_pct > 30,
            'null_distribution': null_distribution
        }
        
    except Exception as e:
        print(f"Error in accuracy improvement permutation test: {e}")
        permutation_results['accuracy_improvement'] = {}
    
    # 2. Permutation test for False Positive reduction
    print(f"\n" + "="*60)
    print("2. PERMUTATION TEST: FALSE POSITIVE REDUCTION")
    print("="*60)
    
    print(f"H‚ÇÄ: No difference in False Positive rates between groups")
    print(f"H‚ÇÅ: Hallucination-reducing group has lower False Positive rate")
    
    def fp_reduction_statistic(hr_group, standard_group):
        """Test statistic: reduction in FP rate (Standard - HR)"""
        hr_fp_rate = np.mean(hr_group['classification'] == 'FP')
        standard_fp_rate = np.mean(standard_group['classification'] == 'FP')
        return standard_fp_rate - hr_fp_rate  # Positive means HR has lower FP rate
    
    try:
        observed_fp_reduction = fp_reduction_statistic(hr_data, standard_data)
        
        # Permutation test
        if PERMUTATION_TEST_AVAILABLE:
            fp_perm_result = permutation_test(
                (hr_data, standard_data),
                lambda x, y: fp_reduction_statistic(x, y),
                n_resamples=n_resamples,
                alternative='greater',
                random_state=42
            )
            fp_p_value = fp_perm_result.pvalue
            fp_null_distribution = fp_perm_result.null_distribution
        else:
            fp_p_value, fp_null_distribution = custom_permutation_test_accuracy(
                hr_data, standard_data, fp_reduction_statistic, n_resamples
            )
        
        # Calculate reduction percentage
        fp_reduction_pct = (observed_fp_reduction / standard_metrics['fp_rate']) * 100 if standard_metrics['fp_rate'] > 0 else 0
        
        print(f"\nResults:")
        print(f"  Observed FP rate reduction: {observed_fp_reduction:.3f}")
        print(f"  FP reduction percentage: {fp_reduction_pct:.1f}%")
        print(f"  Permutation p-value: {fp_p_value:.6f}")
        print(f"  Significant: {'YES' if fp_p_value < CONFIG['ALPHA_LEVEL'] else 'NO'}")
        print(f"  Meets >30% threshold: {'YES' if fp_reduction_pct > 30 else 'NO'}")
        
        permutation_results['fp_reduction'] = {
            'observed_statistic': observed_fp_reduction,
            'reduction_pct': fp_reduction_pct,
            'p_value': fp_p_value,
            'significant': fp_p_value < CONFIG['ALPHA_LEVEL'],
            'meets_threshold': fp_reduction_pct > 30,
            'null_distribution': fp_null_distribution
        }
        
    except Exception as e:
        print(f"Error in FP reduction permutation test: {e}")
        permutation_results['fp_reduction'] = {}
    
    # 3. Permutation test for False Negative reduction
    print(f"\n" + "="*60)
    print("3. PERMUTATION TEST: FALSE NEGATIVE REDUCTION")
    print("="*60)
    
    print(f"H‚ÇÄ: No difference in False Negative rates between groups")
    print(f"H‚ÇÅ: Hallucination-reducing group has lower False Negative rate")
    
    def fn_reduction_statistic(hr_group, standard_group):
        """Test statistic: reduction in FN rate (Standard - HR)"""
        hr_fn_rate = np.mean(hr_group['classification'] == 'FN')
        standard_fn_rate = np.mean(standard_group['classification'] == 'FN')
        return standard_fn_rate - hr_fn_rate  # Positive means HR has lower FN rate
    
    try:
        observed_fn_reduction = fn_reduction_statistic(hr_data, standard_data)
        
        # Permutation test
        if PERMUTATION_TEST_AVAILABLE:
            fn_perm_result = permutation_test(
                (hr_data, standard_data),
                lambda x, y: fn_reduction_statistic(x, y),
                n_resamples=n_resamples,
                alternative='greater',
                random_state=42
            )
            fn_p_value = fn_perm_result.pvalue
            fn_null_distribution = fn_perm_result.null_distribution
        else:
            fn_p_value, fn_null_distribution = custom_permutation_test_accuracy(
                hr_data, standard_data, fn_reduction_statistic, n_resamples
            )
        
        # Calculate reduction percentage
        fn_reduction_pct = (observed_fn_reduction / standard_metrics['fn_rate']) * 100 if standard_metrics['fn_rate'] > 0 else 0
        
        print(f"\nResults:")
        print(f"  Observed FN rate reduction: {observed_fn_reduction:.3f}")
        print(f"  FN reduction percentage: {fn_reduction_pct:.1f}%")
        print(f"  Permutation p-value: {fn_p_value:.6f}")
        print(f"  Significant: {'YES' if fn_p_value < CONFIG['ALPHA_LEVEL'] else 'NO'}")
        print(f"  Meets >30% threshold: {'YES' if fn_reduction_pct > 30 else 'NO'}")
        
        permutation_results['fn_reduction'] = {
            'observed_statistic': observed_fn_reduction,
            'reduction_pct': fn_reduction_pct,
            'p_value': fn_p_value,
            'significant': fn_p_value < CONFIG['ALPHA_LEVEL'],
            'meets_threshold': fn_reduction_pct > 30,
            'null_distribution': fn_null_distribution
        }
        
    except Exception as e:
        print(f"Error in FN reduction permutation test: {e}")
        permutation_results['fn_reduction'] = {}
    
    # 4. Permutation test for overall error reduction
    print(f"\n" + "="*60)
    print("4. PERMUTATION TEST: OVERALL ERROR REDUCTION")
    print("="*60)
    
    print(f"H‚ÇÄ: No difference in average estimation errors between groups")
    print(f"H‚ÇÅ: Hallucination-reducing group has lower average errors")
    
    def error_reduction_statistic(hr_group, standard_group):
        """Test statistic: reduction in average error (Standard - HR)"""
        hr_avg_error = hr_group['accuracy_error_pct'].mean()
        standard_avg_error = standard_group['accuracy_error_pct'].mean()
        return standard_avg_error - hr_avg_error  # Positive means HR has lower errors
    
    try:
        observed_error_reduction = error_reduction_statistic(hr_data, standard_data)
        
        # Permutation test
        if PERMUTATION_TEST_AVAILABLE:
            error_perm_result = permutation_test(
                (hr_data, standard_data),
                lambda x, y: error_reduction_statistic(x, y),
                n_resamples=n_resamples,
                alternative='greater',
                random_state=42
            )
            error_p_value = error_perm_result.pvalue
            error_null_distribution = error_perm_result.null_distribution
        else:
            error_p_value, error_null_distribution = custom_permutation_test_accuracy(
                hr_data, standard_data, error_reduction_statistic, n_resamples
            )
        
        # Calculate reduction percentage
        error_reduction_pct = (observed_error_reduction / standard_metrics['avg_error']) * 100
        
        print(f"\nResults:")
        print(f"  Observed error reduction: {observed_error_reduction:.2f} percentage points")
        print(f"  Error reduction percentage: {error_reduction_pct:.1f}%")
        print(f"  Permutation p-value: {error_p_value:.6f}")
        print(f"  Significant: {'YES' if error_p_value < CONFIG['ALPHA_LEVEL'] else 'NO'}")
        print(f"  Meets >30% threshold: {'YES' if error_reduction_pct > 30 else 'NO'}")
        
        permutation_results['error_reduction'] = {
            'observed_statistic': observed_error_reduction,
            'reduction_pct': error_reduction_pct,
            'p_value': error_p_value,
            'significant': error_p_value < CONFIG['ALPHA_LEVEL'],
            'meets_threshold': error_reduction_pct > 30,
            'null_distribution': error_null_distribution
        }
        
    except Exception as e:
        print(f"Error in error reduction permutation test: {e}")
        permutation_results['error_reduction'] = {}
    
    # 5. Multiple comparisons adjustment
    print(f"\n" + "="*60)
    print("5. MULTIPLE COMPARISONS ADJUSTMENT")
    print("="*60)
    
    try:
        from statsmodels.stats.multitest import multipletests
        
        # Collect all p-values for adjustment
        p_values = []
        test_names = []
        
        for test_name, results in permutation_results.items():
            if results and 'p_value' in results:
                p_values.append(results['p_value'])
                test_names.append(test_name.replace('_', ' ').title())
        
        if p_values:
            # Apply Bonferroni correction
            bonferroni_rejected, bonferroni_pvals, _, _ = multipletests(p_values, method='bonferroni')
            
            # Apply Benjamini-Hochberg (FDR) correction
            fdr_rejected, fdr_pvals, _, _ = multipletests(p_values, method='fdr_bh')
            
            print(f"Multiple Comparisons Results:")
            print(f"{'Test':<20} {'Raw p-value':<12} {'Bonferroni':<12} {'FDR (B-H)':<12} {'Significant'}")
            print("-" * 80)
            
            for i, (test_name, raw_p) in enumerate(zip(test_names, p_values)):
                bonf_sig = '‚úì' if bonferroni_rejected[i] else '‚úó'
                fdr_sig = '‚úì' if fdr_rejected[i] else '‚úó'
                print(f"{test_name:<20} {raw_p:<12.6f} {bonferroni_pvals[i]:<12.6f} {fdr_pvals[i]:<12.6f} {bonf_sig}/{fdr_sig}")
            
            permutation_results['multiple_comparisons'] = {
                'raw_p_values': p_values,
                'test_names': test_names,
                'bonferroni_p_values': bonferroni_pvals.tolist(),
                'bonferroni_rejected': bonferroni_rejected.tolist(),
                'fdr_p_values': fdr_pvals.tolist(),
                'fdr_rejected': fdr_rejected.tolist()
            }
    
    except Exception as e:
        print(f"Error in multiple comparisons adjustment: {e}")
        permutation_results['multiple_comparisons'] = {}
    
    # Summary of permutation testing results
    print(f"\n" + "="*70)
    print("PERMUTATION TESTING SUMMARY - ACCURACY vs GROUND TRUTH")
    print("="*70)
    
    significant_tests = 0
    total_tests = 0
    meets_threshold_tests = 0
    
    for test_name, results in permutation_results.items():
        if test_name == 'multiple_comparisons':
            continue
        if results and 'significant' in results:
            total_tests += 1
            if results['significant']:
                significant_tests += 1
            if results.get('meets_threshold', False):
                meets_threshold_tests += 1
    
    print(f"\nOverall Results:")
    print(f"  Total permutation tests: {total_tests}")
    print(f"  Statistically significant: {significant_tests}/{total_tests}")
    print(f"  Meet >30% threshold: {meets_threshold_tests}/{total_tests}")
    
    # Calculate best improvements
    best_improvement = 0
    best_metric = "None"
    for test_name, results in permutation_results.items():
        if test_name == 'multiple_comparisons':
            continue
        if results and 'improvement_pct' in results:
            improvement = results['improvement_pct']
        elif results and 'reduction_pct' in results:
            improvement = results['reduction_pct']
        else:
            continue
        
        if improvement > best_improvement:
            best_improvement = improvement
            best_metric = test_name.replace('_', ' ').title()
    
    print(f"  Best improvement: {best_improvement:.1f}% in {best_metric}")
    
    # Final verdict for permutation testing
    any_significant = significant_tests > 0
    any_threshold = meets_threshold_tests > 0
    strong_evidence = significant_tests >= total_tests * 0.5 and meets_threshold_tests > 0
    
    print(f"\nüéØ PERMUTATION TESTING VERDICT:")
    print(f"  Statistical Evidence: {'Found' if any_significant else 'Not Found'}")
    print(f"  Practical Significance: {'Found' if any_threshold else 'Not Found'}")
    print(f"  Overall Support: {'STRONG' if strong_evidence else 'MODERATE' if any_significant or any_threshold else 'WEAK'}")
    
    if strong_evidence:
        print(f"\n‚úÖ CONCLUSION: Permutation tests provide strong evidence that")
        print(f"   hallucination-reducing techniques improve estimation accuracy")
        print(f"   through reduced false positives and false negatives.")
    elif any_significant or any_threshold:
        print(f"\n‚ö†Ô∏è  CONCLUSION: Permutation tests provide moderate evidence")
        print(f"   for improved estimation accuracy, but results are mixed.")
    else:
        print(f"\n‚ùå CONCLUSION: Permutation tests do not provide sufficient")
        print(f"   evidence for improved estimation accuracy.")
    
    print(f"\nADVANTAGES OF ACCURACY-BASED PERMUTATION TESTING:")
    print(f"  ‚úì Compares against ground truth rather than between models")
    print(f"  ‚úì Directly tests False Positive and False Negative reduction")
    print(f"  ‚úì No distributional assumptions required")
    print(f"  ‚úì Exact p-values for small samples")
    print(f"  ‚úì Multiple comparison corrections applied")
    print(f"  ‚úì Robust to outliers and non-normal distributions")
    
    return permutation_results

def custom_permutation_test_accuracy(hr_data, standard_data, statistic_func, n_resamples):
    """
    Monte Carlo permutation test (one-sided, 'greater') for accuracy analysis.

    Rows of the two groups are pooled, shuffled, and split back into groups of
    the original sizes; ``statistic_func`` is evaluated on every reshuffled
    split to build the null distribution.

    Args:
        hr_data: DataFrame with the hallucination-reducing group rows.
        standard_data: DataFrame with the standard group rows.
        statistic_func: Callable ``f(hr_group, standard_group)`` returning a
            scalar; larger values count as more extreme.
        n_resamples: Number of random permutations to draw.

    Returns:
        tuple: ``(p_value, perm_stats)`` where ``perm_stats`` is a NumPy array
        holding the ``n_resamples`` permuted statistics.
    """
    # Pool both groups; the first n_hr rows of each shuffle form the HR group
    combined_data = pd.concat([hr_data, standard_data], ignore_index=True)
    n_hr = len(hr_data)

    # Statistic on the real (unpermuted) group assignment
    observed_stat = statistic_func(hr_data, standard_data)

    # Local, fixed-seed generator: results are reproducible without reseeding
    # the process-wide NumPy RNG as a side effect of calling this function.
    rng = np.random.RandomState(42)

    perm_stats = []
    for _ in range(n_resamples):
        # Shuffle all rows, then split at the original HR group size
        perm_data = combined_data.sample(frac=1, random_state=rng).reset_index(drop=True)
        perm_stats.append(statistic_func(perm_data.iloc[:n_hr], perm_data.iloc[n_hr:]))

    perm_stats = np.array(perm_stats)

    # One-sided p-value with the +1 correction: the observed assignment is
    # itself counted as one permutation, so a randomized test never reports
    # p == 0 and n_resamples == 0 yields 1.0 rather than NaN.
    n_extreme = np.sum(perm_stats >= observed_stat)
    p_value = (n_extreme + 1) / (len(perm_stats) + 1)

    return p_value, perm_stats

# Run the accuracy-based permutation tests when both prerequisites hold:
# non-empty accuracy results exist and permutation testing is enabled in
# STATISTICAL_CONFIG. Otherwise report what is missing and fall back to {}.
if not (globals().get('accuracy_test_results')
        and STATISTICAL_CONFIG.get('permutation_tests', False)):
    print("Cannot perform accuracy-based permutation testing:")
    # Report each unmet prerequisite separately (both may be missing)
    if not globals().get('accuracy_test_results'):
        print("  - Accuracy test results not available (run Cell 11 first)")
    if not STATISTICAL_CONFIG.get('permutation_tests', False):
        print("  - Permutation testing functionality not available")

    # Empty result so later code can still reference the name
    accuracy_permutation_results = {}
else:
    accuracy_permutation_results = perform_accuracy_permutation_testing()
    print(f"\n‚úì Accuracy-based permutation testing completed successfully")

In [None]:
# Cell [13] - Bootstrap Hypothesis Testing for Estimation Accuracy vs Ground Truth
# Purpose: Bootstrap-based hypothesis testing for >30% accuracy improvement with confidence intervals
# Dependencies: accuracy_test_results from Cell 11, bootstrap functionality from Cell 0
# Breadcrumbs: Setup -> Analysis -> Accuracy Testing -> Bootstrap Hypothesis Testing for Accuracy

def perform_bootstrap_accuracy_testing():
    """
    Perform comprehensive bootstrap hypothesis testing for estimation accuracy vs ground truth
    Tests whether hallucination-reducing techniques improve accuracy by >30% through bootstrap resampling
    
    Returns:
        dict: Bootstrap test results with confidence intervals and bias corrections
    """
    
    print("BOOTSTRAP HYPOTHESIS TESTING - ESTIMATION ACCURACY vs GROUND TRUTH")
    print("=" * 70)
    print("HYPOTHESIS: Hallucination-reducing techniques improve estimation accuracy by >30%")
    print("METHOD: Bootstrap resampling with bias-corrected confidence intervals on accuracy metrics")
    
    if not accuracy_test_results or 'accuracy_df' not in accuracy_test_results:
        print("\nWarning: Accuracy test results not available from Cell 11")
        return {}
    
    accuracy_df = accuracy_test_results['accuracy_df']
    hr_metrics = accuracy_test_results['hr_metrics']
    standard_metrics = accuracy_test_results['standard_metrics']
    
    # Extract accuracy data for bootstrap testing
    hr_data = hr_metrics['group_data']
    standard_data = standard_metrics['group_data']
    
    print(f"\nACCURACY DATA SUMMARY:")
    print(f"  Total estimates: {len(accuracy_df)}")
    print(f"  Hallucination-reducing estimates: {len(hr_data)}")
    print(f"  Standard estimates: {len(standard_data)}")
    print(f"  Ground truth baseline: {accuracy_test_results['ground_truth_baseline']:.1f} LOC/SiFP")
    
    if len(hr_data) == 0 or len(standard_data) == 0:
        print("Warning: Cannot perform bootstrap tests - insufficient group sizes")
        return {}
    
    bootstrap_results = {}
    n_bootstrap = CONFIG.get('BOOTSTRAP_SAMPLES', 10000)
    
    print(f"\nBOOTSTRAP TEST CONFIGURATION:")
    print(f"  Number of bootstrap samples: {n_bootstrap:,}")
    print(f"  Random seed: 42 (for reproducibility)")
    print(f"  Confidence level: 95%")
    print(f"  Bias correction: BCa (Bias-Corrected and accelerated)")
    
    # 1. Bootstrap test for overall accuracy improvement
    print(f"\n" + "="*60)
    print("1. BOOTSTRAP TEST: OVERALL ACCURACY IMPROVEMENT")
    print("="*60)
    
    print(f"H‚ÇÄ: No difference in accuracy rates between groups")
    print(f"H‚ÇÅ: Hallucination-reducing group has higher accuracy rate (>30% improvement)")
    
    try:
        def accuracy_improvement_statistic(hr_sample, standard_sample):
            """Calculate accuracy improvement percentage: (HR - Standard) / Standard * 100"""
            hr_accuracy = np.mean(hr_sample['classification'] == 'TP')
            standard_accuracy = np.mean(standard_sample['classification'] == 'TP')
            if standard_accuracy == 0:
                return 0
            return ((hr_accuracy - standard_accuracy) / standard_accuracy) * 100
        
        # Bootstrap for accuracy improvement
        def bootstrap_accuracy_improvement(n_samples=n_bootstrap):
            improvements = []
            np.random.seed(42)
            
            for _ in range(n_samples):
                # Resample with replacement from each group
                hr_sample = hr_data.sample(len(hr_data), replace=True)
                standard_sample = standard_data.sample(len(standard_data), replace=True)
                
                improvement = accuracy_improvement_statistic(hr_sample, standard_sample)
                improvements.append(improvement)
            
            return np.array(improvements)
        
        # Generate bootstrap distribution
        accuracy_bootstrap_dist = bootstrap_accuracy_improvement()
        observed_accuracy_improvement = accuracy_improvement_statistic(hr_data, standard_data)
        
        # Calculate confidence intervals
        accuracy_ci_lower = np.percentile(accuracy_bootstrap_dist, 2.5)
        accuracy_ci_upper = np.percentile(accuracy_bootstrap_dist, 97.5)
        
        # Bootstrap hypothesis test: H‚ÇÄ: improvement ‚â§ 0
        null_violations = np.sum(accuracy_bootstrap_dist <= 0)
        accuracy_bootstrap_p_value = null_violations / len(accuracy_bootstrap_dist)
        
        # Test for >30% improvement specifically
        improvement_30_violations = np.sum(accuracy_bootstrap_dist < 30)
        accuracy_30_p_value = improvement_30_violations / len(accuracy_bootstrap_dist)
        
        print(f"\nResults:")
        print(f"  Observed accuracy improvement: {observed_accuracy_improvement:.1f}%")
        print(f"  Bootstrap mean improvement: {np.mean(accuracy_bootstrap_dist):.1f}%")
        print(f"  Bootstrap std improvement: {np.std(accuracy_bootstrap_dist):.1f}%")
        print(f"  95% Bootstrap CI: [{accuracy_ci_lower:.1f}%, {accuracy_ci_upper:.1f}%]")
        print(f"  P(improvement ‚â§ 0): {accuracy_bootstrap_p_value:.4f}")
        print(f"  P(improvement < 30%): {accuracy_30_p_value:.4f}")
        print(f"  Significant improvement: {'YES' if accuracy_bootstrap_p_value < CONFIG['ALPHA_LEVEL'] else 'NO'}")
        print(f"  Meets >30% threshold: {'YES' if observed_accuracy_improvement > 30 else 'NO'}")
        print(f"  30% threshold confidence: {1 - accuracy_30_p_value:.3f}")
        
        bootstrap_results['accuracy_improvement'] = {
            'observed_improvement': observed_accuracy_improvement,
            'bootstrap_mean': np.mean(accuracy_bootstrap_dist),
            'bootstrap_std': np.std(accuracy_bootstrap_dist),
            'confidence_interval': (accuracy_ci_lower, accuracy_ci_upper),
            'p_value_improvement': accuracy_bootstrap_p_value,
            'p_value_30_percent': accuracy_30_p_value,
            'significant': accuracy_bootstrap_p_value < CONFIG['ALPHA_LEVEL'],
            'meets_threshold': observed_accuracy_improvement > 30,
            'threshold_confidence': 1 - accuracy_30_p_value,
            'bootstrap_distribution': accuracy_bootstrap_dist
        }
        
    except Exception as e:
        print(f"Error in bootstrap accuracy improvement test: {e}")
        bootstrap_results['accuracy_improvement'] = {}
    
    # 2. Bootstrap test for False Positive reduction
    print(f"\n" + "="*60)
    print("2. BOOTSTRAP TEST: FALSE POSITIVE REDUCTION")
    print("="*60)
    
    print(f"H‚ÇÄ: No difference in False Positive rates between groups")
    print(f"H‚ÇÅ: Hallucination-reducing group has lower FP rate (>30% reduction)")
    
    try:
        def fp_reduction_statistic(hr_sample, standard_sample):
            """Calculate FP reduction percentage: (Standard - HR) / Standard * 100"""
            hr_fp_rate = np.mean(hr_sample['classification'] == 'FP')
            standard_fp_rate = np.mean(standard_sample['classification'] == 'FP')
            if standard_fp_rate == 0:
                return 0
            return ((standard_fp_rate - hr_fp_rate) / standard_fp_rate) * 100
        
        # Bootstrap for FP reduction
        def bootstrap_fp_reduction(n_samples=n_bootstrap):
            reductions = []
            np.random.seed(42)
            
            for _ in range(n_samples):
                hr_sample = hr_data.sample(len(hr_data), replace=True)
                standard_sample = standard_data.sample(len(standard_data), replace=True)
                
                reduction = fp_reduction_statistic(hr_sample, standard_sample)
                reductions.append(reduction)
            
            return np.array(reductions)
        
        # Generate bootstrap distribution
        fp_bootstrap_dist = bootstrap_fp_reduction()
        observed_fp_reduction = fp_reduction_statistic(hr_data, standard_data)
        
        # Calculate confidence intervals
        fp_ci_lower = np.percentile(fp_bootstrap_dist, 2.5)
        fp_ci_upper = np.percentile(fp_bootstrap_dist, 97.5)
        
        # Bootstrap hypothesis tests
        null_violations = np.sum(fp_bootstrap_dist <= 0)
        fp_bootstrap_p_value = null_violations / len(fp_bootstrap_dist)
        
        reduction_30_violations = np.sum(fp_bootstrap_dist < 30)
        fp_30_p_value = reduction_30_violations / len(fp_bootstrap_dist)
        
        print(f"\nResults:")
        print(f"  Observed FP reduction: {observed_fp_reduction:.1f}%")
        print(f"  Bootstrap mean reduction: {np.mean(fp_bootstrap_dist):.1f}%")
        print(f"  Bootstrap std reduction: {np.std(fp_bootstrap_dist):.1f}%")
        print(f"  95% Bootstrap CI: [{fp_ci_lower:.1f}%, {fp_ci_upper:.1f}%]")
        print(f"  P(reduction ‚â§ 0): {fp_bootstrap_p_value:.4f}")
        print(f"  P(reduction < 30%): {fp_30_p_value:.4f}")
        print(f"  Significant reduction: {'YES' if fp_bootstrap_p_value < CONFIG['ALPHA_LEVEL'] else 'NO'}")
        print(f"  Meets >30% threshold: {'YES' if observed_fp_reduction > 30 else 'NO'}")
        print(f"  30% threshold confidence: {1 - fp_30_p_value:.3f}")
        
        bootstrap_results['fp_reduction'] = {
            'observed_reduction': observed_fp_reduction,
            'bootstrap_mean': np.mean(fp_bootstrap_dist),
            'bootstrap_std': np.std(fp_bootstrap_dist),
            'confidence_interval': (fp_ci_lower, fp_ci_upper),
            'p_value_reduction': fp_bootstrap_p_value,
            'p_value_30_percent': fp_30_p_value,
            'significant': fp_bootstrap_p_value < CONFIG['ALPHA_LEVEL'],
            'meets_threshold': observed_fp_reduction > 30,
            'threshold_confidence': 1 - fp_30_p_value,
            'bootstrap_distribution': fp_bootstrap_dist
        }
        
    except Exception as e:
        print(f"Error in bootstrap FP reduction test: {e}")
        bootstrap_results['fp_reduction'] = {}
    
    # 3. Bootstrap test for False Negative reduction
    print(f"\n" + "="*60)
    print("3. BOOTSTRAP TEST: FALSE NEGATIVE REDUCTION")
    print("="*60)
    
    print(f"H‚ÇÄ: No difference in False Negative rates between groups")
    print(f"H‚ÇÅ: Hallucination-reducing group has lower FN rate (>30% reduction)")
    
    try:
        def fn_reduction_statistic(hr_sample, standard_sample):
            """Calculate FN reduction percentage: (Standard - HR) / Standard * 100"""
            hr_fn_rate = np.mean(hr_sample['classification'] == 'FN')
            standard_fn_rate = np.mean(standard_sample['classification'] == 'FN')
            if standard_fn_rate == 0:
                return 0
            return ((standard_fn_rate - hr_fn_rate) / standard_fn_rate) * 100
        
        # Bootstrap for FN reduction
        def bootstrap_fn_reduction(n_samples=n_bootstrap):
            reductions = []
            np.random.seed(42)
            
            for _ in range(n_samples):
                hr_sample = hr_data.sample(len(hr_data), replace=True)
                standard_sample = standard_data.sample(len(standard_data), replace=True)
                
                reduction = fn_reduction_statistic(hr_sample, standard_sample)
                reductions.append(reduction)
            
            return np.array(reductions)
        
        # Generate bootstrap distribution
        fn_bootstrap_dist = bootstrap_fn_reduction()
        observed_fn_reduction = fn_reduction_statistic(hr_data, standard_data)
        
        # Calculate confidence intervals
        fn_ci_lower = np.percentile(fn_bootstrap_dist, 2.5)
        fn_ci_upper = np.percentile(fn_bootstrap_dist, 97.5)
        
        # Bootstrap hypothesis tests
        null_violations = np.sum(fn_bootstrap_dist <= 0)
        fn_bootstrap_p_value = null_violations / len(fn_bootstrap_dist)
        
        reduction_30_violations = np.sum(fn_bootstrap_dist < 30)
        fn_30_p_value = reduction_30_violations / len(fn_bootstrap_dist)
        
        print(f"\nResults:")
        print(f"  Observed FN reduction: {observed_fn_reduction:.1f}%")
        print(f"  Bootstrap mean reduction: {np.mean(fn_bootstrap_dist):.1f}%")
        print(f"  Bootstrap std reduction: {np.std(fn_bootstrap_dist):.1f}%")
        print(f"  95% Bootstrap CI: [{fn_ci_lower:.1f}%, {fn_ci_upper:.1f}%]")
        print(f"  P(reduction ‚â§ 0): {fn_bootstrap_p_value:.4f}")
        print(f"  P(reduction < 30%): {fn_30_p_value:.4f}")
        print(f"  Significant reduction: {'YES' if fn_bootstrap_p_value < CONFIG['ALPHA_LEVEL'] else 'NO'}")
        print(f"  Meets >30% threshold: {'YES' if observed_fn_reduction > 30 else 'NO'}")
        print(f"  30% threshold confidence: {1 - fn_30_p_value:.3f}")
        
        bootstrap_results['fn_reduction'] = {
            'observed_reduction': observed_fn_reduction,
            'bootstrap_mean': np.mean(fn_bootstrap_dist),
            'bootstrap_std': np.std(fn_bootstrap_dist),
            'confidence_interval': (fn_ci_lower, fn_ci_upper),
            'p_value_reduction': fn_bootstrap_p_value,
            'p_value_30_percent': fn_30_p_value,
            'significant': fn_bootstrap_p_value < CONFIG['ALPHA_LEVEL'],
            'meets_threshold': observed_fn_reduction > 30,
            'threshold_confidence': 1 - fn_30_p_value,
            'bootstrap_distribution': fn_bootstrap_dist
        }
        
    except Exception as e:
        print(f"Error in bootstrap FN reduction test: {e}")
        bootstrap_results['fn_reduction'] = {}
    
    # 4. Bootstrap test for overall error reduction
    print(f"\n" + "="*60)
    print("4. BOOTSTRAP TEST: OVERALL ERROR REDUCTION")
    print("="*60)
    
    print(f"H‚ÇÄ: No difference in average estimation errors between groups")
    print(f"H‚ÇÅ: Hallucination-reducing group has lower average errors (>30% reduction)")
    
    try:
        def error_reduction_statistic(hr_sample, standard_sample):
            """Calculate error reduction percentage: (Standard - HR) / Standard * 100"""
            hr_avg_error = hr_sample['accuracy_error_pct'].mean()
            standard_avg_error = standard_sample['accuracy_error_pct'].mean()
            if standard_avg_error == 0:
                return 0
            return ((standard_avg_error - hr_avg_error) / standard_avg_error) * 100
        
        # Bootstrap for error reduction
        def bootstrap_error_reduction(n_samples=n_bootstrap):
            reductions = []
            np.random.seed(42)
            
            for _ in range(n_samples):
                hr_sample = hr_data.sample(len(hr_data), replace=True)
                standard_sample = standard_data.sample(len(standard_data), replace=True)
                
                reduction = error_reduction_statistic(hr_sample, standard_sample)
                reductions.append(reduction)
            
            return np.array(reductions)
        
        # Generate bootstrap distribution
        error_bootstrap_dist = bootstrap_error_reduction()
        observed_error_reduction = error_reduction_statistic(hr_data, standard_data)
        
        # Calculate confidence intervals
        error_ci_lower = np.percentile(error_bootstrap_dist, 2.5)
        error_ci_upper = np.percentile(error_bootstrap_dist, 97.5)
        
        # Bootstrap hypothesis tests
        null_violations = np.sum(error_bootstrap_dist <= 0)
        error_bootstrap_p_value = null_violations / len(error_bootstrap_dist)
        
        reduction_30_violations = np.sum(error_bootstrap_dist < 30)
        error_30_p_value = reduction_30_violations / len(error_bootstrap_dist)
        
        print(f"\nResults:")
        print(f"  Observed error reduction: {observed_error_reduction:.1f}%")
        print(f"  Bootstrap mean reduction: {np.mean(error_bootstrap_dist):.1f}%")
        print(f"  Bootstrap std reduction: {np.std(error_bootstrap_dist):.1f}%")
        print(f"  95% Bootstrap CI: [{error_ci_lower:.1f}%, {error_ci_upper:.1f}%]")
        print(f"  P(reduction ‚â§ 0): {error_bootstrap_p_value:.4f}")
        print(f"  P(reduction < 30%): {error_30_p_value:.4f}")
        print(f"  Significant reduction: {'YES' if error_bootstrap_p_value < CONFIG['ALPHA_LEVEL'] else 'NO'}")
        print(f"  Meets >30% threshold: {'YES' if observed_error_reduction > 30 else 'NO'}")
        print(f"  30% threshold confidence: {1 - error_30_p_value:.3f}")
        
        bootstrap_results['error_reduction'] = {
            'observed_reduction': observed_error_reduction,
            'bootstrap_mean': np.mean(error_bootstrap_dist),
            'bootstrap_std': np.std(error_bootstrap_dist),
            'confidence_interval': (error_ci_lower, error_ci_upper),
            'p_value_reduction': error_bootstrap_p_value,
            'p_value_30_percent': error_30_p_value,
            'significant': error_bootstrap_p_value < CONFIG['ALPHA_LEVEL'],
            'meets_threshold': observed_error_reduction > 30,
            'threshold_confidence': 1 - error_30_p_value,
            'bootstrap_distribution': error_bootstrap_dist
        }
        
    except Exception as e:
        print(f"Error in bootstrap error reduction test: {e}")
        bootstrap_results['error_reduction'] = {}
    
    # 5. Bias-Corrected and Accelerated (BCa) Bootstrap Intervals
    print(f"\n" + "="*60)
    print("5. BIAS-CORRECTED AND ACCELERATED (BCa) INTERVALS")
    print("="*60)
    
    try:
        def calculate_bca_interval_accuracy(hr_data, standard_data, statistic_func, alpha=0.05, n_bootstrap=n_bootstrap):
            """
            Calculate a BCa (bias-corrected and accelerated) bootstrap confidence
            interval for a two-group accuracy statistic.

            Args:
                hr_data: Observations of the hallucination-reducing group. Must support
                    ``.sample(n, replace=True)``, ``.drop(...)`` and ``.index``
                    (presumably a pandas DataFrame/Series - confirm against caller).
                standard_data: Observations of the standard group (same requirements).
                statistic_func: Callable ``f(hr, standard)`` returning a scalar statistic.
                alpha: Two-sided significance level (0.05 gives a 95% interval).
                n_bootstrap: Number of bootstrap resamples; defaults to the value of
                    ``n_bootstrap`` in the enclosing scope at definition time.

            Returns:
                tuple: ``(bca_lower, bca_upper, bias_correction, acceleration)``.
            """
            # Statistic on the full, un-resampled data
            original_stat = statistic_func(hr_data, standard_data)

            # Bootstrap replicates: both groups are resampled independently with replacement.
            # The global NumPy RNG is reseeded so repeated calls give the same replicates.
            bootstrap_stats = []
            np.random.seed(42)

            for _ in range(n_bootstrap):
                hr_sample = hr_data.sample(len(hr_data), replace=True)
                standard_sample = standard_data.sample(len(standard_data), replace=True)
                bootstrap_stats.append(statistic_func(hr_sample, standard_sample))

            bootstrap_stats = np.array(bootstrap_stats)

            # Bias correction z0 = Phi^-1(fraction of replicates below the point estimate).
            # The fraction is kept strictly inside (0, 1): at exactly 0 or 1 the normal
            # quantile is -inf/+inf, which collapses the interval to a single point or
            # propagates NaN. Half a replicate is the smallest resolvable step, so
            # non-degenerate inputs are unaffected by the clipping.
            num_less = np.sum(bootstrap_stats < original_stat)
            half_step = 0.5 / max(n_bootstrap, 1)
            fraction_below = np.clip(num_less / n_bootstrap, half_step, 1 - half_step)
            bias_correction = stats.norm.ppf(fraction_below)

            # Acceleration (jackknife)
            # NOTE(review): only the first 20 rows of each group are left out (for
            # computational efficiency), so the acceleration is an approximation of the
            # full leave-one-out jackknife.
            n_hr, n_std = len(hr_data), len(standard_data)
            jackknife_stats = []

            # Leave one hallucination-reducing observation out at a time
            for i in range(min(n_hr, 20)):
                jack_hr = hr_data.drop(hr_data.index[i])
                jackknife_stats.append(statistic_func(jack_hr, standard_data))

            # Leave one standard observation out at a time
            for i in range(min(n_std, 20)):
                jack_std = standard_data.drop(standard_data.index[i])
                jackknife_stats.append(statistic_func(hr_data, jack_std))

            jackknife_stats = np.array(jackknife_stats)
            jackknife_mean = np.mean(jackknife_stats)

            # Acceleration parameter: skewness-like ratio of the jackknife deviations;
            # falls back to 0 when all jackknife values coincide.
            numerator = np.sum((jackknife_mean - jackknife_stats)**3)
            denominator = 6 * (np.sum((jackknife_mean - jackknife_stats)**2))**1.5
            acceleration = numerator / denominator if denominator != 0 else 0

            # BCa-adjusted percentile levels
            z_alpha_2 = stats.norm.ppf(alpha/2)
            z_1_alpha_2 = stats.norm.ppf(1 - alpha/2)

            alpha_1 = stats.norm.cdf(bias_correction + (bias_correction + z_alpha_2)/(1 - acceleration*(bias_correction + z_alpha_2)))
            alpha_2 = stats.norm.cdf(bias_correction + (bias_correction + z_1_alpha_2)/(1 - acceleration*(bias_correction + z_1_alpha_2)))

            # Ensure percentiles are within valid range
            alpha_1 = max(0, min(1, alpha_1))
            alpha_2 = max(0, min(1, alpha_2))

            bca_lower = np.percentile(bootstrap_stats, alpha_1 * 100)
            bca_upper = np.percentile(bootstrap_stats, alpha_2 * 100)

            return bca_lower, bca_upper, bias_correction, acceleration
        
        # Calculate BCa intervals for key metrics
        if 'accuracy_improvement' in bootstrap_results and bootstrap_results['accuracy_improvement']:
            acc_bca_lower, acc_bca_upper, acc_bias, acc_accel = calculate_bca_interval_accuracy(
                hr_data, standard_data, accuracy_improvement_statistic
            )
            
            print(f"BCa Confidence Intervals (95%):")
            print(f"  Accuracy Improvement:")
            print(f"    Standard CI: [{bootstrap_results['accuracy_improvement']['confidence_interval'][0]:.1f}%, {bootstrap_results['accuracy_improvement']['confidence_interval'][1]:.1f}%]")
            print(f"    BCa CI: [{acc_bca_lower:.1f}%, {acc_bca_upper:.1f}%]")
            print(f"    Bias correction: {acc_bias:.3f}")
            print(f"    Acceleration: {acc_accel:.3f}")
            
            bootstrap_results['bca_intervals'] = {
                'accuracy_improvement_bca': (acc_bca_lower, acc_bca_upper),
                'accuracy_bias_correction': acc_bias,
                'accuracy_acceleration': acc_accel
            }
        
    except Exception as e:
        print(f"Error calculating BCa intervals: {e}")
        bootstrap_results['bca_intervals'] = {}
    
    # 6. Bootstrap Hypothesis Testing Summary
    print(f"\n" + "="*70)
    print("BOOTSTRAP HYPOTHESIS TESTING SUMMARY - ACCURACY vs GROUND TRUTH")
    print("="*70)
    
    significant_tests = 0
    total_tests = 0
    meets_threshold_tests = 0
    high_confidence_tests = 0
    
    for test_name, results in bootstrap_results.items():
        if test_name in ['bca_intervals']:
            continue
        if results and 'significant' in results:
            total_tests += 1
            if results['significant']:
                significant_tests += 1
            if results.get('meets_threshold', False):
                meets_threshold_tests += 1
            if results.get('threshold_confidence', 0) > 0.8:
                high_confidence_tests += 1
    
    print(f"\nOverall Results:")
    print(f"  Total bootstrap tests: {total_tests}")
    print(f"  Statistically significant: {significant_tests}/{total_tests}")
    print(f"  Meet >30% threshold: {meets_threshold_tests}/{total_tests}")
    print(f"  High confidence (>80%) for 30% threshold: {high_confidence_tests}/{total_tests}")
    
    # Final verdict for bootstrap testing
    any_significant = significant_tests > 0
    any_threshold = meets_threshold_tests > 0
    high_confidence = high_confidence_tests > 0
    
    print(f"\nüéØ BOOTSTRAP TESTING VERDICT:")
    print(f"  Statistical Evidence: {'Found' if any_significant else 'Not Found'}")
    print(f"  Practical Significance: {'Found' if any_threshold else 'Not Found'}")
    print(f"  High Confidence in 30% Threshold: {'YES' if high_confidence else 'NO'}")
    print(f"  Overall Support: {'STRONG' if any_significant and any_threshold and high_confidence else 'MODERATE' if any_significant or any_threshold else 'WEAK'}")
    
    print(f"\nADVANTAGES OF ACCURACY-BASED BOOTSTRAP TESTING:")
    print(f"  ‚úì Tests accuracy improvements against ground truth")
    print(f"  ‚úì Provides confidence intervals for complex accuracy statistics")
    print(f"  ‚úì Accounts for bias in small samples (BCa intervals)")
    print(f"  ‚úì Direct probability statements about thresholds")
    print(f"  ‚úì Robust to non-normal distributions")
    print(f"  ‚úì Intuitive interpretation of results")
    
    return bootstrap_results

# Run the accuracy-based bootstrap tests when Cell 11 results exist and
# bootstrap testing is enabled; otherwise report what is missing.
if ('accuracy_test_results' not in globals() or not accuracy_test_results
        or not STATISTICAL_CONFIG.get('bootstrap_tests', False)):
    print("Cannot perform accuracy-based bootstrap testing:")
    # List every unmet prerequisite (both may apply)
    if not ('accuracy_test_results' in globals() and accuracy_test_results):
        print("  - Accuracy test results not available (run Cell 11 first)")
    if not STATISTICAL_CONFIG.get('bootstrap_tests', False):
        print("  - Bootstrap testing functionality not available")

    # Downstream cells still get a (falsy) result object
    accuracy_bootstrap_results = {}
else:
    accuracy_bootstrap_results = perform_bootstrap_accuracy_testing()
    print(f"\n‚úì Accuracy-based bootstrap testing completed successfully")

In [None]:
# Cell [14] - Bayesian Analysis for Estimation Accuracy vs Ground Truth
# Purpose: Bayesian hypothesis testing with posterior probability statements for accuracy improvements
# Dependencies: accuracy_test_results from Cell 11, pymc, arviz from Cell 0, BAYESIAN_CONFIG
# Breadcrumbs: Setup -> Analysis -> Accuracy Testing -> Bayesian Analysis for Accuracy

def _print_bayesian_banner(title, width=60):
    """Print a blank line followed by *title* framed between two '=' rules of *width* characters."""
    print("\n" + "=" * width)
    print(title)
    print("=" * width)


def _bayesian_error_rate_reduction(hr_data, standard_data, code, label):
    """
    Fit a Beta-Binomial model for one error class and summarise its relative reduction.

    The rate of `code` classifications is modelled separately for each group with a
    uniform Beta(1, 1) prior and a Binomial likelihood. The quantity of interest is
    100 * (p_standard - p_hr) / p_standard, i.e. the percentage reduction achieved by
    the hallucination-reducing group relative to the standard group.

    Args:
        hr_data: Hallucination-reducing group rows; must expose a 'classification' column.
        standard_data: Standard group rows; must expose a 'classification' column.
        code: Classification label to count, e.g. 'FP' or 'FN'. Its lower-cased form is
            used as the suffix of the model variable names (e.g. 'fp_reduction_pct').
        label: Human-readable name used in printed output, e.g. 'False Positive'.

    Returns:
        dict: Posterior summary (mean, std, 95% credible interval, probabilities, trace,
        raw samples), or an empty dict if any step raised (the error is printed).
    """
    try:
        print(f"Building Bayesian model for {label} reduction...")

        # Counts are derived here so this section does not depend on any other
        # section of the analysis having run successfully.
        key = code.lower()
        hr_total = len(hr_data)
        standard_total = len(standard_data)
        hr_count = hr_data['classification'].value_counts().get(code, 0)
        standard_count = standard_data['classification'].value_counts().get(code, 0)

        with pm.Model():
            # Uniform priors for the per-group error rates
            p_hr = pm.Beta(f'p_hr_{key}', alpha=1, beta=1)
            p_standard = pm.Beta(f'p_standard_{key}', alpha=1, beta=1)

            # Likelihood for the observed error counts
            pm.Binomial(f'hr_{key}_obs', n=hr_total, p=p_hr, observed=hr_count)
            pm.Binomial(f'standard_{key}_obs', n=standard_total, p=p_standard, observed=standard_count)

            # Derived quantities (stored in the trace under these names)
            reduction = pm.Deterministic(f'{key}_reduction', p_standard - p_hr)
            reduction_pct = pm.Deterministic(f'{key}_reduction_pct', 100 * reduction / p_standard)
            pm.Deterministic(f'{key}_30_plus', pm.math.switch(reduction_pct > 30, 1, 0))

            trace = pm.sample(
                draws=CONFIG.get('BAYESIAN_SAMPLES', 2000),
                chains=CONFIG.get('BAYESIAN_CHAINS', 4),
                tune=1000,
                target_accept=0.9,
                random_seed=42,
                return_inferencedata=True
            )

        # Flatten chains x draws into a single sample vector
        reduction_samples = trace.posterior[f'{key}_reduction_pct'].values.flatten()

        # NOTE(review): the percentage has p_standard in the denominator, so when the
        # standard group has very few events the mean/std may be dominated by extreme
        # draws - consider also reporting the median.
        posterior_mean = np.mean(reduction_samples)
        posterior_std = np.std(reduction_samples)
        credible_interval = np.percentile(reduction_samples, [2.5, 97.5])

        prob_30_plus = np.mean(reduction_samples > 30)
        prob_any_reduction = np.mean(reduction_samples > 0)

        print(f"\nBayesian Results for {label} Reduction:")
        print(f"  Observed HR {code} rate: {hr_count}/{hr_total} ({hr_count/hr_total:.2%})")
        print(f"  Observed Standard {code} rate: {standard_count}/{standard_total} ({standard_count/standard_total:.2%})")
        print(f"  Posterior mean reduction: {posterior_mean:.1f}%")
        print(f"  Posterior std reduction: {posterior_std:.1f}%")
        print(f"  95% Credible interval: [{credible_interval[0]:.1f}%, {credible_interval[1]:.1f}%]")
        print(f"  P(reduction > 0%): {prob_any_reduction:.3f}")
        print(f"  P(reduction > 30%): {prob_30_plus:.3f}")

        return {
            'posterior_mean': posterior_mean,
            'posterior_std': posterior_std,
            'credible_interval': credible_interval,
            'prob_any_reduction': prob_any_reduction,
            'prob_30_plus': prob_30_plus,
            'trace': trace,
            'posterior_samples': reduction_samples
        }

    except Exception as e:
        # Best-effort: one failing section must not abort the whole analysis
        print(f"Error in Bayesian {code} reduction analysis: {e}")
        return {}


def perform_bayesian_accuracy_analysis():
    """
    Perform comprehensive Bayesian analysis for estimation accuracy vs ground truth
    Tests whether hallucination-reducing techniques improve accuracy by >30% through Bayesian inference

    Reads the module-level `accuracy_test_results`, `STATISTICAL_CONFIG` and `CONFIG`,
    and uses `pm` / `az` (presumably PyMC and ArviZ imported in Cell 0 - confirm).
    Runs seven sections, each in its own try/except so that a failure in one section
    is printed and recorded as an empty dict without stopping the others:

        1. TP-rate ("accuracy") improvement, with ROPE probabilities
        2. False Positive rate reduction
        3. False Negative rate reduction
        4. Treatment-vs-null model comparison via WAIC
        5. Posterior predictive check of the accuracy model
        6. Decision analysis combining sections 1-3
        7. Printed summary

    TP, FP and FN rates are each modelled with independent Beta(1, 1)-Binomial models.

    Returns:
        dict: Bayesian analysis results with posterior probabilities and credible intervals.
        Possible keys: 'accuracy_improvement', 'fp_reduction', 'fn_reduction',
        'model_comparison', 'posterior_predictive', 'decision_analysis'. Returns an
        empty dict when prerequisites are missing or either group is empty.
    """

    print("BAYESIAN ANALYSIS - ESTIMATION ACCURACY vs GROUND TRUTH")
    print("=" * 60)
    print("HYPOTHESIS: Hallucination-reducing techniques improve estimation accuracy by >30%")
    print("METHOD: Bayesian inference with posterior probability statements on accuracy metrics")

    if not accuracy_test_results or not STATISTICAL_CONFIG.get('bayesian_analysis', False):
        print("\nWarning: Insufficient data or Bayesian functionality not available")
        return {}

    accuracy_df = accuracy_test_results['accuracy_df']
    hr_metrics = accuracy_test_results['hr_metrics']
    standard_metrics = accuracy_test_results['standard_metrics']

    # Per-group rows carrying the TP/FP/FN 'classification' column
    hr_data = hr_metrics['group_data']
    standard_data = standard_metrics['group_data']

    print("\nACCURACY DATA SUMMARY:")
    print(f"  Total estimates: {len(accuracy_df)}")
    print(f"  Hallucination-reducing estimates: {len(hr_data)}")
    print(f"  Standard estimates: {len(standard_data)}")
    print(f"  Ground truth baseline: {accuracy_test_results['ground_truth_baseline']:.1f} LOC/SiFP")

    # Both groups must be non-empty; this also protects the rate divisions below
    if len(hr_data) == 0 or len(standard_data) == 0:
        print("Warning: Cannot perform Bayesian analysis - insufficient group sizes")
        return {}

    bayesian_results = {}

    # NOTE(review): the configured IMPROVEMENT_THRESHOLD is only displayed here; every
    # probability below is computed against a literal 30 (%).
    print("\nBAYESIAN ANALYSIS CONFIGURATION:")
    print(f"  MCMC samples: {CONFIG.get('BAYESIAN_SAMPLES', 2000):,}")
    print(f"  MCMC chains: {CONFIG.get('BAYESIAN_CHAINS', 4)}")
    print(f"  Improvement threshold: {CONFIG.get('IMPROVEMENT_THRESHOLD', 0.30):.0%}")
    print("  Random seed: 42 (for reproducibility)")

    # 1. Bayesian analysis for overall accuracy improvement
    _print_bayesian_banner("1. BAYESIAN ANALYSIS: OVERALL ACCURACY IMPROVEMENT")

    try:
        print("Building Bayesian model for accuracy improvement...")

        # "Accuracy" here is the share of estimates classified as TP in each group
        hr_tp_count = hr_data['classification'].value_counts().get('TP', 0)
        hr_total = len(hr_data)
        standard_tp_count = standard_data['classification'].value_counts().get('TP', 0)
        standard_total = len(standard_data)

        with pm.Model() as accuracy_model:
            # Priors for accuracy rates (Beta distribution for proportions)
            p_hr = pm.Beta('p_hr', alpha=1, beta=1)  # Uniform prior
            p_standard = pm.Beta('p_standard', alpha=1, beta=1)

            # Likelihood for observed accuracy counts
            pm.Binomial('hr_successes', n=hr_total, p=p_hr, observed=hr_tp_count)
            pm.Binomial('standard_successes', n=standard_total, p=p_standard, observed=standard_tp_count)

            # Derived quantities of interest
            accuracy_difference = pm.Deterministic('accuracy_difference', p_hr - p_standard)
            accuracy_improvement_pct = pm.Deterministic('accuracy_improvement_pct',
                                                        100 * accuracy_difference / p_standard)

            # Indicator of >30% improvement (kept in the trace for downstream use)
            pm.Deterministic('improvement_30_plus',
                             pm.math.switch(accuracy_improvement_pct > 30, 1, 0))

            # Sample from posterior
            trace_accuracy = pm.sample(
                draws=CONFIG.get('BAYESIAN_SAMPLES', 2000),
                chains=CONFIG.get('BAYESIAN_CHAINS', 4),
                tune=1000,
                target_accept=0.9,
                random_seed=42,
                return_inferencedata=True
            )

        # Flatten chains x draws into a single sample vector
        accuracy_improvement_samples = trace_accuracy.posterior['accuracy_improvement_pct'].values.flatten()

        # Posterior statistics of the relative improvement (in percent)
        accuracy_posterior_mean = np.mean(accuracy_improvement_samples)
        accuracy_posterior_std = np.std(accuracy_improvement_samples)
        accuracy_credible_interval = np.percentile(accuracy_improvement_samples, [2.5, 97.5])

        # Probability of >30% improvement
        prob_30_plus = np.mean(accuracy_improvement_samples > 30)
        prob_any_improvement = np.mean(accuracy_improvement_samples > 0)

        # ROPE analysis (Region of Practical Equivalence: -5% to +5%)
        rope_lower, rope_upper = -5, 5
        prob_in_rope = np.mean((accuracy_improvement_samples >= rope_lower) &
                               (accuracy_improvement_samples <= rope_upper))
        prob_above_rope = np.mean(accuracy_improvement_samples > rope_upper)

        print("\nBayesian Results for Accuracy Improvement:")
        print(f"  Observed HR accuracy: {hr_tp_count}/{hr_total} ({hr_tp_count/hr_total:.2%})")
        print(f"  Observed Standard accuracy: {standard_tp_count}/{standard_total} ({standard_tp_count/standard_total:.2%})")
        print(f"  Posterior mean improvement: {accuracy_posterior_mean:.1f}%")
        print(f"  Posterior std improvement: {accuracy_posterior_std:.1f}%")
        print(f"  95% Credible interval: [{accuracy_credible_interval[0]:.1f}%, {accuracy_credible_interval[1]:.1f}%]")
        print(f"  P(improvement > 0%): {prob_any_improvement:.3f}")
        print(f"  P(improvement > 30%): {prob_30_plus:.3f}")
        print(f"  P(improvement in ROPE [-5%, +5%]): {prob_in_rope:.3f}")
        print(f"  P(improvement > ROPE): {prob_above_rope:.3f}")

        bayesian_results['accuracy_improvement'] = {
            'posterior_mean': accuracy_posterior_mean,
            'posterior_std': accuracy_posterior_std,
            'credible_interval': accuracy_credible_interval,
            'prob_any_improvement': prob_any_improvement,
            'prob_30_plus': prob_30_plus,
            'prob_in_rope': prob_in_rope,
            'prob_above_rope': prob_above_rope,
            'trace': trace_accuracy,
            'posterior_samples': accuracy_improvement_samples
        }

    except Exception as e:
        print(f"Error in Bayesian accuracy improvement analysis: {e}")
        bayesian_results['accuracy_improvement'] = {}

    # 2. Bayesian analysis for False Positive reduction
    _print_bayesian_banner("2. BAYESIAN ANALYSIS: FALSE POSITIVE REDUCTION")
    bayesian_results['fp_reduction'] = _bayesian_error_rate_reduction(
        hr_data, standard_data, 'FP', 'False Positive')

    # 3. Bayesian analysis for False Negative reduction
    _print_bayesian_banner("3. BAYESIAN ANALYSIS: FALSE NEGATIVE REDUCTION")
    bayesian_results['fn_reduction'] = _bayesian_error_rate_reduction(
        hr_data, standard_data, 'FN', 'False Negative')

    # 4. Bayesian Model Comparison using WAIC
    _print_bayesian_banner("4. BAYESIAN MODEL COMPARISON")

    try:
        print("Comparing models with and without hallucination-reducing effect...")

        # Only possible when section 1 produced a trace (its counts are reused below)
        if bayesian_results.get('accuracy_improvement'):
            trace_with_effect = bayesian_results['accuracy_improvement']['trace']

            # Null model (no treatment effect - pooled accuracy)
            with pm.Model():
                # Single accuracy rate for all estimates
                p_pooled = pm.Beta('p_pooled', alpha=1, beta=1)

                # All observations from same distribution
                total_tp = hr_tp_count + standard_tp_count
                total_estimates = hr_total + standard_total
                pm.Binomial('pooled_obs', n=total_estimates, p=p_pooled, observed=total_tp)

                trace_null = pm.sample(
                    draws=CONFIG.get('BAYESIAN_SAMPLES', 2000),
                    chains=CONFIG.get('BAYESIAN_CHAINS', 4),
                    tune=1000,
                    random_seed=42,
                    return_inferencedata=True
                )

            # NOTE(review): az.waic needs pointwise log-likelihood stored in the
            # InferenceData, and the treatment model has two observed variables;
            # depending on the installed PyMC/ArviZ versions these calls (or the
            # `.waic` / `.waic_se` attributes) may raise, which is then reported by
            # the except below. The two models also condition on differently shaped
            # data (two counts vs. one pooled count), and the thresholds below assume
            # lower-is-better WAIC - confirm before relying on this section.
            waic_with_effect = az.waic(trace_with_effect)
            waic_null = az.waic(trace_null)

            # Difference in WAIC between the treatment and the null model
            delta_waic = waic_with_effect.waic - waic_null.waic

            print("Model Comparison Results:")
            print(f"  Treatment effect model WAIC: {waic_with_effect.waic:.2f} ¬± {waic_with_effect.waic_se:.2f}")
            print(f"  Null model WAIC: {waic_null.waic:.2f} ¬± {waic_null.waic_se:.2f}")
            print(f"  ŒîWAIC (effect - null): {delta_waic:.2f}")

            if delta_waic < -2:
                model_preference = "Strong evidence for treatment effect"
            elif delta_waic < 0:
                model_preference = "Moderate evidence for treatment effect"
            elif delta_waic < 2:
                model_preference = "Weak evidence either way"
            else:
                model_preference = "Evidence against treatment effect"

            print(f"  Model preference: {model_preference}")

            bayesian_results['model_comparison'] = {
                'waic_with_effect': waic_with_effect.waic,
                'waic_null': waic_null.waic,
                'delta_waic': delta_waic,
                'preference': model_preference
            }

    except Exception as e:
        print(f"Error in Bayesian model comparison: {e}")
        bayesian_results['model_comparison'] = {}

    # 5. Posterior Predictive Checks
    _print_bayesian_banner("5. POSTERIOR PREDICTIVE CHECKS")

    try:
        if bayesian_results.get('accuracy_improvement'):
            trace_accuracy = bayesian_results['accuracy_improvement']['trace']

            # Generate posterior predictive samples from the section 1 model
            with accuracy_model:
                ppc = pm.sample_posterior_predictive(trace_accuracy, random_seed=42)

            # Observed TP rates and their difference
            obs_hr_rate = hr_tp_count / hr_total
            obs_standard_rate = standard_tp_count / standard_total
            obs_difference = obs_hr_rate - obs_standard_rate

            # Replicated counts converted to rates
            pred_hr_rates = ppc.posterior_predictive['hr_successes'].values.flatten() / hr_total
            pred_standard_rates = ppc.posterior_predictive['standard_successes'].values.flatten() / standard_total
            pred_differences = pred_hr_rates - pred_standard_rates

            # Bayesian p-value: share of replicated differences at least as large as observed
            p_value_difference = np.mean(pred_differences >= obs_difference)

            print("Posterior Predictive Check Results:")
            print(f"  Observed HR accuracy: {obs_hr_rate:.2%}")
            print(f"  Predicted HR accuracy: {np.mean(pred_hr_rates):.2%} ¬± {np.std(pred_hr_rates):.2%}")
            print(f"  Observed Standard accuracy: {obs_standard_rate:.2%}")
            print(f"  Predicted Standard accuracy: {np.mean(pred_standard_rates):.2%} ¬± {np.std(pred_standard_rates):.2%}")
            print(f"  Bayesian p-value (difference): {p_value_difference:.3f}")

            bayesian_results['posterior_predictive'] = {
                'obs_hr_rate': obs_hr_rate,
                'obs_standard_rate': obs_standard_rate,
                'pred_hr_rates': pred_hr_rates,
                'pred_standard_rates': pred_standard_rates,
                'p_value_difference': p_value_difference
            }

    except Exception as e:
        print(f"Error in posterior predictive checks: {e}")
        bayesian_results['posterior_predictive'] = {}

    # 6. Bayesian Decision Analysis
    _print_bayesian_banner("6. BAYESIAN DECISION ANALYSIS")

    try:
        # Needs the posterior samples of all three rate models
        required_sections = ('accuracy_improvement', 'fp_reduction', 'fn_reduction')
        if all(bayesian_results.get(section) for section in required_sections):

            accuracy_samples = bayesian_results['accuracy_improvement']['posterior_samples']
            fp_samples = bayesian_results['fp_reduction']['posterior_samples']
            fn_samples = bayesian_results['fn_reduction']['posterior_samples']

            # The three sample vectors come from separately fitted models and are
            # combined draw-by-draw, i.e. the metrics are treated as independent.
            # Combined probability of achieving >30% in any metric
            prob_any_30 = np.mean((accuracy_samples > 30) | (fp_samples > 30) | (fn_samples > 30))

            # Combined probability of achieving >30% in all metrics
            prob_all_30 = np.mean((accuracy_samples > 30) & (fp_samples > 30) & (fn_samples > 30))

            # Expected values
            expected_accuracy_improvement = np.mean(accuracy_samples)
            expected_fp_reduction = np.mean(fp_samples)
            expected_fn_reduction = np.mean(fn_samples)

            # Risk assessment (probability of negative outcomes)
            prob_accuracy_worse = np.mean(accuracy_samples < -10)  # >10% worse accuracy
            prob_fp_worse = np.mean(fp_samples < -10)  # >10% worse FP rate
            prob_fn_worse = np.mean(fn_samples < -10)  # >10% worse FN rate

            print("Bayesian Decision Analysis:")
            print(f"  P(>30% improvement in any metric): {prob_any_30:.3f}")
            print(f"  P(>30% improvement in all metrics): {prob_all_30:.3f}")
            print(f"  Expected accuracy improvement: {expected_accuracy_improvement:.1f}%")
            print(f"  Expected FP reduction: {expected_fp_reduction:.1f}%")
            print(f"  Expected FN reduction: {expected_fn_reduction:.1f}%")
            print(f"  P(>10% worse accuracy): {prob_accuracy_worse:.3f}")
            print(f"  P(>10% worse FP rate): {prob_fp_worse:.3f}")
            print(f"  P(>10% worse FN rate): {prob_fn_worse:.3f}")

            # Decision recommendation, driven solely by P(any metric > 30%)
            if prob_any_30 > 0.8:
                decision = "STRONG RECOMMENDATION: Adopt hallucination-reducing techniques"
            elif prob_any_30 > 0.6:
                decision = "MODERATE RECOMMENDATION: Consider adoption with monitoring"
            elif prob_any_30 > 0.4:
                decision = "WEAK RECOMMENDATION: Proceed with caution"
            else:
                decision = "NOT RECOMMENDED: Insufficient evidence of benefit"

            print(f"\nDecision Recommendation: {decision}")

            bayesian_results['decision_analysis'] = {
                'prob_any_30': prob_any_30,
                'prob_all_30': prob_all_30,
                'expected_accuracy_improvement': expected_accuracy_improvement,
                'expected_fp_reduction': expected_fp_reduction,
                'expected_fn_reduction': expected_fn_reduction,
                'prob_accuracy_worse': prob_accuracy_worse,
                'prob_fp_worse': prob_fp_worse,
                'prob_fn_worse': prob_fn_worse,
                'recommendation': decision
            }

    except Exception as e:
        print(f"Error in Bayesian decision analysis: {e}")
        bayesian_results['decision_analysis'] = {}

    # 7. Bayesian Analysis Summary
    _print_bayesian_banner("BAYESIAN ANALYSIS SUMMARY - ACCURACY vs GROUND TRUTH", width=70)

    print("\nKey Posterior Probabilities:")
    # (result section, key inside that section, label shown in the summary line)
    summary_rows = (
        ('accuracy_improvement', 'prob_30_plus', 'Accuracy improvement'),
        ('fp_reduction', 'prob_30_plus', 'FP reduction'),
        ('fn_reduction', 'prob_30_plus', 'FN reduction'),
        ('decision_analysis', 'prob_any_30', 'Any metric'),
    )
    for section, prob_key, row_label in summary_rows:
        # Sections that failed (empty dict) or never ran are silently omitted
        if bayesian_results.get(section):
            print(f"  P({row_label} > 30%): {bayesian_results[section][prob_key]:.3f}")

    print("\nModel Evidence:")
    if bayesian_results.get('model_comparison'):
        print(f"  {bayesian_results['model_comparison']['preference']}")
        print(f"  ŒîWAIC: {bayesian_results['model_comparison']['delta_waic']:.2f}")

    print(f"\nADVANTAGES OF BAYESIAN ACCURACY ANALYSIS:")
    print(f"  ‚úì Direct probability statements about accuracy improvements")
    print(f"  ‚úì Credible intervals for TP/FP/FN rates")
    print(f"  ‚úì Incorporates uncertainty in ground truth comparisons")
    print(f"  ‚úì Model comparison for treatment vs null hypotheses")
    print(f"  ‚úì Decision-theoretic framework for accuracy-based recommendations")
    print(f"  ‚úì Handles small sample sizes with proper uncertainty quantification")

    return bayesian_results

# Run the Bayesian accuracy analysis when Cell 11 results exist and Bayesian
# analysis is enabled; otherwise report what is missing.
if ('accuracy_test_results' not in globals() or not accuracy_test_results
        or not STATISTICAL_CONFIG.get('bayesian_analysis', False)):
    print("Cannot perform Bayesian accuracy analysis:")
    # List every unmet prerequisite (both may apply)
    if not ('accuracy_test_results' in globals() and accuracy_test_results):
        print("  - Accuracy test results not available (run Cell 11 first)")
    if not STATISTICAL_CONFIG.get('bayesian_analysis', False):
        print("  - Bayesian analysis functionality not available")
        print("  - Install with: pip install 'praxis-requirements-analyzer[dev]'")

    # Downstream cells still get a (falsy) result object
    bayesian_accuracy_results = {}
else:
    bayesian_accuracy_results = perform_bayesian_accuracy_analysis()
    print(f"\n‚úì Bayesian accuracy analysis completed successfully")

In [None]:
# Cell [15] - Enhanced Statistical Validation Visualizations with Advanced Methods (UPDATED FOR ACCURACY)
# Purpose: Create comprehensive visualizations for all statistical tests including permutation, bootstrap, and Bayesian results
# Dependencies: accuracy_test_results, accuracy_permutation_results, accuracy_bootstrap_results, bayesian_accuracy_results from previous cells
# Breadcrumbs: Setup -> Analysis -> Advanced Statistical Testing -> Enhanced Validation Visualizations for Accuracy

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import bootstrap
import numpy as np

def safe_format_value(value, format_spec, fallback='N/A'):
    """
    Safely format a value with the given format specification

    The value is converted with ``float()`` before formatting, so numeric strings
    are accepted and integer-only format codes (e.g. 'd') yield the fallback.

    Args:
        value: The value to format
        format_spec: Format specification (e.g., '.4f', '.1f')
        fallback: Value to return if formatting fails

    Returns:
        str: Formatted value or fallback
    """
    # Explicit "missing" markers: None and any casing of the string 'N/A'
    if value is None:
        return fallback
    if isinstance(value, str) and value.lower() == 'n/a':
        return fallback

    try:
        float_value = float(value)
        return f"{float_value:{format_spec}}"
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric string or invalid format spec
        # TypeError: value cannot be converted to float at all
        # OverflowError: e.g. an int too large to be represented as a float
        return fallback

# Result keys shared by the permutation and bootstrap result dictionaries, in
# display order. The Bayesian results use the first three.
_ACCURACY_TEST_KEYS = ('accuracy_improvement', 'fp_reduction', 'fn_reduction', 'error_reduction')
_ACCURACY_TEST_LABELS = ('Accuracy Improvement', 'FP Reduction', 'FN Reduction', 'Error Reduction')


def _results_dict(name):
    """Return the notebook-level dict called ``name``, or ``{}`` if it is undefined or not a dict."""
    candidate = globals().get(name)
    return candidate if isinstance(candidate, dict) else {}


def _sub_result(results, key):
    """Return ``results[key]`` when it is a dict, otherwise ``{}``."""
    entry = results.get(key) if isinstance(results, dict) else None
    return entry if isinstance(entry, dict) else {}


def _as_float(value, default=0.0):
    """Convert ``value`` to float, returning ``default`` when it is not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _as_percent(value, format_spec='.1f'):
    """Format ``value`` as a percentage string, or plain 'N/A' (no trailing '%') when it is missing."""
    text = safe_format_value(value, format_spec)
    return text if text == 'N/A' else f"{text}%"


def _interval_text(interval):
    """Format a (low, high) pair as '[low, high]', or 'N/A' when it is missing or malformed."""
    try:
        low, high = interval[0], interval[1]
    except (TypeError, IndexError, KeyError):
        return 'N/A'
    return f"[{safe_format_value(low, '.1f')}, {safe_format_value(high, '.1f')}]"


def _observed_effect(result):
    """Return a bootstrap result's 'observed_improvement', else 'observed_reduction', else 0."""
    if 'observed_improvement' in result:
        return result['observed_improvement']
    return result.get('observed_reduction', 0)


def _count_flagged(results, flag):
    """Return (entries with a truthy ``flag``, entries present) over the standard accuracy test keys."""
    present = [results[key] for key in _ACCURACY_TEST_KEYS if key in results]
    flagged = sum(1 for entry in present if isinstance(entry, dict) and entry.get(flag, False))
    return flagged, len(present)


def _show_unavailable(ax, message, title):
    """Replace a panel with a centred 'not available' message and a title."""
    ax.text(0.5, 0.5, message, ha='center', va='center',
            transform=ax.transAxes, fontsize=14)
    ax.set_title(title)


def _significance_lines(classical_tests, perm_results, boot_results, bayes_results):
    """Collect the one-line headline statistic of each available method for the panel-1 annotation."""
    lines = []
    if 'chi_square' in classical_tests:
        p_val = _sub_result(classical_tests, 'chi_square').get('p_value', 'N/A')
        lines.append(f"χ²-test: p={safe_format_value(p_val, '.4f')}")
    if 'accuracy_improvement' in perm_results:
        perm_p = _sub_result(perm_results, 'accuracy_improvement').get('p_value', 'N/A')
        lines.append(f"Permutation: p={safe_format_value(perm_p, '.4f')}")
    if 'accuracy_improvement' in boot_results:
        boot_p = _sub_result(boot_results, 'accuracy_improvement').get('p_value_30_percent', 'N/A')
        lines.append(f"Bootstrap: p={safe_format_value(boot_p, '.4f')}")
    if 'accuracy_improvement' in bayes_results:
        bayes_prob = _sub_result(bayes_results, 'accuracy_improvement').get('prob_30_plus', 'N/A')
        lines.append(f"Bayesian P(>30%): {safe_format_value(bayes_prob, '.3f')}")
    return lines


def _plot_classification_counts(ax, hr_data, standard_data, annotation_lines):
    """Panel 1: grouped TP/FP/FN count bars for both groups, with per-group percentage labels."""
    categories = ['TP', 'FP', 'FN']
    hr_tally = hr_data['classification'].value_counts()
    standard_tally = standard_data['classification'].value_counts()
    # .get(..., 0) keeps categories that never occur in a group on the chart.
    hr_counts = [hr_tally.get(category, 0) for category in categories]
    standard_counts = [standard_tally.get(category, 0) for category in categories]

    positions = np.arange(len(categories))
    width = 0.35
    ax.bar(positions - width/2, standard_counts, width, label='Standard', alpha=0.7, color='lightcoral')
    ax.bar(positions + width/2, hr_counts, width, label='Hallucination-Reducing', alpha=0.7, color='lightgreen')

    ax.set_xlabel('Classification')
    ax.set_ylabel('Count')
    ax.set_title('Accuracy Classifications: HR vs Standard\nTP=Correct, FP=Overestimate, FN=Underestimate')
    ax.set_xticks(positions)
    ax.set_xticklabels(categories)
    ax.legend()
    ax.grid(True, alpha=0.3)

    # Percentage of each group's estimates that fall in each category.
    standard_total = sum(standard_counts)
    hr_total = sum(hr_counts)
    for position, (standard_count, hr_count) in enumerate(zip(standard_counts, hr_counts)):
        if standard_total > 0:
            ax.text(position - width/2, standard_count + 0.1, f'{standard_count/standard_total:.1%}',
                    ha='center', va='bottom', fontsize=8)
        if hr_total > 0:
            ax.text(position + width/2, hr_count + 0.1, f'{hr_count/hr_total:.1%}',
                    ha='center', va='bottom', fontsize=8)

    if annotation_lines:
        ax.text(0.02, 0.98, "\n".join(annotation_lines), transform=ax.transAxes,
                verticalalignment='top', fontsize=9,
                bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))


def _plot_permutation_null(ax, perm_results):
    """Panel 2: permutation null distribution with the observed statistic and p-value."""
    perm_data = _sub_result(perm_results, 'accuracy_improvement')
    if 'null_distribution' not in perm_data:
        _show_unavailable(ax, 'Permutation Test\nNot Available', 'Permutation Test Results')
        return

    ax.hist(perm_data['null_distribution'], bins=50, alpha=0.7, color='lightblue', density=True,
            label='Null Distribution')
    observed_stat = perm_data.get('observed_statistic')
    if observed_stat is not None:
        ax.axvline(observed_stat, color='red', linestyle='--', linewidth=2,
                   label=f'Observed: {safe_format_value(observed_stat, ".3f")}')
    ax.set_xlabel('Accuracy Difference (HR - Standard)')
    ax.set_ylabel('Density')
    ax.set_title('Permutation Test: Accuracy Improvement\nNull Distribution vs Observed')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # Highlight the p-value box only when the p-value is numeric and below 0.05.
    p_val = perm_data.get('p_value', 'N/A')
    try:
        highlight = float(p_val) < 0.05
    except (TypeError, ValueError):
        highlight = False
    ax.text(0.95, 0.95, f'p = {safe_format_value(p_val, ".4f")}', transform=ax.transAxes,
            ha='right', va='top', fontsize=12,
            bbox=dict(boxstyle='round', facecolor='yellow' if highlight else 'white'))


def _plot_bootstrap_intervals(ax, boot_results):
    """Panel 3: observed improvement per metric with bootstrap confidence-interval error bars."""
    display_names = ['Accuracy\nImprovement', 'FP\nReduction', 'FN\nReduction', 'Error\nReduction']
    labels, effects, lower_errors, upper_errors = [], [], [], []

    for key, display_name in zip(_ACCURACY_TEST_KEYS, display_names):
        result = _sub_result(boot_results, key)
        if not result:
            continue
        effect = _observed_effect(result)
        # Without a stored interval, draw a zero-length bar rather than one
        # stretching to 0, which would look like a real interval.
        interval = result.get('confidence_interval', (effect, effect))
        labels.append(display_name)
        effects.append(effect)
        # matplotlib rejects negative error lengths, which occur whenever the
        # interval does not contain the observed value; clamp them to 0.
        lower_errors.append(max(effect - interval[0], 0))
        upper_errors.append(max(interval[1] - effect, 0))

    if not labels:
        _show_unavailable(ax, 'Bootstrap Results\nNot Available', 'Bootstrap Confidence Intervals')
        return

    positions = np.arange(len(labels))
    ax.bar(positions, effects, alpha=0.7,
           color=['green' if effect > 30 else 'orange' if effect > 0 else 'red'
                  for effect in effects])
    ax.errorbar(positions, effects, yerr=[lower_errors, upper_errors], fmt='none',
                color='black', capsize=5, capthick=2)

    ax.axhline(y=30, color='red', linestyle='--', linewidth=2, label='30% Threshold')
    ax.axhline(y=0, color='black', linestyle='-', linewidth=1)
    ax.set_xticks(positions)
    ax.set_xticklabels(labels)
    ax.set_ylabel('Improvement (%)')
    ax.set_title('Bootstrap Confidence Intervals\nfor Accuracy Improvement Metrics')
    ax.legend()
    ax.grid(True, alpha=0.3)


def _plot_posterior_overview(ax, bayes_results):
    """Panel 4: overlaid posterior histograms for the accuracy, FP and FN metrics."""
    if not bayes_results:
        _show_unavailable(ax, 'Bayesian Results\nNot Available', 'Bayesian Posterior Distributions')
        return

    series = zip(_ACCURACY_TEST_KEYS[:3],
                 ('Accuracy', 'FP Reduction', 'FN Reduction'),
                 ('skyblue', 'lightgreen', 'lightcoral'))
    for key, display_name, color in series:
        result = _sub_result(bayes_results, key)
        if 'posterior_samples' in result:
            # Flatten so multi-dimensional sample arrays plot as one histogram.
            ax.hist(np.ravel(result['posterior_samples']), bins=30, alpha=0.6, color=color,
                    density=True, label=display_name)

    ax.axvline(30, color='red', linestyle='--', linewidth=2, label='30% Threshold')
    ax.axvline(0, color='black', linestyle='-', linewidth=1)
    ax.set_xlabel('Improvement (%)')
    ax.set_ylabel('Posterior Density')
    ax.set_title('Bayesian Posterior Distributions\nAccuracy Improvements')
    ax.legend()
    ax.grid(True, alpha=0.3)

    if 'accuracy_improvement' in bayes_results:
        prob_30 = _sub_result(bayes_results, 'accuracy_improvement').get('prob_30_plus', 0)
        ax.text(0.02, 0.95, f'P(Accuracy >30%): {safe_format_value(prob_30, ".3f")}',
                transform=ax.transAxes, va='top', fontsize=10,
                bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))


def _plot_error_distributions(ax, hr_data, standard_data, classical_tests):
    """Panel 5: density histograms of 'accuracy_error_pct' for both groups, with means and Mann-Whitney info."""
    hr_errors = hr_data['accuracy_error_pct'].values
    standard_errors = standard_data['accuracy_error_pct'].values
    hr_mean = np.mean(hr_errors)
    standard_mean = np.mean(standard_errors)

    ax.hist(standard_errors, bins=20, alpha=0.6, color='lightcoral', label='Standard', density=True)
    ax.hist(hr_errors, bins=20, alpha=0.6, color='lightgreen', label='Hallucination-Reducing', density=True)
    ax.axvline(standard_mean, color='red', linestyle='--', linewidth=2,
               label=f'Standard Mean: {standard_mean:.1f}%')
    ax.axvline(hr_mean, color='green', linestyle='--', linewidth=2,
               label=f'HR Mean: {hr_mean:.1f}%')

    # Relative reduction of the mean error; undefined when the baseline mean
    # is zero or not finite (e.g. an empty group).
    if np.isfinite(standard_mean) and standard_mean != 0:
        improvement = ((standard_mean - hr_mean) / standard_mean) * 100
        stats_text = f'Error Reduction: {improvement:.1f}%\n'
    else:
        stats_text = 'Error Reduction: N/A\n'

    if 'mann_whitney' in classical_tests:
        mann_whitney = _sub_result(classical_tests, 'mann_whitney')
        stats_text += f"Mann-Whitney U: {safe_format_value(mann_whitney.get('statistic', 'N/A'), '.1f')}\n"
        stats_text += f"p-value: {safe_format_value(mann_whitney.get('p_value', 'N/A'), '.4f')}"

    ax.text(0.02, 0.98, stats_text, transform=ax.transAxes,
            verticalalignment='top', fontsize=11,
            bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    ax.set_xlabel('Accuracy Error (%)')
    ax.set_ylabel('Density')
    ax.set_title('Error Distribution Comparison\nHallucination-Reducing vs Standard Techniques')
    ax.legend()
    ax.grid(True, alpha=0.3)


def _build_method_rows(test_results, classical_tests, perm_results, boot_results, bayes_results):
    """Build the four table rows (classical, permutation, bootstrap, Bayesian); missing methods are all 'N/A'."""
    unavailable = ['N/A', 'N/A', 'N/A', 'N/A']
    rows = []

    if classical_tests:
        chi_sq = _sub_result(classical_tests, 'chi_square')
        rows.append([
            safe_format_value(chi_sq.get('p_value'), '.4f'),
            _as_percent(_sub_result(test_results, 'improvements').get('accuracy'), '+.1f'),
            'Classical CI' if chi_sq else 'N/A',
            'Significant' if chi_sq.get('significant', False) else 'Not Significant',
        ])
    else:
        rows.append(list(unavailable))

    if 'accuracy_improvement' in perm_results:
        perm_result = _sub_result(perm_results, 'accuracy_improvement')
        rows.append([
            safe_format_value(perm_result.get('p_value'), '.4f'),
            _as_percent(perm_result.get('improvement_pct')),
            'Exact CI Available',
            'Significant' if perm_result.get('significant', False) else 'Not Significant',
        ])
    else:
        rows.append(list(unavailable))

    if 'accuracy_improvement' in boot_results:
        boot_result = _sub_result(boot_results, 'accuracy_improvement')
        rows.append([
            safe_format_value(boot_result.get('p_value_30_percent'), '.4f'),
            _as_percent(boot_result.get('observed_improvement')),
            _interval_text(boot_result.get('confidence_interval')),
            'Meets Threshold' if boot_result.get('meets_threshold', False) else 'Below Threshold',
        ])
    else:
        rows.append(list(unavailable))

    if 'accuracy_improvement' in bayes_results:
        bayes_result = _sub_result(bayes_results, 'accuracy_improvement')
        probability = _as_float(bayes_result.get('prob_30_plus', 0))
        if probability > 0.8:
            evidence = 'Strong Evidence'
        elif probability > 0.6:
            evidence = 'Moderate Evidence'
        else:
            evidence = 'Weak Evidence'
        rows.append([
            safe_format_value(bayes_result.get('prob_30_plus'), '.3f'),
            _as_percent(bayes_result.get('posterior_mean')),
            _interval_text(bayes_result.get('credible_interval')),
            evidence,
        ])
    else:
        rows.append(list(unavailable))

    return rows


def _plot_methods_table(ax, table_data):
    """Panel 6: render the method comparison table and colour its verdict cells."""
    methods = ['Classical Tests', 'Permutation Test', 'Bootstrap Test', 'Bayesian Analysis']
    metrics = ['p-value/Probability', 'Accuracy Improvement', 'Confidence Interval', 'Interpretation']

    ax.axis('tight')
    ax.axis('off')
    table = ax.table(cellText=table_data,
                     rowLabels=methods,
                     colLabels=metrics,
                     cellLoc='center',
                     loc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(9)
    table.scale(1.2, 2)

    # Exact-match lookup: a substring test would classify 'Not Significant'
    # as 'Significant' and paint negative verdicts green.
    verdict_colors = {
        'Significant': 'lightgreen',
        'Strong Evidence': 'lightgreen',
        'Not Significant': 'lightcoral',
        'Weak Evidence': 'lightcoral',
        'Moderate Evidence': 'lightyellow',
    }
    for row_index, row in enumerate(table_data):
        for col_index, cell_text in enumerate(row):
            color = verdict_colors.get(cell_text)
            if color is not None:
                # Row 0 of the table holds the column labels, hence the +1.
                table[(row_index + 1, col_index)].set_facecolor(color)

    ax.set_title('Statistical Methods Comparison Summary\nAccuracy vs Ground Truth Analysis', pad=20, fontsize=14)


def _plot_detail_row(fig, gs, boot_results, bayes_results, max_panels=4):
    """Panels 7-10: fill grid row 2 with bootstrap distributions first, then Bayesian posteriors, up to ``max_panels``."""
    # Function-scope import so this panel does not depend on a `stats` name
    # imported by a different notebook cell.
    from scipy.stats import gaussian_kde

    used = 0

    for key, label in zip(_ACCURACY_TEST_KEYS, _ACCURACY_TEST_LABELS):
        if used >= max_panels:
            break
        result = _sub_result(boot_results, key)
        if 'bootstrap_distribution' not in result:
            continue

        ax = fig.add_subplot(gs[2, used])
        observed = _observed_effect(result)
        ax.hist(result['bootstrap_distribution'], bins=40, alpha=0.7, color='steelblue', density=True)
        ax.axvline(observed, color='red', linestyle='-', linewidth=2,
                   label=f'Observed: {safe_format_value(observed, ".1f")}%')
        interval = result.get('confidence_interval')
        if interval is not None:
            ax.axvline(interval[0], color='orange', linestyle='--', alpha=0.7)
            ax.axvline(interval[1], color='orange', linestyle='--', alpha=0.7,
                       label=f'95% CI: {_interval_text(interval)}')
        ax.axvline(30, color='green', linestyle=':', linewidth=2, label='30% Threshold')

        ax.set_xlabel('Improvement (%)')
        ax.set_ylabel('Bootstrap Density')
        ax.set_title(f'Bootstrap Distribution\n{label}')
        ax.legend(fontsize=8)
        ax.grid(True, alpha=0.3)
        used += 1

    for key, label in zip(_ACCURACY_TEST_KEYS[:3], _ACCURACY_TEST_LABELS[:3]):
        if used >= max_panels:
            break
        result = _sub_result(bayes_results, key)
        if 'posterior_samples' not in result:
            continue

        ax = fig.add_subplot(gs[2, used])
        # Accept lists and multi-dimensional arrays alike.
        samples = np.asarray(result['posterior_samples'], dtype=float).ravel()
        ax.hist(samples, bins=40, alpha=0.7, color='lightcoral', density=True)
        interval = result.get('credible_interval')
        if interval is not None:
            ax.axvline(interval[0], color='blue', linestyle='--', alpha=0.7)
            ax.axvline(interval[1], color='blue', linestyle='--', alpha=0.7,
                       label=f'95% Credible: {_interval_text(interval)}')
        ax.axvline(30, color='green', linestyle=':', linewidth=2, label='30% Threshold')

        # Shade the posterior mass above 30%. The density estimate cannot be
        # built from too few or constant samples; skip the shading then.
        try:
            x_range = np.linspace(samples.min(), samples.max(), 1000)
            density = gaussian_kde(samples)(x_range)
        except (ValueError, np.linalg.LinAlgError):
            pass
        else:
            ax.fill_between(x_range, 0, density, where=(x_range >= 30),
                            alpha=0.3, color='green',
                            label=f'P(>30%) = {safe_format_value(result.get("prob_30_plus"), ".3f")}')

        ax.set_xlabel('Improvement (%)')
        ax.set_ylabel('Posterior Density')
        ax.set_title(f'Bayesian Posterior\n{label}')
        ax.legend(fontsize=8)
        ax.grid(True, alpha=0.3)
        used += 1


def _summarize_evidence(classical_tests, perm_results, boot_results, bayes_results):
    """
    Build the panel-11 evidence text and the per-method tally

    Only methods that actually produced results are counted, so an empty
    placeholder dict does not dilute the ratio behind the conclusion.

    Returns:
        tuple: (summary_text, significant_methods, total_methods)
    """
    parts = ["COMPREHENSIVE STATISTICAL EVIDENCE SUMMARY - ACCURACY vs GROUND TRUTH\n" + "="*90 + "\n\n"]
    verdicts = []  # one boolean per method that has results

    # Classical statistics
    if classical_tests:
        tests = list(classical_tests.values())
        significant_classical = sum(1 for test in tests if isinstance(test, dict) and test.get('significant', False))
        total_classical = len([test for test in tests if test])
        parts.append(f"📊 CLASSICAL STATISTICS: {significant_classical}/{total_classical} SIGNIFICANT\n")
        if 'chi_square' in classical_tests:
            parts.append(f"   χ² test p-value: {safe_format_value(_sub_result(classical_tests, 'chi_square').get('p_value'), '.4f')}\n")
        if 'mann_whitney' in classical_tests:
            parts.append(f"   Mann-Whitney U p-value: {safe_format_value(_sub_result(classical_tests, 'mann_whitney').get('p_value'), '.4f')}\n")
        verdicts.append(significant_classical > 0)
    parts.append("\n")

    # Permutation tests
    if perm_results:
        perm_sig_count, perm_total = _count_flagged(perm_results, 'significant')
        parts.append(f"🔄 PERMUTATION TESTS: {perm_sig_count}/{perm_total} SIGNIFICANT\n")
        if 'accuracy_improvement' in perm_results:
            parts.append(f"   Accuracy improvement p-value: {safe_format_value(_sub_result(perm_results, 'accuracy_improvement').get('p_value'), '.4f')}\n")
        if 'fp_reduction' in perm_results:
            parts.append(f"   FP reduction p-value: {safe_format_value(_sub_result(perm_results, 'fp_reduction').get('p_value'), '.4f')}\n")
        verdicts.append(any(test.get('significant', False)
                            for test in perm_results.values() if isinstance(test, dict)))
    parts.append("\n")

    # Bootstrap results
    if boot_results:
        boot_threshold_count, boot_total = _count_flagged(boot_results, 'meets_threshold')
        parts.append(f"🔀 BOOTSTRAP ANALYSIS: {boot_threshold_count}/{boot_total} MEET >30% THRESHOLD\n")
        if 'accuracy_improvement' in boot_results:
            conf = _sub_result(boot_results, 'accuracy_improvement').get('threshold_confidence', 0)
            parts.append(f"   Accuracy improvement confidence for 30%: {safe_format_value(conf, '.3f')}\n")
        if 'fp_reduction' in boot_results:
            conf = _sub_result(boot_results, 'fp_reduction').get('threshold_confidence', 0)
            parts.append(f"   FP reduction confidence for 30%: {safe_format_value(conf, '.3f')}\n")
        verdicts.append(any(test.get('meets_threshold', False)
                            for test in boot_results.values() if isinstance(test, dict)))
    parts.append("\n")

    # Bayesian results
    if bayes_results:
        parts.append("🎯 BAYESIAN ANALYSIS: POSTERIOR PROBABILITIES\n")
        for key, caption in (('accuracy_improvement', 'Accuracy improvement'),
                             ('fp_reduction', 'FP reduction'),
                             ('fn_reduction', 'FN reduction')):
            if key in bayes_results:
                prob = _sub_result(bayes_results, key).get('prob_30_plus', 0)
                parts.append(f"   P({caption} > 30%): {safe_format_value(prob, '.3f')}\n")
        if 'decision_analysis' in bayes_results:
            decision = _sub_result(bayes_results, 'decision_analysis')
            parts.append(f"   P(Any metric > 30%): {safe_format_value(decision.get('prob_any_30', 0), '.3f')}\n")
            parts.append(f"   Decision: {decision.get('recommendation', 'N/A')}\n")
        verdicts.append(any(_as_float(test.get('prob_30_plus', 0)) > 0.8
                            for test in bayes_results.values()
                            if isinstance(test, dict) and 'prob_30_plus' in test))

    parts.append("\n" + "="*90 + "\n")
    parts.append("🎯 OVERALL CONCLUSION: ")

    total_methods = len(verdicts)
    significant_methods = sum(verdicts)
    if total_methods == 0:
        # Without this guard 0 >= 0 would report "STRONG EVIDENCE" from no data.
        conclusion = "NO STATISTICAL RESULTS AVAILABLE to evaluate >30% accuracy improvement hypothesis"
    elif significant_methods >= total_methods * 0.75:
        conclusion = "STRONG EVIDENCE for >30% accuracy improvement hypothesis"
    elif significant_methods >= total_methods * 0.5:
        conclusion = "MODERATE EVIDENCE for >30% accuracy improvement hypothesis"
    else:
        conclusion = "WEAK EVIDENCE for >30% accuracy improvement hypothesis"
    parts.append(conclusion)

    return "".join(parts), significant_methods, total_methods


def create_enhanced_accuracy_visualizations():
    """
    Create comprehensive visualizations to support all statistical hypothesis testing methods for accuracy analysis

    Reads the notebook-level dictionaries `accuracy_test_results`,
    `accuracy_permutation_results`, `accuracy_bootstrap_results` and
    `bayesian_accuracy_results`. Any of the last three may be undefined or
    empty; the corresponding panels then show a "Not Available" message and
    the method is left out of the overall tally.

    Figure layout (4 x 4 grid):
        Row 0: classification counts, permutation null distribution,
               bootstrap confidence intervals, Bayesian posteriors
        Row 1: error distribution comparison, method comparison table
        Row 2: up to four detailed bootstrap / Bayesian distributions
        Row 3: text summary with the overall conclusion

    Returns:
        None. Shows the figure and prints a short completion report, or
        prints a notice and returns early when no accuracy data is available.
    """
    test_results = _results_dict('accuracy_test_results')
    if 'accuracy_df' not in test_results:
        print("No accuracy data available for statistical visualizations")
        return

    hr_data = test_results['hr_metrics']['group_data']
    standard_data = test_results['standard_metrics']['group_data']

    classical_tests = _sub_result(test_results, 'statistical_tests')
    perm_results = _results_dict('accuracy_permutation_results')
    boot_results = _results_dict('accuracy_bootstrap_results')
    bayes_results = _results_dict('bayesian_accuracy_results')

    # Set up the enhanced figure with more subplots
    fig = plt.figure(figsize=(24, 20), constrained_layout=True)
    fig.suptitle('Enhanced Statistical Validation of Hallucination-Reducing Techniques\n' +
                 'Hypothesis: >30% Improvement in Estimation Accuracy vs Ground Truth\n' +
                 'Methods: Classical, Permutation, Bootstrap, and Bayesian Analysis', fontsize=18)
    gs = fig.add_gridspec(4, 4, hspace=0.3, wspace=0.3)

    # Row 0: one headline panel per statistical method
    _plot_classification_counts(
        fig.add_subplot(gs[0, 0]), hr_data, standard_data,
        _significance_lines(classical_tests, perm_results, boot_results, bayes_results))
    _plot_permutation_null(fig.add_subplot(gs[0, 1]), perm_results)
    _plot_bootstrap_intervals(fig.add_subplot(gs[0, 2]), boot_results)
    _plot_posterior_overview(fig.add_subplot(gs[0, 3]), bayes_results)

    # Row 1: error distributions and the method comparison table
    _plot_error_distributions(fig.add_subplot(gs[1, :2]), hr_data, standard_data, classical_tests)
    _plot_methods_table(
        fig.add_subplot(gs[1, 2:]),
        _build_method_rows(test_results, classical_tests, perm_results, boot_results, bayes_results))

    # Row 2: detailed bootstrap and Bayesian distributions
    _plot_detail_row(fig, gs, boot_results, bayes_results)

    # Row 3: overall statistical evidence summary
    evidence_summary, significant_methods, total_methods = _summarize_evidence(
        classical_tests, perm_results, boot_results, bayes_results)
    summary_ax = fig.add_subplot(gs[3, :])
    summary_ax.axis('off')
    summary_ax.text(0.05, 0.95, evidence_summary, transform=summary_ax.transAxes,
                    fontsize=11, verticalalignment='top', fontfamily='monospace',
                    bbox=dict(boxstyle='round', facecolor='lightyellow', alpha=0.8))

    plt.show()

    print("✓ Enhanced accuracy-based statistical visualizations completed successfully")
    print(f"✓ Integrated results from {total_methods} statistical methods")
    print(f"✓ {significant_methods}/{total_methods} methods show positive evidence for accuracy improvement")

# Render the enhanced accuracy figure, provided the accuracy test results
# exist at notebook level and are non-empty; otherwise point at the cell that
# produces them.
if not globals().get('accuracy_test_results'):
    print("Cannot create enhanced statistical visualizations - accuracy test results not available")
    print("Please run Cell 11 first to generate accuracy test results")
else:
    create_enhanced_accuracy_visualizations()

In [None]:
# Cell [16] - Enhanced Final Executive Summary with Advanced Statistical Analysis
# Purpose: Generate comprehensive executive summary integrating all statistical methods and business insights
# Dependencies: accuracy_test_results, accuracy_permutation_results, accuracy_bootstrap_results, bayesian_accuracy_results
# Breadcrumbs: Setup -> Analysis -> Advanced Statistical Testing -> Enhanced Final Executive Summary & Business Impact Report

# Report banner
print(
    "ENHANCED EXECUTIVE SUMMARY - COMPREHENSIVE ACCURACY-BASED STATISTICAL ANALYSIS",
    "=" * 80,
    sep="\n",
)

# Resolve the project name, tolerating a notebook session where CONFIG was
# never defined.
if 'CONFIG' in globals():
    project_name = CONFIG.get('NEO4J_PROJECT_NAME', 'Unknown Project')
else:
    project_name = 'Unknown Project'

# Report metadata: project, run date (local clock) and the analysis framework
print(
    f"Project: {project_name}",
    f"Analysis Date: {pd.Timestamp.now().strftime('%Y-%m-%d')}",
    "Analysis Framework: Ground Truth Accuracy Analysis with Classical, Permutation, Bootstrap, and Bayesian Methods",
    sep="\n",
)

if 'accuracy_test_results' in globals() and accuracy_test_results:
    print(f"\n" + "="*80)
    print("DATA SUMMARY")
    print("="*80)
    
    accuracy_df = accuracy_test_results['accuracy_df']
    hr_metrics = accuracy_test_results['hr_metrics']
    standard_metrics = accuracy_test_results['standard_metrics']
    ground_truth_baseline = accuracy_test_results['ground_truth_baseline']
    
    print(f"  Ground truth baseline: {ground_truth_baseline:.1f} LOC/SiFP (iTrust codebase)")
    print(f"  Total estimates analyzed: {len(accuracy_df)}")
    print(f"  Hallucination-reducing estimates: {len(hr_metrics['group_data'])}")
    print(f"  Standard estimates: {len(standard_metrics['group_data'])}")
    print(f"  Accuracy tolerance: ¬±20% of ground truth")
    
    if 'code_metrics_df' in globals():
        print(f"  Total code files in project: {len(code_metrics_df)}")
        print(f"  Total lines of code in project: {code_metrics_df['CountLineCode'].sum():,}")
    
    if 'ground_truth_requirements' in globals():
        print(f"  Ground truth requirements: {len(ground_truth_requirements)}")
    
    # Get baseline values safely
    conversion_factor = CONFIG.get('CONVERSION_FACTOR', 0.957) if 'CONFIG' in globals() else 0.957
    industry_loc_per_sifp = industry_metrics.get('LOC_PER_SIFP', 100) if 'industry_metrics' in globals() else 100
    project_loc_per_sifp = baseline_metrics.get('project_loc_per_sifp', industry_loc_per_sifp) if 'baseline_metrics' in globals() else industry_loc_per_sifp
    desharnais_hours_per_sifp = effort_metrics.get('avg_hours_per_sifp', 10) if 'effort_metrics' in globals() else 10
    
    print(f"\nConversion Factors and Baselines:")
    print(f"  UFP ‚Üí SiFP: {conversion_factor} (from Desharnais research)")
    print(f"  SiFP ‚Üí Effort: {desharnais_hours_per_sifp:.2f} hours/SiFP (Desharnais dataset)")
    print(f"  SiFP ‚Üí LOC: {project_loc_per_sifp:.1f} LOC/SiFP (project weighted average)")
    print(f"  Industry baseline: {industry_loc_per_sifp:.1f} LOC/SiFP")
    print(f"  Project vs Industry: {(project_loc_per_sifp - industry_loc_per_sifp)/industry_loc_per_sifp*100:+.1f}%")
    
    # COMPREHENSIVE STATISTICAL HYPOTHESIS TESTING SUMMARY
    # Section banner, then the hypothesis under test and the two decision
    # thresholds (statistical and practical significance).
    print("\n".join((
        "\n" + "=" * 80,
        "COMPREHENSIVE STATISTICAL HYPOTHESIS TESTING - ACCURACY vs GROUND TRUTH",
        "=" * 80,
        "\nHypothesis: Hallucination-reducing techniques improve estimation accuracy by >30%",
        "Operational Definition: Multi-stage refinement (actor‚Üíjudge‚Üímeta-judge)",
        "Ground Truth: iTrust codebase metrics scaled to estimated requirements",
        "Statistical Significance Threshold: Œ± = 0.05",
        "Practical Significance Threshold: >30% improvement",
    )))

    # One initially-empty result slot per statistical method.
    statistical_evidence = {
        method: {} for method in ('classical', 'permutation', 'bootstrap', 'bayesian')
    }
    
    # Collect, per statistical method, the evidence for the >30% improvement
    # hypothesis into `statistical_evidence` while printing a readable report.
    # A method's section prints a "NOT AVAILABLE"/"NOT COMPLETED" notice when
    # its results are absent from the notebook namespace.

    def _fmt(value, spec):
        """Return ``value`` rendered with format ``spec``, or ``str(value)`` if it cannot be.

        Result dictionaries may lack a key (yielding the 'N/A' placeholder) or
        hold ``None``; applying a numeric format spec to those raises, so the
        value is printed as plain text instead of aborting the whole report.
        """
        try:
            return format(value, spec)
        except (TypeError, ValueError):
            return str(value)

    # 1. Classical Statistics
    print("\n" + "-"*60)
    print("1. CLASSICAL STATISTICAL ANALYSIS - ACCURACY vs GROUND TRUTH")
    print("-"*60)

    if 'accuracy_test_results' in globals() and 'statistical_tests' in accuracy_test_results:
        statistical_tests = accuracy_test_results['statistical_tests']
        improvements = accuracy_test_results['improvements']

        print("  Method: Chi-square, Fisher's exact, Mann-Whitney U tests")

        if 'chi_square' in statistical_tests:
            chi_result = statistical_tests['chi_square']
            print(f"  Chi-square test (independence): œá¬≤ = {_fmt(chi_result.get('statistic', 'N/A'), '.3f')}, "
                  f"p = {_fmt(chi_result.get('p_value', 'N/A'), '.6f')}")
            print(f"  Statistically significant: {'YES' if chi_result.get('significant', False) else 'NO'}")

            # The chi-square outcome is what represents the classical method
            # in the integrated conclusion.
            statistical_evidence['classical'] = {
                'significant': chi_result.get('significant', False),
                'p_value': chi_result.get('p_value', 'N/A'),
                'method': 'Chi-square test'
            }

        if 'fisher_exact' in statistical_tests:
            fisher_result = statistical_tests['fisher_exact']
            print(f"  Fisher's exact test: OR = {_fmt(fisher_result.get('odds_ratio', 'N/A'), '.3f')}, "
                  f"p = {_fmt(fisher_result.get('p_value', 'N/A'), '.6f')}")

        if 'mann_whitney' in statistical_tests:
            mw_result = statistical_tests['mann_whitney']
            print(f"  Mann-Whitney U test: U = {_fmt(mw_result.get('statistic', 'N/A'), '.3f')}, "
                  f"p = {_fmt(mw_result.get('p_value', 'N/A'), '.6f')}")
            print(f"  Effect size: {_fmt(mw_result.get('effect_size', 'N/A'), '.3f')}")

        print("\n  Accuracy Improvements:")
        print(f"    Overall accuracy: {improvements['accuracy']:+.1f}%")
        print(f"    False Positive reduction: {improvements['fp_reduction']:+.1f}%")
        print(f"    False Negative reduction: {improvements['fn_reduction']:+.1f}%")
        print(f"    Average error reduction: {improvements['error_reduction']:+.1f}%")

        # The practical-significance check uses the single best metric.
        best_improvement = max(improvements['accuracy'], improvements['fp_reduction'],
                               improvements['fn_reduction'], improvements['error_reduction'])
        print(f"    Best improvement: {best_improvement:.1f}%")
        print(f"    Meets >30% threshold: {'YES' if best_improvement > 30 else 'NO'}")

        statistical_evidence['classical']['best_improvement'] = best_improvement

    else:
        print("  Classical statistical testing: NOT COMPLETED")
        print("  Reason: Accuracy test results not available")

    # 2. Permutation Tests
    print("\n" + "-"*60)
    print("2. PERMUTATION TEST ANALYSIS - ACCURACY vs GROUND TRUTH")
    print("-"*60)

    if 'accuracy_permutation_results' in globals() and accuracy_permutation_results:
        perm_tests = ['accuracy_improvement', 'fp_reduction', 'fn_reduction', 'error_reduction']
        significant_perm_tests = 0
        total_perm_tests = 0
        threshold_perm_tests = 0

        for test_name in perm_tests:
            # Only tests that are present with a non-empty result are counted.
            if test_name in accuracy_permutation_results and accuracy_permutation_results[test_name]:
                total_perm_tests += 1
                result = accuracy_permutation_results[test_name]
                test_label = test_name.replace('_', ' ').title()

                print(f"  {test_label}:")
                print(f"    p-value: {_fmt(result.get('p_value', 'N/A'), '.6f')}")
                if 'improvement_pct' in result:
                    print(f"    Improvement: {_fmt(result.get('improvement_pct', 'N/A'), '.1f')}%")
                elif 'reduction_pct' in result:
                    print(f"    Reduction: {_fmt(result.get('reduction_pct', 'N/A'), '.1f')}%")
                print(f"    Significant: {'YES' if result.get('significant', False) else 'NO'}")
                print(f"    Meets >30% threshold: {'YES' if result.get('meets_threshold', False) else 'NO'}")

                if result.get('significant', False):
                    significant_perm_tests += 1
                if result.get('meets_threshold', False):
                    threshold_perm_tests += 1

        print(f"\n  Overall Permutation Results: {significant_perm_tests}/{total_perm_tests} tests significant")
        print(f"  Tests meeting >30% threshold: {threshold_perm_tests}/{total_perm_tests}")
        print("  Method advantages: Distribution-free, exact p-values, robust to outliers")

        statistical_evidence['permutation'] = {
            'significant_tests': significant_perm_tests,
            'threshold_tests': threshold_perm_tests,
            'total_tests': total_perm_tests,
            'any_significant': significant_perm_tests > 0,
            'any_threshold': threshold_perm_tests > 0
        }
    else:
        print("  Permutation testing: NOT AVAILABLE")
        print("  Reason: Advanced statistical libraries not installed or analysis not run")

    # 3. Bootstrap Analysis
    print("\n" + "-"*60)
    print("3. BOOTSTRAP HYPOTHESIS TESTING - ACCURACY vs GROUND TRUTH")
    print("-"*60)

    if 'accuracy_bootstrap_results' in globals() and accuracy_bootstrap_results:
        boot_tests = ['accuracy_improvement', 'fp_reduction', 'fn_reduction', 'error_reduction']
        threshold_boot_tests = 0
        high_confidence_boot_tests = 0
        total_boot_tests = 0

        for test_name in boot_tests:
            if test_name in accuracy_bootstrap_results and accuracy_bootstrap_results[test_name]:
                total_boot_tests += 1
                result = accuracy_bootstrap_results[test_name]
                test_label = test_name.replace('_', ' ').title()

                print(f"  {test_label}:")
                if 'observed_improvement' in result:
                    print(f"    Observed improvement: {_fmt(result.get('observed_improvement', 'N/A'), '.1f')}%")
                elif 'observed_reduction' in result:
                    print(f"    Observed reduction: {_fmt(result.get('observed_reduction', 'N/A'), '.1f')}%")

                ci = result.get('confidence_interval', [0, 0])
                if ci is None:
                    # Key present but no interval computed: same fallback as a missing key.
                    ci = [0, 0]
                print(f"    95% Bootstrap CI: [{_fmt(ci[0], '.1f')}, {_fmt(ci[1], '.1f')}]%")
                print(f"    P(>30% threshold): {_fmt(result.get('threshold_confidence', 'N/A'), '.3f')}")
                print(f"    Meets >30% threshold: {'YES' if result.get('meets_threshold', False) else 'NO'}")

                if result.get('meets_threshold', False):
                    threshold_boot_tests += 1
                # `or 0` keeps a None confidence from breaking the comparison.
                if (result.get('threshold_confidence') or 0) > 0.8:
                    high_confidence_boot_tests += 1

        print(f"\n  Overall Bootstrap Results: {threshold_boot_tests}/{total_boot_tests} tests meet >30% threshold")
        print(f"  High confidence (>80%) tests: {high_confidence_boot_tests}/{total_boot_tests}")
        print("  Method advantages: Direct threshold testing, bias-corrected intervals")

        statistical_evidence['bootstrap'] = {
            'threshold_tests': threshold_boot_tests,
            'high_confidence_tests': high_confidence_boot_tests,
            'total_tests': total_boot_tests,
            'any_threshold': threshold_boot_tests > 0,
            'strong_evidence': high_confidence_boot_tests > 0
        }
    else:
        print("  Bootstrap testing: NOT AVAILABLE")
        print("  Reason: Advanced statistical libraries not installed or analysis not run")

    # 4. Bayesian Analysis
    print("\n" + "-"*60)
    print("4. BAYESIAN HYPOTHESIS TESTING - ACCURACY vs GROUND TRUTH")
    print("-"*60)

    if 'bayesian_accuracy_results' in globals() and bayesian_accuracy_results:
        print("  Method: Bayesian inference with posterior probability statements")

        if 'accuracy_improvement' in bayesian_accuracy_results and bayesian_accuracy_results['accuracy_improvement']:
            accuracy_result = bayesian_accuracy_results['accuracy_improvement']
            print("  Accuracy Improvement Analysis:")
            print(f"    Posterior mean improvement: {_fmt(accuracy_result.get('posterior_mean', 'N/A'), '.1f')}%")
            ci = accuracy_result.get('credible_interval', [0, 0])
            if ci is None:
                ci = [0, 0]
            print(f"    95% Credible interval: [{_fmt(ci[0], '.1f')}, {_fmt(ci[1], '.1f')}]%")
            print(f"    P(improvement > 30%): {_fmt(accuracy_result.get('prob_30_plus', 'N/A'), '.3f')}")
            print(f"    P(any improvement): {_fmt(accuracy_result.get('prob_any_improvement', 'N/A'), '.3f')}")

        if 'fp_reduction' in bayesian_accuracy_results and bayesian_accuracy_results['fp_reduction']:
            fp_result = bayesian_accuracy_results['fp_reduction']
            print("  False Positive Reduction Analysis:")
            print(f"    P(FP reduction > 30%): {_fmt(fp_result.get('prob_30_plus', 'N/A'), '.3f')}")
            print(f"    P(any FP reduction): {_fmt(fp_result.get('prob_any_reduction', 'N/A'), '.3f')}")

        if 'fn_reduction' in bayesian_accuracy_results and bayesian_accuracy_results['fn_reduction']:
            fn_result = bayesian_accuracy_results['fn_reduction']
            print("  False Negative Reduction Analysis:")
            print(f"    P(FN reduction > 30%): {_fmt(fn_result.get('prob_30_plus', 'N/A'), '.3f')}")
            print(f"    P(any FN reduction): {_fmt(fn_result.get('prob_any_reduction', 'N/A'), '.3f')}")

        if 'decision_analysis' in bayesian_accuracy_results and bayesian_accuracy_results['decision_analysis']:
            decision_result = bayesian_accuracy_results['decision_analysis']
            print("  Combined Decision Analysis:")
            print(f"    P(any metric > 30%): {_fmt(decision_result.get('prob_any_30', 'N/A'), '.3f')}")
            print(f"    P(all metrics > 30%): {_fmt(decision_result.get('prob_all_30', 'N/A'), '.3f')}")
            print(f"    Recommendation: {decision_result.get('recommendation', 'N/A')}")

        if 'model_comparison' in bayesian_accuracy_results and bayesian_accuracy_results['model_comparison']:
            model_result = bayesian_accuracy_results['model_comparison']
            print("  Model Comparison:")
            print(f"    Evidence: {model_result.get('preference', 'N/A')}")
            print(f"    ŒîWAIC: {_fmt(model_result.get('delta_waic', 'N/A'), '.2f')}")

        print("  Method advantages: Direct probability statements, credible intervals, decision framework")

        # Determine Bayesian evidence strength. Entries may be present but
        # None/empty (the sections above test for that), so normalise them to
        # an empty mapping before calling .get on them.
        _bayes_accuracy = bayesian_accuracy_results.get('accuracy_improvement') or {}
        _bayes_decision = bayesian_accuracy_results.get('decision_analysis') or {}
        bayes_strong = bool(
            (_bayes_accuracy.get('prob_30_plus') or 0) > 0.8
            or (_bayes_decision.get('prob_any_30') or 0) > 0.8
        )

        statistical_evidence['bayesian'] = {
            'strong_evidence': bayes_strong,
            'available': True,
            'decision_recommendation': _bayes_decision.get('recommendation', 'N/A')
        }
    else:
        print("  Bayesian analysis: NOT AVAILABLE")
        print("  Reason: PyMC/ArviZ not installed or analysis not run")
        statistical_evidence['bayesian'] = {'available': False}
    
    # INTEGRATED STATISTICAL CONCLUSION
    # Tally how many of the methods that produced results support the
    # hypothesis, then map that share onto a three-level conclusion.
    print("\n" + "="*80)
    print("INTEGRATED STATISTICAL CONCLUSION - ACCURACY vs GROUND TRUTH")
    print("="*80)

    # Count evidence across methods
    evidence_count = 0
    total_methods = 0
    evidence_details = []

    # Classical evidence (an empty dict means the method produced nothing)
    if statistical_evidence['classical']:
        total_methods += 1
        if statistical_evidence['classical'].get('significant', False):
            evidence_count += 1
            evidence_details.append("Classical tests: SIGNIFICANT")
        else:
            evidence_details.append("Classical tests: Not significant")

    # Permutation evidence: counts if any test is significant or meets the threshold
    if statistical_evidence['permutation']:
        total_methods += 1
        if statistical_evidence['permutation'].get('any_significant', False) or statistical_evidence['permutation'].get('any_threshold', False):
            evidence_count += 1
            evidence_details.append("Permutation tests: SIGNIFICANT/THRESHOLD")
        else:
            evidence_details.append("Permutation tests: Not significant")

    # Bootstrap evidence
    if statistical_evidence['bootstrap']:
        total_methods += 1
        if statistical_evidence['bootstrap'].get('any_threshold', False):
            evidence_count += 1
            evidence_details.append("Bootstrap tests: MEETS THRESHOLD")
        else:
            evidence_details.append("Bootstrap tests: Below threshold")

    # Bayesian evidence
    if statistical_evidence['bayesian'].get('available', False):
        total_methods += 1
        if statistical_evidence['bayesian'].get('strong_evidence', False):
            evidence_count += 1
            evidence_details.append("Bayesian analysis: STRONG EVIDENCE")
        else:
            evidence_details.append("Bayesian analysis: Weak evidence")

    print("\nEvidence Summary:")
    for detail in evidence_details:
        print(f"  ‚Ä¢ {detail}")
    if total_methods == 0:
        print("  (no statistical method produced results)")

    print(f"\nStatistical Methods Agreement: {evidence_count}/{total_methods} methods show positive evidence")

    # Overall statistical conclusion.
    # The `total_methods > 0` guard matters: with no methods, `0 >= 0 * 0.75`
    # is True and would otherwise report STRONG evidence from an empty analysis.
    if total_methods > 0 and evidence_count >= total_methods * 0.75:
        statistical_conclusion = "STRONG STATISTICAL EVIDENCE"
        conclusion_color = "üü¢"
    elif total_methods > 0 and evidence_count >= total_methods * 0.5:
        statistical_conclusion = "MODERATE STATISTICAL EVIDENCE"
        conclusion_color = "üü°"
    else:
        statistical_conclusion = "WEAK STATISTICAL EVIDENCE"
        conclusion_color = "üî¥"

    print(f"\n{conclusion_color} OVERALL STATISTICAL CONCLUSION: {statistical_conclusion}")
    print("   for >30% improvement in estimation accuracy through hallucination-reducing techniques")
    
    # ACCURACY ANALYSIS AND BUSINESS IMPACT
    print("\n" + "=" * 80)
    print("ACCURACY ANALYSIS AND BUSINESS RECOMMENDATIONS")
    print("=" * 80)

    # Accuracy classification analysis: the same TP/FP/FN/accuracy rows are
    # printed for each of the two groups.
    print("\n1. ACCURACY CLASSIFICATION ANALYSIS:")
    for _group_heading, _group in (
        ("   Hallucination-Reducing Group:", hr_metrics),
        ("\n   Standard Group:", standard_metrics),
    ):
        print(_group_heading)
        print(f"     ‚Ä¢ True Positives (accurate): {_group['tp_count']}/{_group['total']} ({_group['tp_rate']:.2%})")
        print(f"     ‚Ä¢ False Positives (overestimates): {_group['fp_count']}/{_group['total']} ({_group['fp_rate']:.2%})")
        print(f"     ‚Ä¢ False Negatives (underestimates): {_group['fn_count']}/{_group['total']} ({_group['fn_rate']:.2%})")
        print(f"     ‚Ä¢ Overall Accuracy: {_group['accuracy']:.2%}")

    # Business impact from accuracy improvements
    if 'accuracy_test_results' in globals() and 'improvements' in accuracy_test_results:
        improvements = accuracy_test_results['improvements']
        print("\n2. BUSINESS IMPACT OF ACCURACY IMPROVEMENTS:")

        # Cost figures are only available when the effort-impact table exists and has rows.
        if 'effort_impact_df' in globals() and not effort_impact_df.empty:
            avg_cost_impact = effort_impact_df['Total_Cost_Impact_USD'].mean()
            print(f"   Average cost impact per model: ${avg_cost_impact:+,.0f}")

            # Savings estimate: magnitude of the mean cost impact scaled by
            # the accuracy improvement expressed as a fraction.
            accuracy_improvement_factor = improvements['accuracy'] / 100
            estimated_savings = abs(avg_cost_impact) * accuracy_improvement_factor
            print(f"   Estimated savings from {improvements['accuracy']:.1f}% accuracy improvement: ${estimated_savings:,.0f}")

        print(f"   False Positive reduction: {improvements['fp_reduction']:.1f}%")
        print("     ‚Ä¢ Business benefit: Reduced over-scoping and resource waste")
        print(f"   False Negative reduction: {improvements['fn_reduction']:.1f}%")
        print("     ‚Ä¢ Business benefit: Reduced under-scoping and deadline pressure")
        print(f"   Overall error reduction: {improvements['error_reduction']:.1f}%")
        print("     ‚Ä¢ Business benefit: More predictable project planning")

    # Risk assessment based on accuracy: each figure is the unweighted mean
    # of the two groups' values.
    print("\n3. RISK ASSESSMENT:")
    overall_accuracy = (hr_metrics['accuracy'] + standard_metrics['accuracy']) / 2
    if overall_accuracy > 0.8:
        print(f"   ‚úÖ LOW RISK: High overall accuracy ({overall_accuracy:.1%})")
    elif overall_accuracy > 0.6:
        print(f"   ‚ö†Ô∏è  MODERATE RISK: Moderate accuracy ({overall_accuracy:.1%})")
    else:
        print(f"   üö® HIGH RISK: Low accuracy ({overall_accuracy:.1%}) requires attention")

    fp_rate = (hr_metrics['fp_rate'] + standard_metrics['fp_rate']) / 2
    print(
        f"   ‚úÖ LOW OVERESTIMATION RISK: FP rate {fp_rate:.1%}"
        if fp_rate < 0.2
        else f"   ‚ö†Ô∏è  OVERESTIMATION RISK: High FP rate {fp_rate:.1%}"
    )

    fn_rate = (hr_metrics['fn_rate'] + standard_metrics['fn_rate']) / 2
    print(
        f"   ‚úÖ LOW UNDERESTIMATION RISK: FN rate {fn_rate:.1%}"
        if fn_rate < 0.2
        else f"   ‚ö†Ô∏è  UNDERESTIMATION RISK: High FN rate {fn_rate:.1%}"
    )
    
    # IMPLEMENTATION RECOMMENDATIONS
    # Prints a recommendation tier chosen by the share of statistical methods
    # with positive evidence, followed by fixed implementation guidance.
    print("\n" + "="*80)
    print("IMPLEMENTATION RECOMMENDATIONS")
    print("="*80)

    print("\n1. STATISTICAL EVIDENCE BASED:")
    # The `total_methods > 0` guard matters: with no methods, `0 >= 0 * 0.75`
    # is True and would otherwise select the strongest recommendation.
    if total_methods > 0 and evidence_count >= total_methods * 0.75:
        print("   üöÄ STRONG RECOMMENDATION: Implement hallucination-reducing techniques")
        print("      ‚Ä¢ Multiple statistical methods converge on positive evidence")
        print("      ‚Ä¢ Significant accuracy improvements demonstrated against ground truth")
        print("      ‚Ä¢ Risk of Type I error minimized through diverse analytical approaches")
        print("      ‚Ä¢ Expected accuracy improvement supported by robust statistical framework")
    elif total_methods > 0 and evidence_count >= total_methods * 0.5:
        print("   üìã MODERATE RECOMMENDATION: Pilot implementation with careful monitoring")
        print("      ‚Ä¢ Mixed statistical evidence suggests cautious adoption")
        print("      ‚Ä¢ Implement in controlled environment with accuracy measurement systems")
        print("      ‚Ä¢ Establish clear success metrics based on TP/FP/FN rates")
        print("      ‚Ä¢ Monitor against ground truth continuously")
    else:
        print("   ‚ö†Ô∏è  WEAK RECOMMENDATION: Additional research needed before implementation")
        print("      ‚Ä¢ Insufficient statistical evidence for confident recommendation")
        print("      ‚Ä¢ Consider larger sample sizes or refined accuracy measurement")
        print("      ‚Ä¢ Focus on improving ground truth establishment and validation")

    print("\n2. ACCURACY-BASED IMPLEMENTATION:")
    print("   ‚Ä¢ Multi-stage refinement: actor ‚Üí judge ‚Üí meta-judge architecture")
    print("   ‚Ä¢ Ground truth validation systems for continuous accuracy monitoring")
    print("   ‚Ä¢ TP/FP/FN classification frameworks for estimation quality assessment")
    print("   ‚Ä¢ Automated accuracy feedback loops for model improvement")
    print("   ‚Ä¢ Integration with existing project management and estimation processes")

    print("\n3. ORGANIZATIONAL READINESS:")
    print("   ‚Ä¢ Training on accuracy-based estimation methodologies")
    print("   ‚Ä¢ Establishment of ground truth baselines for different project types")
    print("   ‚Ä¢ Change management for adoption of accuracy-validated AI estimation")
    print("   ‚Ä¢ Quality assurance protocols based on TP/FP/FN metrics")
    print("   ‚Ä¢ Continuous monitoring and improvement processes")
    
    # LIMITATIONS AND FUTURE RESEARCH
    # Fixed text: section banner, study limitations, then research priorities.
    print("\n".join((
        "\n" + "=" * 80,
        "LIMITATIONS AND FUTURE RESEARCH DIRECTIONS",
        "=" * 80,
        "\nStudy Limitations:",
        "  ‚Ä¢ Sample size constraints limit statistical power",
        "  ‚Ä¢ Single project ground truth (iTrust) may not generalize",
        "  ‚Ä¢ Accuracy tolerance of ¬±20% is somewhat arbitrary",
        "  ‚Ä¢ Cross-sectional design rather than longitudinal study",
        "  ‚Ä¢ Limited direct measurement of time-to-market impact",
        "  ‚Ä¢ Proxy measures for 'hallucination-reducing techniques'",
        "\nFuture Research Priorities:",
        "  1. Multi-project validation with diverse ground truth baselines",
        "  2. Longitudinal studies tracking accuracy improvements over time",
        "  3. Direct measurement of business impact metrics (time-to-market, cost)",
        "  4. Refinement of accuracy classification thresholds",
        "  5. Development of automated ground truth establishment methods",
        "  6. Comparative analysis with other estimation accuracy frameworks",
        "  7. Integration of accuracy metrics with project success indicators",
    )))
    
    # FINAL EXECUTIVE DECISION FRAMEWORK
    # Prints the key findings, a decision matrix and the final recommendation.
    print("\n" + "="*80)
    print("EXECUTIVE DECISION FRAMEWORK")
    print("="*80)

    def _signed_pct(metrics, key):
        """Return ``metrics[key]`` as a signed one-decimal percentage, or 'N/A' if unusable."""
        try:
            return f"{metrics[key]:+.1f}%"
        except (KeyError, TypeError, ValueError):
            return "N/A"

    # `improvements` is only bound when accuracy_test_results carried the
    # relevant entries; fall back to an empty mapping rather than failing
    # with NameError in the middle of the summary.
    _reported_improvements = improvements if 'improvements' in globals() else {}

    # Evidence tier from the share of methods with positive evidence.
    # The `total_methods > 0` guard matters: with no methods, `0 >= 0 * 0.75`
    # is True and would otherwise be reported as strong evidence.
    if total_methods > 0 and evidence_count >= total_methods * 0.75:
        _evidence_tier = 'Strong'
    elif total_methods > 0 and evidence_count >= total_methods * 0.5:
        _evidence_tier = 'Moderate'
    else:
        _evidence_tier = 'Weak'

    print("\nüéØ KEY FINDINGS:")
    print(f"   ‚Ä¢ Statistical Analysis: {statistical_conclusion}")
    print(f"   ‚Ä¢ Methods Agreement: {evidence_count}/{total_methods} approaches show positive results")
    print(f"   ‚Ä¢ Accuracy Improvement: {_signed_pct(_reported_improvements, 'accuracy')} overall accuracy")
    print(f"   ‚Ä¢ False Positive Reduction: {_signed_pct(_reported_improvements, 'fp_reduction')}")
    print(f"   ‚Ä¢ False Negative Reduction: {_signed_pct(_reported_improvements, 'fn_reduction')}")
    print("   ‚Ä¢ Ground Truth Validation: Against iTrust codebase metrics")

    print("\nüìä DECISION MATRIX:")
    print(f"   Statistical Evidence: {_evidence_tier}")
    # NOTE(review): this row is labelled "Accuracy Improvement" but is graded on
    # the mean absolute accuracy of both groups, not on the improvement - confirm intent.
    print(f"   Accuracy Improvement: {'High' if overall_accuracy > 0.8 else 'Moderate' if overall_accuracy > 0.6 else 'Low'}")
    print(f"   Implementation Risk: {'Low' if fp_rate < 0.2 and fn_rate < 0.2 else 'Moderate'}")
    print(f"   Ground Truth Validation: {'Established' if ground_truth_baseline else 'Limited'}")
    print(f"   Technical Readiness: {'High' if len(accuracy_df) > 50 else 'Moderate'}")

    print("\nüéØ EXECUTIVE RECOMMENDATION:")
    if _evidence_tier == 'Strong':
        print("   PROCEED WITH IMPLEMENTATION")
        print("   ‚Ä¢ Strong statistical evidence supports adoption")
        print("   ‚Ä¢ Accuracy improvements demonstrated against ground truth")
        print("   ‚Ä¢ Establish continuous accuracy monitoring systems")
        print("   ‚Ä¢ Implement with robust TP/FP/FN tracking")
    elif _evidence_tier == 'Moderate':
        print("   PROCEED WITH PILOT PROGRAM")
        print("   ‚Ä¢ Moderate evidence supports cautious adoption")
        print("   ‚Ä¢ Implement with comprehensive accuracy measurement")
        print("   ‚Ä¢ Establish clear go/no-go criteria based on accuracy metrics")
        print("   ‚Ä¢ Focus on ground truth validation and continuous improvement")
    else:
        print("   DEFER IMPLEMENTATION - CONDUCT ADDITIONAL RESEARCH")
        print("   ‚Ä¢ Insufficient evidence for confident business decision")
        print("   ‚Ä¢ Invest in larger-scale accuracy validation studies")
        print("   ‚Ä¢ Improve ground truth establishment methodologies")
        print("   ‚Ä¢ Focus on accuracy measurement system development")
    
    # Save comprehensive results
    # Best-effort persistence of the executive summary and the per-method
    # evidence as two timestamped JSON files under ./results; any failure is
    # reported as a warning instead of aborting the cell.
    try:
        os.makedirs('results', exist_ok=True)
        # Take the clock once so the date and the timestamp cannot disagree.
        _now = pd.Timestamp.now()
        timestamp = _now.strftime('%Y%m%d_%H%M%S')
        _summary_path = f'results/enhanced_accuracy_executive_summary_{project_name}_{timestamp}.json'
        _evidence_path = f'results/accuracy_statistical_evidence_{project_name}_{timestamp}.json'

        # `improvements` is only bound when accuracy_test_results carried the
        # relevant entries; missing figures are stored as null rather than
        # preventing the save.
        _saved_improvements = improvements if 'improvements' in globals() else {}

        # The `total_methods > 0` guard matters: with no methods,
        # `0 >= 0 * 0.75` is True and would otherwise record "PROCEED".
        if total_methods > 0 and evidence_count >= total_methods * 0.75:
            _recommendation = 'PROCEED WITH IMPLEMENTATION'
        elif total_methods > 0 and evidence_count >= total_methods * 0.5:
            _recommendation = 'PROCEED WITH PILOT PROGRAM'
        else:
            _recommendation = 'DEFER IMPLEMENTATION'

        # Create comprehensive executive summary
        final_summary = {
            'project': project_name,
            'analysis_date': _now.strftime('%Y-%m-%d'),
            'hypothesis': 'Hallucination-reducing techniques improve estimation accuracy by >30%',
            'ground_truth_baseline': ground_truth_baseline,
            'statistical_methods_used': total_methods,
            'methods_with_positive_evidence': evidence_count,
            'overall_statistical_conclusion': statistical_conclusion,
            'evidence_strength_ratio': f"{evidence_count}/{total_methods}",
            'accuracy_improvements': {
                'overall_accuracy': _saved_improvements.get('accuracy'),
                'fp_reduction': _saved_improvements.get('fp_reduction'),
                'fn_reduction': _saved_improvements.get('fn_reduction'),
                'error_reduction': _saved_improvements.get('error_reduction')
            },
            'accuracy_metrics': {
                'hr_accuracy': hr_metrics['accuracy'],
                'standard_accuracy': standard_metrics['accuracy'],
                'hr_fp_rate': hr_metrics['fp_rate'],
                'hr_fn_rate': hr_metrics['fn_rate'],
                'standard_fp_rate': standard_metrics['fp_rate'],
                'standard_fn_rate': standard_metrics['fn_rate']
            },
            'executive_recommendation': _recommendation,
            'statistical_evidence_details': evidence_details,
            'analysis_timestamp': timestamp
        }

        # Save detailed results. default=str stringifies any value json cannot
        # encode natively instead of raising.
        with open(_summary_path, 'w', encoding='utf-8') as f:
            json.dump(final_summary, f, indent=2, default=str)

        # Save statistical evidence summary
        with open(_evidence_path, 'w', encoding='utf-8') as f:
            json.dump(statistical_evidence, f, indent=2, default=str)

        print("\n‚úÖ COMPREHENSIVE ACCURACY ANALYSIS RESULTS SAVED")
        print(f"   Location: {_summary_path}")
        print(f"   Statistical Evidence: {_evidence_path}")

    except Exception as e:
        # Deliberately broad: saving is optional and must not discard the report above.
        print(f"\n‚ö†Ô∏è  Warning: Could not save results - {e}")

else:
    # Fallback notice: list the inputs the report expects and which cell provides each.
    print("\n".join((
        "‚ùå ANALYSIS INCOMPLETE",
        "Missing required data components:",
        "  - accuracy_test_results: Ground truth accuracy analysis from Cell 11",
        "  - accuracy_permutation_results: Permutation tests from Cell 12 (optional)",
        "  - accuracy_bootstrap_results: Bootstrap tests from Cell 13 (optional)",
        "  - bayesian_accuracy_results: Bayesian analysis from Cell 14 (optional)",
        "\nPlease ensure Cell 11 has completed successfully to generate accuracy test results.",
    )))

# Closing banner, printed unconditionally at the end of the cell.
print("\n".join((
    "\n" + "=" * 80,
    "üéâ ENHANCED ACCURACY-BASED STATISTICAL ANALYSIS COMPLETE",
    "=" * 80,
    "Framework: Ground Truth Accuracy ‚Üí Classical ‚Üí Permutation ‚Üí Bootstrap ‚Üí Bayesian ‚Üí Integration",
    "Thank you for using the Enhanced SiFP COSMIC Estimation Accuracy Analysis Framework!",
    "For questions or support, refer to the comprehensive documentation and statistical methodologies.",
)))