# Hypothesis 2: Hallucination Impact on Estimation Quality
**Hallucinations in LLM outputs significantly correlate with lower-quality estimates in new product feature development.**

In [None]:
# Cell [0] - Setup and Dependencies
# Purpose: Import necessary libraries and configure the environment for traceability and SiFP analysis
# Dependencies: All packages assumed to be installed
# Breadcrumbs: Setup -> Imports -> Environment Configuration

# Standard library imports
import os
import re
import logging
import warnings
from datetime import datetime
from typing import Dict, List, Any, Tuple, Optional
from pathlib import Path

# Data processing and analysis
import pandas as pd
import numpy as np
from scipy import stats

# Database connectivity for Neo4j
from neo4j import GraphDatabase
from dotenv import load_dotenv

# Visualization libraries
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.gridspec import GridSpec

# Machine learning libraries
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    accuracy_score, precision_score, recall_score, f1_score, 
    confusion_matrix, balanced_accuracy_score, 
    cohen_kappa_score, matthews_corrcoef, fbeta_score,
    roc_auc_score
)
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

# Enhanced statistical analysis
import statsmodels.api as sm
from statsmodels.regression.mixed_linear_model import MixedLM
from statsmodels.stats.multitest import multipletests
from statsmodels.stats.power import ttest_power, tt_solve_power

# Advanced statistical testing imports
# 1. Permutation Testing
from scipy.stats import permutation_test, bootstrap as scipy_bootstrap

# 2. Extended Bootstrap Analysis
from arch.bootstrap import IIDBootstrap, CircularBlockBootstrap, StationaryBootstrap

# 3. Bayesian Analysis
import pymc as pm
import arviz as az
import bayesian_testing as bt

# Install a blanket "ignore" filter so library warnings do not clutter the notebook output.
# NOTE: this hides every warning category, deprecation warnings included.
warnings.simplefilter('ignore')

# textwrap is needed by the PDF-friendly log formatter defined right below
import textwrap

class PDFLoggingFormatter(logging.Formatter):
    """Custom formatter that wraps long log lines for better PDF display.

    Every line of the formatted record that is longer than ``MAX_WIDTH``
    characters is wrapped with ``textwrap.wrap``; continuation lines are
    prefixed with ``CONTINUATION_INDENT``. Lines are wrapped one at a time,
    so line breaks already present in the record (multi-line messages,
    tracebacks appended through ``exc_info``) are preserved instead of being
    re-flowed into a single paragraph.
    """

    # Maximum length of an output line, continuation indent included
    MAX_WIDTH = 120
    # Prefix for the continuation lines produced by wrapping
    CONTINUATION_INDENT = '    '

    def format(self, record):
        """Format ``record`` and wrap any over-long line.

        Parameters:
            record (logging.LogRecord): The record to format.

        Returns:
            str: The formatted text; lines that fit within ``MAX_WIDTH`` are
            returned unchanged, longer ones are split across several lines.
        """
        # Format the basic log record (message, plus traceback if exc_info is set)
        formatted = super().format(record)

        wrapped_lines = []
        for line in formatted.split('\n'):
            if len(line) <= self.MAX_WIDTH:
                wrapped_lines.append(line)
                continue
            # textwrap.wrap returns [] for whitespace-only input; keep the line slot
            wrapped_lines.extend(
                textwrap.wrap(line, width=self.MAX_WIDTH,
                              subsequent_indent=self.CONTINUATION_INDENT)
                or ['']
            )

        return '\n'.join(wrapped_lines)

# Formatter instance shared by all root handlers (wraps long lines for PDF export)
pdf_formatter = PDFLoggingFormatter('%(asctime)s - %(levelname)s - %(message)s')

# Reset root logging; force=True replaces any handlers that already exist
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    force=True,
)

# Module-level logger; every root handler is switched over to the wrapping formatter
logger = logging.getLogger(__name__)
for handler in logging.getLogger().handlers:
    handler.setFormatter(pdf_formatter)

# pandas display options sized for legal landscape pages:
# 130-char width, at most 25 columns of 25 chars, two decimals, all rows shown.
# expand_frame_repr is left at its default so frames wrap naturally at 130 chars.
for option_name, option_value in (
    ('display.width', 130),
    ('display.max_columns', 25),
    ('display.max_colwidth', 25),
    ('display.precision', 2),
    ('display.float_format', '{:.2f}'.format),
    ('display.max_rows', None),
):
    pd.set_option(option_name, option_value)

# matplotlib defaults: figure size/resolution first, then the seaborn style sheet
for rc_key, rc_value in (('figure.figsize', [12, 8]), ('figure.dpi', 100)):
    plt.rcParams[rc_key] = rc_value
if 'seaborn-v0_8' in plt.style.available:
    plt.style.use('seaborn-v0_8')
else:
    plt.style.use('seaborn')

# Palette order: TP (True Positive) = light blue, FP (False Positive) = red,
# FN (False Negative) = orange
tp_fp_fn_colors = ["#4d98da", "#e74c3c", "#f39c12"]
# Same colours in an independent list, conceptually TP vs hallucinations (FP + FN)
hallucination_colors = list(tp_fp_fn_colors)

# Fixed seed for NumPy's global random state
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Availability flags for the advanced statistical methods (every package is assumed installed)
advanced_methods_available = dict.fromkeys(
    (
        'permutation_tests',
        'extended_bootstrap',
        'bayesian_analysis',
        'bayesian_testing',
        'monte_carlo',
        'statsmodels',
        'scipy_bootstrap',
        'arch_bootstrap',
    ),
    True,
)

# Kept for backward compatibility with code that checks this flag
statsmodels_available = True

# Metric used when evaluating thresholds
OPTIMIZATION_METRIC = 'F2'

# Startup banner, printed one line per entry
for banner_line in (
    "Notebook Environment Setup Complete ✓",
    "HYPOTHESIS 2 FRAMEWORK:",
    "========================",
    "Hallucinations in LLM outputs = BOTH False Positive (FP) AND False Negative (FN) traceability links:",
    "- FP Hallucinations: Predicted as traceable but NOT verified by ground truth (over-identification)",
    "- FN Hallucinations: Predicted as non-traceable but ARE verified by ground truth (missed features/under-identification)",
    "- TP Links: Correctly identified traceable links (verified predictions)",
    "",
    "RESEARCH QUESTION: Do hallucinated traceability links (both FP over-identification and FN missed features)",
    "exhibit significantly higher SiFP estimation errors compared to TP links in new product development?",
    "",
    "ADVANCED STATISTICAL METHODS AVAILABLE:",
    "=" * 40,
    "✓ Permutation Tests (exact p-values, distribution-free)",
    "✓ Extended Bootstrap Methods (IID, Circular Block, Stationary Bootstrap)",
    "✓ Bayesian Analysis (credible intervals, posterior distributions)",
    "✓ Bayesian Hypothesis Testing (Bayes factors, model comparison)",
    "✓ Monte Carlo Simulation Methods (manual implementation)",
    "✓ Enhanced Statistical Modeling (statsmodels)",
    "✓ Multiple Testing Corrections and Power Analysis",
    "",
    "This notebook analyzes the relationship between BOTH types of LLM hallucinations and SiFP estimation quality",
    "using comprehensive statistical validation including:",
    "- Traditional parametric and non-parametric tests",
    "- Permutation-based exact tests (distribution-free)",
    "- Extended bootstrap methods (IID, Circular Block, Stationary Bootstrap)",
    "- Bayesian hypothesis testing (credible intervals, Bayes factors)",
    "- Monte Carlo simulation methods (manual implementation)",
    "- Multiple testing corrections and power analysis",
    "",
    "🎯 READY FOR COMPREHENSIVE HYPOTHESIS 2 TESTING WITH ADVANCED STATISTICAL METHODS!",
):
    print(banner_line)

In [None]:
# Cell [1] - Environment Configuration
# Purpose: Load environment variables and configure analysis settings from .env file
# Dependencies: os, dotenv
# Breadcrumbs: Setup -> Environment Configuration -> Model Selection

# Load environment variables from .env file
load_dotenv()

# Neo4j credentials from environment variables
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USER = os.getenv('NEO4J_USER')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_PROJECT_NAME = os.getenv('NEO4J_PROJECT_NAME')

# CURRENT_MODEL holds the *name* of another environment variable; that second
# variable holds the actual model ID (two-step lookup).
CURRENT_MODEL_KEY = os.getenv('CURRENT_MODEL')
CURRENT_MODEL = os.getenv(CURRENT_MODEL_KEY) if CURRENT_MODEL_KEY else None

# Configuration for analysis
SHOW_VISUALIZATION = os.getenv('SHOW_VISUALIZATION', 'False').lower() == 'true'

# A variable that is set but empty (e.g. "MIN_TRACEABILITY_THRESHOLD=" in .env)
# is treated like an unset one and falls back to the default.
_raw_threshold = os.getenv('MIN_TRACEABILITY_THRESHOLD') or '3'
try:
    MIN_TRACEABILITY_THRESHOLD = int(_raw_threshold)
except ValueError as err:
    raise ValueError(
        f"MIN_TRACEABILITY_THRESHOLD must be an integer, got {_raw_threshold!r}"
    ) from err
SIFP_ESTIMATION_REQUIREMENT = os.getenv('SIFP_ESTIMATION_REQUIREMENT') or 'SOURCE'

# Get models to include in analysis from environment.
# RESULTS_ANALYSIS_MODEL_IDS is a comma-separated list of environment variable
# NAMES; each named variable holds one model ID.
ANALYSIS_MODEL_IDS = os.getenv('RESULTS_ANALYSIS_MODEL_IDS', '')
if ANALYSIS_MODEL_IDS:
    # Split by comma, strip whitespace, and ignore empty entries (e.g. trailing commas)
    model_var_names = [
        var_name.strip() for var_name in ANALYSIS_MODEL_IDS.split(',') if var_name.strip()
    ]

    # Look up each variable in the environment to get actual model IDs
    selected_models = []
    unresolved_model_vars = []
    for var_name in model_var_names:
        model_id = os.getenv(var_name, '')
        if model_id:
            selected_models.append(model_id)
        else:
            unresolved_model_vars.append(var_name)

    # Surface misconfiguration instead of silently shrinking the model list
    if unresolved_model_vars:
        print(
            "WARNING: RESULTS_ANALYSIS_MODEL_IDS references undefined or empty "
            f"environment variables: {', '.join(unresolved_model_vars)}"
        )
else:
    selected_models = [CURRENT_MODEL] if CURRENT_MODEL else []

# Print configuration summary
print("Environment Configuration:")
print("==========================")
print(f"Project: {NEO4J_PROJECT_NAME}")
print(f"Current model: {CURRENT_MODEL}")
print(f"Selected models for analysis: {selected_models}")
print(f"Visualization: {'Enabled' if SHOW_VISUALIZATION else 'Disabled'}")
print(f"Traceability threshold: {MIN_TRACEABILITY_THRESHOLD}")
print(f"SIFP requirement type: {SIFP_ESTIMATION_REQUIREMENT}")

In [None]:
# Cell [2] - Neo4j Connection Setup
# Purpose: Create and test connection to Neo4j database for data retrieval
# Dependencies: neo4j, logging
# Breadcrumbs: Setup -> Database Connection -> Neo4j Client

def create_neo4j_driver(uri=None, user=None, password=None):
    """
    Create and return a verified Neo4j driver instance

    The connection is checked with a trivial ``RETURN 1`` query before the
    driver is handed back. If that check fails, the driver is closed before
    the error propagates so no connection resources are left open.

    Parameters:
        uri (str, optional): Neo4j connection URI. If None, uses NEO4J_URI from environment.
        user (str, optional): Neo4j username. If None, uses NEO4J_USER from environment.
        password (str, optional): Neo4j password. If None, uses NEO4J_PASSWORD from environment.

    Returns:
        GraphDatabase.driver: Connected Neo4j driver

    Raises:
        ValueError: If any of uri, user or password is missing.
        ConnectionError: If the verification query does not return the expected value.
        Exception: Any error raised by the driver is logged and re-raised unchanged.
    """
    try:
        # Use parameters if provided, otherwise use environment variables
        uri = uri or NEO4J_URI
        user = user or NEO4J_USER
        password = password or NEO4J_PASSWORD

        # Verify that all required connection parameters are available
        supplied = {'uri': uri, 'user': user, 'password': password}
        missing = [param for param, value in supplied.items() if not value]
        if missing:
            error_msg = f"Missing Neo4j connection parameters: {', '.join(missing)}"
            logger.error(error_msg)
            raise ValueError(error_msg)

        # Create the driver
        driver = GraphDatabase.driver(uri, auth=(user, password))

        # Verify connection with a simple query; close the driver if verification fails
        try:
            with driver.session() as session:
                result = session.run("RETURN 1 as test").single()
                if not (result and result["test"] == 1):
                    raise ConnectionError("Could not verify Neo4j connection")
        except Exception:
            driver.close()
            raise

        logger.info("Successfully connected to Neo4j database")
        logger.info(f"Connected to Neo4j at {uri}")
        return driver

    except Exception as e:
        logger.error(f"Failed to connect to Neo4j: {str(e)}")
        logger.error("Exception details:", exc_info=True)
        raise

# Open the driver used by the query cells; a failure is reported on stdout, not raised
try:
    driver = create_neo4j_driver()
except Exception as e:
    print("Error connecting to Neo4j database:")
    print(f"Error details: {str(e)}")
    print("Please check your environment variables and database connection settings.")
else:
    print("Neo4j Connection Status:")
    print("======================")
    print(f"Connected to: {NEO4J_URI}")
    print(f"Project name: {NEO4J_PROJECT_NAME}")
    print("Connection successful ✓")

In [None]:
# Cell [3] - Query Meta Judge Links (Traceability Data)
# Purpose: Retrieve traceability data including TP/FP classifications from Neo4j
# Dependencies: neo4j, pandas, logging
# Breadcrumbs: Database Connection -> Data Retrieval -> Traceability Links

def query_meta_judge_links(driver, project_name=None, models=None, target_ids=None):
    """
    Query LLM_RESULT_META_JUDGE links from Neo4j database

    The model and target-ID filters are sent as Cypher parameters ($models,
    $target_ids) instead of being spliced into the query text, so values that
    contain quotes or backslashes cannot break or alter the query.

    Parameters:
        driver: Neo4j driver connection
        project_name (str, optional): Project name to query. If None, uses NEO4J_PROJECT_NAME from environment.
        models (list, optional): List of models to query. If None, uses selected_models from environment.
        target_ids (list, optional): List of target IDs to filter by (compared as strings).
            If None or empty, retrieves all target IDs.

    Returns:
        pd.DataFrame: DataFrame containing meta judge links with traceability data.
            'is_traceable' is converted to bool and the score columns to float.
            'total_score' (judge_score + actor_score) and 'meta_judge_threshold'
            (total_score >= MIN_TRACEABILITY_THRESHOLD) are added.
            An empty DataFrame is returned when no project name is available,
            no links match, or any error occurs (errors are logged, not raised).
    """
    try:
        # If parameters aren't provided, use globals
        project_name = project_name or NEO4J_PROJECT_NAME
        models = models or selected_models

        if not project_name:
            logger.error("No project name provided for meta judge query")
            return pd.DataFrame()

        # Everything that varies per call travels as a query parameter
        query_params = {'project_name': project_name}

        # Build model filter clause if models are specified
        model_filter = ""
        if models:
            model_filter = "AND r.model IN $models"
            query_params['models'] = list(models)
        else:
            logger.warning("No models specified for meta judge query, using all available models")

        # Build target ID filter clause if target_ids are specified
        # (explicit None/len check so array-like inputs work as well as lists)
        target_id_filter = ""
        if target_ids is not None and len(target_ids) > 0:
            target_id_filter = "AND target.id IN $target_ids"
            # IDs are compared as strings, matching how they were quoted before
            query_params['target_ids'] = [str(target_id) for target_id in target_ids]
            logger.info(f"Filtering meta judge query to {len(target_ids)} specific target IDs")

        # Query for meta-judge links
        meta_judge_query = f"""
        MATCH (p:Project {{name: $project_name}})-[:CONTAINS]->(d:Document)-[:CONTAINS]->(source:Requirement)-[r:LLM_RESULT_META_JUDGE]->(target:Requirement)
        WHERE source.type = 'SOURCE' {model_filter} {target_id_filter}
        RETURN
            p.name as project_name,
            source.id as source_id,
            source.description as source_description,
            target.id as target_id,
            target.description as target_description,
            r.is_traceable as is_traceable,
            r.judge_score as judge_score,
            r.semantic_alignment as semantic_alignment,
            r.non_functional_coverage as non_functional_coverage,
            r.final_score as final_score,
            r.actor_score as actor_score,
            r.functional_completeness as functional_completeness,
            r.model as model
        ORDER BY source.id, target.id
        """

        with driver.session() as session:
            logger.info(f"Executing meta judge query for project: {project_name}")
            results = session.run(meta_judge_query, **query_params).data()

            if not results:
                logger.warning(f"No meta judge links found for project: {project_name}")
                return pd.DataFrame()

            # Convert to DataFrame
            meta_judge_df = pd.DataFrame(results)

            # Convert boolean columns to proper boolean type
            # (values are compared via their string form; missing values become False)
            if 'is_traceable' in meta_judge_df.columns:
                meta_judge_df['is_traceable'] = meta_judge_df['is_traceable'].map(
                    lambda x: str(x).lower() == 'true' if pd.notna(x) else False
                )

            # Convert numeric columns to float (unparseable values become NaN)
            numeric_cols = [
                'judge_score', 'semantic_alignment', 'non_functional_coverage',
                'final_score', 'actor_score', 'functional_completeness'
            ]

            for col in numeric_cols:
                if col in meta_judge_df.columns:
                    meta_judge_df[col] = pd.to_numeric(meta_judge_df[col], errors='coerce')

            # Calculate total score as judge_score + actor_score
            if 'judge_score' in meta_judge_df.columns and 'actor_score' in meta_judge_df.columns:
                meta_judge_df['total_score'] = meta_judge_df['judge_score'] + meta_judge_df['actor_score']

            # Add meta_judge_threshold column based on MIN_TRACEABILITY_THRESHOLD
            if 'total_score' in meta_judge_df.columns:
                meta_judge_df['meta_judge_threshold'] = meta_judge_df['total_score'] >= MIN_TRACEABILITY_THRESHOLD

            # Count metrics for logging
            logger.info(f"Retrieved {len(meta_judge_df)} meta judge links")
            if 'is_traceable' in meta_judge_df.columns:
                traceable_count = meta_judge_df['is_traceable'].sum()
                logger.info(f"Traceable links: {traceable_count} ({traceable_count/len(meta_judge_df)*100:.2f}%)")

            return meta_judge_df

    except Exception as e:
        logger.error(f"Error querying Neo4j for meta judge links: {str(e)}")
        logger.error("Exception details:", exc_info=True)
        return pd.DataFrame()

# Run the meta judge query and summarise the result
try:
    # Target IDs are only collected from frames that earlier cell runs have
    # already placed in the notebook namespace; otherwise both lists stay empty
    sifp_target_ids = []
    ground_truth_target_ids = []

    # Target IDs that have SIFP data
    if 'sifp_results_df' in globals():
        loaded_sifp_df = globals()['sifp_results_df']
        if not loaded_sifp_df.empty:
            if 'sifp_requirement_id' in loaded_sifp_df.columns:
                sifp_target_ids = loaded_sifp_df['sifp_requirement_id'].unique().tolist()
            print(f"Found {len(sifp_target_ids)} unique target IDs with SIFP data")

    # Target IDs that appear in the ground truth
    if 'ground_truth_df' in globals():
        loaded_ground_truth_df = globals()['ground_truth_df']
        if not loaded_ground_truth_df.empty:
            ground_truth_target_ids = loaded_ground_truth_df['target_id'].unique().tolist()
            print(f"Found {len(ground_truth_target_ids)} unique target IDs in ground truth data")

    # Restrict the query only when both sources contributed IDs:
    # keep the targets that have SIFP data AND ground truth
    filter_target_ids = None
    if sifp_target_ids and ground_truth_target_ids:
        filter_target_ids = list(set(sifp_target_ids) & set(ground_truth_target_ids))
        print(f"Filtering meta judge query to {len(filter_target_ids)} target IDs that have both SIFP data and ground truth")

    # Query meta judge data (unfiltered when filter_target_ids is None)
    meta_judge_df = query_meta_judge_links(driver, target_ids=filter_target_ids)

    print("Meta Judge Traceability Results:")
    print("================================")

    if meta_judge_df.empty:
        print("No meta judge data found. Please check your database connection and project name.")
    else:
        # General information about the dataset
        total_links = len(meta_judge_df)
        print(f"Total links analyzed: {total_links}")

        if 'model' in meta_judge_df.columns:
            model_counts = meta_judge_df['model'].value_counts()
            print("Links by model:")
            for model, count in model_counts.items():
                print(f"  {model}: {count}")

        if 'is_traceable' in meta_judge_df.columns:
            traceable_count = meta_judge_df['is_traceable'].sum()
            non_traceable_count = total_links - traceable_count
            print("Traceability Distribution:")
            print(f"  Traceable links: {traceable_count} ({traceable_count/total_links*100:.2f}%)")
            print(f"  Non-traceable links: {non_traceable_count} ({non_traceable_count/total_links*100:.2f}%)")

        # Preview: show whichever of the key columns are present
        print("Sample of meta judge data:")
        display_cols = [
            col
            for col in ['source_id', 'target_id', 'model', 'is_traceable', 'judge_score', 'actor_score', 'total_score']
            if col in meta_judge_df.columns
        ]
        print(meta_judge_df[display_cols].head())
except Exception as e:
    print(f"Error retrieving meta judge data: {str(e)}")

In [None]:
# Cell [4] - Query Ground Truth Links
# Purpose: Retrieve ground truth traceability links for validation and baseline comparison
# Dependencies: neo4j, pandas, logging
# Breadcrumbs: Data Retrieval -> Ground Truth -> Validation Data

def query_ground_truth_links(driver, project_name=None):
    """
    Fetch the GROUND_TRUTH requirement links of a project from Neo4j

    Parameters:
        driver: Neo4j driver connection
        project_name (str, optional): Project to query. Falls back to NEO4J_PROJECT_NAME when not given.

    Returns:
        pd.DataFrame: One row per ground truth link (SOURCE -> TARGET), with a
            'pair_id' column ("<source_id>_<target_id>") added for merging.
            Empty DataFrame when no project name is available, nothing is
            found, or an error occurs (errors are logged, not raised).
    """
    try:
        # Fall back to the notebook-level project name
        project_name = project_name if project_name else NEO4J_PROJECT_NAME

        if not project_name:
            logger.error("No project name provided for ground truth query")
            return pd.DataFrame()

        # Query for ground truth links
        ground_truth_query = """
        MATCH (p:Project {name: $project_name})-[:CONTAINS]->(d:Document)-[:CONTAINS]->(source:Requirement)-[r:GROUND_TRUTH]->(target:Requirement)
        WHERE source.type = 'SOURCE' AND target.type = 'TARGET'
        RETURN 
            p.name as project_name,
            source.id as source_id,
            source.description as source_description,
            target.id as target_id,
            target.description as target_description,
            1 as ground_truth,
            d.id as document_id
        ORDER BY source.id, target.id
        """

        with driver.session() as session:
            logger.info(f"Executing ground truth query for project: {project_name}")
            records = session.run(ground_truth_query, project_name=project_name).data()

            if not records:
                logger.warning(f"No ground truth links found for project: {project_name}")
                return pd.DataFrame()

            links_df = pd.DataFrame(records)
            has_source = 'source_id' in links_df.columns
            has_target = 'target_id' in links_df.columns

            # pair_id = source_id + "_" + target_id, used as a merge key
            if has_source and has_target:
                links_df['pair_id'] = links_df['source_id'] + "_" + links_df['target_id']

            # Summary figures for the log
            logger.info(f"Retrieved {len(links_df)} ground truth links")
            if has_source:
                source_count = links_df['source_id'].nunique()
            else:
                source_count = 0
            if has_target:
                target_count = links_df['target_id'].nunique()
            else:
                target_count = 0
            logger.info(f"Unique source requirements: {source_count}")
            logger.info(f"Unique target requirements: {target_count}")

            return links_df

    except Exception as e:
        logger.error(f"Error querying Neo4j for ground truth links: {str(e)}")
        logger.error("Exception details:", exc_info=True)
        return pd.DataFrame()

# Run the ground truth query and summarise the result
try:
    ground_truth_df = query_ground_truth_links(driver)
    print("Ground Truth Links:")
    print("==================")

    if ground_truth_df.empty:
        print("No ground truth data found. Please check your database connection and project name.")
    else:
        # General information about the dataset
        print(f"Total ground truth links: {len(ground_truth_df)}")

        # Distinct requirements on each side of the links
        source_count = ground_truth_df['source_id'].nunique()
        target_count = ground_truth_df['target_id'].nunique()
        print(f"Unique source requirements: {source_count}")
        print(f"Unique target requirements: {target_count}")

        # Link density = observed links / all possible source-target pairs
        if source_count > 0 and target_count > 0:
            link_density = len(ground_truth_df) / (source_count * target_count)
            print(f"Link density: {link_density:.4f}")

        # Preview: show whichever of the key columns are present
        print("Sample of ground truth data:")
        display_cols = [
            col for col in ['source_id', 'target_id', 'ground_truth']
            if col in ground_truth_df.columns
        ]
        print(ground_truth_df[display_cols].head())

        # Set of pair IDs for fast membership tests
        if 'pair_id' in ground_truth_df.columns:
            ground_truth_links = set(ground_truth_df['pair_id'])
            print(f"Created fast lookup set with {len(ground_truth_links)} ground truth link pairs")
except Exception as e:
    print(f"Error retrieving ground truth data: {str(e)}")

In [None]:
# Cell [5] - Query SIFP Estimation Data
# Purpose: Retrieve SiFP estimations and actual metrics from requirements analysis
# Dependencies: neo4j, pandas, logging
# Breadcrumbs: Data Retrieval -> SIFP Estimations -> Requirements Metrics

def query_sifp_estimations(driver, project_name=None, models=None, requirement_type=None, target_ids=None):
    """
    Query SIFP estimation data from Neo4j database

    The model and target-ID filters are sent as Cypher parameters ($models,
    $target_ids) instead of being spliced into the query text, so values that
    contain quotes or backslashes cannot break or alter the query. The query
    parses the JSON stored on each SIFP_ESTIMATION relationship with
    apoc.convert.fromJsonMap and keeps one estimation per (requirement, model).

    Parameters:
        driver: Neo4j driver connection
        project_name (str, optional): Project name to query. If None, uses NEO4J_PROJECT_NAME from environment.
        models (list, optional): List of models to query. If None, uses selected_models from environment.
        requirement_type (str, optional): Requirement type to query. If None, uses SIFP_ESTIMATION_REQUIREMENT from environment.
        target_ids (list, optional): List of requirement IDs to filter by (compared as strings).
            If None or empty, retrieves all requirement IDs.

    Returns:
        pd.DataFrame: DataFrame containing SIFP estimation results.
            'sifp_is_valid' is converted to bool and the score/total columns to float.
            'sifp_difference' (final total - actor total), 'sifp_pct_difference'
            (relative to the actor total, NaN where the actor total is 0) and
            'sifp_abs_pct_difference' are added.
            An empty DataFrame is returned when no project name is available,
            nothing matches, or any error occurs (errors are logged, not raised).
    """
    try:
        # If parameters aren't provided, use globals
        project_name = project_name or NEO4J_PROJECT_NAME
        models = models or selected_models
        requirement_type = requirement_type or SIFP_ESTIMATION_REQUIREMENT

        if not project_name:
            logger.error("No project name provided for SIFP estimation query")
            return pd.DataFrame()

        # Everything that varies per call travels as a query parameter
        query_params = {
            'project_name': project_name,
            'requirement_type': requirement_type,
        }

        # Build model filter clause if models are specified
        model_filter = ""
        if models:
            model_filter = "AND s.model IN $models"
            query_params['models'] = list(models)

        # Build target ID filter clause if target_ids are specified
        # (explicit None/len check so array-like inputs work as well as lists)
        target_id_filter = ""
        if target_ids is not None and len(target_ids) > 0:
            target_id_filter = "AND r.id IN $target_ids"
            # IDs are compared as strings, matching how they were quoted before
            query_params['target_ids'] = [str(target_id) for target_id in target_ids]
            logger.info(f"Filtering SIFP query to {len(target_ids)} specific target IDs")

        # Query for SIFP estimations
        sifp_query = f"""
        MATCH (p:Project {{name: $project_name}})
        MATCH (p)-[:CONTAINS*]->(n)
        MATCH (r:Requirement)-[s:SIFP_ESTIMATION]->(e)
        WHERE r = n
        AND r.type = $requirement_type
        {model_filter}
        {target_id_filter}
        WITH r, s
        WHERE s.actor_analysis IS NOT NULL
        AND s.judge_evaluation IS NOT NULL
        AND s.final_estimation IS NOT NULL
        WITH r, s,
            CASE
                WHEN s.actor_analysis STARTS WITH '{{'
                THEN apoc.convert.fromJsonMap(s.actor_analysis)
                ELSE NULL
            END as actor_analysis,
            CASE
                WHEN s.judge_evaluation STARTS WITH '{{'
                THEN apoc.convert.fromJsonMap(s.judge_evaluation)
                ELSE NULL
            END as judge_eval,
            CASE
                WHEN s.final_estimation STARTS WITH '{{'
                THEN apoc.convert.fromJsonMap(s.final_estimation)
                ELSE NULL
            END as final_est
        WHERE actor_analysis IS NOT NULL
        AND final_est IS NOT NULL
        WITH r.id as sifp_requirement_id,
            s.is_valid as sifp_is_valid,
            s.model as sifp_model,
            s.judge_score as sifp_judge_score,
            s.judge_confidence as sifp_judge_confidence,
            // Actor Analysis values
            actor_analysis.confidence as sifp_actor_confidence,
            actor_analysis.sifp_points.total as sifp_actor_total,
            // Judge Evaluation values
            judge_eval.ugep_accuracy as sifp_judge_ugep_accuracy,
            judge_eval.ugdg_accuracy as sifp_judge_ugdg_accuracy,
            judge_eval.calculation_accuracy as sifp_judge_calculation_accuracy,
            judge_eval.component_classification_accuracy as sifp_judge_classification_accuracy,
            // Final Estimation values
            final_est.sifp_points.total as sifp_final_total
        WITH sifp_requirement_id, sifp_model,
            COLLECT([sifp_is_valid, sifp_judge_score, sifp_judge_confidence,
            sifp_actor_confidence, sifp_actor_total, sifp_judge_ugep_accuracy,
            sifp_judge_ugdg_accuracy, sifp_judge_calculation_accuracy,
            sifp_judge_classification_accuracy, sifp_final_total])[0] as fields
        RETURN
            sifp_requirement_id,
            sifp_model,
            fields[0] as sifp_is_valid,
            fields[1] as sifp_judge_score,
            fields[2] as sifp_judge_confidence,
            // Actor Analysis values
            fields[3] as sifp_actor_confidence,
            fields[4] as sifp_actor_total,
            // Judge Evaluation values
            fields[5] as sifp_judge_ugep_accuracy,
            fields[6] as sifp_judge_ugdg_accuracy,
            fields[7] as sifp_judge_calculation_accuracy,
            fields[8] as sifp_judge_classification_accuracy,
            // Final Estimation values
            fields[9] as sifp_final_total
        """

        with driver.session() as session:
            logger.info(f"Executing SIFP estimation query for project: {project_name}")
            results = session.run(sifp_query, **query_params).data()

            if not results:
                logger.warning(f"No SIFP estimation results found for project: {project_name}")
                return pd.DataFrame()

            # Convert to DataFrame
            sifp_results_df = pd.DataFrame(results)

            # Convert boolean columns to proper boolean type
            # (values are compared via their string form; missing values become False)
            if 'sifp_is_valid' in sifp_results_df.columns:
                sifp_results_df['sifp_is_valid'] = sifp_results_df['sifp_is_valid'].map(
                    lambda x: str(x).lower() == 'true' if pd.notna(x) else False
                )

            # Convert numeric columns to float (unparseable values become NaN)
            numeric_cols = [
                'sifp_judge_score', 'sifp_judge_confidence', 'sifp_actor_confidence', 'sifp_actor_total',
                'sifp_judge_ugep_accuracy', 'sifp_judge_ugdg_accuracy', 'sifp_judge_calculation_accuracy',
                'sifp_judge_classification_accuracy', 'sifp_final_total'
            ]

            for col in numeric_cols:
                if col in sifp_results_df.columns:
                    sifp_results_df[col] = pd.to_numeric(sifp_results_df[col], errors='coerce')

            # Add columns to help with analysis
            if 'sifp_actor_total' in sifp_results_df.columns and 'sifp_final_total' in sifp_results_df.columns:
                # Calculate difference between actor and final estimations
                sifp_results_df['sifp_difference'] = sifp_results_df['sifp_final_total'] - sifp_results_df['sifp_actor_total']

                # Calculate percentage difference; rows with a zero actor total stay NaN
                # to avoid dividing by zero
                nonzero_mask = sifp_results_df['sifp_actor_total'] != 0
                sifp_results_df['sifp_pct_difference'] = np.nan
                sifp_results_df.loc[nonzero_mask, 'sifp_pct_difference'] = (
                    (sifp_results_df.loc[nonzero_mask, 'sifp_final_total'] -
                     sifp_results_df.loc[nonzero_mask, 'sifp_actor_total']) /
                    sifp_results_df.loc[nonzero_mask, 'sifp_actor_total'] * 100
                )

                # Calculate absolute percentage difference
                sifp_results_df['sifp_abs_pct_difference'] = sifp_results_df['sifp_pct_difference'].abs()

            # Log summary statistics
            logger.info(f"Retrieved {len(sifp_results_df)} SIFP estimation results")

            if 'sifp_model' in sifp_results_df.columns:
                logger.info(f"Models in SIFP data: {sifp_results_df['sifp_model'].unique()}")

            if 'sifp_actor_total' in sifp_results_df.columns and 'sifp_final_total' in sifp_results_df.columns:
                logger.info(f"Average actor total: {sifp_results_df['sifp_actor_total'].mean():.2f}")
                logger.info(f"Average final total: {sifp_results_df['sifp_final_total'].mean():.2f}")

            return sifp_results_df

    except Exception as e:
        logger.error(f"Error querying Neo4j for SIFP estimations: {str(e)}")
        logger.error("Exception details:", exc_info=True)
        return pd.DataFrame()

# Execute query and get results
# Collects the target IDs found in the ground truth data (when usable), runs the
# SIFP estimation query with them, and prints a short report of the result.
try:
    # First, extract target IDs from ground truth data.
    # The lookup is guarded so that a ground_truth_df that is None, or that has
    # no 'target_id' column, no longer raises inside this try block (which used
    # to skip the query altogether). In those cases the query receives an empty
    # ID list, exactly as it does when ground_truth_df has not been defined.
    ground_truth_target_ids = []
    ground_truth_source = globals().get('ground_truth_df')
    if ground_truth_source is not None and not ground_truth_source.empty:
        if 'target_id' in ground_truth_source.columns:
            ground_truth_target_ids = ground_truth_source['target_id'].unique().tolist()
            print(f"Found {len(ground_truth_target_ids)} unique target IDs in ground truth data")
        else:
            print("Ground truth data has no 'target_id' column; passing no target IDs to the SIFP query")
    
    # Query SIFP data, passing the target IDs gathered from ground truth
    sifp_results_df = query_sifp_estimations(driver, target_ids=ground_truth_target_ids)
    
    print("SIFP Estimation Results:")
    print("========================")
    
    if not sifp_results_df.empty:
        # Display general info about the dataset
        print(f"Total SIFP estimations: {len(sifp_results_df)}")
        
        if 'sifp_model' in sifp_results_df.columns:
            model_counts = sifp_results_df['sifp_model'].value_counts()
            print("Estimations by model:")
            for model, count in model_counts.items():
                print(f"  {model}: {count}")
        
        # Display statistics on SIFP estimations
        if 'sifp_actor_total' in sifp_results_df.columns and 'sifp_final_total' in sifp_results_df.columns:
            print("SIFP Estimation Statistics:")
            print(f"  Average actor estimation: {sifp_results_df['sifp_actor_total'].mean():.2f} points")
            print(f"  Average judge estimation: {sifp_results_df['sifp_final_total'].mean():.2f} points")
            print(f"  Average improvement: {sifp_results_df['sifp_difference'].mean():.2f} points")
            print(f"  Average percentage change: {sifp_results_df['sifp_pct_difference'].mean():.2f}%")
        
        # Display sample of data (only the columns that are actually present)
        print("Sample of SIFP estimation data:")
        display_cols = ['sifp_requirement_id', 'sifp_model', 'sifp_actor_total', 'sifp_final_total', 'sifp_difference', 'sifp_pct_difference']
        display_cols = [col for col in display_cols if col in sifp_results_df.columns]
        print(sifp_results_df[display_cols].head())
        
        # Display summary statistics for key metrics
        print("Summary Statistics for Key Metrics:")
        key_metrics = ['sifp_actor_total', 'sifp_final_total', 'sifp_difference', 'sifp_abs_pct_difference']
        key_metrics = [col for col in key_metrics if col in sifp_results_df.columns]
        
        if key_metrics:
            print(sifp_results_df[key_metrics].describe())
    else:
        print("No SIFP estimation data found. Please check your database connection and project name.")
except Exception as e:
    # Notebook-level boundary: report the failure instead of stopping the cell
    print(f"Error retrieving SIFP estimation data: {str(e)}")

In [None]:
# Cell [6] - Model Evaluation and Threshold Optimization
# Purpose: Evaluate traceability links and classify as TP/FP/FN/TN based on F2 optimization
# Dependencies: pandas, numpy, sklearn.metrics
# Breadcrumbs: Data Analysis -> Model Evaluation -> Classification Optimization

def evaluate_model_thresholds(df, model_name, score_column='total_score', 
                             ground_truth_column='ground_truth_traceable', 
                             optimize_for='F2'):
    """
    Evaluate a model's performance across different thresholds
    
    Rows of df whose 'model' value equals model_name are turned into binary
    predictions (score >= threshold) for every candidate threshold, and the
    threshold with the highest F1 or F2 score is reported.
    
    Parameters:
        df: DataFrame containing model predictions and ground truth (needs a 'model' column)
        model_name: Name of the model to evaluate
        score_column: Column containing score values
        ground_truth_column: Column containing ground truth values
        optimize_for: Metric to optimize for ('F1' or 'F2'); any value other than 'F1' selects F2
    
    Returns:
        dict: Dictionary containing evaluation results. Possible shapes:
            - {} when the model has no rows or the ground truth column is missing
            - model_name / data_points / ground truth counts only, when the ground
              truth contains a single class
            - the same plus an 'error' key when no threshold yields both prediction
              classes, or when an exception was caught
            - the full result (best_* metrics, 'threshold_results', 'model_df') otherwise
    """
    try:
        # Filter data for this model
        model_df = df[df['model'] == model_name].copy()
        
        if model_df.empty:
            print(f"No data available for model: {model_name}")
            return {}
            
        if ground_truth_column not in model_df.columns:
            print(f"Ground truth column '{ground_truth_column}' not found for model: {model_name}")
            return {}
        
        # Get ground truth and scores
        y_true = model_df[ground_truth_column].astype(int).values
        
        # Check for and handle None/NaN values in score column
        if model_df[score_column].isna().any():
            print(f"Found NaN values in {score_column} for model {model_name}. Filling with 0.")
            model_df[score_column] = model_df[score_column].fillna(0)
        
        # Ensure scores are numeric
        if model_df[score_column].dtype == object:
            try:
                model_df[score_column] = pd.to_numeric(model_df[score_column])
                print(f"Converted {score_column} to numeric for model {model_name}")
            except Exception as e:
                print(f"Error converting {score_column} to numeric: {str(e)}")
                # Default to zeros if conversion fails
                model_df[score_column] = 0
        
        scores = model_df[score_column].values
        positives = int(y_true.sum())
        negatives = int(len(y_true) - positives)
        
        # Print debug information
        print(f"  - Total data points: {len(model_df)}")
        print(f"  - Positive examples: {y_true.sum()} ({y_true.sum()/len(y_true)*100:.2f}%)")
        print(f"  - Negative examples: {len(y_true) - y_true.sum()} ({(len(y_true) - y_true.sum())/len(y_true)*100:.2f}%)")
        print(f"  - Score range: {scores.min():.4f} to {scores.max():.4f}")
        
        # If all ground truth values are the same, we can't calculate meaningful metrics
        if len(np.unique(y_true)) < 2:
            print(f"Insufficient ground truth variety for model {model_name} - all values are {np.unique(y_true)[0]}")
            return {
                'model_name': model_name,
                'data_points': len(model_df),
                'ground_truth_positive': positives,
                'ground_truth_negative': negatives
            }
        
        # Candidate thresholds: every observed score plus 20 evenly spaced values
        # across the score range. np.unique sorts and removes exact duplicates
        # (repeated scores, and the linspace endpoints that repeat min/max), so
        # no threshold is evaluated twice.
        thresholds = np.unique(np.concatenate([
            scores,
            np.linspace(scores.min(), scores.max(), 20)
        ]))
        
        # Calculate metrics for each threshold
        results = []
        
        for threshold in thresholds:
            # Convert scores to binary predictions using this threshold
            y_pred = (scores >= threshold).astype(int)
            
            # Only calculate if we have at least one prediction of each class
            if np.unique(y_pred).size < 2:
                continue
                
            # Confusion matrix components
            tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
            
            # Basic metrics
            accuracy = accuracy_score(y_true, y_pred)
            balanced_acc = balanced_accuracy_score(y_true, y_pred)
            
            # Handle division by zero
            prec = tp / (tp + fp) if (tp + fp) > 0 else 0  # 0 when there are no positive predictions
            rec = tp / (tp + fn) if (tp + fn) > 0 else 0   # 0 when there is no positive ground truth
            
            f1 = f1_score(y_true, y_pred, zero_division=0)
            f2 = fbeta_score(y_true, y_pred, beta=2, zero_division=0)
            
            # Additional metrics
            tnr = tn / (tn + fp) if (tn + fp) > 0 else 0  # Specificity/True Negative Rate
            fnr = fn / (fn + tp) if (fn + tp) > 0 else 0  # Miss Rate/False Negative Rate
            mcc = matthews_corrcoef(y_true, y_pred)  # Matthews Correlation Coefficient
            
            results.append({
                'threshold': threshold,
                'tp': int(tp),
                'fp': int(fp),
                'fn': int(fn),
                'tn': int(tn),
                'accuracy': accuracy,
                'balanced_accuracy': balanced_acc,
                'precision': prec,
                'recall': rec,
                'tnr': tnr,  # specificity
                'fnr': fnr,  # miss rate
                'f1_score': f1,
                'f2_score': f2,
                'mcc': mcc  # Matthews Correlation Coefficient
            })
        
        # Convert to DataFrame for easier analysis
        results_df = pd.DataFrame(results)
        
        if results_df.empty:
            print(f"No valid thresholds found for model {model_name}")
            return {
                'model_name': model_name,
                'data_points': len(model_df),
                'ground_truth_positive': positives,
                'ground_truth_negative': negatives,
                'error': "No valid thresholds found with current data"
            }
        
        # Find best threshold based on optimization metric (idxmax keeps the
        # lowest threshold when several share the maximum)
        metric_column = 'f1_score' if optimize_for == 'F1' else 'f2_score'
        best_result = results_df.loc[results_df[metric_column].idxmax()]
        
        # Return comprehensive results with the original dataframe for further use.
        # Selecting one row yields a float Series, so the confusion-matrix counts
        # are cast back to int before being returned.
        result_dict = {
            'model_name': model_name,
            'score_column': score_column,
            'data_points': len(model_df),
            'ground_truth_positive': positives,
            'ground_truth_negative': negatives,
            'best_threshold': best_result['threshold'],
            'best_precision': best_result['precision'],
            'best_recall': best_result['recall'],
            'best_accuracy': best_result['accuracy'],
            'best_balanced_accuracy': best_result['balanced_accuracy'],
            'best_f1': best_result['f1_score'],
            'best_f2': best_result['f2_score'],
            'best_tnr': best_result['tnr'],
            'best_fnr': best_result['fnr'],
            'best_mcc': best_result['mcc'],
            'best_tp': int(best_result['tp']),
            'best_fp': int(best_result['fp']),
            'best_fn': int(best_result['fn']),
            'best_tn': int(best_result['tn']),
            'optimization_metric': optimize_for,
            'threshold_results': results_df,
            'model_df': model_df
        }
        
        return result_dict
    except Exception as e:
        # Best-effort boundary: report the failure and hand back an error record
        # so a caller looping over models/score columns can continue.
        print(f"Error evaluating model {model_name}: {str(e)}")
        import traceback
        traceback.print_exc()
        return {
            'model_name': model_name,
            'data_points': len(model_df) if 'model_df' in locals() else 0,
            'error': str(e)
        }

def apply_classification_to_dataframe(df, evaluation_results):
    """
    Apply classification (TP/FP/FN/TN) to a dataframe based on evaluation results
    
    Two columns named "<model_name>_<score_column>_prediction" and
    "<model_name>_<score_column>_classification" are added to a copy of df.
    Only rows belonging to the evaluated model are filled in; all other rows
    are left as NaN in both columns.
    
    Parameters:
        df: DataFrame to apply classification to
        evaluation_results: Dictionary with evaluation results including best threshold
            (keys 'model_name', 'score_column' and 'best_threshold' are required)
    
    Returns:
        pd.DataFrame: DataFrame with added classification columns. The input df
        itself is returned unchanged when evaluation_results is empty, carries
        an 'error' key, or lacks any of the required keys.
    
    Raises:
        KeyError: if the score column named in evaluation_results is not in df
    """
    required_keys = ('model_name', 'score_column', 'best_threshold')
    # Result dicts without a threshold (e.g. the "insufficient ground truth"
    # record) have nothing to apply, so they are treated like error results.
    if (not evaluation_results
            or 'error' in evaluation_results
            or any(key not in evaluation_results for key in required_keys)):
        return df
    
    result_df = df.copy()
    
    # Extract key information from evaluation results
    model_name = evaluation_results['model_name']
    score_column = evaluation_results['score_column']
    threshold = evaluation_results['best_threshold']
    
    # Add columns for this specific model and score type
    col_prefix = f"{model_name}_{score_column}"
    prediction_col = f"{col_prefix}_prediction"
    classification_col = f"{col_prefix}_classification"
    
    # Filter for rows with this model (every row when there is no 'model' column)
    if 'model' in result_df.columns:
        model_mask = (result_df['model'] == model_name)
    else:
        model_mask = pd.Series(True, index=result_df.index)
    
    # Create prediction column (True/False) based on threshold.
    # Rows of this model with a missing score keep the default False.
    result_df.loc[model_mask, prediction_col] = False
    score_mask = model_mask & result_df[score_column].notna()
    result_df.loc[score_mask, prediction_col] = result_df.loc[score_mask, score_column] >= threshold
    
    # Create classification column (TP/FP/FN/TN) based on prediction and ground truth
    result_df.loc[model_mask, classification_col] = 'NA'
    
    # Apply classification logic
    if 'ground_truth_traceable' in result_df.columns:
        truth_raw = result_df['ground_truth_traceable']
        
        # Coerce both operands to real booleans once, so that `~` is a logical
        # negation even when the ground truth is stored as float/int/object.
        # astype(bool) maps NaN to True, hence the explicit notna()/model masks:
        # rows with missing ground truth stay 'NA', and rows of other models
        # (NaN prediction) are never selected.
        truth_known = model_mask & truth_raw.notna()
        truth = truth_raw.astype(bool)
        predicted = model_mask & result_df[prediction_col].astype(bool)
        
        # TP: ground truth positive AND prediction positive
        result_df.loc[truth_known & truth & predicted, classification_col] = 'TP'
        
        # FP: ground truth negative BUT prediction positive
        result_df.loc[truth_known & ~truth & predicted, classification_col] = 'FP'
        
        # FN: ground truth positive BUT prediction negative
        result_df.loc[truth_known & truth & ~predicted, classification_col] = 'FN'
        
        # TN: ground truth negative AND prediction negative
        result_df.loc[truth_known & ~truth & ~predicted, classification_col] = 'TN'
    
    return result_df

def _sifp_merge_coverage(merged_df):
    """
    Measure how many rows of a merged frame picked up SIFP totals
    
    Parameters:
        merged_df: DataFrame holding 'sifp_actor_total' and 'sifp_final_total' columns
    
    Returns:
        tuple: (rows with a non-null actor or final total, that count as a percentage of all rows)
    """
    matched_rows = merged_df[['sifp_actor_total', 'sifp_final_total']].notna().any(axis=1).sum()
    matched_pct = matched_rows / len(merged_df) * 100 if len(merged_df) > 0 else 0
    return matched_rows, matched_pct


def create_combined_dataset(meta_judge_df, ground_truth_df, sifp_results_df):
    """
    Combine meta judge data with traceability classifications and SIFP estimates
    
    Each meta judge link is flagged as ground-truth traceable when its
    (source_id, target_id) pair occurs in ground_truth_df, then left-joined with
    the SIFP estimates: on target_id + model when both frames carry a model
    column (falling back to target_id alone when fewer than 50% of rows match),
    otherwise on target_id alone. None of the three input frames is modified.
    
    Parameters:
        meta_judge_df: DataFrame with meta judge assessments
        ground_truth_df: DataFrame with ground truth traceability data
        sifp_results_df: DataFrame with SIFP estimation data
    
    Returns:
        pd.DataFrame: Combined dataset linking classifications with SIFP data;
        an empty DataFrame when an input is None/empty or the ground truth
        lacks the source_id/target_id columns
    """
    # Validate input data
    if meta_judge_df is None or meta_judge_df.empty:
        logger.error("Meta judge data is empty or None")
        return pd.DataFrame()
    
    if ground_truth_df is None or ground_truth_df.empty:
        logger.error("Ground truth data is empty or None")
        return pd.DataFrame()
    
    if sifp_results_df is None or sifp_results_df.empty:
        logger.error("SIFP results data is empty or None")
        return pd.DataFrame()
    
    logger.info(f"Creating combined dataset from {len(meta_judge_df)} meta judge links, " 
                f"{len(ground_truth_df)} ground truth links, and {len(sifp_results_df)} SIFP estimates")
    
    if 'source_id' not in ground_truth_df.columns or 'target_id' not in ground_truth_df.columns:
        logger.error("Required source_id and target_id columns missing from ground truth data")
        return pd.DataFrame()
    
    # Create a set of ground truth link pairs for fast lookup.
    # IDs are compared as strings; the conversion is done on temporary Series so
    # the caller's ground_truth_df is left untouched.
    ground_truth_pairs = set(zip(
        ground_truth_df['source_id'].astype(str),
        ground_truth_df['target_id'].astype(str)
    ))
    logger.info(f"Created ground truth lookup set with {len(ground_truth_pairs)} link pairs")
    
    # Create a copy to work with
    result_df = meta_judge_df.copy()
    
    # Ensure IDs are strings
    result_df['source_id'] = result_df['source_id'].astype(str)
    result_df['target_id'] = result_df['target_id'].astype(str)
    
    # Add ground_truth_traceable column (one set lookup per link, no row-wise apply)
    result_df['ground_truth_traceable'] = [
        pair in ground_truth_pairs
        for pair in zip(result_df['source_id'], result_df['target_id'])
    ]
    
    # Add a pair_id for easier merging with SIFP data
    result_df['pair_id'] = result_df['source_id'] + "_" + result_df['target_id']
    
    # Now merge with SIFP data using target_id and model
    # Ensure SIFP data has consistent string IDs
    sifp_df = sifp_results_df.copy()
    sifp_df['sifp_requirement_id'] = sifp_df['sifp_requirement_id'].astype(str)
    
    # Create merge keys that include both target_id and model for SIFP data
    if 'model' in result_df.columns and 'sifp_model' in sifp_df.columns:
        logger.info("Creating merge keys using both target_id and model")
        result_df['merge_key'] = result_df['target_id'] + '_' + result_df['model']
        sifp_df['merge_key'] = sifp_df['sifp_requirement_id'] + '_' + sifp_df['sifp_model']
        
        # Check for duplicates in merge keys
        if sifp_df['merge_key'].duplicated().any():
            logger.warning(f"Found {sifp_df['merge_key'].duplicated().sum()} duplicate merge keys in SIFP estimates. Using first occurrence.")
            sifp_df = sifp_df.drop_duplicates(subset=['merge_key'], keep='first')
        
        # Perform the merge
        combined_df = result_df.merge(
            sifp_df,
            on='merge_key',
            how='left',
            suffixes=('', '_sifp')
        )
        
        # Check merge result
        merge_success, merge_pct = _sifp_merge_coverage(combined_df)
        logger.info(f"Merged dataframe has {len(combined_df)} rows with {merge_success} valid SIFP links ({merge_pct:.1f}%)")
        
        # If the merge didn't work well, try with just target_id as fallback
        if merge_pct < 50:
            logger.warning("Poor merge results with target_id+model. Trying with just target_id as fallback.")
            
            # Fallback to just target_id.
            # NOTE(review): sifp_df is de-duplicated per target+model only, so a
            # target with estimates from several models can match several SIFP
            # rows here and multiply link rows - confirm this is intended.
            combined_df = result_df.merge(
                sifp_df.drop(columns=['merge_key']),
                left_on='target_id',
                right_on='sifp_requirement_id',
                how='left',
                suffixes=('', '_sifp')
            )
            
            # Check fallback merge result
            merge_success, merge_pct = _sifp_merge_coverage(combined_df)
            logger.info(f"Fallback merge with target_id: {merge_success} valid SIFP links ({merge_pct:.1f}%)")
    else:
        # If we don't have model columns in both dataframes, merge on target_id directly
        logger.info("Missing model column in one or both DataFrames, merging on target_id only")
        combined_df = result_df.merge(
            sifp_df,
            left_on='target_id',
            right_on='sifp_requirement_id',
            how='left',
            suffixes=('', '_sifp')
        )
        
        # Check merge result
        merge_success, merge_pct = _sifp_merge_coverage(combined_df)
        logger.info(f"Merged on target_id: {merge_success} valid SIFP links ({merge_pct:.1f}%)")
    
    # Add error metrics if they don't exist
    if 'sifp_abs_error' not in combined_df.columns:
        combined_df['sifp_abs_error'] = (combined_df['sifp_final_total'] - combined_df['sifp_actor_total']).abs()
    
    if 'sifp_pct_error' not in combined_df.columns:
        # Calculate percentage error, handling division by zero
        combined_df['sifp_pct_error'] = np.nan
        nonzero_mask = (combined_df['sifp_actor_total'] != 0) & combined_df['sifp_actor_total'].notna()
        
        combined_df.loc[nonzero_mask, 'sifp_pct_error'] = (
            (combined_df.loc[nonzero_mask, 'sifp_final_total'] - 
             combined_df.loc[nonzero_mask, 'sifp_actor_total']).abs() / 
            combined_df.loc[nonzero_mask, 'sifp_actor_total'] * 100
        )
    
    # Log statistics by classification (only when the input already carries a
    # 'classification' column)
    if 'classification' in combined_df.columns:
        logger.info("Calculating error metrics by classification:")
        for classification in combined_df['classification'].unique():
            if pd.isna(classification):
                continue
                
            class_df = combined_df[combined_df['classification'] == classification]
            if len(class_df) > 0:
                logger.info(f"  {classification}: {len(class_df)} links")
                
                # Compute stats on valid data
                valid_data = class_df[class_df['sifp_abs_error'].notna()]
                if len(valid_data) > 0:
                    logger.info(f"    Mean Absolute Error: {valid_data['sifp_abs_error'].mean():.2f}")
                    
                    pct_data = valid_data[valid_data['sifp_pct_error'].notna()]
                    if len(pct_data) > 0:
                        logger.info(f"    Mean Percentage Error: {pct_data['sifp_pct_error'].mean():.2f}%")
                else:
                    logger.info(f"    No valid SIFP data for {classification} links")
    
    return combined_df

# Cell driver: labels every meta judge link against the ground truth, evaluates
# each model / score column combination with evaluate_model_thresholds, stores
# the per-link TP/FP/FN/TN classifications, and finally combines the classified
# links with the SIFP estimates when those are available.
# Publishes to the notebook globals: classified_df, best_thresholds_df and,
# if the SIFP merge produced rows, combined_df.

# Load environment variables
load_dotenv()

# Get the actual model name
# CURRENT_MODEL is treated as the name of another environment variable; when no
# variable of that name is set, the CURRENT_MODEL value itself is used.
# Neither value is referenced again in this cell.
current_model_var = os.environ.get('CURRENT_MODEL', '')
actual_model_name = os.environ.get(current_model_var, current_model_var)

# Initialize best_thresholds_df as an empty DataFrame (will be filled later)
best_thresholds_df = pd.DataFrame()

# First, make sure meta_judge_df exists and has required data
if 'meta_judge_df' not in globals() or globals()['meta_judge_df'] is None or globals()['meta_judge_df'].empty:
    print("No meta judge data available. Please run previous cells first.")
else:
    meta_judge_df = globals()['meta_judge_df']
    
    # Make sure ground_truth_df exists and has required data
    if 'ground_truth_df' not in globals() or globals()['ground_truth_df'] is None or globals()['ground_truth_df'].empty:
        print("No ground truth data available. Please run previous cells first.")
    else:
        # Create a dataset for evaluation
        ground_truth_df = globals()['ground_truth_df']
        
        print(f"Preparing data for threshold evaluation:")
        print(f"======================================")
        
        # Create a set of ground truth link pairs for fast lookup
        if 'source_id' in ground_truth_df.columns and 'target_id' in ground_truth_df.columns:
            # Convert IDs to strings for consistent comparison
            # (this rewrites the ID columns of the global ground_truth_df in place)
            ground_truth_df['source_id'] = ground_truth_df['source_id'].astype(str)
            ground_truth_df['target_id'] = ground_truth_df['target_id'].astype(str)
            
            ground_truth_pairs = set(zip(ground_truth_df['source_id'], ground_truth_df['target_id']))
            print(f"Created ground truth lookup set with {len(ground_truth_pairs)} link pairs")
            
            # Convert source_id and target_id to strings in meta_judge_df too
            # (also in place on the global meta_judge_df)
            meta_judge_df['source_id'] = meta_judge_df['source_id'].astype(str)
            meta_judge_df['target_id'] = meta_judge_df['target_id'].astype(str)
            
            # Add ground_truth_traceable column to meta_judge_df:
            # True when the link's (source_id, target_id) pair is in the ground truth set
            meta_judge_df['ground_truth_traceable'] = meta_judge_df.apply(
                lambda row: (row['source_id'], row['target_id']) in ground_truth_pairs,
                axis=1
            )
            
            # Add derived score columns if not already present
            if 'judge_score' in meta_judge_df.columns and 'actor_score' in meta_judge_df.columns:
                if 'total_score' not in meta_judge_df.columns:
                    meta_judge_df['total_score'] = meta_judge_df['judge_score'] + meta_judge_df['actor_score']
                
                # Add alternative total scores for comparison
                if 'final_score' in meta_judge_df.columns:
                    if 'total_score_with_final' not in meta_judge_df.columns:
                        meta_judge_df['total_score_with_final'] = meta_judge_df['actor_score'] + meta_judge_df['final_score']
                    
                    if 'total_score_all' not in meta_judge_df.columns:
                        meta_judge_df['total_score_all'] = meta_judge_df['actor_score'] + meta_judge_df['judge_score'] + meta_judge_df['final_score']
            
            # Get list of all models
            all_models = meta_judge_df['model'].unique()
            
            # Set OPTIMIZATION_METRIC if not already in globals
            # (a value defined by an earlier cell is kept; otherwise F2 is used)
            if 'OPTIMIZATION_METRIC' not in globals():
                OPTIMIZATION_METRIC = 'F2'
            else:
                OPTIMIZATION_METRIC = globals()['OPTIMIZATION_METRIC']
            
            print(f"Evaluating {len(all_models)} models using meta judge data")
            print(f"Optimizing for {OPTIMIZATION_METRIC} score")
            print("=" * 80)
            
            # Define score columns to evaluate
            score_columns_to_evaluate = [
                'is_traceable',         # Boolean indicator
                'actor_score',          # Individual score
                'judge_score',          # Individual score
                'final_score',          # Individual score
                'total_score',          # judge_score + actor_score
                'total_score_with_final',  # actor_score + final_score
                'total_score_all'       # actor_score + judge_score + final_score
            ]
            
            # Filter to only columns that exist and hold at least one non-null value
            score_columns_to_evaluate = [
                col for col in score_columns_to_evaluate 
                if col in meta_judge_df.columns and not meta_judge_df[col].isna().all()
            ]
            
            # Evaluate models and score columns
            # evaluation_results collects every usable result across all models;
            # classified_df accumulates one prediction/classification column pair
            # per model and score column.
            evaluation_results = []
            classified_df = meta_judge_df.copy()
            
            for model in all_models:
                print(f"Evaluating model: {model}")
                
                # For each model, evaluate using different score columns
                model_results = []
                for score_column in score_columns_to_evaluate:
                    print(f"  Evaluating using {score_column}:")
                    result = evaluate_model_thresholds(
                        meta_judge_df, 
                        model, 
                        score_column=score_column,
                        optimize_for=OPTIMIZATION_METRIC
                    )
                    
                    # Results without a 'best_threshold' key (empty, error, or
                    # insufficient-data results) are skipped entirely
                    if result and 'best_threshold' in result:
                        # Add score column to result
                        result['score_column'] = score_column
                        model_results.append(result)
                        
                        # Print key metrics
                        print(f"    - Best threshold: {result['best_threshold']:.3f}")
                        print(f"    - Precision: {result['best_precision']:.3f}")
                        print(f"    - Recall: {result['best_recall']:.3f}")
                        print(f"    - F1: {result['best_f1']:.3f}")
                        print(f"    - F2: {result['best_f2']:.3f}")
                        print(f"    - MCC: {result['best_mcc']:.3f}")
                        
                        # Apply classification to the dataframe
                        classified_df = apply_classification_to_dataframe(classified_df, result)
                
                # Find best score column for this model based on optimization metric
                if model_results:
                    # Sort by the chosen optimization metric
                    if OPTIMIZATION_METRIC == 'F1':
                        model_results.sort(key=lambda x: x['best_f1'], reverse=True)
                    else:  # F2
                        model_results.sort(key=lambda x: x['best_f2'], reverse=True)
                        
                    best_result = model_results[0]
                    print(f"  Best performing score for {model}: {best_result['score_column']}")
                    print(f"    - {OPTIMIZATION_METRIC} Score: {best_result['best_f2' if OPTIMIZATION_METRIC == 'F2' else 'best_f1']:.3f}")
                    
                    # Add a 'best_model_score' classification column
                    # (copies the classification column of this model's top-ranked score column)
                    best_col_prefix = f"{model}_{best_result['score_column']}"
                    best_classification_col = f"{best_col_prefix}_classification"
                    classified_df[f"{model}_best_classification"] = classified_df[best_classification_col]
                    
                    evaluation_results.extend(model_results)
            
            # Create DataFrame of best thresholds with all metrics
            if evaluation_results:
                best_thresholds_df = pd.DataFrame([
                    {
                        'model_name': r['model_name'],
                        'score_column': r['score_column'],
                        'best_threshold': r['best_threshold'],
                        'accuracy': r['best_accuracy'],
                        'balanced_accuracy': r['best_balanced_accuracy'],
                        'precision': r['best_precision'],
                        'recall': r['best_recall'],
                        'specificity': r['best_tnr'],
                        'miss_rate': r['best_fnr'],
                        'f1_score': r['best_f1'],
                        'f2_score': r['best_f2'],
                        'matthews_corr': r['best_mcc'],
                        'true_positives': r['best_tp'],
                        'false_positives': r['best_fp'],
                        'false_negatives': r['best_fn'],
                        'true_negatives': r['best_tn'],
                        'data_points': r['data_points'],
                        'ground_truth_positive': r['ground_truth_positive'],
                        'ground_truth_negative': r['ground_truth_negative'],
                    }
                    for r in evaluation_results if 'best_threshold' in r
                ])
                
                # Sort by the appropriate metric (best combination ends up in row 0)
                sort_col = 'f1_score' if OPTIMIZATION_METRIC == 'F1' else 'f2_score'
                best_thresholds_df = best_thresholds_df.sort_values(sort_col, ascending=False).reset_index(drop=True)
                
                print("Best Thresholds by Model and Score Column:")
                print("-" * 80)
                print(best_thresholds_df)
                
                # Add a simple 'classification' column that uses the best model's best score column
                best_model_row = best_thresholds_df.iloc[0]
                best_model = best_model_row['model_name']
                best_score_col = best_model_row['score_column']
                best_classification_col = f"{best_model}_{best_score_col}_classification"
                
                # Create the unified classification column
                if best_classification_col in classified_df.columns:
                    classified_df['classification'] = classified_df[best_classification_col]
                    print(f"Added 'classification' column using best model ({best_model}) and score column ({best_score_col})")
                    
                    # Count by classification
                    classification_counts = classified_df['classification'].value_counts()
                    print("Classification Distribution:")
                    for cls, count in classification_counts.items():
                        print(f"  {cls}: {count} ({count/len(classified_df)*100:.2f}%)")
                
                # Store the classified DataFrame for use in the next cell
                globals()['classified_df'] = classified_df
                globals()['best_thresholds_df'] = best_thresholds_df
                
                # If there's SIFP data available, create combined dataset
                if 'sifp_results_df' in globals() and not globals()['sifp_results_df'].empty:
                    print("Creating combined dataset with SIFP estimation data...")
                    combined_df = create_combined_dataset(
                        classified_df, 
                        ground_truth_df, 
                        globals()['sifp_results_df']
                    )
                    
                    # create_combined_dataset signals failure with an empty DataFrame
                    if not combined_df.empty:
                        globals()['combined_df'] = combined_df
                        print(f"Successfully created combined dataset with {len(combined_df)} rows")
                    else:
                        print("Failed to create combined dataset")
            else:
                print("No valid threshold results found.")
        else:
            print("Required columns missing from ground truth data.")

In [None]:
# Cell [7] - Combine Classified Data with SIFP Estimates
# Purpose: Merge TP/FP/FN classified links with SIFP estimation data for hallucination analysis
# Dependencies: pandas, numpy, logging
# Breadcrumbs: Model Evaluation -> Data Integration -> SIFP Mapping

def _count_rows_with_sifp(merged_df):
    """Count merged rows that carry at least one of the two SIFP totals."""
    return merged_df[['sifp_actor_total', 'sifp_final_total']].notna().any(axis=1).sum()


def _share_pct(part, whole):
    """Return part/whole as a percentage, or 0.0 when whole is zero (empty merge result)."""
    return part / whole * 100 if whole else 0.0


def _merge_links_on_target_id(links_df, sifp_df):
    """Left-join classified links to SIFP rows using the requirement id alone."""
    # A left join emits one output row per matching right-hand row, so repeated
    # requirement ids multiply the classified links. Surface that in the log
    # rather than letting the row count grow silently.
    duplicate_ids = int(sifp_df['sifp_requirement_id'].duplicated().sum())
    if duplicate_ids:
        logger.warning(
            f"SIFP estimates contain {duplicate_ids} repeated sifp_requirement_id values; "
            "a target_id-only merge repeats each matching link once per SIFP row."
        )
    return links_df.merge(
        sifp_df,
        left_on='target_id',
        right_on='sifp_requirement_id',
        how='left',
        suffixes=('', '_sifp')
    )


def _log_sifp_error_summary(label, subset):
    """Log the row count and mean absolute / percentage SIFP error of one group of links."""
    abs_error_mean = subset['sifp_abs_error'].mean()
    pct_error_mean = subset['sifp_pct_error'].mean()  # mean() already skips NaN rows

    logger.info(f"  {label}: {len(subset)} links")
    logger.info(f"    Mean Absolute Error: {abs_error_mean:.2f}")
    logger.info(f"    Mean Percentage Error: {pct_error_mean:.2f}%")


def combine_classification_with_sifp(classified_df=None, sifp_results_df=None, include_fn_as_hallucination=True):
    """
    Combine classified traceability links (TP/FP/FN) with SIFP estimates

    Links are filtered to the classifications under analysis, flagged as
    hallucination / non-hallucination, and left-joined to the SIFP rows.
    The join key is target_id + model when both frames carry a model column;
    if fewer than half of the links pick up SIFP data that way, or a model
    column is missing, the join uses target_id alone.
    Neither input DataFrame is modified.

    Parameters:
        classified_df (pd.DataFrame, optional): DataFrame with classification results;
            the global classified_df is used when omitted or empty
        sifp_results_df (pd.DataFrame, optional): DataFrame with SIFP estimates;
            the global sifp_results_df is used when omitted or empty
        include_fn_as_hallucination (bool): Whether to include FN as hallucination type (default: True)

    Returns:
        pd.DataFrame: Merged links with is_hallucination, hallucination_type,
            sifp_abs_error and sifp_pct_error columns; an empty DataFrame when
            inputs are unavailable, the classification column is missing, or
            any exception is raised
    """
    try:
        # Get required dataframes from globals if not provided
        if classified_df is None or classified_df.empty:
            if 'classified_df' in globals() and not globals()['classified_df'].empty:
                classified_df = globals()['classified_df']
            else:
                logger.error("No classified data available")
                return pd.DataFrame()

        if sifp_results_df is None or sifp_results_df.empty:
            if 'sifp_results_df' in globals() and not globals()['sifp_results_df'].empty:
                sifp_results_df = globals()['sifp_results_df']
            else:
                logger.error("No SIFP estimates data available")
                return pd.DataFrame()

        # Ensure we have a classification column
        if 'classification' not in classified_df.columns:
            logger.error("Missing classification column in data")
            return pd.DataFrame()

        # Work on a private copy: the string casts and merge_key column added
        # below must not leak into the caller's (or the notebook-global) frame.
        sifp_df = sifp_results_df.copy()

        # Choose which classifications take part in the hallucination analysis
        if include_fn_as_hallucination:
            # Include FN as a type of hallucination (missed features)
            analysis_classifications = ['TP', 'FP', 'FN']
            hallucination_label = "Hallucinations (FP+FN)"
            logger.info("Including FN (False Negatives) as hallucination type alongside FP")
        else:
            # Only FP counts as hallucination; FN rows are filtered out below
            analysis_classifications = ['TP', 'FP']
            hallucination_label = "Hallucinations (FP)"
            logger.info("Using only FP (False Positives) as hallucination type")

        # The boolean-mask selection plus .copy() yields an independent frame
        links_df = classified_df[classified_df['classification'].isin(analysis_classifications)].copy()
        logger.info(f"Filtered to {analysis_classifications} classifications: {len(links_df)} rows from {len(classified_df)} total")

        # Add hallucination indicator columns
        # TP = Non-hallucination (correct identification)
        # FP = Hallucination (over-identification)
        # FN = Hallucination (missed features/under-identification)
        type_labels = {
            'TP': 'Correct_Identification',
            'FP': 'Over_Identification',
            'FN': 'Under_Identification',
        }
        links_df['is_hallucination'] = links_df['classification'].isin(['FP', 'FN'])
        links_df['hallucination_type'] = links_df['classification'].apply(
            lambda label: type_labels.get(label, 'Unknown')
        )

        # Ensure the join columns are strings for consistent comparison
        links_df['target_id'] = links_df['target_id'].astype(str)
        sifp_df['sifp_requirement_id'] = sifp_df['sifp_requirement_id'].astype(str)

        if 'model' in links_df.columns and 'sifp_model' in sifp_df.columns:
            links_df['model'] = links_df['model'].astype(str)
            sifp_df['sifp_model'] = sifp_df['sifp_model'].astype(str)

            # Create merge keys that include both target_id and model
            logger.info("Creating merge keys using both target_id and model")
            links_df['merge_key'] = links_df['target_id'] + '_' + links_df['model']
            sifp_df['merge_key'] = sifp_df['sifp_requirement_id'] + '_' + sifp_df['sifp_model']

            # Keep one SIFP row per key so the left join cannot multiply links
            duplicate_keys = sifp_df['merge_key'].duplicated().sum()
            if duplicate_keys:
                logger.warning(f"Found {duplicate_keys} duplicate merge keys in SIFP estimates. Using first occurrence.")
                sifp_df = sifp_df.drop_duplicates(subset=['merge_key'], keep='first')

            # Perform the merge
            merged_df = links_df.merge(
                sifp_df,
                on='merge_key',
                how='left',
                suffixes=('', '_sifp')
            )

            # Check merge result
            logger.info(f"Merged dataframe has {len(merged_df)} rows")
            merge_success = _count_rows_with_sifp(merged_df)
            logger.info(f"Found {merge_success} rows with valid SIFP data after merge ({_share_pct(merge_success, len(merged_df)):.1f}%)")

            # If fewer than half of the links matched, retry on target_id alone
            if merge_success < len(merged_df) * 0.5:
                logger.warning("Poor merge results with target_id+model. Trying with just target_id.")

                merged_df = _merge_links_on_target_id(links_df, sifp_df.drop(columns=['merge_key']))

                merge_success = _count_rows_with_sifp(merged_df)
                logger.info(f"Fallback merge with target_id: {merge_success} rows with valid SIFP data ({_share_pct(merge_success, len(merged_df)):.1f}%)")
        else:
            # Without model columns in both frames, merge on target_id directly
            logger.info("Missing model column in one or both DataFrames, merging on target_id only")
            merged_df = _merge_links_on_target_id(links_df, sifp_df)

            merge_success = _count_rows_with_sifp(merged_df)
            logger.info(f"Merged on target_id: {merge_success} rows with valid SIFP data ({_share_pct(merge_success, len(merged_df)):.1f}%)")

        # Add error metrics if the SIFP data did not already provide them
        if 'sifp_abs_error' not in merged_df.columns:
            merged_df['sifp_abs_error'] = (merged_df['sifp_final_total'] - merged_df['sifp_actor_total']).abs()

        if 'sifp_pct_error' not in merged_df.columns:
            # Percentage error stays NaN where the actor total is zero or missing
            merged_df['sifp_pct_error'] = np.nan
            nonzero_mask = (merged_df['sifp_actor_total'] != 0) & merged_df['sifp_actor_total'].notna()

            merged_df.loc[nonzero_mask, 'sifp_pct_error'] = (
                (merged_df.loc[nonzero_mask, 'sifp_final_total'] -
                 merged_df.loc[nonzero_mask, 'sifp_actor_total']).abs() /
                merged_df.loc[nonzero_mask, 'sifp_actor_total'] * 100
            )

        # Log aggregated error metrics by classification and hallucination status
        if merge_success > 0:
            logger.info("Calculating error metrics by classification and hallucination type:")

            # By individual classification
            for classification in analysis_classifications:
                class_df = merged_df[merged_df['classification'] == classification]
                if len(class_df) > 0:
                    _log_sifp_error_summary(classification, class_df)

            # By hallucination status (TP vs all hallucinations under analysis)
            for is_halluc, group_df in merged_df.groupby('is_hallucination'):
                group_name = hallucination_label if is_halluc else "Correct Identification (TP)"
                _log_sifp_error_summary(group_name, group_df)

        return merged_df

    except Exception as e:
        # Notebook boundary: log with traceback and hand back an empty frame so
        # the calling cell can report the failure instead of aborting.
        logger.error(f"Error combining classified data with SIFP estimates: {str(e)}")
        logger.error("Exception details:", exc_info=True)
        return pd.DataFrame()

# Run the combination process and report on the resulting dataset

def _print_sifp_error_stats(title, subset):
    """Print mean SIFP totals and error figures for one group of rows; prints nothing for an empty group."""
    if len(subset) == 0:
        return

    print(f"  {title} ({len(subset)} rows):")
    print(f"    Mean SIFP Actor Total: {subset['sifp_actor_total'].mean():.2f}")
    print(f"    Mean SIFP Final Total: {subset['sifp_final_total'].mean():.2f}")
    print(f"    Mean Absolute Error: {subset['sifp_abs_error'].mean():.2f}")

    # The percentage error line is only shown when at least one row has a value
    pct_values = subset.loc[subset['sifp_pct_error'].notna(), 'sifp_pct_error']
    if len(pct_values) > 0:
        print(f"    Mean Percentage Error: {pct_values.mean():.2f}%")


# (is_hallucination flag, display label) pairs used by both status reports
_STATUS_LABELS = [(False, "Correct Identification (TP)"), (True, "Hallucinations (FP+FN)")]

print("Combining Classified Links with SIFP Estimates (Including FN as Hallucination):")
print("===============================================================================")

combined_df = combine_classification_with_sifp(include_fn_as_hallucination=True)

if combined_df.empty:
    print("No combined dataset created. Check error logs.")
else:
    total_rows = len(combined_df)
    print(f"Successfully created combined dataset with {total_rows} rows!")

    # Row distribution per classification label
    if 'classification' in combined_df.columns:
        classification_counts = combined_df['classification'].value_counts()
        print("Distribution by classification:")
        for label, n_rows in classification_counts.items():
            print(f"  {label}: {n_rows} ({n_rows/total_rows*100:.1f}%)")

    # Row distribution per hallucination flag
    if 'is_hallucination' in combined_df.columns:
        hallucination_counts = combined_df['is_hallucination'].value_counts()
        print("Distribution by hallucination status:")
        for flag, status_label in _STATUS_LABELS:
            n_rows = hallucination_counts.get(flag, 0)
            print(f"  {status_label}: {n_rows} ({n_rows/total_rows*100:.1f}%)")

    # Row distribution per hallucination type
    if 'hallucination_type' in combined_df.columns:
        type_counts = combined_df['hallucination_type'].value_counts()
        print("Distribution by hallucination type:")
        for type_label, n_rows in type_counts.items():
            print(f"  {type_label}: {n_rows} ({n_rows/total_rows*100:.1f}%)")

    # How many rows carry both SIFP totals
    sifp_count = combined_df[['sifp_actor_total', 'sifp_final_total']].notna().all(axis=1).sum()
    print(f"Rows with complete SIFP data: {sifp_count} ({sifp_count/total_rows*100:.1f}%)")

    if sifp_count > 0 and 'classification' in combined_df.columns:
        # Error metrics per classification
        print("SIFP Error Metrics by Classification:")
        has_error_inputs = combined_df[['sifp_abs_error', 'sifp_actor_total']].notna().all(axis=1)
        for classification in ['TP', 'FP', 'FN']:
            in_class = combined_df['classification'] == classification
            _print_sifp_error_stats(f"{classification} Links", combined_df[in_class & has_error_inputs])

        # Error metrics per hallucination status
        print("SIFP Error Metrics by Hallucination Status:")
        for flag, status_label in _STATUS_LABELS:
            in_group = combined_df['is_hallucination'] == flag
            _print_sifp_error_stats(status_label, combined_df[in_group & has_error_inputs])

    # Preview of the combined dataset, limited to the columns that exist
    print(f"Sample of combined dataset ({min(5, total_rows)} rows):")
    preferred_cols = (
        'classification', 'is_hallucination', 'hallucination_type', 'source_id', 'target_id', 'model',
        'sifp_actor_total', 'sifp_final_total', 'sifp_abs_error', 'sifp_pct_error',
    )
    display_cols = [name for name in preferred_cols if name in combined_df.columns]
    print(combined_df[display_cols].head())

    # Store in global namespace for next cells
    globals()['combined_df'] = combined_df

In [None]:
# Cell [8] - Calculate Estimation Accuracy Metrics
# Purpose: Compute accuracy metrics for SiFP estimations by classification type including FN hallucinations
# Dependencies: pandas, numpy, scipy.stats, sklearn.metrics
# Breadcrumbs: Data Integration -> Accuracy Analysis -> SiFP Metrics

def calculate_sifp_accuracy_metrics(df=None, group_by_classification=True, include_hallucination_analysis=True):
    """
    Measure how closely SIFP final totals match actor totals, overall and per link group

    Parameters:
        df (pd.DataFrame, optional): Combined dataset with traceability and SIFP data;
            the global combined_df is used when omitted
        group_by_classification (bool): Whether to add a metrics table split by TP/FP/FN
        include_hallucination_analysis (bool): Whether to add tables split by
            is_hallucination and by hallucination_type (when those columns exist)

    Returns:
        dict: Metric tables keyed 'overall', 'by_classification',
            'by_hallucination_status' and 'by_hallucination_type' (only those that
            were computed); an empty dict when no usable data exists or an
            exception is raised
    """
    # Every statistic reported besides 'count', in output order
    stat_names = (
        'mae', 'mse', 'rmse', 'mape', 'r2', 'pearson_r', 'pearson_p',
        'mean_abs_error', 'mean_pct_error', 'std_abs_error', 'std_pct_error',
        'mean_actor_total', 'mean_final_total', 'median_actor_total', 'median_final_total',
    )

    try:
        # Fall back to the notebook-level combined dataset
        if df is None:
            df = globals().get('combined_df', pd.DataFrame())

        if df.empty:
            logger.error("No data available for calculating accuracy metrics")
            return {}

        # Only rows with both SIFP totals can be scored
        sifp_df = df.dropna(subset=['sifp_actor_total', 'sifp_final_total'])

        if sifp_df.empty:
            logger.error("No SIFP data available for calculating accuracy metrics")
            return {}

        logger.info(f"Calculating accuracy metrics on {len(sifp_df)} links with SIFP data")

        def _metrics_for(data):
            """Return one Series of error and correlation statistics for the given rows."""
            if len(data) == 0:
                # Same keys as the populated case so every result has one shape
                return pd.Series({'count': 0, **{name: np.nan for name in stat_names}})

            actual = data['sifp_actor_total']
            estimated = data['sifp_final_total']

            # Standard regression error metrics
            mae = mean_absolute_error(actual, estimated)
            mse = mean_squared_error(actual, estimated)

            # MAPE is defined only over rows whose actual value is non-zero
            nonzero = actual != 0
            if nonzero.sum() > 0:
                mape = np.mean(np.abs((actual[nonzero] - estimated[nonzero]) / actual[nonzero])) * 100
            else:
                mape = np.nan

            # R-squared and Pearson correlation need at least two rows
            if len(data) > 1:
                r2 = r2_score(actual, estimated)
                pearson_r, pearson_p = stats.pearsonr(actual, estimated)
            else:
                r2 = pearson_r = pearson_p = np.nan

            # The precomputed error columns are optional
            has_abs = 'sifp_abs_error' in data.columns
            has_pct = 'sifp_pct_error' in data.columns

            return pd.Series({
                'count': len(data),
                'mae': mae,
                'mse': mse,
                'rmse': np.sqrt(mse),
                'mape': mape,
                'r2': r2,
                'pearson_r': pearson_r,
                'pearson_p': pearson_p,
                'mean_abs_error': data['sifp_abs_error'].mean() if has_abs else np.nan,
                'mean_pct_error': data['sifp_pct_error'].mean() if has_pct else np.nan,
                'std_abs_error': data['sifp_abs_error'].std() if has_abs else np.nan,
                'std_pct_error': data['sifp_pct_error'].std() if has_pct else np.nan,
                'mean_actor_total': actual.mean(),
                'mean_final_total': estimated.mean(),
                'median_actor_total': actual.median(),
                'median_final_total': estimated.median(),
            })

        def _overall_first_table(grouped, overall, relabel=None):
            """Build a groups-by-metrics table with the 'Overall' row placed first."""
            table = pd.DataFrame()
            for key in grouped.index:
                column_name = relabel(key) if relabel is not None else key
                table[column_name] = grouped.loc[key]
            table['Overall'] = overall

            # Rows become groups, columns become metrics
            table = table.T
            row_order = ['Overall'] + [row for row in table.index if row != 'Overall']
            return table.reindex(row_order)

        # Metrics over every scored link
        global_metrics = _metrics_for(sifp_df)
        results = {'overall': pd.DataFrame([global_metrics], index=['Overall'])}

        # Metrics per individual classification
        if group_by_classification and 'classification' in sifp_df.columns:
            present_classes = [
                label for label in ('TP', 'FP', 'FN')
                if label in sifp_df['classification'].values
            ]

            if present_classes:
                in_scope = sifp_df[sifp_df['classification'].isin(present_classes)]
                per_class = in_scope.groupby('classification').apply(_metrics_for)

                classification_table = _overall_first_table(per_class, global_metrics)
                results['by_classification'] = classification_table
                logger.info(f"Calculated accuracy metrics for {len(classification_table)} classification groups")

        # Metrics per hallucination status
        if include_hallucination_analysis and 'is_hallucination' in sifp_df.columns:
            per_status = sifp_df.groupby('is_hallucination').apply(_metrics_for)
            results['by_hallucination_status'] = _overall_first_table(
                per_status,
                global_metrics,
                relabel=lambda flag: "Hallucinations_FP_FN" if flag else "Correct_Identification_TP",
            )
            logger.info("Calculated hallucination analysis metrics")

        # Metrics per hallucination type
        if include_hallucination_analysis and 'hallucination_type' in sifp_df.columns:
            per_type = sifp_df.groupby('hallucination_type').apply(_metrics_for)
            results['by_hallucination_type'] = _overall_first_table(per_type, global_metrics)
            logger.info("Calculated hallucination type analysis metrics")

        return results

    except Exception as e:
        logger.error(f"Error calculating accuracy metrics: {e}")
        logger.error("Exception details:", exc_info=True)
        return {}

# Calculate accuracy metrics and print the comparison reports

def _rounded_for_display(frame, places=2):
    """Return a copy of frame with its numeric columns rounded for printing."""
    shown = frame.copy()
    numeric = shown.select_dtypes(include=[np.number]).columns
    shown[numeric] = shown[numeric].round(places)
    return shown


try:
    accuracy_results = calculate_sifp_accuracy_metrics(include_hallucination_analysis=True)
    print("SIFP Estimation Accuracy Metrics with Hallucination Analysis:")
    print("============================================================")

    if not accuracy_results:
        print("No accuracy metrics could be calculated. Check logs for details.")
    else:
        # Overall metrics
        if 'overall' in accuracy_results:
            print("Overall Metrics:")
            print(accuracy_results['overall'].round(2))
            print()

        # Metrics by individual classification (TP, FP, FN)
        if 'by_classification' in accuracy_results:
            print("Metrics by Individual Classification:")
            display_metrics = _rounded_for_display(accuracy_results['by_classification'])
            print(display_metrics)
            print()

            # Side-by-side view needs at least two classes to compare
            available_classes = [label for label in ['TP', 'FP', 'FN'] if label in display_metrics.index]
            if len(available_classes) >= 2:
                print("Classification Comparison:")
                wanted = ['count', 'mean_abs_error', 'mean_pct_error', 'rmse', 'mape', 'r2']
                comparison_metrics = [name for name in wanted if name in display_metrics.columns]

                class_comparison = display_metrics.loc[available_classes, comparison_metrics]
                print(class_comparison)
                print()

        # Metrics by hallucination status (TP vs FP+FN)
        if 'by_hallucination_status' in accuracy_results:
            print("Metrics by Hallucination Status (KEY ANALYSIS):")
            halluc_metrics = _rounded_for_display(accuracy_results['by_hallucination_status'])
            print(halluc_metrics)
            print()

            tp_label = 'Correct_Identification_TP'
            halluc_label = 'Hallucinations_FP_FN'
            if tp_label in halluc_metrics.index and halluc_label in halluc_metrics.index:
                print("TP vs Hallucinations (FP+FN) Comparison:")
                wanted = ['count', 'mean_abs_error', 'mean_pct_error', 'rmse', 'mape', 'r2']
                comparison_metrics = [name for name in wanted if name in halluc_metrics.columns]

                tp_halluc_comparison = halluc_metrics.loc[[tp_label, halluc_label], comparison_metrics]

                if not tp_halluc_comparison.empty:
                    # Absolute and relative gap, both taken from the rounded display values
                    tp_row = tp_halluc_comparison.loc[tp_label]
                    halluc_row = tp_halluc_comparison.loc[halluc_label]
                    diff = tp_row - halluc_row
                    pct_diff = ((tp_row - halluc_row) / halluc_row * 100).round(1)

                    # A percentage difference of row counts is not meaningful
                    if 'count' in pct_diff.index:
                        pct_diff['count'] = np.nan

                    # Append the two comparison rows beneath the group rows
                    tp_halluc_comparison.loc['Difference (TP - Hallucinations)'] = diff
                    tp_halluc_comparison.loc['% Difference'] = pct_diff

                    print(tp_halluc_comparison.round(2))
                    print()

                    # Negative gap: TP error is lower, which is what Hypothesis 2 predicts
                    print("Key Insights for Hypothesis 2:")
                    for metric in ['mean_abs_error', 'mean_pct_error', 'rmse', 'mape']:
                        if metric not in diff.index or np.isnan(diff[metric]):
                            continue

                        gap = diff[metric]
                        if gap < 0:
                            print(f"- TP links have {abs(gap):.2f} LOWER {metric} than Hallucination links (FP+FN) - {abs(pct_diff[metric]):.1f}% better")
                            print("  → SUPPORTS hypothesis: Hallucinations correlate with HIGHER estimation errors")
                        elif gap > 0:
                            print(f"- TP links have {gap:.2f} HIGHER {metric} than Hallucination links (FP+FN) - {pct_diff[metric]:.1f}% worse")
                            print("  → CONTRADICTS hypothesis: Hallucinations correlate with LOWER estimation errors")
                        else:
                            print(f"- TP and Hallucination links show NO difference in {metric}")

        # Metrics by hallucination type (Over vs Under identification)
        if 'by_hallucination_type' in accuracy_results:
            print("Metrics by Hallucination Type:")
            type_metrics = _rounded_for_display(accuracy_results['by_hallucination_type'])
            print(type_metrics)
            print()

            # Compare the two hallucination types only when both are present
            if 'Over_Identification' in type_metrics.index and 'Under_Identification' in type_metrics.index:
                print("Over-Identification (FP) vs Under-Identification (FN) Comparison:")
                wanted = ['count', 'mean_abs_error', 'mean_pct_error', 'rmse', 'mape']
                comparison_metrics = [name for name in wanted if name in type_metrics.columns]

                over_under_comparison = type_metrics.loc[['Over_Identification', 'Under_Identification'], comparison_metrics]
                print(over_under_comparison)
                print()

        # Store results in global namespace for next cells
        globals()['accuracy_results'] = accuracy_results

        # Backward-compatible alias: prefer the per-classification table, else the overall one
        for table_key in ('by_classification', 'overall'):
            if table_key in accuracy_results:
                globals()['accuracy_metrics'] = accuracy_results[table_key]
                break
except Exception as e:
    print(f"Error during accuracy metrics calculation: {e}")

In [None]:
# Cell [9] - Compare TP vs Hallucinations (FP+FN) Estimation Accuracy
# Purpose: Analyze and compare SiFP accuracy between TP links and Hallucination links (FP+FN) with statistical tests
# Dependencies: pandas, numpy, scipy.stats
# Breadcrumbs: Accuracy Analysis -> Statistical Comparison -> Hypothesis 2 Testing

def _h2_test_verdict(p_value, supports_hypothesis, alpha=0.05):
    """Label one significance test as 'SUPPORTS', 'CONTRADICTS' or 'INCONCLUSIVE'."""
    # `not (p < alpha)` also routes NaN p-values to INCONCLUSIVE.
    if not p_value < alpha:
        return 'INCONCLUSIVE'
    return 'SUPPORTS' if supports_hypothesis else 'CONTRADICTS'


def _h2_cohen_d_label(cohen_d):
    """Map |Cohen's d| to the Negligible/Small/Medium/Large labels (0.2 / 0.5 / 0.8 cut-offs)."""
    magnitude = abs(cohen_d)
    if magnitude < 0.2:
        return "Negligible"
    if magnitude < 0.5:
        return "Small"
    if magnitude < 0.8:
        return "Medium"
    return "Large"


def _h2_summarize_evidence(statistical_tests):
    """Collect the significant tests and derive the overall hypothesis verdict from them."""
    significant_results = [
        {
            'metric': metric,
            'test': tests[test_type]['test'],
            'p_value': tests[test_type]['p_value'],
            'hypothesis_result': tests[test_type]['hypothesis_result']
        }
        for metric, tests in statistical_tests.items()
        for test_type in ('parametric', 'non_parametric')
        if test_type in tests and tests[test_type]['significant']
    ]

    support_count = sum(1 for r in significant_results if r['hypothesis_result'] == 'SUPPORTS')
    contradict_count = sum(1 for r in significant_results if r['hypothesis_result'] == 'CONTRADICTS')

    if support_count > contradict_count:
        overall_conclusion = "HYPOTHESIS SUPPORTED"
    elif contradict_count > support_count:
        overall_conclusion = "HYPOTHESIS CONTRADICTED"
    elif support_count > 0:
        overall_conclusion = "MIXED EVIDENCE"
    else:
        overall_conclusion = "INSUFFICIENT EVIDENCE"

    return {
        'overall_result': overall_conclusion,
        'significant_supporting_tests': support_count,
        'significant_contradicting_tests': contradict_count,
        'total_significant_tests': len(significant_results),
        'significant_results': significant_results
    }


def compare_tp_vs_hallucinations_accuracy(df=None, accuracy_results=None):
    """
    Perform detailed comparison of SiFP accuracy between TP links and Hallucination links (FP+FN)
    Core analysis for Hypothesis 2: "Hallucinations significantly correlate with lower quality estimates"

    Rows are kept only when 'sifp_actor_total', 'sifp_final_total', 'classification' and
    'is_hallucination' are all present and the classification is TP, FP or FN. The two groups
    are then split on 'is_hallucination'. For each of 'sifp_abs_error' and 'sifp_pct_error'
    (when the column exists) the function records descriptive statistics, the mean difference
    (TP - Hallucinations), a t-test (only when both groups pass Shapiro-Wilk at 0.05),
    a Mann-Whitney U test, and Cohen's d.

    Parameters:
        df (pd.DataFrame, optional): Combined dataset with traceability and SIFP data.
            Falls back to the global 'combined_df' when omitted.
        accuracy_results (dict, optional): Accuracy results from previous cell.
            Currently unused; accepted for call compatibility.

    Returns:
        dict: Dictionary containing comprehensive comparison results and statistical tests.
            Empty when no usable data exists or an unexpected error occurs (the error is logged).
            When either group has fewer than 5 rows, the dictionary carries a 'warning' entry,
            no per-metric results, and a 'hypothesis_conclusion' of "INSUFFICIENT EVIDENCE",
            so consumers can always read 'overall_result'.
    """
    try:
        # Use global variables if not provided
        df = df if df is not None else globals().get('combined_df', pd.DataFrame())

        if df.empty:
            logger.error("No combined data available for TP vs Hallucinations comparison")
            return {}

        # Filter for rows with SIFP data and classification information
        valid_df = df.dropna(subset=['sifp_actor_total', 'sifp_final_total', 'classification', 'is_hallucination'])
        valid_df = valid_df[valid_df['classification'].isin(['TP', 'FP', 'FN'])]

        if len(valid_df) == 0:
            logger.error("No valid TP, FP, or FN data with SIFP estimations found")
            return {}

        # Split into TP (non-hallucination) and Hallucination (FP+FN) datasets.
        # Element-wise '== False' / '== True' is kept on purpose: it also works when the
        # column has object dtype, where '~' would not.
        tp_df = valid_df[valid_df['is_hallucination'] == False]  # TP links
        hallucination_df = valid_df[valid_df['is_hallucination'] == True]  # FP + FN links

        logger.info(f"Comparing {len(tp_df)} TP links with {len(hallucination_df)} Hallucination links (FP+FN)")

        # Create results dictionary
        results = {
            'hypothesis': "Hallucinations in LLM outputs significantly correlate with lower quality estimates",
            'comparison_type': 'TP_vs_Hallucinations_FP_FN',
            'counts': {
                'TP': len(tp_df),
                'Hallucinations_FP_FN': len(hallucination_df),
                'FP_only': len(valid_df[valid_df['classification'] == 'FP']),
                'FN_only': len(valid_df[valid_df['classification'] == 'FN']),
                'total': len(valid_df)
            },
            'metrics': {},
            'statistical_tests': {},
            'effect_sizes': {},
            # Start from the "no significant tests" verdict so every return path exposes
            # the same keys; it is recomputed once the tests have run.
            'hypothesis_conclusion': _h2_summarize_evidence({})
        }

        # Check if we have enough data for meaningful comparison
        if len(tp_df) < 5 or len(hallucination_df) < 5:
            logger.warning("Sample size too small for reliable statistical comparison")
            results['warning'] = "Sample size too small for reliable statistical comparison"
            return results

        # Extract metrics for comparison
        metrics_to_compare = ['sifp_abs_error', 'sifp_pct_error']

        for metric in metrics_to_compare:
            if metric not in tp_df.columns or metric not in hallucination_df.columns:
                continue

            tp_values = tp_df[metric].dropna()
            halluc_values = hallucination_df[metric].dropna()

            if len(tp_values) == 0 or len(halluc_values) == 0:
                continue

            tp_mean = tp_values.mean()
            halluc_mean = halluc_values.mean()

            # Calculate difference and hypothesis direction
            mean_diff = tp_mean - halluc_mean
            pct_diff = (mean_diff / halluc_mean) * 100 if halluc_mean != 0 else np.nan

            # mean_diff < 0 means TP has lower error → SUPPORTS hypothesis
            hypothesis_support = bool(mean_diff < 0)

            # Basic descriptive statistics plus difference and direction
            results['metrics'][metric] = {
                'TP': {
                    'mean': tp_mean,
                    'median': tp_values.median(),
                    'std': tp_values.std(),
                    'count': len(tp_values)
                },
                'Hallucinations_FP_FN': {
                    'mean': halluc_mean,
                    'median': halluc_values.median(),
                    'std': halluc_values.std(),
                    'count': len(halluc_values)
                },
                'difference': {
                    'absolute': mean_diff,
                    'percent': pct_diff
                },
                'hypothesis_support': {
                    'supports_hypothesis': hypothesis_support,
                    'interpretation': f"TP links have {'LOWER' if hypothesis_support else 'HIGHER'} {metric} than Hallucinations"
                }
            }

            # Statistical tests
            metric_tests = {}
            results['statistical_tests'][metric] = metric_tests

            # Check for normality with Shapiro-Wilk test (skipped above 5000 observations)
            both_normal = False
            if len(tp_values) <= 5000 and len(halluc_values) <= 5000:
                try:
                    _, tp_norm_p = stats.shapiro(tp_values)
                    _, halluc_norm_p = stats.shapiro(halluc_values)
                    both_normal = bool(tp_norm_p > 0.05 and halluc_norm_p > 0.05)
                except Exception:
                    # Best effort: if normality cannot be assessed, rely on the
                    # non-parametric test only.
                    both_normal = False

            metric_tests['normality'] = {
                'both_normal': both_normal
            }

            # Parametric: t-test, only when both groups look normal.
            # Levene's test decides between the pooled and Welch variants.
            if both_normal:
                try:
                    _, levene_p = stats.levene(tp_values, halluc_values)
                    equal_var = levene_p > 0.05
                    t_stat, t_p = stats.ttest_ind(tp_values, halluc_values, equal_var=equal_var)

                    metric_tests['parametric'] = {
                        'test': 't-test',
                        'statistic': t_stat,
                        'p_value': t_p,
                        'significant': bool(t_p < 0.05),
                        'hypothesis_result': _h2_test_verdict(t_p, hypothesis_support)
                    }
                except Exception as e:
                    logger.warning(f"t-test failed for {metric}: {str(e)}")

            # Non-parametric: Mann-Whitney U test (always attempted)
            try:
                u_stat, u_p = stats.mannwhitneyu(tp_values, halluc_values, alternative='two-sided')

                metric_tests['non_parametric'] = {
                    'test': 'Mann-Whitney U',
                    'statistic': u_stat,
                    'p_value': u_p,
                    'significant': bool(u_p < 0.05),
                    'hypothesis_result': _h2_test_verdict(u_p, hypothesis_support)
                }
            except Exception as e:
                logger.warning(f"Mann-Whitney U test failed for {metric}: {str(e)}")

            # Effect sizes
            effect_sizes = {}

            # Cohen's d (pooled standard deviation); omitted when the pooled std is not positive
            try:
                pooled_std = np.sqrt(((len(tp_values) - 1) * tp_values.std()**2 +
                                     (len(halluc_values) - 1) * halluc_values.std()**2) /
                                     (len(tp_values) + len(halluc_values) - 2))
                if pooled_std > 0:
                    cohen_d = mean_diff / pooled_std
                    effect_sizes['cohen_d'] = {
                        'value': cohen_d,
                        'interpretation': _h2_cohen_d_label(cohen_d)
                    }
            except Exception as e:
                logger.warning(f"Cohen's d calculation failed for {metric}: {str(e)}")

            results['effect_sizes'][metric] = effect_sizes

        # Overall hypothesis conclusion
        results['hypothesis_conclusion'] = _h2_summarize_evidence(results['statistical_tests'])

        return results

    except Exception as e:
        logger.error(f"Error comparing TP vs Hallucinations accuracy: {str(e)}")
        return {}

# Run the comprehensive comparison for Hypothesis 2 and print a report.
# The result may be: empty (no data / error), a partial dictionary carrying a 'warning'
# (a group was too small, so no tests were run), or the full comparison. Each report
# section is therefore printed only when it has content, and the conclusion is only
# read when it actually contains an 'overall_result'.
try:
    tp_vs_hallucinations_results = compare_tp_vs_hallucinations_accuracy()
    print("HYPOTHESIS 2 TESTING: TP vs Hallucinations (FP+FN) Accuracy Comparison")
    print("======================================================================")

    if tp_vs_hallucinations_results:
        # Display basic counts
        if 'counts' in tp_vs_hallucinations_results:
            counts = tp_vs_hallucinations_results['counts']
            print("Sample Sizes:")
            print(f"- TP (Correct Identification): {counts['TP']} links")
            print(f"- Hallucinations (FP+FN): {counts['Hallucinations_FP_FN']} links")
            print(f"  └─ FP (Over-identification): {counts['FP_only']} links")
            print(f"  └─ FN (Under-identification): {counts['FN_only']} links")
            print()

        # Surface the early-exit warning instead of silently showing an empty report
        if 'warning' in tp_vs_hallucinations_results:
            print(f"WARNING: {tp_vs_hallucinations_results['warning']}")
            print()

        # Display descriptive statistics comparison
        if tp_vs_hallucinations_results.get('metrics'):
            print("Descriptive Statistics Comparison:")
            for metric, data in tp_vs_hallucinations_results['metrics'].items():
                print(f"  {metric.upper().replace('_', ' ')}:")
                print(f"    TP Links:         {data['TP']['mean']:.3f} ± {data['TP']['std']:.3f}")
                print(f"    Hallucinations:   {data['Hallucinations_FP_FN']['mean']:.3f} ± {data['Hallucinations_FP_FN']['std']:.3f}")

                diff = data['difference']
                hypothesis_support = data['hypothesis_support']['supports_hypothesis']
                # Negative difference (TP - Hallucinations) means TP links have the lower error
                direction = "LOWER" if diff['absolute'] < 0 else "HIGHER"

                print(f"    Difference:       TP has {abs(diff['absolute']):.3f} {direction} error")
                print(f"    Hypothesis:       {'✓ SUPPORTS' if hypothesis_support else '✗ CONTRADICTS'}")
                print()

        # Display statistical tests
        if tp_vs_hallucinations_results.get('statistical_tests'):
            print("Statistical Significance Tests:")
            for metric, tests in tp_vs_hallucinations_results['statistical_tests'].items():
                print(f"  {metric.upper().replace('_', ' ')}:")

                for test_type in ['parametric', 'non_parametric']:
                    if test_type in tests:
                        test_info = tests[test_type]
                        significance = "significant" if test_info['significant'] else "not significant"

                        print(f"    {test_info['test']}: p={test_info['p_value']:.6f} ({significance})")
                        print(f"      → {test_info['hypothesis_result']}")
                print()

        # Display effect sizes
        if tp_vs_hallucinations_results.get('effect_sizes'):
            print("Effect Size Analysis:")
            for metric, effects in tp_vs_hallucinations_results['effect_sizes'].items():
                print(f"  {metric.upper().replace('_', ' ')}:")
                if 'cohen_d' in effects:
                    cohen = effects['cohen_d']
                    print(f"    Cohen's d: {cohen['value']:.3f} ({cohen['interpretation']} effect)")
                print()

        # Display hypothesis conclusion.
        # On the small-sample path the conclusion can be an empty dict, so check for the
        # key before indexing rather than assuming the full structure is present.
        conclusion = tp_vs_hallucinations_results.get('hypothesis_conclusion') or {}
        if 'overall_result' in conclusion:
            print("="*80)
            print("HYPOTHESIS 2 CONCLUSION:")
            print(f"Overall Result: {conclusion['overall_result']}")
            print(f"Supporting Evidence: {conclusion['significant_supporting_tests']} significant test(s)")
            print(f"Contradicting Evidence: {conclusion['significant_contradicting_tests']} significant test(s)")
            print()

            # Provide interpretation
            print("INTERPRETATION:")
            if conclusion['overall_result'] == "HYPOTHESIS SUPPORTED":
                print("✓ Evidence supports that hallucinations (both FP and FN) significantly")
                print("  correlate with lower quality SiFP estimates in new product development.")
            elif conclusion['overall_result'] == "HYPOTHESIS CONTRADICTED":
                print("✗ Evidence contradicts the hypothesis - hallucinations appear to correlate")
                print("  with BETTER or equivalent SiFP estimation quality.")
            else:
                print("? Mixed or insufficient evidence to draw strong conclusions.")

        # Store results for further analysis (later cells look the name up via globals())
        globals()['tp_vs_hallucinations_results'] = tp_vs_hallucinations_results
    else:
        print("No comparison data available. Check logs for details.")

except Exception as e:
    print(f"Error during TP vs Hallucinations comparison: {str(e)}")

In [None]:
# Cell [10] - Enhanced Statistical Validation for Hypothesis 2
# Purpose: Add comprehensive statistical measures including confidence intervals, power analysis, and robust testing
# Dependencies: pandas, numpy, scipy.stats, statsmodels
# Breadcrumbs: Statistical Comparison -> Enhanced Validation -> Robust Hypothesis Testing

def _h2_bootstrap_mean_diff_ci(tp_vals, halluc_vals, n_samples, alpha, seed=42):
    """Percentile-bootstrap confidence interval for mean(tp_vals) - mean(halluc_vals)."""
    # A private RandomState seeded with 42 yields the same draws as
    # np.random.seed(42) + np.random.choice, but leaves NumPy's global random
    # state untouched for the rest of the notebook.
    rng = np.random.RandomState(seed)
    diffs = np.empty(n_samples)
    for i in range(n_samples):
        tp_boot = rng.choice(tp_vals, size=len(tp_vals), replace=True)
        halluc_boot = rng.choice(halluc_vals, size=len(halluc_vals), replace=True)
        diffs[i] = np.mean(tp_boot) - np.mean(halluc_boot)

    ci_lower = float(np.percentile(diffs, (alpha / 2) * 100))
    ci_upper = float(np.percentile(diffs, (1 - alpha / 2) * 100))

    return {
        'mean_difference': float(np.mean(diffs)),
        'ci_lower': ci_lower,
        'ci_upper': ci_upper,
        'contains_zero': bool(ci_lower <= 0 <= ci_upper),
        'confidence_level': (1 - alpha) * 100
    }


def _h2_cliffs_delta(x, y):
    """Cliff's delta: (#pairs with x > y - #pairs with x < y) / (len(x) * len(y))."""
    y_sorted = np.sort(y)
    # For each x: number of y strictly below it, and number of y strictly above it.
    greater = np.searchsorted(y_sorted, x, side='left').sum()
    smaller = (y_sorted.size - np.searchsorted(y_sorted, x, side='right')).sum()
    return float(greater - smaller) / (len(x) * y_sorted.size)


def _h2_two_sample_power(effect_size, n_tp, n_halluc, alpha, target_power=0.8):
    """Return (achieved power, required n per group) for a two-sample t-test."""
    # Local import: only needed on this path and already a dependency of the notebook.
    from statsmodels.stats.power import TTestIndPower

    analysis = TTestIndPower()
    achieved = float(analysis.power(effect_size=effect_size, nobs1=n_tp, alpha=alpha,
                                    ratio=n_halluc / n_tp, alternative='two-sided'))

    # With a zero effect no sample size reaches the target power; report NaN instead of
    # a misleading number.
    required_per_group = np.nan
    if effect_size > 0:
        try:
            solved = analysis.solve_power(effect_size=effect_size, nobs1=None, alpha=alpha,
                                          power=target_power, ratio=1.0, alternative='two-sided')
            required_per_group = float(np.ceil(np.atleast_1d(solved)[0]))
        except Exception as e:
            logger.warning(f"Required sample size could not be solved: {str(e)}")

    return achieved, required_per_group


def enhanced_statistical_validation(df=None, alpha=0.05, bootstrap_samples=1000):
    """
    Perform enhanced statistical validation for Hypothesis 2 with comprehensive measures

    Uses the same row filter and TP / Hallucination split as the basic comparison. For each
    of 'sifp_abs_error' and 'sifp_pct_error' with at least 3 values per group it computes:
    bootstrap and parametric (z or Welch t) confidence intervals for the mean difference
    (TP - Hallucinations), two-sample t-test power for the observed Cohen's d, Cliff's delta,
    Glass's delta, and the point-biserial correlation with hallucination status. It then
    applies Bonferroni and Benjamini-Hochberg corrections to the finite p-values collected
    here and (when present) in the global 'tp_vs_hallucinations_results', and assesses
    sample adequacy.

    Parameters:
        df (pd.DataFrame, optional): Combined dataset with traceability and SIFP data.
            Falls back to the global 'combined_df' when omitted.
        alpha (float): Significance level (default: 0.05)
        bootstrap_samples (int): Number of bootstrap samples for confidence intervals

    Returns:
        dict: Comprehensive statistical validation results; empty when no usable data exists.
    """
    # Use global variables if not provided
    df = df if df is not None else globals().get('combined_df', pd.DataFrame())

    if df.empty:
        logger.error("No data available for enhanced statistical validation")
        return {}

    # Filter for valid data
    valid_df = df.dropna(subset=['sifp_actor_total', 'sifp_final_total', 'classification', 'is_hallucination'])
    valid_df = valid_df[valid_df['classification'].isin(['TP', 'FP', 'FN'])]

    if len(valid_df) == 0:
        logger.error("No valid data for enhanced statistical validation")
        return {}

    # Split groups ('== False' / '== True' also works for object-dtype columns)
    tp_df = valid_df[valid_df['is_hallucination'] == False]
    hallucination_df = valid_df[valid_df['is_hallucination'] == True]

    results = {
        'confidence_intervals': {},
        'power_analysis': {},
        'bootstrap_results': {},
        'correlation_analysis': {},
        'additional_effect_sizes': {},
        'assumption_testing': {},
        'multiple_testing_correction': {},
        'sample_adequacy': {}
    }

    metrics_to_test = ['sifp_abs_error', 'sifp_pct_error']

    for metric in metrics_to_test:
        if metric not in tp_df.columns or metric not in hallucination_df.columns:
            continue

        tp_values = tp_df[metric].dropna().astype(float).to_numpy()
        halluc_values = hallucination_df[metric].dropna().astype(float).to_numpy()
        n_tp, n_halluc = len(tp_values), len(halluc_values)

        if n_tp < 3 or n_halluc < 3:
            continue

        # Placeholder kept from the original result layout (never populated here).
        results[metric] = {}

        # Sample statistics (ddof=1 to match the pandas defaults used elsewhere)
        mean_diff = float(tp_values.mean() - halluc_values.mean())
        var_tp = tp_values.var(ddof=1)
        var_halluc = halluc_values.var(ddof=1)
        se_tp_sq = var_tp / n_tp
        se_halluc_sq = var_halluc / n_halluc

        # 1. CONFIDENCE INTERVALS
        # Bootstrap confidence interval for the mean difference
        ci_entry = _h2_bootstrap_mean_diff_ci(tp_values, halluc_values, bootstrap_samples, alpha)
        results['confidence_intervals'][metric] = ci_entry

        # Parametric CI for mean difference
        se_diff = np.sqrt(se_tp_sq + se_halluc_sq)

        if n_tp >= 30 and n_halluc >= 30:
            # Large sample: use z-distribution
            margin_error = stats.norm.ppf(1 - alpha / 2) * se_diff
            method = 'Large_sample_z'
        else:
            # Small sample: use t-distribution (Welch's)
            welch_den = se_tp_sq ** 2 / (n_tp - 1) + se_halluc_sq ** 2 / (n_halluc - 1)
            if welch_den > 0:
                df_welch = (se_tp_sq + se_halluc_sq) ** 2 / welch_den
                margin_error = stats.t.ppf(1 - alpha / 2, df_welch) * se_diff
            else:
                # Both groups have zero variance: the standard error is 0 and the
                # Welch degrees of freedom are 0/0, so the interval collapses to a point.
                margin_error = 0.0
            method = 'Welch_t'

        margin_error = float(margin_error)
        ci_entry['parametric'] = {
            'mean_difference': mean_diff,
            'ci_lower': mean_diff - margin_error,
            'ci_upper': mean_diff + margin_error,
            'contains_zero': bool((mean_diff - margin_error) <= 0 <= (mean_diff + margin_error)),
            'method': method
        }

        # 2. POWER ANALYSIS
        # Observed effect size (|Cohen's d|) with the pooled standard deviation
        pooled_std = np.sqrt(((n_tp - 1) * var_tp + (n_halluc - 1) * var_halluc) /
                             (n_tp + n_halluc - 2))

        if pooled_std > 0:
            observed_effect_size = float(abs(mean_diff) / pooled_std)

            # Two independent groups: power depends on both group sizes, so the
            # two-sample power function is used rather than the one-sample one
            # evaluated at the combined N (which overstates power).
            achieved_power, required_n_per_group = _h2_two_sample_power(
                observed_effect_size, n_tp, n_halluc, alpha)

            results['power_analysis'][metric] = {
                'observed_effect_size': observed_effect_size,
                'achieved_power': achieved_power,
                'required_total_n_for_80_power': 2 * required_n_per_group,
                'required_n_per_group_for_80_power': required_n_per_group,
                'current_n_tp': n_tp,
                'current_n_halluc': n_halluc,
                'current_total_n': n_tp + n_halluc,
                'adequate_power': achieved_power >= 0.8
            }

        # 3. ADDITIONAL EFFECT SIZES
        additional_effects = {}

        # Cliff's Delta (robust, non-parametric effect size)
        cliff_delta = _h2_cliffs_delta(tp_values, halluc_values)

        if abs(cliff_delta) < 0.147:
            cliff_interpretation = "Negligible"
        elif abs(cliff_delta) < 0.33:
            cliff_interpretation = "Small"
        elif abs(cliff_delta) < 0.474:
            cliff_interpretation = "Medium"
        else:
            cliff_interpretation = "Large"

        additional_effects['cliff_delta'] = {
            'value': cliff_delta,
            'interpretation': cliff_interpretation
        }

        # Glass's Delta (standardized by the Hallucination group's standard deviation)
        halluc_std = halluc_values.std(ddof=1)
        if halluc_std > 0:
            additional_effects['glass_delta'] = {
                'value': float(mean_diff / halluc_std),
                'interpretation': "Control group (Hallucinations) as reference"
            }

        results['additional_effect_sizes'][metric] = additional_effects

        # 4. CORRELATION ANALYSIS
        # Point-biserial correlation = Pearson r between the 0/1 group flag and the metric
        combined_values = np.concatenate([tp_values, halluc_values])
        binary_halluc = np.concatenate([np.zeros(n_tp), np.ones(n_halluc)])

        if np.ptp(combined_values) == 0:
            # The correlation is undefined for a constant metric.
            correlation_coef, correlation_p = np.nan, np.nan
        else:
            correlation_coef, correlation_p = stats.pearsonr(binary_halluc, combined_values)

        results['correlation_analysis'][metric] = {
            'point_biserial_r': float(correlation_coef),
            'p_value': float(correlation_p),
            'significant': bool(correlation_p < alpha),
            'interpretation': f"{'Positive' if correlation_coef > 0 else 'Negative'} correlation between hallucination status and {metric}"
        }

    # 5. MULTIPLE TESTING CORRECTION
    candidate_tests = []  # (description, p_value) pairs

    # Get p-values from previous analysis if available
    prev_results = globals().get('tp_vs_hallucinations_results') or {}
    for metric, tests in prev_results.get('statistical_tests', {}).items():
        for test_type in ['parametric', 'non_parametric']:
            if test_type in tests and 'p_value' in tests[test_type]:
                candidate_tests.append((f"{metric}_{test_type}", tests[test_type]['p_value']))

    # Add correlation p-values
    for metric, corr in results['correlation_analysis'].items():
        if 'p_value' in corr:
            candidate_tests.append((f"{metric}_correlation", corr['p_value']))

    # Undefined (NaN) p-values are not tests that were actually performed; including
    # them would inflate the Bonferroni divisor and feed NaN into the FDR procedure.
    candidate_tests = [(desc, float(p)) for desc, p in candidate_tests if np.isfinite(p)]
    test_descriptions = [desc for desc, _ in candidate_tests]
    all_p_values = [p for _, p in candidate_tests]

    if all_p_values:
        # Bonferroni correction
        bonferroni_alpha = alpha / len(all_p_values)
        bonferroni_significant = [p < bonferroni_alpha for p in all_p_values]

        # False Discovery Rate (Benjamini-Hochberg)
        fdr_rejected, fdr_p_corrected, _, _ = multipletests(all_p_values, alpha=alpha, method='fdr_bh')

        results['multiple_testing_correction'] = {
            'original_alpha': alpha,
            'bonferroni_alpha': bonferroni_alpha,
            'bonferroni_significant': bonferroni_significant,
            'fdr_rejected': fdr_rejected.tolist(),
            'fdr_corrected_p': fdr_p_corrected.tolist(),
            'test_descriptions': test_descriptions,
            'original_p_values': all_p_values
        }

    # 6. SAMPLE ADEQUACY ASSESSMENT (based on row counts, not per-metric counts)
    tp_n = len(tp_df)
    halluc_n = len(hallucination_df)
    total_n = tp_n + halluc_n

    sample_adequacy = {
        'total_sample_size': total_n,
        'tp_sample_size': tp_n,
        'hallucination_sample_size': halluc_n,
        'minimum_recommended': 30,
        'adequate_total_size': total_n >= 30,
        'adequate_group_sizes': tp_n >= 15 and halluc_n >= 15,
        # max(..., 1) avoids a division by zero if neither group has any rows
        'balanced_groups': abs(tp_n - halluc_n) / max(tp_n, halluc_n, 1) < 0.5,
        'recommendations': []
    }

    if not sample_adequacy['adequate_total_size']:
        sample_adequacy['recommendations'].append("Increase total sample size to at least 30")
    if not sample_adequacy['adequate_group_sizes']:
        sample_adequacy['recommendations'].append("Ensure each group has at least 15 observations")
    if not sample_adequacy['balanced_groups']:
        sample_adequacy['recommendations'].append("Consider balancing group sizes for better statistical power")

    results['sample_adequacy'] = sample_adequacy

    return results

# Print the Hypothesis 2 banner (statement and operationalization), then run the
# enhanced validation on the globally available data.
_h2_banner_lines = (
    "ENHANCED STATISTICAL VALIDATION FOR HYPOTHESIS 2:",
    "================================================",
    "HYPOTHESIS: 'Hallucinations in LLM outputs significantly correlate with lower quality SiFP estimates'",
    "OPERATIONALIZATION:",
    "- Hallucinations = Both FP (over-identification) and FN (under-identification) traceability links",
    "- Lower quality = Higher absolute and percentage errors in SiFP estimations",
    "- Significant = p < 0.05 with meaningful effect sizes",
    "",
)
for _h2_banner_line in _h2_banner_lines:
    print(_h2_banner_line)

enhanced_results = enhanced_statistical_validation()

if enhanced_results:
    # Display confidence intervals with hypothesis interpretation
    if 'confidence_intervals' in enhanced_results:
        print("1. CONFIDENCE INTERVALS FOR MEAN DIFFERENCES:")
        print("   (Testing if TP links have different error rates than Hallucinations)")
        print("-" * 70)
        for metric, ci_data in enhanced_results['confidence_intervals'].items():
            metric_name = "Absolute Error" if "abs" in metric else "Percentage Error"
            print(f"  {metric_name}:")
            
            # Bootstrap CI
            bootstrap_ci = ci_data
            ci_level = bootstrap_ci.get('confidence_level', 95)
            print(f"    Bootstrap {ci_level}% CI: [{bootstrap_ci['ci_lower']:.3f}, {bootstrap_ci['ci_upper']:.3f}]")
            print(f"    Mean Difference (TP - Hallucinations): {bootstrap_ci['mean_difference']:.3f}")
            
            # Hypothesis interpretation
            if not bootstrap_ci['contains_zero']:
                if bootstrap_ci['ci_upper'] < 0:
                    print(f"    ✓ SUPPORTS HYPOTHESIS: TP links have significantly LOWER {metric_name.lower()}")
                    print(f"      → Hallucinations correlate with HIGHER estimation errors")
                else:
                    print(f"    ✗ CONTRADICTS HYPOTHESIS: TP links have significantly HIGHER {metric_name.lower()}")
                    print(f"      → Hallucinations correlate with LOWER estimation errors")
            else:
                print(f"    ? INCONCLUSIVE: No significant difference detected")
                print(f"      → Cannot conclude hallucinations affect estimation quality")
            
            # Parametric CI if available
            if 'parametric' in ci_data:
                param_ci = ci_data['parametric']
                print(f"    Parametric CI ({param_ci['method']}): [{param_ci['ci_lower']:.3f}, {param_ci['ci_upper']:.3f}]")
            print()
    
    # Display power analysis with interpretation
    if 'power_analysis' in enhanced_results:
        print("2. STATISTICAL POWER ANALYSIS:")
        print("   (Assessing reliability of our conclusions)")
        print("-" * 50)
        for metric, power_data in enhanced_results['power_analysis'].items():
            metric_name = "Absolute Error" if "abs" in metric else "Percentage Error"
            print(f"  {metric_name}:")
            print(f"    Observed Effect Size (Cohen's d): {power_data['observed_effect_size']:.3f}")
            print(f"    Achieved Statistical Power: {power_data['achieved_power']:.3f} ({power_data['achieved_power']*100:.1f}%)")
            print(f"    Required Total N for 80% Power: {power_data['required_total_n_for_80_power']:.0f}")
            print(f"    Required N per Group for 80% Power: {power_data['required_n_per_group_for_80_power']:.0f}")
            print(f"    Current Sample: TP={power_data['current_n_tp']}, Hallucinations={power_data['current_n_halluc']}")
            
            # Power interpretation
            if power_data['adequate_power']:
                print(f"    ✓ ADEQUATE POWER: High confidence in detecting true effects")
            else:
                print(f"    ⚠ LOW POWER: May miss true effects (Type II error risk)")
                print(f"      → Consider increasing sample size for more reliable conclusions")
            print()
    
    # Display additional effect sizes with interpretation
    if 'additional_effect_sizes' in enhanced_results:
        print("3. ADDITIONAL EFFECT SIZE MEASURES:")
        print("   (Assessing practical significance beyond statistical significance)")
        print("-" * 65)
        for metric, effects in enhanced_results['additional_effect_sizes'].items():
            metric_name = "Absolute Error" if "abs" in metric else "Percentage Error"
            print(f"  {metric_name}:")
            if 'cliff_delta' in effects:
                cliff = effects['cliff_delta']
                print(f"    Cliff's Delta: {cliff['value']:.3f} ({cliff['interpretation']} effect)")
                
                # Interpretation for hypothesis
                if cliff['value'] < 0:
                    print(f"      → TP links tend to have lower {metric_name.lower()} than Hallucinations")
                    print(f"      → SUPPORTS hypothesis about hallucination impact")
                elif cliff['value'] > 0:
                    print(f"      → TP links tend to have higher {metric_name.lower()} than Hallucinations")
                    print(f"      → CONTRADICTS hypothesis about hallucination impact")
                else:
                    print(f"      → No systematic difference between groups")
                    
            if 'glass_delta' in effects:
                glass = effects['glass_delta']
                print(f"    Glass's Delta: {glass['value']:.3f}")
                print(f"      → Standardized difference using Hallucination group variability")
            print()
    
    # Display correlation analysis with hypothesis context
    if 'correlation_analysis' in enhanced_results:
        print("4. CORRELATION ANALYSIS:")
        print("   (Direct test of correlation between hallucinations and estimation errors)")
        print("-" * 75)
        for metric, corr_data in enhanced_results['correlation_analysis'].items():
            metric_name = "Absolute Error" if "abs" in metric else "Percentage Error"
            print(f"  {metric_name}:")
            print(f"    Point-biserial correlation (r): {corr_data['point_biserial_r']:.3f}")
            print(f"    p-value: {corr_data['p_value']:.6f}")
            print(f"    Statistically significant: {'Yes' if corr_data['significant'] else 'No'}")
            
            # Hypothesis interpretation
            if corr_data['significant']:
                if corr_data['point_biserial_r'] > 0:
                    print(f"    ✓ SUPPORTS HYPOTHESIS: Positive correlation means hallucinations")
                    print(f"      are associated with HIGHER {metric_name.lower()}")
                else:
                    print(f"    ✗ CONTRADICTS HYPOTHESIS: Negative correlation means hallucinations")
                    print(f"      are associated with LOWER {metric_name.lower()}")
            else:
                print(f"    ? INCONCLUSIVE: No significant correlation detected")
            
            # Strength interpretation
            r_abs = abs(corr_data['point_biserial_r'])
            if r_abs < 0.1:
                strength = "negligible"
            elif r_abs < 0.3:
                strength = "small"
            elif r_abs < 0.5:
                strength = "medium"
            else:
                strength = "large"
            print(f"    Correlation strength: {strength} ({r_abs:.3f})")
            print()
    
    # Display multiple testing correction
    if 'multiple_testing_correction' in enhanced_results:
        print("5. MULTIPLE TESTING CORRECTION:")
        print("   (Controlling for false discoveries when testing multiple metrics)")
        print("-" * 65)
        correction_data = enhanced_results['multiple_testing_correction']
        print(f"    Original significance level (α): {correction_data['original_alpha']}")
        print(f"    Bonferroni-corrected α: {correction_data['bonferroni_alpha']:.6f}")
        print(f"    Number of statistical tests: {len(correction_data['original_p_values'])}")
        
        significant_after_correction = sum(correction_data.get('bonferroni_significant', []))
        print(f"    Significant after Bonferroni correction: {significant_after_correction}")
        
        if 'fdr_rejected' in correction_data:
            fdr_significant = sum(correction_data['fdr_rejected'])
            print(f"    Significant after FDR correction: {fdr_significant}")
        
        # Interpretation
        if significant_after_correction > 0:
            print(f"    ✓ ROBUST FINDINGS: Results survive correction for multiple testing")
            print(f"      → High confidence that observed effects are not due to chance")
        else:
            print(f"    ⚠ REDUCED CONFIDENCE: No tests significant after correction")
            print(f"      → Observed effects may be due to multiple testing artifacts")
        print()
    
    # Display sample adequacy
    if 'sample_adequacy' in enhanced_results:
        print("6. SAMPLE SIZE ADEQUACY:")
        print("   (Assessing whether we have sufficient data for reliable conclusions)")
        print("-" * 70)
        adequacy = enhanced_results['sample_adequacy']
        print(f"    Total Sample Size: {adequacy['total_sample_size']}")
        print(f"    TP Group Size: {adequacy['tp_sample_size']}")
        print(f"    Hallucination Group Size: {adequacy['hallucination_sample_size']}")
        print(f"    Adequate for statistical tests: {'✓ Yes' if adequacy['adequate_total_size'] else '✗ No'}")
        print(f"    Adequate group sizes: {'✓ Yes' if adequacy['adequate_group_sizes'] else '✗ No'}")
        print(f"    Reasonably balanced groups: {'✓ Yes' if adequacy['balanced_groups'] else '✗ No'}")
        
        if adequacy['recommendations']:
            print("    Recommendations for improvement:")
            for rec in adequacy['recommendations']:
                print(f"      - {rec}")
        print()
    
    # Store enhanced results
    globals()['enhanced_validation_results'] = enhanced_results
    
    print("OVERALL INTERPRETATION FOR HYPOTHESIS 2:")
    print("=" * 50)
    print("Your statistical analysis now provides multiple lines of evidence:")
    print()
    print("✓ CONFIDENCE INTERVALS: Show the range of plausible differences")
    print("✓ STATISTICAL POWER: Assesses reliability of conclusions")
    print("✓ EFFECT SIZES: Measure practical significance beyond p-values")
    print("✓ CORRELATIONS: Direct test of the hypothesized relationship")
    print("✓ MULTIPLE TESTING: Controls for false discovery rates")
    print("✓ SAMPLE ADEQUACY: Confirms sufficient data for conclusions")
    print()
    print("This comprehensive approach provides much stronger evidence")
    print("for or against your hypothesis than traditional p-value testing alone!")
    
else:
    print("No enhanced validation results available.")

In [None]:
# Cell [11] - Advanced Statistical Methods: Function Definitions and Execution
# Purpose: Define and execute cutting-edge statistical methods for robust Hypothesis 2 testing
# Dependencies: scipy, arch, pymc, arviz, bayesian_testing
# Breadcrumbs: Enhanced Validation -> Advanced Methods -> Function Definition

def _mean_difference_statistic(x, y, axis=0):
    """Return ``mean(x) - mean(y)`` along *axis*; statistic for the permutation tests."""
    return np.mean(x, axis=axis) - np.mean(y, axis=axis)


def _interpret_interval(ci_lower, ci_upper):
    """Classify an interval for the (TP - Hallucination) difference against zero."""
    if ci_upper < 0:
        return 'SUPPORTS'
    if ci_lower > 0:
        return 'CONTRADICTS'
    return 'INCONCLUSIVE'


def _run_permutation_tests(tp_values, halluc_values, alpha, n_permutations):
    """Run two-sided and both one-sided permutation tests on the mean difference."""
    observed_diff = tp_values.mean() - halluc_values.mean()

    def run(alternative):
        # Every test is seeded identically, so the three p-values are based
        # on the same sequence of resamples.
        return permutation_test(
            (tp_values, halluc_values),
            _mean_difference_statistic,
            n_resamples=n_permutations,
            alternative=alternative,
            random_state=RANDOM_SEED
        )

    two_sided = run('two-sided')
    less = run('less')        # TP < Hallucinations (supports hypothesis)
    greater = run('greater')  # TP > Hallucinations (contradicts hypothesis)

    supports = less.pvalue < alpha and observed_diff < 0
    contradicts = greater.pvalue < alpha and observed_diff > 0
    if supports:
        interpretation = 'SUPPORTS'
    elif contradicts:
        interpretation = 'CONTRADICTS'
    else:
        interpretation = 'INCONCLUSIVE'

    return {
        'observed_difference': observed_diff,
        'two_sided': {
            'statistic': two_sided.statistic,
            'p_value': two_sided.pvalue,
            'significant': two_sided.pvalue < alpha
        },
        'one_sided_less': {
            'statistic': less.statistic,
            'p_value': less.pvalue,
            'significant': less.pvalue < alpha,
            'supports_hypothesis': supports
        },
        'one_sided_greater': {
            'statistic': greater.statistic,
            'p_value': greater.pvalue,
            'significant': greater.pvalue < alpha,
            'contradicts_hypothesis': contradicts
        },
        'n_permutations': n_permutations,
        'hypothesis_interpretation': interpretation
    }


def _run_bootstrap_analysis(tp_values, halluc_values, alpha, n_bootstrap):
    """Compute percentile, BCa and circular-block bootstrap intervals for the mean difference."""
    n_tp = len(tp_values)
    n_halluc = len(halluc_values)
    observed_diff = tp_values.mean() - halluc_values.mean()
    lower_pct = (alpha / 2) * 100
    upper_pct = (1 - alpha / 2) * 100

    bootstrap_results = {}

    # --- IID bootstrap: resample each group independently with replacement.
    # The draw order (TP first, then Hallucination) is kept so that seeded
    # results stay reproducible.
    np.random.seed(RANDOM_SEED)
    bootstrap_diffs = np.array([
        np.mean(np.random.choice(tp_values, size=n_tp, replace=True))
        - np.mean(np.random.choice(halluc_values, size=n_halluc, replace=True))
        for _ in range(n_bootstrap)
    ])

    ci_lower = np.percentile(bootstrap_diffs, lower_pct)
    ci_upper = np.percentile(bootstrap_diffs, upper_pct)

    # --- BCa (bias-corrected and accelerated) interval.
    # Jackknife: leave-one-out group means computed in closed form,
    # (sum - x_i) / (n - 1), instead of deleting one element per iteration.
    tp_array = np.asarray(tp_values, dtype=float)
    halluc_array = np.asarray(halluc_values, dtype=float)
    tp_loo_means = (tp_array.sum() - tp_array) / (n_tp - 1)
    halluc_loo_means = (halluc_array.sum() - halluc_array) / (n_halluc - 1)
    jackknife_diffs = np.concatenate([
        tp_loo_means - halluc_array.mean(),
        tp_array.mean() - halluc_loo_means
    ])

    # Acceleration parameter; falls back to 0 when the jackknife values have
    # no spread (or are not finite) and the ratio is undefined.
    deviations = jackknife_diffs.mean() - jackknife_diffs
    denominator = 6 * np.sum(deviations**2) ** (3/2)
    acceleration = np.sum(deviations**3) / denominator if denominator > 0 else 0.0
    if not np.isfinite(acceleration):
        acceleration = 0.0

    # Bias correction. norm.ppf returns -inf/+inf when the proportion is
    # exactly 0 or 1, so test for finiteness rather than only for NaN.
    bias_correction = stats.norm.ppf((bootstrap_diffs < observed_diff).mean())
    if not np.isfinite(bias_correction):
        bias_correction = 0.0

    def bca_level(z_value):
        shifted = bias_correction + z_value
        level = stats.norm.cdf(bias_correction + shifted / (1 - acceleration * shifted))
        # Keep the adjusted level inside a usable percentile range.
        return max(0.001, min(0.999, level))

    bca_ci_lower = np.percentile(bootstrap_diffs, bca_level(stats.norm.ppf(alpha/2)) * 100)
    bca_ci_upper = np.percentile(bootstrap_diffs, bca_level(stats.norm.ppf(1 - alpha/2)) * 100)

    bootstrap_results['bca_intervals'] = {
        'ci_lower': bca_ci_lower,
        'ci_upper': bca_ci_upper,
        'bias_correction': bias_correction,
        'acceleration': acceleration,
        'contains_zero': bca_ci_lower <= 0 <= bca_ci_upper
    }

    bootstrap_results['iid_bootstrap'] = {
        'mean_difference': np.mean(bootstrap_diffs),
        'std_difference': np.std(bootstrap_diffs),
        'ci_lower': ci_lower,
        'ci_upper': ci_upper,
        'contains_zero': ci_lower <= 0 <= ci_upper,
        'n_bootstrap': n_bootstrap,
        'bootstrap_p_value': 2 * min((bootstrap_diffs <= 0).mean(), (bootstrap_diffs >= 0).mean()),
        'hypothesis_interpretation': _interpret_interval(ci_lower, ci_upper)
    }

    # --- Circular block bootstrap (simplified): blocks are drawn from the
    # concatenated sample and wrap around its end; group membership travels
    # with each observation. Continues the RNG stream seeded above.
    block_size = min(5, n_tp // 5, n_halluc // 5)
    if block_size >= 2:
        combined_data = np.concatenate([tp_values, halluc_values])
        combined_labels = np.concatenate([np.zeros(n_tp), np.ones(n_halluc)])
        n_total = len(combined_data)
        n_blocks = n_total // block_size
        block_offsets = np.arange(block_size)

        circular_diffs = []
        for _ in range(min(1000, n_bootstrap)):  # Limit for computational efficiency
            block_starts = np.random.choice(n_total, size=n_blocks, replace=True)
            # Row i holds the wrapped indices of block i; ravel keeps block order.
            indices = ((block_starts[:, None] + block_offsets) % n_total).ravel()
            boot_data = combined_data[indices]
            boot_labels = combined_labels[indices]

            tp_boot = boot_data[boot_labels == 0]
            halluc_boot = boot_data[boot_labels == 1]

            # A resample may miss one group entirely; such draws are dropped.
            if tp_boot.size and halluc_boot.size:
                circular_diffs.append(tp_boot.mean() - halluc_boot.mean())

        if circular_diffs:
            circular_diffs = np.array(circular_diffs)
            circular_ci_lower = np.percentile(circular_diffs, lower_pct)
            circular_ci_upper = np.percentile(circular_diffs, upper_pct)

            bootstrap_results['circular_block_bootstrap'] = {
                'mean_difference': np.mean(circular_diffs),
                'std_difference': np.std(circular_diffs),
                'ci_lower': circular_ci_lower,
                'ci_upper': circular_ci_upper,
                'contains_zero': circular_ci_lower <= 0 <= circular_ci_upper,
                'n_bootstrap': len(circular_diffs),
                'block_size': block_size,
                'hypothesis_interpretation': _interpret_interval(circular_ci_lower, circular_ci_upper)
            }

    return bootstrap_results


def _run_bayesian_analysis(tp_values, halluc_values, alpha):
    """Fit a two-group normal model with PyMC and summarise the posterior mean difference.

    Returns None when either group has zero or non-finite spread, because the
    prior scales below are derived from the sample standard deviations.
    """
    tp_std = tp_values.std()
    halluc_std = halluc_values.std()
    if not (np.isfinite(tp_std) and np.isfinite(halluc_std) and tp_std > 0 and halluc_std > 0):
        logger.warning("  Skipping Bayesian analysis: a group has zero or non-finite standard deviation")
        return None

    # Bayesian two-sample comparison using PyMC
    with pm.Model():
        # Priors for group means (centred on the sample means)
        mu_tp = pm.Normal('mu_tp', mu=tp_values.mean(), sigma=tp_std*2)
        mu_halluc = pm.Normal('mu_halluc', mu=halluc_values.mean(), sigma=halluc_std*2)

        # Priors for group standard deviations
        sigma_tp = pm.HalfNormal('sigma_tp', sigma=tp_std*2)
        sigma_halluc = pm.HalfNormal('sigma_halluc', sigma=halluc_std*2)

        # Likelihood for observations (registered on the model by name)
        pm.Normal('tp_obs', mu=mu_tp, sigma=sigma_tp, observed=tp_values)
        pm.Normal('halluc_obs', mu=mu_halluc, sigma=sigma_halluc, observed=halluc_values)

        # Derived quantities
        diff_means = pm.Deterministic('diff_means', mu_tp - mu_halluc)
        pm.Deterministic('effect_size', diff_means / pm.math.sqrt((sigma_tp**2 + sigma_halluc**2) / 2))

        # Sample from posterior
        trace = pm.sample(2000, tune=1000, cores=1, random_seed=RANDOM_SEED,
                          progressbar=False, return_inferencedata=True)

    # Extract posterior samples
    posterior_diff = trace.posterior['diff_means'].values.flatten()
    posterior_effect_size = trace.posterior['effect_size'].values.flatten()

    # Equal-tailed credible intervals at level 1 - alpha
    interval_percentiles = [(alpha/2)*100, (1-alpha/2)*100]
    diff_ci = np.percentile(posterior_diff, interval_percentiles)
    effect_ci = np.percentile(posterior_effect_size, interval_percentiles)

    # Bayesian p-value (posterior mass on the opposite side of the posterior mean's sign)
    if posterior_diff.mean() < 0:
        bayesian_p_value = (posterior_diff > 0).mean()
    else:
        bayesian_p_value = (posterior_diff < 0).mean()

    # Probability that TP has lower error (supports hypothesis)
    prob_supports_hypothesis = (posterior_diff < 0).mean()

    # Bayes factor approximation using the Savage-Dickey density ratio
    # (H1: diff != 0 vs H0: diff = 0), with the posterior density from a KDE.
    # NOTE(review): the prior density below is a zero-centred normal, whereas
    # the model's implied prior on diff_means is centred on the observed mean
    # difference; confirm which prior is intended before relying on this value.
    posterior_at_zero = stats.gaussian_kde(posterior_diff).evaluate([0])[0]
    prior_at_zero = stats.norm.pdf(0, 0, tp_std + halluc_std)

    # Bayes factor (BF10 = evidence for H1 vs H0)
    bayes_factor = prior_at_zero / posterior_at_zero if posterior_at_zero > 0 else np.inf

    # Interpret Bayes factor
    if bayes_factor < 1/3:
        bf_interpretation = "Strong evidence for H0 (no difference)"
    elif bayes_factor < 1:
        bf_interpretation = "Moderate evidence for H0"
    elif bayes_factor < 3:
        bf_interpretation = "Weak evidence for H1"
    elif bayes_factor < 10:
        bf_interpretation = "Moderate evidence for H1 (difference exists)"
    else:
        bf_interpretation = "Strong evidence for H1"

    if prob_supports_hypothesis > 0.95:
        interpretation = 'SUPPORTS'
    elif prob_supports_hypothesis < 0.05:
        interpretation = 'CONTRADICTS'
    else:
        interpretation = 'INCONCLUSIVE'

    return {
        'posterior_mean_difference': np.mean(posterior_diff),
        'posterior_std_difference': np.std(posterior_diff),
        # Key name is kept for downstream consumers; the level is 1 - alpha.
        'credible_interval_95': {
            'lower': diff_ci[0],
            'upper': diff_ci[1],
            'contains_zero': diff_ci[0] <= 0 <= diff_ci[1]
        },
        'effect_size': {
            'mean': np.mean(posterior_effect_size),
            'credible_interval': {'lower': effect_ci[0], 'upper': effect_ci[1]}
        },
        'bayesian_p_value': bayesian_p_value,
        'prob_supports_hypothesis': prob_supports_hypothesis,
        'bayes_factor': bayes_factor,
        'bayes_factor_interpretation': bf_interpretation,
        'hypothesis_interpretation': interpretation,
        'model_summary': {
            'n_samples': len(posterior_diff),
            'n_tp': len(tp_values),
            'n_hallucinations': len(halluc_values)
        }
    }


def _compare_methods(results, metric):
    """Collect each method's verdict for *metric* and derive a vote-based consensus."""
    comparison = {
        'metric': metric,
        'hypothesis_support_summary': {},
        'p_values': {},
        'confidence_intervals': {},
        'effect_evidence': {}
    }

    # Collect hypothesis support from each method
    methods_support = []

    perm = results['permutation_tests'].get(metric)
    if perm is not None and 'hypothesis_interpretation' in perm:
        methods_support.append(('Permutation', perm['hypothesis_interpretation']))
        comparison['p_values']['permutation_two_sided'] = perm['two_sided']['p_value']
        comparison['p_values']['permutation_one_sided'] = perm['one_sided_less']['p_value']

    boot = results['extended_bootstrap'].get(metric)
    if boot is not None and 'iid_bootstrap' in boot:
        iid = boot['iid_bootstrap']
        methods_support.append(('Bootstrap', iid['hypothesis_interpretation']))
        comparison['confidence_intervals']['bootstrap'] = {
            'lower': iid['ci_lower'],
            'upper': iid['ci_upper']
        }

    bayes = results['bayesian_analysis'].get(metric)
    if bayes is not None and 'hypothesis_interpretation' in bayes:
        methods_support.append(('Bayesian', bayes['hypothesis_interpretation']))
        comparison['p_values']['bayesian'] = bayes['bayesian_p_value']
        comparison['confidence_intervals']['bayesian_credible'] = bayes['credible_interval_95']

    # Consensus analysis
    verdicts = [verdict for _, verdict in methods_support]
    support_votes = verdicts.count('SUPPORTS')
    contradict_votes = verdicts.count('CONTRADICTS')
    inconclusive_votes = verdicts.count('INCONCLUSIVE')

    if support_votes > contradict_votes and support_votes > inconclusive_votes:
        consensus = 'STRONG_SUPPORT'
    elif contradict_votes > support_votes and contradict_votes > inconclusive_votes:
        consensus = 'STRONG_CONTRADICTION'
    elif support_votes == contradict_votes and support_votes > 0:
        consensus = 'MIXED_EVIDENCE'
    else:
        consensus = 'INSUFFICIENT_EVIDENCE'

    comparison['hypothesis_support_summary'] = {
        'methods_tested': len(methods_support),
        'support_votes': support_votes,
        'contradict_votes': contradict_votes,
        'inconclusive_votes': inconclusive_votes,
        'consensus': consensus,
        'method_results': methods_support
    }
    return comparison


def _summarize_hypothesis_conclusion(method_comparison):
    """Aggregate the per-metric consensus labels into one overall verdict."""
    all_consensuses = [comp['hypothesis_support_summary']['consensus']
                       for comp in method_comparison.values()]

    strong_support_count = all_consensuses.count('STRONG_SUPPORT')
    strong_contradiction_count = all_consensuses.count('STRONG_CONTRADICTION')
    mixed_count = all_consensuses.count('MIXED_EVIDENCE')
    insufficient_count = all_consensuses.count('INSUFFICIENT_EVIDENCE')

    if strong_support_count > strong_contradiction_count and strong_support_count > 0:
        overall_conclusion = 'HYPOTHESIS_STRONGLY_SUPPORTED'
    elif strong_contradiction_count > strong_support_count and strong_contradiction_count > 0:
        overall_conclusion = 'HYPOTHESIS_STRONGLY_CONTRADICTED'
    elif mixed_count > 0:
        overall_conclusion = 'MIXED_EVIDENCE_ACROSS_METRICS'
    else:
        overall_conclusion = 'INSUFFICIENT_EVIDENCE'

    # Confidence reflects how many metrics reached a decisive consensus.
    # With no metrics analysed, 0 >= 0 * 0.75 would otherwise report 'HIGH'.
    decisive_count = strong_support_count + strong_contradiction_count
    n_metrics = len(all_consensuses)
    if n_metrics > 0 and decisive_count >= n_metrics * 0.75:
        confidence_level = 'HIGH'
    elif decisive_count > 0:
        confidence_level = 'MODERATE'
    else:
        confidence_level = 'LOW'

    return {
        'overall_result': overall_conclusion,
        'metrics_tested': n_metrics,
        'strong_support_count': strong_support_count,
        'strong_contradiction_count': strong_contradiction_count,
        'mixed_evidence_count': mixed_count,
        'insufficient_evidence_count': insufficient_count,
        'confidence_level': confidence_level
    }


def advanced_statistical_methods_analysis(df=None, alpha=0.05, n_permutations=10000, n_bootstrap=5000):
    """
    Comprehensive advanced statistical analysis for Hypothesis 2 using:
    1. Permutation Tests (two-sided and one-sided, on the mean difference)
    2. Extended Bootstrap Methods (percentile, BCa and circular-block intervals)
    3. Bayesian Analysis (credible intervals, Savage-Dickey Bayes factor)

    All differences are computed as (non-hallucination group) minus
    (hallucination group) on an error metric, so a negative difference is
    read as support for the hypothesis. The groups are split on the
    'is_hallucination' column; the non-hallucination group is labelled "TP"
    in logs and result keys.

    Parameters:
        df (pd.DataFrame, optional): Combined dataset with traceability and SIFP data.
            Falls back to the global 'combined_df' when omitted.
        alpha (float): Significance level (default: 0.05)
        n_permutations (int): Number of permutation samples (default: 10000)
        n_bootstrap (int): Number of bootstrap samples (default: 5000)

    Returns:
        dict: Results keyed by 'permutation_tests', 'extended_bootstrap',
        'bayesian_analysis', 'method_comparison' and 'hypothesis_conclusion'.
        An empty dict is returned when the data is empty, lacks a required
        column, or has no valid rows. Metrics that are missing or have fewer
        than 5 observations in either group are left out of the per-method
        results.
    """
    # Use global variables if not provided
    df = df if df is not None else globals().get('combined_df', pd.DataFrame())

    if df.empty:
        logger.error("No data available for advanced statistical analysis")
        return {}

    # Report missing inputs the same way as the other "no data" cases instead
    # of letting dropna raise a KeyError.
    required_columns = ['sifp_actor_total', 'sifp_final_total', 'classification', 'is_hallucination']
    missing_columns = [column for column in required_columns if column not in df.columns]
    if missing_columns:
        logger.error(f"Missing required columns for advanced statistical analysis: {missing_columns}")
        return {}

    # Filter for valid data
    valid_df = df.dropna(subset=required_columns)
    valid_df = valid_df[valid_df['classification'].isin(['TP', 'FP', 'FN'])]

    if len(valid_df) == 0:
        logger.error("No valid data for advanced statistical analysis")
        return {}

    # Split groups (element-wise comparison on the pandas column is intentional)
    tp_df = valid_df[valid_df['is_hallucination'] == False]
    hallucination_df = valid_df[valid_df['is_hallucination'] == True]

    logger.info(f"Advanced analysis: {len(tp_df)} TP links vs {len(hallucination_df)} Hallucination links")

    results = {
        'permutation_tests': {},
        'extended_bootstrap': {},
        'bayesian_analysis': {},
        'method_comparison': {},
        'hypothesis_conclusion': {}
    }

    metrics_to_test = ['sifp_abs_error', 'sifp_pct_error']

    for metric in metrics_to_test:
        # Both groups are slices of valid_df and therefore share its columns.
        if metric not in valid_df.columns:
            continue

        tp_values = tp_df[metric].dropna()
        halluc_values = hallucination_df[metric].dropna()

        if len(tp_values) < 5 or len(halluc_values) < 5:
            continue

        logger.info(f"Analyzing {metric} with advanced methods...")

        # 1. PERMUTATION TESTS
        logger.info(f"  Running permutation tests for {metric}...")
        results['permutation_tests'][metric] = _run_permutation_tests(
            tp_values, halluc_values, alpha, n_permutations
        )

        # 2. EXTENDED BOOTSTRAP METHODS
        logger.info(f"  Running extended bootstrap methods for {metric}...")
        results['extended_bootstrap'][metric] = _run_bootstrap_analysis(
            tp_values, halluc_values, alpha, n_bootstrap
        )

        # 3. BAYESIAN ANALYSIS
        logger.info(f"  Running Bayesian analysis for {metric}...")
        bayesian_summary = _run_bayesian_analysis(tp_values, halluc_values, alpha)
        if bayesian_summary is not None:
            results['bayesian_analysis'][metric] = bayesian_summary

    # 4. METHOD COMPARISON AND SYNTHESIS
    logger.info("Synthesizing results across all advanced methods...")

    method_sections = ('permutation_tests', 'extended_bootstrap', 'bayesian_analysis')
    for metric in metrics_to_test:
        if any(metric in results[section] for section in method_sections):
            results['method_comparison'][metric] = _compare_methods(results, metric)

    # 5. OVERALL HYPOTHESIS CONCLUSION
    results['hypothesis_conclusion'] = _summarize_hypothesis_conclusion(results['method_comparison'])

    return results

# Announce the methods, run the advanced analysis for Hypothesis 2, and
# publish the outcome under a module-level name for the display cell.
print(
    "ADVANCED STATISTICAL METHODS FOR HYPOTHESIS 2:",
    "=" * 60,
    "Applying cutting-edge statistical methods:",
    "1. PERMUTATION TESTS - Exact p-values without distributional assumptions",
    "2. EXTENDED BOOTSTRAP - BCa intervals and multiple bootstrap types",
    "3. BAYESIAN ANALYSIS - Credible intervals and Bayes factors",
    "=" * 60,
    "",
    sep="\n",
)

# Same settings as the function defaults, spelled out for the reader
advanced_results = advanced_statistical_methods_analysis(
    alpha=0.05,
    n_permutations=10000,
    n_bootstrap=5000,
)

# Keep a second module-level handle that the results-display cell looks up
advanced_statistical_results = advanced_results

if not advanced_results:
    # An empty dict means the analysis bailed out early
    print("❌ No advanced statistical results generated. Check data and error logs.")
else:
    metrics_with_results = len(advanced_results.get('method_comparison', {}))
    print("✅ ADVANCED STATISTICAL ANALYSIS COMPLETE!")
    print(f"Results generated for {metrics_with_results} metrics")
    print("Results stored in 'advanced_statistical_results' for display in next cell.")

In [None]:
# Cell [12] - Advanced Statistical Methods: Results Display and Interpretation
# Purpose: Display and interpret results from advanced statistical analysis of Hypothesis 2
# Dependencies: Results from cell 11 (advanced_statistical_results)
# Breadcrumbs: Advanced Methods -> Results Display -> Comprehensive Interpretation

# Display comprehensive results from advanced statistical analysis
try:
    # Check if results are available from previous cell
    if 'advanced_statistical_results' not in globals() or not advanced_statistical_results:
        print("⚠️ No advanced statistical results found.")
        print("Please run Cell [11] first to generate the results.")
    else:
        advanced_results = advanced_statistical_results
        
        print("COMPREHENSIVE ADVANCED STATISTICAL RESULTS FOR HYPOTHESIS 2:")
        print("=" * 70)
        print("HYPOTHESIS: 'Hallucinations in LLM outputs significantly correlate with")
        print("            lower quality SiFP estimates in new product development'")
        print("=" * 70)
        print()
        
        # Section 1 - permutation tests: one report per error metric found under
        # advanced_results['permutation_tests'], or a placeholder when there is none.
        if 'permutation_tests' not in advanced_results or not advanced_results['permutation_tests']:
            print("1. PERMUTATION TESTS: No results available")
            print()
        else:
            for heading in (
                "1. PERMUTATION TEST RESULTS:",
                "   (Exact p-values without distributional assumptions)",
                "-" * 50,
            ):
                print(heading)

            for metric, perm_data in advanced_results['permutation_tests'].items():
                # Entries carrying an 'error' key are reported and skipped.
                if 'error' in perm_data:
                    print(f"  {metric}: Error - {perm_data['error']}")
                    continue

                # Human-readable label derived from the metric key.
                if "abs" in metric:
                    metric_name = "Absolute Error"
                else:
                    metric_name = "Percentage Error"

                print(f"  📊 {metric_name}:")
                print(f"    Observed Difference (TP - Hallucinations): {perm_data['observed_difference']:.4f}")
                print(f"    Two-sided p-value: {perm_data['two_sided']['p_value']:.6f}")
                print(f"    One-sided p-value (TP < Hallucinations): {perm_data['one_sided_less']['p_value']:.6f}")
                print(f"    Permutations: {perm_data['n_permutations']:,}")

                verdict = perm_data['hypothesis_interpretation']
                print(f"    Hypothesis Support: {verdict}")

                # Pick the two-line visual interpretation, then emit it.
                if verdict == 'SUPPORTS':
                    headline = f"    ✅ SUPPORTS: TP links have significantly lower {metric_name.lower()}"
                    takeaway = "       → Hallucinations correlate with HIGHER estimation errors"
                elif verdict == 'CONTRADICTS':
                    headline = f"    ❌ CONTRADICTS: TP links have significantly higher {metric_name.lower()}"
                    takeaway = "       → Hallucinations correlate with LOWER estimation errors"
                else:
                    headline = "    ❓ INCONCLUSIVE: No significant difference detected"
                    takeaway = "       → Cannot conclude hallucinations affect estimation quality"
                print(headline)
                print(takeaway)
                print()
        
        # Display Extended Bootstrap Results
        # For every error metric under advanced_results['extended_bootstrap'] this
        # prints whichever of the IID, BCa and circular-block bootstrap summaries
        # are present. Entries carrying an 'error' key are reported explicitly
        # (as the permutation and Bayesian sections do) rather than being skipped
        # without any output; the output for successful results is unchanged.
        if 'extended_bootstrap' in advanced_results and advanced_results['extended_bootstrap']:
            print("2. EXTENDED BOOTSTRAP RESULTS:")
            print("   (Robust confidence intervals and multiple bootstrap methods)")
            print("-" * 60)
            for metric, boot_data in advanced_results['extended_bootstrap'].items():
                metric_name = "Absolute Error" if "abs" in metric else "Percentage Error"
                print(f"  📈 {metric_name}:")

                # Metric-level failure: surface it, but still show any sub-results below.
                if 'error' in boot_data:
                    print(f"    Error - {boot_data['error']}")

                # IID Bootstrap Results
                if 'iid_bootstrap' in boot_data:
                    iid = boot_data['iid_bootstrap']
                    if 'error' in iid:
                        print(f"    IID Bootstrap: Error - {iid['error']}")
                    else:
                        print(f"    IID Bootstrap (n={iid['n_bootstrap']:,}):")
                        print(f"      95% CI: [{iid['ci_lower']:.4f}, {iid['ci_upper']:.4f}]")
                        print(f"      Bootstrap p-value: {iid['bootstrap_p_value']:.6f}")
                        print(f"      Contains zero: {'Yes' if iid['contains_zero'] else 'No'}")
                        print(f"      Hypothesis Support: {iid['hypothesis_interpretation']}")

                        # Visual interpretation
                        if iid['hypothesis_interpretation'] == 'SUPPORTS':
                            print("      ✅ Bootstrap CI supports hypothesis")
                        elif iid['hypothesis_interpretation'] == 'CONTRADICTS':
                            print("      ❌ Bootstrap CI contradicts hypothesis")
                        else:
                            print("      ❓ Bootstrap CI is inconclusive")

                # BCa Bootstrap Results
                if 'bca_intervals' in boot_data:
                    bca = boot_data['bca_intervals']
                    if 'error' in bca:
                        print(f"    BCa Bootstrap: Error - {bca['error']}")
                    else:
                        print("    BCa Bootstrap (Bias-Corrected & Accelerated):")
                        print(f"      95% CI: [{bca['ci_lower']:.4f}, {bca['ci_upper']:.4f}]")
                        print(f"      Bias correction: {bca['bias_correction']:.4f}")
                        print(f"      Acceleration: {bca['acceleration']:.4f}")
                        print(f"      Contains zero: {'Yes' if bca['contains_zero'] else 'No'}")

                # Circular Block Bootstrap Results
                if 'circular_block_bootstrap' in boot_data:
                    circular = boot_data['circular_block_bootstrap']
                    if 'error' in circular:
                        print(f"    Circular Block Bootstrap: Error - {circular['error']}")
                    else:
                        print(f"    Circular Block Bootstrap (n={circular['n_bootstrap']}):")
                        print(f"      95% CI: [{circular['ci_lower']:.4f}, {circular['ci_upper']:.4f}]")
                        print(f"      Hypothesis Support: {circular['hypothesis_interpretation']}")
                print()
        else:
            print("2. EXTENDED BOOTSTRAP: No results available")
            print()
        
        # Section 3 - Bayesian analysis: posterior summary, credible intervals,
        # Bayes factor and a plain-language verdict for each error metric found
        # under advanced_results['bayesian_analysis'].
        if 'bayesian_analysis' not in advanced_results or not advanced_results['bayesian_analysis']:
            print("3. BAYESIAN ANALYSIS: No results available")
            print()
        else:
            print("3. BAYESIAN ANALYSIS RESULTS:")
            print("   (Probabilistic inference and model comparison)")
            print("-" * 55)

            for metric, bayes_data in advanced_results['bayesian_analysis'].items():
                # Entries carrying an 'error' key are reported and skipped.
                if 'error' in bayes_data:
                    print(f"  {metric}: Error - {bayes_data['error']}")
                    continue

                if "abs" in metric:
                    metric_name = "Absolute Error"
                else:
                    metric_name = "Percentage Error"
                print(f"  🔮 {metric_name}:")

                # Posterior difference between the two groups
                print(f"    Posterior Mean Difference: {bayes_data['posterior_mean_difference']:.4f}")
                print(f"    Posterior Std Difference: {bayes_data['posterior_std_difference']:.4f}")

                # 95% credible interval of that difference
                cred_interval = bayes_data['credible_interval_95']
                print(f"    95% Credible Interval: [{cred_interval['lower']:.4f}, {cred_interval['upper']:.4f}]")
                if cred_interval['contains_zero']:
                    zero_flag = 'Yes'
                else:
                    zero_flag = 'No'
                print(f"    Contains zero: {zero_flag}")

                # Effect size and its interval
                effect_size = bayes_data['effect_size']
                print(f"    Effect Size (mean): {effect_size['mean']:.4f}")
                effect_interval = effect_size['credible_interval']
                print(f"    Effect Size 95% CI: [{effect_interval['lower']:.4f}, {effect_interval['upper']:.4f}]")

                # Probability statements
                print(f"    Probability Supporting Hypothesis: {bayes_data['prob_supports_hypothesis']:.3f}")
                print(f"    Bayesian p-value: {bayes_data['bayesian_p_value']:.6f}")

                # Bayes factor and the stored interpretations
                print(f"    Bayes Factor (BF₁₀): {bayes_data['bayes_factor']:.2f}")
                print(f"    BF Interpretation: {bayes_data['bayes_factor_interpretation']}")
                print(f"    Hypothesis Support: {bayes_data['hypothesis_interpretation']}")

                # Verdict line: strong support above 0.95, strong contradiction
                # below 0.05, otherwise reported as uncertain.
                support_prob = bayes_data['prob_supports_hypothesis']
                if support_prob > 0.95:
                    verdict_line = f"    ✅ STRONG BAYESIAN SUPPORT: {support_prob:.1%} probability TP has lower errors"
                elif support_prob < 0.05:
                    contradiction_prob = 1 - support_prob
                    verdict_line = f"    ❌ STRONG BAYESIAN CONTRADICTION: {contradiction_prob:.1%} probability TP has higher errors"
                else:
                    verdict_line = f"    ❓ BAYESIAN UNCERTAINTY: {support_prob:.1%} probability supporting hypothesis"
                print(verdict_line)

                # Sample counts recorded with the model
                sample_info = bayes_data['model_summary']
                print(f"    Model: {sample_info['n_samples']:,} samples, TP={sample_info['n_tp']}, Halluc={sample_info['n_hallucinations']}")
                print()
        
        # Section 4 - cross-method comparison: vote tally, consensus label,
        # per-method outcome and (when present) p-values for each error metric
        # under advanced_results['method_comparison'].
        if 'method_comparison' not in advanced_results or not advanced_results['method_comparison']:
            print("4. CROSS-METHOD COMPARISON: No results available")
            print()
        else:
            print("4. CROSS-METHOD COMPARISON:")
            print("   (Consensus across statistical approaches)")
            print("-" * 50)

            for metric, comp_data in advanced_results['method_comparison'].items():
                if "abs" in metric:
                    metric_name = "Absolute Error"
                else:
                    metric_name = "Percentage Error"
                summary = comp_data['hypothesis_support_summary']

                print(f"  🔍 {metric_name}:")
                print(f"    Methods Tested: {summary['methods_tested']}")
                print(f"    Support Votes: {summary['support_votes']} ✅")
                print(f"    Contradict Votes: {summary['contradict_votes']} ❌")
                print(f"    Inconclusive Votes: {summary['inconclusive_votes']} ❓")

                consensus = summary['consensus']
                print(f"    Consensus: {consensus}")

                # Translate the consensus code into its one-line explanation.
                if consensus == 'STRONG_SUPPORT':
                    consensus_note = "    🎯 STRONG CONSENSUS: Methods agree hypothesis is SUPPORTED"
                elif consensus == 'STRONG_CONTRADICTION':
                    consensus_note = "    🎯 STRONG CONSENSUS: Methods agree hypothesis is CONTRADICTED"
                elif consensus == 'MIXED_EVIDENCE':
                    consensus_note = "    ⚖️ MIXED EVIDENCE: Methods disagree on hypothesis"
                else:
                    consensus_note = "    ❓ INSUFFICIENT EVIDENCE: No clear consensus"
                print(consensus_note)

                # One line per (method, outcome) pair, prefixed with a marker.
                print("    Method Details:")
                for method, outcome in summary['method_results']:
                    if outcome == "SUPPORTS":
                        marker = "✅"
                    elif outcome == "CONTRADICTS":
                        marker = "❌"
                    else:
                        marker = "❓"
                    print(f"      {marker} {method}: {outcome}")

                # Optional p-value table; values below 0.05 are labelled significant.
                if 'p_values' in comp_data:
                    print("    P-values:")
                    for test_name, p_val in comp_data['p_values'].items():
                        if p_val < 0.05:
                            significance = "significant"
                        else:
                            significance = "not significant"
                        print(f"      {test_name}: {p_val:.6f} ({significance})")
                print()
        
        # Section 5 - overall verdict for Hypothesis 2. Prints the stored counts,
        # then the narrative (evidence summary, implications, recommendations)
        # that matches conclusion['overall_result']; any unrecognised result falls
        # back to the "insufficient evidence" narrative.
        if 'hypothesis_conclusion' in advanced_results:
            conclusion = advanced_results['hypothesis_conclusion']
            print("5. 🎯 COMPREHENSIVE HYPOTHESIS 2 CONCLUSION:")
            print("=" * 60)
            print(f"Overall Result: {conclusion['overall_result']}")
            print(f"Confidence Level: {conclusion['confidence_level']}")
            print(f"Metrics Tested: {conclusion['metrics_tested']}")
            for count_label, count_key in (
                ("Strong Support", 'strong_support_count'),
                ("Strong Contradiction", 'strong_contradiction_count'),
                ("Mixed Evidence", 'mixed_evidence_count'),
                ("Insufficient Evidence", 'insufficient_evidence_count'),
            ):
                print(f"{count_label}: {conclusion[count_key]} metric(s)")
            print()

            result = conclusion['overall_result']
            confidence = conclusion['confidence_level']

            print("📋 FINAL INTERPRETATION & RECOMMENDATIONS:")
            print("-" * 50)

            # Narrative texts, one tuple of output lines per possible verdict.
            # An empty string stands for a blank output line.
            supported_report = (
                "🏆 CONCLUSION: HYPOTHESIS 2 IS STRONGLY SUPPORTED",
                "",
                "📊 EVIDENCE SUMMARY:",
                "   ✅ Multiple advanced statistical methods converge on the conclusion that",
                "      hallucinations in LLM outputs (both FP over-identification and FN",
                "      under-identification) significantly correlate with lower quality",
                "      SiFP estimates in new product development.",
                "",
                "🔬 STATISTICAL RIGOR:",
                "   • Permutation tests: Exact p-values without distributional assumptions",
                "   • Extended bootstrap: Robust confidence intervals (IID, BCa, Circular Block)",
                "   • Bayesian analysis: Probabilistic evidence with credible intervals & Bayes factors",
                "   • Cross-method consensus: Multiple approaches agree on the conclusion",
                "",
                "💡 PRACTICAL IMPLICATIONS:",
                "   1. Hallucinated traceability links are associated with WORSE SiFP estimates",
                "   2. Focus on improving LLM accuracy to enhance estimation quality",
                "   3. Implement hallucination detection mechanisms in traceability systems",
                "   4. Use estimation quality as a feedback signal for LLM performance",
                "",
                "🎯 RECOMMENDATIONS:",
                "   • Prioritize reducing both FP (over-identification) and FN (missed features)",
                "   • Implement confidence scoring for traceability predictions",
                "   • Use estimation accuracy as a validation metric for traceability quality",
                "   • Consider ensemble methods to reduce hallucination rates",
            )
            contradicted_report = (
                "🚫 CONCLUSION: HYPOTHESIS 2 IS STRONGLY CONTRADICTED",
                "",
                "📊 EVIDENCE SUMMARY:",
                "   ❌ Multiple advanced statistical methods converge on the conclusion that",
                "      hallucinations do NOT significantly correlate with lower quality",
                "      SiFP estimates. In fact, the evidence suggests hallucinations may",
                "      be associated with equal or even better estimation quality.",
                "",
                "🤔 UNEXPECTED FINDINGS:",
                "   • Hallucinated links may not negatively impact estimation accuracy",
                "   • Current LLM hallucinations might be 'beneficial' or neutral for SiFP",
                "   • The relationship between traceability and estimation may be more complex",
                "",
                "💡 PRACTICAL IMPLICATIONS:",
                "   1. Hallucination reduction may not improve SiFP estimation quality",
                "   2. Focus effort on other factors affecting estimation accuracy",
                "   3. Re-examine the assumed relationship between traceability and estimation",
                "   4. Consider whether current 'hallucinations' capture useful information",
                "",
                "🎯 RECOMMENDATIONS:",
                "   • Investigate why hallucinations don't correlate with worse estimates",
                "   • Examine other factors that might influence SiFP estimation quality",
                "   • Consider revising the definition of 'hallucination' in this context",
                "   • Focus optimization efforts on different aspects of the system",
            )
            mixed_report = (
                "⚖️ CONCLUSION: MIXED EVIDENCE FOR HYPOTHESIS 2",
                "",
                "📊 EVIDENCE SUMMARY:",
                "   🔀 Different error metrics show different patterns. The relationship",
                "      between hallucinations and estimation quality appears to be more",
                "      complex than hypothesized, potentially varying by error type,",
                "      context, or other moderating factors.",
                "",
                "🧩 COMPLEXITY INDICATORS:",
                "   • Some metrics support the hypothesis while others contradict it",
                "   • Effect may depend on the type of estimation error measured",
                "   • Relationship may be moderated by other variables",
                "",
                "💡 PRACTICAL IMPLICATIONS:",
                "   1. Simple binary relationship between hallucinations and quality may not exist",
                "   2. Different types of estimation errors may be affected differently",
                "   3. Context-dependent effects require more nuanced analysis",
                "",
                "🎯 RECOMMENDATIONS:",
                "   • Conduct subgroup analyses to identify moderating factors",
                "   • Examine different types of hallucinations separately",
                "   • Consider interaction effects with other variables",
                "   • Develop more sophisticated models of the relationship",
            )
            insufficient_report = (
                "❓ CONCLUSION: INSUFFICIENT EVIDENCE FOR HYPOTHESIS 2",
                "",
                "📊 EVIDENCE SUMMARY:",
                "   ⚠️ The available data and methods do not provide sufficient evidence",
                "      to draw strong conclusions about the relationship between",
                "      hallucinations and SiFP estimation quality.",
                "",
                "🔍 LIMITATIONS:",
                "   • Sample size may be too small for reliable detection",
                "   • Effect size may be smaller than detectable with current data",
                "   • Measurement noise may obscure true relationships",
                "",
                "🎯 RECOMMENDATIONS:",
                "   • Collect additional data to increase statistical power",
                "   • Improve measurement precision of key variables",
                "   • Consider alternative analytical approaches",
                "   • Replicate analysis with different datasets",
            )

            # Select the narrative by equality with the stored result code.
            chosen_report = insufficient_report
            for verdict_key, report_lines in (
                ('HYPOTHESIS_STRONGLY_SUPPORTED', supported_report),
                ('HYPOTHESIS_STRONGLY_CONTRADICTED', contradicted_report),
                ('MIXED_EVIDENCE_ACROSS_METRICS', mixed_report),
            ):
                if result == verdict_key:
                    chosen_report = report_lines
                    break

            for report_line in chosen_report:
                print(report_line)

            print()
            print("🏁 ANALYSIS COMPLETE!")
            print(f"Confidence in conclusion: {confidence}")
            print("Results available in 'advanced_statistical_results' for further analysis.")
        
        print("\n" + "=" * 70)
        print("ADVANCED STATISTICAL HYPOTHESIS 2 TESTING COMPLETE!")
        print("All results stored in 'advanced_statistical_results' global variable.")
        print("=" * 70)

except Exception as e:
    print(f"❌ Error displaying advanced statistical results: {str(e)}")
    import traceback
    traceback.print_exc() 