# Hypothesis 1: Hallucination Reduction for NPD Accuracy Improvement
**Techniques reducing the hallucinations generated by LLMs by 30% can improve the accuracy of outputs used in NPD.**

In [None]:
# Cell [0] - Setup and Imports
# Purpose: Import all required libraries and configure environment settings for sentence transformer and meta judge comparison
# Dependencies: os, logging, numpy, pandas, dotenv, neo4j, matplotlib, seaborn, sklearn, scipy, statsmodels, json, datetime, pathlib
# Breadcrumbs: Setup -> Imports -> Environment Configuration

import os
import logging
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from neo4j import GraphDatabase
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, fbeta_score,
    matthews_corrcoef, confusion_matrix, balanced_accuracy_score,
    cohen_kappa_score, roc_auc_score, precision_recall_curve, auc,
    classification_report
)
from sklearn.utils import resample
import scipy.stats as stats
from scipy import stats as scipy_stats
from scipy.stats import wilcoxon, friedmanchisquare, ttest_rel
from statsmodels.stats.contingency_tables import mcnemar
from statsmodels.stats.multitest import multipletests
import json
from datetime import datetime
from pathlib import Path

def setup_environment():
    """
    Prepare the notebook runtime: logging, .env values and the results folder.

    Side effects:
        - Calls logging.basicConfig with level INFO.
        - Calls load_dotenv() before the os.getenv lookups below.
        - Creates a 'results' directory (relative path) if it is missing.

    Returns:
        tuple: (config, logger)
            - config (dict): NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD and
              NEO4J_PROJECT_NAME as found in the environment (None when
              unset), OPTIMIZATION_METRIC (upper-cased, default 'F2'),
              SHOW_VISUALIZATION (bool, True only for the text 'true' in any
              casing), MATCH_DIRECTION (default 'source_to_target') and
              RESULTS_DIR (Path to the results directory).
            - logger (logging.Logger): logger obtained for __name__.
    """
    logging.basicConfig(level=logging.INFO)
    log = logging.getLogger(__name__)

    load_dotenv()

    # Values that are passed through exactly as found in the environment
    settings = {
        name: os.getenv(name)
        for name in ('NEO4J_URI', 'NEO4J_USER', 'NEO4J_PASSWORD', 'NEO4J_PROJECT_NAME')
    }

    # Values that carry a default and/or need normalising
    metric_name = os.getenv('OPTIMIZATION_METRIC', 'F2')
    settings['OPTIMIZATION_METRIC'] = metric_name.upper()
    show_flag = os.getenv('SHOW_VISUALIZATION', 'False')
    settings['SHOW_VISUALIZATION'] = show_flag.lower() == 'true'
    settings['MATCH_DIRECTION'] = os.getenv('MATCH_DIRECTION', 'source_to_target')

    # Output location for anything the notebook saves
    output_dir = Path('results')
    output_dir.mkdir(exist_ok=True)
    settings['RESULTS_DIR'] = output_dir

    log.info(f"Using {settings['OPTIMIZATION_METRIC']} score for threshold optimization")
    log.info(f"Visualization display is set to: {settings['SHOW_VISUALIZATION']}")
    log.info(f"Results will be saved to: {settings['RESULTS_DIR']}")

    return settings, log

# Execute setup: CONFIG and logger become notebook-level globals; functions in later cells use `logger` directly
CONFIG, logger = setup_environment()

In [None]:
# Cell [1] - Neo4j Connection Setup
# Purpose: Create connection to Neo4j database
# Dependencies: Neo4j GraphDatabase, config from Cell [0]
# Breadcrumbs: Setup -> Database Connection -> Driver Creation

def create_neo4j_driver(config):
    """
    Create a Neo4j driver and confirm that the database is actually reachable

    GraphDatabase.driver() only builds the driver object, so a wrong URI or
    wrong credentials would otherwise surface at the first query. The driver's
    verify_connectivity() is called here so that the "Successfully connected"
    log line is truthful and failures are reported at creation time.

    Parameters:
        config (dict): Configuration dictionary with Neo4j credentials
            (keys NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)

    Returns:
        GraphDatabase.driver: Connected Neo4j driver

    Raises:
        ValueError: If one of the three connection settings is missing (None)
        Exception: Any error raised while creating or verifying the driver is
            logged and re-raised unchanged
    """
    driver = None
    try:
        # os.getenv yields None for unset variables; fail with a clear message
        # instead of an obscure error from inside the driver.
        missing = [
            key for key in ('NEO4J_URI', 'NEO4J_USER', 'NEO4J_PASSWORD')
            if config.get(key) is None
        ]
        if missing:
            raise ValueError(f"Missing Neo4j configuration value(s): {', '.join(missing)}")

        driver = GraphDatabase.driver(
            config['NEO4J_URI'],
            auth=(config['NEO4J_USER'], config['NEO4J_PASSWORD'])
        )
        # Forces a round trip to the server; raises if it cannot be reached
        # or the credentials are rejected.
        driver.verify_connectivity()
        logger.info("Successfully connected to Neo4j database")
        return driver
    except Exception as e:
        logger.error(f"Failed to connect to Neo4j: {str(e)}")
        # Do not leak a half-initialised driver when verification failed
        if driver is not None:
            driver.close()
        raise

# Create the Neo4j driver once; the query functions in later cells receive it as an argument
driver = create_neo4j_driver(CONFIG)

In [None]:
# Cell [2] - Query SIMILAR_TO Links
# Purpose: Retrieve sentence transformer similarity links from Neo4j
# Dependencies: Neo4j driver from Cell [1], project configuration
# Breadcrumbs: Database Connection -> Similarity Data Retrieval -> Sentence Transformer Links

def query_similar_to_links(driver, project_name, match_direction='source_to_target'):
    """
    Query SIMILAR_TO links from Neo4j

    Only links whose SOURCE requirement has an outgoing GROUND_TRUTH
    relationship and whose TARGET requirement has an incoming one are
    returned. Each direction is queried in its own session.

    Parameters:
        driver: Neo4j driver connection
        project_name: Name of the project
        match_direction: Direction of matching ('source_to_target',
            'target_to_source', 'both'). Any other value runs no query, logs
            a warning naming the accepted values and yields an empty frame.

    Returns:
        pd.DataFrame: One row per link with project_name, source_id,
            target_id, sentence_transformer_model, similarity_score,
            timestamp, direction and a 'model' alias column. Empty DataFrame
            when nothing is found or when any error occurs (errors are
            logged, not raised).
    """
    try:
        logger.info(f"Querying SIMILAR_TO links for project: {project_name}")
        logger.info(f"Match direction: {match_direction}")

        # One Cypher statement per direction; source_id/target_id always refer
        # to the SOURCE/TARGET requirement regardless of the edge direction.
        direction_queries = {
            'source_to_target': """
            MATCH (p:Project {name: $project_name})-[:CONTAINS]->(d:Document)-[:CONTAINS]->(source_req:Requirement)-[r:SIMILAR_TO]->(target_req:Requirement)
            WHERE source_req.type = 'SOURCE' AND target_req.type = 'TARGET'
            AND EXISTS { 
                MATCH (source_req)-[:GROUND_TRUTH]->() 
            } 
            AND EXISTS { 
                MATCH ()-[:GROUND_TRUTH]->(target_req) 
            }
            RETURN 
                p.name as project_name,
                source_req.id as source_id,
                target_req.id as target_id,
                r.model as sentence_transformer_model,
                r.similarity as similarity_score,
                r.timestamp as timestamp,
                'source_to_target' as direction
            """,
            'target_to_source': """
            MATCH (p:Project {name: $project_name})-[:CONTAINS]->(d:Document)-[:CONTAINS]->(target_req:Requirement)-[r:SIMILAR_TO]->(source_req:Requirement)
            WHERE target_req.type = 'TARGET' AND source_req.type = 'SOURCE'
            AND EXISTS { 
                MATCH (source_req)-[:GROUND_TRUTH]->() 
            } 
            AND EXISTS { 
                MATCH ()-[:GROUND_TRUTH]->(target_req) 
            }
            RETURN 
                p.name as project_name,
                source_req.id as source_id,
                target_req.id as target_id,
                r.model as sentence_transformer_model,
                r.similarity as similarity_score,
                r.timestamp as timestamp,
                'target_to_source' as direction
            """,
        }

        if match_direction == 'both':
            directions = list(direction_queries)
        elif match_direction in direction_queries:
            directions = [match_direction]
        else:
            # Previously this case fell through silently and looked like
            # "no data"; make the misconfiguration visible.
            logger.warning(
                f"Unrecognised match_direction '{match_direction}'; "
                f"expected one of {list(direction_queries) + ['both']}"
            )
            directions = []

        all_results = []
        for direction in directions:
            with driver.session() as session:
                results = session.run(direction_queries[direction], project_name=project_name).data()
                all_results.extend(results)
                logger.info(f"Retrieved {len(results)} {direction.replace('_', '-')} SIMILAR_TO links")

        if not all_results:
            logger.warning("No SIMILAR_TO links found")
            return pd.DataFrame()

        df = pd.DataFrame(all_results)
        df['model'] = df['sentence_transformer_model']  # Add alias for compatibility
        logger.info(f"Total SIMILAR_TO links retrieved: {len(df)}")

        # Count links per model
        model_counts = df['sentence_transformer_model'].value_counts()
        logger.info("Models found:")
        for model, count in model_counts.items():
            logger.info(f"  - {model}: {count} links")

        return df

    except Exception as e:
        # Best-effort contract: callers test `.empty` instead of catching
        logger.error(f"Error querying SIMILAR_TO links: {str(e)}")
        return pd.DataFrame()

# Fetch similarity links for the configured project and match direction
similar_to_df = query_similar_to_links(
    driver,
    project_name=CONFIG['NEO4J_PROJECT_NAME'],
    match_direction=CONFIG['MATCH_DIRECTION'],
)

# Show a short overview plus the first rows when any links came back
if not similar_to_df.empty:
    print(
        "\nSIMILAR_TO Links Summary:",
        f"Total links: {len(similar_to_df)}",
        f"Unique models: {similar_to_df['model'].nunique()}",
        "\nSample data:",
        similar_to_df.head(),
        sep="\n",
    )

In [None]:
# Cell [3] - Query Ground Truth Links
# Purpose: Retrieve ground truth traceability links from Neo4j
# Dependencies: Neo4j driver from Cell [1], project configuration
# Breadcrumbs: Database Connection -> Ground Truth Data Retrieval -> Traceability Links

def query_ground_truth_links(driver, project_name):
    """
    Fetch the GROUND_TRUTH relationships of one project from Neo4j

    Parameters:
        driver: Neo4j driver connection
        project_name: Name of the project

    Returns:
        pd.DataFrame: One row per GROUND_TRUTH relationship with the columns
            project_name, source_id, source_type, target_id, target_type and
            a constant ground_truth = 1. An empty DataFrame is returned when
            no links exist or when the query fails (the error is logged).
    """
    try:
        cypher = """
        MATCH (p:Project {name: $project_name})-[:CONTAINS]->(d:Document)-[:CONTAINS]->(source:Requirement)-[r:GROUND_TRUTH]->(target:Requirement)
        RETURN 
            p.name as project_name,
            source.id as source_id,
            source.type as source_type,
            target.id as target_id,
            target.type as target_type,
            1 as ground_truth
        ORDER BY source.id, target.id DESC
        """

        with driver.session() as session:
            records = session.run(cypher, project_name=project_name).data()

            # Guard clause: nothing stored for this project
            if not records:
                logger.warning("No ground truth links found")
                return pd.DataFrame()

            logger.info(f"Retrieved {len(records)} ground truth links")
            return pd.DataFrame(records)

    except Exception as exc:
        logger.error(f"Error querying ground truth links: {str(exc)}")
        return pd.DataFrame()

# Fetch the ground truth links for the configured project
df_ground_truth = query_ground_truth_links(driver, project_name=CONFIG['NEO4J_PROJECT_NAME'])

# Summarise: link count, distinct endpoints, and density
# (links divided by the number of possible source x target pairs)
if not df_ground_truth.empty:
    print(
        "\nGround Truth Links Summary:",
        f"Total links: {len(df_ground_truth)}",
        f"Unique source requirements: {df_ground_truth['source_id'].nunique()}",
        f"Unique target requirements: {df_ground_truth['target_id'].nunique()}",
        f"Link density: {len(df_ground_truth) / (df_ground_truth['source_id'].nunique() * df_ground_truth['target_id'].nunique()):.4f}",
        sep="\n",
    )

In [None]:
# Cell [4] - Create Combined Dataset
# Purpose: Merge similarity and ground truth data for analysis
# Dependencies: similar_to_df from Cell [2], df_ground_truth from Cell [3]
# Breadcrumbs: Data Retrieval -> Data Merge -> Combined Dataset Creation

def create_combined_dataset(similar_to_df, df_ground_truth):
    """
    Label every similarity link with whether it is a ground truth link

    Parameters:
        similar_to_df: DataFrame with similarity scores (needs source_id,
            target_id and similarity_score columns)
        df_ground_truth: DataFrame with ground truth links (needs source_id
            and target_id columns); may be empty

    Returns:
        pd.DataFrame: Copy of similar_to_df with a boolean
            'ground_truth_traceable' column (all False when no ground truth
            is available) and 'similarity_score' coerced to numeric, with
            unparseable or missing scores replaced by 0. An empty DataFrame
            is returned when similar_to_df is empty.
    """
    if similar_to_df.empty:
        logger.error("No SIMILAR_TO links available")
        return pd.DataFrame()

    labelled = similar_to_df.copy()

    if df_ground_truth.empty:
        logger.warning("No ground truth data available")
        labelled['ground_truth_traceable'] = False
    else:
        # (source_id, target_id) pairs known to be true links
        true_pairs = set(zip(df_ground_truth['source_id'], df_ground_truth['target_id']))

        # Membership test per row, in row order
        labelled['ground_truth_traceable'] = [
            pair in true_pairs
            for pair in zip(labelled['source_id'], labelled['target_id'])
        ]

        distribution = labelled['ground_truth_traceable'].value_counts().to_dict()
        logger.info(f"Ground truth distribution: {distribution}")

    # Non-numeric scores become NaN first and are then replaced by 0
    numeric_scores = pd.to_numeric(labelled['similarity_score'], errors='coerce')
    labelled['similarity_score'] = numeric_scores.fillna(0)

    return labelled

# Merge similarity links with ground truth labels
combined_df = create_combined_dataset(similar_to_df, df_ground_truth)

# Report size, class balance and score range of the merged data
if not combined_df.empty:
    print(
        "\nCombined Dataset Summary:",
        f"Total records: {len(combined_df)}",
        f"Ground truth positive: {combined_df['ground_truth_traceable'].sum()}",
        f"Ground truth negative: {(~combined_df['ground_truth_traceable']).sum()}",
        f"Similarity score range: [{combined_df['similarity_score'].min():.4f}, {combined_df['similarity_score'].max():.4f}]",
        sep="\n",
    )

In [None]:
# Cell [5] - Model Evaluation and Threshold Optimization
# Purpose: Evaluate each model and find optimal thresholds
# Dependencies: combined_df from Cell [4], sklearn metrics from Cell [0]
# Breadcrumbs: Combined Dataset -> Model Evaluation -> Threshold Optimization

def evaluate_model_thresholds(df, model_name, score_column='similarity_score',
                              ground_truth_column='ground_truth_traceable',
                              optimize_for='F2'):
    """
    Evaluate a model's performance across different thresholds

    Candidate thresholds are the ones reported by sklearn's
    precision_recall_curve plus 1.0 (added only if not already present).
    For each threshold a prediction is positive when score >= threshold.
    Thresholds whose predictions are all one class are skipped. Every metric
    of a row, including precision and recall, is computed from that row's own
    predictions, so the values always agree with the tp/fp/fn/tn counts.

    Parameters:
        df: DataFrame containing model predictions and ground truth
            (must have a 'model' column)
        model_name: Name of the model to evaluate
        score_column: Column containing similarity scores
        ground_truth_column: Column containing ground truth values
        optimize_for: Metric to optimize for ('F1' or 'F2'); any value other
            than 'F1' is treated as F2

    Returns:
        dict: Dictionary containing evaluation results
            - {} when the model has no rows or the ground truth column is
              missing
            - model_name/data_points/ground_truth_* plus an 'error' key when
              only one class is present, no threshold is usable, or an
              exception occurred (exceptions are logged, not raised)
            - otherwise the best_* metrics at the optimal threshold,
              'optimization_metric' and 'threshold_results' (DataFrame with
              one row per evaluated threshold)
    """
    model_df = None  # lets the except-branch report how many rows were selected
    try:
        # Filter data for this model
        model_df = df[df['model'] == model_name].copy()

        if model_df.empty:
            logger.warning(f"No data available for model: {model_name}")
            return {}

        if ground_truth_column not in model_df.columns:
            logger.warning(f"Ground truth column '{ground_truth_column}' not found for model: {model_name}")
            return {}

        # Get ground truth and scores
        y_true = model_df[ground_truth_column].astype(int).values

        # Check for and handle None/NaN values in similarity scores
        if model_df[score_column].isna().any():
            logger.warning(f"Found NaN values in {score_column} for model {model_name}. Filling with 0.")
            model_df[score_column] = model_df[score_column].fillna(0)

        # Ensure similarity scores are numeric
        if model_df[score_column].dtype == object:
            try:
                model_df[score_column] = pd.to_numeric(model_df[score_column])
                logger.info(f"Converted {score_column} to numeric for model {model_name}")
            except Exception as e:
                logger.error(f"Error converting {score_column} to numeric: {str(e)}")
                model_df[score_column] = 0

        scores = model_df[score_column].values

        n_total = len(model_df)
        n_positive = int(y_true.sum())
        n_negative = n_total - n_positive

        # Debug information
        print(f"  - Data points: {n_total}")
        print(f"  - Positive examples: {n_positive} ({n_positive/n_total*100:.2f}%)")
        print(f"  - Negative examples: {n_negative} ({n_negative/n_total*100:.2f}%)")
        print(f"  - Score range: {scores.min():.4f} to {scores.max():.4f}")

        # Fields shared by every non-empty return value
        summary = {
            'model_name': model_name,
            'data_points': n_total,
            'ground_truth_positive': n_positive,
            'ground_truth_negative': n_negative,
        }

        # If all ground truth values are the same, we can't calculate meaningful metrics
        if len(np.unique(y_true)) < 2:
            logger.warning(f"Insufficient ground truth variety for model {model_name} - all values are {np.unique(y_true)[0]}")
            return {**summary, 'error': 'Only one class present'}

        # Only the threshold candidates are taken from the curve. Its
        # precision/recall arrays hold one extra sentinel entry (precision 1,
        # recall 0) and do not line up with a manually added threshold.
        _, _, thresholds = precision_recall_curve(y_true, scores)

        # Add a threshold of 1.0 for completeness, without duplicating it
        if 1.0 not in thresholds:
            thresholds = np.append(thresholds, 1.0)

        # Calculate metrics for each threshold
        results = []

        for threshold in thresholds:
            # Convert scores to binary predictions using this threshold
            y_pred = (scores >= threshold).astype(int)

            # Skip if all predictions are the same (all 0 or all 1)
            if len(np.unique(y_pred)) < 2:
                continue

            # Confusion matrix components
            tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

            # Basic metrics, all derived from the same predictions
            accuracy = accuracy_score(y_true, y_pred)
            balanced_acc = balanced_accuracy_score(y_true, y_pred)
            prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0
            rec = tp / (tp + fn) if (tp + fn) > 0 else 0.0
            f1 = f1_score(y_true, y_pred, zero_division=0)
            f2 = fbeta_score(y_true, y_pred, beta=2, zero_division=0)

            # Additional metrics
            tnr = tn / (tn + fp) if (tn + fp) > 0 else 0  # Specificity/True Negative Rate
            fnr = fn / (fn + tp) if (fn + tp) > 0 else 0  # Miss Rate/False Negative Rate
            mcc = matthews_corrcoef(y_true, y_pred)  # Matthews Correlation Coefficient

            results.append({
                'threshold': threshold,
                'tp': tp,
                'fp': fp,
                'fn': fn,
                'tn': tn,
                'accuracy': accuracy,
                'balanced_accuracy': balanced_acc,
                'precision': prec,
                'recall': rec,
                'tnr': tnr,  # specificity
                'fnr': fnr,  # miss rate
                'f1_score': f1,
                'f2_score': f2,
                'mcc': mcc  # Matthews Correlation Coefficient
            })

        # Convert to DataFrame for easier analysis
        results_df = pd.DataFrame(results)

        if results_df.empty:
            logger.warning(f"No valid threshold results for model {model_name}")
            return {**summary, 'error': 'No valid thresholds found'}

        # Find best threshold based on optimization metric (first row wins ties)
        best_metric = 'f1_score' if optimize_for == 'F1' else 'f2_score'
        best_result = results_df.loc[results_df[best_metric].idxmax()]

        # Return comprehensive results
        return {
            **summary,
            'best_threshold': best_result['threshold'],
            'best_precision': best_result['precision'],
            'best_recall': best_result['recall'],
            'best_accuracy': best_result['accuracy'],
            'best_balanced_accuracy': best_result['balanced_accuracy'],
            'best_f1': best_result['f1_score'],
            'best_f2': best_result['f2_score'],
            'best_tnr': best_result['tnr'],
            'best_fnr': best_result['fnr'],
            'best_mcc': best_result['mcc'],
            'best_tp': int(best_result['tp']),
            'best_fp': int(best_result['fp']),
            'best_fn': int(best_result['fn']),
            'best_tn': int(best_result['tn']),
            'optimization_metric': optimize_for,
            'threshold_results': results_df
        }
    except Exception as e:
        logger.error(f"Error evaluating model {model_name}: {str(e)}")
        logger.error("Exception details:", exc_info=True)
        return {
            'model_name': model_name,
            'data_points': len(model_df) if model_df is not None else 0,
            'error': str(e)
        }

def evaluate_all_models(combined_df, optimization_metric='F2'):
    """
    Evaluate all models in the combined dataset

    Calls evaluate_model_thresholds once per distinct value of the 'model'
    column, prints the per-model metrics, and builds a summary table from the
    models that produced a best threshold.

    Parameters:
        combined_df: DataFrame containing model predictions and ground truth
            (needs 'model' and 'ground_truth_traceable' columns)
        optimization_metric: Metric to optimize thresholds for ('F1' or 'F2')

    Returns:
        tuple: (evaluation_results, best_thresholds_df)
            - evaluation_results: List of dictionaries with evaluation results
              (includes error-only entries)
            - best_thresholds_df: DataFrame with best thresholds for each
              model, sorted by the optimization metric in descending order.
              Empty when no model produced a best threshold.
    """
    # Check if we have the necessary data
    if 'ground_truth_traceable' not in combined_df.columns or 'model' not in combined_df.columns:
        logger.error("Cannot evaluate models: missing ground truth or model data")
        return [], pd.DataFrame()

    # Get list of all models
    all_models = combined_df['model'].unique()

    # Evaluate each model
    evaluation_results = []

    print(f"\nEvaluating {len(all_models)} sentence transformer models")
    print(f"Optimizing for {optimization_metric} score")
    print("=" * 80)

    for model in all_models:
        print(f"\nEvaluating model: {model}")
        result = evaluate_model_thresholds(combined_df, model, optimize_for=optimization_metric)

        # An empty dict means "nothing to evaluate" and is not recorded
        if not result:
            continue

        evaluation_results.append(result)

        if 'error' in result:
            print(f"  - Error: {result['error']}")

        if 'best_threshold' in result:
            print(f"  - Best threshold: {result['best_threshold']:.3f}")
            print(f"  - Confusion Matrix (TP, FP, FN, TN): {result['best_tp']}, {result['best_fp']}, {result['best_fn']}, {result['best_tn']}")
            print(f"  - Accuracy: {result['best_accuracy']:.3f}")
            print(f"  - Balanced Accuracy: {result['best_balanced_accuracy']:.3f}")
            print(f"  - Precision: {result['best_precision']:.3f}")
            print(f"  - Recall/TPR: {result['best_recall']:.3f}")
            print(f"  - Specificity/TNR: {result['best_tnr']:.3f}")
            print(f"  - Miss Rate/FNR: {result['best_fnr']:.3f}")
            print(f"  - F1: {result['best_f1']:.3f}")
            print(f"  - F2: {result['best_f2']:.3f}")
            print(f"  - Matthews Correlation Coefficient: {result['best_mcc']:.3f}")

    if not evaluation_results:
        return [], pd.DataFrame()

    # Summary column name -> key in the per-model result dict
    metric_columns = (
        ('best_threshold', 'best_threshold'),
        ('accuracy', 'best_accuracy'),
        ('balanced_accuracy', 'best_balanced_accuracy'),
        ('precision', 'best_precision'),
        ('recall', 'best_recall'),
        ('specificity', 'best_tnr'),
        ('miss_rate', 'best_fnr'),
        ('f1_score', 'best_f1'),
        ('f2_score', 'best_f2'),
        ('matthews_corr', 'best_mcc'),
        ('true_positives', 'best_tp'),
        ('false_positives', 'best_fp'),
        ('false_negatives', 'best_fn'),
        ('true_negatives', 'best_tn'),
    )

    # Only models with a usable threshold appear in the summary table
    summary_rows = []
    for r in evaluation_results:
        if 'best_threshold' not in r:
            continue
        row = {'model_name': r['model_name']}
        for column, key in metric_columns:
            row[column] = r.get(key, np.nan)
        row['data_points'] = r['data_points']
        row['ground_truth_positive'] = r.get('ground_truth_positive', 0)
        row['ground_truth_negative'] = r.get('ground_truth_negative', 0)
        summary_rows.append(row)

    # Every model failed: a frame built from no rows has no columns, so
    # sorting by the metric column would raise KeyError. Return the error
    # entries with an empty summary instead.
    if not summary_rows:
        logger.warning("No model produced a valid threshold; summary table is empty")
        return evaluation_results, pd.DataFrame()

    best_thresholds_df = pd.DataFrame(summary_rows)

    # Sort by the appropriate metric
    sort_col = 'f1_score' if optimization_metric == 'F1' else 'f2_score'
    best_thresholds_df = best_thresholds_df.sort_values(sort_col, ascending=False).reset_index(drop=True)

    print("\nBest Thresholds by Model:")
    print("-" * 80)
    print(best_thresholds_df.to_string())

    return evaluation_results, best_thresholds_df

# Evaluate every model, provided the merged dataset from Cell [4] exists and has rows
if 'combined_df' not in globals() or combined_df.empty:
    print("\nCombined dataset not available. Please run previous cells first.")
else:
    evaluation_results, best_thresholds_df = evaluate_all_models(
        combined_df,
        CONFIG['OPTIMIZATION_METRIC'],
    )

In [None]:
# Cell [6] - Query LLM Meta Judge Links
# Purpose: Retrieve LLM_RESULT_META_JUDGE links from Neo4j for comparison
# Dependencies: Neo4j driver from Cell [1], environment variables
# Breadcrumbs: Database Connection -> Meta Judge Data Retrieval -> LLM Results

def get_analysis_model_ids():
    """
    Resolve the model IDs to analyse through two levels of environment lookup

    RESULTS_ANALYSIS_MODEL_IDS holds a comma-separated list of *variable
    names*; each of those variables in turn holds one model ID. Names whose
    variable is unset or empty are skipped with a warning.

    Returns:
        list: Model IDs to filter by, in the order listed. Empty list when
            RESULTS_ANALYSIS_MODEL_IDS is unset or empty, or when none of
            the named variables has a value.
    """
    raw_names = os.getenv('RESULTS_ANALYSIS_MODEL_IDS', '')

    if not raw_names:
        logger.warning("RESULTS_ANALYSIS_MODEL_IDS not set, will retrieve all models")
        return []

    resolved_ids = []

    # Trim each entry and drop blanks left by stray commas
    for name in filter(None, map(str.strip, raw_names.split(','))):
        value = os.getenv(name)
        if not value:
            logger.warning(f"Environment variable {name} not found or empty")
            continue
        resolved_ids.append(value)
        logger.info(f"Added model ID from {name}: {value}")

    return resolved_ids

def query_meta_judge_links(driver, project_name, model_ids=None):
    """
    Fetch LLM_RESULT_META_JUDGE relationships of one project from Neo4j

    Only SOURCE -> TARGET links are considered, and only those whose source
    has an outgoing GROUND_TRUTH relationship and whose target has an
    incoming one.

    Parameters:
        driver: Neo4j driver connection
        project_name: Name of the project
        model_ids: List of model IDs to filter by (optional); a falsy value
            disables the model filter

    Returns:
        pd.DataFrame: One row per link with ids, is_traceable (converted to
            bool when it arrives as object dtype), the score columns
            (coerced to numeric, unparseable values become NaN) and model.
            An empty DataFrame is returned when nothing matches or when the
            query fails (the error is logged).
    """
    def _as_bool(value):
        # Missing -> False; otherwise True only for the text 'true' (any case)
        if pd.notna(value):
            return str(value).lower() == 'true'
        return False

    try:
        logger.info(f"Querying LLM_RESULT_META_JUDGE links for project: {project_name}")

        if model_ids:
            logger.info(f"Filtering for models: {model_ids}")

        # The statement is assembled from four fragments, in this order
        match_clause = """
        MATCH (p:Project {name: $project_name})-[:CONTAINS]->(d:Document)-[:CONTAINS]->(source:Requirement)-[r:LLM_RESULT_META_JUDGE]->(target:Requirement)
        WHERE source.type = 'SOURCE' and target.type = 'TARGET'
        AND source.project = $project_name AND target.project = $project_name
        """

        # Optional restriction to the requested models
        model_clause = " AND r.model IN $model_ids" if model_ids else ""

        ground_truth_clause = """
        AND EXISTS { (source)-[:GROUND_TRUTH]->() }
        AND EXISTS { ()-[:GROUND_TRUTH]->(target) }
        """

        projection = """
        RETURN 
            p.name as project_name,
            source.id as source_id,
            target.id as target_id,
            r.is_traceable as is_traceable,
            r.judge_score as judge_score,
            r.semantic_alignment as semantic_alignment,
            r.non_functional_coverage as non_functional_coverage,
            r.final_score as final_score,
            r.actor_score as actor_score,
            r.functional_completeness as functional_completeness,
            r.model as model
        ORDER BY source.id, target.id
        """

        cypher = "".join([match_clause, model_clause, ground_truth_clause, projection])

        # $model_ids is only bound when the filter is part of the statement
        params = {'project_name': project_name}
        if model_ids:
            params['model_ids'] = model_ids

        with driver.session() as session:
            records = session.run(cypher, **params).data()

            if not records:
                logger.warning(f"No meta judge links found for project: {project_name}")
                return pd.DataFrame()

            logger.info(f"Retrieved {len(records)} meta judge links")
            links_df = pd.DataFrame(records)

            # Normalise is_traceable when it did not arrive as a real bool column
            if 'is_traceable' in links_df.columns and links_df['is_traceable'].dtype == 'object':
                links_df['is_traceable'] = links_df['is_traceable'].map(_as_bool)

            # Score columns are coerced to numeric
            score_fields = (
                'judge_score', 'semantic_alignment', 'non_functional_coverage',
                'final_score', 'actor_score', 'functional_completeness',
            )
            for field in score_fields:
                if field not in links_df.columns:
                    continue
                links_df[field] = pd.to_numeric(links_df[field], errors='coerce')

            return links_df

    except Exception as exc:
        logger.error(f"Error querying meta judge links: {str(exc)}")
        return pd.DataFrame()

# Get model IDs from environment variables
analysis_model_ids = get_analysis_model_ids()

# Query meta judge links with optional model filtering
meta_judge_df = query_meta_judge_links(driver, CONFIG['NEO4J_PROJECT_NAME'], analysis_model_ids)

# Display information about the retrieved data
if not meta_judge_df.empty:
    print("\nMeta Judge Links Summary:")
    print(f"Total meta judge links: {len(meta_judge_df)}")
    print(f"Unique models: {meta_judge_df['model'].nunique()}")
    # str() keeps the join from failing when a link has no (or a non-text) model value
    print(f"Models found: {', '.join(str(m) for m in meta_judge_df['model'].unique())}")

    if analysis_model_ids:
        print(f"\nFiltered to models from RESULTS_ANALYSIS_MODEL_IDS:")
        for model_id in analysis_model_ids:
            count = len(meta_judge_df[meta_judge_df['model'] == model_id])
            print(f"  - {model_id}: {count} links")

    # Display score statistics
    score_columns = ['judge_score', 'semantic_alignment', 'non_functional_coverage',
                     'final_score', 'actor_score', 'functional_completeness']

    print("\nScore Statistics:")
    for col in score_columns:
        if col in meta_judge_df.columns:
            # Deliberately NOT named `stats`: this is notebook-level code, and that
            # name is the scipy.stats alias imported in Cell [0]. Rebinding it here
            # would break any later cell that uses `stats.<function>`.
            col_summary = meta_judge_df[col].describe()
            print(f"\n{col}:")
            print(f"  Mean: {col_summary['mean']:.3f}, Std: {col_summary['std']:.3f}")
            print(f"  Range: [{col_summary['min']:.3f}, {col_summary['max']:.3f}]")
else:
    print("\nNo meta judge links found.")
    if analysis_model_ids:
        print(f"Attempted to filter for models: {analysis_model_ids}")

In [None]:
# Cell [7] - Create Combined Meta Judge Dataset
# Purpose: Combine meta judge data with ground truth for analysis
# Dependencies: meta_judge_df from Cell [6], df_ground_truth from Cell [3]
# Breadcrumbs: Meta Judge Data -> Data Merge -> Combined Meta Judge Dataset

def create_meta_judge_combined_dataset(meta_judge_df, df_ground_truth):
    """
    Create a combined dataset with meta judge scores and ground truth labels
    
    The meta judge rows are restricted to source/target IDs that occur in the
    ground truth, labelled with a boolean ``ground_truth_traceable`` column, and
    extended with derived score columns (``total_score``,
    ``total_score_with_final``, ``total_score_all``, ``total_combined_score``,
    ``is_traceable_numeric``) whenever their input columns are present.
    
    Parameters:
        meta_judge_df: DataFrame with meta judge data; must contain
            'source_id' and 'target_id' when ground truth is supplied
        df_ground_truth: DataFrame with ground truth links
            ('source_id', 'target_id' columns)
    
    Returns:
        pd.DataFrame: Combined dataset (a copy; the inputs are not modified).
            Empty DataFrame when meta_judge_df is empty.
    """
    if meta_judge_df.empty:
        logger.error("No meta judge data available")
        return pd.DataFrame()
    
    # Start with meta judge data
    combined_meta_df = meta_judge_df.copy()
    
    # Filter to only include valid source and target requirements from ground truth
    if not df_ground_truth.empty:
        # Extract unique source and target IDs with ground truth links
        valid_source_ids = df_ground_truth['source_id'].unique()
        valid_target_ids = df_ground_truth['target_id'].unique()
        
        # Keep only rows whose source AND target both appear in the ground truth
        in_scope = (
            combined_meta_df['source_id'].isin(valid_source_ids)
            & combined_meta_df['target_id'].isin(valid_target_ids)
        )
        combined_meta_df = combined_meta_df[in_scope].copy()
        
        logger.info(f"Filtered meta judge data from {len(meta_judge_df)} to {len(combined_meta_df)} rows")
        
        # Create set of ground truth pairs
        ground_truth_pairs = set(zip(df_ground_truth['source_id'], df_ground_truth['target_id']))
        
        # Add ground truth column. A plain membership pass over the ID pairs is
        # used instead of DataFrame.apply(axis=1): apply returns a DataFrame
        # (not a Series) when the filter above leaves zero rows, which breaks
        # the column assignment, and it is much slower on large frames.
        combined_meta_df['ground_truth_traceable'] = pd.Series(
            [
                pair in ground_truth_pairs
                for pair in zip(combined_meta_df['source_id'], combined_meta_df['target_id'])
            ],
            index=combined_meta_df.index,
            dtype=bool,
        )
        
        logger.info(f"Ground truth distribution: {combined_meta_df['ground_truth_traceable'].value_counts().to_dict()}")
    else:
        logger.warning("No ground truth data available")
        combined_meta_df['ground_truth_traceable'] = False
    
    # Add derived total score columns for meta judge
    if 'judge_score' in combined_meta_df.columns and 'actor_score' in combined_meta_df.columns:
        # Original total_score: judge_score + actor_score
        combined_meta_df['total_score'] = combined_meta_df['judge_score'] + combined_meta_df['actor_score']
        
        # Alternative total scores
        if 'final_score' in combined_meta_df.columns:
            # Alternative 1: actor_score + final_score
            combined_meta_df['total_score_with_final'] = combined_meta_df['actor_score'] + combined_meta_df['final_score']
            
            # Alternative 2: actor_score + judge_score + final_score
            combined_meta_df['total_score_all'] = (combined_meta_df['actor_score'] + 
                                                   combined_meta_df['judge_score'] + 
                                                   combined_meta_df['final_score'])
    
    # Add total combined score with all available metrics
    candidate_metrics = ['judge_score', 'semantic_alignment', 'non_functional_coverage', 
                         'final_score', 'actor_score', 'functional_completeness']
    available_metrics = [metric for metric in candidate_metrics if metric in combined_meta_df.columns]
    
    if available_metrics:
        # Row-wise sum over whichever metric columns exist
        combined_meta_df['total_combined_score'] = combined_meta_df[available_metrics].sum(axis=1)
        logger.info(f"Created total_combined_score from {len(available_metrics)} metrics")
    
    # Convert is_traceable to numeric for threshold evaluation
    if 'is_traceable' in combined_meta_df.columns:
        combined_meta_df['is_traceable_numeric'] = combined_meta_df['is_traceable'].astype(int)
    
    return combined_meta_df

# Create combined meta judge dataset
# (combined_meta_df is a notebook-level name; the Cell [8] evaluation call and
# the Cell [9] comparison both read it.)
combined_meta_df = create_meta_judge_combined_dataset(meta_judge_df, df_ground_truth)

# Display info
if not combined_meta_df.empty:
    print(f"\nCombined Meta Judge Dataset Summary:")
    print(f"Total records: {len(combined_meta_df)}")
    # ground_truth_traceable is summed directly for the positive count and
    # negated with ~ for the negative count.
    print(f"Ground truth positive: {combined_meta_df['ground_truth_traceable'].sum()}")
    print(f"Ground truth negative: {(~combined_meta_df['ground_truth_traceable']).sum()}")
    
    # Show available score columns
    # (only the candidates that are actually present in the combined frame)
    score_cols = ['is_traceable', 'judge_score', 'actor_score', 'final_score', 
                  'total_score', 'total_score_with_final', 'total_score_all', 'total_combined_score']
    available_scores = [col for col in score_cols if col in combined_meta_df.columns]
    print(f"Available score columns: {', '.join(available_scores)}")

In [None]:
# Cell [8] - Evaluate Meta Judge Models with Multiple Score Columns
# Purpose: Evaluate meta judge performance using different scoring methods
# Dependencies: combined_meta_df from Cell [7], evaluation functions from Cell [5]
# Breadcrumbs: Combined Meta Judge Dataset -> Model Evaluation -> Multiple Scoring Methods

def evaluate_meta_judge_models(combined_meta_df, optimization_metric='F2'):
    """
    Evaluate meta judge models using different scoring columns
    
    For every model in ``combined_meta_df['model']`` and every candidate score
    column that exists and is not entirely NaN, ``evaluate_model_thresholds``
    is called and its result is kept when it reports a ``best_threshold``.
    Progress and a summary table are printed.
    
    Parameters:
        combined_meta_df: DataFrame with meta judge predictions and ground truth
            (needs 'model' and 'ground_truth_traceable' columns)
        optimization_metric: Metric to optimize ('F1' or 'F2'); also selects
            the summary sort column (``f1_score`` / ``f2_score``)
    
    Returns:
        tuple: (evaluation_results, best_thresholds_df) - the list of raw result
            dicts and a summary DataFrame sorted by the optimization metric.
            ``([], empty DataFrame)`` when nothing could be evaluated.
    """
    if combined_meta_df.empty or 'ground_truth_traceable' not in combined_meta_df.columns:
        logger.error("Cannot evaluate: missing data or ground truth")
        return [], pd.DataFrame()
    
    # Get unique models
    models = combined_meta_df['model'].unique()
    
    # Define score columns to evaluate
    candidate_score_columns = [
        'is_traceable_numeric',  # Boolean indicator converted to numeric
        'actor_score',          # Individual score
        'judge_score',          # Individual score
        'final_score',          # Individual score
        'total_score',          # judge_score + actor_score
        'total_score_with_final',  # actor_score + final_score
        'total_score_all',      # actor_score + judge_score + final_score
        'total_combined_score'  # sum of all available metrics
    ]
    
    # Filter to only columns that exist and carry at least one non-NaN value
    score_columns_to_evaluate = [
        col for col in candidate_score_columns 
        if col in combined_meta_df.columns and not combined_meta_df[col].isna().all()
    ]
    
    print(f"\nEvaluating {len(models)} Meta Judge models")
    print(f"Score columns to evaluate: {len(score_columns_to_evaluate)}")
    print(f"Optimizing for {optimization_metric} score")
    print("=" * 80)
    
    all_evaluation_results = []
    
    for model in models:
        print(f"\nEvaluating model: {model}")
        model_df = combined_meta_df[combined_meta_df['model'] == model]
        
        for score_column in score_columns_to_evaluate:
            # Use the same evaluation function from sentence transformers
            result = evaluate_model_thresholds(
                model_df, 
                model, 
                score_column=score_column,
                ground_truth_column='ground_truth_traceable',
                optimize_for=optimization_metric
            )
            
            if result and 'best_threshold' in result:
                # Add score column info
                result['score_column'] = score_column
                result['method'] = 'meta_judge'
                all_evaluation_results.append(result)
                
                print(f"\n  Score column: {score_column}")
                print(f"    Best threshold: {result['best_threshold']:.3f}")
                print(f"    {optimization_metric}: {result[f'best_{optimization_metric.lower()}']:.3f}")
                print(f"    Precision: {result['best_precision']:.3f}, Recall: {result['best_recall']:.3f}")
                print(f"    TP={result['best_tp']}, FP={result['best_fp']}, FN={result['best_fn']}, TN={result['best_tn']}")
    
    if not all_evaluation_results:
        return [], pd.DataFrame()
    
    # Create summary DataFrame with ALL columns to match sentence transformer output
    meta_judge_results_df = pd.DataFrame([{
        'model_name': r['model_name'],
        'score_column': r['score_column'],
        'method': 'meta_judge',
        'best_threshold': r['best_threshold'],
        'accuracy': r['best_accuracy'],
        'balanced_accuracy': r['best_balanced_accuracy'],
        'precision': r['best_precision'],
        'recall': r['best_recall'],
        'specificity': r['best_tnr'],
        'miss_rate': r['best_fnr'],
        'f1_score': r['best_f1'],
        'f2_score': r['best_f2'],
        'matthews_corr': r['best_mcc'],
        'true_positives': r['best_tp'],
        'false_positives': r['best_fp'],
        'false_negatives': r['best_fn'],
        'true_negatives': r['best_tn'],
        'data_points': r['data_points'],
        'ground_truth_positive': r['ground_truth_positive'],
        'ground_truth_negative': r['ground_truth_negative']
    } for r in all_evaluation_results])
    
    # Sort by optimization metric
    sort_col = f'{optimization_metric.lower()}_score'
    meta_judge_results_df = meta_judge_results_df.sort_values(sort_col, ascending=False).reset_index(drop=True)
    
    # Display results with all columns to match sentence transformer format.
    # The header reports the metric this call actually sorted by (the
    # `optimization_metric` argument), not the global CONFIG value.
    print(f"\n\nMeta Judge Best Results (sorted by {optimization_metric}):")
    print("-" * 80)
    
    # Display columns in same order as sentence transformer ('method' is
    # omitted from the printout; every listed column is built above).
    display_cols = ['model_name', 'score_column', 'best_threshold', 'accuracy', 
                   'balanced_accuracy', 'precision', 'recall', 'specificity', 
                   'miss_rate', 'f1_score', 'f2_score', 'matthews_corr', 
                   'true_positives', 'false_positives', 'false_negatives', 
                   'true_negatives', 'data_points', 'ground_truth_positive', 
                   'ground_truth_negative']
    
    print(meta_judge_results_df[display_cols].to_string())
    
    return all_evaluation_results, meta_judge_results_df

# Evaluate meta judge models
# Both results are kept as notebook-level names: the raw per-column result
# dicts and the summary table (meta_judge_best_df) that Cell [9] compares
# against the sentence transformer results.
meta_judge_evaluation_results, meta_judge_best_df = evaluate_meta_judge_models(
    combined_meta_df, 
    CONFIG['OPTIMIZATION_METRIC']
)

In [None]:
# Cell [9] - Compare Sentence Transformers vs Meta Judge with 30% Improvement Hypothesis Testing
# Purpose: Test the core hypothesis that hallucination-reducing techniques achieve ≥30% improvement in NPD accuracy
# Dependencies: best_thresholds_df from Cell [5], meta_judge_best_df from Cell [8], combined_df from Cell [4], combined_meta_df from Cell [7]
# Breadcrumbs: Model Evaluation -> Hypothesis Testing -> 30% Improvement Validation

def calculate_improvement_metrics(baseline_metrics, enhanced_metrics):
    """
    Quantify how the enhanced method differs from the baseline
    
    For each tracked quality metric present in both inputs with a strictly
    positive baseline, four entries are produced: ``<metric>_improvement_pct``,
    ``<metric>_baseline``, ``<metric>_enhanced`` and ``<metric>_absolute_diff``.
    When both inputs carry 'false_positives' and 'false_negatives', error
    reduction entries are added as well.
    
    Parameters:
        baseline_metrics: Dictionary with baseline performance metrics
        enhanced_metrics: Dictionary with enhanced method performance metrics
    
    Returns:
        dict: Improvement analysis results
    """
    tracked_metrics = ('accuracy', 'balanced_accuracy', 'precision', 'recall',
                       'f1_score', 'f2_score', 'matthews_corr')
    error_counts = ('false_positives', 'false_negatives')
    
    report = {}
    
    # Relative and absolute change per quality metric
    for name in tracked_metrics:
        if name not in baseline_metrics or name not in enhanced_metrics:
            continue
        
        before = baseline_metrics[name]
        after = enhanced_metrics[name]
        
        # A percentage change is only reported against a positive baseline
        if not before > 0:
            continue
        
        report[f'{name}_improvement_pct'] = ((after - before) / before) * 100
        report[f'{name}_baseline'] = before
        report[f'{name}_enhanced'] = after
        report[f'{name}_absolute_diff'] = after - before
    
    # Error-count reductions need both counts on both sides
    if not all(key in baseline_metrics for key in error_counts):
        return report
    if not all(key in enhanced_metrics for key in error_counts):
        return report
    
    def _reduction(count_key):
        # Returns (absolute reduction, reduction as % of the baseline count);
        # the percentage is 0 when the baseline count is not positive.
        start = baseline_metrics[count_key]
        end = enhanced_metrics[count_key]
        dropped = start - end
        share = (dropped / start * 100) if start > 0 else 0
        return dropped, share
    
    # False positives: links predicted that are not in the ground truth
    fp_dropped, fp_share = _reduction('false_positives')
    # False negatives: ground truth links that were missed
    fn_dropped, fn_share = _reduction('false_negatives')
    
    report['fp_reduction'] = fp_dropped
    report['fp_reduction_pct'] = fp_share
    report['fn_reduction'] = fn_dropped
    report['fn_reduction_pct'] = fn_share
    report['total_error_reduction'] = fp_dropped + fn_dropped
    # Unweighted mean of the two percentage reductions
    report['hallucination_reduction_score'] = (fp_share + fn_share) / 2
    
    return report

def test_30_percent_hypothesis(improvements, primary_metric='f2_score'):
    """
    Test the hypothesis that improvement is ≥30% for the primary metric
    
    Parameters:
        improvements: Dictionary with improvement calculations
        primary_metric: Primary metric to test (default: f2_score)
    
    Returns:
        dict: Hypothesis test results
    """
    improvement_key = f'{primary_metric}_improvement_pct'
    
    if improvement_key not in improvements:
        return {'error': f'Primary metric {primary_metric} not available for testing'}
    
    observed_improvement = improvements[improvement_key]
    
    # H0: improvement ≤ 30%
    # H1: improvement > 30% 
    null_hypothesis_threshold = 30.0
    
    # Simple threshold test
    meets_threshold = observed_improvement >= null_hypothesis_threshold
    
    # Calculate effect size (Cohen's d equivalent for percentage improvement)
    effect_size = (observed_improvement - null_hypothesis_threshold) / 10  # Normalize by 10% as standard unit
    
    return {
        'observed_improvement': observed_improvement,
        'null_threshold': null_hypothesis_threshold,
        'meets_30_percent_threshold': meets_threshold,
        'improvement_above_threshold': observed_improvement - null_hypothesis_threshold,
        'effect_size': effect_size,
        'practical_significance': 'Large' if abs(effect_size) > 0.8 else 'Medium' if abs(effect_size) > 0.5 else 'Small'
    }

def bootstrap_improvement_confidence_interval(baseline_df, enhanced_df, combined_df, combined_meta_df, 
                                            metric_func, n_bootstraps=1000, alpha=0.05,
                                            random_state=None):
    """
    Calculate bootstrap confidence intervals for improvement percentages
    
    Both methods are thresholded at their best threshold, restricted to the
    (source_id, target_id) pairs they have in common, and the percentage
    improvement of the enhanced metric over the baseline metric is recomputed
    on resamples (with replacement) of those pairs.
    
    Parameters:
        baseline_df: Best baseline configuration; read by key for
            'model_name' and 'best_threshold'
        enhanced_df: Best enhanced configuration; read by key for
            'model_name', 'score_column' and 'best_threshold'
        combined_df: Combined sentence transformer data ('model', 'source_id',
            'target_id', 'similarity_score', 'ground_truth_traceable')
        combined_meta_df: Combined meta judge data ('model', 'source_id',
            'target_id' and the enhanced score column)
        metric_func: Function to calculate metric; called as
            metric_func(y_true, y_pred, beta=2, zero_division=0)
        n_bootstraps: Number of bootstrap samples
        alpha: Significance level (0.05 gives a 95% percentile interval)
        random_state: Optional integer seed for reproducible resampling. When
            None, indices are drawn from NumPy's global random state.
    
    Returns:
        dict: Bootstrap confidence interval results, or {'error': ...} when the
            two methods share no test instances or when the baseline metric is
            zero in every resample (percentage improvement undefined)
    """
    # Get prediction data for both methods
    baseline_model = baseline_df['model_name']
    baseline_threshold = baseline_df['best_threshold']
    
    enhanced_model = enhanced_df['model_name']
    enhanced_score_col = enhanced_df['score_column']
    enhanced_threshold = enhanced_df['best_threshold']
    
    # Get predictions: a pair is predicted traceable when its score reaches the threshold
    baseline_data = combined_df[combined_df['model'] == baseline_model].copy()
    baseline_data['predicted'] = (baseline_data['similarity_score'] >= baseline_threshold).astype(int)
    
    enhanced_data = combined_meta_df[combined_meta_df['model'] == enhanced_model].copy()
    enhanced_data['predicted'] = (enhanced_data[enhanced_score_col] >= enhanced_threshold).astype(int)
    
    # Ensure we have the same test instances by merging on source_id and target_id
    # (inner join; the ground truth label is taken from the baseline side)
    merged = pd.merge(
        baseline_data[['source_id', 'target_id', 'predicted', 'ground_truth_traceable']],
        enhanced_data[['source_id', 'target_id', 'predicted']],
        on=['source_id', 'target_id'],
        suffixes=('_baseline', '_enhanced')
    )
    
    if len(merged) == 0:
        return {'error': 'No common test instances found between methods'}
    
    y_true = merged['ground_truth_traceable'].astype(int).values
    y_pred_baseline = merged['predicted_baseline'].values
    y_pred_enhanced = merged['predicted_enhanced'].values
    n_instances = len(y_true)
    
    # Seeded runs get their own generator; unseeded runs use the global state
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    
    # Bootstrap sampling
    baseline_scores = []
    enhanced_scores = []
    improvements = []
    
    for _ in range(n_bootstraps):
        # Resample instance indices with replacement
        indices = rng.randint(0, n_instances, size=n_instances)
        
        # Calculate metrics for resampled data
        baseline_score = metric_func(y_true[indices], y_pred_baseline[indices], beta=2, zero_division=0)
        enhanced_score = metric_func(y_true[indices], y_pred_enhanced[indices], beta=2, zero_division=0)
        
        baseline_scores.append(baseline_score)
        enhanced_scores.append(enhanced_score)
        
        # Calculate percentage improvement (undefined, hence skipped, for a zero baseline)
        if baseline_score > 0:
            improvement_pct = ((enhanced_score - baseline_score) / baseline_score) * 100
            improvements.append(improvement_pct)
    
    # Without a single usable resample there is nothing to take percentiles of;
    # report that explicitly instead of failing or returning NaN bounds.
    if not improvements:
        return {'error': 'Baseline metric was zero in every bootstrap sample; '
                         'percentage improvement is undefined'}
    
    # Calculate confidence intervals
    lower_percentile = (alpha / 2) * 100
    upper_percentile = (1 - alpha / 2) * 100
    
    improvement_ci = np.percentile(improvements, [lower_percentile, upper_percentile])
    
    # Test if 30% threshold is in confidence interval
    threshold_in_ci = improvement_ci[0] <= 30.0 <= improvement_ci[1]
    exceeds_30_percent = improvement_ci[0] > 30.0
    
    return {
        'mean_improvement': np.mean(improvements),
        'improvement_ci_lower': improvement_ci[0],
        'improvement_ci_upper': improvement_ci[1],
        'baseline_mean': np.mean(baseline_scores),
        'enhanced_mean': np.mean(enhanced_scores),
        'common_instances': len(merged),
        'exceeds_30_percent_threshold': exceeds_30_percent,
        'threshold_in_ci': threshold_in_ci,
        # Number of resamples that contributed (those with a non-zero baseline)
        'bootstrap_samples': len(improvements)
    }

def compare_approaches_with_hypothesis_testing(sentence_transformer_df, meta_judge_df, 
                                             combined_df, combined_meta_df, optimization_metric='F2'):
    """
    Compare approaches with specific focus on 30% improvement hypothesis testing
    
    The two result tables are stacked and sorted by the optimization metric,
    the top row of each method is taken as its best configuration, and the
    pair is passed through the improvement calculation, the 30% threshold
    check and the bootstrap confidence interval.
    
    Parameters:
        sentence_transformer_df: Results from sentence transformer evaluation
            (tagged as method 'sentence_transformer' if it has no 'method' column)
        meta_judge_df: Results from meta judge evaluation
        combined_df: Combined sentence transformer data
        combined_meta_df: Combined meta judge data
        optimization_metric: Metric used for optimization ('F1' or 'F2')
    
    Returns:
        dict: Complete comparison and hypothesis testing results with keys
            'comparison_df', 'best_baseline', 'best_enhanced', 'improvements',
            'hypothesis_test' and 'bootstrap_ci'
    
    Raises:
        IndexError: if either method has no rows in the combined results
    """
    # Add method column if not present (on a copy, so the caller's frame is untouched)
    if 'method' not in sentence_transformer_df.columns:
        sentence_transformer_df = sentence_transformer_df.copy()
        sentence_transformer_df['method'] = 'sentence_transformer'
        sentence_transformer_df['score_column'] = 'similarity_score'
    
    # Combine results
    all_results = pd.concat([sentence_transformer_df, meta_judge_df], ignore_index=True)
    
    # Sort by optimization metric
    metric_col = f'{optimization_metric.lower()}_score'
    all_results = all_results.sort_values(metric_col, ascending=False).reset_index(drop=True)
    
    # Get best from each method (first row per method after the descending sort)
    best_baseline = all_results[all_results['method'] == 'sentence_transformer'].iloc[0]
    best_enhanced = all_results[all_results['method'] == 'meta_judge'].iloc[0]
    
    # Calculate improvement metrics
    improvements = calculate_improvement_metrics(best_baseline.to_dict(), best_enhanced.to_dict())
    
    # Test 30% hypothesis on the same metric the results were ranked by
    hypothesis_results = test_30_percent_hypothesis(improvements, metric_col)
    
    # The bootstrap helper always calls its metric with beta=2. Pin beta to the
    # optimization metric here so the confidence interval describes the same
    # F-score as the point estimate tested above (F1 -> beta=1, otherwise 2).
    primary_beta = 1.0 if optimization_metric.upper() == 'F1' else 2.0
    
    def _primary_fbeta(y_true, y_pred, **kwargs):
        kwargs['beta'] = primary_beta
        return fbeta_score(y_true, y_pred, **kwargs)
    
    # Bootstrap confidence intervals
    print("Calculating bootstrap confidence intervals...")
    bootstrap_results = bootstrap_improvement_confidence_interval(
        best_baseline, best_enhanced, combined_df, combined_meta_df, _primary_fbeta
    )
    
    return {
        'comparison_df': all_results,
        'best_baseline': best_baseline,
        'best_enhanced': best_enhanced,
        'improvements': improvements,
        'hypothesis_test': hypothesis_results,
        'bootstrap_ci': bootstrap_results
    }

# Create comparison with hypothesis testing if both results are available.
# All four inputs are checked for existence first so that running this cell
# before Cells 4, 5, 7 or 8 prints the guidance below instead of raising
# NameError.
if (all(name in globals() for name in ('best_thresholds_df', 'meta_judge_best_df',
                                        'combined_df', 'combined_meta_df'))
        and not best_thresholds_df.empty and not meta_judge_best_df.empty):
    
    print("HALLUCINATION REDUCTION VALIDATION: 30% IMPROVEMENT HYPOTHESIS TESTING")
    print("=" * 100)
    print("CORE HYPOTHESIS: Techniques reducing LLM hallucinations by 30% can improve NPD accuracy")
    print("OPERATIONAL HYPOTHESIS: Multi-stage LLM refinement achieves ≥30% improvement vs baseline")
    print("=" * 100)
    
    # Perform comprehensive comparison with hypothesis testing
    analysis_results = compare_approaches_with_hypothesis_testing(
        best_thresholds_df, meta_judge_best_df, combined_df, combined_meta_df, CONFIG['OPTIMIZATION_METRIC']
    )
    
    comparison_df = analysis_results['comparison_df']
    best_baseline = analysis_results['best_baseline']
    best_enhanced = analysis_results['best_enhanced']
    improvements = analysis_results['improvements']
    hypothesis_test = analysis_results['hypothesis_test']
    bootstrap_ci = analysis_results['bootstrap_ci']
    
    print(f"\n1. BASELINE vs ENHANCED APPROACH COMPARISON")
    print("-" * 80)
    print(f"BASELINE (Sentence Transformer): {best_baseline['model_name']}")
    print(f"  Threshold: {best_baseline.get('best_threshold', 'N/A')}")
    
    # Name of the results column that holds the optimization metric (e.g. 'f2_score')
    optimization_metric_lower = CONFIG['OPTIMIZATION_METRIC'].lower()
    score_column = f'{optimization_metric_lower}_score'
    print(f"  {CONFIG['OPTIMIZATION_METRIC']} Score: {best_baseline[score_column]:.4f}")
    print(f"  Precision: {best_baseline['precision']:.4f}, Recall: {best_baseline['recall']:.4f}")
    print(f"  False Positives: {best_baseline['false_positives']}, False Negatives: {best_baseline['false_negatives']}")
    
    print(f"\nENHANCED (Meta Judge): {best_enhanced['model_name']} ({best_enhanced['score_column']})")
    print(f"  Threshold: {best_enhanced.get('best_threshold', 'N/A')}")
    print(f"  {CONFIG['OPTIMIZATION_METRIC']} Score: {best_enhanced[score_column]:.4f}")
    print(f"  Precision: {best_enhanced['precision']:.4f}, Recall: {best_enhanced['recall']:.4f}")
    print(f"  False Positives: {best_enhanced['false_positives']}, False Negatives: {best_enhanced['false_negatives']}")
    
    print(f"\n2. ACCURACY IMPROVEMENT ANALYSIS")
    print("-" * 80)
    
    # Display improvements for all metrics
    metrics_order = ['f2_score', 'f1_score', 'precision', 'recall', 'accuracy', 'balanced_accuracy', 'matthews_corr']
    
    for metric in metrics_order:
        improvement_key = f'{metric}_improvement_pct'
        if improvement_key in improvements:
            baseline_val = improvements[f'{metric}_baseline']
            enhanced_val = improvements[f'{metric}_enhanced']
            improvement_pct = improvements[improvement_key]
            
            print(f"{metric.upper().replace('_', ' ')}:")
            print(f"  Baseline: {baseline_val:.4f} → Enhanced: {enhanced_val:.4f}")
            print(f"  Improvement: {improvement_pct:+.1f}% {'✓' if improvement_pct >= 30 else '✗'}")
    
    print(f"\n3. HALLUCINATION REDUCTION METRICS")
    print("-" * 80)
    
    if 'fp_reduction' in improvements:
        print(f"FALSE POSITIVE REDUCTION (Over-identification Hallucinations):")
        print(f"  Baseline FP: {best_baseline['false_positives']} → Enhanced FP: {best_enhanced['false_positives']}")
        print(f"  Reduction: {improvements['fp_reduction']} ({improvements['fp_reduction_pct']:+.1f}%)")
        
        print(f"\nFALSE NEGATIVE REDUCTION (Under-identification Hallucinations):")
        print(f"  Baseline FN: {best_baseline['false_negatives']} → Enhanced FN: {best_enhanced['false_negatives']}")
        print(f"  Reduction: {improvements['fn_reduction']} ({improvements['fn_reduction_pct']:+.1f}%)")
        
        print(f"\nTOTAL ERROR REDUCTION:")
        print(f"  Total errors reduced: {improvements['total_error_reduction']}")
        print(f"  Hallucination reduction score: {improvements['hallucination_reduction_score']:.1f}%")
    
    print(f"\n4. 30% IMPROVEMENT HYPOTHESIS TEST")
    print("-" * 80)
    
    if 'error' not in hypothesis_test:
        primary_metric_display = f"{CONFIG['OPTIMIZATION_METRIC']}_SCORE"
        observed_improvement = hypothesis_test['observed_improvement']
        
        print(f"PRIMARY METRIC: {primary_metric_display}")
        print(f"H₀: Improvement ≤ 30%")
        print(f"H₁: Improvement > 30%")
        print(f"")
        print(f"OBSERVED IMPROVEMENT: {observed_improvement:.2f}%")
        print(f"THRESHOLD: {hypothesis_test['null_threshold']:.1f}%")
        print(f"EXCEEDS THRESHOLD: {'YES' if hypothesis_test['meets_30_percent_threshold'] else 'NO'}")
        print(f"MARGIN: {hypothesis_test['improvement_above_threshold']:+.2f}%")
        print(f"EFFECT SIZE: {hypothesis_test['effect_size']:.3f} ({hypothesis_test['practical_significance']})")
        
        if hypothesis_test['meets_30_percent_threshold']:
            print(f"")
            print(f"🎯 HYPOTHESIS VALIDATED: The enhancement achieves ≥30% improvement!")
            print(f"📈 NPD IMPACT: Significant accuracy improvement confirmed for requirements traceability")
        else:
            print(f"")
            print(f"❌ HYPOTHESIS NOT MET: Improvement below 30% threshold")
            # Only claim an improvement when the observed change is positive
            if observed_improvement > 0:
                print(f"📊 NPD IMPACT: Enhancement shows improvement but below significance threshold")
            else:
                print("📊 NPD IMPACT: Enhancement shows no improvement over the baseline")
    else:
        # Mirror the bootstrap section: surface the reason instead of printing nothing
        print(f"Hypothesis test error: {hypothesis_test['error']}")
    
    print(f"\n5. BOOTSTRAP CONFIDENCE INTERVALS (95%)")
    print("-" * 80)
    
    if 'error' not in bootstrap_ci:
        print(f"BOOTSTRAP ANALYSIS:")
        print(f"  Common test instances: {bootstrap_ci['common_instances']}")
        print(f"  Bootstrap samples: {bootstrap_ci['bootstrap_samples']}")
        print(f"")
        print(f"IMPROVEMENT CONFIDENCE INTERVAL:")
        print(f"  Mean improvement: {bootstrap_ci['mean_improvement']:.2f}%")
        print(f"  95% CI: [{bootstrap_ci['improvement_ci_lower']:.2f}%, {bootstrap_ci['improvement_ci_upper']:.2f}%]")
        print(f"")
        print(f"STATISTICAL SIGNIFICANCE:")
        print(f"  CI exceeds 30% threshold: {'YES' if bootstrap_ci['exceeds_30_percent_threshold'] else 'NO'}")
        print(f"  30% within CI: {'YES' if bootstrap_ci['threshold_in_ci'] else 'NO'}")
        
        if bootstrap_ci['exceeds_30_percent_threshold']:
            print(f"  ✅ STATISTICALLY SIGNIFICANT: Lower bound > 30%")
        elif not bootstrap_ci['threshold_in_ci']:
            print(f"  ❌ NOT SIGNIFICANT: 30% threshold outside confidence interval")
        else:
            print(f"  ⚠️  INCONCLUSIVE: 30% threshold within confidence interval")
    else:
        print(f"Bootstrap analysis error: {bootstrap_ci['error']}")
    
    print(f"\n6. NPD REQUIREMENTS TRACEABILITY IMPACT ASSESSMENT")
    print("-" * 80)
    
    # Calculate business impact metrics
    total_requirements = best_baseline['data_points']
    accuracy_gain = improvements.get('accuracy_improvement_pct', 0)
    precision_gain = improvements.get('precision_improvement_pct', 0)
    recall_gain = improvements.get('recall_improvement_pct', 0)
    
    print(f"OPERATIONAL METRICS:")
    print(f"  Total requirement pairs analyzed: {total_requirements}")
    print(f"  Accuracy improvement: {accuracy_gain:+.1f}%")
    print(f"  Precision improvement: {precision_gain:+.1f}% (reduced false connections)")
    print(f"  Recall improvement: {recall_gain:+.1f}% (fewer missed connections)")
    
    if 'fp_reduction' in improvements and 'fn_reduction' in improvements:
        print(f"")
        print(f"QUALITY ASSURANCE IMPACT:")
        print(f"  Reduced over-identification errors: {improvements['fp_reduction']} ({improvements['fp_reduction_pct']:+.1f}%)")
        print(f"  Reduced under-identification errors: {improvements['fn_reduction']} ({improvements['fn_reduction_pct']:+.1f}%)")
        print(f"  Total error reduction: {improvements['total_error_reduction']} cases")
    
    print(f"\n7. VALIDATION SUMMARY")
    print("-" * 80)
    
    # Final validation summary (.get: either result may be an {'error': ...} dict)
    meets_hypothesis = hypothesis_test.get('meets_30_percent_threshold', False)
    is_statistically_significant = bootstrap_ci.get('exceeds_30_percent_threshold', False)
    
    print(f"CORE HYPOTHESIS VALIDATION:")
    print(f"  ✓ Multi-stage LLM refinement implemented: YES")
    print(f"  ✓ Hallucination reduction technique applied: YES") 
    print(f"  ✓ NPD requirements traceability tested: YES")
    print(f"  ✓ ≥30% improvement achieved: {'YES' if meets_hypothesis else 'NO'}")
    print(f"  ✓ Statistical significance confirmed: {'YES' if is_statistically_significant else 'NO'}")
    
    if meets_hypothesis and is_statistically_significant:
        print(f"")
        print(f"🏆 RESEARCH HYPOTHESIS VALIDATED")
        print(f"   Multi-stage LLM refinement successfully reduces hallucinations")
        print(f"   and achieves statistically significant ≥30% improvement in")
        print(f"   NPD requirements traceability accuracy.")
    elif meets_hypothesis:
        print(f"")
        print(f"⚠️  HYPOTHESIS PARTIALLY VALIDATED")
        print(f"   30% improvement achieved but statistical significance unclear.")
        print(f"   Consider larger sample size for definitive validation.")
    else:
        print(f"")
        print(f"❌ HYPOTHESIS NOT VALIDATED") 
        # Only claim an improvement when the observed change is positive
        if hypothesis_test.get('observed_improvement', 0) > 0:
            print(f"   Enhancement shows improvement but below 30% threshold.")
        else:
            print("   Enhancement shows no improvement over the baseline.")
        print(f"   Further refinement of hallucination reduction techniques needed.")
    
    # Save comparison results for next cells
    globals()['comparison_df'] = comparison_df
    globals()['hypothesis_validation_results'] = {
        'improvements': improvements,
        'hypothesis_test': hypothesis_test,
        'bootstrap_ci': bootstrap_ci,
        'validated': meets_hypothesis and is_statistically_significant
    }
    
else:
    print("\nCannot create comparison - missing results from one or both approaches")
    print("Please ensure you have run:")
    print("  - Cell 5: Sentence transformer evaluation (creates best_thresholds_df)")
    print("  - Cell 8: Meta judge evaluation (creates meta_judge_best_df)")
    print("  - Cell 4: Combined sentence transformer dataset (creates combined_df)")
    print("  - Cell 7: Combined meta judge dataset (creates combined_meta_df)")

In [None]:
# Cell [10] - Visualize Comparison Results
# Purpose: Create visualizations comparing the two approaches
# Dependencies: comparison_df from Cell [9], matplotlib and seaborn from Cell [0]
# Breadcrumbs: Approach Comparison -> Data Visualization -> Performance Charts

if CONFIG['SHOW_VISUALIZATION'] and 'comparison_df' in globals() and not comparison_df.empty:
    # Resolve the optimisation metric once. Building the column name up front
    # keeps every panel consistent and avoids re-using the enclosing quote
    # character inside an f-string replacement field, which is a SyntaxError
    # on Python versions older than 3.12.
    _metric_label = CONFIG["OPTIMIZATION_METRIC"]
    _metric_col = f'{_metric_label.lower()}_score'

    # 2x2 grid: metric comparison, precision/recall scatter, top configurations,
    # and confusion-matrix counts of the best configuration per method.
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle(f'Comparison: Sentence Transformers vs Meta Judge - Project: {CONFIG["NEO4J_PROJECT_NAME"]}',
                 fontsize=16)

    # 1. Optimisation-metric comparison (best and mean score per method)
    ax = axes[0, 0]

    best_by_method = comparison_df.groupby('method')[_metric_col].agg(['max', 'mean']).reset_index()

    x = np.arange(len(best_by_method))
    width = 0.35

    ax.bar(x - width/2, best_by_method['max'], width, label='Best', color='#1A85FF')
    ax.bar(x + width/2, best_by_method['mean'], width, label='Average', color='#FFC61A')

    ax.set_xlabel('Method')
    ax.set_ylabel(f'{_metric_label} Score')
    ax.set_title(f'{_metric_label} Score Comparison')
    ax.set_xticks(x)
    ax.set_xticklabels(best_by_method['method'])
    ax.legend()
    ax.grid(axis='y', alpha=0.3)

    # 2. Precision-Recall scatter, one series per method
    ax = axes[0, 1]

    for method in comparison_df['method'].unique():
        method_data = comparison_df[comparison_df['method'] == method]
        ax.scatter(method_data['recall'], method_data['precision'],
                   label=method, s=100, alpha=0.6)

    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title('Precision vs Recall Trade-off')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # 3. Top configurations bar chart
    ax = axes[1, 0]

    # NOTE(review): head()/iloc[0] below take rows in their existing order, so this
    # presumably relies on comparison_df being sorted best-first upstream - confirm.
    top_n = 10
    top_models = comparison_df.head(top_n)

    y_pos = np.arange(len(top_models))
    colors = ['#1A85FF' if m == 'sentence_transformer' else '#D41159'
              for m in top_models['method']]

    bars = ax.barh(y_pos, top_models[_metric_col], color=colors)

    # Sentence transformers are labelled by the last path component of the model
    # name; meta judge rows also show which score column was thresholded.
    labels = []
    for _, row in top_models.iterrows():
        if row['method'] == 'sentence_transformer':
            label = row['model_name'].split('/')[-1] if '/' in row['model_name'] else row['model_name']
        else:
            label = f"{row['model_name']} ({row['score_column']})"
        labels.append(label)

    ax.set_yticks(y_pos)
    ax.set_yticklabels(labels)
    ax.set_xlabel(f'{_metric_label} Score')
    ax.set_title(f'Top {top_n} Configurations')
    ax.grid(axis='x', alpha=0.3)

    # Colour legend for the two methods
    from matplotlib.patches import Patch
    legend_elements = [
        Patch(facecolor='#1A85FF', label='Sentence Transformer'),
        Patch(facecolor='#D41159', label='Meta Judge')
    ]
    ax.legend(handles=legend_elements, loc='lower right')

    # 4. Confusion-matrix counts for the first row of each method
    ax = axes[1, 1]

    _st_rows = comparison_df[comparison_df['method'] == 'sentence_transformer']
    _mj_rows = comparison_df[comparison_df['method'] == 'meta_judge']
    best_st = _st_rows.iloc[0] if len(_st_rows) > 0 else None
    best_mj = _mj_rows.iloc[0] if len(_mj_rows) > 0 else None

    if best_st is not None and best_mj is not None:
        # Grouped bars: one group per confusion-matrix cell
        metrics = ['TP', 'FP', 'FN', 'TN']
        st_values = [best_st['true_positives'], best_st['false_positives'],
                     best_st['false_negatives'], best_st['true_negatives']]
        mj_values = [best_mj['true_positives'], best_mj['false_positives'],
                     best_mj['false_negatives'], best_mj['true_negatives']]

        x = np.arange(len(metrics))
        width = 0.35

        ax.bar(x - width/2, st_values, width, label='Sentence Transformer', color='#1A85FF')
        ax.bar(x + width/2, mj_values, width, label='Meta Judge', color='#D41159')

        ax.set_xlabel('Metric')
        ax.set_ylabel('Count')
        ax.set_title('Confusion Matrix Comparison (Best Models)')
        ax.set_xticks(x)
        ax.set_xticklabels(metrics)
        ax.legend()
        ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Print detailed comparison of best models
    print("\n\nDETAILED COMPARISON OF BEST MODELS:")
    print("=" * 100)

    for method in ['sentence_transformer', 'meta_judge']:
        method_best = comparison_df[comparison_df['method'] == method]
        if not method_best.empty:
            best = method_best.iloc[0]
            print(f"\nBest {method.upper()}:")
            print(f"  Model: {best['model_name']}")
            if method == 'meta_judge':
                print(f"  Score Column: {best['score_column']}")
            print(f"  Threshold: {best.get('best_threshold', 'N/A')}")
            print(f"  {_metric_label} Score: {best[_metric_col]:.3f}")
            print(f"  Precision: {best['precision']:.3f}")
            print(f"  Recall: {best['recall']:.3f}")
            print(f"  F1 Score: {best['f1_score']:.3f}")
            print(f"  Matthews Correlation: {best['matthews_corr']:.3f}")
            print(f"  Confusion Matrix:")
            print(f"    True Positives:  {best['true_positives']:4d}")
            print(f"    False Positives: {best['false_positives']:4d}")
            print(f"    False Negatives: {best['false_negatives']:4d}")
            print(f"    True Negatives:  {best['true_negatives']:4d}")

In [None]:
# Cell [11] - Statistical Significance Testing for 30% Improvement Hypothesis
# Purpose: Perform rigorous statistical tests to validate the 30% improvement hypothesis for hallucination reduction
# Dependencies: comparison_df and hypothesis_validation_results from Cell [9], combined_df from Cell [4], combined_meta_df from Cell [7], scipy.stats, sklearn.utils, statsmodels from Cell [0]
# Breadcrumbs: Performance Analysis -> Hypothesis Testing -> 30% Improvement Statistical Validation

def get_predictions_for_best_models(combined_df, combined_meta_df, best_st_config, best_mj_config):
    """
    Build paired binary predictions for the best configuration of each method

    Each method's rows are filtered to the configured model, its score is
    thresholded (score >= threshold -> 1), and the two sides are inner-joined
    on (source_id, target_id) so only pairs scored by both methods remain.

    Parameters:
        combined_df: DataFrame with sentence transformer data (columns used:
            model, similarity_score, ground_truth_traceable, source_id, target_id)
        combined_meta_df: DataFrame with meta judge data (columns used: model,
            the configured score column, ground_truth_traceable, source_id, target_id)
        best_st_config: Mapping with 'model_name' and 'best_threshold'
        best_mj_config: Mapping with 'model_name', 'score_column' and 'best_threshold'

    Returns:
        pd.DataFrame: Columns source_id, target_id, predicted_st, actual,
        predicted_mj; 'actual' is taken from the sentence transformer side
    """
    join_keys = ['source_id', 'target_id']

    # Sentence transformer side: threshold the similarity score
    st_rows = combined_df[combined_df['model'] == best_st_config['model_name']]
    st_side = st_rows.assign(
        predicted=(st_rows['similarity_score'] >= best_st_config['best_threshold']).astype(int),
        actual=st_rows['ground_truth_traceable'].astype(int),
    )[join_keys + ['predicted', 'actual']]

    # Meta judge side: threshold whichever score column was selected as best
    judge_column = best_mj_config['score_column']
    mj_rows = combined_meta_df[combined_meta_df['model'] == best_mj_config['model_name']]
    mj_side = mj_rows.assign(
        predicted=(mj_rows[judge_column] >= best_mj_config['best_threshold']).astype(int),
        actual=mj_rows['ground_truth_traceable'].astype(int),
    )[join_keys + ['predicted']]

    # Inner join keeps only instances present for both methods; the clashing
    # 'predicted' columns receive the _st / _mj suffixes.
    return st_side.merge(mj_side, on=join_keys, suffixes=('_st', '_mj'))

def mcnemars_test_with_effect_size(predictions_df):
    """
    Run McNemar's test on paired correctness of the two methods

    Parameters:
        predictions_df: DataFrame with columns predicted_st, predicted_mj, actual

    Returns:
        dict: When the methods disagree on at least one instance: statistic,
        pvalue, b (ST correct / MJ wrong), c (ST wrong / MJ correct),
        contingency_table, effect_size, mj_better and improvement_cases (c - b).
        When they never disagree: an 'error' message and the contingency_table.
    """
    truth = predictions_df['actual']
    st_ok = predictions_df['predicted_st'] == truth
    mj_ok = predictions_df['predicted_mj'] == truth

    # Cells of the paired 2x2 table (rows: ST correct/wrong, cols: MJ correct/wrong)
    both_ok = (st_ok & mj_ok).sum()
    b = (st_ok & ~mj_ok).sum()      # ST correct, MJ wrong
    c = (~st_ok & mj_ok).sum()      # ST wrong, MJ correct
    both_wrong = (~st_ok & ~mj_ok).sum()

    contingency_table = np.array([[both_ok, b], [c, both_wrong]])

    discordant = b + c
    if not discordant > 0:
        # Without discordant pairs the test is undefined
        return {
            'error': 'No disagreements between methods',
            'contingency_table': contingency_table
        }

    # Exact binomial version for fewer than 25 discordant pairs, chi-square otherwise
    use_exact = bool(discordant < 25)
    result = mcnemar(contingency_table, exact=use_exact)

    # |b - c| / sqrt(b + c): the square root of the uncorrected McNemar
    # chi-square statistic, used here as a standardized effect measure
    effect_size = abs(b - c) / np.sqrt(discordant)

    return {
        'statistic': result.statistic,
        'pvalue': result.pvalue,
        'b': b,  # ST correct, MJ wrong
        'c': c,  # ST wrong, MJ correct
        'contingency_table': contingency_table,
        'effect_size': effect_size,
        'mj_better': c > b,  # more instances rescued by MJ than lost
        'improvement_cases': c - b
    }

def one_sided_improvement_test(baseline_score, enhanced_score, n_samples, improvement_threshold=30.0):
    """
    Perform one-sided z-test for relative improvement above a threshold

    The scores are treated as proportions and the test uses a normal
    approximation:
        H0: enhanced - baseline <= threshold_absolute
        H1: enhanced - baseline >  threshold_absolute
    where threshold_absolute = improvement_threshold% of baseline_score.

    Degenerate inputs do not raise: a non-positive baseline yields an observed
    improvement of 0 (matching the other improvement helpers in this notebook),
    and a non-positive n_samples yields a standard error of 0, hence z = 0 and
    p = 0.5.

    Parameters:
        baseline_score: Baseline method score
        enhanced_score: Enhanced method score
        n_samples: Number of test samples
        improvement_threshold: Minimum improvement threshold in percent (default: 30%)

    Returns:
        dict: observed_improvement_pct, improvement_threshold, observed_difference,
        threshold_difference, z_statistic, p_value, meets_threshold,
        statistically_significant (p < 0.05) and standard_error
    """
    observed_diff = enhanced_score - baseline_score

    # Relative improvement is undefined for a zero baseline; report 0 instead of dividing by zero
    if baseline_score > 0:
        observed_improvement = (observed_diff / baseline_score) * 100
    else:
        observed_improvement = 0

    # Standard error of the difference of two proportions measured on n_samples each
    p1 = baseline_score
    p2 = enhanced_score
    if n_samples > 0:
        se_diff = np.sqrt((p1 * (1 - p1) + p2 * (1 - p2)) / n_samples)
    else:
        se_diff = 0.0

    # Convert the percentage threshold into an absolute score difference
    threshold_absolute = (improvement_threshold / 100) * baseline_score

    z_statistic = (observed_diff - threshold_absolute) / se_diff if se_diff > 0 else 0

    # Upper-tail probability; sf() is the numerically stable form of 1 - cdf()
    p_value = scipy_stats.norm.sf(z_statistic)

    return {
        'observed_improvement_pct': observed_improvement,
        'improvement_threshold': improvement_threshold,
        'observed_difference': observed_diff,
        'threshold_difference': threshold_absolute,
        'z_statistic': z_statistic,
        'p_value': p_value,
        'meets_threshold': observed_improvement >= improvement_threshold,
        'statistically_significant': p_value < 0.05,
        'standard_error': se_diff
    }

def bootstrap_hypothesis_test(y_true, y_pred_baseline, y_pred_enhanced, metric_func, 
                            improvement_threshold=30.0, n_bootstraps=10000, alpha=0.05):
    """
    Bootstrap test for the relative-improvement hypothesis with p-value calculation

    Instances are resampled with replacement; for each resample the metric is
    computed for both methods and the percentage improvement of the enhanced
    over the baseline method is recorded. Resamples whose baseline metric is 0
    contribute an improvement of 0.

    Parameters:
        y_true: True labels (array-like; coerced to a NumPy array so indexing is positional)
        y_pred_baseline: Baseline predictions (array-like)
        y_pred_enhanced: Enhanced predictions (array-like)
        metric_func: Metric callable; it is invoked as
            metric_func(y_true, y_pred, beta=2, zero_division=0)
        improvement_threshold: Minimum improvement threshold in percent (default: 30%)
        n_bootstraps: Number of bootstrap samples
        alpha: Significance level

    Returns:
        dict: Bootstrap hypothesis test results. 'p_value_bootstrap' is the share
        of resamples with improvement <= threshold. 'power' is a heuristic derived
        from that share (1 - p when p < 0.5, else p), not a formal power estimate.
    """
    # Positional indexing below must not depend on a pandas index or fail on lists
    y_true = np.asarray(y_true)
    y_pred_baseline = np.asarray(y_pred_baseline)
    y_pred_enhanced = np.asarray(y_pred_enhanced)

    n_samples = len(y_true)
    improvements = []
    baseline_scores = []
    enhanced_scores = []

    # Bootstrap resampling
    for _ in range(n_bootstraps):
        # Draw n_samples row positions with replacement
        indices = resample(np.arange(n_samples), n_samples=n_samples)

        # Calculate metrics for resampled data
        baseline_score = metric_func(y_true[indices], y_pred_baseline[indices], beta=2, zero_division=0)
        enhanced_score = metric_func(y_true[indices], y_pred_enhanced[indices], beta=2, zero_division=0)

        baseline_scores.append(baseline_score)
        enhanced_scores.append(enhanced_score)

        # Percentage improvement; undefined for a zero baseline, recorded as 0
        if baseline_score > 0:
            improvements.append(((enhanced_score - baseline_score) / baseline_score) * 100)
        else:
            improvements.append(0)

    improvements = np.array(improvements)
    mean_improvement = np.mean(improvements)
    std_improvement = np.std(improvements)

    # Two-sided percentile confidence interval
    lower_percentile = (alpha / 2) * 100
    upper_percentile = (1 - alpha / 2) * 100
    improvement_ci = np.percentile(improvements, [lower_percentile, upper_percentile])

    # Hypothesis test: H0: improvement <= threshold, H1: improvement > threshold
    # P-value = proportion of bootstrap samples with improvement <= threshold
    p_value_bootstrap = np.mean(improvements <= improvement_threshold)

    # One-sided confidence interval (lower bound only for improvement)
    improvement_ci_lower = np.percentile(improvements, alpha * 100)

    return {
        'mean_improvement': mean_improvement,
        'std_improvement': std_improvement,
        'improvement_ci': improvement_ci,
        'improvement_ci_lower_one_sided': improvement_ci_lower,
        'baseline_mean': np.mean(baseline_scores),
        'enhanced_mean': np.mean(enhanced_scores),
        'p_value_bootstrap': p_value_bootstrap,
        'meets_threshold_ci': improvement_ci[0] > improvement_threshold,
        'meets_threshold_one_sided': improvement_ci_lower > improvement_threshold,
        'effect_size': mean_improvement / std_improvement if std_improvement > 0 else 0,
        'bootstrap_samples': len(improvements),
        'power': 1 - p_value_bootstrap if p_value_bootstrap < 0.5 else p_value_bootstrap
    }

def permutation_test_improvement(y_true, y_pred_baseline, y_pred_enhanced, metric_func,
                               improvement_threshold=30.0, n_permutations=10000):
    """
    Paired permutation test for improvement significance

    Under the null hypothesis the two methods are exchangeable, so for every
    permutation the method labels are swapped independently per instance with
    probability 0.5 and the percentage improvement is recomputed. (Swapping
    the two prediction vectors as a whole can only ever produce two distinct
    null values and pins the p-value near 0.5.) Permutations whose first
    pseudo-method scores 0 are skipped because the relative improvement is
    undefined.

    Parameters:
        y_true: True labels (array-like)
        y_pred_baseline: Baseline predictions (array-like)
        y_pred_enhanced: Enhanced predictions (array-like)
        metric_func: Metric callable; it is invoked as
            metric_func(y_true, y_pred, beta=2, zero_division=0)
        improvement_threshold: Minimum improvement threshold; accepted for
            signature compatibility, not used by this test
        n_permutations: Number of permutation samples

    Returns:
        dict: observed_improvement, p_value_permutation (share of null
        improvements >= observed), null_distribution_mean,
        null_distribution_std and permutation_samples. The p-value and null
        statistics are NaN when no permutation produced a usable sample.
    """
    y_true = np.asarray(y_true)
    y_pred_baseline = np.asarray(y_pred_baseline)
    y_pred_enhanced = np.asarray(y_pred_enhanced)

    # Calculate observed improvement
    baseline_score = metric_func(y_true, y_pred_baseline, beta=2, zero_division=0)
    enhanced_score = metric_func(y_true, y_pred_enhanced, beta=2, zero_division=0)
    observed_improvement = ((enhanced_score - baseline_score) / baseline_score) * 100 if baseline_score > 0 else 0

    n_samples = len(y_true)
    improvements_null = []

    for _ in range(n_permutations):
        # Per-instance coin flip: True means the two methods trade predictions for that instance
        swap = np.random.random(n_samples) < 0.5
        perm_pred1 = np.where(swap, y_pred_enhanced, y_pred_baseline)
        perm_pred2 = np.where(swap, y_pred_baseline, y_pred_enhanced)

        # Calculate metrics
        score1 = metric_func(y_true, perm_pred1, beta=2, zero_division=0)
        score2 = metric_func(y_true, perm_pred2, beta=2, zero_division=0)

        # Calculate improvement
        if score1 > 0:
            improvements_null.append(((score2 - score1) / score1) * 100)

    improvements_null = np.asarray(improvements_null, dtype=float)

    if improvements_null.size > 0:
        # p-value: proportion of null improvements >= observed
        p_value_permutation = np.mean(improvements_null >= observed_improvement)
        null_mean = np.mean(improvements_null)
        null_std = np.std(improvements_null)
    else:
        # Nothing to compare against; report NaN explicitly instead of averaging an empty array
        p_value_permutation = null_mean = null_std = float('nan')

    return {
        'observed_improvement': observed_improvement,
        'p_value_permutation': p_value_permutation,
        'null_distribution_mean': null_mean,
        'null_distribution_std': null_std,
        'permutation_samples': len(improvements_null)
    }

def comprehensive_hypothesis_validation(baseline_config, enhanced_config, predictions_df, 
                                      y_true, y_pred_baseline, y_pred_enhanced,
                                      improvement_threshold=30.0, alpha=0.05):
    """
    Run every statistical check for the improvement hypothesis and bundle the results

    Parameters:
        baseline_config: Baseline method configuration (not read by this function)
        enhanced_config: Enhanced method configuration (not read by this function)
        predictions_df: DataFrame with predictions from both methods
        y_true: True labels
        y_pred_baseline: Baseline predictions
        y_pred_enhanced: Enhanced predictions
        improvement_threshold: Minimum improvement threshold (default: 30%)
        alpha: Significance level, forwarded to the bootstrap test

    Returns:
        dict: Observed F2 scores and improvement, sample size, the result dicts of
        the McNemar, one-sided, bootstrap (10000 resamples) and permutation
        (5000 permutations) tests, and a standardized effect size
    """
    n_obs = len(y_true)

    # Observed F2 of each method and the relative gain (0 when the baseline F2 is 0)
    f2_base = fbeta_score(y_true, y_pred_baseline, beta=2, zero_division=0)
    f2_enh = fbeta_score(y_true, y_pred_enhanced, beta=2, zero_division=0)
    if f2_base > 0:
        pct_gain = ((f2_enh - f2_base) / f2_base) * 100
    else:
        pct_gain = 0

    summary = {
        'observed_improvement': pct_gain,
        'improvement_threshold': improvement_threshold,
        'sample_size': n_obs,
    }

    # The tests run in a fixed order: McNemar, one-sided z-test, bootstrap, permutation
    summary['mcnemar_test'] = mcnemars_test_with_effect_size(predictions_df)
    summary['one_sided_test'] = one_sided_improvement_test(
        f2_base, f2_enh, n_obs, improvement_threshold
    )
    summary['bootstrap_test'] = bootstrap_hypothesis_test(
        y_true, y_pred_baseline, y_pred_enhanced, fbeta_score,
        improvement_threshold, n_bootstraps=10000, alpha=alpha
    )
    summary['permutation_test'] = permutation_test_improvement(
        y_true, y_pred_baseline, y_pred_enhanced, fbeta_score,
        improvement_threshold, n_permutations=5000
    )

    # Standardized difference: F2 gap divided by the root of the averaged p*(1-p) terms
    pooled_variance = (f2_base * (1 - f2_base) + f2_enh * (1 - f2_enh)) / 2
    summary['effect_size_cohen'] = (f2_enh - f2_base) / np.sqrt(pooled_variance)

    summary['baseline_f2'] = f2_base
    summary['enhanced_f2'] = f2_enh
    return summary

# Perform comprehensive statistical validation if we have comparison results
# Run only when every notebook global this cell reads exists and both methods have
# at least one row in comparison_df; otherwise .iloc[0] below would raise IndexError
# and the combined_* lookups would raise NameError.
if (all(name in globals() for name in ('comparison_df', 'hypothesis_validation_results',
                                       'combined_df', 'combined_meta_df'))
        and not comparison_df.empty
        and {'sentence_transformer', 'meta_judge'} <= set(comparison_df['method'])):

    print("COMPREHENSIVE STATISTICAL VALIDATION: 30% IMPROVEMENT HYPOTHESIS")
    print("=" * 100)
    print("STATISTICAL FRAMEWORK: Multiple testing approaches to validate hallucination reduction claims")
    print("PRIMARY HYPOTHESIS: H₀: Improvement ≤ 30%, H₁: Improvement > 30%")
    print("SIGNIFICANCE LEVEL: α = 0.05")
    print("=" * 100)

    # First row of each method is used as that method's best configuration
    best_st = comparison_df[comparison_df['method'] == 'sentence_transformer'].iloc[0]
    best_mj = comparison_df[comparison_df['method'] == 'meta_judge'].iloc[0]

    print(f"\nMETHODS UNDER COMPARISON:")
    print(f"  BASELINE: {best_st['model_name']} (Sentence Transformer)")
    print(f"  ENHANCED: {best_mj['model_name']} - {best_mj['score_column']} (Meta Judge)")
    print(f"  F2 SCORES: Baseline = {best_st['f2_score']:.4f}, Enhanced = {best_mj['f2_score']:.4f}")
    print(f"  OBSERVED IMPROVEMENT: {((best_mj['f2_score'] - best_st['f2_score']) / best_st['f2_score'] * 100):.2f}%")

    # Paired predictions on the instances scored by both methods
    predictions = get_predictions_for_best_models(combined_df, combined_meta_df, best_st, best_mj)

    if not predictions.empty:
        y_true = predictions['actual'].values
        y_pred_st = predictions['predicted_st'].values
        y_pred_mj = predictions['predicted_mj'].values

        print(f"\nTEST DATASET:")
        print(f"  Common instances: {len(predictions)}")
        print(f"  Positive cases: {y_true.sum()} ({y_true.sum()/len(y_true)*100:.1f}%)")
        print(f"  Negative cases: {len(y_true) - y_true.sum()} ({(len(y_true) - y_true.sum())/len(y_true)*100:.1f}%)")

        # Perform comprehensive validation
        validation_results = comprehensive_hypothesis_validation(
            best_st, best_mj, predictions, y_true, y_pred_st, y_pred_mj,
            improvement_threshold=30.0, alpha=0.05
        )

        print(f"\n1. McNEMAR'S TEST FOR PAIRED CLASSIFIER COMPARISON")
        print("-" * 80)

        mcnemar_test = validation_results['mcnemar_test']
        if 'error' not in mcnemar_test:
            print(f"Contingency Table (Method Performance):")
            print(f"                    MJ Correct    MJ Wrong")
            print(f"ST Correct:         {mcnemar_test['contingency_table'][0,0]:>10} {mcnemar_test['contingency_table'][0,1]:>10}")
            print(f"ST Wrong:           {mcnemar_test['contingency_table'][1,0]:>10} {mcnemar_test['contingency_table'][1,1]:>10}")
            print(f"")
            print(f"Disagreement Analysis:")
            print(f"  ST correct, MJ wrong: {mcnemar_test['b']} cases")
            print(f"  ST wrong, MJ correct: {mcnemar_test['c']} cases")
            print(f"  Net improvement: {mcnemar_test['improvement_cases']} cases")
            print(f"")
            print(f"Statistical Results:")
            print(f"  McNemar's χ²: {mcnemar_test['statistic']:.4f}")
            print(f"  p-value: {mcnemar_test['pvalue']:.4f}")
            print(f"  Effect size: {mcnemar_test['effect_size']:.4f}")
            print(f"  MJ superior: {'YES' if mcnemar_test['mj_better'] else 'NO'}")

            if mcnemar_test['pvalue'] < 0.05:
                print(f"  ✅ SIGNIFICANT: Methods perform differently (p < 0.05)")
            else:
                print(f"  ❌ NOT SIGNIFICANT: No significant difference in method performance")
        else:
            # Report why the section is empty instead of printing nothing
            print(f"  McNemar's test skipped: {mcnemar_test['error']}")

        print(f"\n2. ONE-SIDED IMPROVEMENT TEST (30% THRESHOLD)")
        print("-" * 80)

        one_sided = validation_results['one_sided_test']
        print(f"Hypothesis Testing:")
        print(f"  H₀: Improvement ≤ 30%")
        print(f"  H₁: Improvement > 30%")
        print(f"")
        print(f"Test Results:")
        print(f"  Observed improvement: {one_sided['observed_improvement_pct']:.2f}%")
        print(f"  Improvement threshold: {one_sided['improvement_threshold']:.1f}%")
        print(f"  Z-statistic: {one_sided['z_statistic']:.4f}")
        print(f"  p-value (one-sided): {one_sided['p_value']:.4f}")
        print(f"  Standard error: {one_sided['standard_error']:.4f}")
        print(f"")
        print(f"Hypothesis Decision:")
        print(f"  Meets 30% threshold: {'YES' if one_sided['meets_threshold'] else 'NO'}")
        print(f"  Statistically significant: {'YES' if one_sided['statistically_significant'] else 'NO'}")

        if one_sided['meets_threshold'] and one_sided['statistically_significant']:
            print(f"  🎯 HYPOTHESIS VALIDATED: ≥30% improvement with statistical significance")
        elif one_sided['meets_threshold']:
            print(f"  ⚠️  THRESHOLD MET: But not statistically significant")
        else:
            print(f"  ❌ HYPOTHESIS REJECTED: Does not meet 30% threshold")

        print(f"\n3. BOOTSTRAP HYPOTHESIS TEST (10,000 SAMPLES)")
        print("-" * 80)

        bootstrap = validation_results['bootstrap_test']
        print(f"Bootstrap Distribution:")
        print(f"  Mean improvement: {bootstrap['mean_improvement']:.2f}%")
        print(f"  Standard deviation: {bootstrap['std_improvement']:.2f}%")
        print(f"  95% CI: [{bootstrap['improvement_ci'][0]:.2f}%, {bootstrap['improvement_ci'][1]:.2f}%]")
        print(f"  95% Lower bound: {bootstrap['improvement_ci_lower_one_sided']:.2f}%")
        print(f"")
        print(f"Hypothesis Test Results:")
        print(f"  Bootstrap p-value: {bootstrap['p_value_bootstrap']:.4f}")
        print(f"  Effect size (standardized): {bootstrap['effect_size']:.3f}")
        print(f"  Statistical power: {bootstrap['power']:.3f}")
        print(f"")
        print(f"Confidence Interval Tests:")
        print(f"  Two-sided CI exceeds 30%: {'YES' if bootstrap['meets_threshold_ci'] else 'NO'}")
        print(f"  One-sided CI exceeds 30%: {'YES' if bootstrap['meets_threshold_one_sided'] else 'NO'}")

        if bootstrap['meets_threshold_one_sided']:
            print(f"  ✅ BOOTSTRAP VALIDATION: 95% confident improvement > 30%")
        elif bootstrap['p_value_bootstrap'] < 0.05:
            print(f"  ⚠️  SIGNIFICANT IMPROVEMENT: But uncertain if > 30%")
        else:
            print(f"  ❌ BOOTSTRAP REJECTION: Cannot confirm > 30% improvement")

        print(f"\n4. PERMUTATION TEST (5,000 SAMPLES)")
        print("-" * 80)

        permutation = validation_results['permutation_test']
        print(f"Permutation Analysis:")
        print(f"  Observed improvement: {permutation['observed_improvement']:.2f}%")
        print(f"  Null distribution mean: {permutation['null_distribution_mean']:.2f}%")
        print(f"  Null distribution std: {permutation['null_distribution_std']:.2f}%")
        print(f"  Permutation p-value: {permutation['p_value_permutation']:.4f}")
        print(f"")
        print(f"Null Hypothesis Rejection:")
        if permutation['p_value_permutation'] < 0.05:
            print(f"  ✅ SIGNIFICANT: Observed improvement unlikely due to chance (p < 0.05)")
        else:
            print(f"  ❌ NOT SIGNIFICANT: Improvement could be due to random chance")

        print(f"\n5. EFFECT SIZE AND POWER ANALYSIS")
        print("-" * 80)

        cohen_d = validation_results['effect_size_cohen']
        print(f"Effect Size Analysis:")
        print(f"  Cohen's d: {cohen_d:.4f}")

        # Conventional 0.2 / 0.5 / 0.8 cut-offs on the absolute effect size
        if abs(cohen_d) >= 0.8:
            effect_interpretation = "Large"
        elif abs(cohen_d) >= 0.5:
            effect_interpretation = "Medium"
        elif abs(cohen_d) >= 0.2:
            effect_interpretation = "Small"
        else:
            effect_interpretation = "Negligible"

        print(f"  Effect interpretation: {effect_interpretation}")
        print(f"  Practical significance: {'YES' if abs(cohen_d) >= 0.5 else 'NO'}")

        # Sample size adequacy (1.96 and 0.84 are the z-values for two-sided
        # alpha = 0.05 and 80% power); non-positive effect sizes map to infinity
        n = len(y_true)
        min_n_for_power = 8 * (1.96 + 0.84)**2 / (cohen_d**2) if cohen_d > 0 else float('inf')
        print(f"")
        print(f"Power Analysis:")
        print(f"  Current sample size: {n}")
        print(f"  Minimum n for 80% power: {min_n_for_power:.0f}")
        print(f"  Sample size adequate: {'YES' if n >= min_n_for_power else 'NO'}")

        print(f"\n6. MULTIPLE TESTING CORRECTION")
        print("-" * 80)

        # Collect all p-values for correction
        p_values = []
        test_names = []

        if 'pvalue' in mcnemar_test:
            p_values.append(mcnemar_test['pvalue'])
            test_names.append("McNemar's Test")

        p_values.append(one_sided['p_value'])
        test_names.append("One-sided Improvement")

        p_values.append(bootstrap['p_value_bootstrap'])
        test_names.append("Bootstrap Test")

        p_values.append(permutation['p_value_permutation'])
        test_names.append("Permutation Test")

        # Apply Bonferroni correction
        if p_values:
            reject, p_adjusted, _, _ = multipletests(p_values, method='bonferroni', alpha=0.05)

            print(f"Bonferroni Correction (α = 0.05):")
            for test, p_raw, p_adj, rejected in zip(test_names, p_values, p_adjusted, reject):
                print(f"  {test}: p = {p_raw:.4f} → p_adj = {p_adj:.4f} {'✅' if rejected else '❌'}")

        print(f"\n7. FINAL HYPOTHESIS VALIDATION DECISION")
        print("=" * 80)

        # Decision criteria
        meets_30_percent = validation_results['observed_improvement'] >= 30.0
        one_sided_significant = one_sided['statistically_significant']
        bootstrap_significant = bootstrap['meets_threshold_one_sided']
        effect_size_adequate = abs(cohen_d) >= 0.5

        # Count supporting evidence
        evidence_count = sum([
            meets_30_percent,
            one_sided_significant,
            bootstrap_significant,
            effect_size_adequate
        ])

        print(f"EVIDENCE SUMMARY:")
        print(f"  ✓ Observed improvement ≥ 30%: {'YES' if meets_30_percent else 'NO'}")
        print(f"  ✓ One-sided test significant: {'YES' if one_sided_significant else 'NO'}")
        print(f"  ✓ Bootstrap CI supports ≥30%: {'YES' if bootstrap_significant else 'NO'}")
        print(f"  ✓ Effect size adequate: {'YES' if effect_size_adequate else 'NO'}")
        print(f"")
        print(f"SUPPORTING EVIDENCE: {evidence_count}/4 criteria met")

        # Final decision: 3+ criteria = strong, 2 = moderate, otherwise insufficient
        if evidence_count >= 3:
            print(f"")
            print(f"🏆 STRONG STATISTICAL VALIDATION")
            print(f"   The 30% improvement hypothesis is STRONGLY SUPPORTED")
            print(f"   by multiple independent statistical tests.")
            validation_strength = "STRONG"
        elif evidence_count >= 2:
            print(f"")
            print(f"⚠️  MODERATE STATISTICAL VALIDATION")
            print(f"   The 30% improvement hypothesis is MODERATELY SUPPORTED.")
            print(f"   Consider additional validation or larger sample size.")
            validation_strength = "MODERATE"
        else:
            print(f"")
            print(f"❌ INSUFFICIENT STATISTICAL VALIDATION")
            print(f"   The 30% improvement hypothesis is NOT SUPPORTED")
            print(f"   by the statistical evidence.")
            validation_strength = "INSUFFICIENT"

        print(f"\n8. NPD BUSINESS IMPACT VALIDATION")
        print("-" * 80)

        print(f"HALLUCINATION REDUCTION EFFECTIVENESS:")
        fp_reduction = best_st['false_positives'] - best_mj['false_positives']
        fn_reduction = best_st['false_negatives'] - best_mj['false_negatives']

        print(f"  False Positive Reduction: {fp_reduction} cases")
        print(f"  False Negative Reduction: {fn_reduction} cases")
        print(f"  Total Error Reduction: {fp_reduction + fn_reduction} cases")
        print(f"")
        print(f"NPD REQUIREMENTS TRACEABILITY IMPACT:")
        print(f"  Accuracy improvement: {((best_mj['accuracy'] - best_st['accuracy']) / best_st['accuracy'] * 100):+.1f}%")
        print(f"  Precision improvement: {((best_mj['precision'] - best_st['precision']) / best_st['precision'] * 100):+.1f}%")
        print(f"  Recall improvement: {((best_mj['recall'] - best_st['recall']) / best_st['recall'] * 100):+.1f}%")

        # Save validation results
        globals()['statistical_validation_results'] = {
            'validation_strength': validation_strength,
            'evidence_count': evidence_count,
            'meets_30_percent': meets_30_percent,
            'comprehensive_results': validation_results
        }

        print(f"\n" + "=" * 100)
        print(f"STATISTICAL VALIDATION COMPLETE")
        print(f"Validation Strength: {validation_strength}")
        print(f"Evidence Score: {evidence_count}/4")
        print(f"=" * 100)

    else:
        print("\nError: Could not match predictions between methods for statistical testing.")
        print("Ensure both methods were evaluated on the same requirement pairs.")

else:
    print("\nStatistical validation not available.")
    print("Please ensure you have run:")
    print("  - Cell 9: Hypothesis testing comparison (creates comparison_df and hypothesis_validation_results)")
    print("  - All previous cells to load the required data")

In [None]:
# Cell [12] - Hypothesis Testing: Meta Judge Scoring Approaches for 30% Hallucination Reduction
# Purpose: Test whether enhanced meta judge scoring methods achieve ≥30% improvement over baseline actor scores
# Dependencies: meta_judge_df from Cell [6], df_ground_truth from Cell [3], statistical libraries from Cell [0]
# Breadcrumbs: Meta Judge Data -> Hypothesis Testing -> 30% Improvement Validation for Scoring Methods

def test_30_percent_improvement_hypothesis(baseline_score, enhanced_score, approach_name, baseline_name="actor_score"):
    """
    Check a score pair against the 30% relative-improvement target

    The relative improvement is (enhanced - baseline) / baseline, in percent.
    The target counts as met when that value is greater than or equal to 30.
    This is a point comparison of two numbers, not a statistical test.

    Parameters:
        baseline_score: Baseline method score (must be a positive number)
        enhanced_score: Enhanced method score
        approach_name: Name of the enhanced approach
        baseline_name: Name of the baseline approach

    Returns:
        dict: Hypothesis test results. When baseline_score is zero, negative
              or NaN, the dict holds only the keys 'error' and 'approach'.
    """
    # `not x > 0` (rather than `x <= 0`) also rejects NaN, which would
    # otherwise flow through and fill every numeric field with NaN.
    if not baseline_score > 0:
        return {'error': 'Invalid baseline score', 'approach': approach_name}

    # Calculate percentage improvement
    improvement_pct = ((enhanced_score - baseline_score) / baseline_score) * 100

    # Target check: improvement of at least 30%
    meets_threshold = improvement_pct >= 30.0
    improvement_above_threshold = improvement_pct - 30.0

    # Heuristic magnitude: margin to the 30% mark in units of 10 percentage
    # points. This is NOT Cohen's d (no variance is involved); the
    # 0.2 / 0.5 / 0.8 cut-offs are only borrowed as bucket boundaries.
    effect_size = improvement_above_threshold / 10

    if abs(effect_size) >= 0.8:
        practical_significance = "Large"
    elif abs(effect_size) >= 0.5:
        practical_significance = "Medium"
    elif abs(effect_size) >= 0.2:
        practical_significance = "Small"
    else:
        practical_significance = "Negligible"

    return {
        'approach': approach_name,
        'baseline_name': baseline_name,
        'baseline_score': baseline_score,
        'enhanced_score': enhanced_score,
        'improvement_pct': improvement_pct,
        'improvement_absolute': enhanced_score - baseline_score,
        'meets_30_percent_threshold': meets_threshold,
        'improvement_above_threshold': improvement_above_threshold,
        'effect_size': effect_size,
        'practical_significance': practical_significance
    }


# The name starts with "test_", so pytest would try to collect this helper as
# a test case (and fail on its parameters) if this code is ever imported into
# a test module. Opt out explicitly; the public name stays unchanged.
test_30_percent_improvement_hypothesis.__test__ = False

def bootstrap_improvement_test_meta_judge(y_true, y_pred_baseline, y_pred_enhanced, 
                                        approach_name, n_bootstraps=5000, alpha=0.05,
                                        random_state=None):
    """
    Bootstrap test for 30% improvement hypothesis in meta judge scoring

    Rows are resampled with replacement; for each resample the relative F2
    improvement of the enhanced predictions over the baseline predictions is
    computed in percent. Resamples whose baseline F2 is 0 contribute an
    improvement of 0, because the relative change is undefined there.

    Parameters:
        y_true: True labels (array-like)
        y_pred_baseline: Baseline predictions (actor_score), same length as y_true
        y_pred_enhanced: Enhanced predictions (meta judge approach), same length as y_true
        approach_name: Name of the enhanced approach
        n_bootstraps: Number of bootstrap samples
        alpha: Significance level
        random_state: None (default, uses NumPy's global random state as
            before), an int seed, or a numpy.random.RandomState, for
            reproducible resampling

    Returns:
        dict: Bootstrap test results. 'improvement_ci' is the two-sided
              (1 - alpha) percentile interval; 'improvement_ci_lower_one_sided'
              is the alpha-percentile. The one-sided lower bound is never
              below the two-sided lower bound, so 'exceeds_30_percent_ci'
              being True implies 'exceeds_30_percent_one_sided' is True.

    Raises:
        ValueError: If the inputs are scalars, differ in length, or are empty
    """
    # Coerce to arrays so the resampled positions always index positionally.
    # Lists cannot be fancy-indexed, and a pandas Series with a non-default
    # index would be looked up by label instead of by position.
    arrays = [np.asarray(values) for values in (y_true, y_pred_baseline, y_pred_enhanced)]
    if any(arr.ndim == 0 for arr in arrays) or len({len(arr) for arr in arrays}) != 1:
        raise ValueError("y_true, y_pred_baseline and y_pred_enhanced must be array-likes of equal length")
    y_true, y_pred_baseline, y_pred_enhanced = arrays

    n_samples = len(y_true)
    if n_samples == 0:
        raise ValueError("Cannot bootstrap an empty sample")

    # One generator shared across iterations: passing the same int seed to
    # every resample() call would draw the identical sample each time.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    positions = np.arange(n_samples)
    improvements = []

    for _ in range(n_bootstraps):
        # Resample indices
        indices = resample(positions, n_samples=n_samples, random_state=rng)

        # Calculate F2 scores for resampled data
        baseline_f2 = fbeta_score(y_true[indices], y_pred_baseline[indices], beta=2, zero_division=0)
        enhanced_f2 = fbeta_score(y_true[indices], y_pred_enhanced[indices], beta=2, zero_division=0)

        # Calculate percentage improvement
        if baseline_f2 > 0:
            improvements.append(((enhanced_f2 - baseline_f2) / baseline_f2) * 100)
        else:
            improvements.append(0)

    improvements = np.array(improvements)

    # Calculate confidence intervals
    lower_percentile = (alpha / 2) * 100
    upper_percentile = (1 - alpha / 2) * 100
    improvement_ci = np.percentile(improvements, [lower_percentile, upper_percentile])

    # One-sided confidence interval (lower bound for improvement)
    improvement_ci_lower = np.percentile(improvements, alpha * 100)

    # Test if improvement exceeds 30%
    exceeds_30_percent_ci = improvement_ci[0] > 30.0
    exceeds_30_percent_one_sided = improvement_ci_lower > 30.0

    return {
        'approach': approach_name,
        'mean_improvement': np.mean(improvements),
        'std_improvement': np.std(improvements),
        'improvement_ci': improvement_ci,
        'improvement_ci_lower_one_sided': improvement_ci_lower,
        'exceeds_30_percent_ci': exceeds_30_percent_ci,
        'exceeds_30_percent_one_sided': exceeds_30_percent_one_sided,
        'bootstrap_samples': len(improvements)
    }

# Cell banner: states the hypothesis under test and the methods compared
print("\n".join([
    "HYPOTHESIS TESTING: META JUDGE SCORING APPROACHES",
    "=" * 100,
    "CORE HYPOTHESIS: Techniques reducing LLM hallucinations by 30% can improve NPD accuracy",
    "OPERATIONAL HYPOTHESIS: Enhanced meta judge scoring achieves ≥30% improvement over baseline actor_score",
    "BASELINE METHOD: actor_score (original actor predictions)",
    "ENHANCED METHODS: final_score, combined scores (hallucination reduction techniques)",
    "=" * 100,
]))

# Report which of the frames produced by earlier cells exist in this session
available_data = [
    frame_name
    for frame_name in ('meta_judge_df', 'combined_meta_df', 'df_ground_truth')
    if frame_name in globals()
]

print(f"\nAvailable data structures: {available_data}")

# Pick the frame to analyse. combined_meta_df wins when it exists and has
# rows; otherwise fall back to meta_judge_df labelled from df_ground_truth.
if 'combined_meta_df' in globals() and not combined_meta_df.empty:
    ground_truth_col = 'ground_truth_traceable'
    working_df = combined_meta_df.copy()
    print(f"\nUsing combined_meta_df with {len(working_df)} records")
elif 'meta_judge_df' in globals() and not meta_judge_df.empty and 'df_ground_truth' in globals() and not df_ground_truth.empty:
    ground_truth_col = 'ground_truth'
    working_df = meta_judge_df.copy()

    # Derive the label when the frame does not carry one: a row is positive
    # when its (source_id, target_id) pair appears in df_ground_truth
    if ground_truth_col not in working_df.columns:
        ground_truth_pairs = set(zip(df_ground_truth['source_id'], df_ground_truth['target_id']))
        working_df[ground_truth_col] = [
            id_pair in ground_truth_pairs
            for id_pair in zip(working_df['source_id'], working_df['target_id'])
        ]
    print(f"\nUsing meta_judge_df with {len(working_df)} records")
else:
    # Nothing usable: an empty frame makes the analysis below skip itself
    working_df = pd.DataFrame()
    print("\nERROR: Required data not available. Please run previous cells to load meta judge and ground truth data.")

# Main analysis. Runs only when a non-empty working frame with a ground-truth
# column was selected above. Steps: build the score columns, pick the
# F2-optimal threshold per approach, compare each enhanced approach with the
# actor_score baseline, bootstrap the improvement, then summarise and plot.
if not working_df.empty and ground_truth_col in working_df.columns:

    print(f"\n1. BASELINE vs ENHANCED APPROACHES IDENTIFICATION")
    print("-" * 80)

    # Define baseline and enhanced approaches for hypothesis testing
    baseline_approach = 'actor_score'

    # Enhanced approaches that represent hallucination reduction techniques
    # (column name -> human-readable description)
    enhanced_approaches = {}

    # Individual enhanced scores
    if 'final_score' in working_df.columns:
        enhanced_approaches['final_score'] = 'Meta-judge refined score'

    # Combined scores representing multi-stage refinement.
    # Missing values count as 0 in every sum.
    available_combinations = []
    if 'actor_score' in working_df.columns and 'final_score' in working_df.columns:
        working_df['total_actor_final'] = working_df['actor_score'].fillna(0) + working_df['final_score'].fillna(0)
        enhanced_approaches['total_actor_final'] = 'Actor + Final (two-stage refinement)'
        available_combinations.append('total_actor_final')

    if all(col in working_df.columns for col in ['actor_score', 'judge_score', 'final_score']):
        working_df['total_all_three'] = (working_df['actor_score'].fillna(0) + 
                                       working_df['judge_score'].fillna(0) + 
                                       working_df['final_score'].fillna(0))
        enhanced_approaches['total_all_three'] = 'Actor + Judge + Final (three-stage refinement)'
        available_combinations.append('total_all_three')

    # All available metrics combination
    individual_scores = ['actor_score', 'judge_score', 'final_score', 
                       'semantic_alignment', 'non_functional_coverage', 
                       'functional_completeness']
    available_scores = [score for score in individual_scores if score in working_df.columns]

    if len(available_scores) >= 3:
        working_df['total_combined_all'] = sum(working_df[score].fillna(0) for score in available_scores)
        enhanced_approaches['total_combined_all'] = f'Combined all {len(available_scores)} metrics'
        available_combinations.append('total_combined_all')

    print(f"BASELINE APPROACH: {baseline_approach} (original actor predictions)")
    print(f"ENHANCED APPROACHES ({len(enhanced_approaches)} total):")
    for approach, description in enhanced_approaches.items():
        print(f"  - {approach}: {description}")

    # Get ground truth labels
    y_true = working_df[ground_truth_col].astype(int).values
    print(f"\nGROUND TRUTH DISTRIBUTION:")
    print(f"  Total instances: {len(y_true)}")
    print(f"  Positive cases: {y_true.sum()} ({y_true.sum()/len(y_true)*100:.1f}%)")
    print(f"  Negative cases: {len(y_true) - y_true.sum()} ({(len(y_true) - y_true.sum())/len(y_true)*100:.1f}%)")

    # Check if baseline approach is available
    if baseline_approach not in working_df.columns:
        print(f"\nERROR: Baseline approach '{baseline_approach}' not found in data")
        print(f"Available columns: {list(working_df.columns)}")
    else:
        print(f"\n2. OPTIMAL THRESHOLD IDENTIFICATION FOR EACH APPROACH")
        print("-" * 80)

        # Store results for each approach
        approach_results = {}

        # Evaluate baseline approach. Candidate thresholds are 50 evenly
        # spaced percentiles of the strictly positive scores.
        baseline_scores = working_df[baseline_approach].fillna(0).values
        thresholds = np.percentile(baseline_scores[baseline_scores > 0], np.linspace(0, 100, 50)) if np.any(baseline_scores > 0) else [0]

        # These stay at 0 / None when no threshold yields both classes with
        # a positive F2; later sections must cope with that.
        best_baseline_f2 = 0
        best_baseline_threshold = 0
        best_baseline_predictions = None

        for threshold in thresholds:
            y_pred = (baseline_scores >= threshold).astype(int)
            # Skip degenerate thresholds that predict a single class
            if len(np.unique(y_pred)) >= 2:
                f2 = fbeta_score(y_true, y_pred, beta=2, zero_division=0)
                if f2 > best_baseline_f2:
                    best_baseline_f2 = f2
                    best_baseline_threshold = threshold
                    best_baseline_predictions = y_pred

        approach_results[baseline_approach] = {
            'threshold': best_baseline_threshold,
            'f2_score': best_baseline_f2,
            'predictions': best_baseline_predictions,
            'is_baseline': True
        }

        print(f"BASELINE - {baseline_approach}:")
        print(f"  Optimal threshold: {best_baseline_threshold:.3f}")
        print(f"  F2 Score: {best_baseline_f2:.3f}")

        # Evaluate enhanced approaches (same threshold search as the baseline)
        for approach_name in enhanced_approaches:
            if approach_name in working_df.columns:
                approach_scores = working_df[approach_name].fillna(0).values
                thresholds = np.percentile(approach_scores[approach_scores > 0], np.linspace(0, 100, 50)) if np.any(approach_scores > 0) else [0]

                best_f2 = 0
                best_threshold = 0
                best_predictions = None
                best_metrics = {}

                for threshold in thresholds:
                    y_pred = (approach_scores >= threshold).astype(int)
                    if len(np.unique(y_pred)) >= 2:
                        f2 = fbeta_score(y_true, y_pred, beta=2, zero_division=0)
                        if f2 > best_f2:
                            best_f2 = f2
                            best_threshold = threshold
                            best_predictions = y_pred
                            best_metrics = {
                                'f1_score': f1_score(y_true, y_pred, zero_division=0),
                                'precision': precision_score(y_true, y_pred, zero_division=0),
                                'recall': recall_score(y_true, y_pred, zero_division=0),
                                'accuracy': accuracy_score(y_true, y_pred)
                            }

                # Approaches with no usable threshold are left out entirely
                if best_predictions is not None:
                    approach_results[approach_name] = {
                        'threshold': best_threshold,
                        'f2_score': best_f2,
                        'predictions': best_predictions,
                        'metrics': best_metrics,
                        'is_baseline': False
                    }

                    print(f"\nENHANCED - {approach_name} ({enhanced_approaches[approach_name]}):")
                    print(f"  Optimal threshold: {best_threshold:.3f}")
                    print(f"  F2 Score: {best_f2:.3f}")
                    print(f"  Precision: {best_metrics['precision']:.3f}, Recall: {best_metrics['recall']:.3f}")

        print(f"\n3. 30% IMPROVEMENT HYPOTHESIS TESTING")
        print("-" * 80)

        # Test 30% improvement hypothesis for each enhanced approach
        hypothesis_results = []

        for approach_name, results in approach_results.items():
            if not results['is_baseline']:
                hypothesis_test = test_30_percent_improvement_hypothesis(
                    best_baseline_f2, 
                    results['f2_score'], 
                    approach_name, 
                    baseline_approach
                )
                hypothesis_results.append(hypothesis_test)

                print(f"\nHYPOTHESIS TEST: {baseline_approach} vs {approach_name}")

                # With a baseline F2 of 0 the helper returns only the keys
                # 'error' and 'approach'; reading the numeric keys below
                # would raise KeyError, so report the problem and move on.
                if 'error' in hypothesis_test:
                    print(f"  Skipped: {hypothesis_test['error']} (baseline F2 = {best_baseline_f2:.4f})")
                    continue

                print(f"  Baseline F2: {hypothesis_test['baseline_score']:.4f}")
                print(f"  Enhanced F2: {hypothesis_test['enhanced_score']:.4f}")
                print(f"  Improvement: {hypothesis_test['improvement_pct']:+.2f}%")
                print(f"  Meets 30% threshold: {'✅ YES' if hypothesis_test['meets_30_percent_threshold'] else '❌ NO'}")
                print(f"  Margin above/below 30%: {hypothesis_test['improvement_above_threshold']:+.2f}%")
                print(f"  Effect size: {hypothesis_test['effect_size']:.3f} ({hypothesis_test['practical_significance']})")

        print(f"\n4. BOOTSTRAP CONFIDENCE INTERVALS AND STATISTICAL VALIDATION")
        print("-" * 80)

        # Bootstrap testing for every enhanced approach that has predictions
        bootstrap_results = []

        # Without baseline predictions there is nothing to resample against;
        # calling the bootstrap with None would raise TypeError.
        if best_baseline_predictions is None:
            print("\nBootstrap analysis skipped: baseline produced no usable predictions at any threshold")

        for approach_name, results in approach_results.items():
            if (results['is_baseline'] or results['predictions'] is None
                    or best_baseline_predictions is None):
                continue

            bootstrap_test = bootstrap_improvement_test_meta_judge(
                y_true, 
                best_baseline_predictions, 
                results['predictions'], 
                approach_name
            )
            bootstrap_results.append(bootstrap_test)

            print(f"\nBOOTSTRAP ANALYSIS: {approach_name}")
            print(f"  Mean improvement: {bootstrap_test['mean_improvement']:.2f}%")
            print(f"  95% CI: [{bootstrap_test['improvement_ci'][0]:.2f}%, {bootstrap_test['improvement_ci'][1]:.2f}%]")
            print(f"  95% Lower bound: {bootstrap_test['improvement_ci_lower_one_sided']:.2f}%")
            print(f"  CI exceeds 30% (two-sided): {'✅ YES' if bootstrap_test['exceeds_30_percent_ci'] else '❌ NO'}")
            print(f"  CI exceeds 30% (one-sided): {'✅ YES' if bootstrap_test['exceeds_30_percent_one_sided'] else '❌ NO'}")

            # The two-sided lower bound (2.5th percentile) can never exceed
            # the one-sided lower bound (5th percentile), so a separate
            # "two-sided only" verdict is impossible; only two outcomes exist.
            if bootstrap_test['exceeds_30_percent_one_sided']:
                print(f"  🎯 STATISTICALLY VALIDATED: 95% confident improvement > 30%")
            else:
                print(f"  ❌ NOT VALIDATED: Cannot confirm > 30% improvement with confidence")

        print(f"\n5. COMPREHENSIVE HYPOTHESIS VALIDATION SUMMARY")
        print("=" * 80)

        # Identify best performing enhanced approach (error results excluded)
        valid_enhanced_approaches = [r for r in hypothesis_results if 'error' not in r]

        if valid_enhanced_approaches:
            best_enhanced = max(valid_enhanced_approaches, key=lambda x: x['enhanced_score'])

            print(f"BEST ENHANCED APPROACH: {best_enhanced['approach']}")
            print(f"  Description: {enhanced_approaches[best_enhanced['approach']]}")
            print(f"  Improvement: {best_enhanced['improvement_pct']:+.2f}%")
            print(f"  F2 Score: {best_enhanced['baseline_score']:.4f} → {best_enhanced['enhanced_score']:.4f}")

            # Count approaches that meet the 30% threshold
            approaches_meeting_threshold = [r for r in valid_enhanced_approaches if r['meets_30_percent_threshold']]

            print(f"\nHYPOTHESIS VALIDATION RESULTS:")
            print(f"  Enhanced approaches tested: {len(valid_enhanced_approaches)}")
            print(f"  Approaches meeting ≥30% improvement: {len(approaches_meeting_threshold)}")

            if approaches_meeting_threshold:
                print(f"\n  ✅ APPROACHES MEETING 30% THRESHOLD:")
                for approach in approaches_meeting_threshold:
                    print(f"    - {approach['approach']}: {approach['improvement_pct']:+.2f}% improvement")

                # Check statistical validation (one-sided bootstrap bound)
                statistically_validated = []
                for bootstrap_result in bootstrap_results:
                    if bootstrap_result['exceeds_30_percent_one_sided']:
                        statistically_validated.append(bootstrap_result['approach'])

                if statistically_validated:
                    print(f"\n  🎯 STATISTICALLY VALIDATED APPROACHES:")
                    for approach in statistically_validated:
                        print(f"    - {approach}: Bootstrap CI confirms > 30% improvement")
                else:
                    print(f"\n  ⚠️  NO STATISTICALLY VALIDATED APPROACHES")
                    print(f"     Approaches meet 30% threshold but confidence intervals are inconclusive")
            else:
                print(f"\n  ❌ NO APPROACHES MEET 30% THRESHOLD")
                print(f"     Enhanced meta judge scoring does not achieve required improvement")

            print(f"\n6. NPD BUSINESS IMPACT ASSESSMENT")
            print("-" * 80)

            print(f"HALLUCINATION REDUCTION EFFECTIVENESS:")
            if best_enhanced['meets_30_percent_threshold']:
                print(f"  ✅ Meta judge refinement successfully reduces hallucinations")
                print(f"  ✅ Achieves {best_enhanced['improvement_pct']:.1f}% improvement in requirements traceability")
                print(f"  ✅ Practical significance: {best_enhanced['practical_significance']}")
            else:
                print(f"  ❌ Meta judge refinement shows improvement but below 30% threshold")
                print(f"  📊 Observed improvement: {best_enhanced['improvement_pct']:.1f}%")
                print(f"  📊 Additional refinement needed to reach 30% target")

            print(f"\nOPERATIONAL RECOMMENDATIONS:")
            if approaches_meeting_threshold:
                print(f"  1. Deploy enhanced meta judge scoring in NPD requirements analysis")
                print(f"  2. Prioritize {best_enhanced['approach']} approach for maximum accuracy gain")
                print(f"  3. Monitor hallucination reduction in production environment")
            else:
                print(f"  1. Continue refinement of meta judge scoring approaches")
                print(f"  2. Investigate additional hallucination reduction techniques")
                print(f"  3. Consider ensemble methods or alternative architectures")

        else:
            print(f"ERROR: No valid enhanced approaches found for comparison")

        # Visualization if enabled
        if valid_enhanced_approaches and CONFIG.get('SHOW_VISUALIZATION', False):
            print(f"\n7. VISUALIZATION")
            print("-" * 80)

            # Prepare data for visualization
            viz_data = []

            # Add baseline
            viz_data.append({
                'Approach': baseline_approach,
                'F2 Score': best_baseline_f2,
                'Type': 'Baseline',
                'Improvement %': 0.0
            })

            # Add enhanced approaches
            for result in valid_enhanced_approaches:
                viz_data.append({
                    'Approach': result['approach'],
                    'F2 Score': result['enhanced_score'],
                    'Type': 'Enhanced',
                    'Improvement %': result['improvement_pct']
                })

            viz_df = pd.DataFrame(viz_data)

            # Create visualization: F2 scores on the left, improvement on the right
            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

            # F2 Score comparison
            colors = ['lightblue' if t == 'Baseline' else 'lightcoral' for t in viz_df['Type']]
            bars1 = ax1.bar(range(len(viz_df)), viz_df['F2 Score'], color=colors)
            ax1.set_xticks(range(len(viz_df)))
            ax1.set_xticklabels(viz_df['Approach'], rotation=45, ha='right')
            ax1.set_ylabel('F2 Score')
            ax1.set_title('F2 Score Comparison: Baseline vs Enhanced Approaches')
            ax1.grid(axis='y', alpha=0.3)

            # Add 30% improvement threshold line (baseline F2 scaled by 1.3)
            if best_baseline_f2 > 0:
                threshold_line = best_baseline_f2 * 1.3
                ax1.axhline(y=threshold_line, color='red', linestyle='--', alpha=0.7, 
                           label=f'30% Improvement Threshold ({threshold_line:.3f})')
                ax1.legend()

            # Improvement percentage comparison
            enhanced_only = viz_df[viz_df['Type'] == 'Enhanced']
            colors2 = ['green' if imp >= 30 else 'orange' for imp in enhanced_only['Improvement %']]
            bars2 = ax2.bar(range(len(enhanced_only)), enhanced_only['Improvement %'], color=colors2)
            ax2.set_xticks(range(len(enhanced_only)))
            ax2.set_xticklabels(enhanced_only['Approach'], rotation=45, ha='right')
            ax2.set_ylabel('Improvement %')
            ax2.set_title('Improvement Percentage vs 30% Threshold')
            ax2.axhline(y=30, color='red', linestyle='--', alpha=0.7, label='30% Threshold')
            ax2.grid(axis='y', alpha=0.3)
            ax2.legend()

            plt.tight_layout()
            plt.show()

        print(f"\n" + "=" * 100)
        print(f"HYPOTHESIS TESTING COMPLETE")
        print(f"Meta judge scoring approaches evaluated against 30% improvement threshold")
        print(f"=" * 100)

else:
    print("\nERROR: Cannot proceed with hypothesis testing")
    print("Required data not available or ground truth column missing")
    print("\nPlease ensure you have:")
    print("  - Loaded meta judge data (meta_judge_df or combined_meta_df)")
    print("  - Loaded ground truth data (df_ground_truth)")
    print("  - Run previous cells to establish proper data relationships")