# LLM Meta Judge Requirements Traceability Analysis
**Statistical evaluation and comparison of LLM meta judge scoring methods with confidence intervals, threshold optimization, performance metrics, and Neo4j integration.**

In [None]:
# Cell [0] - Setup and Imports
# Purpose: Import all required libraries and configure environment settings
# Dependencies: pandas, numpy, neo4j, sklearn, matplotlib, seaborn, scipy, statsmodels
# Breadcrumbs: Setup -> Imports

import os
import logging
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from neo4j import GraphDatabase
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, fbeta_score,
    matthews_corrcoef, confusion_matrix, balanced_accuracy_score,
    cohen_kappa_score, roc_auc_score, precision_recall_curve, auc,
    confusion_matrix, classification_report
)
import json
from datetime import datetime

# Statistical testing imports
from scipy import stats
from scipy.stats import wilcoxon, friedmanchisquare, kruskal, mannwhitneyu
from statsmodels.stats.contingency_tables import mcnemar
from statsmodels.stats.multitest import multipletests
from sklearn.utils import resample

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Function to initialize environment variables and configuration
def initialize_environment():
    """Initialize environment variables and configuration settings.

    Loads variables through ``load_dotenv()`` and gathers the settings used by
    the notebook into one dictionary.

    ``CURRENT_MODEL`` is an indirection: its value is the *name* of another
    environment variable, and that second variable's value is stored under
    ``'CURRENT_MODEL'`` (the name itself is kept under ``'CURRENT_MODEL_KEY'``).
    When ``CURRENT_MODEL`` is unset or empty, ``'CURRENT_MODEL'`` is ``None``
    rather than the call failing with ``TypeError`` from ``os.getenv(None)``.

    Returns:
        dict: Configuration dictionary with all settings

    Raises:
        ValueError: If ``MIN_TRACEABILITY_THRESHOLD`` is set to a value that
            ``int()`` cannot parse.
    """
    # Load environment variables
    load_dotenv()

    # Name of the environment variable that holds the actual model identifier
    model_key = os.getenv('CURRENT_MODEL')
    # os.getenv() requires a str key, so only follow the indirection when a
    # key name is actually configured
    current_model = os.getenv(model_key) if model_key else None

    # Neo4j credentials from environment variables
    config = {
        'NEO4J_URI': os.getenv('NEO4J_URI'),
        'NEO4J_USER': os.getenv('NEO4J_USER'),
        'NEO4J_PASSWORD': os.getenv('NEO4J_PASSWORD'),
        'NEO4J_PROJECT_NAME': os.getenv('NEO4J_PROJECT_NAME'),

        # Get current model from environment
        'CURRENT_MODEL_KEY': model_key,
        'CURRENT_MODEL': current_model,

        # Get optimization metric from environment (defaults to F2)
        'OPTIMIZATION_METRIC': os.getenv('OPTIMIZATION_METRIC', 'F2').upper(),

        # Visualizations are shown only when the variable is literally "true"
        # (case-insensitive)
        'SHOW_VISUALIZATION': os.getenv('SHOW_VISUALIZATION', 'False').lower() == 'true',

        # Set minimum traceability threshold (defaults to 3)
        'MIN_TRACEABILITY_THRESHOLD': int(os.getenv('MIN_TRACEABILITY_THRESHOLD', '3'))
    }

    return config

# Only execute this code when run directly (not when imported)
if __name__ == "__main__":
    # Build the settings dictionary from the environment
    config = initialize_environment()

    # Publish every setting as a module-level name so later cells can use it
    NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD = (
        config['NEO4J_URI'],
        config['NEO4J_USER'],
        config['NEO4J_PASSWORD'],
    )
    NEO4J_PROJECT_NAME = config['NEO4J_PROJECT_NAME']
    CURRENT_MODEL_KEY, CURRENT_MODEL = (
        config['CURRENT_MODEL_KEY'],
        config['CURRENT_MODEL'],
    )
    OPTIMIZATION_METRIC, SHOW_VISUALIZATION, MIN_TRACEABILITY_THRESHOLD = (
        config['OPTIMIZATION_METRIC'],
        config['SHOW_VISUALIZATION'],
        config['MIN_TRACEABILITY_THRESHOLD'],
    )

    # Record the active configuration in the log
    logger.info(f"Using {OPTIMIZATION_METRIC} score for threshold application in traceability analysis")
    logger.info(f"Visualization display is set to: {SHOW_VISUALIZATION}")
    logger.info(f"Current model: {CURRENT_MODEL}")
    logger.info(f"Project name: {NEO4J_PROJECT_NAME}")
    logger.info(f"Minimum traceability threshold: {MIN_TRACEABILITY_THRESHOLD}")

    # Echo a short summary to the cell output, one setting per line
    print(
        f"Visualization setting: {'Enabled' if SHOW_VISUALIZATION else 'Disabled'}",
        f"Optimization metric: {OPTIMIZATION_METRIC}",
        f"Project: {NEO4J_PROJECT_NAME}",
        f"Model: {CURRENT_MODEL}",
        f"Minimum traceability threshold: {MIN_TRACEABILITY_THRESHOLD}",
        sep="\n",
    )

In [None]:
# Cell [1] - Neo4j Connection Setup
# Purpose: Create connection to Neo4j database containing traceability data
# Dependencies: neo4j, logging
# Breadcrumbs: Setup -> Database Connection

def create_neo4j_driver(uri=None, user=None, password=None):
    """
    Build a Neo4j driver, filling in any connection setting that was not passed.

    Each missing setting is taken from the module-level variable of the same
    name when one is defined (notebook run directly); otherwise it is read from
    the dictionary returned by initialize_environment(), which is loaded at
    most once per call.

    Parameters:
        uri (str, optional): Neo4j connection URI. Falls back to NEO4J_URI.
        user (str, optional): Neo4j username. Falls back to NEO4J_USER.
        password (str, optional): Neo4j password. Falls back to NEO4J_PASSWORD.

    Returns:
        The object returned by GraphDatabase.driver(uri, auth=(user, password)).

    Raises:
        Exception: Any error raised while resolving settings or creating the
            driver is logged and then re-raised unchanged.
    """
    env_settings = None  # environment config, loaded lazily and reused

    def _pick(explicit, setting_name):
        # Resolution order: explicit argument -> module global -> environment
        nonlocal env_settings
        if explicit is not None:
            return explicit
        module_scope = globals()
        if setting_name in module_scope:
            return module_scope[setting_name]
        if env_settings is None:
            env_settings = initialize_environment()
        return env_settings[setting_name]

    try:
        uri = _pick(uri, 'NEO4J_URI')
        user = _pick(user, 'NEO4J_USER')
        password = _pick(password, 'NEO4J_PASSWORD')

        neo4j_driver = GraphDatabase.driver(uri, auth=(user, password))
        logger.info("Successfully connected to Neo4j database")
        return neo4j_driver
    except Exception as e:
        # Log a short message plus the full traceback, then let the caller decide
        logger.error(f"Failed to connect to Neo4j: {str(e)}")
        logger.error("Exception details:", exc_info=True)
        raise

# Only create the driver when running the notebook directly
if __name__ == "__main__":
    # Create the Neo4j driver with no explicit arguments, so the connection
    # settings come from the module-level variables or the environment.
    # The resulting `driver` is a module-level name reused by the query cells.
    driver = create_neo4j_driver()

In [None]:
# Cell [2] - Query Meta Judge Links
# Purpose: Retrieve LLM_RESULT_META_JUDGE links from Neo4j
# Dependencies: neo4j, pandas, logging
# Breadcrumbs: Data Acquisition -> Meta Judge Links

def query_meta_judge_links(driver, project_name=None, model=None):
    """
    Fetch LLM_RESULT_META_JUDGE relationships for one project and model.

    Runs a Cypher query through ``driver`` and returns the matching links as a
    DataFrame. When the ``is_traceable`` column has object dtype its values are
    mapped to booleans, and the score columns are coerced to numeric
    (unparseable values become NaN). Any exception is logged and produces an
    empty DataFrame instead of propagating.

    Parameters:
        driver: Neo4j driver connection
        project_name (str, optional): Project name to query. If None, the
            module-level NEO4J_PROJECT_NAME is used when defined, otherwise the
            value from initialize_environment().
        model (str, optional): Model name to query. If None, the module-level
            CURRENT_MODEL is used when defined, otherwise the value from
            initialize_environment().

    Returns:
        pd.DataFrame: DataFrame containing meta judge links, or an empty
            DataFrame when nothing matched or an error occurred
    """
    def _text_to_bool(raw):
        # Missing values count as False; anything else is True only when its
        # string form is "true" (case-insensitive)
        if pd.isna(raw):
            return False
        return str(raw).lower() == 'true'

    try:
        env_settings = None  # environment config, loaded at most once

        # Resolve the project: explicit argument -> module global -> environment
        if project_name is None:
            if 'NEO4J_PROJECT_NAME' in globals():
                project_name = NEO4J_PROJECT_NAME
            else:
                env_settings = initialize_environment()
                project_name = env_settings['NEO4J_PROJECT_NAME']

        # Resolve the model the same way
        if model is None:
            if 'CURRENT_MODEL' in globals():
                model = CURRENT_MODEL
            else:
                if env_settings is None:
                    env_settings = initialize_environment()
                model = env_settings['CURRENT_MODEL']

        # Query for meta-judge links
        meta_judge_query = """
        MATCH (p:Project {name: $project_name})-[:CONTAINS]->(d:Document)-[:CONTAINS]->(source:Requirement)-[r:LLM_RESULT_META_JUDGE]->(target:Requirement)
        WHERE source.type = 'SOURCE' AND r.model = $model
        RETURN 
            p.name as project_name,
            source.id as source_id,
            target.id as target_id,
            r.is_traceable as is_traceable,
            r.judge_score as judge_score,
            r.semantic_alignment as semantic_alignment,
            r.non_functional_coverage as non_functional_coverage,
            r.final_score as final_score,
            r.actor_score as actor_score,
            r.functional_completeness as functional_completeness,
            r.model as model
        ORDER BY source.id, target.id
        """

        with driver.session() as session:
            try:
                # Project name and model are passed as query parameters
                query_result = session.run(
                    meta_judge_query,
                    project_name=project_name,
                    model=model
                )
                records = query_result.data()

                # Guard clause: nothing matched
                if not records:
                    logger.warning(f"No meta judge links found for project: {project_name} and model: {model}")
                    return pd.DataFrame()

                logger.info(f"Retrieved {len(records)} meta judge links")
                links = pd.DataFrame(records)

                # Normalise the flag only when it did not arrive as a native bool column
                if 'is_traceable' in links.columns and links['is_traceable'].dtype == 'object':
                    links['is_traceable'] = links['is_traceable'].map(_text_to_bool)

                # Coerce every score column that is present to a numeric dtype
                score_fields = (
                    'judge_score', 'semantic_alignment', 'non_functional_coverage',
                    'final_score', 'actor_score', 'functional_completeness',
                )
                for field in score_fields:
                    if field not in links.columns:
                        continue
                    links[field] = pd.to_numeric(links[field], errors='coerce')

                return links

            except Exception as e:
                logger.error(f"Error executing meta judge query: {str(e)}")
                logger.error("Exception details:", exc_info=True)
                return pd.DataFrame()

    except Exception as e:
        # Covers failures before the query runs (settings lookup, opening the session)
        logger.error(f"Error querying meta judge links: {str(e)}")
        logger.error("Exception details:", exc_info=True)
        return pd.DataFrame()

# Only execute this code when running the notebook directly
if __name__ == "__main__":
    # Execute the query and get results; with no explicit arguments the
    # project and model come from the module-level settings or the environment
    meta_judge_df = query_meta_judge_links(driver)

    # Display information about the retrieved data
    if not meta_judge_df.empty:
        print("\nMeta Judge Links for Project:", NEO4J_PROJECT_NAME)
        print("=" * 80)
        display(meta_judge_df.head())

        # Count unique source and target requirements
        unique_sources = meta_judge_df['source_id'].nunique()
        unique_targets = meta_judge_df['target_id'].nunique()
        print(f"\nMeta Judge Dataset Metrics:")
        print("-" * 50)
        print(f"Total meta judge links: {len(meta_judge_df)}")
        print(f"Unique source requirements: {unique_sources}")
        print(f"Unique target requirements: {unique_targets}")
        # Density = retrieved links / all possible source-target pairs
        print(f"Link density: {len(meta_judge_df) / (unique_sources * unique_targets):.4f}")

        # Display distribution of is_traceable
        if 'is_traceable' in meta_judge_df.columns:
            traceable_count = meta_judge_df['is_traceable'].sum()
            print(f"\nTraceability Distribution:")
            print(f"Traceable links: {traceable_count} ({traceable_count/len(meta_judge_df)*100:.2f}%)")
            print(f"Non-traceable links: {len(meta_judge_df) - traceable_count} ({(len(meta_judge_df) - traceable_count)/len(meta_judge_df)*100:.2f}%)")

        # Display score statistics
        score_columns = ['judge_score', 'semantic_alignment', 'non_functional_coverage',
                         'final_score', 'actor_score', 'functional_completeness']

        print("\nScore Statistics:")
        print("-" * 50)
        for col in score_columns:
            if col in meta_judge_df.columns:
                print(f"{col.replace('_', ' ').title()}:")
                # Deliberately not named `stats`: this code runs at module
                # level, and rebinding `stats` would replace the `scipy.stats`
                # module imported at the top of the notebook
                score_stats = meta_judge_df[col].describe()
                print(f"  Mean: {score_stats['mean']:.2f}")
                print(f"  Median: {score_stats['50%']:.2f}")
                print(f"  Min: {score_stats['min']:.2f}")
                print(f"  Max: {score_stats['max']:.2f}")
                print(f"  StdDev: {score_stats['std']:.2f}")
                print()
    else:
        # Nothing came back: list the usual causes and the values that were queried
        print("\nNo meta judge links found. Please check that:")
        print("  - The project name is correct")
        print("  - The model name matches what's in the database")
        print("  - LLM_RESULT_META_JUDGE relationships exist in the database")
        print("\nProject:", NEO4J_PROJECT_NAME)
        print("Model:", CURRENT_MODEL)

In [None]:
# Cell [0] - Setup and Imports
# Purpose: Import all required libraries and configure environment settings
# Dependencies: pandas, numpy, neo4j, sklearn, matplotlib, seaborn, scipy, statsmodels
# Breadcrumbs: Setup -> Imports
# notebooks/07_Meta_Judge_Analysis_Notebook.ipynb

import os
import logging
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from neo4j import GraphDatabase
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, fbeta_score,
    matthews_corrcoef, confusion_matrix, balanced_accuracy_score,
    cohen_kappa_score, roc_auc_score, precision_recall_curve, auc,
    confusion_matrix, classification_report
)
import json
from datetime import datetime

# Statistical testing imports
from scipy import stats
from scipy.stats import wilcoxon, friedmanchisquare, kruskal, mannwhitneyu
from statsmodels.stats.contingency_tables import mcnemar
from statsmodels.stats.multitest import multipletests
from sklearn.utils import resample

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Function to initialize environment variables and configuration
def initialize_environment():
    """Initialize environment variables and configuration settings.

    Loads variables through ``load_dotenv()`` and gathers the settings used by
    the notebook into one dictionary.

    ``CURRENT_MODEL`` is an indirection: its value is the *name* of another
    environment variable, and that second variable's value is stored under
    ``'CURRENT_MODEL'`` (the name itself is kept under ``'CURRENT_MODEL_KEY'``).
    When ``CURRENT_MODEL`` is unset or empty, ``'CURRENT_MODEL'`` is ``None``
    rather than the call failing with ``TypeError`` from ``os.getenv(None)``.

    Returns:
        dict: Configuration dictionary with all settings

    Raises:
        ValueError: If ``MIN_TRACEABILITY_THRESHOLD`` is set to a value that
            ``int()`` cannot parse.
    """
    # Load environment variables
    load_dotenv()

    # Name of the environment variable that holds the actual model identifier
    model_key = os.getenv('CURRENT_MODEL')
    # os.getenv() requires a str key, so only follow the indirection when a
    # key name is actually configured
    current_model = os.getenv(model_key) if model_key else None

    # Neo4j credentials from environment variables
    config = {
        'NEO4J_URI': os.getenv('NEO4J_URI'),
        'NEO4J_USER': os.getenv('NEO4J_USER'),
        'NEO4J_PASSWORD': os.getenv('NEO4J_PASSWORD'),
        'NEO4J_PROJECT_NAME': os.getenv('NEO4J_PROJECT_NAME'),

        # Get current model from environment
        'CURRENT_MODEL_KEY': model_key,
        'CURRENT_MODEL': current_model,

        # Get optimization metric from environment (defaults to F2)
        'OPTIMIZATION_METRIC': os.getenv('OPTIMIZATION_METRIC', 'F2').upper(),

        # Visualizations are shown only when the variable is literally "true"
        # (case-insensitive)
        'SHOW_VISUALIZATION': os.getenv('SHOW_VISUALIZATION', 'False').lower() == 'true',

        # Set minimum traceability threshold (defaults to 3)
        'MIN_TRACEABILITY_THRESHOLD': int(os.getenv('MIN_TRACEABILITY_THRESHOLD', '3'))
    }

    return config

# Only execute this code when run directly (not when imported)
if __name__ == "__main__":
    # Build the settings dictionary from the environment
    config = initialize_environment()

    # Publish every setting as a module-level name so later cells can use it
    NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD = (
        config['NEO4J_URI'],
        config['NEO4J_USER'],
        config['NEO4J_PASSWORD'],
    )
    NEO4J_PROJECT_NAME = config['NEO4J_PROJECT_NAME']
    CURRENT_MODEL_KEY, CURRENT_MODEL = (
        config['CURRENT_MODEL_KEY'],
        config['CURRENT_MODEL'],
    )
    OPTIMIZATION_METRIC, SHOW_VISUALIZATION, MIN_TRACEABILITY_THRESHOLD = (
        config['OPTIMIZATION_METRIC'],
        config['SHOW_VISUALIZATION'],
        config['MIN_TRACEABILITY_THRESHOLD'],
    )

    # Record the active configuration in the log
    logger.info(f"Using {OPTIMIZATION_METRIC} score for threshold application in traceability analysis")
    logger.info(f"Visualization display is set to: {SHOW_VISUALIZATION}")
    logger.info(f"Current model: {CURRENT_MODEL}")
    logger.info(f"Project name: {NEO4J_PROJECT_NAME}")
    logger.info(f"Minimum traceability threshold: {MIN_TRACEABILITY_THRESHOLD}")

    # Echo a short summary to the cell output, one setting per line
    print(
        f"Visualization setting: {'Enabled' if SHOW_VISUALIZATION else 'Disabled'}",
        f"Optimization metric: {OPTIMIZATION_METRIC}",
        f"Project: {NEO4J_PROJECT_NAME}",
        f"Model: {CURRENT_MODEL}",
        f"Minimum traceability threshold: {MIN_TRACEABILITY_THRESHOLD}",
        sep="\n",
    )

In [None]:
# Cell [3] - Query LLM Result Links
# Purpose: Retrieve LLM_RESULT links from Neo4j
# Dependencies: neo4j, pandas, logging
# Breadcrumbs: Data Acquisition -> LLM Result Links
# notebooks/07_Meta_Judge_Analysis_Notebook.ipynb

def query_llm_result_links(driver, project_name=None, model=None):
    """
    Fetch LLM_RESULT relationships for one project and model.

    Runs a Cypher query through ``driver`` and returns the matching links as a
    DataFrame. When the ``is_associated`` column has object dtype its values
    are mapped to booleans, and ``association_probability`` is coerced to
    numeric (unparseable values become NaN). Any exception is logged and
    produces an empty DataFrame instead of propagating.

    Parameters:
        driver: Neo4j driver connection
        project_name (str, optional): Project name to query. If None, the
            module-level NEO4J_PROJECT_NAME is used when defined, otherwise the
            value from initialize_environment().
        model (str, optional): Model name to query. If None, the module-level
            CURRENT_MODEL is used when defined, otherwise the value from
            initialize_environment().

    Returns:
        pd.DataFrame: DataFrame containing LLM result links, or an empty
            DataFrame when nothing matched or an error occurred
    """
    def _text_to_bool(raw):
        # Missing values count as False; anything else is True only when its
        # string form is "true" (case-insensitive)
        if pd.isna(raw):
            return False
        return str(raw).lower() == 'true'

    try:
        env_settings = None  # environment config, loaded at most once

        # Resolve the project: explicit argument -> module global -> environment
        if project_name is None:
            if 'NEO4J_PROJECT_NAME' in globals():
                project_name = NEO4J_PROJECT_NAME
            else:
                env_settings = initialize_environment()
                project_name = env_settings['NEO4J_PROJECT_NAME']

        # Resolve the model the same way
        if model is None:
            if 'CURRENT_MODEL' in globals():
                model = CURRENT_MODEL
            else:
                if env_settings is None:
                    env_settings = initialize_environment()
                model = env_settings['CURRENT_MODEL']

        # Query for LLM_RESULT links
        llm_query = """
        MATCH (p:Project {name: $project_name})-[:CONTAINS]->(d:Document)-[:CONTAINS]->(source:Requirement)-[r:LLM_RESULT]->(target:Requirement)
        WHERE source.type = 'SOURCE' AND r.model = $model
        RETURN 
            p.name as project_name,
            source.id as source_id,
            target.id as target_id,
            r.is_associated as is_associated,
            r.association_probability as association_probability,
            r.model as model
        ORDER BY source.id, target.id
        """

        with driver.session() as session:
            try:
                # Project name and model are passed as query parameters
                query_result = session.run(
                    llm_query,
                    project_name=project_name,
                    model=model
                )
                records = query_result.data()

                # Guard clause: nothing matched
                if not records:
                    logger.warning(f"No LLM result links found for project: {project_name} and model: {model}")
                    return pd.DataFrame()

                logger.info(f"Retrieved {len(records)} LLM result links")
                links = pd.DataFrame(records)
                available = links.columns

                # Normalise the flag only when it did not arrive as a native bool column
                if 'is_associated' in available and links['is_associated'].dtype == 'object':
                    links['is_associated'] = links['is_associated'].map(_text_to_bool)

                # Coerce the probability to a numeric dtype
                if 'association_probability' in available:
                    probabilities = links['association_probability']
                    links['association_probability'] = pd.to_numeric(probabilities, errors='coerce')

                return links

            except Exception as e:
                logger.error(f"Error executing LLM result query: {str(e)}")
                logger.error("Exception details:", exc_info=True)
                return pd.DataFrame()

    except Exception as e:
        # Covers failures before the query runs (settings lookup, opening the session)
        logger.error(f"Error querying LLM result links: {str(e)}")
        logger.error("Exception details:", exc_info=True)
        return pd.DataFrame()

# Only execute this code when running the notebook directly
if __name__ == "__main__":
    # Execute the query and get results; with no explicit arguments the
    # project and model come from the module-level settings or the environment
    llm_result_df = query_llm_result_links(driver)

    # Display information about the retrieved data
    if not llm_result_df.empty:
        print("\nLLM Result Links for Project:", NEO4J_PROJECT_NAME)
        print("=" * 80)
        display(llm_result_df.head())

        # Count unique source and target requirements
        unique_sources = llm_result_df['source_id'].nunique()
        unique_targets = llm_result_df['target_id'].nunique()
        print(f"\nLLM Result Dataset Metrics:")
        print("-" * 50)
        print(f"Total LLM result links: {len(llm_result_df)}")
        print(f"Unique source requirements: {unique_sources}")
        print(f"Unique target requirements: {unique_targets}")
        # Density = retrieved links / all possible source-target pairs
        print(f"Link density: {len(llm_result_df) / (unique_sources * unique_targets):.4f}")

        # Display distribution of is_associated
        if 'is_associated' in llm_result_df.columns:
            associated_count = llm_result_df['is_associated'].sum()
            print(f"\nAssociation Distribution:")
            print(f"Associated links: {associated_count} ({associated_count/len(llm_result_df)*100:.2f}%)")
            print(f"Non-associated links: {len(llm_result_df) - associated_count} ({(len(llm_result_df) - associated_count)/len(llm_result_df)*100:.2f}%)")

        # Display probability statistics
        if 'association_probability' in llm_result_df.columns:
            print("\nAssociation Probability Statistics:")
            print("-" * 50)
            # Deliberately not named `stats`: this code runs at module level,
            # and rebinding `stats` would replace the `scipy.stats` module
            # imported at the top of the notebook
            probability_stats = llm_result_df['association_probability'].describe()
            print(f"Mean: {probability_stats['mean']:.2f}")
            print(f"Median: {probability_stats['50%']:.2f}")
            print(f"Min: {probability_stats['min']:.2f}")
            print(f"Max: {probability_stats['max']:.2f}")
            print(f"StdDev: {probability_stats['std']:.2f}")
    else:
        # Nothing came back: list the usual causes and the values that were queried
        print("\nNo LLM result links found. Please check that:")
        print("  - The project name is correct")
        print("  - The model name matches what's in the database")
        print("  - LLM_RESULT relationships exist in the database")
        print("\nProject:", NEO4J_PROJECT_NAME)
        print("Model:", CURRENT_MODEL)

In [None]:
# Cell [4] - Query Ground Truth Links
# Purpose: Retrieve ground truth traceability links from Neo4j
# Dependencies: neo4j, pandas, logging
# Breadcrumbs: Data Acquisition -> Ground Truth
# notebooks/07_Meta_Judge_Analysis_Notebook.ipynb

def query_ground_truth_links(driver, project_name=None):
    """
    Fetch GROUND_TRUTH relationships for one project as a DataFrame.

    Runs a Cypher query through ``driver``; every returned row carries a
    constant ``ground_truth`` value of 1. Any exception is logged and produces
    an empty DataFrame instead of propagating.

    Parameters:
        driver: Neo4j driver connection
        project_name (str, optional): Project name to query. If None, the
            module-level NEO4J_PROJECT_NAME is used when defined, otherwise the
            value from initialize_environment().

    Returns:
        pd.DataFrame: DataFrame containing ground truth links, or an empty
            DataFrame when nothing matched or an error occurred
    """
    try:
        # Resolve the project: explicit argument -> module global -> environment
        if project_name is None:
            if 'NEO4J_PROJECT_NAME' not in globals():
                env_settings = initialize_environment()
                project_name = env_settings['NEO4J_PROJECT_NAME']
            else:
                project_name = NEO4J_PROJECT_NAME

        # Query for ground truth links
        ground_truth_query = """
        MATCH (p:Project {name: $project_name})-[:CONTAINS]->(d:Document)-[:CONTAINS]->(source:Requirement)-[r:GROUND_TRUTH]->(target:Requirement)
        RETURN 
            p.name as project_name,
            p.description as project_description,
            d.id as document_id,
            source.id as source_id,
            source.type as source_type,
            target.id as target_id,
            target.type as target_type,
            1 as ground_truth
        ORDER BY source.id, target.id DESC
        """

        with driver.session() as session:
            try:
                # The project name is passed as a query parameter
                query_result = session.run(ground_truth_query, project_name=project_name)
                records = query_result.data()

                # Guard clause: nothing matched
                if not records:
                    logger.warning(f"No ground truth links found for project: {project_name}")
                    return pd.DataFrame()

                logger.info(f"Retrieved {len(records)} ground truth links using GROUND_TRUTH relationship")
                return pd.DataFrame(records)

            except Exception as e:
                logger.error(f"Error executing ground truth query: {str(e)}")
                logger.error("Exception details:", exc_info=True)
                return pd.DataFrame()

    except Exception as e:
        # Covers failures before the query runs (settings lookup, opening the session)
        logger.error(f"Error querying ground truth links: {str(e)}")
        logger.error("Exception details:", exc_info=True)
        return pd.DataFrame()

# Only execute the query and display results when running the notebook directly
if __name__ == "__main__":
    # Pull the ground truth links for the configured project
    df_ground_truth = query_ground_truth_links(driver)

    if df_ground_truth.empty:
        # No reference links came back; report it and let the notebook carry on
        print("\nNo ground truth links found for", NEO4J_PROJECT_NAME)
        print("Continuing analysis without ground truth data.")
    else:
        # Preview the first rows of the retrieved links
        print("\nGround Truth Links for Project:", NEO4J_PROJECT_NAME)
        print("=" * 80)
        display(df_ground_truth.head())

        # Distinct requirements on each side of the links
        unique_sources = df_ground_truth['source_id'].nunique()
        unique_targets = df_ground_truth['target_id'].nunique()

        print(f"\nGround Truth Dataset Metrics:")
        print("-" * 50)
        print(f"Total ground truth links: {len(df_ground_truth)}")
        print(f"Unique source requirements: {unique_sources}")
        print(f"Unique target requirements: {unique_targets}")
        # Density = retrieved links / all possible source-target pairs
        print(f"Link density: {len(df_ground_truth) / (unique_sources * unique_targets):.4f}")

In [None]:
# Cell [5] - Create combined dataset for analysis
# Purpose: Merge meta judge, LLM result, and ground truth data for analysis
# Dependencies: pandas, logging
# Breadcrumbs: Data Preparation -> Combination
# notebooks/07_Meta_Judge_Analysis_Notebook.ipynb

def create_combined_dataset(meta_judge_df=None, llm_result_df=None, df_ground_truth=None, min_traceability_threshold=None):
    """
    Create a combined dataset with meta judge, LLM result, and ground truth information
    
    Parameters:
        meta_judge_df (pd.DataFrame, optional): DataFrame with meta judge data
        llm_result_df (pd.DataFrame, optional): DataFrame with LLM result data
        df_ground_truth (pd.DataFrame, optional): DataFrame with ground truth data
        min_traceability_threshold (int, optional): Threshold for traceability. If None, uses environment value.
    
    Returns:
        pd.DataFrame: Combined dataset with all relevant information
    """
    try:
        # Check if we have meta judge data
        if meta_judge_df is None or meta_judge_df.empty:
            if 'meta_judge_df' in globals() and not globals()['meta_judge_df'].empty:
                meta_judge_df = globals()['meta_judge_df']
            else:
                logger.error("No meta judge data available to create combined dataset")
                return pd.DataFrame()
        
        # Get min_traceability_threshold if not provided
        if min_traceability_threshold is None:
            if 'MIN_TRACEABILITY_THRESHOLD' in globals():
                min_traceability_threshold = globals()['MIN_TRACEABILITY_THRESHOLD']
            else:
                # Load from environment if being imported
                config = initialize_environment()
                min_traceability_threshold = config['MIN_TRACEABILITY_THRESHOLD']
        
        # Make sure we have ground truth data
        if df_ground_truth is None or df_ground_truth.empty:
            if 'df_ground_truth' in globals() and not globals()['df_ground_truth'].empty:
                df_ground_truth = globals()['df_ground_truth']
            else:
                logger.warning("No ground truth data available for filtering relevant requirements")
                df_ground_truth = pd.DataFrame()
        
        # Filter for relevant source and target requirements if ground truth is available
        if not df_ground_truth.empty:
            # Extract unique source and target IDs with ground truth links
            valid_source_ids = df_ground_truth['source_id'].unique()
            valid_target_ids = df_ground_truth['target_id'].unique()
            
            # Filter meta_judge_df to only include valid source and target requirements
            filtered_meta_judge_df = meta_judge_df[
                (meta_judge_df['source_id'].isin(valid_source_ids)) & 
                (meta_judge_df['target_id'].isin(valid_target_ids))
            ].copy()
            
            logger.info(f"Filtered meta judge data from {len(meta_judge_df)} to {len(filtered_meta_judge_df)} rows")
            logger.info(f"Using {len(valid_source_ids)} valid source requirements and {len(valid_target_ids)} valid target requirements")
            
            # Use the filtered dataframe
            combined_df = filtered_meta_judge_df
        else:
            # No ground truth available, use all meta judge data
            combined_df = meta_judge_df.copy()
            logger.warning("No ground truth data available for filtering. Using all meta judge data.")
            
        # Add LLM result information if available
        if llm_result_df is not None and not llm_result_df.empty:
            # Create key for merging
            combined_df['merge_key'] = combined_df['source_id'] + '_' + combined_df['target_id']
            llm_temp = llm_result_df.copy()
            llm_temp['merge_key'] = llm_temp['source_id'] + '_' + llm_temp['target_id']
            
            # Select columns to merge from LLM results
            llm_cols = ['merge_key', 'is_associated', 'association_probability', 'explanation']
            llm_cols = [c for c in llm_cols if c in llm_temp.columns]
            
            # Merge dataframes
            combined_df = combined_df.merge(
                llm_temp[llm_cols],
                on='merge_key',
                how='left'
            )
            
            # Remove merge key column
            combined_df.drop('merge_key', axis=1, inplace=True)
            
            logger.info(f"Added LLM result data to combined dataset")
        else:
            if 'llm_result_df' in globals() and not globals()['llm_result_df'].empty:
                # Use global if available
                llm_temp = globals()['llm_result_df'].copy()
                
                # Create key for merging
                combined_df['merge_key'] = combined_df['source_id'] + '_' + combined_df['target_id']
                llm_temp['merge_key'] = llm_temp['source_id'] + '_' + llm_temp['target_id']
                
                # Select columns to merge from LLM results
                llm_cols = ['merge_key', 'is_associated', 'association_probability', 'explanation']
                llm_cols = [c for c in llm_cols if c in llm_temp.columns]
                
                # Merge dataframes
                combined_df = combined_df.merge(
                    llm_temp[llm_cols],
                    on='merge_key',
                    how='left'
                )
                
                # Remove merge key column
                combined_df.drop('merge_key', axis=1, inplace=True)
                
                logger.info(f"Added LLM result data to combined dataset from global variable")
            else:
                logger.warning("No LLM result data available for combined dataset")
            
        # Add ground truth information if available
        if not df_ground_truth.empty:
            # Create a set of ground truth links for fast lookup
            ground_truth_pairs = set(zip(df_ground_truth['source_id'], df_ground_truth['target_id']))
            
            # Add ground_truth_traceable column
            combined_df['ground_truth_traceable'] = combined_df.apply(
                lambda row: (row['source_id'], row['target_id']) in ground_truth_pairs,
                axis=1
            )
            
            logger.info(f"Added ground truth data: {combined_df['ground_truth_traceable'].sum()} true links out of {len(combined_df)} total")
        else:
            logger.warning("No ground truth data available for combined dataset")
        
        # Add derived columns for analysis
        if 'judge_score' in combined_df.columns and 'actor_score' in combined_df.columns:
            # Original total_score: judge_score + actor_score
            combined_df['total_score'] = combined_df['judge_score'] + combined_df['actor_score']
            
            # Add alternative total scores for comparison
            if 'final_score' in combined_df.columns:
                # Alternative 1: actor_score + final_score
                combined_df['total_score_with_final'] = combined_df['actor_score'] + combined_df['final_score']
                
                # Alternative 2: actor_score + judge_score + final_score
                combined_df['total_score_all'] = combined_df['actor_score'] + combined_df['judge_score'] + combined_df['final_score']
            
            # Create a new column for different threshold mechanisms
            combined_df['is_traceable_threshold'] = combined_df['total_score'] >= min_traceability_threshold
            
            # Also create threshold columns for alternative total scores if they exist
            if 'total_score_with_final' in combined_df.columns:
                combined_df['is_traceable_threshold_with_final'] = combined_df['total_score_with_final'] >= min_traceability_threshold
            
            if 'total_score_all' in combined_df.columns:
                combined_df['is_traceable_threshold_all'] = combined_df['total_score_all'] >= min_traceability_threshold * 1.5  # Adjust threshold for 3 scores
            
            logger.info(f"Added derived total_score columns and is_traceable_threshold columns using threshold {min_traceability_threshold}")
            
        return combined_df
    
    except Exception as e:
        logger.error(f"Error creating combined dataset: {str(e)}")
        logger.error("Exception details:", exc_info=True)
        return pd.DataFrame()

In [None]:
# Cell [6] - Model evaluation and threshold optimization with statistical analysis
# Purpose: Evaluate meta judge models and find optimal thresholds with confidence intervals
# Dependencies: pandas, numpy, matplotlib, seaborn, python-dotenv, sklearn.metrics, sklearn.utils
# Breadcrumbs: Analysis -> Evaluation -> Threshold Optimization -> Statistical Analysis
# notebooks/07_Meta_Judge_Analysis_Notebook.ipynb

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dotenv import load_dotenv
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, fbeta_score,
    matthews_corrcoef, confusion_matrix, balanced_accuracy_score
)
from sklearn.utils import resample

def get_actual_model_name(model_key):
    """
    Resolve a model key to the model name it stands for.

    The key is looked up in the process environment: when an environment
    variable with that name exists, its value is returned; otherwise the
    key itself is returned unchanged.
    """
    # Single lookup with the key as its own fallback.
    return os.environ.get(model_key, model_key)

def bootstrap_confidence_interval(y_true, y_pred, metric_func, n_iterations=1000,
                                  confidence_level=0.95, random_state=None):
    """
    Calculate a percentile bootstrap confidence interval for a metric

    The paired (y_true, y_pred) samples are resampled with replacement
    n_iterations times. The metric is evaluated on every resample and the
    interval bounds are the empirical percentiles of those values.

    Parameters:
        y_true: True labels (1-D array-like: list, numpy array or pandas Series)
        y_pred: Predicted labels, same length as y_true
        metric_func: Callable metric_func(y_true, y_pred) returning a number
        n_iterations: Number of bootstrap iterations
        confidence_level: Confidence level (default 0.95 for 95% CI)
        random_state: None to draw from NumPy's global random state, an int
            seed, or a numpy.random.RandomState instance for reproducible
            intervals

    Returns:
        dict: Contains mean, std, ci_lower, ci_upper. All four are 0 when the
        input is empty or the metric failed on every resample.

    Raises:
        ValueError: If y_true and y_pred have different lengths
    """
    # Work on positional numpy arrays so lists and pandas Series (whose []
    # is label-based) are resampled by position.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n_samples = len(y_true)

    if len(y_pred) != n_samples:
        raise ValueError(
            f"y_true and y_pred must have the same length "
            f"(got {n_samples} and {len(y_pred)})"
        )

    if n_samples == 0:
        return {'mean': 0, 'std': 0, 'ci_lower': 0, 'ci_upper': 0}

    # With random_state=None the indices come from NumPy's global RandomState,
    # so a prior np.random.seed(...) call still controls the draws.
    if random_state is None:
        draw_indices = np.random.randint
    elif isinstance(random_state, np.random.RandomState):
        draw_indices = random_state.randint
    else:
        draw_indices = np.random.RandomState(random_state).randint

    bootstrapped_scores = []

    for _ in range(n_iterations):
        # Resample with replacement
        indices = draw_indices(0, n_samples, size=n_samples)

        # Calculate metric
        try:
            score = metric_func(y_true[indices], y_pred[indices])
        except Exception:
            # Skip if metric calculation fails (e.g., no positive samples).
            # Deliberately not a bare except: KeyboardInterrupt must propagate.
            continue
        bootstrapped_scores.append(score)

    if not bootstrapped_scores:
        return {'mean': 0, 'std': 0, 'ci_lower': 0, 'ci_upper': 0}

    # Calculate confidence interval
    alpha = 1 - confidence_level
    lower = np.percentile(bootstrapped_scores, (alpha / 2) * 100)
    upper = np.percentile(bootstrapped_scores, (1 - alpha / 2) * 100)

    return {
        'mean': np.mean(bootstrapped_scores),
        'std': np.std(bootstrapped_scores),
        'ci_lower': lower,
        'ci_upper': upper
    }

def evaluate_model_thresholds(df, model_name, score_column='total_score', 
                             ground_truth_column='ground_truth_traceable', 
                             optimize_for='F2', calculate_ci=True):
    """
    Evaluate a model's performance across different thresholds with confidence intervals

    Rows of df whose 'model' column equals model_name are selected. Each
    candidate threshold turns score_column into binary predictions
    (score >= threshold) that are compared with the ground truth; the
    threshold with the highest F1 or F2 score is reported as the best one.
    Thresholds that put every row in the same predicted class are skipped.

    Parameters:
        df: DataFrame containing model predictions and ground truth
        model_name: Name of the model to evaluate
        score_column: Column containing score values
        ground_truth_column: Column containing ground truth values
        optimize_for: Metric to optimize for ('F1' or 'F2'); any value other
            than 'F1' selects F2
        calculate_ci: Whether to calculate confidence intervals

    Returns:
        dict: Dictionary containing evaluation results with confidence intervals.
            - {} when the model has no rows or the ground truth column is missing
            - only model_name / data_points / ground_truth_* counts when the
              ground truth contains a single class
            - the counts plus 'error' when no threshold was usable
            - model_name, data_points and 'error' when an exception occurred
            - otherwise the best_* metrics, 'threshold_results' (DataFrame,
              one row per evaluated threshold) and 'confidence_intervals'
    """
    # Reported by the error handler; stays 0 if the model filter itself fails.
    data_points = 0
    try:
        # Filter data for this model
        model_df = df[df['model'] == model_name].copy()
        data_points = len(model_df)
        
        if model_df.empty:
            print(f"No data available for model: {model_name}")
            return {}
            
        if ground_truth_column not in model_df.columns:
            print(f"Ground truth column '{ground_truth_column}' not found for model: {model_name}")
            return {}
        
        # Get ground truth and scores
        y_true = model_df[ground_truth_column].astype(int).values
        
        # Check for and handle None/NaN values in score column
        if model_df[score_column].isna().any():
            print(f"Found NaN values in {score_column} for model {model_name}. Filling with 0.")
            model_df[score_column] = model_df[score_column].fillna(0)
        
        # Ensure scores are numeric
        if model_df[score_column].dtype == object:
            try:
                # Coerce value by value: one unparseable entry must not wipe
                # out every other score in the column.
                numeric_scores = pd.to_numeric(model_df[score_column], errors='coerce')
                unparseable = int(numeric_scores.isna().sum())
                if unparseable:
                    print(f"Could not convert {unparseable} value(s) in {score_column} to numeric "
                          f"for model {model_name}. Using 0 for those rows.")
                model_df[score_column] = numeric_scores.fillna(0)
                print(f"Converted {score_column} to numeric for model {model_name}")
            except Exception as e:
                print(f"Error converting {score_column} to numeric: {str(e)}")
                # Default to zeros if conversion fails
                model_df[score_column] = 0
        
        scores = model_df[score_column].values
        
        n_total = len(y_true)
        n_positive = int(y_true.sum())
        n_negative = n_total - n_positive
        
        # Counts shared by every non-empty return value below
        summary = {
            'model_name': model_name,
            'data_points': len(model_df),
            'ground_truth_positive': n_positive,
            'ground_truth_negative': n_negative
        }
        
        # Debug information
        print(f"  - Total data points: {len(model_df)}")
        print(f"  - Positive examples: {n_positive} ({n_positive/n_total*100:.2f}%)")
        print(f"  - Negative examples: {n_negative} ({n_negative/n_total*100:.2f}%)")
        print(f"  - Score range: {scores.min():.4f} to {scores.max():.4f}")
        
        # If all ground truth values are the same, we can't calculate meaningful metrics
        if len(np.unique(y_true)) < 2:
            print(f"Insufficient ground truth variety for model {model_name} - all values are {np.unique(y_true)[0]}")
            return summary
        
        # Candidate thresholds: every observed score plus 20 evenly spaced
        # values over the score range. np.unique sorts and removes the
        # duplicates (the linspace end points repeat min and max) so no
        # threshold is evaluated twice.
        thresholds = np.unique(np.concatenate([
            scores,
            np.linspace(scores.min(), scores.max(), 20)
        ]))
        
        # Calculate metrics for each threshold
        results = []
        
        for threshold in thresholds:
            # Convert scores to binary predictions using this threshold
            y_pred = (scores >= threshold).astype(int)
            
            # Only calculate if we have at least one prediction of each class
            if np.unique(y_pred).size < 2:
                continue
                
            # Confusion matrix components
            tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
            
            results.append({
                'threshold': threshold,
                'tp': tp,
                'fp': fp,
                'fn': fn,
                'tn': tn,
                'accuracy': accuracy_score(y_true, y_pred),
                'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
                # Guard the ratios against empty denominators
                'precision': tp / (tp + fp) if (tp + fp) > 0 else 0,
                'recall': tp / (tp + fn) if (tp + fn) > 0 else 0,
                'tnr': tn / (tn + fp) if (tn + fp) > 0 else 0,  # specificity
                'fnr': fn / (fn + tp) if (fn + tp) > 0 else 0,  # miss rate
                'f1_score': f1_score(y_true, y_pred, zero_division=0),
                'f2_score': fbeta_score(y_true, y_pred, beta=2, zero_division=0),
                'mcc': matthews_corrcoef(y_true, y_pred)  # Matthews Correlation Coefficient
            })
        
        # Convert to DataFrame for easier analysis
        results_df = pd.DataFrame(results)
        
        if results_df.empty:
            print(f"No valid thresholds found for model {model_name}")
            return {**summary, 'error': "No valid thresholds found with current data"}
        
        # Find best threshold based on optimization metric (first row wins ties)
        metric_column = 'f1_score' if optimize_for == 'F1' else 'f2_score'
        best_result = results_df.loc[results_df[metric_column].idxmax()]
        
        # Calculate confidence intervals for the best threshold if requested
        confidence_intervals = {}
        if calculate_ci:
            best_y_pred = (scores >= best_result['threshold']).astype(int)
            
            print(f"  Calculating confidence intervals...")
            
            # Metric functions to bootstrap, keyed by their name in the result
            ci_metrics = {
                'precision': lambda y_t, y_p: precision_score(y_t, y_p, zero_division=0),
                'recall': lambda y_t, y_p: recall_score(y_t, y_p, zero_division=0),
                'f1': lambda y_t, y_p: f1_score(y_t, y_p, zero_division=0),
                'f2': lambda y_t, y_p: fbeta_score(y_t, y_p, beta=2, zero_division=0),
                'accuracy': accuracy_score,
                'mcc': matthews_corrcoef
            }
            for ci_name, ci_func in ci_metrics.items():
                confidence_intervals[ci_name] = bootstrap_confidence_interval(
                    y_true, best_y_pred, ci_func
                )
        
        # Return comprehensive results
        return {
            **summary,
            'best_threshold': best_result['threshold'],
            'best_precision': best_result['precision'],
            'best_recall': best_result['recall'],
            'best_accuracy': best_result['accuracy'],
            'best_balanced_accuracy': best_result['balanced_accuracy'],
            'best_f1': best_result['f1_score'],
            'best_f2': best_result['f2_score'],
            'best_tnr': best_result['tnr'],
            'best_fnr': best_result['fnr'],
            'best_mcc': best_result['mcc'],
            # The row lookup upcasts the counts to float; report them as ints
            'best_tp': int(best_result['tp']),
            'best_fp': int(best_result['fp']),
            'best_fn': int(best_result['fn']),
            'best_tn': int(best_result['tn']),
            'optimization_metric': optimize_for,
            'threshold_results': results_df,
            'confidence_intervals': confidence_intervals
        }
    except Exception as e:
        print(f"Error evaluating model {model_name}: {str(e)}")
        import traceback
        traceback.print_exc()
        return {
            'model_name': model_name,
            'data_points': data_points,
            'error': str(e)
        }

# This will be executed when the cell is run
# Load environment variables for model name resolution
load_dotenv()

# CURRENT_MODEL holds either the model name itself or the name of another
# environment variable that holds it; get_actual_model_name handles both.
current_model_var = os.environ.get('CURRENT_MODEL', '')
actual_model_name = get_actual_model_name(current_model_var)

# Update the global variable if it exists
if 'CURRENT_MODEL_KEY' in globals():
    CURRENT_MODEL_KEY = actual_model_name

# Initialize best_thresholds_df as an empty DataFrame (will be filled later)
best_thresholds_df = pd.DataFrame()

# First, make sure combined_df exists by creating it if necessary
if 'combined_df' not in globals() or globals()['combined_df'] is None or globals()['combined_df'].empty:
    print("Creating combined dataset from previous data...")
    try:
        # Check if we have the required data frames from previous cells
        if all(var in globals() for var in ['meta_judge_df', 'llm_result_df', 'df_ground_truth']):
            # Create combined_df using the create_combined_dataset function from cell 5
            if 'create_combined_dataset' in globals():
                combined_df = create_combined_dataset(
                    meta_judge_df=globals()['meta_judge_df'],
                    llm_result_df=globals()['llm_result_df'],
                    df_ground_truth=globals()['df_ground_truth']
                )
                print(f"Successfully created combined dataset with {len(combined_df)} rows")
            else:
                print("create_combined_dataset function not found. Please run cell 5 first.")
                combined_df = pd.DataFrame()
        else:
            print("Missing required data. Please run cells 0-5 first.")
            # Create empty DataFrame to prevent errors
            combined_df = pd.DataFrame()
    except Exception as e:
        print(f"Error creating combined dataset: {str(e)}")
        import traceback
        traceback.print_exc()
        # Create empty DataFrame to prevent errors
        combined_df = pd.DataFrame()

# Only proceed with evaluation if we have data
if not combined_df.empty and 'ground_truth_traceable' in combined_df.columns and 'model' in combined_df.columns:
    # Get list of all models
    all_models = combined_df['model'].unique()
    
    # Keep settings made by earlier cells; fall back to defaults otherwise
    OPTIMIZATION_METRIC = globals().get('OPTIMIZATION_METRIC', 'F2')
    SHOW_VISUALIZATION = globals().get('SHOW_VISUALIZATION', True)
    
    # Result key used for ranking AND reporting, so both always agree
    # (anything other than 'F1' means F2, as in evaluate_model_thresholds)
    best_metric_key = 'best_f1' if OPTIMIZATION_METRIC == 'F1' else 'best_f2'
    
    # Evaluate each model
    evaluation_results = []
    
    print(f"\nEvaluating {len(all_models)} models using meta judge data")
    print(f"Optimizing for {OPTIMIZATION_METRIC} score")
    print("=" * 80)
    
    # Define score columns to evaluate - including the new score columns
    score_columns_to_evaluate = [
        'is_traceable',         # Boolean indicator
        'actor_score',          # Individual score
        'judge_score',          # Individual score
        'final_score',          # Individual score
        'total_score',          # judge_score + actor_score
        'total_score_with_final',  # actor_score + final_score
        'total_score_all'       # actor_score + judge_score + final_score
    ]
    
    # Add a new total score column that combines all the requested metrics
    combined_score_components = [
        'judge_score', 'semantic_alignment', 'non_functional_coverage',
        'final_score', 'actor_score', 'functional_completeness'
    ]
    try:
        # reindex adds any absent component as an all-NaN column; missing,
        # None and NaN values then all count as 0. (A row-wise `x or 0`
        # would let NaN through, because NaN is truthy, and turn the whole
        # row sum into NaN.)
        component_values = (
            combined_df.reindex(columns=combined_score_components)
            .apply(pd.to_numeric, errors='coerce')
            .fillna(0)
        )
        combined_df['total_combined_score'] = component_values.sum(axis=1)
        score_columns_to_evaluate.append('total_combined_score')
        print(f"Created new total_combined_score column combining all requested metrics")
    except Exception as e:
        print(f"Error creating total_combined_score: {str(e)}")
        import traceback
        traceback.print_exc()
    
    # Filter to only columns that exist
    score_columns_to_evaluate = [
        col for col in score_columns_to_evaluate 
        if col in combined_df.columns and not combined_df[col].isna().all()
    ]
    
    for model in all_models:
        print(f"Evaluating model: {model}")
        
        # For each model, evaluate using different score columns
        model_results = []
        for score_column in score_columns_to_evaluate:
            print(f"  Evaluating using {score_column}:")
            result = evaluate_model_thresholds(
                combined_df, 
                model, 
                score_column=score_column,
                optimize_for=OPTIMIZATION_METRIC,
                calculate_ci=True  # Enable confidence interval calculation
            )
            
            if result and 'best_threshold' in result:
                # Add score column to result
                result['score_column'] = score_column
                model_results.append(result)
                
                # Print key metrics with confidence intervals
                print(f"    - Best threshold: {result['best_threshold']:.3f}")
                
                if 'confidence_intervals' in result and result['confidence_intervals']:
                    ci = result['confidence_intervals']
                    
                    # Print metrics with CIs
                    for metric in ['precision', 'recall', 'f1', 'f2']:
                        if metric in ci:
                            metric_ci = ci[metric]
                            print(f"    - {metric.capitalize()}: {result[f'best_{metric}']:.3f} "
                                  f"(95% CI: [{metric_ci['ci_lower']:.3f}, {metric_ci['ci_upper']:.3f}])")
                    
                    if 'mcc' in ci:
                        mcc_ci = ci['mcc']
                        print(f"    - MCC: {result['best_mcc']:.3f} "
                              f"(95% CI: [{mcc_ci['ci_lower']:.3f}, {mcc_ci['ci_upper']:.3f}])")
                else:
                    # Print without CIs if not calculated
                    print(f"    - Precision: {result['best_precision']:.3f}")
                    print(f"    - Recall: {result['best_recall']:.3f}")
                    print(f"    - F1: {result['best_f1']:.3f}")
                    print(f"    - F2: {result['best_f2']:.3f}")
                    print(f"    - MCC: {result['best_mcc']:.3f}")
        
        # Find best score column for this model based on optimization metric
        if model_results:
            # Sort by the chosen optimization metric, best first
            model_results.sort(key=lambda x: x[best_metric_key], reverse=True)
                
            best_result = model_results[0]
            print(f"  Best performing score for {model}: {best_result['score_column']}")
            print(f"    - {OPTIMIZATION_METRIC} Score: {best_result[best_metric_key]:.3f}")
            
            evaluation_results.extend(model_results)
    
    # Create DataFrame of best thresholds with all metrics
    if evaluation_results:
        threshold_rows = []
        for r in evaluation_results:
            if 'best_threshold' not in r:
                continue
            threshold_row = {
                'model_name': r['model_name'],
                'score_column': r['score_column'],
                'best_threshold': r['best_threshold'],
                'accuracy': r['best_accuracy'],
                'balanced_accuracy': r['best_balanced_accuracy'],
                'precision': r['best_precision'],
                'recall': r['best_recall'],
                'specificity': r['best_tnr'],
                'miss_rate': r['best_fnr'],
                'f1_score': r['best_f1'],
                'f2_score': r['best_f2'],
                'matthews_corr': r['best_mcc'],
                'true_positives': r['best_tp'],
                'false_positives': r['best_fp'],
                'false_negatives': r['best_fn'],
                'true_negatives': r['best_tn'],
                'data_points': r['data_points'],
                'ground_truth_positive': r['ground_truth_positive'],
                'ground_truth_negative': r['ground_truth_negative'],
            }
            # Add confidence interval data (None where no interval was computed)
            result_cis = r.get('confidence_intervals') or {}
            for ci_metric in ('precision', 'recall', 'f1', 'f2'):
                ci_bounds = result_cis.get(ci_metric, {})
                threshold_row[f'{ci_metric}_ci_lower'] = ci_bounds.get('ci_lower')
                threshold_row[f'{ci_metric}_ci_upper'] = ci_bounds.get('ci_upper')
            threshold_rows.append(threshold_row)
        
        best_thresholds_df = pd.DataFrame(threshold_rows)
        
        # Check if we have any results
        if not best_thresholds_df.empty:
            # Sort by the appropriate metric
            sort_col = 'f1_score' if OPTIMIZATION_METRIC == 'F1' else 'f2_score'
            best_thresholds_df = best_thresholds_df.sort_values(sort_col, ascending=False).reset_index(drop=True)
            
            print("\nBest Thresholds by Model and Score Column:")
            print("-" * 80)
            display(best_thresholds_df[['model_name', 'score_column', 'best_threshold', 
                                       'precision', 'recall', 'f1_score', 'f2_score', 'matthews_corr']])
            
            # Store best_thresholds_df in globals
            globals()['best_thresholds_df'] = best_thresholds_df
            
            # Store evaluation_results for statistical comparisons
            globals()['evaluation_results'] = evaluation_results
            
            # Select relevant metrics for comparison
            metrics_to_compare = ['accuracy', 'balanced_accuracy', 'precision', 'recall', 
                                 'specificity', 'f1_score', 'f2_score', 'matthews_corr']

            # Filter to only include metrics that exist in the DataFrame
            available_metrics = [metric for metric in metrics_to_compare if metric in best_thresholds_df.columns]

            # Label combining model and score column. Built even when plotting
            # is disabled so the column is always present afterwards.
            best_thresholds_df['model_score'] = (
                best_thresholds_df['model_name'].astype(str)
                + ' (' + best_thresholds_df['score_column'].astype(str) + ')'
            )

            if not SHOW_VISUALIZATION:
                print("\nSHOW_VISUALIZATION is disabled - skipping metrics comparison heatmap.")
            elif available_metrics:
                # Create Metrics Comparison Heatmap like in Sentence Transformer notebook
                print("\nMetrics Comparison Heatmap:")
                print("-" * 80)

                # One heatmap row per model+score_column, one column per metric
                comparison_df = best_thresholds_df.set_index('model_score')[available_metrics]
                
                # The project name comes from an earlier cell; do not fail the
                # whole evaluation if that cell was not run.
                project_name = globals().get('NEO4J_PROJECT_NAME', 'unknown')
                
                # Create heatmap
                plt.figure(figsize=(12, 8))
                sns.heatmap(comparison_df, annot=True, cmap='YlGnBu', fmt='.3f',
                           linewidths=0.5, cbar_kws={'label': 'Score'})
                plt.title(f'Project: {project_name} - Comparison of Models Across Metrics', fontsize=14)
                plt.tight_layout()
                plt.show()
            else:
                print("No metrics available for heatmap visualization.")
        else:
            print("No valid threshold results found. Check that the data contains ground truth information.")
            # Initialize as empty DataFrame to prevent errors
            globals()['best_thresholds_df'] = pd.DataFrame()
    else:
        print("No evaluation results available. Check that the data contains model and ground truth information.")
        # Initialize as empty DataFrame to prevent errors
        globals()['best_thresholds_df'] = pd.DataFrame()
else:
    print("\nCannot evaluate model thresholds: missing ground truth data or model information")
    print("Make sure combined_df has the 'ground_truth_traceable' and 'model' columns")
    print("Please run cells 0-5 first to create the combined dataset.")
    
    # Initialize as empty DataFrame to prevent errors in later cells
    globals()['best_thresholds_df'] = pd.DataFrame()

In [None]:
# Cell [7] - Statistical Comparison of Scoring Methods
# Purpose: Compare different scoring methods using statistical tests
# Dependencies: scipy.stats, pandas, numpy
# Breadcrumbs: Analysis -> Statistical Tests -> Scoring Method Comparison
# notebooks/07_Meta_Judge_Analysis_Notebook.ipynb

def compare_scoring_methods_statistically(df, model_name, ground_truth='ground_truth_traceable'):
    """
    Statistically compare different scoring methods using paired tests

    For each scoring column present in the model's rows, the threshold that
    maximises the F2 score on that same data is selected ('is_traceable' is
    used directly as a 0/1 prediction). The per-sample correctness (1 where the
    prediction equals the ground truth, else 0) of each method is then compared:
    a Friedman test when more than two methods are available, followed by
    pairwise Wilcoxon signed-rank tests with Bonferroni correction if the
    Friedman test is significant at 0.05; a single Wilcoxon test when exactly
    two methods are available.

    Parameters:
        df: DataFrame containing all data (must have a 'model' column)
        model_name: Model to analyze
        ground_truth: Ground truth column name

    Returns:
        dict: Empty dict when there are no rows for the model. Otherwise:
            'methods'     - names of the methods that produced predictions
            'predictions' - method name -> 0/1 prediction array
            'f2_scores'   - per-sample correctness arrays (one per method, in
                            the order of 'methods'); the key name is historical
    """
    from itertools import combinations
    from scipy.stats import wilcoxon, friedmanchisquare, norm

    def _effect_size_r(p, n):
        # wilcoxon() returns the signed-rank statistic W, not a Z score, so
        # |Z| is recovered from the two-sided p-value before r = |Z| / sqrt(N).
        if n == 0 or np.isnan(p):
            return float('nan')
        # Clamp p away from 0 so the inverse survival function stays finite
        z = norm.isf(max(p, np.finfo(float).tiny) / 2)
        return abs(z) / np.sqrt(n)

    model_df = df[df['model'] == model_name].copy()

    if model_df.empty:
        print(f"No data for model {model_name}")
        return {}

    # Get all scoring columns
    scoring_columns = [
        'is_traceable', 'actor_score', 'judge_score', 'final_score',
        'total_score', 'total_score_with_final', 'total_score_all', 'total_combined_score'
    ]

    # Filter to available columns
    available_columns = [col for col in scoring_columns if col in model_df.columns]

    print(f"\nStatistical Comparison of Scoring Methods for {model_name}")
    print("=" * 80)
    print(f"Comparing {len(available_columns)} scoring methods")

    # Predictions and per-sample correctness for each method
    method_predictions = {}
    method_f2_scores = []
    method_names = []

    for col in available_columns:
        # Missing scores are treated as 0 for every method, including is_traceable
        scores = model_df[col].fillna(0).values
        y_true = model_df[ground_truth].values

        if col == 'is_traceable':
            # Boolean column: use the (null-filled) values directly as predictions
            best_pred = scores.astype(int)
            best_threshold = 0.5
        else:
            # Search the observed score values for the threshold with the best F2
            best_f2 = 0
            best_pred = None
            best_threshold = 0

            for thresh in np.unique(scores):
                pred = (scores >= thresh).astype(int)
                # Skip degenerate thresholds that predict a single class
                if len(np.unique(pred)) < 2:
                    continue
                f2 = fbeta_score(y_true, pred, beta=2, zero_division=0)
                if f2 > best_f2:
                    best_f2 = f2
                    best_pred = pred
                    best_threshold = thresh

        # best_pred stays None when no threshold achieved a positive F2
        if best_pred is not None:
            method_predictions[col] = best_pred
            # Per-sample correctness is what the paired tests operate on
            correct = (best_pred == y_true).astype(int)
            method_f2_scores.append(correct)
            method_names.append(col)

            # Calculate overall metrics
            f2 = fbeta_score(y_true, best_pred, beta=2, zero_division=0)
            precision = precision_score(y_true, best_pred, zero_division=0)
            recall = recall_score(y_true, best_pred, zero_division=0)

            print(f"\n{col}:")
            print(f"  Threshold: {best_threshold:.3f}")
            print(f"  F2 Score: {f2:.3f}")
            print(f"  Precision: {precision:.3f}")
            print(f"  Recall: {recall:.3f}")

    # Perform statistical tests if we have multiple methods
    if len(method_f2_scores) > 2:
        # Friedman test for multiple related samples
        try:
            stat, p_value = friedmanchisquare(*method_f2_scores)
            print(f"\nFriedman Test Results:")
            print(f"  Chi-squared statistic: {stat:.4f}")
            print(f"  p-value: {p_value:.4f}")
            print(f"  Significant difference: {'Yes' if p_value < 0.05 else 'No'} (α=0.05)")

            # Post-hoc pairwise comparisons if significant
            if p_value < 0.05:
                print("\nPost-hoc Pairwise Comparisons (Wilcoxon signed-rank test):")
                print("-" * 60)

                pairwise_results = []
                named_samples = zip(method_names, method_f2_scores)
                for (name_a, correct_a), (name_b, correct_b) in combinations(named_samples, 2):
                    try:
                        # Raises when the two methods agree on every sample
                        w_stat, p = wilcoxon(correct_a, correct_b)
                        effect_size = _effect_size_r(p, len(correct_a))

                        pairwise_results.append({
                            'method1': name_a,
                            'method2': name_b,
                            'statistic': w_stat,
                            'p_value': p,
                            'effect_size': effect_size,
                            'significant': p < 0.05
                        })

                        print(f"{name_a} vs {name_b}:")
                        print(f"  p-value: {p:.4f} {'*' if p < 0.05 else ''}")
                        print(f"  Effect size (r): {effect_size:.3f}")

                    except Exception as e:
                        print(f"Could not compare {name_a} vs {name_b}: {str(e)}")

                # Apply multiple testing correction
                if pairwise_results:
                    p_values = [r['p_value'] for r in pairwise_results]
                    from statsmodels.stats.multitest import multipletests
                    reject, p_adjusted, _, _ = multipletests(p_values, method='bonferroni')

                    print("\nBonferroni-corrected p-values:")
                    print("-" * 60)
                    for result, p_adj, rejected in zip(pairwise_results, p_adjusted, reject):
                        print(f"{result['method1']} vs {result['method2']}: "
                              f"p_adj={p_adj:.4f} {'*' if rejected else ''}")

        except Exception as e:
            print(f"\nError in Friedman test: {str(e)}")

    elif len(method_f2_scores) == 2:
        # Just two methods - use Wilcoxon signed-rank test
        try:
            stat, p_value = wilcoxon(method_f2_scores[0], method_f2_scores[1])
            print(f"\nWilcoxon Signed-Rank Test Results:")
            print(f"  Comparing: {method_names[0]} vs {method_names[1]}")
            print(f"  Test statistic: {stat:.4f}")
            print(f"  p-value: {p_value:.4f}")
            print(f"  Significant difference: {'Yes' if p_value < 0.05 else 'No'} (α=0.05)")
        except Exception as e:
            print(f"\nError in Wilcoxon test: {str(e)}")

    return {
        'methods': method_names,
        'predictions': method_predictions,
        'f2_scores': method_f2_scores
    }

# Drive the scoring-method comparison: one run per distinct model in combined_df
if 'combined_df' not in globals() or combined_df.empty:
    print("No data available for statistical comparison. Please run previous cells first.")
else:
    models = combined_df['model'].unique()

    # Maps model name -> result dict returned by compare_scoring_methods_statistically
    statistical_results = {}
    for model in models:
        # Banner separating each model's report in the cell output
        print("\n" + "=" * 80)
        print(f"Model: {model}")
        print("=" * 80)

        results = compare_scoring_methods_statistically(combined_df, model)
        statistical_results[model] = results

    # Publish the per-model results under a notebook-global name for later cells
    globals()['scoring_method_comparison'] = statistical_results

In [None]:
# Cell [8] - Traceability Analysis with Fixed Threshold
# Purpose: Analyze traceability prediction using meta judge data with MIN_TRACEABILITY_THRESHOLD from .env
# Dependencies: pandas, numpy, seaborn, matplotlib
# Breadcrumbs: Analysis -> Traceability Evaluation -> Fixed Threshold

def analyze_fixed_threshold(combined_df, min_traceability_threshold=None):
    """
    Analyze traceability prediction using a fixed threshold value

    Works on a copy of the input. For every known score column present in the
    data a boolean 'predicted_<col>' column is added; when the
    'ground_truth_traceable' column exists, a 'confusion_<col>' column holding
    'TP' / 'FP' / 'FN' / 'TN' is added as well. Null scores are replaced with
    0 (False for 'is_traceable') before thresholds are applied.

    Thresholds per score column:
        judge_score, actor_score, final_score    -> threshold
        total_score, total_score_with_final      -> threshold * 2
        total_score_all                          -> threshold * 3
        is_traceable                             -> used directly (boolean)

    Parameters:
        combined_df (pd.DataFrame): Combined dataset with meta judge, LLM and ground truth data
        min_traceability_threshold (int, optional): Fixed threshold value. If None, the
            MIN_TRACEABILITY_THRESHOLD global is used, falling back to initialize_environment().

    Returns:
        pd.DataFrame: Copy of the input with prediction/confusion columns added. The
            per-model threshold table is stored in .attrs['fixed_model_thresholds'].
            An empty DataFrame is returned if any error occurs (the error is logged).
    """
    # Resolved locally so the function also works when used outside the notebook;
    # this is the same named logger as one created with logging.getLogger(__name__).
    log = logging.getLogger(__name__)

    try:
        # Create a copy of the combined DataFrame for evaluation
        combined_traced_eval_df = combined_df.copy()

        # Get min_traceability_threshold if not provided
        if min_traceability_threshold is None:
            if 'MIN_TRACEABILITY_THRESHOLD' in globals():
                min_traceability_threshold = globals()['MIN_TRACEABILITY_THRESHOLD']
            else:
                # Load from environment if being imported
                config = initialize_environment()
                min_traceability_threshold = config['MIN_TRACEABILITY_THRESHOLD']

        # Fixed threshold per score column; the same values apply to every model.
        # Dict order also fixes the order in which output columns are created.
        column_thresholds = {
            'is_traceable': True,  # Boolean
            'judge_score': min_traceability_threshold,
            'actor_score': min_traceability_threshold,
            'final_score': min_traceability_threshold,
            'total_score': min_traceability_threshold * 2,  # Double for combined judge+actor
            'total_score_with_final': min_traceability_threshold * 2,  # Double for actor+final
            'total_score_all': min_traceability_threshold * 3  # Triple for actor+judge+final
        }
        score_columns = [col for col in column_thresholds if col in combined_traced_eval_df.columns]

        # Check for ground truth data
        has_ground_truth = 'ground_truth_traceable' in combined_traced_eval_df.columns
        if not has_ground_truth:
            log.warning("No ground truth data available for evaluation")

        # Per-model threshold table kept for downstream cells (stored in .attrs below)
        model_thresholds = {
            model_name: dict(column_thresholds)
            for model_name in combined_traced_eval_df['model'].unique()
        }

        # Make sure there are no None values in score columns
        for col in score_columns:
            null_count = combined_traced_eval_df[col].isna().sum()
            if null_count > 0:
                log.warning(f"Found {null_count} null values in {col} column before applying thresholds. Replacing with 0 or False.")
                fill_value = False if col == 'is_traceable' else 0
                combined_traced_eval_df[col] = combined_traced_eval_df[col].fillna(fill_value)

            # Ensure score column is numeric (except for boolean is_traceable)
            if col != 'is_traceable' and combined_traced_eval_df[col].dtype == object:
                try:
                    combined_traced_eval_df[col] = pd.to_numeric(combined_traced_eval_df[col])
                    log.info(f"Converted {col} to numeric type for threshold application")
                except Exception as e:
                    # Unparseable column: fall back to all-zero scores rather than abort
                    log.error(f"Error converting {col} to numeric: {str(e)}")
                    log.error("Using zeros for score")
                    combined_traced_eval_df[col] = 0

        # Create prediction columns for each score type
        for score_col in score_columns:
            pred_col_name = f'predicted_{score_col}'

            if score_col == 'is_traceable':
                # For boolean column, just use the value directly
                combined_traced_eval_df[pred_col_name] = combined_traced_eval_df[score_col]
            else:
                # Vectorised comparison; also well-defined for a zero-row frame
                combined_traced_eval_df[pred_col_name] = (
                    combined_traced_eval_df[score_col] >= column_thresholds[score_col]
                )

            log.info(f"Created prediction column {pred_col_name} based on {score_col}")

        # Add confusion matrix categories for each prediction type if we have ground truth
        if has_ground_truth:
            # astype(bool) follows Python truthiness, matching a row-wise `if value:` test
            truth = combined_traced_eval_df['ground_truth_traceable'].astype(bool).to_numpy()
            for score_col in score_columns:
                pred_col = f'predicted_{score_col}'
                conf_col = f'confusion_{score_col}'
                predicted = combined_traced_eval_df[pred_col].astype(bool).to_numpy()

                combined_traced_eval_df[conf_col] = np.where(
                    truth,
                    np.where(predicted, 'TP', 'FN'),  # actual positive
                    np.where(predicted, 'FP', 'TN')   # actual negative
                )
                log.info(f"Created confusion category column {conf_col} based on {pred_col}")

        # Save fixed model thresholds to combined_traced_eval_df for use in other cells
        combined_traced_eval_df.attrs['fixed_model_thresholds'] = model_thresholds

        return combined_traced_eval_df

    except Exception as e:
        log.error(f"Error in fixed threshold evaluation: {str(e)}")
        log.error("Exception details:", exc_info=True)
        return pd.DataFrame()

In [None]:
# Cell [9] - Confusion Matrix and Performance Metrics
# Purpose: Analyze confusion matrices and model performance for different score variants
# Dependencies: pandas, numpy, matplotlib, seaborn, scikit-learn (sklearn.metrics), python-dotenv
# Breadcrumbs: Analysis -> Confusion Matrix -> Visualization

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dotenv import load_dotenv
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, fbeta_score,
    matthews_corrcoef, confusion_matrix, balanced_accuracy_score
)

def get_actual_model_name(model_key):
    """
    Look up a model key in the process environment

    When an environment variable named `model_key` exists, its value is
    returned as the model name; otherwise `model_key` is returned unchanged.
    """
    # Single lookup with the key itself as the fallback
    return os.environ.get(model_key, model_key)

def create_heatmap(df, title, ax=None, cmap='Blues', annot=False, vmin=None, vmax=None):
    """
    Create a titled seaborn heatmap (with colorbar) for a DataFrame

    Parameters:
        df (pd.DataFrame): DataFrame to visualize
        title (str): Title for the heatmap
        ax (matplotlib.axes.Axes, optional): Axes to plot on. When None,
            seaborn draws on the currently active Axes.
        cmap (str): Colormap to use
        annot (bool): Passed through to seaborn's `annot` (cell annotations)
        vmin (float): Minimum value for colormap
        vmax (float): Maximum value for colormap

    Returns:
        matplotlib.axes.Axes: The axes with the heatmap
    """
    # Use the Axes returned by seaborn so the default ax=None does not
    # end in `None.set_title(...)`.
    ax = sns.heatmap(df, annot=annot, cmap=cmap, ax=ax, cbar=True, vmin=vmin, vmax=vmax)
    ax.set_title(title)
    return ax

def analyze_confusion_matrix(combined_results, score_columns=None, models=None):
    """
    Compute confusion-matrix counts and derived metrics per model and score column

    For each model and each score column that has a matching 'predicted_<col>'
    column, the ground truth ('ground_truth_traceable') is compared with the
    prediction and the counts plus accuracy, balanced accuracy, precision,
    recall, F1, F2, specificity and miss rate are collected.

    Parameters:
        combined_results (pd.DataFrame): Combined dataset with prediction results
        score_columns (list, optional): Score columns to analyze; defaults to every
            column that has a 'predicted_' counterpart
        models (list, optional): Models to analyze; defaults to all values of 'model'

    Returns:
        dict: {model: {score_col: {'confusion_matrix': {...}, 'metrics': {...}}}}.
            Empty when there is no data, no ground truth, or an error occurred.
    """
    def _safe_ratio(numerator, denominator):
        # Ratio that falls back to 0 when the denominator is not positive
        return numerator / denominator if denominator > 0 else 0

    try:
        # Guard: nothing to analyze
        if combined_results is None or combined_results.empty:
            print("No data available for confusion matrix analysis")
            return {}

        # Guard: metrics need the ground truth column
        if 'ground_truth_traceable' not in combined_results.columns:
            print("No ground truth data available for confusion matrix analysis")
            return {}

        # Default score columns: derived from the 'predicted_*' columns
        if score_columns is None:
            score_columns = [
                name.replace('predicted_', '')
                for name in combined_results.columns
                if name.startswith('predicted_')
            ]

        # Default models: every model present in the data
        if models is None:
            models = combined_results['model'].unique()

        # Keep only score columns that actually have a prediction column
        score_columns = [
            name for name in score_columns
            if f'predicted_{name}' in combined_results.columns
        ]

        results = {}

        for model in models:
            rows_for_model = combined_results[combined_results['model'] == model]
            if rows_for_model.empty:
                continue

            per_score = {}

            for score_col in score_columns:
                pred_col = f'predicted_{score_col}'
                if pred_col not in rows_for_model.columns:
                    continue

                # 0/1 arrays for the sklearn metric functions
                y_true = rows_for_model['ground_truth_traceable'].astype(int).values
                y_pred = rows_for_model[pred_col].astype(int).values

                try:
                    # labels=[0, 1] keeps the matrix 2x2 even if a class is absent
                    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

                    accuracy = accuracy_score(y_true, y_pred)
                    balanced_acc = balanced_accuracy_score(y_true, y_pred)

                    prec = _safe_ratio(tp, tp + fp)
                    rec = _safe_ratio(tp, tp + fn)

                    f1 = f1_score(y_true, y_pred, zero_division=0)
                    f2 = fbeta_score(y_true, y_pred, beta=2, zero_division=0)

                    tnr = _safe_ratio(tn, tn + fp)  # Specificity
                    fnr = _safe_ratio(fn, fn + tp)  # Miss Rate

                    counts = {'tn': tn, 'fp': fp, 'fn': fn, 'tp': tp}
                    metrics = {
                        'accuracy': accuracy,
                        'balanced_accuracy': balanced_acc,
                        'precision': prec,
                        'recall': rec,
                        'f1_score': f1,
                        'f2_score': f2,
                        'specificity': tnr,
                        'miss_rate': fnr
                    }
                    per_score[score_col] = {
                        'confusion_matrix': counts,
                        'metrics': metrics
                    }
                except Exception as e:
                    print(f"Error creating confusion matrix for {model}, {score_col}: {str(e)}")
                    continue

            # Models without any usable score column are left out of the result
            if per_score:
                results[model] = per_score

        return results

    except Exception as e:
        print(f"Error in confusion matrix analysis: {str(e)}")
        import traceback
        traceback.print_exc()
        return {}

# Run the fixed threshold analysis if combined_df exists
print("Running fixed threshold analysis to create evaluation dataset...")

# Create total_combined_score if needed
if 'combined_df' in globals() and not combined_df.empty:
    # Only add it if it doesn't already exist
    if 'total_combined_score' not in combined_df.columns:
        # The six component scores that make up the combined total
        component_columns = [
            'judge_score',
            'semantic_alignment',
            'non_functional_coverage',
            'final_score',
            'actor_score',
            'functional_completeness'
        ]
        # reindex supplies an all-NaN column for any component the dataset lacks.
        # Missing or non-numeric values count as 0, so a single missing
        # component no longer turns the whole row's total into NaN.
        component_scores = combined_df.reindex(columns=component_columns)
        combined_df['total_combined_score'] = (
            component_scores.apply(pd.to_numeric, errors='coerce').fillna(0).sum(axis=1)
        )
        print("Created total_combined_score column")

# Apply fixed threshold analysis
# Top-level driver for this cell: builds the evaluation dataset with fixed
# thresholds, computes confusion matrices / performance metrics per model and
# score column, publishes the results as notebook globals, and optionally
# draws a grid of confusion matrices.
if 'analyze_fixed_threshold' in globals() and 'combined_df' in globals() and not combined_df.empty:
    # Get min_traceability_threshold from environment if available
    if 'MIN_TRACEABILITY_THRESHOLD' in globals():
        min_traceability_threshold = globals()['MIN_TRACEABILITY_THRESHOLD']
    else:
        # Fall back to the configuration loader, or to 3 when the key is absent
        config = initialize_environment() if 'initialize_environment' in globals() else {}
        min_traceability_threshold = config.get('MIN_TRACEABILITY_THRESHOLD', 3)
    
    # analyze_fixed_threshold does not create predictions for total_combined_score;
    # that column is handled separately right below
    combined_traced_eval_df = analyze_fixed_threshold(combined_df, min_traceability_threshold)
    
    if not combined_traced_eval_df.empty:
        print(f"Created evaluation dataset with {len(combined_traced_eval_df)} rows")
        
        # Make sure the total_combined_score column is included in evaluation
        # Add predicted_total_combined_score column if not already exists
        if 'total_combined_score' in combined_traced_eval_df.columns and 'predicted_total_combined_score' not in combined_traced_eval_df.columns:
            # Calculate threshold as 6x the min_traceability_threshold (sum of 6 scores)
            threshold = min_traceability_threshold * 6
            combined_traced_eval_df['predicted_total_combined_score'] = combined_traced_eval_df['total_combined_score'] >= threshold
            print(f"Created prediction column for total_combined_score with threshold {threshold}")
            
            # Add confusion matrix column if we have ground truth
            if 'ground_truth_traceable' in combined_traced_eval_df.columns:
                # Row-wise classifier using Python truthiness of ground truth and prediction
                def get_confusion_category(row):
                    if row['ground_truth_traceable'] and row['predicted_total_combined_score']:
                        return 'TP'  # True Positive
                    elif not row['ground_truth_traceable'] and row['predicted_total_combined_score']:
                        return 'FP'  # False Positive
                    elif row['ground_truth_traceable'] and not row['predicted_total_combined_score']:
                        return 'FN'  # False Negative
                    else:  # not ground_truth and not predicted
                        return 'TN'  # True Negative
                
                combined_traced_eval_df['confusion_total_combined_score'] = combined_traced_eval_df.apply(get_confusion_category, axis=1)
                print("Created confusion category column for total_combined_score")
        
        # Analyze confusion matrices and create performance table
        print("\nAnalyzing confusion matrices and performance metrics")
        print("=" * 80)
        
        # Make sure to include total_combined_score in the list of score columns
        all_score_columns = [
            'is_traceable', 'actor_score', 'judge_score', 'final_score', 
            'total_score', 'total_score_with_final', 'total_score_all', 'total_combined_score'
        ]
        
        # Filter to only columns that have a matching 'predicted_<col>' column
        score_columns_to_analyze = [
            col for col in all_score_columns 
            if f'predicted_{col}' in combined_traced_eval_df.columns
        ]
        
        print(f"Analyzing {len(score_columns_to_analyze)} score columns: {score_columns_to_analyze}")
        
        # Get all models
        all_models = combined_traced_eval_df['model'].unique()
        
        # Analyze confusion matrices
        confusion_results = analyze_confusion_matrix(
            combined_traced_eval_df, 
            score_columns=score_columns_to_analyze,
            models=all_models
        )
        
        print(f"Generated confusion matrices for {len(confusion_results)} models")
        
        # Flatten the nested results into one row per (model, score column)
        performance_rows = []
        
        for model, model_results in confusion_results.items():
            for score_col, result in model_results.items():
                cm = result['confusion_matrix']
                metrics = result['metrics']
                
                performance_rows.append({
                    'model': model,
                    'score_column': score_col,
                    'precision': metrics['precision'],
                    'recall': metrics['recall'],
                    'f1_score': metrics['f1_score'],
                    'f2_score': metrics['f2_score'],
                    'accuracy': metrics['accuracy'],
                    'balanced_accuracy': metrics['balanced_accuracy'],
                    'specificity': metrics['specificity'],
                    'miss_rate': metrics['miss_rate'],
                    'tp': cm['tp'],
                    'fp': cm['fp'],
                    'fn': cm['fn'],
                    'tn': cm['tn']
                })
        
        # Create performance comparison DataFrame
        if performance_rows:
            performance_df = pd.DataFrame(performance_rows)
            
            # Sort by F1 only when the OPTIMIZATION_METRIC global equals 'F1'; otherwise by F2
            if 'OPTIMIZATION_METRIC' in globals() and globals()['OPTIMIZATION_METRIC'] == 'F1':
                performance_df = performance_df.sort_values('f1_score', ascending=False).reset_index(drop=True)
                sort_metric = 'F1'
            else:
                performance_df = performance_df.sort_values('f2_score', ascending=False).reset_index(drop=True)
                sort_metric = 'F2'
            
            # Display performance comparison
            print(f"\nPerformance Comparison of Score Columns (sorted by {sort_metric} score):")
            print("-" * 80)
            # `display` is not imported in this cell; presumably the IPython/Jupyter builtin
            display(performance_df)
            
            # Find the best overall (first row after the descending sort)
            best_row = performance_df.iloc[0]
            print(f"\nBest performing score column: {best_row['score_column']} for model {best_row['model']}")
            print(f"  - {sort_metric} Score: {best_row['f2_score' if sort_metric == 'F2' else 'f1_score']:.3f}")
            print(f"  - Precision: {best_row['precision']:.3f}")
            print(f"  - Recall: {best_row['recall']:.3f}")
            
            # Show confusion matrix for best result
            print("\nConfusion matrix counts for best result:")
            print(f"  - True Positives (TP): {best_row['tp']}")
            print(f"  - False Positives (FP): {best_row['fp']}")
            print(f"  - False Negatives (FN): {best_row['fn']}")
            print(f"  - True Negatives (TN): {best_row['tn']}")
            
            # Store results for later use
            globals()['performance_comparison_df'] = performance_df
            globals()['combined_traced_eval_df'] = combined_traced_eval_df
            
            # Plot only when the SHOW_VISUALIZATION global exists and is truthy
            if 'SHOW_VISUALIZATION' in globals() and globals()['SHOW_VISUALIZATION']:
                # Get project name from globals
                project_name = globals().get('NEO4J_PROJECT_NAME', 'Unknown Project')
                
                # Create confusion matrix visualization grid
                print("\nCreating confusion matrix visualizations...")
                
                # Define color palette for consistency
                # NOTE(review): color_palette is not referenced by the plotting code below
                color_palette = {
                    'TP': '#1A85FF',  # Good - Blue
                    'FP': '#FFC61A',  # Okay - Yellow/Gold
                    'FN': '#D41159',  # Not Great - Magenta/Red
                    'TN': '#CCCCCC'   # Neutral - Light Gray
                }
                
                # Grid layout: 4 columns and at most 2 rows (so at most 8 panels)
                n_score_cols = len(score_columns_to_analyze)
                n_cols = 4  # Fixed at 4 columns
                n_rows = min(2, (n_score_cols + n_cols - 1) // n_cols)  # Capped at 2 rows
                
                # Create figure with subplots
                fig, axes = plt.subplots(n_rows, n_cols, figsize=(10, 4 * n_rows))
                
                # Flatten axes if we have multiple rows
                if n_rows > 1:
                    axes = axes.flatten()
                else:
                    # Make sure axes is always an array
                    axes = np.array([axes]) if n_cols == 1 else axes
                
                # For each score column, create a confusion matrix visualization
                for i, score_col in enumerate(score_columns_to_analyze):
                    # Stop once every available panel has been used
                    if i >= len(axes):
                        break
                        
                    ax = axes[i]
                    
                    # Only the first model in all_models is plotted (assuming single model analysis)
                    model = all_models[0]
                    if model in confusion_results and score_col in confusion_results[model]:
                        cm = confusion_results[model][score_col]['confusion_matrix']
                        
                        # Rows are actual (negative, positive); columns are predicted (negative, positive)
                        cm_array = np.array([
                            [cm['tn'], cm['fp']],
                            [cm['fn'], cm['tp']]
                        ])
                        
                        # Create confusion matrix plot
                        sns.heatmap(
                            cm_array, 
                            annot=True, 
                            fmt='g', 
                            cmap='Blues',
                            xticklabels=['Negative', 'Positive'],
                            yticklabels=['Negative', 'Positive'], 
                            ax=ax,
                            cbar=False  # No colorbar
                        )
                        
                        # Set title with score column name
                        ax.set_title(f"{score_col}", fontsize=10)
                        ax.set_xlabel('Predicted', fontsize=8)
                        ax.set_ylabel('Actual', fontsize=8)
                    else:
                        # Empty plot if no data
                        ax.axis('off')
                        ax.text(0.5, 0.5, 'No data', ha='center', va='center')
                
                # Hide any unused subplots
                for i in range(n_score_cols, len(axes)):
                    axes[i].axis('off')
                
                # Add a common title with project name and model
                # (`model` here is the value left over from the plotting loop above)
                plt.suptitle(f"Project: {project_name}\nConfusion Matrices for Model: {model}", fontsize=14, y=1.05)
                
                # Adjust layout
                plt.tight_layout()
                
                # Show plot
                plt.show()
        else:
            print("No performance metrics calculated. Check that the dataset has ground truth and prediction data.")
    else:
        print("Failed to create evaluation dataset")
else:
    print("Missing required data or functions. Please run cells 0-6 first.")

In [None]:
# Cell [10] - Statistical Comparison Between Models
# Purpose: Compare performance between different models using statistical tests
# Dependencies: scipy.stats, pandas, numpy
# Breadcrumbs: Analysis -> Statistical Tests -> Model Comparison
# notebooks/07_Meta_Judge_Analysis_Notebook.ipynb

def _mann_whitney_effect_size(u_stat, n1, n2):
    """Convert a Mann-Whitney U statistic into the effect size r = |Z| / sqrt(N).

    Z is obtained from the normal approximation of U (no tie correction):
    Z = (U - n1*n2/2) / sqrt(n1*n2*(n1+n2+1)/12), with N = n1 + n2.

    Returns:
        float: Effect size in [0, 1]; 0.0 when either sample is empty.
    """
    if n1 == 0 or n2 == 0:
        return 0.0
    mean_u = n1 * n2 / 2.0
    sd_u = np.sqrt(n1 * n2 * (n1 + n2 + 1) / 12.0)
    z = (u_stat - mean_u) / sd_u
    return float(abs(z) / np.sqrt(n1 + n2))


def compare_models_statistically(performance_df, metric='f2_score'):
    """
    Compare multiple models using non-parametric statistical tests

    With more than two models a Kruskal-Wallis H-test is run; if it is
    significant (alpha = 0.05), pairwise Mann-Whitney U tests follow with a
    Bonferroni correction. With exactly two models a single Mann-Whitney U
    test is run. All results are printed.

    Parameters:
        performance_df: DataFrame with model performance metrics. The model
            identifier is read from the 'model' column, or from 'model_name'
            when 'model' is absent.
        metric: Metric column to use for comparison

    Returns:
        dict: Per-model score summary, keyed by model, with entries
            'best', 'all', 'mean' and 'std'

    Raises:
        KeyError: If neither model column, or the metric column, is present
    """
    from itertools import combinations
    from scipy.stats import kruskal, mannwhitneyu

    print(f"\nStatistical Comparison of Models using {metric.upper()}")
    print("=" * 80)

    # Other cells in this notebook key their result tables by 'model_name',
    # so accept both spellings instead of failing on the fallback DataFrame.
    model_col = next(
        (col for col in ('model', 'model_name') if col in performance_df.columns),
        None
    )
    if model_col is None:
        raise KeyError("performance_df needs a 'model' or 'model_name' column")

    # Group by model and summarise the metric across all scoring methods
    model_scores = {}
    for model in performance_df[model_col].unique():
        model_data = performance_df[performance_df[model_col] == model]
        all_scores = model_data[metric].values
        model_scores[model] = {
            'best': model_data[metric].max(),
            'all': all_scores,
            'mean': np.mean(all_scores),
            'std': np.std(all_scores)
        }

    print("\nModel Performance Summary:")
    print("-" * 60)
    for model, scores in model_scores.items():
        print(f"{model}:")
        print(f"  Best {metric}: {scores['best']:.3f}")
        print(f"  Mean {metric}: {scores['mean']:.3f} (±{scores['std']:.3f})")

    model_names = list(model_scores)

    if len(model_names) > 2:
        # Omnibus test across all models; best-effort, failures are reported
        try:
            h_stat, p_value = kruskal(*(model_scores[name]['all'] for name in model_names))
        except Exception as e:
            print(f"\nError in Kruskal-Wallis test: {str(e)}")
            return model_scores

        print(f"\nKruskal-Wallis H-test:")
        print(f"  H-statistic: {h_stat:.4f}")
        print(f"  p-value: {p_value:.4f}")
        print(f"  Significant difference: {'Yes' if p_value < 0.05 else 'No'} (α=0.05)")

        # Post-hoc pairwise comparisons only when the omnibus test is significant
        if p_value < 0.05:
            print("\nPost-hoc Pairwise Comparisons (Mann-Whitney U test):")
            print("-" * 60)

            pairwise_results = []
            for name1, name2 in combinations(model_names, 2):
                scores1 = model_scores[name1]['all']
                scores2 = model_scores[name2]['all']

                try:
                    stat, p = mannwhitneyu(scores1, scores2, alternative='two-sided')
                except Exception as e:
                    print(f"Could not compare {name1} vs {name2}: {str(e)}")
                    continue

                # r = |Z| / sqrt(N), with Z derived from U (not U itself)
                effect_size = _mann_whitney_effect_size(stat, len(scores1), len(scores2))

                pairwise_results.append({
                    'model1': name1,
                    'model2': name2,
                    'statistic': stat,
                    'p_value': p,
                    'effect_size': effect_size
                })

                print(f"{name1} vs {name2}:")
                print(f"  p-value: {p:.4f}")
                print(f"  Effect size (r): {effect_size:.3f}")

            # Apply multiple testing correction
            if pairwise_results:
                # Imported here so the two-model path does not need statsmodels
                from statsmodels.stats.multitest import multipletests

                p_values = [r['p_value'] for r in pairwise_results]
                reject, p_adjusted, _, _ = multipletests(p_values, method='bonferroni')

                print("\nBonferroni-corrected p-values:")
                print("-" * 60)
                for i, result in enumerate(pairwise_results):
                    print(f"{result['model1']} vs {result['model2']}: "
                          f"p_adj={p_adjusted[i]:.4f} {'*' if reject[i] else ''}")

    elif len(model_names) == 2:
        # Mann-Whitney U test for two models
        scores1 = model_scores[model_names[0]]['all']
        scores2 = model_scores[model_names[1]]['all']

        try:
            stat, p_value = mannwhitneyu(scores1, scores2, alternative='two-sided')
        except Exception as e:
            print(f"\nError in Mann-Whitney U test: {str(e)}")
            return model_scores

        print(f"\nMann-Whitney U Test Results:")
        print(f"  Comparing: {model_names[0]} vs {model_names[1]}")
        print(f"  U-statistic: {stat:.4f}")
        print(f"  p-value: {p_value:.4f}")
        print(f"  Significant difference: {'Yes' if p_value < 0.05 else 'No'} (α=0.05)")

        # Same r definition as the post-hoc branch, so the two are comparable
        effect_size = _mann_whitney_effect_size(stat, len(scores1), len(scores2))
        print(f"  Effect size (r): {effect_size:.3f}")

    return model_scores

# Pick the results table to compare on: the full performance comparison when it
# exists and has rows, otherwise the per-model best-threshold table.
_comparison_df = None
if 'performance_comparison_df' in globals() and not performance_comparison_df.empty:
    _comparison_df = performance_comparison_df
elif 'best_thresholds_df' in globals() and not best_thresholds_df.empty:
    _comparison_df = best_thresholds_df

if _comparison_df is None:
    print("No performance data available for model comparison. Please run previous cells first.")
else:
    # Compare on the same metric that was used for threshold optimization
    metric_to_compare = 'f1_score' if OPTIMIZATION_METRIC != 'F2' else 'f2_score'

    model_comparison_results = compare_models_statistically(
        _comparison_df,
        metric=metric_to_compare
    )

    # Store results
    globals()['model_comparison_results'] = model_comparison_results

# Drop the temporary so it does not linger among the notebook globals
del _comparison_df

In [None]:
# Cell [11] - Analysis Summary and Conclusions with Model Name Resolution
# Purpose: Summarize findings and draw conclusions about traceability prediction
# Dependencies: pandas, numpy, matplotlib, seaborn, python-dotenv
# Breadcrumbs: Analysis -> Summary
# notebooks/07_Meta_Judge_Analysis_Notebook.ipynb

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dotenv import load_dotenv

def get_actual_model_name(model_key):
    """
    Look up the real model name behind an environment-variable reference

    When model_key names an existing environment variable, that variable's
    value is returned; otherwise model_key itself is returned unchanged.
    """
    return os.environ.get(model_key, model_key)

def summarize_analysis_results(best_thresholds_df=None, project_name=None, optimization_metric=None, show_visualization=False, model_key=None):
    """
    Create a summary of model evaluation and traceability analysis

    Parameters:
        best_thresholds_df (pd.DataFrame): DataFrame containing model evaluation results.
            Falls back to the notebook global of the same name when omitted.
        project_name (str): Name of the project being analyzed.
            Falls back to the NEO4J_PROJECT_NAME global, then "Unknown Project".
        optimization_metric (str): Metric used for optimization ('F1' or 'F2').
            Falls back to the OPTIMIZATION_METRIC global, then "F2".
        show_visualization (bool): Accepted for interface compatibility; not used here
        model_key (str): Accepted for interface compatibility; not used here

    Returns:
        dict: Dictionary containing summary information. When no evaluation
            results are available only the basic keys are filled and
            'has_model_evaluation' is False.
    """
    module_globals = globals()

    # Get parameter values from globals if not provided
    if best_thresholds_df is None:
        global_df = module_globals.get('best_thresholds_df')
        if isinstance(global_df, pd.DataFrame) and not global_df.empty:
            best_thresholds_df = global_df

    if project_name is None and 'NEO4J_PROJECT_NAME' in module_globals:
        project_name = module_globals['NEO4J_PROJECT_NAME']
    else:
        project_name = project_name or "Unknown Project"

    if optimization_metric is None and 'OPTIMIZATION_METRIC' in module_globals:
        optimization_metric = module_globals['OPTIMIZATION_METRIC']
    else:
        optimization_metric = optimization_metric or "F2"

    # Create empty result dictionary
    summary = {
        'project_name': project_name,
        'optimization_metric': optimization_metric,
        'has_model_evaluation': False,
        'models_evaluated': 0,
        'best_model': None,
        'best_model_threshold': None,
        'best_model_metrics': {},
        'model_family_comparison': {},
        'recommendations': []
    }

    # If we don't have threshold results, return basic summary
    if best_thresholds_df is None or not isinstance(best_thresholds_df, pd.DataFrame) or best_thresholds_df.empty:
        return summary

    # We have model evaluation results
    summary['has_model_evaluation'] = True
    summary['models_evaluated'] = len(best_thresholds_df['model_name'].unique())

    # Count score columns evaluated
    score_columns = best_thresholds_df['score_column'].unique()
    summary['score_columns_evaluated'] = len(score_columns)
    summary['score_columns'] = list(score_columns)

    # Identify the best performing model and score column combination
    sort_metric = 'f1_score' if optimization_metric.upper() == 'F1' else 'f2_score'
    summary['sort_metric'] = sort_metric

    # Locate the best row by POSITION: a label lookup returns several rows when
    # the index has duplicate labels (e.g. after pd.concat without reset_index).
    metric_by_position = best_thresholds_df[sort_metric].reset_index(drop=True)
    if metric_by_position.isna().all():
        # No usable score to rank by; report what is known so far
        return summary
    best_row = best_thresholds_df.iloc[int(metric_by_position.idxmax())]

    summary['best_model'] = best_row['model_name']
    summary['best_score_column'] = best_row['score_column']
    summary['best_model_threshold'] = float(best_row['best_threshold'])

    # Store best model metrics
    summary['best_model_metrics'] = {
        'threshold': float(best_row['best_threshold']),
        'precision': float(best_row['precision']),
        'recall': float(best_row['recall']),
        'f1_score': float(best_row['f1_score']),
        'f2_score': float(best_row['f2_score']),
        'matthews_corr': float(best_row['matthews_corr']),
        'balanced_accuracy': float(best_row['balanced_accuracy']),
        'confusion_matrix': {
            'tp': int(best_row['true_positives']),
            'fp': int(best_row['false_positives']),
            'fn': int(best_row['false_negatives']),
            'tn': int(best_row['true_negatives'])
        }
    }

    # Get performance by score column type (mean over models, best first)
    score_column_performance = best_thresholds_df.groupby('score_column')[sort_metric].mean().sort_values(ascending=False)
    summary['score_column_performance'] = {col: float(val) for col, val in score_column_performance.items()}

    # Get the best performing score column type
    summary['best_score_column_type'] = score_column_performance.index[0]
    summary['best_score_column_score'] = float(score_column_performance.iloc[0])

    # Get performance by model (best score per model, best first)
    model_performance = best_thresholds_df.groupby('model_name')[sort_metric].max().sort_values(ascending=False)
    summary['model_performance'] = {model: float(val) for model, val in model_performance.items()}

    # Data-point counts all come from the best row so that the positive ratio
    # is computed against the same evaluation the positives were counted in.
    total_data_points = int(best_row['data_points'])
    ground_truth_positive = int(best_row['ground_truth_positive'])
    summary['total_data_points'] = total_data_points
    summary['ground_truth_positive'] = ground_truth_positive
    summary['ground_truth_negative'] = int(best_row['ground_truth_negative'])
    summary['ground_truth_ratio'] = (
        ground_truth_positive / total_data_points if total_data_points else 0.0
    )

    # Recommendations based on analysis
    recommendations = [
        f"Use {best_row['model_name']} model with {best_row['score_column']} score for best traceability results",
        f"Apply a threshold of {best_row['best_threshold']:.3f} to {best_row['score_column']} scores",
        f"Expected {optimization_metric} score: {best_row[sort_metric]:.3f}"
    ]

    # Check precision-recall balance
    if abs(best_row['precision'] - best_row['recall']) > 0.1:
        if best_row['precision'] > best_row['recall']:
            recommendations.append(f"Note that precision ({best_row['precision']:.3f}) is higher than recall ({best_row['recall']:.3f}), meaning the model finds fewer links but with higher confidence")
        else:
            recommendations.append(f"Note that recall ({best_row['recall']:.3f}) is higher than precision ({best_row['precision']:.3f}), meaning the model finds more links but with lower confidence")

    summary['recommendations'] = recommendations

    return summary

# Execute only when the notebook is run directly
# Prints a text summary of best_thresholds_df and, when SHOW_VISUALIZATION is
# truthy, draws a heatmap and a grouped bar chart. Reads the notebook globals
# best_thresholds_df, NEO4J_PROJECT_NAME, OPTIMIZATION_METRIC and
# SHOW_VISUALIZATION, which must have been defined by earlier cells.
if __name__ == "__main__":
    # Load environment variables
    load_dotenv()
    
    # Get the actual model name
    # CURRENT_MODEL is treated as the NAME of another environment variable;
    # if no variable with that name exists, its raw value is used as the model name.
    current_model_var = os.environ.get('CURRENT_MODEL', '')
    actual_model_name = os.environ.get(current_model_var, current_model_var)
    
    # Update the global variable if it exists
    if 'CURRENT_MODEL_KEY' in globals():
        CURRENT_MODEL_KEY = actual_model_name
    
    # Check if best_thresholds_df is available
    if 'best_thresholds_df' in globals() and globals()['best_thresholds_df'] is not None and isinstance(globals()['best_thresholds_df'], pd.DataFrame) and not globals()['best_thresholds_df'].empty:
        # Create a pivot table to compare different score columns across models
        # Rows are models; columns are a two-level index (metric, score column).
        # aggfunc='first' keeps the first value for each (model, score column) pair.
        score_comparison_df = best_thresholds_df.pivot_table(
            index='model_name',
            columns='score_column',
            values=['f1_score', 'f2_score', 'precision', 'recall', 'matthews_corr'],
            aggfunc='first'
        )
        
        # Define the score columns we want to compare
        score_columns = best_thresholds_df['score_column'].unique()
        
        # Print summary of results
        print("\nMeta Judge Analysis Summary:")
        print("=" * 80)
        print(f"Project: {NEO4J_PROJECT_NAME}")
        print(f"Model: {actual_model_name}")
        print(f"Optimization Metric: {OPTIMIZATION_METRIC}")
        print(f"Score Columns Evaluated: {', '.join(score_columns)}")
        
        # Identify best performing score column by optimization metric
        # Any value other than 'F1' (case-insensitive) is handled as F2.
        if OPTIMIZATION_METRIC.upper() == 'F1':
            best_score_col = best_thresholds_df.loc[best_thresholds_df['f1_score'].idxmax()]['score_column']
            sort_metric = 'f1_score'
        else:  # F2
            best_score_col = best_thresholds_df.loc[best_thresholds_df['f2_score'].idxmax()]['score_column']
            sort_metric = 'f2_score'
            
        print(f"\nBest Performing Score Column: {best_score_col} (based on {OPTIMIZATION_METRIC} score)")
        
        # Get average performance by score column
        avg_by_score_col = best_thresholds_df.groupby('score_column')[sort_metric].mean().sort_values(ascending=False)
        print("\nAverage Performance by Score Column Type:")
        for col, score in avg_by_score_col.items():
            print(f"  {col}: {score:.3f}")
        
        # Display top 3 model and score column combinations
        top_combinations = best_thresholds_df.sort_values(sort_metric, ascending=False).head(3)
        print("\nTop 3 Model and Score Column Combinations:")
        for i, (_, row) in enumerate(top_combinations.iterrows(), 1):
            print(f"  {i}. {row['model_name']} with {row['score_column']}: {row[sort_metric]:.3f} {OPTIMIZATION_METRIC}")
            print(f"     - Threshold: {row['best_threshold']:.3f}")
            print(f"     - Precision: {row['precision']:.3f}, Recall: {row['recall']:.3f}")
            print(f"     - TP: {row['true_positives']}, FP: {row['false_positives']}, FN: {row['false_negatives']}, TN: {row['true_negatives']}")
        
        # If show visualization is enabled, create visualizations
        if SHOW_VISUALIZATION:
            print("\nCreating visualization for score column comparison...")
            
            # Create a heatmap showing model performance by score column
            plt.figure(figsize=(12, 8))
            
            # Access the specific level of the MultiIndex for the optimization metric
            metric_values = score_comparison_df.loc[:, (sort_metric, slice(None))]
            
            # Flatten the MultiIndex columns
            # Only the score-column level is kept, since the metric level is constant here.
            metric_values.columns = metric_values.columns.get_level_values(1)
            
            # Create the heatmap
            sns.heatmap(metric_values, annot=True, cmap='YlGnBu', fmt='.3f',
                        linewidths=0.5, cbar_kws={'label': OPTIMIZATION_METRIC})
            
            plt.title(f'Model Performance by Score Column ({OPTIMIZATION_METRIC} Score)\nProject: {NEO4J_PROJECT_NAME}', fontsize=14)
            plt.tight_layout()
            plt.show()
            
            # Create combined bar chart for top models
            plt.figure(figsize=(12, 8))
            
            # Get the top 5 model/score combinations
            top5 = best_thresholds_df.sort_values(sort_metric, ascending=False).head(5)
            
            # Create combined labels
            # Only the part of the model name after the last '/' is shown.
            labels = [f"{row['model_name'].split('/')[-1]}\n({row['score_column']})" for _, row in top5.iterrows()]
            
            # Plot bars for key metrics
            metrics = ['precision', 'recall', 'f1_score', 'f2_score', 'matthews_corr']
            metric_labels = ['Precision', 'Recall', 'F1 Score', 'F2 Score', 'Matthews Corr']
            bar_width = 0.15
            
            x = np.arange(len(labels))
            
            # (i - 2) centers the five bars of each group on its x tick.
            for i, (metric, label) in enumerate(zip(metrics, metric_labels)):
                plt.bar(x + (i - 2) * bar_width, top5[metric], width=bar_width, 
                        label=label, zorder=2)
            
            # Set chart properties
            plt.xlabel('Model + Score Column', fontsize=12)
            plt.ylabel('Score', fontsize=12)
            plt.title(f'Top 5 Model + Score Column Combinations\nProject: {NEO4J_PROJECT_NAME}', fontsize=14)
            plt.xticks(x, labels, rotation=45, ha='right')
            plt.legend(loc='upper right')
            # zorder=1 keeps the grid behind the bars (drawn with zorder=2).
            plt.grid(axis='y', linestyle='--', alpha=0.3, zorder=1)
            
            plt.tight_layout()
            plt.show()
    else:
        print("\nNo model evaluation results available. Please run Cell 6 first to perform threshold optimization.")

In [None]:
# Cell [12] - LLM Meta Judge Metrics Comparison Whisker Chart with Confidence Intervals
# Purpose: Create box plots with statistical heatmap and confidence intervals for metrics
# Dependencies: pandas, matplotlib, seaborn, numpy
# Breadcrumbs: Visualization -> LLM Metrics Distribution -> Confidence Intervals
# notebooks/07_Meta_Judge_Analysis_Notebook.ipynb

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from matplotlib.patches import Rectangle

def extract_llm_name(model_metrics_df):
    """
    Work out which LLM produced the metrics in a DataFrame

    Lookup order: a known LLM family found in the 'model_name' column, then the
    first model name itself (text before the first '/'), then the LLM_NAME
    global, then the first value of any column whose name contains 'llm',
    and finally the default "gpt-4o".

    Parameters:
        model_metrics_df: DataFrame containing model metrics

    Returns:
        str: Name of the LLM used in the analysis ("Unknown LLM" for an empty DataFrame)
    """
    if model_metrics_df.empty:
        return "Unknown LLM"

    if 'model_name' in model_metrics_df.columns:
        # (substring to look for, family name to report); more specific
        # substrings come first so 'gpt-4o' wins over 'gpt-4'.
        known_families = (
            ('gpt-4o', 'gpt-4o'),
            ('gpt-4', 'gpt-4'),
            ('gpt-3', 'gpt-3.5'),
            ('claude', 'claude'),
            ('llama', 'llama'),
            ('falcon', 'falcon'),
            ('palm', 'palm'),
            ('gemini', 'gemini'),
            ('mistral', 'mistral'),
        )
        candidates = model_metrics_df['model_name'].astype(str).tolist()

        # Case-insensitive substring match, scanning model names in row order
        for candidate in candidates:
            lowered = candidate.lower()
            family = next(
                (label for needle, label in known_families if needle in lowered),
                None
            )
            if family is not None:
                return family

        # No known family: fall back to the first model name, keeping only
        # the text before the first slash when there is one
        if candidates:
            return candidates[0].partition('/')[0]

    # Notebook-level override
    if 'LLM_NAME' in globals():
        return globals()['LLM_NAME']

    # Any column that looks like it holds the LLM name
    for column in model_metrics_df.columns:
        if 'llm' not in column.lower():
            continue
        distinct = model_metrics_df[column].unique()
        if len(distinct) > 0 and pd.notna(distinct[0]):
            return str(distinct[0])

    # Default fallback
    return "gpt-4o"

def create_llm_metrics_whisker_plot_with_ci(model_metrics_df=None, project_name=None, show_visualization=True):
    """
    Create whisker plots showing the distribution of metrics with confidence intervals

    The figure has two panels: a box plot (with the individual points and,
    where `<metric>_ci_lower` / `<metric>_ci_upper` columns exist, the mean
    confidence interval per metric) and a heatmap table of summary statistics.
    Metrics whose column is missing from the data are skipped rather than
    aborting the whole chart.

    Parameters:
        model_metrics_df: DataFrame containing LLM model metrics (default: best_thresholds_df)
        project_name: Name of the project for visualization titles
            (default: the NEO4J_PROJECT_NAME global)
        show_visualization: Whether to display visualizations; pass None to
            use the SHOW_VISUALIZATION global

    Returns:
        dict: Dictionary containing plotting data with keys 'plot_data',
            'stats_df_numeric', 'stats_df_formatted', 'llm_name' and 'ci_data';
            an empty dict when there is nothing to plot or plotting fails
    """
    fig = None  # tracked so the figure can be released if plotting fails midway
    try:
        # Use global variables if parameters not provided
        _model_metrics_df = model_metrics_df if model_metrics_df is not None else globals().get('best_thresholds_df', pd.DataFrame())
        _project_name = project_name if project_name is not None else globals().get('NEO4J_PROJECT_NAME', 'Unknown Project')
        _show_visualization = show_visualization if show_visualization is not None else globals().get('SHOW_VISUALIZATION', True)

        if _model_metrics_df.empty:
            print("No LLM meta judge metrics data available. Please run model evaluation first.")
            return {}

        # Metrics we want to visualize, with their human-readable names
        metric_names = {
            'accuracy': 'Accuracy',
            'balanced_accuracy': 'Balanced Accuracy',
            'precision': 'Precision',
            'recall': 'Recall',
            'f1_score': 'F1 Score',
            'f2_score': 'F2 Score',
            'matthews_corr': 'Matthews Correlation'
        }

        # Only plot metrics that are actually present in the data
        metrics_to_plot = [m for m in metric_names if m in _model_metrics_df.columns]
        missing_metrics = [m for m in metric_names if m not in _model_metrics_df.columns]
        if missing_metrics:
            print(f"Skipping metrics not present in the data: {', '.join(missing_metrics)}")
        if not metrics_to_plot:
            print("None of the expected metric columns are present; nothing to plot.")
            return {}

        # Display names in plotting order; also fixes the x position of each box
        displayed_metrics = [metric_names[m] for m in metrics_to_plot]

        # Extract the LLM name from the data
        llm_name = extract_llm_name(_model_metrics_df)
        print(f"Detected LLM: {llm_name}")

        # Reshape data from wide to long format for easier plotting with seaborn
        plot_data = pd.melt(
            _model_metrics_df,
            id_vars=['model_name', 'score_column'],
            value_vars=metrics_to_plot,
            var_name='metric',
            value_name='score'
        )

        # Map metric names to their human-readable versions
        plot_data['metric'] = plot_data['metric'].map(metric_names)

        # Collect confidence interval bounds where the CI columns exist
        ci_data = {}
        for metric in metrics_to_plot:
            entry = {'lower': [], 'upper': [], 'has_ci': False}
            ci_lower_col = f"{metric}_ci_lower"
            ci_upper_col = f"{metric}_ci_upper"
            if ci_lower_col in _model_metrics_df.columns and ci_upper_col in _model_metrics_df.columns:
                entry['lower'] = _model_metrics_df[ci_lower_col].dropna().values
                entry['upper'] = _model_metrics_df[ci_upper_col].dropna().values
                entry['has_ci'] = len(entry['lower']) > 0
            ci_data[metric_names[metric]] = entry

        # Calculate metric statistics for the heatmap
        stats_list = ['min', '25%', 'mean', '50%', '75%', 'max', 'std']
        stats_names = ['Min', 'Q1', 'Mean', 'Median', 'Q3', 'Max', 'Std Dev']

        stats_values = {}
        for metric in metrics_to_plot:
            described = _model_metrics_df[metric].describe()
            stats_values[metric_names[metric]] = [described[stat] for stat in stats_list]

        # Numeric DataFrame drives the heatmap coloring
        stats_df_numeric = pd.DataFrame(stats_values, index=stats_names)

        # Formatted DataFrame supplies the cell text; tiny standard deviations
        # are shown in scientific notation so they do not render as 0.000
        stats_df_formatted = pd.DataFrame(
            {
                col: [
                    f"{stats_df_numeric.loc[idx, col]:.2e}"
                    if idx == 'Std Dev' and stats_df_numeric.loc[idx, col] < 0.001
                    else f"{stats_df_numeric.loc[idx, col]:.3f}"
                    for idx in stats_names
                ]
                for col in stats_df_numeric.columns
            },
            index=stats_names
        )

        # Create a figure with two stacked subplots
        fig = plt.figure(figsize=(14, 12))
        gs = plt.GridSpec(2, 1, height_ratios=[2, 1], hspace=0.3)

        # Box plot subplot
        ax_box = fig.add_subplot(gs[0])
        sns.boxplot(
            x='metric',
            y='score',
            data=plot_data,
            order=displayed_metrics,
            ax=ax_box,
            color='lightblue',
            width=0.5
        )

        # Add confidence intervals if available
        ci_drawn = False
        cap_width = 0.1
        for i, metric_display in enumerate(displayed_metrics):
            ci_entry = ci_data[metric_display]
            lower_bounds = ci_entry['lower']
            upper_bounds = ci_entry['upper']
            if not ci_entry['has_ci'] or len(lower_bounds) == 0 or len(upper_bounds) == 0:
                continue

            # One bar per metric: the mean of the lower and of the upper bounds
            mean_lower = np.mean(lower_bounds)
            mean_upper = np.mean(upper_bounds)

            # CI bar
            ax_box.plot([i, i], [mean_lower, mean_upper],
                        color='darkred', linewidth=3, alpha=0.7,
                        solid_capstyle='round')
            # CI caps
            ax_box.plot([i - cap_width, i + cap_width], [mean_lower, mean_lower],
                        color='darkred', linewidth=2, alpha=0.7)
            ax_box.plot([i - cap_width, i + cap_width], [mean_upper, mean_upper],
                        color='darkred', linewidth=2, alpha=0.7)
            ci_drawn = True

        # Add individual data points
        sns.stripplot(
            x='metric',
            y='score',
            data=plot_data,
            order=displayed_metrics,
            jitter=True,
            color='navy',
            marker='o',
            alpha=0.5,
            size=4,
            ax=ax_box
        )

        # Customize the box plot
        ax_box.set_title(f'Project: {_project_name} - Distribution of Meta Judge Performance Metrics', fontsize=14)
        ax_box.set_xlabel('')
        ax_box.set_ylabel('Score', fontsize=12)
        ax_box.grid(axis='y', linestyle='--', alpha=0.7)

        # Matthews correlation ranges over [-1, 1]; widen the axis downwards
        # when negative scores exist so they are not clipped out of view
        lowest_score = plot_data['score'].min()
        if pd.notna(lowest_score) and lowest_score < 0:
            y_lower = max(-1.0, lowest_score - 0.05)
        else:
            y_lower = 0.0
        ax_box.set_ylim(y_lower, 1.0)

        # Add a horizontal line at y=0.5 as a reference
        ax_box.axhline(y=0.5, color='gray', linestyle='--', alpha=0.5)

        # Add legend; the CI entry is listed only when a CI bar was drawn
        from matplotlib.patches import Patch
        from matplotlib.lines import Line2D
        legend_elements = [
            Patch(facecolor='lightblue', edgecolor='navy', alpha=0.7, label=f'Meta Judge ({llm_name})')
        ]
        if ci_drawn:
            legend_elements.append(
                Line2D([0], [0], color='darkred', linewidth=3, label='95% Confidence Interval')
            )
        ax_box.legend(handles=legend_elements, loc='upper right')

        # Heatmap subplot
        ax_heatmap = fig.add_subplot(gs[1])

        # Create the heatmap; NaN cells (e.g. Std Dev of a single row) are masked
        mask = np.isnan(stats_df_numeric.values)
        cmap = sns.light_palette("steelblue", as_cmap=True)
        sns.heatmap(
            stats_df_numeric,
            annot=stats_df_formatted.values,
            fmt="",
            cmap=cmap,
            linewidths=0.5,
            linecolor='lightgray',
            cbar=False,
            ax=ax_heatmap,
            mask=mask,
            annot_kws={"size": 10, "weight": "normal"},
            vmin=0,
            vmax=1.0
        )

        # Customize heatmap appearance
        ax_heatmap.set_title(f'Statistical Summary of Meta Judge ({llm_name}) Metrics', fontsize=12)
        ax_heatmap.set_xticklabels(ax_heatmap.get_xticklabels(), rotation=0, ha='center')

        # Adjust figure layout
        plt.subplots_adjust(hspace=0.3)

        if _show_visualization:
            plt.show()
        else:
            plt.close(fig)

        return {
            'plot_data': plot_data,
            'stats_df_numeric': stats_df_numeric,
            'stats_df_formatted': stats_df_formatted,
            'llm_name': llm_name,
            'ci_data': ci_data
        }

    except Exception as e:
        # Best-effort visualization: report the problem instead of breaking the notebook run
        print(f"Error creating LLM metrics whisker plot: {str(e)}")
        import traceback
        print(traceback.format_exc())
        # Release a half-built figure so it neither leaks nor shows up later
        if fig is not None:
            plt.close(fig)
        return {}

# Create and display metrics whisker plot with confidence intervals
# Called without arguments, so the function reads its inputs from the notebook
# globals (best_thresholds_df, NEO4J_PROJECT_NAME). The returned dict is kept
# in llm_metrics_viz_data, presumably for use by later cells (verify).
llm_metrics_viz_data = create_llm_metrics_whisker_plot_with_ci()

In [None]:
# Cell [13] - Store Meta Judge Whisker Chart Data in Neo4j with Correct Model Name
# Purpose: Store the metrics statistics data from whisker chart in Neo4j with the specific LLM model name
# Dependencies: neo4j, pandas, logging, os, dotenv
# Breadcrumbs: Data Storage -> Neo4j Persistence

import json
from datetime import datetime
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv

def store_meta_judge_whisker_data_in_neo4j(driver=None, project_name=None):
    """
    Store the Meta Judge whisker chart data in Neo4j
    
    Parameters:
        driver: Neo4j driver connection
        project_name: Project name to attach the metrics data to
    
    Returns:
        bool: True if successful, False otherwise
    """
    try:
        # Load environment variables to get the current model name
        load_dotenv()
        
        # Get the current model variable reference (e.g., OPENAI_MODEL_ID)
        model_var_name = os.getenv('CURRENT_MODEL')
        
        # Get the actual model name value from that variable (e.g., gpt-4o)
        model_name = os.getenv(model_var_name)
        
        # Make sure we have a clean model name without any score columns appended
        if model_name and '_' in model_name:
            # Check if score column names are appended to the model name
            score_keywords = ['total_score', 'is_traceable', 'actor_score', 'judge_score', 'final_score']
            for keyword in score_keywords:
                if keyword in model_name:
                    # Extract the part before the score column
                    model_name = model_name.split('_' + keyword)[0]
                    break
        
        print(f"Current model from environment: {model_name}")
        
        # Use global variables if parameters not provided
        _driver = driver if driver is not None else globals().get('driver')
        _project_name = project_name if project_name is not None else globals().get('NEO4J_PROJECT_NAME', 'Unknown Project')
        
        if not _driver:
            print("No Neo4j driver available")
            return False
        
        # DEBUG: Print all global variables that might contain DataFrame or dict data
        print("\n=== DEBUG: Available Global Variables ===")
        potential_data_vars = []
        for var_name, var_value in globals().items():
            if isinstance(var_value, (dict, pd.DataFrame)) and not var_name.startswith('_'):
                var_type = type(var_value).__name__
                var_info = ""
                if isinstance(var_value, pd.DataFrame):
                    var_info = f"DataFrame with shape {var_value.shape}"
                    if 'model_name' in var_value.columns:
                        var_info += f", contains 'model_name' column"
                    if 'metric' in var_value.columns:
                        var_info += f", contains 'metric' column"
                    if 'score_column' in var_value.columns:
                        var_info += f", contains 'score_column' column"
                elif isinstance(var_value, dict):
                    var_info = f"Dict with {len(var_value)} keys"
                    if 'stats_df_numeric' in var_value:
                        var_info += f", contains 'stats_df_numeric'"
                    if 'plot_data' in var_value:
                        var_info += f", contains 'plot_data'"
                
                potential_data_vars.append((var_name, var_type, var_info))
                print(f"- {var_name}: {var_type} - {var_info}")
        
        # Additional variable names to check beyond our previous list
        viz_var_names = [
            'metrics_viz_data', 'whisker_chart_data', 'judge_metrics_viz', 'viz_data', 'chart_data',
            'results_viz', 'model_metrics_viz', 'model_viz_data', 'llm_metrics_viz', 'meta_judge_viz',
            'metrics_data', 'model_metrics', 'meta_metrics', 'metrics_whisker',
            # Additional names that might be more specific to this notebook
            'judge_results', 'judge_data', 'performance_viz', 'evaluation_data',
            # Try looking at variable names containing 'whisker'
            *[name for name in globals() if 'whisker' in str(name).lower()],
            # Try looking at variable names containing 'viz' or 'vis'
            *[name for name in globals() if ('viz' in str(name).lower() or 'vis' in str(name).lower())]
        ]
        
        # Look for whisker chart data from cell 12
        metrics_viz_data = None
        found_var_name = None
        
        for var_name in viz_var_names:
            if var_name in globals() and globals()[var_name] is not None:
                var_value = globals()[var_name]
                
                # If it's a dictionary, check if it has the expected structure
                if isinstance(var_value, dict) and ('stats_df_numeric' in var_value or 'plot_data' in var_value):
                    metrics_viz_data = var_value
                    found_var_name = var_name
                    print(f"\nFound promising whisker chart data in variable '{var_name}'")
                    print(f"Keys: {list(var_value.keys())}")
                    break
                
                # If it's a DataFrame, it might be the actual stats data
                elif isinstance(var_value, pd.DataFrame) and any(col for col in var_value.columns if 'score' in str(col).lower() or 'metric' in str(col).lower()):
                    # Create a synthetic metrics_viz_data dict
                    print(f"\nFound DataFrame in '{var_name}' that might contain metrics data")
                    print(f"Columns: {list(var_value.columns)}")
                    
                    # If this looks like the plot data
                    if 'model_name' in var_value.columns and 'metric' in var_value.columns:
                        stats_df = var_value.groupby('metric')['score'].agg(['mean', 'min', 'max', 'std']).reset_index()
                        metrics_viz_data = {
                            'plot_data': var_value,
                            'stats_df_numeric': stats_df
                        }
                        found_var_name = var_name
                        print(f"Created metrics_viz_data from DataFrame in '{var_name}'")
                        break
        
        # If we still don't have data, look for the raw model metrics data
        if metrics_viz_data is None:
            # Look for any DataFrame that might contain model metrics
            for var_name, var_value in globals().items():
                if isinstance(var_value, pd.DataFrame) and not var_name.startswith('_'):
                    # Look for common column patterns in model metrics data
                    cols = list(var_value.columns)
                    if ('model_name' in cols or 'model' in cols) and len(var_value) > 0:
                        print(f"\nFound potential model metrics DataFrame in '{var_name}'")
                        print(f"Columns: {cols}")
                        print(f"Sample data:\n{var_value.head(2)}")
                        
                        # Use this as our data source
                        if 'model_name' not in cols and 'model' in cols:
                            var_value = var_value.rename(columns={'model': 'model_name'})
                        
                        # If we have numeric columns, we can create a stats DataFrame
                        numeric_cols = var_value.select_dtypes(include=[np.number]).columns.tolist()
                        if numeric_cols and 'model_name' in var_value.columns:
                            # Calculate stats (min, max, mean, etc.) for numeric columns
                            stats_df = pd.DataFrame()
                            for col in numeric_cols:
                                if col != 'model_name':
                                    stats = var_value[col].agg(['mean', 'min', 'max', 'std']).reset_index()
                                    stats.columns = ['Statistic', col]
                                    if stats_df.empty:
                                        stats_df = stats
                                    else:
                                        stats_df = pd.merge(stats_df, stats, on='Statistic')
                            
                            if not stats_df.empty:
                                stats_df = stats_df.set_index('Statistic')
                                # Create our viz data structure
                                metrics_viz_data = {
                                    'plot_data': var_value,
                                    'stats_df_numeric': stats_df
                                }
                                found_var_name = var_name
                                print(f"Created stats from numeric columns in '{var_name}'")
                                break
        
        if metrics_viz_data is None:
            print("\nCould not find whisker chart data in global variables")
            print("Available variables that might contain the data:")
            for var_name, var_type, var_info in potential_data_vars:
                print(f"- {var_name}: {var_type} - {var_info}")
            print("\nTry running cell 11 first to generate the whisker chart data")
            return False
        
        print(f"\nUsing data from '{found_var_name}'")
        
        # Check if we have the stats data
        if 'stats_df_numeric' not in metrics_viz_data or metrics_viz_data['stats_df_numeric'].empty:
            print("No statistical metrics data found in the data structure")
            return False
        
        # Get the stats DataFrame
        stats_df = metrics_viz_data['stats_df_numeric']
        print(f"\nStats DataFrame shape: {stats_df.shape}")
        print(f"Stats DataFrame columns: {list(stats_df.columns)}")
        print(f"Stats DataFrame index: {list(stats_df.index)}")
        
        # Prepare data for Neo4j storage - convert stats dataframe to dictionary
        metrics_data = {}
        for column in stats_df.columns:
            # Each column is a metric
            metric_stats = {}
            for idx, value in stats_df[column].items():
                # Each row is a statistic (min, max, etc.)
                # Convert to standard lowercase keys without spaces
                key = str(idx).lower().replace(' ', '_')
                try:
                    metric_stats[key] = float(value)
                except (ValueError, TypeError):
                    metric_stats[key] = str(value)
            
            # Add metric stats to overall data
            metrics_data[column.lower().replace(' ', '_')] = metric_stats
        
        # Also add the number of models analyzed
        if 'plot_data' in metrics_viz_data and not metrics_viz_data['plot_data'].empty:
            try:
                model_col = None
                for col in ['model_name', 'model', 'llm_model', 'name']:
                    if col in metrics_viz_data['plot_data'].columns:
                        model_col = col
                        break
                
                if model_col:
                    model_count = metrics_viz_data['plot_data'][model_col].nunique()
                    metrics_data['model_count'] = model_count
                    print(f"Found {model_count} unique models in the data")
                else:
                    print("Could not find model name column in plot data")
            except Exception as e:
                print(f"Error calculating model count: {str(e)}")
        
        # Ensure we store only the clean model name without any score column appendages
        clean_model_name = model_name
        if '_' in clean_model_name:
            # Additional safeguard to ensure no score metrics in model name
            score_columns = ['total_score', 'total_score_with_final', 'total_combined_score', 
                            'is_traceable', 'actor_score', 'judge_score', 'final_score']
            for score in score_columns:
                if f"_{score}" in clean_model_name:
                    clean_model_name = clean_model_name.split(f"_{score}")[0]
        
        # Add the clean model name to the metrics data
        metrics_data['current_model'] = clean_model_name
        
        # Create results_data dictionary to store TP, FP, FN, TN for each model
        results_data = {}
        
        # If we have best_thresholds_df, extract the confusion matrix data (TP, FP, FN, TN)
        if 'best_thresholds_df' in globals() and not globals()['best_thresholds_df'].empty:
            best_df = globals()['best_thresholds_df']
            print(f"\nExtracting confusion matrix data from best_thresholds_df with shape {best_df.shape}")
            
            # Confusion matrix column names mapping - USING FULL NAMES instead of abbreviations
            cm_columns = {
                'true_positives': 'true_positives',
                'false_positives': 'false_positives',
                'false_negatives': 'false_negatives',
                'true_negatives': 'true_negatives',
                # Handle alternative column names by mapping them to full names
                'tp': 'true_positives',
                'fp': 'false_positives',
                'fn': 'false_negatives',
                'tn': 'true_negatives'
            }
            
            # Check which confusion matrix columns are available
            available_cm_cols = [col for col in cm_columns.keys() if col in best_df.columns]
            
            if available_cm_cols:
                # For each model+score_column combination, store its confusion matrix data
                for _, row in best_df.iterrows():
                    # Create a unique key for this model+score combination
                    if 'model_name' in row and 'score_column' in row:
                        model_key = str(row['score_column'])  # Use only the score_column as the key
                    elif 'score_column' in row:
                        model_key = str(row['score_column'])
                    elif 'model_name' in row:
                        model_key = str(row['model_name'])
                    elif 'model' in row:
                        model_key = str(row['model'])
                    else:
                        # Default fallback key
                        model_key = f"model_{_}"
                    
                    # Standardize key formatting
                    model_key = model_key.replace('/', '_').replace(' ', '_')
                    
                    # Store confusion matrix data for this model
                    results_data[model_key] = {}
                    
                    # Extract confusion matrix values
                    for orig_col, result_col in cm_columns.items():
                        if orig_col in row:
                            val = row[orig_col]
                            if pd.notna(val):
                                results_data[model_key][result_col] = int(val)
                            else:
                                results_data[model_key][result_col] = 0
                    
                    # Add threshold if available
                    if 'best_threshold' in row:
                        results_data[model_key]['threshold'] = float(row['best_threshold']) if pd.notna(row['best_threshold']) else 0.0
                    
                    # Add score_column if available
                    if 'score_column' in row:
                        results_data[model_key]['score_type'] = str(row['score_column'])
                
                print(f"  Added confusion matrix data for {len(results_data)} score methods")
                # Show a few examples
                for model_name, data in list(results_data.items())[:3]:
                    # Display using the full names for confusion matrix values
                    confusion_info = ", ".join([f"{k}={v}" for k, v in data.items() 
                                              if k in ['true_positives', 'false_positives', 'false_negatives', 'true_negatives']])
                    print(f"    {model_name}: {confusion_info}")
                if len(results_data) > 3:
                    print(f"    ... and {len(results_data) - 3} more")
            else:
                print("No confusion matrix columns found in best_thresholds_df")
        
        # If we don't have confusion matrix data from best_thresholds_df, try to find it elsewhere
        if not results_data:
            # Try to find confusion matrix data in other potential sources
            for var_name in ['performance_df', 'performance_comparison_df', 'top5', 'top_combinations']:
                if var_name in globals() and isinstance(globals()[var_name], pd.DataFrame) and not globals()[var_name].empty:
                    df = globals()[var_name]
                    
                    # Check for confusion matrix columns
                    cm_columns = {
                        'true_positives': 'true_positives',
                        'false_positives': 'false_positives',
                        'false_negatives': 'false_negatives',
                        'true_negatives': 'true_negatives',
                        # Handle alternative column names by mapping them to full names
                        'tp': 'true_positives',
                        'fp': 'false_positives',
                        'fn': 'false_negatives',
                        'tn': 'true_negatives'
                    }
                    
                    available_cm_cols = [col for col in cm_columns.keys() if col in df.columns]
                    
                    if available_cm_cols:
                        print(f"\nFound confusion matrix data in {var_name} with columns: {available_cm_cols}")
                        
                        for _, row in df.iterrows():
                            # Create a unique key for this model+score combination
                            if 'score_column' in row:
                                model_key = str(row['score_column'])  # Use only the score_column as the key
                            elif 'model_name' in row:
                                model_key = str(row['model_name'])
                            elif 'model' in row:
                                model_key = str(row['model'])
                            else:
                                # Default fallback key
                                model_key = f"model_{_}"
                            
                            # Standardize key formatting
                            model_key = model_key.replace('/', '_').replace(' ', '_')
                            
                            # Store confusion matrix data for this model
                            results_data[model_key] = {}
                            
                            # Extract confusion matrix values
                            for orig_col, result_col in cm_columns.items():
                                if orig_col in row:
                                    val = row[orig_col]
                                    if pd.notna(val):
                                        results_data[model_key][result_col] = int(val)
                                    else:
                                        results_data[model_key][result_col] = 0
                            
                            # Add threshold if available
                            if 'best_threshold' in row:
                                results_data[model_key]['threshold'] = float(row['best_threshold']) if pd.notna(row['best_threshold']) else 0.0
                            elif 'threshold' in row:
                                results_data[model_key]['threshold'] = float(row['threshold']) if pd.notna(row['threshold']) else 0.0
                            
                            # Add score_column if available
                            if 'score_column' in row:
                                results_data[model_key]['score_type'] = str(row['score_column'])
                        
                        print(f"  Added confusion matrix data for {len(results_data)} score methods")
                        # Show a few examples
                        for model_name, data in list(results_data.items())[:3]:
                            # Display using the full names for confusion matrix values
                            confusion_info = ", ".join([f"{k}={v}" for k, v in data.items() 
                                                      if k in ['true_positives', 'false_positives', 'false_negatives', 'true_negatives']])
                            print(f"    {model_name}: {confusion_info}")
                        if len(results_data) > 3:
                            print(f"    ... and {len(results_data) - 3} more")
                        
                        break
                        
            if not results_data:
                print("\nNo confusion matrix data found in available dataframes.")
                # Creating a placeholder results_data to avoid empty field
                results_data = {"no_data": {"info": "No confusion matrix data available"}}
        
        # Add model_data with the actual whisker chart data points
        model_data = {}
        
        # Define the scoring method names we're looking for
        scoring_methods = [
            'is_traceable',
            'actor_score',
            'judge_score',
            'final_score',
            'total_score',
            'total_score_with_final',
            'total_score_all',
            'total_combined_score'
        ]
        
        # First try to find best_thresholds_df which should have the proper score names
        if 'best_thresholds_df' in globals() and not globals()['best_thresholds_df'].empty:
            best_df = globals()['best_thresholds_df']
            print(f"\nFound best_thresholds_df with shape {best_df.shape}")
            
            # Check if it has the score_column we need
            if 'score_column' in best_df.columns:
                print(f"Score columns found: {best_df['score_column'].unique()}")
                
                # Metrics we want to capture
                metrics = ['accuracy', 'balanced_accuracy', 'precision', 'recall', 
                           'f1_score', 'f2_score', 'matthews_corr']
                
                # Initialize the model_data structure
                for metric in metrics:
                    metric_key = metric.lower().replace(' ', '_')
                    model_data[metric_key] = {}
                
                # For each scoring method, get the values for each metric
                for _, row in best_df.iterrows():
                    score_method = row['score_column']
                    
                    for metric in metrics:
                        if metric in row:
                            metric_key = metric.lower().replace(' ', '_')
                            model_data[metric_key][score_method] = float(row[metric])
                
                print(f"Extracted data for {len(best_df)} scoring methods across {len(metrics)} metrics")
            else:
                print("best_thresholds_df doesn't have a score_column - looking for alternative data sources")
        
        # If we couldn't find best_thresholds_df or it doesn't have the right structure,
        # try looking for alternative data sources
        if not model_data:
            # Try to find performance_df or top5 which often has the right structure
            for var_name in ['performance_df', 'top5', 'top_combinations', 'performance_comparison_df']:
                if var_name in globals() and not globals()[var_name].empty:
                    df = globals()[var_name]
                    print(f"\nFound {var_name} with shape {df.shape}")
                    
                    # Check if it has score_column
                    if 'score_column' in df.columns:
                        print(f"Score columns found: {df['score_column'].unique()}")
                        
                        # Metrics we want to capture
                        metrics = [col for col in df.columns if col.lower() in 
                                  ['accuracy', 'balanced_accuracy', 'precision', 'recall', 
                                   'f1_score', 'f2_score', 'matthews_corr']]
                        
                        # Initialize the model_data structure
                        for metric in metrics:
                            metric_key = metric.lower().replace(' ', '_')
                            model_data[metric_key] = {}
                        
                        # For each scoring method, get the values for each metric
                        for _, row in df.iterrows():
                            score_method = row['score_column']
                            
                            for metric in metrics:
                                metric_key = metric.lower().replace(' ', '_')
                                model_data[metric_key][score_method] = float(row[metric])
                        
                        print(f"Extracted data for {len(df)} scoring methods across {len(metrics)} metrics")
                        break
            
            # If we still don't have data, try to use plot_data from the whisker chart
            if not model_data and 'plot_data' in metrics_viz_data and not metrics_viz_data['plot_data'].empty:
                plot_df = metrics_viz_data['plot_data']
                print(f"\nExamining plot_data from whisker chart with shape {plot_df.shape}")
                
                # If plot_data has both 'metric' and 'score_column'
                if 'metric' in plot_df.columns and 'score_column' in plot_df.columns:
                    print(f"Metric and score columns found in plot_data")
                    
                    metrics = plot_df['metric'].unique()
                    for metric in metrics:
                        metric_key = metric.lower().replace(' ', '_')
                        model_data[metric_key] = {}
                        
                        # Get data for this metric
                        metric_data = plot_df[plot_df['metric'] == metric]
                        
                        # Store scores by scoring method
                        for _, row in metric_data.iterrows():
                            score_method = row['score_column']
                            model_data[metric_key][score_method] = float(row['score'])
                    
                    print(f"Extracted data for {len(metrics)} metrics")
                
                # If plot_data doesn't have score_column but has metric
                elif 'metric' in plot_df.columns:
                    print(f"Only metric column found in plot_data - trying to match scores")
                    
                    # We'll need to match the scores to methods using best_thresholds_df or similar
                    metrics = plot_df['metric'].unique()
                    for metric in metrics:
                        metric_key = metric.lower().replace(' ', '_')
                        model_data[metric_key] = {}
                        
                        # Get scores for this metric
                        scores = plot_df[plot_df['metric'] == metric]['score'].values
                        
                        # Try to match scores to methods using other data sources
                        if 'best_thresholds_df' in globals():
                            best_df = globals()['best_thresholds_df']
                            # If we have best_thresholds_df with this metric and score_column
                            if metric_key in best_df.columns and 'score_column' in best_df.columns:
                                # Match scores to methods
                                for score in scores:
                                    # Find the scoring method with the closest score
                                    closest_row = best_df.iloc[(best_df[metric_key] - score).abs().argsort()[0]]
                                    score_method = closest_row['score_column']
                                    model_data[metric_key][score_method] = float(score)
                            else:
                                # Fallback to standard scoring method names if we can't match
                                for i, score in enumerate(scores):
                                    if i < len(scoring_methods):
                                        model_data[metric_key][scoring_methods[i]] = float(score)
                                    else:
                                        model_data[metric_key][f"score_{i}"] = float(score)
                        else:
                            # Fallback to standard scoring method names if we can't match
                            for i, score in enumerate(scores):
                                if i < len(scoring_methods):
                                    model_data[metric_key][scoring_methods[i]] = float(score)
                                else:
                                    model_data[metric_key][f"score_{i}"] = float(score)
                    
                    print(f"Extracted data for {len(metrics)} metrics with estimated scoring method mapping")
        
        # If we still don't have data, we need to fall back to a more direct approach
        if not model_data:
            print("\nCould not find structured data for the whisker chart points. Using direct method.")
            
            # Directly access potentially relevant datasets in a specific order
            # Getting data directly from best_thresholds_df
            if 'best_thresholds_df' in globals() and not globals()['best_thresholds_df'].empty:
                df = globals()['best_thresholds_df']
                print(f"Trying direct extraction from best_thresholds_df with shape {df.shape}")
                
                # These are the metrics we want to extract
                metrics = ['accuracy', 'balanced_accuracy', 'precision', 'recall', 
                          'f1_score', 'f2_score', 'matthews_corr']
                
                # Initialize each metric in model_data
                for metric in metrics:
                    metric_key = metric.lower().replace(' ', '_')
                    model_data[metric_key] = {}
                
                # Try to use the proper scoring method names
                # First see if we have score_column in the DataFrame
                if 'score_column' in df.columns:
                    # We can use the actual scoring method names
                    for i, row in df.iterrows():
                        score_method = row['score_column']
                        
                        for metric in metrics:
                            if metric in df.columns:
                                metric_key = metric.lower().replace(' ', '_')
                                model_data[metric_key][score_method] = float(row[metric])
                    
                    print(f"Extracted data using actual scoring method names")
                else:
                    # We need to map rows to our scoring method names
                    # Assuming rows are in the same order as our scoring_methods list
                    for i, row in df.iterrows():
                        if i < len(scoring_methods):
                            score_method = scoring_methods[i]
                        else:
                            score_method = f"score_{i}"
                            
                        for metric in metrics:
                            if metric in df.columns:
                                metric_key = metric.lower().replace(' ', '_')
                                model_data[metric_key][score_method] = float(row[metric])
                    
                    print(f"Extracted data with mapped scoring method names")
        
        # Print a summary of what we found
        print("\nModel data summary:")
        for metric, values in model_data.items():
            print(f"  {metric}: {len(values)} data points")
            for score_method, value in list(values.items())[:3]:  # Show first 3 as example
                print(f"    {score_method}: {value}")
            if len(values) > 3:
                print(f"    ... and {len(values) - 3} more")
        
        # Print the model name we're going to use
        print(f"\nUsing clean model name for Neo4j storage: '{clean_model_name}'")
        
        # Add model_data to metrics_data
        metrics_data['model_data'] = model_data
        
        # Serialize all data to JSON for Neo4j storage
        metrics_json = json.dumps(metrics_data)
        model_data_json = json.dumps(model_data)
        results_json = json.dumps(results_data)
        
        # Current timestamp for the analysis record
        timestamp = datetime.now().isoformat()
        
        # Cypher query to create metrics data connected to project
        # Using stable identifiers to avoid duplication
        query = """
        MATCH (p:Project {name: $project_name})
        MERGE (m:MetricsAnalysis {project_name: $project_name, model_type: $model_type, analysis_type: 'whisker_chart'})
        MERGE (p)-[r:HAS_METRICS_ANALYSIS {model_type: $model_type}]->(m)
        SET m.metrics_data = $metrics_data,
            m.model_count = $model_count,
            m.model_data = $model_data,
            m.results = $results_data,
            m.created_at = CASE WHEN m.created_at IS NULL THEN $timestamp ELSE m.created_at END,
            m.last_updated = $timestamp,
            r.timestamp = $timestamp,
            r.last_updated = $timestamp
        RETURN p.name as project_name, m.created_at as created_at
        """
        
        # Execute query to store metrics data
        with _driver.session() as session:
            result = session.run(
                query,
                project_name=_project_name,
                model_type=clean_model_name,  # Use the clean model name without score column appended
                timestamp=timestamp,
                metrics_data=metrics_json,
                model_data=model_data_json,
                results_data=results_json,
                model_count=metrics_data.get('model_count', 0)
            ).single()
            
            if result:
                print(f"Successfully stored whisker chart data for model {clean_model_name} in project: {result['project_name']}")
                return True
            else:
                print(f"No result returned when storing whisker chart data for project: {_project_name}")
                return False
                
    except Exception as e:
        print(f"Error storing whisker chart data in Neo4j: {str(e)}")
        import traceback
        print(traceback.format_exc())
        return False

# Store the whisker chart data in Neo4j and report the outcome.
# `success`, `model_var_name` and `model_name` are intentionally left as
# module-level names, exactly as before.
print("Starting to store whisker chart data in Neo4j...")
success = store_meta_judge_whisker_data_in_neo4j()

if success:
    # Re-resolve the model name for display. CURRENT_MODEL holds the *name* of
    # a second environment variable, and that variable holds the model id.
    load_dotenv()
    model_var_name = os.getenv('CURRENT_MODEL')
    # os.getenv(None) raises TypeError, so only dereference the indirection
    # when CURRENT_MODEL is actually set; otherwise fall back to None.
    model_name = os.getenv(model_var_name) if model_var_name else None

    # Clean model name for display
    if model_name and '_' in model_name:
        # A score column name may be appended to the model name as
        # "<model>_<score column>"; keep only the part before it.
        score_keywords = ['total_score', 'is_traceable', 'actor_score', 'judge_score', 'final_score']
        for keyword in score_keywords:
            if keyword in model_name:
                # Only the first keyword found is stripped.
                model_name = model_name.split('_' + keyword)[0]
                break

    print("\nWhisker chart data successfully stored in Neo4j")
    print("=" * 80)
    print(f"Project: {NEO4J_PROJECT_NAME}")
    print(f"Model: {model_name}")
    # Time of this report, not the timestamp written to the database.
    print(f"Timestamp: {datetime.now().isoformat()}")
else:
    print("\nFailed to store whisker chart data.")
    print("See debugging information above for details.")