# SiFP Function Point Analysis with Statistical Modeling
**Analysis of Simple Function Point (SiFP) estimation accuracy using statistical modeling, code-metrics correlation, linear and non-linear regression, and multi-model performance comparison, with data sourced from Neo4j.**

In [None]:
# Cell [0] - Jupyter Notebook Configuration
# Purpose: Configure notebook environment, import all required libraries, and set display settings for optimal analysis
# Dependencies: os, re, logging, warnings, typing, pathlib, dotenv, pandas, numpy, matplotlib, seaborn, neo4j, scipy, sklearn, xgboost, IPython
# Breadcrumbs: Setup -> Configuration

# Standard library imports - for file operations, environment variables, typing, and warnings
import os
import re
import logging
import warnings
from typing import Dict, List, Any, Tuple
from pathlib import Path
from dotenv import load_dotenv

# Data processing and manipulation - pandas and numpy are core to all analysis cells
import pandas as pd
import numpy as np

# Visualization libraries - used extensively in cells 4, 7, 8, 9, 10, 11 for plots and charts
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

# Database connectivity - needed for Neo4j graph database queries in cells 2, 3, 6
from neo4j import GraphDatabase

# Statistical analysis - scipy for statistics, used in correlation analysis
from scipy import stats
from scipy.stats import pearsonr, spearmanr, ttest_rel, ttest_ind, f_oneway, chi2_contingency
from scipy.stats import bootstrap, binomtest
from statsmodels.stats.contingency_tables import mcnemar
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.stattools import durbin_watson

# Machine learning - required for advanced modeling in cells 7-11
# Core metrics and linear models
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
# Model validation
from sklearn.model_selection import KFold, cross_val_score, train_test_split
# Feature preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Advanced modeling used in cell 11
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import xgboost as XGBRegressor

# Jupyter/IPython display tools - for rich output formatting throughout notebook
from IPython.display import display, HTML

# Suppress warnings for cleaner notebook output
warnings.filterwarnings('ignore')

# Configure logging with shorter format for better PDF wrapping
import textwrap

class PDFLoggingFormatter(logging.Formatter):
    """Formatter that hard-wraps long log lines so they fit a PDF export.

    Every physical line of the formatted record is wrapped independently at
    ``MAX_WIDTH`` characters and continuation lines are indented with
    ``CONTINUATION_INDENT``. Line breaks that are already present (multi-line
    messages, tracebacks appended for ``exc_info=True``) are preserved rather
    than being collapsed into a single paragraph.
    """

    MAX_WIDTH = 120
    CONTINUATION_INDENT = '    '

    def format(self, record):
        """Return the formatted record with over-long lines wrapped.

        Args:
            record (logging.LogRecord): Record to render.

        Returns:
            str: The text produced by ``logging.Formatter.format`` with every
                line longer than ``MAX_WIDTH`` split into several lines.
        """
        formatted = super().format(record)

        # Fast path: the whole record already fits on one line.
        if len(formatted) <= self.MAX_WIDTH:
            return formatted

        wrapped_lines = []
        # textwrap.wrap() replaces newlines with spaces, so it must only ever
        # see one physical line at a time or tracebacks become unreadable.
        for line in formatted.split('\n'):
            if len(line) <= self.MAX_WIDTH:
                wrapped_lines.append(line)
                continue
            pieces = textwrap.wrap(
                line,
                width=self.MAX_WIDTH,
                subsequent_indent=self.CONTINUATION_INDENT,
            )
            # wrap() returns [] for whitespace-only input; keep the line slot.
            wrapped_lines.extend(pieces or [''])

        return '\n'.join(wrapped_lines)

# Build the wrapping formatter up front; it is attached once the root handlers exist
pdf_formatter = PDFLoggingFormatter('%(asctime)s - %(levelname)s - %(message)s')

# (Re)initialise root logging; force=True replaces handlers installed by earlier runs
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)

# Module-level logger used by the cells below
logger = logging.getLogger(__name__)

# Swap the plain formatter on every root handler for the wrapping one
for handler in logging.root.handlers:
    handler.setFormatter(pdf_formatter)

# Pandas display settings sized for legal-landscape output
pd.options.display.width = 130                    # wrap threshold for legal landscape
pd.options.display.max_columns = 25               # reasonable number of columns
pd.options.display.max_colwidth = 25              # compact column width
pd.options.display.precision = 2                  # two decimal places to save space
pd.options.display.float_format = '{:.2f}'.format  # consistent float formatting
pd.options.display.max_rows = None                # show all rows
# expand_frame_repr is deliberately left at its default so frames wrap naturally at 130 chars

# Configure matplotlib settings for consistent visualizations.
# The style sheet is applied FIRST: plt.style.use() overwrites rcParams (the seaborn
# sheets define their own figure size and 'default' resets everything), so calling it
# after the explicit assignments below would silently discard them.
# Matplotlib 3.6 renamed the bundled seaborn sheets to 'seaborn-v0_8' and newer releases
# no longer ship the old name, so both spellings are tried before falling back to 'default'.
plt.style.use(next(
    (name for name in ('seaborn', 'seaborn-v0_8') if name in plt.style.available),
    'default',
))
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 100

print("Notebook Configuration Complete ✓")

In [None]:
# Cell [1] - Environment Setup
# Purpose: Load environment variables and configure Neo4j connection parameters from .env file
# Dependencies: dotenv, os, logging
# Breadcrumbs: Setup -> Environment

# Load variables from the .env file into the process environment
load_dotenv()

# Neo4j connection settings; each one is None when the variable is not defined
NEO4J_URI = os.environ.get('NEO4J_URI')
NEO4J_USER = os.environ.get('NEO4J_USER')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD')
NEO4J_PROJECT_NAME = os.environ.get('NEO4J_PROJECT_NAME')

# Report what was picked up (the password is intentionally never logged)
logger.info("Environment variables loaded")
logger.info(f"Using project: {NEO4J_PROJECT_NAME}")
logger.info(f"Neo4j URI: {NEO4J_URI or 'Not set'}")

In [None]:
# Cell [2] - Neo4j Connection Setup
# Purpose: Create Neo4j driver instance and establish database connection for querying SIFP data
# Dependencies: neo4j, logging
# Breadcrumbs: Setup -> Database Connection

def create_neo4j_driver():
    """Create a Neo4j driver and confirm that the server is reachable.

    Builds a driver from the module-level ``NEO4J_URI``, ``NEO4J_USER`` and
    ``NEO4J_PASSWORD`` values (read from the environment in the previous cell)
    and then calls ``verify_connectivity()``. ``GraphDatabase.driver()`` on its
    own does not open a connection, so without that call a wrong URI or wrong
    credentials would only surface at the first query.

    Returns:
        neo4j.Driver: Driver instance whose connectivity has been verified.

    Raises:
        ValueError: If ``NEO4J_URI`` is not set.
        Exception: Any error raised by the Neo4j driver while creating the
            driver or verifying connectivity (for example authentication or
            service-unavailable errors). The error is logged and re-raised.

    Example:
        >>> driver = create_neo4j_driver()
        >>> with driver.session() as session:
        ...     result = session.run("MATCH (n) RETURN count(n)")
        ...     print(result.single()[0])
    """
    driver = None
    try:
        if not NEO4J_URI:
            raise ValueError("NEO4J_URI is not set; check the .env file")
        driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
        # Force a round trip now so the success message below is truthful
        driver.verify_connectivity()
    except Exception as e:
        logger.error(f"Failed to connect to Neo4j: {str(e)}")
        # Do not leak a half-initialised driver when verification fails
        if driver is not None:
            driver.close()
        raise

    logger.info("Successfully connected to Neo4j database")
    return driver

# Create Neo4j driver
driver = create_neo4j_driver()

In [None]:
# Cell [3] - Statistical Utility Functions  
# Purpose: Define comprehensive statistical analysis functions for correlation, comparison, bootstrap, and regression diagnostics
# Dependencies: numpy, scipy, sklearn, statsmodels
# Breadcrumbs: Setup -> Statistical Tools

def calculate_correlation_with_stats(x, y, method='pearson'):
    """Calculate a correlation coefficient with significance and effect-size labels.

    Pairs containing a missing value in either vector are dropped, the
    correlation is computed with SciPy, and a 95% confidence interval is derived
    with the Fisher z-transformation.

    Args:
        x (array-like): First data vector. ``None`` entries are treated as NaN.
        y (array-like): Second data vector, same length as ``x``.
        method (str, optional): ``'pearson'`` selects the Pearson correlation;
            any other value selects the Spearman rank correlation. Defaults to
            ``'pearson'``.

    Returns:
        dict: Always contains the same keys:
            - correlation (float): Correlation coefficient, NaN if undefined
            - p_value (float): Two-tailed p-value
            - confidence_interval (tuple): 95% interval (lower, upper); NaN
              bounds when it cannot be computed
            - n_samples (int): Number of valid paired observations used
            - interpretation (str): "<effect size> effect, <significance>" or
              'Insufficient data'
            - effect_size (str): 'negligible', 'small', 'medium', 'large',
              'very large', or 'n/a' when there is insufficient data
            - significance (str): Significance description, or 'n/a' when there
              is insufficient data

    Notes:
        - Pairs are matched by position, not by pandas index.
        - Fewer than 3 valid pairs yields the 'Insufficient data' result.
        - The confidence interval needs more than 3 pairs and |r| < 0.99;
          otherwise both bounds are NaN.
        - A NaN correlation (e.g. constant input) is labelled 'negligible'.
        - Effect size thresholds on |r|: < 0.1 negligible, < 0.3 small,
          < 0.5 medium, < 0.7 large, otherwise very large.

    Example:
        >>> result = calculate_correlation_with_stats([1, 2, 3, 4, 5], [2, 4, 6, 8, 10])
        >>> print(result['interpretation'])
        very large effect, highly significant (p < 0.001)
    """
    # Coerce first so lists, Series and None entries are all handled the same way
    x_arr = np.asarray(x, dtype=float)
    y_arr = np.asarray(y, dtype=float)
    valid = ~(np.isnan(x_arr) | np.isnan(y_arr))
    x_clean = x_arr[valid]
    y_clean = y_arr[valid]
    n = len(x_clean)

    if n < 3:
        # Same key set as the normal return so callers never hit a KeyError
        return {
            'correlation': np.nan,
            'p_value': np.nan,
            'confidence_interval': (np.nan, np.nan),
            'n_samples': n,
            'interpretation': 'Insufficient data',
            'effect_size': 'n/a',
            'significance': 'n/a',
        }

    if method == 'pearson':
        corr, p_val = pearsonr(x_clean, y_clean)
    else:
        corr, p_val = spearmanr(x_clean, y_clean)

    # Normalise SciPy's return values to plain Python floats
    try:
        corr = float(corr)
    except (TypeError, ValueError):
        corr = np.nan
    try:
        p_val = float(p_val)
    except (TypeError, ValueError):
        p_val = 1.0

    # 95% confidence interval via Fisher z (arctanh) and its inverse (tanh).
    # The guard keeps z finite and the standard error defined (n - 3 > 0).
    ci_lower = np.nan
    ci_upper = np.nan
    if n > 3 and not np.isnan(corr) and abs(corr) < 0.99:
        z = np.arctanh(corr)
        half_width = stats.norm.ppf(0.975) / np.sqrt(n - 3)
        ci_lower = float(np.tanh(z - half_width))
        ci_upper = float(np.tanh(z + half_width))

    # Significance label; a NaN p-value fails every comparison and lands in the last branch
    if p_val < 0.001:
        significance = "highly significant (p < 0.001)"
    elif p_val < 0.01:
        significance = "very significant (p < 0.01)"
    elif p_val < 0.05:
        significance = "significant (p < 0.05)"
    elif p_val < 0.1:
        significance = "marginally significant (p < 0.1)"
    else:
        significance = "not significant (p ≥ 0.1)"

    # Effect size label; an undefined correlation is treated as 0
    abs_corr = 0.0 if np.isnan(corr) else abs(corr)
    if abs_corr < 0.1:
        effect_size = "negligible"
    elif abs_corr < 0.3:
        effect_size = "small"
    elif abs_corr < 0.5:
        effect_size = "medium"
    elif abs_corr < 0.7:
        effect_size = "large"
    else:
        effect_size = "very large"

    return {
        'correlation': corr,
        'p_value': p_val,
        'confidence_interval': (ci_lower, ci_upper),
        'n_samples': n,
        'interpretation': f"{effect_size} effect, {significance}",
        'effect_size': effect_size,
        'significance': significance,
    }

def compare_paired_metrics(metric1, metric2, metric_names=('Metric 1', 'Metric 2'), alpha=0.05):
    """Compare two sets of paired measurements with a paired t-test.

    Runs ``scipy.stats.ttest_rel`` on the valid pairs, computes Cohen's d for
    paired samples (mean difference divided by the standard deviation of the
    differences), and a t-based confidence interval for the mean difference.

    Args:
        metric1 (array-like): First set of paired measurements.
        metric2 (array-like): Second set, same length as ``metric1``.
        metric_names (tuple, optional): Names used when reporting which metric
            is larger. Defaults to ('Metric 1', 'Metric 2').
        alpha (float, optional): Significance level; also sets the confidence
            level (1 - alpha) of the interval. Defaults to 0.05.

    Returns:
        dict: Always contains the same keys:
            - n_samples (int): Number of valid paired observations
            - t_statistic (float): Paired t-test statistic
            - p_value (float): Two-tailed p-value
            - effect_size (float): Cohen's d for paired samples
            - mean_difference (float): Mean of (metric1 - metric2)
            - confidence_interval (tuple): Interval for the mean difference
            - better_metric (str): Name of the metric with the larger mean when
              the difference is significant, otherwise 'neither'
            - significance (str): Description of statistical significance
            - effect_interpretation (str): 'negligible', 'small', 'medium' or
              'large'
            - interpretation (str): Combined summary
        With fewer than 3 valid pairs the numeric entries are NaN, the label
        entries are 'n/a', and ``interpretation`` is
        'Insufficient data for statistical testing'.

    Notes:
        - Pairs with a missing value in either vector are dropped.
        - ``better_metric`` assumes that a higher value is better.
        - The paired t-test assumes the pairwise differences are approximately
          normally distributed.
        - Cohen's d thresholds on |d|: < 0.2 negligible, < 0.5 small,
          < 0.8 medium, otherwise large.
        - When every difference is identical the standard deviation is zero:
          d is reported as 0.0 if the mean difference is 0, otherwise as
          signed infinity.

    Example:
        >>> before = [100, 110, 105, 120, 115]
        >>> after = [95, 102, 98, 112, 108]
        >>> result = compare_paired_metrics(before, after, ('Before', 'After'))
        >>> print(f"Mean improvement: {result['mean_difference']:.2f}")
        Mean improvement: 7.00
    """
    # Coerce first so lists, Series and None entries are handled positionally
    first = np.asarray(metric1, dtype=float)
    second = np.asarray(metric2, dtype=float)
    valid = ~(np.isnan(first) | np.isnan(second))
    m1_clean = first[valid]
    m2_clean = second[valid]
    n = len(m1_clean)

    if n < 3:
        # Same key set as the normal return so callers never hit a KeyError
        return {
            'n_samples': n,
            't_statistic': np.nan,
            'p_value': np.nan,
            'effect_size': np.nan,
            'mean_difference': np.nan,
            'confidence_interval': (np.nan, np.nan),
            'better_metric': 'neither',
            'significance': 'n/a',
            'effect_interpretation': 'n/a',
            'interpretation': 'Insufficient data for statistical testing',
        }

    t_stat, p_val = ttest_rel(m1_clean, m2_clean)

    diff = m1_clean - m2_clean
    mean_diff = np.mean(diff)
    sd_diff = np.std(diff, ddof=1)

    # Cohen's d for paired samples. A zero spread would give 0/0 = NaN, which
    # fails every threshold comparison below and would be mislabelled "large".
    if sd_diff > 0:
        effect_size = mean_diff / sd_diff
    elif mean_diff == 0:
        effect_size = 0.0
    else:
        effect_size = float(np.copysign(np.inf, mean_diff))

    # Confidence interval for the mean difference
    margin = stats.t.ppf(1 - alpha / 2, n - 1) * stats.sem(diff)
    ci_lower = mean_diff - margin
    ci_upper = mean_diff + margin

    if p_val < alpha:
        significance = f"significant difference (p = {p_val:.4f})"
        better_metric = metric_names[0] if mean_diff > 0 else metric_names[1]
    else:
        significance = f"no significant difference (p = {p_val:.4f})"
        better_metric = "neither"

    abs_effect = abs(effect_size)
    if abs_effect < 0.2:
        effect_interpretation = "negligible"
    elif abs_effect < 0.5:
        effect_interpretation = "small"
    elif abs_effect < 0.8:
        effect_interpretation = "medium"
    else:
        effect_interpretation = "large"

    return {
        'n_samples': n,
        't_statistic': t_stat,
        'p_value': p_val,
        'effect_size': effect_size,
        'mean_difference': mean_diff,
        'confidence_interval': (ci_lower, ci_upper),
        'better_metric': better_metric,
        'significance': significance,
        'effect_interpretation': effect_interpretation,
        'interpretation': f"{effect_interpretation} effect size, {significance}",
    }

def bootstrap_confidence_interval(data, statistic_func, confidence_level=0.95, n_bootstrap=1000):
    """Estimate a percentile bootstrap confidence interval for a statistic.

    The data are resampled with replacement ``n_bootstrap`` times,
    ``statistic_func`` is evaluated on every resample, and the interval bounds
    are taken from the percentiles of the resulting values.

    Args:
        data (array-like): One-dimensional numeric data. Missing values (NaN or
            ``None``) are removed before resampling.
        statistic_func (callable): Function taking an array and returning a
            single numeric value.
        confidence_level (float, optional): Confidence level as a proportion in
            (0, 1). Defaults to 0.95.
        n_bootstrap (int, optional): Number of resamples. Defaults to 1000.

    Returns:
        dict: Always contains the same keys:
            - statistic (float): Statistic on the cleaned data (NaN when there
              are fewer than 3 valid observations)
            - confidence_interval (tuple): (lower, upper); NaN bounds on failure
            - bootstrap_std (float): Standard deviation of the bootstrap
              statistics; NaN on failure
            - n_bootstrap (int): Number of resamples that produced a non-NaN
              statistic
            - interpretation (str): Formatted interval, or
              'Insufficient data for bootstrap' / 'Bootstrap failed'

    Notes:
        - Fewer than 3 valid observations returns the insufficient-data result.
        - Fewer than 10 usable resamples returns the 'Bootstrap failed' result.
        - Calls ``np.random.seed(42)`` for reproducibility. This resets NumPy's
          GLOBAL random state, so random draws made after this call are
          affected as well.

    Example:
        >>> data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        >>> result = bootstrap_confidence_interval(data, np.mean, confidence_level=0.95)
        >>> print(f"Mean: {result['statistic']:.2f}")
        Mean: 5.50
        >>> lower, upper = result['confidence_interval']
        >>> lower < 5.5 < upper
        True
    """
    values = np.asarray(data, dtype=float)
    clean_data = values[~np.isnan(values)]

    if len(clean_data) < 3:
        return {
            'statistic': np.nan,
            'confidence_interval': (np.nan, np.nan),
            'bootstrap_std': np.nan,
            'n_bootstrap': 0,
            'interpretation': 'Insufficient data for bootstrap',
        }

    original_stat = statistic_func(clean_data)

    # NOTE: seeds the global NumPy RNG (kept for backward-compatible results)
    np.random.seed(42)
    sample_size = len(clean_data)
    bootstrap_stats = []
    for _ in range(n_bootstrap):
        boot_sample = np.random.choice(clean_data, size=sample_size, replace=True)
        boot_stat = statistic_func(boot_sample)
        # Resamples on which the statistic is undefined are skipped
        if not np.isnan(boot_stat):
            bootstrap_stats.append(boot_stat)

    if len(bootstrap_stats) < 10:
        return {
            'statistic': original_stat,
            'confidence_interval': (np.nan, np.nan),
            'bootstrap_std': np.nan,
            'n_bootstrap': len(bootstrap_stats),
            'interpretation': 'Bootstrap failed',
        }

    # Percentile method: cut alpha/2 from each tail of the bootstrap distribution
    alpha = 1 - confidence_level
    ci_lower = np.percentile(bootstrap_stats, 100 * alpha / 2)
    ci_upper = np.percentile(bootstrap_stats, 100 * (1 - alpha / 2))

    return {
        'statistic': original_stat,
        'confidence_interval': (ci_lower, ci_upper),
        'bootstrap_std': np.std(bootstrap_stats),
        'n_bootstrap': len(bootstrap_stats),
        'interpretation': f"{confidence_level*100:.0f}% CI: [{ci_lower:.4f}, {ci_upper:.4f}]",
    }

def regression_diagnostics(X, y, model_name="Model"):
    """Fit an OLS regression with an intercept and report diagnostics.

    Rows with a missing value in ``y`` or in any column of ``X`` are dropped,
    an ordinary least squares model is fitted with statsmodels, and fit
    statistics plus Breusch-Pagan and Durbin-Watson diagnostics are collected.

    Args:
        X (array-like): Feature matrix, or a single feature as a 1-D sequence.
            Lists are converted to float arrays; NumPy and pandas inputs are
            passed to statsmodels unchanged (a 1-D input is reshaped to one
            column).
        y (array-like): Target values, one per row of ``X``.
        model_name (str, optional): Label used in the interpretation string.
            Defaults to "Model".

    Returns:
        dict: With fewer than 5 valid rows only ``n_samples`` and
        ``interpretation`` ('Insufficient data for regression diagnostics')
        are present. Otherwise:
            - n_samples (int): Number of rows used
            - r_squared, adj_r_squared (float): Goodness of fit
            - f_statistic, f_pvalue (float): Overall F-test
            - coefficients, coef_pvalues, coef_confidence_intervals: As
              returned by the fitted statsmodels result (constant first)
            - rmse, mae (float): In-sample residual error
            - heteroscedasticity_test (dict): Breusch-Pagan statistic, p-value
              and label ('Homoscedastic' when p > 0.05), or
              {'interpretation': 'Test failed', 'error': ...}
            - durbin_watson (dict): Statistic and label ('No autocorrelation'
              when 1.5 < DW < 2.5), or
              {'interpretation': 'Test failed', 'error': ...}
            - interpretation (str): "<model_name>: R² = ..., <significance>"
    """
    # Plain sequences have no .ndim and do not support boolean-mask indexing
    if not hasattr(X, 'ndim'):
        X = np.asarray(X, dtype=float)
    if not hasattr(y, 'ndim'):
        y = np.asarray(y, dtype=float)

    if X.ndim == 1:
        # np.asarray also covers a pandas Series, which has no reshape()
        X = np.asarray(X).reshape(-1, 1)

    # Build the row mask on float views so it is positional for pandas inputs too
    missing_y = np.isnan(np.asarray(y, dtype=float))
    missing_x = np.isnan(np.asarray(X, dtype=float)).any(axis=1)
    mask = ~(missing_y | missing_x)
    X_clean = X[mask]
    y_clean = y[mask]

    if len(X_clean) < 5:
        return {
            'n_samples': len(X_clean),
            'interpretation': 'Insufficient data for regression diagnostics',
        }

    # statsmodels OLS does not add an intercept on its own
    X_with_const = sm.add_constant(X_clean)
    model = sm.OLS(y_clean, X_with_const).fit()

    y_pred = model.predict(X_with_const)
    residuals = y_clean - y_pred

    results = {
        'n_samples': len(X_clean),
        'r_squared': model.rsquared,
        'adj_r_squared': model.rsquared_adj,
        'f_statistic': model.fvalue,
        'f_pvalue': model.f_pvalue,
        'coefficients': model.params,
        'coef_pvalues': model.pvalues,
        'coef_confidence_intervals': model.conf_int(),
        'rmse': np.sqrt(np.mean(residuals**2)),
        'mae': np.mean(np.abs(residuals)),
    }

    # Heteroscedasticity test (Breusch-Pagan); a failure is reported, not raised
    try:
        bp_stat, bp_pvalue, _, _ = het_breuschpagan(residuals, X_with_const)
        results['heteroscedasticity_test'] = {
            'bp_statistic': bp_stat,
            'bp_pvalue': bp_pvalue,
            'interpretation': 'Homoscedastic' if bp_pvalue > 0.05 else 'Heteroscedastic',
        }
    except Exception as exc:
        results['heteroscedasticity_test'] = {'interpretation': 'Test failed', 'error': str(exc)}

    # Durbin-Watson statistic for residual autocorrelation
    try:
        dw_stat = durbin_watson(residuals)
        results['durbin_watson'] = {
            'statistic': dw_stat,
            'interpretation': 'No autocorrelation' if 1.5 < dw_stat < 2.5 else 'Possible autocorrelation',
        }
    except Exception as exc:
        results['durbin_watson'] = {'interpretation': 'Test failed', 'error': str(exc)}

    # Overall model significance from the F-test p-value
    f_pvalue = results['f_pvalue']
    if f_pvalue < 0.001:
        model_significance = "highly significant (p < 0.001)"
    elif f_pvalue < 0.01:
        model_significance = "very significant (p < 0.01)"
    elif f_pvalue < 0.05:
        model_significance = "significant (p < 0.05)"
    else:
        model_significance = f"not significant (p = {f_pvalue:.4f})"

    results['interpretation'] = f"{model_name}: R² = {results['r_squared']:.4f}, {model_significance}"

    return results

# Announce the helpers defined in this cell
print("Statistical utility functions loaded successfully ✓")
print(
    "Available functions:",
    "- calculate_correlation_with_stats(): Correlation with p-values and confidence intervals",
    "- compare_paired_metrics(): Paired statistical tests for metric comparison",
    "- bootstrap_confidence_interval(): Bootstrap confidence intervals",
    "- regression_diagnostics(): Comprehensive regression statistics",
    sep="\n",
)


In [None]:
# Cell [4] - Query SIFP Estimation Data  
# Purpose: Query SIFP estimation data from Neo4j database and process into structured DataFrame for analysis
# Dependencies: pandas, neo4j, os, logging
# Breadcrumbs: Data Acquisition -> SIFP Data

def query_sifp_estimations(driver) -> pd.DataFrame:
    """Query SiFP estimation data from Neo4j and return it as a DataFrame.

    Runs a Cypher query that collects the SIFP_ESTIMATION relationships of the
    requirements contained in the configured project, decodes the JSON-encoded
    ``actor_analysis``, ``judge_evaluation`` and ``final_estimation``
    properties with APOC, and returns one row per (requirement id, model) pair.

    Args:
        driver (neo4j.Driver): Open Neo4j driver used to create the session.

    Returns:
        pd.DataFrame: One row per (requirement id, model) with the columns
            - sifp_requirement_id: ``r.id`` of the requirement
            - sifp_model: ``model`` property of the estimation
            - sifp_is_valid: ``is_valid`` property of the estimation
            - sifp_judge_score: ``judge_score`` property
            - sifp_judge_confidence: ``judge_confidence`` property
            - sifp_actor_confidence: ``confidence`` from the actor analysis
            - sifp_actor_total: ``sifp_points.total`` from the actor analysis
            - sifp_judge_ugep_accuracy: ``ugep_accuracy`` from the judge evaluation
            - sifp_judge_ugdg_accuracy: ``ugdg_accuracy`` from the judge evaluation
            - sifp_judge_calculation_accuracy: ``calculation_accuracy`` from
              the judge evaluation
            - sifp_judge_classification_accuracy:
              ``component_classification_accuracy`` from the judge evaluation
            - sifp_final_total: ``sifp_points.total`` from the final estimation
        When nothing matches, or when the query fails, an EMPTY DataFrame that
        still carries these columns is returned, so column access in later
        cells does not raise ``KeyError``.

    Environment Variables:
        SIFP_ESTIMATION_REQUIREMENT (str, optional): Requirement type filter;
            defaults to 'SOURCE'.
        The project name comes from the module-level ``NEO4J_PROJECT_NAME``.

    Notes:
        - Only estimations whose three JSON properties are all non-null are
          considered; rows whose actor analysis or final estimation does not
          start with '{' are dropped, while a judge evaluation that does not
          start with '{' yields null judge accuracy columns.
        - When several estimations exist for the same requirement id and
          model, only the first collected one is kept.
        - Errors are logged with their traceback and NOT re-raised; check
          ``df.empty`` to detect a failed or empty query.
        - Requires the APOC plugin (``apoc.convert.fromJsonMap``).

    Example:
        >>> driver = create_neo4j_driver()
        >>> df = query_sifp_estimations(driver)
        >>> print(f"Retrieved {len(df)} SIFP estimations")
    """
    # Column layout of the result, in RETURN order; used for the empty fallback
    result_columns = [
        'sifp_requirement_id', 'sifp_model', 'sifp_is_valid',
        'sifp_judge_score', 'sifp_judge_confidence',
        'sifp_actor_confidence', 'sifp_actor_total',
        'sifp_judge_ugep_accuracy', 'sifp_judge_ugdg_accuracy',
        'sifp_judge_calculation_accuracy', 'sifp_judge_classification_accuracy',
        'sifp_final_total',
    ]

    try:
        # Get SIFP_ESTIMATION_REQUIREMENT from environment
        sifp_estimation_requirement = os.getenv('SIFP_ESTIMATION_REQUIREMENT', 'SOURCE')
        
        sifp_query = """
        MATCH (p:Project {name: $project_name})
        MATCH (p)-[:CONTAINS*]->(n)
        MATCH (r:Requirement)-[s:SIFP_ESTIMATION]->(e)
        WHERE r = n
        AND r.type = $requirement_type
        WITH r, s
        WHERE s.actor_analysis IS NOT NULL
        AND s.judge_evaluation IS NOT NULL
        AND s.final_estimation IS NOT NULL
        WITH r, s,
            CASE
                WHEN s.actor_analysis STARTS WITH '{'
                THEN apoc.convert.fromJsonMap(s.actor_analysis)
                ELSE NULL
            END as actor_analysis,
            CASE
                WHEN s.judge_evaluation STARTS WITH '{'
                THEN apoc.convert.fromJsonMap(s.judge_evaluation)
                ELSE NULL
            END as judge_eval,
            CASE
                WHEN s.final_estimation STARTS WITH '{'
                THEN apoc.convert.fromJsonMap(s.final_estimation)
                ELSE NULL
            END as final_est
        WHERE actor_analysis IS NOT NULL
        AND final_est IS NOT NULL
        WITH r.id as sifp_requirement_id,
            s.is_valid as sifp_is_valid,
            s.model as sifp_model,
            s.judge_score as sifp_judge_score,
            s.judge_confidence as sifp_judge_confidence,
            // Actor Analysis values
            actor_analysis.confidence as sifp_actor_confidence,
            actor_analysis.sifp_points.total as sifp_actor_total,
            // Judge Evaluation values
            judge_eval.ugep_accuracy as sifp_judge_ugep_accuracy,
            judge_eval.ugdg_accuracy as sifp_judge_ugdg_accuracy,
            judge_eval.calculation_accuracy as sifp_judge_calculation_accuracy,
            judge_eval.component_classification_accuracy as sifp_judge_classification_accuracy,
            // Final Estimation values
            final_est.sifp_points.total as sifp_final_total
        WITH sifp_requirement_id, sifp_model,
            COLLECT([sifp_is_valid, sifp_judge_score, sifp_judge_confidence,
            sifp_actor_confidence, sifp_actor_total, sifp_judge_ugep_accuracy,
            sifp_judge_ugdg_accuracy, sifp_judge_calculation_accuracy,
            sifp_judge_classification_accuracy, sifp_final_total])[0] as fields
        RETURN 
            sifp_requirement_id,
            sifp_model,
            fields[0] as sifp_is_valid,
            fields[1] as sifp_judge_score,
            fields[2] as sifp_judge_confidence,
            // Actor Analysis values
            fields[3] as sifp_actor_confidence,
            fields[4] as sifp_actor_total,
            // Judge Evaluation values
            fields[5] as sifp_judge_ugep_accuracy,
            fields[6] as sifp_judge_ugdg_accuracy,
            fields[7] as sifp_judge_calculation_accuracy,
            fields[8] as sifp_judge_classification_accuracy,
            // Final Estimation values
            fields[9] as sifp_final_total
        """
        
        with driver.session() as session:
            # Values are passed as query parameters, never interpolated into the Cypher text
            results = session.run(sifp_query, 
                                 project_name=NEO4J_PROJECT_NAME,
                                 requirement_type=sifp_estimation_requirement).data()
            
            if not results:
                logger.warning(f"No SIFP estimation results found for project: {NEO4J_PROJECT_NAME} with requirement type: {sifp_estimation_requirement}")
                return pd.DataFrame(columns=result_columns)
            
            df = pd.DataFrame(results)
            
            # Log summary statistics
            logger.info(f"Retrieved {len(df)} SIFP estimation results for project: {NEO4J_PROJECT_NAME}")
            logger.info(f"Requirement type: {sifp_estimation_requirement}")
            logger.info(f"Number of unique requirements: {df['sifp_requirement_id'].nunique()}")
            logger.info(f"Number of unique models: {df['sifp_model'].nunique()}")
            
            # Calculate and log some basic statistics
            logger.info("Summary Statistics:")
            logger.info(f"Average actor total points: {df['sifp_actor_total'].mean():.2f}")
            logger.info(f"Average final total points: {df['sifp_final_total'].mean():.2f}")
            logger.info(f"Average judge score: {df['sifp_judge_score'].mean():.2f}")
            
            return df
            
    except Exception as e:
        # Best-effort by design: log with traceback and hand back an empty, well-formed frame
        logger.error(f"Error querying Neo4j for SIFP estimations: {str(e)}", exc_info=True)
        return pd.DataFrame(columns=result_columns)

# Execute query and get results
sifp_results_df = query_sifp_estimations(driver)

# Numeric columns summarised below and reused by the per-model averages in Cell [5]
numerical_columns = [
    'sifp_judge_score', 'sifp_actor_total', 'sifp_final_total',
    'sifp_judge_ugep_accuracy', 'sifp_judge_ugdg_accuracy',
    'sifp_judge_calculation_accuracy', 'sifp_judge_classification_accuracy'
]

# Display sample of results and dataset info
print("SIFP Estimation Analysis:")
print("=" * 80)
print(f"Total number of estimations: {len(sifp_results_df)}")

if sifp_results_df.empty:
    # query_sifp_estimations() returns an empty frame when nothing matched or the
    # query failed; indexing its columns here would raise KeyError, so stop early.
    print("No SIFP estimation data available - check the Neo4j connection, "
          "NEO4J_PROJECT_NAME and SIFP_ESTIMATION_REQUIREMENT.")
else:
    print(f"Unique requirements: {sifp_results_df['sifp_requirement_id'].nunique()}")
    print(f"Unique models: {sifp_results_df['sifp_model'].nunique()}")

    print("Sample of SIFP estimation results:")
    display(sifp_results_df.head())

    # Display summary statistics for numerical columns
    print("Summary Statistics for Key Metrics:")
    print("-" * 50)
    print(sifp_results_df[numerical_columns].describe())

In [None]:
# Cell [5] - Display Selected SIFP Metrics by Model
# Purpose: Display and analyze SIFP metrics grouped by model with basic visualizations and statistical summaries  
# Dependencies: pandas, matplotlib, seaborn, os
# Breadcrumbs: Data Analysis -> Model Metrics

# Columns shown in the per-requirement preview table
selected_columns = [
    'sifp_requirement_id',
    'sifp_model',
    'sifp_actor_total',
    'sifp_final_total'
]

# Get visualization setting from environment (read before the data check so
# the flag is always defined for this cell)
SHOW_VISUALIZATION = os.getenv('SHOW_VISUALIZATION', 'False').lower() == 'true'

if sifp_results_df.empty:
    # The query cell yields a column-less DataFrame when nothing was retrieved;
    # every column lookup below would raise KeyError in that case.
    print(f"No SIFP estimation results available for project {NEO4J_PROJECT_NAME}; skipping model metrics.")
else:
    print(f"Selected SIFP Metrics for Project: {NEO4J_PROJECT_NAME}")
    print("=" * 80)
    display(sifp_results_df[selected_columns].head())

    # Display counts by model
    print(f"Count of estimations by model for Project {NEO4J_PROJECT_NAME}:")
    print("-" * 40)
    print(sifp_results_df['sifp_model'].value_counts())

    # Add model-specific analysis
    print(f"Project-specific SIFP Analysis for {NEO4J_PROJECT_NAME}:")
    print("-" * 40)

    # Mean of every column in numerical_columns, one row per model
    model_avg = sifp_results_df.groupby('sifp_model')[numerical_columns].mean().reset_index()
    print("Average estimated size by model:")
    print(model_avg)

    # Mean of the sifp_is_valid column expressed as a percentage
    valid_percent = sifp_results_df['sifp_is_valid'].mean() * 100
    print(f"Percentage of valid estimations: {valid_percent:.2f}%")

    # Create a bar chart comparing models only if visualization is enabled
    if SHOW_VISUALIZATION:
        plt.figure(figsize=(12, 6))
        sns.barplot(x='sifp_model', y='sifp_final_total', data=model_avg, palette='viridis')
        plt.title('Average Estimated Size by Model')
        plt.xlabel('Model')
        plt.ylabel('SIFP Points')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
    else:
        print("(Visualizations disabled. Set SHOW_VISUALIZATION=True in .env to enable.)")

In [None]:
# Cell [6] - Load and Display SLOC Metrics
# Purpose: Load Java code metrics CSV file and display summary statistics for code size analysis
# Dependencies: pandas, pathlib, logging
# Breadcrumbs: Data Acquisition -> Code Metrics

try:
    # Primary location: ../datasets/<project>/<project>/java.csv, relative to the notebook
    file_path = Path('..') / 'datasets' / NEO4J_PROJECT_NAME / NEO4J_PROJECT_NAME / 'java.csv'

    # Log attempt to read file
    logger.info(f"Attempting to read Java metrics CSV file for project {NEO4J_PROJECT_NAME} from: {file_path.absolute()}")

    # Fall back to a list of alternative file names when the primary file is missing
    if not file_path.exists():
        logger.warning(f"Java metrics file not found at: {file_path.absolute()}")
        logger.warning("Checking for alternative file paths...")

        alt_paths = [
            Path('..') / 'datasets' / NEO4J_PROJECT_NAME / f"{NEO4J_PROJECT_NAME}-SLOC Metrics.csv",
            Path('..') / 'datasets' / NEO4J_PROJECT_NAME / 'SLOC Metrics.csv',
            Path('..') / 'datasets' / NEO4J_PROJECT_NAME / f"{NEO4J_PROJECT_NAME}.csv"
        ]

        # First existing alternative wins
        for alt_path in alt_paths:
            logger.info(f"Checking alternative path: {alt_path.absolute()}")
            if alt_path.exists():
                logger.info(f"Found file at alternative path: {alt_path.absolute()}")
                file_path = alt_path
                break

        # If still not found, raise error (handled by the except below)
        if not file_path.exists():
            raise FileNotFoundError(f"Java metrics file not found for project {NEO4J_PROJECT_NAME}.")

    # Read into a temporary name: df_sloc is only published once processing
    # succeeds, so a failed transform does not leave a raw frame without the
    # 'File' column for later cells to trip over.
    raw_sloc = pd.read_csv(file_path)

    # Log successful read
    logger.info(f"Successfully loaded Java metrics CSV file with shape: {raw_sloc.shape}")

    # Process the data based on the file format
    if 'Kind' in raw_sloc.columns and 'Name' in raw_sloc.columns:
        logger.info("Detected standard Java metrics format - filtering for File entries only")

        # Filter for File entries only
        entry_kind = 'File'
        file_data = raw_sloc[raw_sloc['Kind'] == 'File']

        if len(file_data) == 0:
            # If no File entries, check if there are any Class entries as fallback
            class_data = raw_sloc[raw_sloc['Kind'].str.contains('Class', na=False)]

            if len(class_data) > 0:
                logger.info(f"Using {len(class_data)} Class entries as fallback")
                file_data = class_data
                entry_kind = 'Class'
            else:
                raise ValueError("No File or Class entries found in the Java metrics file")

        # Report the kind actually used (File, or Class after the fallback)
        logger.info(f"Filtered to {len(file_data)} {entry_kind} entries")

        # Required columns: a missing one raises KeyError, reported by the except below
        transformed_columns = {
            'File': file_data['Name'],
            # Text after the last '.' in the name; non-string names (e.g. NaN) pass through unchanged
            'target_type': file_data['Name'].apply(
                lambda x: x.split('.')[-1] if isinstance(x, str) and '.' in x else x
            ),
            'Lines': file_data['CountLine'],
            'Comments': file_data['CountLineComment'],
            'Blanks': file_data['CountLineBlank'],
            'Code': file_data['CountLineCode'],
        }

        # Optional columns: filled with None when the source column is absent
        optional_columns = {
            'Lines-exe': 'CountLineCodeExe',
            'Lines-dec': 'CountLineCodeDecl',
            'Stmt-exe': 'CountStmtExe',
            'Stmt-dec': 'CountStmtDecl',
            'Units': 'CountDeclMethod',
        }
        for target_name, source_name in optional_columns.items():
            transformed_columns[target_name] = (
                file_data[source_name] if source_name in file_data.columns else None
            )

        # Publish the transformed frame
        df_sloc = pd.DataFrame(transformed_columns)
        logger.info(f"Transformed data to required format with shape: {df_sloc.shape}")
    else:
        # Any other CSV layout is used as-is
        df_sloc = raw_sloc

    # Display basic information about the dataframe
    print(f"Java Metrics for Project: {NEO4J_PROJECT_NAME}")
    print("=" * 80)
    print("DataFrame Info:")
    # info() writes its report itself and returns None; wrapping it in print()
    # would emit a stray "None" line.
    df_sloc.info()

    print("First few rows of the data:")
    display(df_sloc.head())

    # Display summary statistics
    print("Summary Statistics:")
    print("-" * 50)
    numeric_cols = df_sloc.select_dtypes(include=['number']).columns
    print(df_sloc[numeric_cols].describe())

    # Log some key metrics
    logger.info(f"Total number of files: {len(df_sloc)}")
    if 'Code' in df_sloc.columns:
        logger.info(f"Total lines of code: {df_sloc['Code'].sum()}")
    if 'Lines-exe' in df_sloc.columns:
        logger.info(f"Total executable lines: {df_sloc['Lines-exe'].sum()}")
    if 'Lines-dec' in df_sloc.columns:
        logger.info(f"Total declarative lines: {df_sloc['Lines-dec'].sum()}")
    if 'Units' in df_sloc.columns:
        logger.info(f"Total units/methods: {df_sloc['Units'].sum()}")

except Exception as e:
    # Best-effort cell: report the problem and let the notebook continue
    logger.error(f"Error loading Java metrics file: {str(e)}", exc_info=True)
    print(f"Error loading Java metrics for project {NEO4J_PROJECT_NAME}")
    print(f"Error details: {str(e)}")
    print("Continuing with analysis without Java metrics data...")

In [None]:
# Cell [7] - Map Requirements to Code Files
# Purpose: Query Neo4j to map requirements to code files and create synthetic mappings for analysis
# Dependencies: pandas, neo4j, logging
# Breadcrumbs: Data Preparation -> Requirement Mapping

def map_requirements_to_code():
    """
    Query Neo4j for ground-truth target requirements and derive Java file names

    Reads the module-level ``driver`` and ``NEO4J_PROJECT_NAME``. Each distinct
    target requirement reached through a GROUND_TRUTH relationship becomes one
    row; its id is converted to an ``EA<number>.java`` file name where a number
    can be extracted.

    Returns:
    --------
    pd.DataFrame
        Columns ``target_id``, ``code_file_path``, ``ground_truth`` and
        ``java_file_name`` (None where no number could be extracted), plus
        ``numeric_id`` when the fallback mapping had to run. An empty DataFrame
        is returned when no mappings exist or any error occurs (errors are
        logged, not raised).
    """
    def map_cc_to_ea_file(cc_id):
        """Map a string id such as 'CC_123' or 'CC123' to 'EA123.java'; None if not possible."""
        if cc_id and isinstance(cc_id, str):
            # Format CC_123 -> 123
            if cc_id.startswith('CC_'):
                num = cc_id[3:]
            # Format CC123 -> 123
            elif cc_id.startswith('CC'):
                num = cc_id[2:]
            # Any other string: keep only its digits
            else:
                num = ''.join(c for c in cc_id if c.isdigit())

            # Only return a mapped file name if we have a number
            if num:
                return f"EA{num}.java"
        return None

    def extract_digits(raw_id):
        """Digits of a string or integer id; None for anything else (None, NaN, floats, '')."""
        if isinstance(raw_id, str):
            return ''.join(c for c in raw_id if c.isdigit()) if raw_id else None
        # Integer ids are not iterable, so they must be stringified first;
        # bool is excluded because it is a subclass of int.
        if isinstance(raw_id, (int, np.integer)) and not isinstance(raw_id, bool):
            return ''.join(c for c in str(raw_id) if c.isdigit())
        return None

    try:
        # Use an approach that captures ALL target requirements with a GROUND_TRUTH relationship
        with driver.session() as session:
            # Modified query to get all target requirements with a GROUND_TRUTH relationship
            alt_query = """
            MATCH (p:Project {name: $project_name})-[:CONTAINS]->(d:Document)-[:CONTAINS]->(source:Requirement)-[r:GROUND_TRUTH]->(target:Requirement)
            WHERE source.type = 'SOURCE' AND target.type = 'TARGET'
            RETURN DISTINCT
                target.id as target_id,
                'Derived from ground truth' as code_file_path,
                1 as ground_truth
            """

            alt_results = session.run(alt_query, project_name=NEO4J_PROJECT_NAME).data()

            if not alt_results:
                logger.warning("No ground truth mappings found")
                return pd.DataFrame()

            logger.info(f"Found {len(alt_results)} ground truth mappings")

            # Create DataFrame from results
            mapping_df = pd.DataFrame(alt_results)

            # Log distribution of target ID prefixes (text before the first '_').
            # Missing ids are dropped and the rest stringified so that a
            # non-string id cannot abort the whole mapping from a log statement.
            target_prefixes = (
                mapping_df['target_id']
                .dropna()
                .astype(str)
                .apply(lambda x: x.split('_')[0] if '_' in x else x)
                .value_counts()
            )
            logger.info(f"Target ID prefix distribution: {target_prefixes.to_dict()}")

            # Apply the mapping function to target_id
            mapping_df['java_file_name'] = mapping_df['target_id'].apply(map_cc_to_ea_file)

            # Count how many mappings were successfully created
            mapped_count = mapping_df['java_file_name'].notna().sum()
            logger.info(f"Successfully mapped {mapped_count} of {len(mapping_df)} target IDs to Java file names")

            # For any rows where java_file_name is None, try alternate mappings
            if mapped_count < len(mapping_df):
                logger.info("Applying fallback mapping for non-standard target IDs")

                unmapped_mask = mapping_df['java_file_name'].isna()
                unmapped_ids = mapping_df.loc[unmapped_mask, 'target_id']

                # The primary mapper only accepts strings, so the rows left here
                # are digit-free strings or non-string ids; extract_digits
                # handles integer ids instead of raising TypeError.
                mapping_df.loc[unmapped_mask, 'numeric_id'] = unmapped_ids.apply(extract_digits)

                # Create java_file_name from numeric ID where possible
                # (an empty digit string yields no file name)
                has_numeric = unmapped_mask & mapping_df['numeric_id'].notna()
                mapping_df.loc[has_numeric, 'java_file_name'] = \
                    mapping_df.loc[has_numeric, 'numeric_id'].apply(
                        lambda x: f"EA{x}.java" if x else None
                    )

                # Count final mapping success
                final_mapped_count = mapping_df['java_file_name'].notna().sum()
                logger.info(f"After fallback, mapped {final_mapped_count} of {len(mapping_df)} target IDs")

            return mapping_df

    except Exception as e:
        logger.error(f"Error querying requirement to code mappings: {str(e)}")
        logger.error("Exception details:", exc_info=True)
        return pd.DataFrame()  # Return empty DataFrame on error

# Get requirement to code mappings
req_code_mappings = map_requirements_to_code()

# Display the mappings
if not req_code_mappings.empty:
    print("Requirement to Code File Mappings:")
    print("=" * 80)
    display(req_code_mappings.head())
    print(f"Total mappings: {len(req_code_mappings)}")
    print(f"Unique target requirements: {req_code_mappings['target_id'].nunique()}")

    # Check how many target IDs could be mapped to Java file names
    mapped_count = req_code_mappings['java_file_name'].notna().sum()
    print(f"Target requirements mapped to Java files: {mapped_count} ({mapped_count/len(req_code_mappings)*100:.1f}%)")

    # List the most common target ID prefixes: text before the first '_',
    # or the first two characters when the id has no '_'
    if 'target_id' in req_code_mappings.columns:
        # Drop missing ids and stringify so non-string ids cannot raise here
        target_prefixes = (
            req_code_mappings['target_id']
            .dropna()
            .astype(str)
            .apply(lambda x: x.split('_')[0] if '_' in x else x[:2])
            .value_counts()
        )
        print("Target ID prefix distribution:")
        for prefix, count in target_prefixes.items():
            print(f"  {prefix}: {count} ({count/len(req_code_mappings)*100:.1f}%)")
else:
    print("No requirement to code file mappings found.")
    print("Creating synthetic mappings based on requirement IDs...")

    # Requirement ids to synthesize from; an empty SIFP result set has no
    # 'sifp_requirement_id' column, so guard the lookup
    if 'sifp_requirement_id' in sifp_results_df.columns:
        unique_requirements = sifp_results_df['sifp_requirement_id'].unique()
    else:
        unique_requirements = []

    # Iterate through requirements to create proper mappings
    mappings_data = []
    for req_id in unique_requirements:
        if isinstance(req_id, str) and req_id.startswith('CC'):
            # Strip the prefix the same way map_cc_to_ea_file does:
            # CC_167 -> 167 and CC167 -> 167 (previously CC_167 became "_167")
            cc_number = req_id[3:] if req_id.startswith('CC_') else req_id[2:]
            if not cc_number:
                # A bare prefix carries no number to build a file name from
                continue

            mappings_data.append({
                'target_id': req_id,  # Use the same ID as target_id
                'code_file_path': "Synthetic mapping",
                'java_file_name': f"EA{cc_number}.java",  # Matching EA file name
                'ground_truth': 0  # Mark these as non-ground-truth mappings
            })

    # Explicit columns keep the expected schema even when nothing matched
    req_code_mappings = pd.DataFrame(
        mappings_data,
        columns=['target_id', 'code_file_path', 'java_file_name', 'ground_truth']
    )

    print(f"Created {len(req_code_mappings)} synthetic mappings with matching CC and EA numbers")
    display(req_code_mappings.head())

In [None]:
# Cell [8] - Merge SIFP Estimates with Code Metrics and Visualize Relationships
# Purpose: Merge SIFP data with code metrics and create correlation visualizations to analyze relationships
# Dependencies: pandas, numpy, sklearn, matplotlib, seaborn, os
# Breadcrumbs: Data Analysis -> SIFP-Code Correlation

def prepare_file_level_analysis():
    """
    Prepare data for file-level analysis by joining SIFP results, code mappings, and code metrics

    Reads the module-level ``sifp_results_df``, ``req_code_mappings`` and
    ``df_sloc``. SIFP rows are joined to mappings on requirement id, then to
    code metrics on a cleaned file identifier ('EA123.java' -> '123' on both
    sides). If that yields nothing, a looser join on the digits of the
    identifiers is tried.

    Side effects: helper columns are added in place to the global frames
    (``normalized_req_id``, ``normalized_java_file``, ``normalized_file``,
    ``clean_file_id`` and, when the fallback runs, ``numeric_id`` on df_sloc).

    Returns:
    --------
    pd.DataFrame
        Merged SIFP data and code metrics; an empty DataFrame when inputs are
        missing or any error occurs (errors are logged, not raised).
    """
    def strip_ea_java(name):
        """'EA123.java' -> '123'; any other value is returned unchanged."""
        if name and isinstance(name, str) and name.startswith('EA') and name.endswith('.java'):
            return name[2:-5]
        return name

    def clean_mapping_id(name):
        """Identifier for a mapped Java file name: strips 'EA'/'.java', or just '.java'."""
        cleaned = strip_ea_java(name)
        if cleaned is name and name and isinstance(name, str) and name.endswith('.java'):
            return name.replace('.java', '')
        return cleaned

    def clean_sloc_id(name):
        """Identifier for a metrics file name, normalized like the mapping side."""
        if name and isinstance(name, str):
            if name.startswith('EA') and name.endswith('.java'):
                # Must strip the EA prefix too, otherwise 'EA123' can never
                # equal the mapping-side identifier '123'
                return name[2:-5]
            if '.' in name:
                return name.split('.')[0]
        return name

    def digits_only(value):
        """Concatenated digits of the value's string form; '' for falsy values."""
        return ''.join(c for c in str(value) if c.isdigit()) if value else ''

    try:
        # Check if the required dataframes are available
        if 'sifp_results_df' not in globals() or 'req_code_mappings' not in globals() or 'df_sloc' not in globals():
            logger.warning("Required data not available for file-level analysis")
            return pd.DataFrame()

        # Step 1: add normalized helper columns to the source frames

        # SIFP requirement IDs without the 'CC_' prefix
        sifp_results_df['normalized_req_id'] = sifp_results_df['sifp_requirement_id'].apply(
            lambda x: x[3:] if x and isinstance(x, str) and x.startswith('CC_') else x
        )

        # Metrics file names up to the first '.'
        df_sloc['normalized_file'] = df_sloc['File'].apply(
            lambda x: x.split('.')[0] if x and isinstance(x, str) and '.' in x else x
        )

        # Mapped Java file names without 'EA' prefix and '.java' suffix
        req_code_mappings['normalized_java_file'] = req_code_mappings['java_file_name'].apply(strip_ea_java)

        # Step 2: Merge SIFP results with requirement to code mappings
        logger.info(f"Before first merge: SIFP results: {len(sifp_results_df)} rows, Code mappings: {len(req_code_mappings)} rows")
        merged_df = pd.merge(
            sifp_results_df,
            req_code_mappings,
            left_on='sifp_requirement_id',
            right_on='target_id',
            how='inner'
        )
        logger.info(f"After first merge: {len(merged_df)} rows with code file mappings")

        # Step 3: build the same clean identifier on both sides of the join
        merged_df['clean_file_id'] = merged_df['java_file_name'].apply(clean_mapping_id)
        df_sloc['clean_file_id'] = df_sloc['File'].apply(clean_sloc_id)

        # Log some sample mappings for debugging
        logger.info(f"SIFP file identifiers: {merged_df['clean_file_id'].head(5).tolist()}")
        logger.info(f"SLOC file identifiers: {df_sloc['clean_file_id'].head(5).tolist()}")

        # Step 4: Perform the merge using the cleaned identifiers
        final_df = pd.merge(
            merged_df,
            df_sloc,
            on='clean_file_id',
            how='inner'
        )

        # Log the results of this merge
        logger.info(f"After precise file identifier matching: {len(final_df)} rows")

        # If the precise matching failed, try a fallback approach with just the numeric parts
        if len(final_df) == 0:
            logger.info("Precise matching failed, trying fallback with numeric identifiers")

            # Extract just the numeric part from both identifiers
            merged_df['numeric_id'] = merged_df['clean_file_id'].apply(digits_only)
            df_sloc['numeric_id'] = df_sloc['clean_file_id'].apply(digits_only)

            # Only match rows whose numeric part has at least 1 digit
            merged_df_valid = merged_df[merged_df['numeric_id'].str.len() > 0]
            df_sloc_valid = df_sloc[df_sloc['numeric_id'].str.len() > 0]

            logger.info(f"Valid numeric IDs - SIFP: {len(merged_df_valid)}, SLOC: {len(df_sloc_valid)}")

            # Perform the merge with numeric IDs
            final_df = pd.merge(
                merged_df_valid,
                df_sloc_valid,
                on='numeric_id',
                how='inner'
            )

            logger.info(f"After numeric matching: {len(final_df)} rows")

        # More than twice the mapped rows suggests duplicate matches:
        # keep one row per (requirement, model, file)
        if len(final_df) > len(merged_df) * 2:
            logger.warning(f"Too many matches: {len(final_df)} rows. Filtering to ensure one-to-one mappings.")

            group_cols = ['sifp_requirement_id', 'sifp_model', 'File']
            # drop_duplicates keeps whole rows; groupby().first() would take
            # the first non-null value per column and could mix values from
            # different rows into one
            final_df = final_df.drop_duplicates(subset=group_cols, keep='first').reset_index(drop=True)

            logger.info(f"After filtering to unique combinations: {len(final_df)} rows")

        return final_df

    except Exception as e:
        logger.error(f"Error preparing file-level analysis: {str(e)}")
        logger.error("Exception details:", exc_info=True)
        return pd.DataFrame()  # Return empty DataFrame on error

# Execute the function to merge data
file_analysis_df = prepare_file_level_analysis()

# Display the merged data
if not file_analysis_df.empty:
    print("Merged SIFP and Code Metrics Data:")
    print("=" * 80)
    display(file_analysis_df.head())
    print(f"Total file-level entries: {len(file_analysis_df)}")
    print(f"Unique code files: {file_analysis_df['File'].nunique()}")
    print(f"Unique requirements: {file_analysis_df['sifp_requirement_id'].nunique()}")
    
    # Get all unique models from the data before any filtering
    all_models = file_analysis_df['sifp_model'].unique()
    print(f"Unique models before filtering: {len(all_models)}")
    print(f"Available models: {all_models}")
    
    # Define code metrics for analysis - only include ones that exist in the dataframe
    code_metrics = ['Code', 'Lines', 'Lines-exe', 'Lines-dec', 'Units']
    code_metrics = [m for m in code_metrics if m in file_analysis_df.columns]
    
    # Get the model ID variable names from environment variable
    analysis_model_vars = os.getenv('RESULTS_ANALYSIS_MODEL_IDS', '')
    print(f"DEBUG - Raw RESULTS_ANALYSIS_MODEL_IDS from env: '{analysis_model_vars}'")
    
    # Resolve environment variable names to actual model IDs
    if analysis_model_vars:
        # Split by comma and strip whitespace to get variable names
        model_var_names = [var_name.strip() for var_name in analysis_model_vars.split(',')]
        print(f"Model variable names: {model_var_names}")
        
        # Look up each variable in the environment to get actual model IDs
        selected_models = []
        for var_name in model_var_names:
            model_id = os.getenv(var_name, '')
            if model_id:
                print(f"  {var_name} = {model_id}")
                selected_models.append(model_id)
            else:
                print(f"  WARNING: Environment variable {var_name} not found or empty")
        
        print(f"Resolved model IDs: {selected_models}")
        
        # Filter to only include models from the environment variable that exist in the data
        models = [model for model in selected_models if model in all_models]
        print(f"Models found in dataset: {models}")
        
        if len(models) == 0 and len(all_models) > 0:
            print(f"WARNING: No models from RESULTS_ANALYSIS_MODEL_IDS found in data!")
            print(f"Available models in data: {all_models}")
            print("Falling back to all models")
            # Fall back to all models if none match
            models = all_models
        else:
            print(f"SUCCESS: Will use only these models for visualization: {models}")
            
        # CRITICAL CHANGE: Filter the dataframe to only include selected models
        print(f"Before filtering: dataset has {len(file_analysis_df)} rows with {file_analysis_df['sifp_model'].nunique()} unique models")
        file_analysis_df = file_analysis_df[file_analysis_df['sifp_model'].isin(models)]
        print(f"After filtering: dataset has {len(file_analysis_df)} rows with {file_analysis_df['sifp_model'].nunique()} unique models")
        print(f"Models in filtered dataset: {file_analysis_df['sifp_model'].unique()}")
    else:
        # Use all models if no filter is specified
        models = all_models
        print(f"No RESULTS_ANALYSIS_MODEL_IDS specified, using all {len(models)} models")
    
    # Get visualization setting from environment
    SHOW_VISUALIZATION = os.getenv('SHOW_VISUALIZATION', 'False').lower() == 'true'
    
    if SHOW_VISUALIZATION:
        # Create a color map for the models - use more distinguishable colors
        colors = plt.cm.tab10(np.arange(10))  # Use tab10 for more distinguishable colors
        model_colors = {model: colors[i % len(colors)] for i, model in enumerate(models)}
        
        # Create markers for different models
        markers = ['o', 's', '^', 'D', 'v', '<', '>', 'p', '*', 'h']
        model_markers = {model: markers[i % len(markers)] for i, model in enumerate(models)}
        
        # Create line styles for different models
        linestyles = ['-', '--', ':', '-.', (0, (3, 1, 1, 1)), (0, (5, 1))]
        model_linestyles = {model: linestyles[i % len(linestyles)] for i, model in enumerate(models)}
        
        # First iterate through metrics
        for metric in code_metrics:
            # Skip if metric not available
            if metric not in file_analysis_df.columns:
                continue
            
            # 1. Create plot for Actor SIFP vs Metric (all models on the same plot)
            # Size optimized for a 6.5" wide document with bottom legend
            fig_height = 4.5 + (0.25 * len(models))  # Adjust height based on number of models
            plt.figure(figsize=(6.5, fig_height))
            
            # Create a list to store legend handles
            legend_handles = []
            
            # Track max values for setting plot limits
            max_x_val = 0
            max_y_val = 0
            min_x_val = float('inf')
            
            # Now iterate through models to add each to the same plot
            for i, model in enumerate(models):
                # Filter data for this model
                model_data = file_analysis_df[file_analysis_df['sifp_model'] == model]
                
                # Skip if not enough data
                if len(model_data) < 3:
                    print(f"Skipping {model} - insufficient data points ({len(model_data)} records)")
                    continue
                
                # Get color, marker, and linestyle for this model
                color = model_colors[model]
                marker = model_markers[model]
                linestyle = model_linestyles[model]
                
                # Create scatter plot for this model
                scatter = plt.scatter(
                    model_data['sifp_actor_total'], 
                    model_data[metric],
                    color=color, 
                    marker=marker,
                    alpha=0.7, 
                    s=40,  # Slightly smaller points
                )
                
                # Get max values for axis scaling
                if model_data['sifp_actor_total'].max() > max_x_val:
                    max_x_val = model_data['sifp_actor_total'].max()
                if model_data[metric].max() > max_y_val:
                    max_y_val = model_data[metric].max()
                if model_data['sifp_actor_total'].min() < min_x_val and model_data['sifp_actor_total'].min() > 0:
                    min_x_val = model_data['sifp_actor_total'].min()
                
                # Add regression line for this model
                X = model_data['sifp_actor_total'].values.reshape(-1, 1)
                y = model_data[metric].values
                
                # Skip if NaN values present
                if np.isnan(X).any() or np.isnan(y).any():
                    valid_indices = ~(np.isnan(X).any(axis=1) | np.isnan(y))
                    X = X[valid_indices]
                    y = y[valid_indices]
                
                if len(X) >= 3:
                    lr = LinearRegression()
                    lr.fit(X, y)
                    
                    # Calculate R²
                    r2 = r2_score(y, lr.predict(X))
                    
                    # Create a right-justified formula string
                    model_name = f"{model}"
                    formula = f"$\\mathbf{{{lr.coef_[0]:.1f}×SIFP + {lr.intercept_:.0f}}}$ ($\\mathbf{{R²={r2:.2f}}}$)"
                    
                    # Create a combined legend entry with both marker and line
                    legend_handles.append(
                        plt.Line2D([0], [0], color=color, marker=marker, linestyle=linestyle, 
                                markersize=6, linewidth=1.5, label=f"{model_name} → {formula}")
                    )
            
            # Set reasonable axis limits first so lines can use them
            plot_min_x = min_x_val * 0.9 if min_x_val != float('inf') else 0
            plot_max_x = max_x_val * 1.1
            plt.xlim(plot_min_x, plot_max_x)
            plt.ylim(0, max_y_val * 1.1)
            
            # Now add regression lines using the full plot width
            for i, model in enumerate(models):
                model_data = file_analysis_df[file_analysis_df['sifp_model'] == model]
                
                # Skip if not enough data
                if len(model_data) < 3:
                    continue
                
                # Get color and linestyle for this model
                color = model_colors[model]
                linestyle = model_linestyles[model]
                
                X = model_data['sifp_actor_total'].values.reshape(-1, 1)
                y = model_data[metric].values
                
                # Skip if NaN values present
                if np.isnan(X).any() or np.isnan(y).any():
                    valid_indices = ~(np.isnan(X).any(axis=1) | np.isnan(y))
                    X = X[valid_indices]
                    y = y[valid_indices]
                
                if len(X) >= 3:
                    lr = LinearRegression()
                    lr.fit(X, y)
                    
                    # Create x_range across the FULL plot width for the regression line
                    x_range = np.linspace(plot_min_x, plot_max_x, 100).reshape(-1, 1)
                    plt.plot(x_range, lr.predict(x_range), linestyle=linestyle, color=color, linewidth=1.5)
            
            # Set labels and title with appropriate font sizes
            plt.title(f"Project: {NEO4J_PROJECT_NAME}\nActor SIFP vs {metric}", fontsize=12)
            plt.xlabel("Actor SIFP Points", fontsize=10)
            plt.ylabel(f"{metric}", fontsize=10)
            plt.grid(alpha=0.3)
            plt.xticks(fontsize=10)
            plt.yticks(fontsize=10)
            
            # Add legend to the bottom of the plot with stacked entries
            legend = plt.legend(handles=legend_handles, 
                      loc='upper center', 
                      bbox_to_anchor=(0.5, -0.15),  # Moved down from -0.05 to -0.15 to add more space 
                      fontsize=8,
                      frameon=True,
                      ncol=1)  # One column for stacked legend
            
            # Adjust layout to make room for the taller legend at the bottom
            bottom_margin = 0.15 + (len(legend_handles) * 0.03)  # Increased from 0.05 to 0.15
            plt.subplots_adjust(bottom=bottom_margin)
            
            plt.tight_layout()
            plt.show()
            
            # 2. Create plot for Judge SIFP vs Metric (all models on the same plot)
            plt.figure(figsize=(6.5, fig_height))
            
            # Reset legend handles
            legend_handles = []
            
            # Reset max values for setting plot limits
            max_x_val = 0
            max_y_val = 0
            min_x_val = float('inf')
            
            # Now iterate through models to add each to the same plot
            for i, model in enumerate(models):
                # Filter data for this model
                model_data = file_analysis_df[file_analysis_df['sifp_model'] == model]
                
                # Skip if not enough data
                if len(model_data) < 3:
                    continue
                
                # Get color, marker, and linestyle for this model
                color = model_colors[model]
                marker = model_markers[model]
                linestyle = model_linestyles[model]
                
                # Create scatter plot for this model
                scatter = plt.scatter(
                    model_data['sifp_final_total'], 
                    model_data[metric],
                    color=color, 
                    marker=marker,
                    alpha=0.7, 
                    s=40,  # Slightly smaller points
                )
                
                # Get max values for axis scaling
                if model_data['sifp_final_total'].max() > max_x_val:
                    max_x_val = model_data['sifp_final_total'].max()
                if model_data[metric].max() > max_y_val:
                    max_y_val = model_data[metric].max()
                if model_data['sifp_final_total'].min() < min_x_val and model_data['sifp_final_total'].min() > 0:
                    min_x_val = model_data['sifp_final_total'].min()
                
                # Add regression line for this model
                X = model_data['sifp_final_total'].values.reshape(-1, 1)
                y = model_data[metric].values
                
                # Skip if NaN values present
                if np.isnan(X).any() or np.isnan(y).any():
                    valid_indices = ~(np.isnan(X).any(axis=1) | np.isnan(y))
                    X = X[valid_indices]
                    y = y[valid_indices]
                
                if len(X) >= 3:
                    lr = LinearRegression()
                    lr.fit(X, y)
                    
                    # Calculate R²
                    r2 = r2_score(y, lr.predict(X))
                    
                    # Create a right-justified formula string
                    model_name = f"{model}"
                    formula = f"$\\mathbf{{{lr.coef_[0]:.1f}×SIFP + {lr.intercept_:.0f}}}$ ($\\mathbf{{R²={r2:.2f}}}$)"
                    
                    # Create a combined legend entry with both marker and line
                    legend_handles.append(
                        plt.Line2D([0], [0], color=color, marker=marker, linestyle=linestyle, 
                                markersize=6, linewidth=1.5, label=f"{model_name} → {formula}")
                    )
            
            # Set reasonable axis limits first so lines can use them
            plot_min_x = min_x_val * 0.9 if min_x_val != float('inf') else 0
            plot_max_x = max_x_val * 1.1
            plt.xlim(plot_min_x, plot_max_x)
            plt.ylim(0, max_y_val * 1.1)
            
            # Now add regression lines using the full plot width
            for i, model in enumerate(models):
                model_data = file_analysis_df[file_analysis_df['sifp_model'] == model]
                
                # Skip if not enough data
                if len(model_data) < 3:
                    continue
                
                # Get color and linestyle for this model
                color = model_colors[model]
                linestyle = model_linestyles[model]
                
                X = model_data['sifp_final_total'].values.reshape(-1, 1)
                y = model_data[metric].values
                
                # Skip if NaN values present
                if np.isnan(X).any() or np.isnan(y).any():
                    valid_indices = ~(np.isnan(X).any(axis=1) | np.isnan(y))
                    X = X[valid_indices]
                    y = y[valid_indices]
                
                if len(X) >= 3:
                    lr = LinearRegression()
                    lr.fit(X, y)
                    
                    # Create x_range across the FULL plot width for the regression line
                    x_range = np.linspace(plot_min_x, plot_max_x, 100).reshape(-1, 1)
                    plt.plot(x_range, lr.predict(x_range), linestyle=linestyle, color=color, linewidth=1.5)
            
            # Set labels and title with appropriate font sizes
            plt.title(f"Project: {NEO4J_PROJECT_NAME}\nMeta-Judge SIFP vs {metric}", fontsize=12)
            plt.xlabel("Judge SIFP Points", fontsize=10)
            plt.ylabel(f"{metric}", fontsize=10)
            plt.grid(alpha=0.3)
            plt.xticks(fontsize=10)
            plt.yticks(fontsize=10)
            
            # Add legend to the bottom of the plot with stacked entries
            legend = plt.legend(handles=legend_handles, 
                      loc='upper center', 
                      bbox_to_anchor=(0.5, -0.15),  # Moved down from -0.05 to -0.15 to add more space
                      fontsize=8,
                      frameon=True,
                      ncol=1)  # One column for stacked legend
            
            # Adjust layout to make room for the taller legend at the bottom
            bottom_margin = 0.15 + (len(legend_handles) * 0.03)  # Increased from 0.05 to 0.15
            plt.subplots_adjust(bottom=bottom_margin)
            
            plt.tight_layout()
            plt.show()
    else:
        print("(Visualizations disabled. Set SHOW_VISUALIZATION=True in .env to enable.)")
    
    # Analyze correlation between SIFP scores and code metrics
    print("Correlation between SIFP scores and code metrics:")
    print("=" * 80)
    for metric in code_metrics:
        if metric in file_analysis_df.columns:
            corr_actor = file_analysis_df[['sifp_actor_total', metric]].corr().iloc[0, 1]
            corr_judge = file_analysis_df[['sifp_final_total', metric]].corr().iloc[0, 1]
            diff = corr_judge - corr_actor
            print(f"{metric:20} | Actor: {corr_actor:.3f} | Judge: {corr_judge:.3f} | Difference: {diff:.3f}")
    
    print("Modeling SIFP to Code Metric Relationships:")
    print("=" * 80)
    
    # Use the filtered models for analysis
    for model in models:
        model_data = file_analysis_df[file_analysis_df['sifp_model'] == model]
        
        if len(model_data) < 5:
            continue
            
        print(f"Model: {model}")
        
        for metric in code_metrics:
            if metric not in model_data.columns:
                continue
                
            # Actor SIFP relationship
            X_actor = model_data['sifp_actor_total'].values.reshape(-1, 1)
            y_actor = model_data[metric].values
            
            # Judge SIFP relationship
            X_judge = model_data['sifp_final_total'].values.reshape(-1, 1)
            y_judge = model_data[metric].values
            
            # Skip if not enough valid data
            if (np.isnan(X_actor).any() or np.isnan(y_actor).any() or 
                np.isnan(X_judge).any() or np.isnan(y_judge).any()):
                continue
                
            # Fit linear models
            actor_model = LinearRegression().fit(X_actor, y_actor)
            judge_model = LinearRegression().fit(X_judge, y_judge)
            
            # Calculate R²
            actor_r2 = r2_score(y_actor, actor_model.predict(X_actor))
            judge_r2 = r2_score(y_judge, judge_model.predict(X_judge))
            
            # Print results
            print(f"{metric} (Actor SIFP): slope = {actor_model.coef_[0]:.3f}, intercept = {actor_model.intercept_:.3f}, R² = {actor_r2:.3f}")
            print(f"{metric} (Judge SIFP): slope = {judge_model.coef_[0]:.3f}, intercept = {judge_model.intercept_:.3f}, R² = {judge_r2:.3f}")
else:
    print("No file-level analysis data available.")
    print("Could not merge SIFP estimations with code metrics.")

In [None]:
# Cell [9] - File-Level Analysis: Understanding Record Composition in Detail
# Purpose: Explain why the merged dataset holds more records than files and summarise it per model
# Dependencies: pandas, os, IPython display; reads file_analysis_df (referred to below as coming from cell 8)
# Breadcrumbs: Analysis -> Record Composition

print("File-Level Analysis: Understanding Record Composition")
print("=" * 80)

# Nothing can be summarised unless a non-empty file_analysis_df exists in the notebook namespace
if 'file_analysis_df' not in globals() or file_analysis_df.empty:
    print("No file-level analysis data available.")
    print("Cannot perform record composition analysis.")
    print("Check if cell 8 executed successfully and created file_analysis_df.")
else:
    # Headline counts for the whole merged dataset
    print(f"Total records in dataset: {len(file_analysis_df)}")
    print(f"Unique code files: {file_analysis_df['File'].nunique()}")
    print(f"Unique requirements: {file_analysis_df['sifp_requirement_id'].nunique()}")
    print(f"Unique models: {file_analysis_df['sifp_model'].nunique()}")

    # Fixed explanatory text: one record per (model, requirement, file) combination
    for line in (
        "Why are there more records than files?",
        "-" * 80,
        "Each record in this dataset represents a unique combination of:",
        "1. AI model (sifp_model)",
        "2. Requirement (sifp_requirement_id)",
        "3. Code file (File)",
        "4. Associated metrics (Lines, Code, etc.)",
        "The record count exceeds the file count because:",
        "- Multiple AI models evaluated the same codebase",
        "- Each model estimated multiple requirements",
        "- Each requirement can map to multiple code files",
        "- A single file may implement parts of multiple requirements",
    ):
        print(line)

    # SHOW_ALL_DATA (environment variable) decides between the full listing and a 50-row preview
    SHOW_ALL_DATA = os.getenv('SHOW_ALL_DATA', 'False').lower() == 'true'

    print("Detailed Breakdown of Records by Model and Metric Type")
    print("-" * 80)

    # Keep only the candidate metrics that are actual columns of the dataframe
    selected_metrics = [
        m for m in ['Code', 'Lines', 'Lines-exe', 'Lines-dec', 'Units']
        if m in file_analysis_df.columns
    ]

    models = file_analysis_df['sifp_model'].unique()

    # One summary row per model: record/file/requirement counts plus non-null counts per metric
    breakdown_data = []
    for model in models:
        model_data = file_analysis_df[file_analysis_df['sifp_model'] == model]
        record_total = len(model_data)
        model_files = model_data['File'].nunique()
        model_reqs = model_data['sifp_requirement_id'].nunique()

        breakdown_row = {
            'Model': model,
            'Total Records': record_total,
            'Unique Files': model_files,
            'Unique Requirements': model_reqs,
            # Guard the averages against a zero denominator
            'Records per File (avg)': round(record_total / model_files, 2) if model_files > 0 else 0,
            'Records per Requirement (avg)': round(record_total / model_reqs, 2) if model_reqs > 0 else 0,
        }
        breakdown_row.update(
            (f'{metric} Metrics', model_data[metric].notna().sum())
            for metric in selected_metrics
            if metric in model_data.columns
        )
        breakdown_data.append(breakdown_row)

    breakdown_df = pd.DataFrame(breakdown_data)
    display(breakdown_df)

    print("Basic Correlation Analysis with Available Data:")
    print("=" * 80)

    # Actor-vs-Judge correlation, only when both score columns are present
    if {'sifp_actor_total', 'sifp_final_total'}.issubset(file_analysis_df.columns):
        correlation = file_analysis_df[['sifp_actor_total', 'sifp_final_total']].corr().iloc[0, 1]
        print(f"Correlation between Actor and Judge SIFP scores: {correlation:.3f}")

    print("Basic Model Statistics:")
    print("-" * 50)

    for model in models:
        model_data = file_analysis_df[file_analysis_df['sifp_model'] == model]
        record_total = len(model_data)

        # Models with fewer than 5 records are reported and skipped
        if record_total < 5:
            print(f"Skipping {model} - insufficient data points ({record_total} records)")
            continue

        print(f"Model: {model}")
        print(f"  Records: {record_total}")
        print(f"  Unique files: {model_data['File'].nunique()}")
        print(f"  Unique requirements: {model_data['sifp_requirement_id'].nunique()}")

        # Mean SIFP scores, each printed only if its column exists
        if 'sifp_actor_total' in model_data.columns:
            print(f"  Mean Actor SIFP: {model_data['sifp_actor_total'].mean():.2f}")
        if 'sifp_final_total' in model_data.columns:
            print(f"  Mean Judge SIFP: {model_data['sifp_final_total'].mean():.2f}")

        # Mean of every available code metric
        for metric in selected_metrics:
            if metric in model_data.columns:
                print(f"  Mean {metric}: {model_data[metric].mean():.2f}")

    for line in (
        "Complete Record Listing - Understanding the Records/Files Relationship",
        "-" * 80,
        "This table shows all records in the dataset, ordered by model and file name.",
        "This explains why we see many more records than unique files in the analysis.",
    ):
        print(line)

    # Essential columns only, ordered by model and then file name
    record_listing = file_analysis_df[
        ['sifp_model', 'File', 'sifp_requirement_id', 'sifp_actor_total', 'sifp_final_total']
        + selected_metrics
    ].sort_values(['sifp_model', 'File'])

    if not SHOW_ALL_DATA:
        print(f"Showing first 50 of {len(record_listing)} records (set SHOW_ALL_DATA=True in .env to see all):")
        display(record_listing.head(50))
    else:
        print(f"Showing all {len(record_listing)} records:")
        display(record_listing)

    print("\nNote: Performance metrics will be calculated in the next cell.")
    print("This will enable detailed statistical analysis including paired comparisons and effect size calculations.")

print("Record composition analysis complete. Ready for performance calculation...")

In [None]:
# Cell [10] - Statistical Analysis: Paired Comparisons and Performance Testing
# Purpose: Distribute each model's total code-metric value across records in proportion to their
#          Actor / Judge SIFP points, measure the resulting prediction errors, and summarise/plot them
# Dependencies: os, pandas, numpy, matplotlib, seaborn, IPython display; reads file_analysis_df
# Breadcrumbs: Analysis -> Statistical Testing

# Initialize storage for statistical test results (created here; this cell does not populate it)
statistical_test_results = []


def compute_proportional_performance(model_data, model, metric):
    """Return per-record proportional predictions and errors for one model/metric pair.

    Each record's predicted metric value is its share of the SIFP total multiplied by
    the metric total. Totals are taken over the same rows that are scored (rows with a
    missing Actor SIFP, Judge SIFP or metric value are excluded first), so the shares
    and the distributed total refer to one consistent population.

    Args:
        model_data: DataFrame of one model's records with columns 'File',
            'sifp_requirement_id', 'sifp_actor_total', 'sifp_final_total' and ``metric``.
        model: Label stored in the 'Model' field of every returned row.
        metric: Name of the code-metric column to predict.

    Returns:
        A list of dicts, one per scored record. 'Error Improvement' is the absolute
        reduction in error (actor - judge); 'Error % Improvement' is that reduction
        relative to the actor error, in percent (0 when the actor error is 0).
        The list is empty when the metric column is absent, no complete rows exist,
        or either SIFP total is zero.
    """
    if metric not in model_data.columns:
        return []

    # Score only complete rows, and derive the totals from exactly those rows
    valid = model_data.dropna(subset=['sifp_actor_total', 'sifp_final_total', metric])
    if valid.empty:
        return []

    total_actor_sifp = valid['sifp_actor_total'].sum()
    total_judge_sifp = valid['sifp_final_total'].sum()
    total_metric = valid[metric].sum()

    # A zero SIFP total would make every share undefined
    if total_actor_sifp == 0 or total_judge_sifp == 0:
        return []

    rows = []
    for file_name, requirement, actor_sifp, judge_sifp, actual in zip(
        valid['File'],
        valid['sifp_requirement_id'],
        valid['sifp_actor_total'],
        valid['sifp_final_total'],
        valid[metric],
    ):
        # Calculate predicted values using proportional approach
        actor_prediction = (actor_sifp / total_actor_sifp) * total_metric
        judge_prediction = (judge_sifp / total_judge_sifp) * total_metric

        # Calculate absolute errors
        actor_error = abs(actor_prediction - actual)
        judge_error = abs(judge_prediction - actual)

        # Percentage errors are undefined for non-positive actual values
        if actual > 0:
            actor_pct_error = (actor_error / actual) * 100
            judge_pct_error = (judge_error / actual) * 100
        else:
            actor_pct_error = np.nan
            judge_pct_error = np.nan

        # Absolute improvement in metric units; relative improvement in percent
        error_improvement = actor_error - judge_error
        if actor_error > 0:
            error_pct_improvement = (error_improvement / actor_error) * 100
        else:
            error_pct_improvement = 0

        rows.append({
            'Model': model,
            'Code Metric': metric,
            'File': file_name,
            'Requirement': requirement,
            'Actual Value': actual,
            'Actor Prediction': actor_prediction,
            'Judge Prediction': judge_prediction,
            'Actor Error': actor_error,
            'Judge Error': judge_error,
            'Actor % Error': actor_pct_error,
            'Judge % Error': judge_pct_error,
            'Error Improvement': error_improvement,
            'Error % Improvement': error_pct_improvement,
        })
    return rows


print("Performance Calculation and Statistical Analysis")
print("=" * 80)

# Check if we have the file analysis data from previous cells
if 'file_analysis_df' in globals() and not file_analysis_df.empty:
    print(f"Starting with {len(file_analysis_df)} records from file analysis")

    # Define metrics for analysis (only those present as columns)
    selected_metrics = ['Code', 'Lines', 'Lines-exe', 'Lines-dec', 'Units']
    selected_metrics = [m for m in selected_metrics if m in file_analysis_df.columns]

    # Get unique models
    models = file_analysis_df['sifp_model'].unique()

    # Collect one performance row per (model, metric, record)
    performance_rows = []

    for model in models:
        model_data = file_analysis_df[file_analysis_df['sifp_model'] == model]

        if len(model_data) < 5:
            print(f"Skipping {model} - insufficient data points ({len(model_data)} records)")
            continue

        print(f"Analyzing Model: {model} ({len(model_data)} records)")

        for metric in selected_metrics:
            performance_rows.extend(
                compute_proportional_performance(model_data, model, metric)
            )

    # Create performance dataframe from collected data
    performance_df = pd.DataFrame(performance_rows)

    # Display summary statistics if we have performance data
    if not performance_df.empty:
        print("Performance Summary by Model and Metric:")
        print("=" * 80)

        # Group by model and metric, calculate averages
        summary = performance_df.groupby(['Model', 'Code Metric']).agg({
            'Actor Error': 'mean',
            'Judge Error': 'mean',
            'Actor % Error': 'mean',
            'Judge % Error': 'mean',
            'Error % Improvement': 'mean',
            'File': 'count'
        }).reset_index()

        # Rename File count to num_files for clarity
        summary = summary.rename(columns={'File': 'num_files'})

        # Display summary
        display(summary)

        # Get overall averages
        print("Overall Averages:")
        overall = performance_df.agg({
            'Actor Error': 'mean',
            'Judge Error': 'mean',
            'Actor % Error': 'mean',
            'Judge % Error': 'mean',
            'Error % Improvement': 'mean'
        }).reset_index()

        overall = overall.rename(columns={'index': 'Metric', 0: 'Value'})
        display(overall)

        # Calculate what percentage of file predictions improved with the judge
        improved_count = (performance_df['Error % Improvement'] > 0).sum()
        total_count = performance_df['Error % Improvement'].count()
        improvement_rate = (improved_count / total_count) * 100 if total_count > 0 else 0

        print(f"Improvement Rate: {improved_count} out of {total_count} file predictions improved ({improvement_rate:.1f}%)")

        # Get visualization setting from environment
        SHOW_VISUALIZATION = os.getenv('SHOW_VISUALIZATION', 'False').lower() == 'true'

        if SHOW_VISUALIZATION:
            # Create visualizations for each metric
            for metric in selected_metrics:
                metric_data = performance_df[performance_df['Code Metric'] == metric]

                if not metric_data.empty:
                    plt.figure(figsize=(12, 6))

                    # Boxplot of percent errors
                    plt.subplot(1, 2, 1)
                    boxdata = [
                        metric_data['Actor % Error'].dropna(),
                        metric_data['Judge % Error'].dropna()
                    ]
                    plt.boxplot(boxdata)
                    # Label the two boxes via xticks: the boxplot `labels` keyword is
                    # deprecated in newer matplotlib, while xticks works on every version
                    plt.xticks([1, 2], ['Actor', 'Judge'])
                    plt.title(f'% Error for {metric}')
                    plt.ylabel('Percent Error (%)')
                    plt.grid(True, alpha=0.3)

                    # Scatter plot of actual vs predicted, with a y = x reference line
                    plt.subplot(1, 2, 2)
                    plt.scatter(metric_data['Actual Value'], metric_data['Actor Prediction'],
                                alpha=0.7, label='Actor', marker='o')
                    plt.scatter(metric_data['Actual Value'], metric_data['Judge Prediction'],
                                alpha=0.7, label='Judge', marker='x')
                    plt.plot([metric_data['Actual Value'].min(), metric_data['Actual Value'].max()],
                             [metric_data['Actual Value'].min(), metric_data['Actual Value'].max()],
                             'k--', alpha=0.3)
                    plt.title(f'Actual vs Predicted {metric}')
                    plt.xlabel('Actual Value')
                    plt.ylabel('Predicted Value')
                    plt.legend()
                    plt.grid(True, alpha=0.3)

                    plt.tight_layout()
                    plt.show()

            # Create a heatmap of improvement by model and metric
            if len(models) > 1 and len(selected_metrics) > 1:
                pivot_data = performance_df.pivot_table(
                    index='Model',
                    columns='Code Metric',
                    values='Error % Improvement',
                    aggfunc='mean'
                )

                plt.figure(figsize=(10, 6))
                sns.heatmap(pivot_data, annot=True, cmap='RdBu_r', center=0)
                plt.title('Mean Error % Improvement (Judge vs Actor)')
                plt.ylabel('Model')
                plt.xlabel('Code Metric')
                plt.tight_layout()
                plt.show()
        else:
            print("Visualizations disabled. Set SHOW_VISUALIZATION=True in .env to enable.")

        print("Performance calculation completed successfully!")
        print(f"Created performance dataframe with {len(performance_df)} entries")

    else:
        print("No performance data could be calculated.")
        print("Check if SIFP and code metric data are available and valid.")

else:
    print("No file-level analysis data available from previous cells.")
    print("Cannot perform performance calculation.")
    print("Please ensure cell 8 executed successfully.")

print("Performance analysis complete.")

In [None]:
# Cell [11] - Detailed File-Level Prediction Accuracy using Linear Relationships
# Purpose: Calculate per-file prediction accuracy using linear regression models and analyze prediction errors
# Dependencies: pandas, numpy, sklearn, matplotlib
# Breadcrumbs: Analysis -> Linear Modeling

if not file_analysis_df.empty:
    print("Per-File Prediction Accuracy Analysis:")
    print("=" * 80)
    
    # Debugging info
    print(f"File analysis dataframe shape: {file_analysis_df.shape}")
    print(f"File analysis dataframe columns: {file_analysis_df.columns.tolist()}")
    
    # Get all models
    models = file_analysis_df['sifp_model'].unique()
    print(f"Found {len(models)} models: {models}")
    
    # Get code metrics - only include ones that actually exist in the dataframe
    all_possible_metrics = ['Lines', 'Code', 'Lines-exe', 'Lines-dec', 'Units', 'Comments', 'Blanks', 'Stmt-exe', 'Stmt-dec']
    code_metrics = [m for m in all_possible_metrics if m in file_analysis_df.columns]
    print(f"Using code metrics: {code_metrics}")
    
    # Generate linear coefficients
    print("Generating linear coefficients...")
    model_linear_mapping = {}
    
    for model in models:
        model_df = file_analysis_df[file_analysis_df['sifp_model'] == model]
        
        if len(model_df) < 5:
            print(f"Skipping model {model} - insufficient data (only {len(model_df)} records)")
            continue
            
        model_coefficients = {}
        
        # Calculate linear relationships for this model
        for sifp_type, sifp_col in [('Actor', 'sifp_actor_total'), ('Judge', 'sifp_final_total')]:
            for code_metric in code_metrics:
                if code_metric not in model_df.columns:
                    continue
                
                # Filter for valid data
                valid_data = model_df[~(model_df[sifp_col].isna() | model_df[code_metric].isna())]
                
                if len(valid_data) < 3:
                    print(f"Skipping {model} {sifp_type} → {code_metric} - insufficient valid data points ({len(valid_data)})")
                    continue
                
                try:
                    # Fit linear regression
                    X = valid_data[sifp_col].values.reshape(-1, 1)
                    y = valid_data[code_metric].values
                    
                    lr_model = LinearRegression()
                    lr_model.fit(X, y)
                    
                    # Store coefficients
                    model_coefficients[(sifp_type, code_metric)] = {
                        'slope': lr_model.coef_[0],
                        'intercept': lr_model.intercept_,
                        'r2': r2_score(y, lr_model.predict(X)),
                        'rmse': np.sqrt(mean_squared_error(y, lr_model.predict(X)))
                    }
                    
                    # Debug print
                    print(f"  {model} {sifp_type} → {code_metric}: slope={lr_model.coef_[0]:.2f}, intercept={lr_model.intercept_:.2f}, r2={r2_score(y, lr_model.predict(X)):.3f}")
                except Exception as e:
                    print(f"Error fitting {model} {sifp_type} → {code_metric}: {str(e)}")
        
        if model_coefficients:
            model_linear_mapping[model] = model_coefficients
            print(f"Created {len(model_coefficients)} coefficient mappings for {model}")
        else:
            print(f"No valid coefficient mappings for {model}")
    
    # Create performance data using linear relationships
    print("Calculating prediction performance...")
    model_performance_data = []
    
    for model in models:
        model_df = file_analysis_df[file_analysis_df['sifp_model'] == model]
        
        if len(model_df) < 5 or model not in model_linear_mapping:
            print(f"Skipping model {model} - insufficient data or no coefficient mappings")
            continue
            
        print(f"Analyzing {model} file-level predictions using linear relationships...")
        model_coefficients = model_linear_mapping[model]
        
        # Get unique files for this model
        unique_files = model_df['File'].unique()
        print(f"  Found {len(unique_files)} unique files for {model}")
        
        file_count = 0
        predictions_count = 0
        
        # Group by file for detailed analysis
        for file in unique_files:
            file_rows = model_df[model_df['File'] == file]
            
            if len(file_rows) < 1:
                continue
                
            file_count += 1
            
            # Get the actual code metrics for this file
            file_metrics = {}
            for metric in code_metrics:
                if metric in file_rows.columns:
                    file_metrics[metric] = file_rows[metric].iloc[0]
            
            # Calculate prediction errors for each metric using linear model
            for metric in code_metrics:
                if metric not in file_metrics:
                    continue
                    
                actual = file_metrics[metric]
                
                # Skip zero or NaN values
                if pd.isna(actual) or actual == 0:
                    continue
                
                # Actor analysis
                if ('Actor', metric) in model_coefficients:
                    coeffs = model_coefficients[('Actor', metric)]
                    
                    # Skip if actor SIFP values are all NaN
                    if file_rows['sifp_actor_total'].isna().all():
                        continue
                    
                    actor_sifp = file_rows['sifp_actor_total'].mean()
                    actor_pred = coeffs['slope'] * actor_sifp + coeffs['intercept']
                    actor_error = abs(actual - actor_pred)
                    actor_pct_error = (actor_error / actual) * 100 if actual != 0 else np.nan
                    
                    # Judge evaluation
                    if ('Judge', metric) in model_coefficients:
                        coeffs_judge = model_coefficients[('Judge', metric)]
                        
                        # Skip if judge SIFP values are all NaN
                        if file_rows['sifp_final_total'].isna().all():
                            continue
                        
                        judge_sifp = file_rows['sifp_final_total'].mean()
                        judge_pred = coeffs_judge['slope'] * judge_sifp + coeffs_judge['intercept']
                        judge_error = abs(actual - judge_pred)
                        judge_pct_error = (judge_error / actual) * 100 if actual != 0 else np.nan
                        
                        # Calculate improvement from actor to judge
                        error_improvement = actor_error - judge_error
                        pct_improvement = actor_pct_error - judge_pct_error
                        
                        # Store performance data
                        performance_entry = {
                            'Model': model,
                            'File': file,
                            'Code Metric': metric,
                            'Actual Value': actual,
                            'Actor SIFP': actor_sifp,
                            'Actor Coefficient': coeffs['slope'],
                            'Actor Intercept': coeffs['intercept'],
                            'Actor Prediction': actor_pred,
                            'Actor Error': actor_error,
                            'Actor % Error': actor_pct_error,
                            'Judge SIFP': judge_sifp,
                            'Judge Coefficient': coeffs_judge['slope'],
                            'Judge Intercept': coeffs_judge['intercept'],
                            'Judge Prediction': judge_pred,
                            'Judge Error': judge_error,
                            'Judge % Error': judge_pct_error,
                            'Error Improvement': error_improvement,
                            'Error % Improvement': pct_improvement
                        }
                        
                        # Check if any key values are NaN
                        has_nan = any(pd.isna(val) for key, val in performance_entry.items() 
                                     if key in ['Actor SIFP', 'Judge SIFP', 'Actual Value', 
                                               'Actor Prediction', 'Judge Prediction'])
                        
                        if not has_nan:
                            model_performance_data.append(performance_entry)
                            predictions_count += 1
        
        print(f"  Generated {predictions_count} predictions across {file_count} files for {model}")
    
    # Create complete performance dataframe
    performance_df = pd.DataFrame(model_performance_data)
    
    if not performance_df.empty:
        # Remove infinite values for better display
        performance_df.replace([np.inf, -np.inf], np.nan, inplace=True)
        
        print("File-Level Prediction Performance Summary (Linear Model):")
        print(f"Total records: {len(performance_df)}")
        print(f"Showing first 5 rows:")
        display(performance_df.head())
        
        # Calculate average performance by model and metric
        print("Average Prediction Performance by Model and Metric:")
        avg_performance = performance_df.groupby(['Model', 'Code Metric']).agg({
            'Actor % Error': 'mean',
            'Judge % Error': 'mean',
            'Error % Improvement': 'mean',
            'Actor Coefficient': 'first',
            'Judge Coefficient': 'first'
        }).reset_index()
        
        avg_performance = avg_performance.round(2)
        print(f"Showing first 5 of {len(avg_performance)} rows:")
        display(avg_performance.head())
        
        # Identify best performing model per metric for file-level prediction
        print("Best Performing Models for File-Level Prediction (Linear Model):")
        for metric in code_metrics:
            metric_performance = avg_performance[avg_performance['Code Metric'] == metric]
            
            if not metric_performance.empty:
                # Best actor performance
                best_actor = metric_performance.loc[metric_performance['Actor % Error'].idxmin()]
                print(f"{metric}:")
                print(f"  Best Actor Analysis: {best_actor['Model']} (Avg Error: {best_actor['Actor % Error']}%, Coefficient: {best_actor['Actor Coefficient']})")
                
                # Best judge performance
                best_judge = metric_performance.loc[metric_performance['Judge % Error'].idxmin()]
                print(f"  Best Judge Evaluation: {best_judge['Model']} (Avg Error: {best_judge['Judge % Error']}%, Coefficient: {best_judge['Judge Coefficient']})")
                
                # Most improved
                if 'Error % Improvement' in metric_performance.columns:
                    most_improved = metric_performance.loc[metric_performance['Error % Improvement'].idxmax()]
                    print(f"  Most Improved: {most_improved['Model']} (Improvement: {most_improved['Error % Improvement']}%)")
        
        # Analyze hard-to-predict files
        print("Files with Highest Prediction Errors (Linear Model):")
        worst_predictions = performance_df.sort_values('Judge % Error', ascending=False).head(5)
        display(worst_predictions[['Model', 'File', 'Code Metric', 'Actual Value', 'Judge Prediction', 'Judge % Error']])
        
        # Get visualization setting from environment
        SHOW_VISUALIZATION = os.getenv('SHOW_VISUALIZATION', 'False').lower() == 'true'
        
        if SHOW_VISUALIZATION:
            # Create scatter plots for actual vs linearly predicted values by model for selected metrics
            for metric in code_metrics:
                metric_data = performance_df[performance_df['Code Metric'] == metric]
                
                if len(metric_data) < 5:
                    print(f"Skipping visualization for {metric} - insufficient data points")
                    continue
                    
                plt.figure(figsize=(14, 10))
                
                # Use different markers and colors for each model
                markers = ['o', 's', '^', 'D', 'v', '<', '>', 'p', '*', 'h', 'H']
                colors = plt.cm.tab10.colors
                
                for i, model in enumerate(models):
                    model_data = performance_df[(performance_df['Model'] == model) & 
                                              (performance_df['Code Metric'] == metric)]
                    
                    if len(model_data) > 1:
                        marker = markers[i % len(markers)]
                        color = colors[i % len(colors)]
                        
                        plt.scatter(
                            model_data['Actual Value'], 
                            model_data['Judge Prediction'],
                            marker=marker,
                            color=color,
                            alpha=0.7,
                            label=f"{model} (coef={model_data['Judge Coefficient'].iloc[0]:.2f})"
                        )
                
                # Add perfect prediction line
                max_val = max(
                    performance_df[performance_df['Code Metric'] == metric]['Actual Value'].max(),
                    performance_df[performance_df['Code Metric'] == metric]['Judge Prediction'].max()
                ) * 1.1
                
                plt.plot([0, max_val], [0, max_val], 'k--', alpha=0.5)
                
                plt.title(f"Linear Model Prediction: Judge → {metric}")
                plt.xlabel(f"Actual {metric}")
                plt.ylabel(f"Predicted {metric} (from Judge SIFP)")
                plt.grid(alpha=0.3)
                plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
                plt.tight_layout()
                plt.show()
        else:
            print("(Visualizations disabled. Set SHOW_VISUALIZATION=True in .env to enable.)")
    else:
        print("No performance data available for analysis. Check the following:")
        print("1. Are there sufficient data points for each model?")
        print("2. Do the SIFP scores and code metrics have valid, non-NaN values?") 
        print("3. Were any linear coefficients successfully generated?")
        print(f"Models with coefficients: {list(model_linear_mapping.keys())}")
        print(f"Total coefficient mappings: {sum(len(v) for v in model_linear_mapping.values())}")
else:
    print("No file-level data available for analysis. The file_analysis_df is empty.")

In [None]:
# Cell [12] - Model Accuracy Summary for Linear Model Predictions
# Purpose: Summarize and compare accuracy of linear model predictions across different SIFP models and metrics
# Dependencies: os, pandas, numpy, sklearn, matplotlib
# Breadcrumbs: Analysis -> Model Comparison

# Summarize the per-file linear prediction results held in `performance_df`:
#   1. per-model MAPE / R² / coefficient averages (`accuracy_df`),
#   2. per-(model, metric) linear formulas (`coef_df`),
#   3. optional bar charts (enabled via the SHOW_VISUALIZATION environment variable),
#   4. a short textual list of key observations.
if 'performance_df' in locals() and not performance_df.empty:
    print("Model Accuracy Summary for Linear Model Predictions:")
    print("=" * 80)

    def _best_row(frame, column, lowest=False):
        """Return the row of `frame` holding the extreme value of `column`.

        NaN entries are ignored. Returns None when the column has no valid
        value at all, so callers can report "not available" instead of
        failing on an all-NaN idxmin/idxmax lookup.
        """
        valid = frame[column].dropna()
        if valid.empty:
            return None
        return frame.loc[valid.idxmin() if lowest else valid.idxmax()]

    # Get all models and metrics (only metrics that actually occur in the data)
    models = performance_df['Model'].unique()
    code_metrics = ['Lines', 'Code', 'Lines-exe', 'Lines-dec', 'Units']
    code_metrics = [m for m in code_metrics if m in performance_df['Code Metric'].unique()]

    # Calculate overall accuracy stats
    print("Overall Prediction Accuracy by Model:")

    model_accuracy = []
    for model in models:
        model_data = performance_df[performance_df['Model'] == model]

        # Models with fewer than 3 prediction rows are left out of the summary
        if len(model_data) < 3:
            continue

        # Mean absolute percentage error for actor and judge, over all metrics
        actor_mape = model_data['Actor % Error'].mean()
        judge_mape = model_data['Judge % Error'].mean()

        # R² and slope are collected per metric and averaged afterwards
        actor_r2_values = []
        judge_r2_values = []
        actor_coeffs = []
        judge_coeffs = []

        for metric in code_metrics:
            metric_data = model_data[model_data['Code Metric'] == metric]

            # R² is only computed when the metric has more than 2 rows
            if len(metric_data) > 2:
                metric_actual = metric_data['Actual Value']

                actor_r2_values.append(r2_score(metric_actual, metric_data['Actor Prediction']))
                actor_coeffs.append(metric_data['Actor Coefficient'].iloc[0])

                judge_r2_values.append(r2_score(metric_actual, metric_data['Judge Prediction']))
                judge_coeffs.append(metric_data['Judge Coefficient'].iloc[0])

        # NaN marks "no metric had enough rows" for this model
        actor_avg_r2 = np.mean(actor_r2_values) if actor_r2_values else np.nan
        judge_avg_r2 = np.mean(judge_r2_values) if judge_r2_values else np.nan
        actor_avg_coeff = np.mean(actor_coeffs) if actor_coeffs else np.nan
        judge_avg_coeff = np.mean(judge_coeffs) if judge_coeffs else np.nan

        model_accuracy.append({
            'Model': model,
            'File Count': model_data['File'].nunique(),
            'Actor MAPE': actor_mape,
            'Judge MAPE': judge_mape,
            'MAPE Improvement': actor_mape - judge_mape,
            'Actor Avg R²': actor_avg_r2,
            'Judge Avg R²': judge_avg_r2,
            'R² Improvement': judge_avg_r2 - actor_avg_r2,
            'Actor Avg Coefficient': actor_avg_coeff,
            'Judge Avg Coefficient': judge_avg_coeff
        })

    # Create and display the accuracy summary
    if model_accuracy:
        accuracy_df = pd.DataFrame(model_accuracy)
        accuracy_df = accuracy_df.sort_values('Judge MAPE')

        # Format the dataframe for better display: MAPE to 2 decimals, R² and coefficients to 4
        display_precision = {
            'Actor MAPE': 2, 'Judge MAPE': 2, 'MAPE Improvement': 2,
            'Actor Avg R²': 4, 'Judge Avg R²': 4, 'R² Improvement': 4,
            'Actor Avg Coefficient': 4, 'Judge Avg Coefficient': 4,
        }
        for col, digits in display_precision.items():
            accuracy_df[col] = accuracy_df[col].round(digits)

        print(f"Showing first 5 of {len(accuracy_df)} rows:")
        display(accuracy_df.head())

        # Identify the best models based on different metrics.
        # Each lookup yields None when its column holds no valid value.
        best_judge_mape = _best_row(accuracy_df, 'Judge MAPE', lowest=True)
        best_judge_r2 = _best_row(accuracy_df, 'Judge Avg R²')
        most_improved_mape = _best_row(accuracy_df, 'MAPE Improvement')
        most_improved_r2 = _best_row(accuracy_df, 'R² Improvement')
        unavailable = "not available (no valid values)"

        print("Best Performing Models (Linear Relationship):")
        print("• Most accurate predictions (lowest error): " + (
            f"{best_judge_mape['Model']} (MAPE: {best_judge_mape['Judge MAPE']}%)"
            if best_judge_mape is not None else unavailable))
        print("• Strongest linear fit (highest R²): " + (
            f"{best_judge_r2['Model']} (R²: {best_judge_r2['Judge Avg R²']})"
            if best_judge_r2 is not None else unavailable))
        print("• Most improved accuracy through judge evaluation: " + (
            f"{most_improved_mape['Model']} (MAPE reduced by {most_improved_mape['MAPE Improvement']}%)"
            if most_improved_mape is not None else unavailable))
        print("• Most improved predictive power: " + (
            f"{most_improved_r2['Model']} (R² improved by {most_improved_r2['R² Improvement']})"
            if most_improved_r2 is not None else unavailable))

        # Calculate metric-specific accuracy by model
        print("Linear Prediction Coefficients by Model and Metric:")

        coef_data = []
        for model in models:
            for metric in code_metrics:
                model_metric_data = performance_df[(performance_df['Model'] == model) &
                                                (performance_df['Code Metric'] == metric)]

                # Fewer than 3 rows: skip (this also guarantees .iloc[0] below is valid)
                if len(model_metric_data) < 3:
                    continue

                # The first row's coefficients are taken as representative of this model/metric pair
                actor_coef = model_metric_data['Actor Coefficient'].iloc[0]
                actor_intercept = model_metric_data['Actor Intercept'].iloc[0]
                judge_coef = model_metric_data['Judge Coefficient'].iloc[0]
                judge_intercept = model_metric_data['Judge Intercept'].iloc[0]

                # Calculate error metrics
                actor_mape = model_metric_data['Actor % Error'].mean()
                judge_mape = model_metric_data['Judge % Error'].mean()

                coef_data.append({
                    'Model': model,
                    'Code Metric': metric,
                    'Actor Coefficient': actor_coef,
                    'Actor Intercept': actor_intercept,
                    'Actor Formula': f"{metric} = {actor_coef:.2f} × SIFP + {actor_intercept:.2f}",
                    'Actor MAPE': actor_mape,
                    'Judge Coefficient': judge_coef,
                    'Judge Intercept': judge_intercept,
                    'Judge Formula': f"{metric} = {judge_coef:.2f} × SIFP + {judge_intercept:.2f}",
                    'Judge MAPE': judge_mape
                })

        if coef_data:
            coef_df = pd.DataFrame(coef_data)

            # Format the dataframe for better display
            for col in ['Actor Coefficient', 'Actor Intercept', 'Judge Coefficient', 'Judge Intercept', 'Actor MAPE', 'Judge MAPE']:
                coef_df[col] = coef_df[col].round(2)

            # Display summary by metric, best (lowest Judge MAPE) first
            for metric in code_metrics:
                metric_coef = coef_df[coef_df['Code Metric'] == metric]
                if not metric_coef.empty:
                    print(f"{metric} Linear Coefficients:")
                    metric_coef = metric_coef.sort_values('Judge MAPE')
                    display(metric_coef[['Model', 'Judge Formula', 'Judge MAPE', 'Actor Formula', 'Actor MAPE']].head())

        # Get visualization setting from environment
        SHOW_VISUALIZATION = os.getenv('SHOW_VISUALIZATION', 'False').lower() == 'true'

        if SHOW_VISUALIZATION:
            # Set up the bar positions (one slot per model, including models skipped above)
            x = np.arange(len(models))
            width = 0.35

            # Align the summary rows with `models`; models left out of the
            # summary become NaN and therefore render as missing bars.
            chart_data = accuracy_df.set_index('Model').reindex(models)

            # Create bar charts for model comparison
            plt.figure(figsize=(14, 8))

            actor_mape = chart_data['Actor MAPE'].to_numpy()
            judge_mape = chart_data['Judge MAPE'].to_numpy()

            # Create the grouped bars for MAPE
            plt.bar(x - width/2, actor_mape, width, label='Actor MAPE', color='skyblue')
            plt.bar(x + width/2, judge_mape, width, label='Judge MAPE', color='lightcoral')

            plt.title('Mean Absolute Percentage Error by Model (Linear Models)')
            plt.xlabel('Model')
            plt.ylabel('MAPE (lower is better)')
            plt.xticks(x, models, rotation=45, ha='right')
            plt.legend()
            plt.tight_layout()
            plt.grid(axis='y', linestyle='--', alpha=0.7)
            plt.show()

            # Bar chart for average coefficients
            plt.figure(figsize=(14, 8))

            actor_coef = chart_data['Actor Avg Coefficient'].to_numpy()
            judge_coef = chart_data['Judge Avg Coefficient'].to_numpy()

            # Create the grouped bars for coefficients
            plt.bar(x - width/2, actor_coef, width, label='Actor Coefficient', color='skyblue')
            plt.bar(x + width/2, judge_coef, width, label='Judge Coefficient', color='lightcoral')

            plt.title('Average Linear Coefficients by Model (SIFP to Code Metric)')
            plt.xlabel('Model')
            plt.ylabel('Coefficient (SIFP to Code Metric)')
            plt.xticks(x, models, rotation=45, ha='right')
            plt.legend()
            plt.tight_layout()
            plt.grid(axis='y', linestyle='--', alpha=0.7)
            plt.show()
        else:
            print("(Visualizations disabled. Set SHOW_VISUALIZATION=True in .env to enable.)")
    else:
        print("Insufficient data for accuracy summary")

    # Summarize key findings and patterns
    print("Key Observations on Linear Model Predictions:")
    print("1. This analysis models the relationship between SIFP and code metrics as a linear equation: Code_Metric = Coefficient × SIFP + Intercept")
    print("2. The coefficients represent the scaling factor between SIFP points and actual code metrics.")
    print("3. Higher R² values indicate that the model's SIFP points consistently predict code metrics with a linear relationship.")

    # Only report averages when the summary was built in THIS run; checking for
    # the mere existence of `accuracy_df` would pick up a stale frame left over
    # from an earlier execution of the cell.
    if model_accuracy:
        avg_improvement = accuracy_df['MAPE Improvement'].mean()
        if pd.isna(avg_improvement):
            print("4. The average change in prediction error could not be computed (no valid MAPE values).")
        elif avg_improvement > 0:
            print(f"4. On average, the judge evaluation process reduced prediction error by {avg_improvement:.2f}% across all models.")
        else:
            print(f"4. On average, the judge evaluation process increased prediction error by {-avg_improvement:.2f}% across all models.")

        avg_r2_improvement = accuracy_df['R² Improvement'].mean()
        if pd.isna(avg_r2_improvement):
            print("5. The average change in linear fit quality (R²) could not be computed (no valid R² values).")
        elif avg_r2_improvement > 0:
            print(f"5. The judge process improved linear fit quality (R²) by an average of {avg_r2_improvement:.4f} across all models.")
        else:
            print(f"5. The judge process decreased linear fit quality (R²) by an average of {-avg_r2_improvement:.4f} across all models.")

        avg_actor_coef = accuracy_df['Actor Avg Coefficient'].mean()
        avg_judge_coef = accuracy_df['Judge Avg Coefficient'].mean()
        print(f"6. The average scaling factor from SIFP to code metrics is {avg_actor_coef:.2f} for Actor and {avg_judge_coef:.2f} for Judge evaluations.")
else:
    print("No performance data available for accuracy summary")

In [None]:
# Cell [13] - Advanced Modeling Comparison (Linear vs. Non-Linear Methods)
# Purpose: Compare linear and non-linear modeling methods for SIFP to code metric prediction using multiple algorithms
# Dependencies: os, pandas, numpy, sklearn, matplotlib, seaborn, xgboost
# Breadcrumbs: Analysis -> Advanced Modeling

# Check if we have the dataframe from previous cells
if 'file_analysis_df' not in globals() or file_analysis_df.empty:
    print("ERROR: No file-level analysis data available from previous cells.")
else:
    print("Advanced Modeling Comparison (Linear vs. Non-Linear):")
    print("=" * 80)
    print(f"Total file-level entries: {len(file_analysis_df)}")
    print(f"Unique code files: {file_analysis_df['File'].nunique()}")
    print(f"Unique models: {file_analysis_df['sifp_model'].nunique()}")
    
    # Define code metrics to analyze
    code_metrics = ['Code', 'Lines', 'Lines-exe', 'Lines-dec', 'Units']
    code_metrics = [m for m in code_metrics if m in file_analysis_df.columns]
    print(f"Available code metrics: {code_metrics}")
    
    # Define SIFP models
    sifp_models = file_analysis_df['sifp_model'].unique()
    print(f"Available SIFP models: {sifp_models}")
    
    # Define modeling methods to compare
    modeling_methods = {
        'Linear': LinearRegression(),
        'Ridge': Ridge(alpha=1.0),
        'Lasso': Lasso(alpha=0.1),
        'RandomForest': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
        'SVR': SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1),
        'XGBoost': XGBRegressor.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
        'MLP': MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)
    }
    
    # Create storage for results
    results = []
    
    # Define a function to evaluate a model
    def evaluate_model(X_train, X_test, y_train, y_test, model_name, model, sifp_type):
        """Fit a fresh copy of `model` on standardized features and score it on the test split.

        Args:
            X_train, X_test: feature matrices for the train and test splits.
            y_train, y_test: target values for the train and test splits.
            model_name: label used only in the error message.
            model: estimator template; it is cloned, so the instance passed in
                is never fitted or otherwise modified.
            sifp_type: currently unused; kept so existing call sites keep working.

        Returns:
            A dict with keys 'MAE', 'RMSE', 'R2', 'MAPE', 'y_test', 'y_pred' and
            'model' (the fitted clone), or None if any step raised an exception.
            'R2' is reported as 0 when it is NaN or infinite; 'MAPE' is NaN
            when any test target equals zero.
        """
        # Local import keeps this fix self-contained within the function.
        from sklearn.base import clone

        try:
            # Standardize input features (scaler is fitted on the training split only)
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Fit an independent copy: the caller reuses one estimator instance per
            # method across every dataset, so fitting it in place would make all
            # stored "trained" models alias the same, last-fitted object.
            fitted_model = clone(model)
            fitted_model.fit(X_train_scaled, y_train)

            # Make predictions
            y_pred = fitted_model.predict(X_test_scaled)

            # Calculate metrics
            mae = mean_absolute_error(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            r2 = r2_score(y_test, y_pred)

            # Handle r2 NaN or infinity
            if not np.isfinite(r2):
                r2 = 0

            # Calculate MAPE (Mean Absolute Percentage Error); undefined if any target is zero
            y_true = np.asarray(y_test)
            if np.all(y_true != 0):  # Avoid division by zero
                mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
            else:
                mape = np.nan

            return {
                'MAE': mae,
                'RMSE': rmse,
                'R2': r2,
                'MAPE': mape,
                'y_test': y_test,
                'y_pred': y_pred,
                'model': fitted_model
            }
        except Exception as e:
            # Best-effort by design: one failing method must not abort the whole comparison
            print(f"Error evaluating {model_name} model: {str(e)}")
            return None
    
    # For each SIFP model, code metric, and sifp type (Actor/Judge)
    for sifp_model in sifp_models:
        print(f"Analyzing SIFP Model: {sifp_model}")
        
        # Filter data for this SIFP model
        model_data = file_analysis_df[file_analysis_df['sifp_model'] == sifp_model].copy()
        
        # Skip if insufficient data
        if len(model_data) < 10:
            print(f"  Skipping {sifp_model} - insufficient data points ({len(model_data)} records)")
            continue
            
        # For each code metric
        for metric in code_metrics:
            if metric not in model_data.columns:
                continue
                
            # Create valid dataset (no NaN values)
            valid_data = model_data.dropna(subset=[metric, 'sifp_actor_total', 'sifp_final_total'])
            
            if len(valid_data) < 10:
                print(f"  Skipping {sifp_model} → {metric} - insufficient valid data points ({len(valid_data)})")
                continue
                
            print(f"  Processing {sifp_model} → {metric} with {len(valid_data)} data points")
            
            # For each SIFP type (Actor/Judge)
            for sifp_type, sifp_col in [('Actor', 'sifp_actor_total'), ('Judge', 'sifp_final_total')]:
                # Prepare feature array (add more features if available)
                features = [sifp_col]
                
                # Check if confidence scores are available and add them
                confidence_cols = [col for col in valid_data.columns if 'confidence' in col.lower()]
                available_conf_cols = [col for col in confidence_cols if not valid_data[col].isna().all()]
                
                if available_conf_cols:
                    features.extend(available_conf_cols)
                
                # Create X and y
                X = valid_data[features].values
                y = valid_data[metric].values
                
                # Skip if not enough samples for meaningful split
                if len(X) < 10:
                    print(f"    Skipping {sifp_type} analysis - insufficient samples ({len(X)})")
                    continue
                
                # Create train-test split (70-30)
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=0.3, random_state=42
                )
                
                # Evaluate each modeling method
                for method_name, model in modeling_methods.items():
                    # Skip models that require more data points than we have
                    if (method_name in ['SVR', 'XGBoost', 'MLP', 'RandomForest'] and len(X_train) < 5) or len(X_train) < 3:
                        continue
                        
                    evaluation = evaluate_model(X_train, X_test, y_train, y_test, method_name, model, sifp_type)
                    
                    if evaluation:
                        results.append({
                            'SIFP Model': sifp_model,
                            'Code Metric': metric,
                            'SIFP Type': sifp_type,
                            'Method': method_name,
                            'MAE': evaluation['MAE'],
                            'RMSE': evaluation['RMSE'],
                            'R2': evaluation['R2'],
                            'MAPE': evaluation['MAPE'],
                            'y_test': evaluation['y_test'],
                            'y_pred': evaluation['y_pred'],
                            'Features': features,
                            'Trained Model': evaluation['model']
                        })
    
    # Create dataframe from results
    results_df = pd.DataFrame([{k: v for k, v in r.items() if k not in ['y_test', 'y_pred', 'Trained Model', 'Features']} 
                              for r in results])
    
    # Display results
    if not results_df.empty:
        print("Modeling Results Summary:")
        print("=" * 80)
        
        # Format for better display
        display_cols = ['SIFP Model', 'Code Metric', 'SIFP Type', 'Method', 'MAE', 'RMSE', 'R2', 'MAPE']
        display_df = results_df[display_cols].copy()
        
        # Round numeric columns
        numeric_cols = ['MAE', 'RMSE', 'R2', 'MAPE']
        for col in numeric_cols:
            display_df[col] = display_df[col].round(4)
        
        # Display summary
        print(f"Total model evaluations: {len(display_df)}")
        display(display_df.head())
        
        # Find best method for each SIFP model and metric combination
        print("Best Method for Each SIFP Model and Metric (Based on R²):")
        
        # Group by relevant columns and find the max R2 score
        best_methods = display_df.loc[display_df.groupby(['SIFP Model', 'Code Metric', 'SIFP Type'])['R2'].idxmax()]
        best_methods = best_methods.sort_values(['SIFP Model', 'Code Metric', 'SIFP Type'])
        
        display(best_methods)
        
        # Calculate average performance by method
        print("Average Performance by Method:")
        method_performance = display_df.groupby('Method')[numeric_cols].mean().round(4)
        method_performance = method_performance.sort_values('R2', ascending=False)
        display(method_performance)
        
        # Calculate which method is best most often
        best_count = best_methods['Method'].value_counts()
        print("Methods Count as Best Performer:")
        display(best_count)
        
        # Compare Actor vs Judge overall performance
        actor_judge_perf = display_df.groupby('SIFP Type')[numeric_cols].mean().round(4)
        print("Actor vs Judge Average Performance:")
        display(actor_judge_perf)
        
        # Get visualization setting from environment
        SHOW_VISUALIZATION = os.getenv('SHOW_VISUALIZATION', 'False').lower() == 'true'
        
        if SHOW_VISUALIZATION:
            # Create visualizations
            
            # 1. Bar plot of average R² by method
            plt.figure(figsize=(12, 6))
            sns.barplot(x=method_performance.index, y=method_performance['R2'], palette='viridis')
            plt.title('Average R² by Modeling Method')
            plt.xlabel('Method')
            plt.ylabel('Average R²')
            plt.xticks(rotation=45)
            plt.grid(axis='y', linestyle='--', alpha=0.7)
            plt.tight_layout()
            plt.show()
            
            # 2. Heatmap of R² for each SIFP model and method combination
            model_method_r2 = display_df.pivot_table(
                index='SIFP Model', 
                columns=['Method', 'SIFP Type'], 
                values='R2', 
                aggfunc='mean'
            )
            
            if not model_method_r2.empty and not model_method_r2.isna().all().all():
                plt.figure(figsize=(14, 8))
                sns.heatmap(model_method_r2, annot=True, cmap='RdYlGn', linewidths=.5, fmt='.3f')
                plt.title('R² by SIFP Model, Method, and SIFP Type')
                plt.tight_layout()
                plt.show()
            
            # 3. Compare actual vs predicted for best methods
            for index, row in best_methods.iterrows():
                # Get the result with the selected best method
                selected_result = None
                for res in results:
                    if (res['SIFP Model'] == row['SIFP Model'] and 
                        res['Code Metric'] == row['Code Metric'] and
                        res['SIFP Type'] == row['SIFP Type'] and
                        res['Method'] == row['Method']):
                        selected_result = res
                        break
                
                if selected_result:
                    y_test = selected_result['y_test']
                    y_pred = selected_result['y_pred']
                    
                    # Create scatterplot
                    plt.figure(figsize=(8, 8))
                    plt.scatter(y_test, y_pred, alpha=0.7)
                    
                    # Plot perfect prediction line
                    min_val = min(min(y_test), min(y_pred))
                    max_val = max(max(y_test), max(y_pred))
                    plt.plot([min_val, max_val], [min_val, max_val], 'r--')
                    
                    # Add R² and RMSE annotation
                    r2 = row['R2']
                    rmse = row['RMSE']
                    plt.annotate(f"R² = {r2:.4f}\nRMSE = {rmse:.2f}", 
                                xy=(0.05, 0.95), xycoords='axes fraction',
                                bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8),
                                ha='left', va='top')
                    
                    plt.title(f"{row['SIFP Model']} - {row['Code Metric']} - {row['SIFP Type']}\nMethod: {row['Method']}")
                    plt.xlabel('Actual')
                    plt.ylabel('Predicted')
                    plt.grid(True, alpha=0.3)
                    plt.tight_layout()
                    plt.show()
            
            # 4. Method performance by SIFP model
            for metric in code_metrics:
                metric_data = display_df[display_df['Code Metric'] == metric]
                
                if not metric_data.empty:
                    pivot = metric_data.pivot_table(
                        index='Method', 
                        columns=['SIFP Model', 'SIFP Type'], 
                        values='R2'
                    )
                    
                    if not pivot.empty and not pivot.isna().all().all():
                        plt.figure(figsize=(14, 8))
                        sns.heatmap(pivot, annot=True, cmap='RdYlGn', linewidths=.5, fmt='.3f')
                        plt.title(f'R² by Method, SIFP Model, and SIFP Type for {metric}')
                        plt.tight_layout()
                        plt.show()
        else:
            print("Visualizations disabled. Set SHOW_VISUALIZATION=True in .env to enable.")
        
        # Output conclusions
        # Findings 1-3 summarize the aggregate tables built earlier in this cell.
        # NOTE(review): findings 1 and 2 report the FIRST row of method_performance /
        # best_count, which presumes both were sorted best-first upstream — confirm.
        print("Key Findings:")
        if not method_performance.empty:
            best_overall_method = method_performance.index[0]
            print(f"1. Overall Best Method: {best_overall_method} (Avg R²: {method_performance.loc[best_overall_method, 'R2']:.4f})")
        
        if not best_count.empty:
            most_frequent_best = best_count.index[0]
            print(f"2. Most Frequently Best Method: {most_frequent_best} (Best in {best_count.iloc[0]} cases)")
        
        if not actor_judge_perf.empty:
            # A non-empty table can still lack one of the two SIFP types (e.g. only
            # Actor results were produced); looking up the missing label with .loc
            # would raise KeyError, so compare only when both rows exist.
            if 'Actor' in actor_judge_perf.index and 'Judge' in actor_judge_perf.index:
                better_sifp = "Judge" if actor_judge_perf.loc['Judge', 'R2'] > actor_judge_perf.loc['Actor', 'R2'] else "Actor"
                print(f"3. Better SIFP Type: {better_sifp} (Avg R²: {actor_judge_perf.loc[better_sifp, 'R2']:.4f} vs. {actor_judge_perf.loc['Actor' if better_sifp == 'Judge' else 'Judge', 'R2']:.4f})")
            else:
                available_types = ', '.join(str(label) for label in actor_judge_perf.index)
                print(f"3. Better SIFP Type: comparison skipped (results available only for: {available_types})")
        
        # Check if non-linear methods outperform linear ones
        # NOTE(review): these names must match the values stored in the 'Method'
        # column; a method missing from both lists is silently left out of the
        # comparison — confirm against the keys of modeling_methods.
        linear_methods = ['Linear', 'Ridge', 'Lasso']
        nonlinear_methods = ['RandomForest', 'SVR', 'XGBoost', 'MLP']
        
        # Column-wise means over every result row belonging to each family.
        linear_perf = display_df[display_df['Method'].isin(linear_methods)][numeric_cols].mean()
        nonlinear_perf = display_df[display_df['Method'].isin(nonlinear_methods)][numeric_cols].mean()
        
        print("4. Linear vs. Non-Linear Methods:")
        print(f"   Linear Methods Average R²: {linear_perf['R2']:.4f}")
        print(f"   Non-Linear Methods Average R²: {nonlinear_perf['R2']:.4f}")
        if pd.isna(linear_perf['R2']) or pd.isna(nonlinear_perf['R2']):
            # A family with no rows (or only missing R² values) averages to NaN.
            # Every comparison against NaN is False, which previously fell through
            # to the "Linear methods outperform ... by nan" branch.
            print("   Comparison unavailable: no R² results for at least one method family")
        elif nonlinear_perf['R2'] > linear_perf['R2']:
            print(f"   Non-Linear methods outperform Linear methods by {(nonlinear_perf['R2'] - linear_perf['R2']):.4f} R² points")
        else:
            print(f"   Linear methods outperform Non-Linear methods by {(linear_perf['R2'] - nonlinear_perf['R2']):.4f} R² points")
        
        # Check which SIFP model performs best with which method
        # For every configured method, report the single result row with the
        # highest R².
        for method in modeling_methods:
            method_data = display_df[display_df['Method'] == method]
            # Rows without an R² cannot win; dropping them also avoids idxmax()
            # returning NaN / raising when every R² for the method is missing.
            scored_rows = method_data.dropna(subset=['R2'])
            if not scored_rows.empty:
                # Select by position rather than by index label: a label lookup
                # returns several rows when display_df carries duplicate index
                # labels, which would break the scalar formatting below.
                best_sifp_model = scored_rows.iloc[scored_rows['R2'].argmax()]
                print(f"5. {method} performs best with {best_sifp_model['SIFP Model']} on {best_sifp_model['Code Metric']} (R²: {best_sifp_model['R2']:.4f})")
    else:
        print("No modeling results available. Check data quality and availability.")