# Machine Learning Similarity Score Analysis
**Trains Random Forest classifiers on sentence transformer similarity scores with feature importance analysis and performance visualization.**


In [None]:
# Cell [0] - Setup and Imports
# Purpose: Import all required libraries and configure environment settings for similarity score analysis
# Dependencies: os, io, sys, pathlib, dotenv, pandas, numpy, datetime, sklearn, matplotlib, seaborn, tqdm, praxis_sentence_transformer
# Breadcrumbs: Setup -> Imports -> Environment Configuration

import os
import io
import sys
from pathlib import Path
from dotenv import load_dotenv
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, 
    fbeta_score, 
    roc_auc_score, 
    confusion_matrix,
    roc_curve,
    auc,
    precision_recall_curve,
    precision_score,
    recall_score
)
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

# Import project modules (installed via pip)
from praxis_sentence_transformer import (
    setup_logging,
    handle_exception,
    DebugTimer,
    Neo4jClient,
    create_results_directory
)

# Load environment variables
load_dotenv()

# Set up logging
# `logger` is the module-level logger used by every later cell in this notebook.
logger = setup_logging("similarity-score-analysis-notebook")

# Validate configuration, confirm the project exists in Neo4j and create the
# results directory. Any failure is logged, passed to handle_exception and re-raised.
try:
    # Log initialization
    logger.info("Initializing Similarity Score Analysis Notebook")
    
    # Verify environment variables
    required_env_vars = [
        'NEO4J_URI', 
        'NEO4J_USER', 
        'NEO4J_PASSWORD',
        'PROJECT_NAME'
    ]
    # A variable counts as missing when it is unset OR set to an empty string.
    missing_vars = [var for var in required_env_vars if not os.getenv(var)]
    
    if missing_vars:
        logger.error(f"Missing required environment variables: {missing_vars}")
        raise EnvironmentError(f"Missing required environment variables: {missing_vars}")
    
    # Validate PROJECT_NAME
    # NOTE: PROJECT_NAME is already covered by the missing_vars check above, so
    # this guard is a second line of defence rather than a distinct condition.
    project_name = os.getenv('PROJECT_NAME')
    if not project_name:
        raise ValueError("PROJECT_NAME environment variable is empty")
    
    # Initialize Neo4j client and verify project
    neo4j_client = Neo4jClient(
        uri=os.getenv('NEO4J_URI'),
        username=os.getenv('NEO4J_USER'),
        password=os.getenv('NEO4J_PASSWORD')
    )
    
    # Verify project exists
    if not neo4j_client.verify_project_exists(project_name):
        raise ValueError(f"Project '{project_name}' not found in Neo4j database")
    
    logger.debug("All required environment variables loaded successfully")
        
    try:
        # Create results directory structure
        # `timestamp` is only logged here; it is not passed to create_results_directory.
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        # `results` is read by later plotting cells via results['visualizations'].
        results = create_results_directory(
            model_name="similarity-analysis",
            dataset_name=project_name
        )
        logger.info(f"Created results directory structure")
        logger.debug(f"Results will be saved with timestamp: {timestamp}")

    except Exception as e:
        logger.error("Failed to create results directory", exc_info=True)
        handle_exception(e)
        raise

except Exception as e:
    logger.error("Failed to initialize notebook", exc_info=True)
    handle_exception(e)
    raise

finally:
    # Close the verification client whether or not setup succeeded; the name check
    # covers the case where the failure happened before the client was constructed.
    if 'neo4j_client' in locals():
        neo4j_client.close()
        logger.debug("Neo4j connection closed")

In [None]:
# Cell [1] - Neo4j Connection and Data Loading
# Purpose: Establish database connection and load similarity data for comprehensive analysis
# Dependencies: Neo4jClient, logger, os, pandas, handle_exception
# Breadcrumbs: Environment Configuration -> Database Connection -> Data Retrieval

def create_neo4j_client():
    """
    Build a Neo4j client from the NEO4J_* environment variables
    Returns:
        Neo4jClient: Client constructed with the values of NEO4J_URI,
        NEO4J_USER and NEO4J_PASSWORD
    """
    # Collect the connection settings first so the constructor call stays short.
    connection_settings = {
        'uri': os.getenv('NEO4J_URI'),
        'username': os.getenv('NEO4J_USER'),
        'password': os.getenv('NEO4J_PASSWORD'),
    }
    return Neo4jClient(**connection_settings)

def check_schema(neo4j_client):
    """
    Log requirement/relationship counts for the current project and return its schema summary
    Args:
        neo4j_client: Neo4jClient instance (a session is opened on its ``driver``)
    Returns:
        dict: The ``value`` map returned by the schema query (requirement labels,
        relationship types, relationship properties), or an empty dict when the
        query returns no record
    """
    logger.debug("Initializing Neo4j schema check")
    try:
        project_name = os.getenv('PROJECT_NAME')

        # Counts SOURCE/TARGET requirements under the project and the SIMILAR_TO
        # relationships between them that are tagged with this project.
        stats_cypher = """
        MATCH (p:Project {name: $project_name})
        WITH p
        MATCH (p)-[:CONTAINS*1..]->(r1:Requirement)
        WHERE r1.type = 'SOURCE'
        WITH p, collect(DISTINCT r1) as source_reqs
        MATCH (p)-[:CONTAINS*1..]->(r2:Requirement)
        WHERE r2.type = 'TARGET'
        WITH source_reqs, collect(DISTINCT r2) as target_reqs
        OPTIONAL MATCH (r1)-[rel:SIMILAR_TO]->(r2)
        WHERE r1 IN source_reqs 
        AND r2 IN target_reqs
        AND rel.project = $project_name
        RETURN 
            size(source_reqs) as source_req_count,
            size(target_reqs) as target_req_count,
            count(rel) as similar_count,
            size(source_reqs) * size(target_reqs) as potential_total_pairs
        """

        # Same traversal, but collects labels, relationship types and property keys.
        schema_cypher = """
        MATCH (p:Project {name: $project_name})
        WITH p
        MATCH (p)-[:CONTAINS*1..]->(r1:Requirement)
        WHERE r1.type = 'SOURCE'
        WITH p, collect(DISTINCT r1) as source_reqs
        MATCH (p)-[:CONTAINS*1..]->(r2:Requirement)
        WHERE r2.type = 'TARGET'
        WITH source_reqs, collect(DISTINCT r2) as target_reqs
        MATCH (r1)-[rel:SIMILAR_TO]->(r2)
        WHERE r1 IN source_reqs 
        AND r2 IN target_reqs
        AND rel.project = $project_name
        WITH DISTINCT labels(r1) as req_labels,
             collect(DISTINCT type(rel)) as rel_types,
             collect(DISTINCT keys(rel)) as rel_properties
        RETURN {
            requirement_labels: req_labels,
            relationship_types: rel_types,
            relationship_properties: rel_properties[0]
        } as value
        """

        stat_fields = (
            ("Number of source requirements", 'source_req_count'),
            ("Number of target requirements", 'target_req_count'),
            ("Number of SIMILAR_TO relationships", 'similar_count'),
            ("Potential total pairs", 'potential_total_pairs'),
        )
        schema_fields = (
            ("Requirement labels", 'requirement_labels'),
            ("Relationship types", 'relationship_types'),
            ("Relationship properties", 'relationship_properties'),
        )

        with neo4j_client.driver.session() as session:
            # Statistics first, so they are logged even if the schema query finds nothing.
            stats = session.run(stats_cypher, project_name=project_name).single()
            if not stats:
                logger.warning("No relationships found in the database")
            else:
                logger.info("Database Schema Statistics:")
                logger.info(f"Project: {project_name}")
                for caption, field in stat_fields:
                    logger.info(f"{caption}: {stats[field]}")

            schema_row = session.run(schema_cypher, project_name=project_name).single()
            if schema_row:
                schema = schema_row['value']
                logger.debug("Schema details:")
                for caption, field in schema_fields:
                    logger.debug(f"{caption}: {schema.get(field, [])}")
            else:
                logger.warning(f"No schema information found for project: {project_name}")
                schema = {}

            return schema

    except Exception as e:
        logger.error("Error checking Neo4j schema", exc_info=True)
        handle_exception(e)
        raise

def get_similarity_data(neo4j_client):
    """
    Retrieve similarity data for the current project including model scores

    Queries every SOURCE/TARGET requirement pair of the project named by the
    PROJECT_NAME environment variable, keeps the pairs that have a SIMILAR_TO
    score or a GROUND_TRUTH link, and pivots the rows so that each distinct
    ``model_name`` becomes its own score column.

    Args:
        neo4j_client: Neo4jClient instance (a session is opened on its ``driver``)
    Returns:
        pandas.DataFrame: One row per (source_id, target_id, is_related) with one
        similarity column per model; an empty DataFrame when the query returns no rows
    Raises:
        Exception: Any error raised while querying or reshaping the data is logged,
        passed to handle_exception and re-raised
    """
    logger.debug("Initializing similarity data retrieval")
    try:
        project_name = os.getenv('PROJECT_NAME')
        
        # Query to get all valid requirement pairs and their scores
        query = """
        MATCH (p:Project {name: $project_name})
        WITH p
        MATCH (p)-[:CONTAINS*1..]->(r1:Requirement)
        WHERE r1.type = 'SOURCE'
        WITH p, collect(DISTINCT r1) as source_reqs
        MATCH (p)-[:CONTAINS*1..]->(r2:Requirement)
        WHERE r2.type = 'TARGET'
        WITH source_reqs, collect(DISTINCT r2) as target_reqs
        UNWIND source_reqs as r1
        UNWIND target_reqs as r2
        OPTIONAL MATCH (r1)-[s:SIMILAR_TO]->(r2)
        WHERE s.project = $project_name
        OPTIONAL MATCH (r1)-[g:GROUND_TRUTH]->(r2)
        WHERE g.project = $project_name
        WITH 
            r1.id as source_id,
            r2.id as target_id,
            s.similarity as similarity_score,
            s.model as model_name,
            CASE WHEN g IS NOT NULL THEN 1 ELSE 0 END as is_related
        WHERE similarity_score IS NOT NULL OR is_related = 1
        RETURN *
        """
        
        with neo4j_client.driver.session() as session:
            result = session.run(query, project_name=project_name)
            records = [dict(record) for record in result]
            
            if not records:
                logger.warning(f"No data found for project: {project_name}")
                return pd.DataFrame()
            
            # Create initial DataFrame
            data = pd.DataFrame(records)
            
            # Pivot the data to create separate columns for each model
            # NOTE(review): pairs kept only because of a GROUND_TRUTH link carry a null
            # model_name/similarity_score, and pivot raises ValueError when the same
            # (pair, model) combination appears twice - confirm both cases against the data.
            model_scores = data.pivot(
                index=['source_id', 'target_id', 'is_related'],
                columns='model_name',
                values='similarity_score'
            ).reset_index()
            
            # Log dataset statistics
            logger.info("\nDataset Statistics:")
            logger.info(f"Total pairs: {len(model_scores)}")
            logger.info(f"Related pairs: {model_scores['is_related'].sum()}")
            logger.info(f"Unrelated pairs: {len(model_scores) - model_scores['is_related'].sum()}")
            
            # Check for missing values
            if model_scores.isnull().values.any():
                logger.warning("Missing values found in the dataset")
                logger.debug(f"Missing value counts:\n{model_scores.isnull().sum()}")
            else:
                logger.debug("No missing values found in the dataset")
            
            # Log sample data
            logger.info("\nFirst 5 rows of dataset:")
            logger.info(f"\n{model_scores.head()}")
            logger.info("\nLast 5 rows of dataset:")
            logger.info(f"\n{model_scores.tail()}")
            logger.info("\nDataset Info:")
            # DataFrame.info() writes to a buffer and returns None, so capture its
            # output explicitly instead of interpolating the return value.
            info_buffer = io.StringIO()
            model_scores.info(buf=info_buffer)
            logger.info(f"\n{info_buffer.getvalue()}")
            
            return model_scores
            
    except Exception as e:
        logger.error("Error retrieving similarity data", exc_info=True)
        handle_exception(e)
        raise

# Open the notebook-wide Neo4j client, then log the schema summary and load the
# similarity dataset. Failures are logged, passed to handle_exception and re-raised.
try:
    # Create a single Neo4j client instance to be used throughout the notebook
    neo4j_client = create_neo4j_client()
    logger.info("Created Neo4j client for notebook session")
    
    # Use the client for schema check and data retrieval
    # `schema` and `similarity_data` become notebook globals; `similarity_data`
    # is the input to the training and analysis cells that follow.
    schema = check_schema(neo4j_client)
    similarity_data = get_similarity_data(neo4j_client)
    
except Exception as e:
    logger.error("Failed to initialize data", exc_info=True)
    handle_exception(e)
    raise

In [None]:
# Cell [2] - Data Preparation and Model Training
# Purpose: Prepare balanced dataset and train Random Forest classifier for similarity score analysis
# Dependencies: pandas, sklearn, logger, RandomForestClassifier, train_test_split
# Breadcrumbs: Data Retrieval -> Data Preparation -> Model Training

def prepare_and_train_model(data):
    """
    Prepare balanced dataset and train Random Forest model

    The classes are balanced by downsampling the larger class to the size of the
    smaller one, the balanced data is split 80/20 (stratified), and a
    RandomForestClassifier is fitted on the training part. Metrics for the
    held-out part are logged.

    Args:
        data: DataFrame containing similarity scores and labels; must contain an
            ``is_related`` column, every column other than the known id/content/label
            columns is used as a feature

    Returns:
        tuple: (trained model, feature importance DataFrame, X_test, y_test, predictions, X_full, y_full)

    Raises:
        ValueError: If either class has no samples, so no balanced dataset can be built
        Exception: Any other error is logged, passed to handle_exception and re-raised
    """
    logger.info(f"Starting data preparation and model training for project: {os.getenv('PROJECT_NAME')}")
    try:
        # Prepare features and target
        # Only use numerical similarity score columns
        non_feature_columns = {'source_id', 'source_content', 'target_id',
                               'target_content', 'is_related', 'model_name'}
        feature_columns = [col for col in data.columns if col not in non_feature_columns]

        # Create full dataset
        X_full = data[feature_columns]
        y_full = data['is_related']

        # Prepare balanced dataset
        logger.debug("Preparing balanced dataset")
        positive_samples = data[data['is_related'] == 1]
        negative_samples = data[data['is_related'] == 0]
        samples_per_class = min(len(positive_samples), len(negative_samples))
        if samples_per_class == 0:
            raise ValueError(
                "Cannot build a balanced dataset: found "
                f"{len(positive_samples)} related and {len(negative_samples)} unrelated pairs"
            )

        # Negatives are always drawn with the fixed seed (as before). Positives are
        # only downsampled when they outnumber the negatives, a case that previously
        # made sample() raise because n exceeded the population size.
        negative_samples = negative_samples.sample(n=samples_per_class, random_state=42)
        if len(positive_samples) > samples_per_class:
            positive_samples = positive_samples.sample(n=samples_per_class, random_state=42)
        balanced_data = pd.concat([positive_samples, negative_samples])

        logger.debug(f"Created balanced dataset with {len(balanced_data)} total samples")

        # Prepare features and target for balanced data
        X = balanced_data[feature_columns]
        y = balanced_data['is_related']

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        logger.debug(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")

        # Initialize and train model
        logger.debug("Initializing RandomForestClassifier")
        rf_model = RandomForestClassifier(
            n_estimators=100,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            random_state=42
        )

        logger.debug("Training RandomForestClassifier")
        rf_model.fit(X_train, y_train)

        # Evaluate model
        logger.debug("Evaluating model performance")
        y_pred = rf_model.predict(X_test)
        # ROC AUC is a ranking metric: score it on the positive-class probability,
        # not on the thresholded labels, so it matches the plotted ROC curve.
        y_prob = rf_model.predict_proba(X_test)[:, 1]

        # Calculate metrics
        classification_rep = classification_report(y_test, y_pred)
        f2 = fbeta_score(y_test, y_pred, beta=2)
        roc_auc = roc_auc_score(y_test, y_prob)
        conf_matrix = confusion_matrix(y_test, y_pred)

        # Log results
        logger.info("\nClassification Report:")
        logger.info(f"\n{classification_rep}")
        logger.info(f"F2 Score: {f2:.3f}")
        logger.info(f"ROC AUC Score: {roc_auc:.3f}")
        logger.info("Confusion Matrix:")
        logger.info(f"\n{conf_matrix}")

        # Feature importance analysis
        feature_importance = pd.DataFrame({
            'feature': feature_columns,
            'importance': rf_model.feature_importances_
        }).sort_values('importance', ascending=False)

        logger.info("\nFeature Importance:")
        logger.info(f"\n{feature_importance}")

        return rf_model, feature_importance, X_test, y_test, y_pred, X_full, y_full

    except Exception as e:
        logger.error("Error in model preparation and training", exc_info=True)
        handle_exception(e)
        raise

# Train the Random Forest on the loaded similarity data and keep the model,
# feature importances, held-out split, predictions and full dataset as notebook globals.
try:
    # Prepare the balanced dataset and train the model on `similarity_data`
    rf_model, feature_importance, X_test, y_test, y_pred, X_full, y_full = prepare_and_train_model(similarity_data)
    logger.info("Model training completed successfully")
    
except Exception as e:
    logger.error("Failed to prepare and train model", exc_info=True)
    handle_exception(e)
    raise

In [None]:
# Cell [3] - Visualization Functions
# Purpose: Create comprehensive visualizations for model performance analysis and feature importance
# Dependencies: matplotlib, seaborn, sklearn.metrics, confusion_matrix, roc_curve, precision_recall_curve
# Breadcrumbs: Model Training -> Performance Analysis -> Visualization Generation

def create_visualizations(model, feature_importance, X_test, y_test, y_pred):
    """
    Create, save and display a 2x2 figure for model analysis

    The panels are: confusion matrix heatmap, feature importance bars, ROC curve
    and precision-recall curve. The figure is saved under the ``'visualizations'``
    entry of the module-level ``results`` mapping; a failed save is logged as a
    warning and does not stop the figure from being shown.

    Args:
        model: Fitted classifier exposing ``predict_proba``
        feature_importance: DataFrame with ``feature`` and ``importance`` columns
        X_test: Held-out features
        y_test: Held-out labels (0 = not related, 1 = related)
        y_pred: Class predictions for ``X_test``

    Raises:
        Exception: Any plotting error is logged, passed to handle_exception and re-raised
    """
    project_name = os.getenv('PROJECT_NAME')
    logger.info(f"Creating visualizations for project: {project_name}")

    try:
        # Set up the figure with subplots
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(20, 16))
        fig.suptitle(f'Model Analysis Visualizations - Project: {project_name}', fontsize=16)

        # 1. Confusion Matrix Heatmap
        logger.debug("Creating confusion matrix heatmap")
        # Pin the label order so the matrix is always 2x2 and lines up with the
        # fixed tick labels below, even if one class is absent from the test set.
        conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
        sns.heatmap(
            conf_matrix,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=['Not Related', 'Related'],
            yticklabels=['Not Related', 'Related'],
            ax=ax1
        )
        ax1.set_title('Confusion Matrix')
        ax1.set_xlabel('Predicted')
        ax1.set_ylabel('Actual')

        # 2. Feature Importance Plot
        logger.debug("Creating feature importance plot")
        feature_importance.plot(
            kind='barh',
            x='feature',
            y='importance',
            ax=ax2,
            color='skyblue'
        )
        ax2.set_title('Feature Importance')
        ax2.set_xlabel('Importance Score')
        plt.setp(ax2.get_xticklabels(), rotation=45, ha='right')

        # 3. ROC Curve
        logger.debug("Creating ROC curve")
        y_prob = model.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        roc_auc = auc(fpr, tpr)

        ax3.plot(
            fpr,
            tpr,
            color='darkorange',
            lw=2,
            label=f'ROC curve (AUC = {roc_auc:.2f})'
        )
        ax3.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        ax3.set_xlim([0.0, 1.0])
        ax3.set_ylim([0.0, 1.05])
        ax3.set_xlabel('False Positive Rate')
        ax3.set_ylabel('True Positive Rate')
        ax3.set_title('Receiver Operating Characteristic (ROC) Curve')
        ax3.legend(loc="lower right")

        # 4. Precision-Recall Curve
        logger.debug("Creating precision-recall curve")
        precision, recall, _ = precision_recall_curve(y_test, y_prob)
        pr_auc = auc(recall, precision)

        ax4.plot(
            recall,
            precision,
            color='green',
            lw=2,
            label=f'PR curve (AUC = {pr_auc:.2f})'
        )
        ax4.set_xlim([0.0, 1.0])
        ax4.set_ylim([0.0, 1.05])
        ax4.set_xlabel('Recall')
        ax4.set_ylabel('Precision')
        ax4.set_title('Precision-Recall Curve')
        ax4.legend(loc="lower left")

        plt.tight_layout()

        # Save the plot (best effort: a failed save must not prevent display)
        try:
            plot_filename = f"model_analysis_{project_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
            # Wrap in Path so the join works whether the directory is stored as a
            # str or as a Path object.
            plot_path = Path(results['visualizations']) / plot_filename
            plt.savefig(plot_path)
            logger.info(f"Saved visualization plot to {plot_path}")
        except Exception as save_error:
            logger.warning(f"Could not save plot to file: {str(save_error)}")

        plt.show()

    except Exception as e:
        logger.error("Error creating visualizations", exc_info=True)
        handle_exception(e)
        raise

# Render the model-analysis figure from the model, importances, held-out split
# and predictions produced by the training cell.
try:
    create_visualizations(rf_model, feature_importance, X_test, y_test, y_pred)
except Exception as e:
    logger.error("Failed to create visualizations", exc_info=True)
    handle_exception(e)
    raise

In [None]:
# Cell [4] - Dataset Distribution Analysis
# Purpose: Analyze distribution of related and unrelated requirement pairs for data balance assessment
# Dependencies: pandas, logger, similarity_data
# Breadcrumbs: Visualization Generation -> Data Analysis -> Distribution Statistics

# Log how many requirement pairs in `similarity_data` are related vs. unrelated,
# as absolute counts, percentages and an imbalance ratio.
try:
    # Method 1: Using sum()
    related_count = similarity_data['is_related'].sum()
    logger.info("Dataset Distribution Analysis:")
    logger.info(f"Number of related pairs: {related_count}")

    # Method 2: Using value_counts() to see both related and unrelated counts
    distribution = similarity_data['is_related'].value_counts()
    logger.info("Distribution of related/unrelated pairs:")
    for label, count in distribution.items():
        logger.info(f"Class {label}: {count} pairs")

    # Method 3: Using value_counts(normalize=True) to see percentages
    percentage_dist = similarity_data['is_related'].value_counts(normalize=True) * 100
    logger.info("Percentage distribution:")
    for label, percentage in percentage_dist.items():
        logger.info(f"Class {label}: {percentage:.2f}%")

    # Additional statistics
    total_pairs = len(similarity_data)
    logger.info(f"Total number of pairs analyzed: {total_pairs}")
    # The ratio is undefined without related pairs; dividing anyway would log
    # "inf:1" and trigger a NumPy divide-by-zero warning.
    if related_count > 0:
        logger.info(f"Imbalance ratio (unrelated:related): {(total_pairs - related_count)/related_count:.2f}:1")
    else:
        logger.warning("Imbalance ratio (unrelated:related) is undefined: dataset contains no related pairs")

except Exception as e:
    logger.error("Failed to analyze dataset distribution", exc_info=True)
    handle_exception(e)
    raise

In [None]:
# Cell [5] - Feature Importance Analysis
# Purpose: Analyze and visualize feature importance for model selection and weight optimization
# Dependencies: pandas, sklearn, RandomForestClassifier, seaborn, matplotlib
# Breadcrumbs: Distribution Statistics -> Feature Analysis -> Model Interpretation

def analyze_feature_importance(X_train, y_train, model_names, project_name):
    """
    Analyze and visualize feature importance using Random Forest Classifier

    Fits a class-weighted RandomForestClassifier, logs the importance ranking,
    draws a horizontal bar chart and saves it under the ``'visualizations'``
    entry of the module-level ``results`` mapping.

    Parameters:
    -----------
    X_train : pandas.DataFrame
        Training features
    y_train : pandas.Series
        Training labels
    model_names : list
        List of model names used as features (one per column of ``X_train``)
    project_name : str
        Name of the project being analyzed

    Returns:
    --------
    pandas.DataFrame
        Feature importance scores sorted in descending order
        (columns ``Feature`` and ``Importance``)

    Raises:
    -------
    Exception
        Any error is logged, passed to handle_exception and re-raised
    """
    logger.debug(f"Starting feature importance analysis for project: {project_name}")

    try:
        # Initialize and train Random Forest Classifier
        rf_classifier = RandomForestClassifier(
            n_estimators=100,
            random_state=42,
            class_weight='balanced'
        )
        logger.debug("Training Random Forest Classifier")
        rf_classifier.fit(X_train, y_train)

        # Calculate feature importance
        importance_scores = pd.DataFrame({
            'Feature': model_names,
            'Importance': rf_classifier.feature_importances_
        }).sort_values('Importance', ascending=False)

        logger.info(f"\nFeature Importance Rankings for {project_name}:")
        for feature, importance in zip(importance_scores['Feature'], importance_scores['Importance']):
            logger.info(f"{feature}: {importance:.4f}")

        # Create visualization
        plt.figure(figsize=(12, 6))
        sns.barplot(
            data=importance_scores,
            x='Importance',
            y='Feature',
            palette='viridis'
        )
        plt.title(f'Feature Importance Analysis - {project_name}')
        plt.xlabel('Importance Score')
        plt.ylabel('Model')

        # Add value labels to the bars (bars are drawn in the sorted row order)
        for i, v in enumerate(importance_scores['Importance']):
            plt.text(v, i, f'{v:.4f}', va='center')

        # Adjust layout and save plot
        plt.tight_layout()

        # Save the plot with timestamp in results directory
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        plot_filename = f"feature_importance_{project_name.lower()}_{timestamp}.png"
        # Wrap in Path so the join works whether the directory is stored as a
        # str or as a Path object (str + "/" fails on a Path).
        plot_path = Path(results['visualizations']) / plot_filename
        plt.savefig(plot_path)
        logger.info(f"Feature importance plot saved to {plot_path}")

        plt.show()

        return importance_scores

    except Exception as e:
        logger.error("Error in feature importance analysis", exc_info=True)
        handle_exception(e)
        raise

# Run the feature importance analysis on a stratified 80/20 split of the FULL
# (unbalanced) similarity dataset, unlike the training cell, which balances first.
try:
    project_name = os.getenv('PROJECT_NAME')
    logger.info(f"Starting feature importance analysis for {project_name}")
        
    # Get model names (excluding 'is_related' and other non-model columns)
    model_columns = [col for col in similarity_data.columns if col not in ['source_id', 'target_id', 'is_related']]
    
    # Prepare data
    X = similarity_data[model_columns]
    y = similarity_data['is_related']
    
    # Split data
    # NOTE(review): this rebinds the notebook globals X_test / y_test that the
    # training cell produced; re-running the visualization cell afterwards would
    # pair this split with the earlier y_pred.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, 
        test_size=0.2, 
        random_state=42, 
        stratify=y
    )
    
    # Run analysis
    importance_results = analyze_feature_importance(
        X_train, 
        y_train, 
        model_columns,
        project_name
    )
    
    logger.info("Feature importance analysis completed successfully")
    
except Exception as e:
    logger.error("Failed to complete feature importance analysis", exc_info=True)
    handle_exception(e)
    raise

In [None]:
# Cell [6] - Threshold Analysis
# Purpose: Analyze model performance across different similarity thresholds for optimal threshold selection
# Dependencies: numpy, sklearn.metrics, pandas, tqdm, matplotlib
# Breadcrumbs: Feature Analysis -> Threshold Optimization -> Performance Evaluation

def analyze_threshold_performance(data, thresholds=None):
    """
    Analyze model performance across different thresholds

    For every numeric score column a pair is predicted as related when its score
    is ``>= threshold``; the predictions are compared against ``is_related``.

    Parameters:
    -----------
    data : pandas.DataFrame
        DataFrame containing similarity scores and ground truth
        (must contain an ``is_related`` column with 0/1 labels)
    thresholds : list, optional
        List of thresholds to evaluate (default: np.arange(0.1, 1.0, 0.1))

    Returns:
    --------
    dict
        Dictionary containing performance metrics for each threshold and model,
        shaped ``{model: {threshold: metrics_dict}}``

    Raises:
    -------
    Exception
        Any error is logged, passed to handle_exception and re-raised
    """
    logger.debug("Starting threshold performance analysis")

    try:
        if thresholds is None:
            thresholds = np.arange(0.1, 1.0, 0.1)

        # Get only numeric columns (excluding metadata and non-numeric columns)
        numeric_columns = data.select_dtypes(include=[np.number]).columns
        metadata_columns = {'source_id', 'target_id', 'is_related'}
        model_columns = [col for col in numeric_columns if col not in metadata_columns]

        logger.debug(f"Analyzing thresholds for models: {model_columns}")

        # Named distinctly from the notebook-level `results` directory mapping.
        threshold_metrics = {model: {} for model in model_columns}

        for model in model_columns:
            logger.debug(f"Analyzing thresholds for {model}")

            # Ensure we have numeric data
            model_data = data[model].astype(float)
            ground_truth = data['is_related'].astype(int)

            for threshold in thresholds:
                # Make predictions using threshold (missing scores compare False -> 0)
                predictions = (model_data >= threshold).astype(int)

                # Calculate metrics
                # zero_division=0 keeps the score at 0 (the previous effective value)
                # without a warning when a threshold yields no positive predictions.
                # labels=[0, 1] guarantees a 2x2 matrix even if only one class occurs,
                # so the four-way unpack below cannot fail.
                metrics = {
                    'precision': precision_score(ground_truth, predictions, zero_division=0),
                    'recall': recall_score(ground_truth, predictions, zero_division=0),
                    'f1': fbeta_score(ground_truth, predictions, beta=1, zero_division=0),
                    'f2': fbeta_score(ground_truth, predictions, beta=2, zero_division=0),
                    'confusion_matrix': confusion_matrix(ground_truth, predictions, labels=[0, 1])
                }

                # Calculate additional metrics from confusion matrix
                tn, fp, fn, tp = metrics['confusion_matrix'].ravel()
                metrics.update({
                    'true_positives': tp,
                    'false_positives': fp,
                    'true_negatives': tn,
                    'false_negatives': fn,
                    'false_negative_rate': fn / (fn + tp) if (fn + tp) > 0 else 0,
                    'false_positive_rate': fp / (fp + tn) if (fp + tn) > 0 else 0
                })

                threshold_metrics[model][threshold] = metrics

                logger.debug(f"{model} @ {threshold:.2f}: "
                           f"F1={metrics['f1']:.3f}, "
                           f"F2={metrics['f2']:.3f}, "
                           f"FNR={metrics['false_negative_rate']:.3f}")

        return threshold_metrics

    except Exception as e:
        logger.error("Error in threshold performance analysis", exc_info=True)
        handle_exception(e)
        raise

def plot_threshold_analysis(results, save_path=None):
    """
    Create visualizations for threshold analysis results

    Draws a 2x2 figure: precision-recall points, F1/F2 vs threshold, false
    negative rate vs threshold, and an ROC-style plot built from the evaluated
    thresholds. Nothing is drawn when ``results`` is empty.

    Parameters:
    -----------
    results : dict
        Results from threshold analysis, shaped ``{model: {threshold: metrics}}``
    save_path : str or path-like, optional
        Path to save the visualization

    Raises:
    -------
    Exception
        Any plotting error is logged, passed to handle_exception and re-raised
    """
    logger.debug("Creating threshold analysis visualizations")

    try:
        # An empty analysis (e.g. no numeric score columns) has nothing to plot;
        # indexing models[0] below would otherwise raise IndexError.
        if not results:
            logger.warning("No threshold analysis results to plot")
            return

        models = list(results)
        # All models are evaluated on the same thresholds, so the first one's keys suffice.
        thresholds = sorted(results[models[0]])

        def _series(model, key):
            """Return metric `key` for `model` at every threshold, in threshold order."""
            return [results[model][t][key] for t in thresholds]

        def _decorate(ax, title, xlabel, ylabel):
            """Apply the title, axis labels, grid and legend shared by all four panels."""
            ax.set_title(title)
            ax.set_xlabel(xlabel)
            ax.set_ylabel(ylabel)
            ax.grid(True)
            ax.legend()

        # Create subplots
        fig, axes = plt.subplots(2, 2, figsize=(20, 16))
        fig.suptitle('Threshold Analysis Results', fontsize=16)

        # Plot 1: Precision-Recall curves
        for model in models:
            axes[0, 0].plot(_series(model, 'recall'), _series(model, 'precision'),
                            marker='o', label=model)
        _decorate(axes[0, 0], 'Precision-Recall Curve', 'Recall', 'Precision')

        # Plot 2: F1 and F2 scores vs threshold
        for model in models:
            axes[0, 1].plot(thresholds, _series(model, 'f1'), marker='o', label=f'{model} (F1)')
            axes[0, 1].plot(thresholds, _series(model, 'f2'), marker='s', label=f'{model} (F2)')
        _decorate(axes[0, 1], 'F1 and F2 Scores vs Threshold', 'Threshold', 'Score')

        # Plot 3: False Negative Rate vs threshold
        for model in models:
            axes[1, 0].plot(thresholds, _series(model, 'false_negative_rate'),
                            marker='o', label=model)
        _decorate(axes[1, 0], 'False Negative Rate vs Threshold', 'Threshold', 'False Negative Rate')

        # Plot 4: ROC curve (true positive rate is 1 - false negative rate)
        for model in models:
            fpr = _series(model, 'false_positive_rate')
            tpr = [1 - fnr for fnr in _series(model, 'false_negative_rate')]
            axes[1, 1].plot(fpr, tpr, marker='o', label=model)
        axes[1, 1].plot([0, 1], [0, 1], 'k--')  # diagonal line
        _decorate(axes[1, 1], 'ROC Curve', 'False Positive Rate', 'True Positive Rate')

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path)
            logger.info(f"Threshold analysis plots saved to {save_path}")

        plt.show()

    except Exception as e:
        logger.error("Error creating threshold analysis visualizations", exc_info=True)
        handle_exception(e)
        raise

# Evaluate every score column at thresholds 0.1 .. 0.9, plot the results and log
# the F1-optimal threshold per model. Relies on `similarity_data`, `project_name`
# and `results` defined by earlier cells.
try:
    logger.info("Starting threshold analysis")

    # Define thresholds to analyze
    thresholds = np.arange(0.1, 1.0, 0.1)

    # Run threshold analysis
    threshold_results = analyze_threshold_performance(similarity_data, thresholds)

    # Create visualizations
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    plot_filename = f"threshold_analysis_{project_name.lower()}_{timestamp}.png"
    # Join through Path so this works whether the results directory is stored as
    # a str or a Path; keep plot_path a str as before.
    plot_path = str(Path(results['visualizations']) / plot_filename)

    plot_threshold_analysis(threshold_results, save_path=plot_path)

    # Log best thresholds for each model
    logger.info("\nBest thresholds by F1 score:")
    for model in threshold_results:
        # max() returns the first maximal entry, i.e. the lowest threshold on ties.
        best_threshold = max(threshold_results[model].items(),
                           key=lambda x: x[1]['f1'])[0]
        best_metrics = threshold_results[model][best_threshold]

        logger.info(f"\n{model}:")
        logger.info(f"Best threshold: {best_threshold:.2f}")
        logger.info(f"F1 score: {best_metrics['f1']:.3f}")
        logger.info(f"F2 score: {best_metrics['f2']:.3f}")
        logger.info(f"Precision: {best_metrics['precision']:.3f}")
        logger.info(f"Recall: {best_metrics['recall']:.3f}")
        logger.info(f"False Negative Rate: {best_metrics['false_negative_rate']:.3f}")

    logger.info("Threshold analysis completed successfully")

except Exception as e:
    logger.error("Failed to complete threshold analysis", exc_info=True)
    handle_exception(e)
    raise

In [None]:
# Cell [7] - Requirement Text Retrieval
# Purpose: Retrieve requirement text content from Neo4j database for detailed analysis
# Dependencies: Neo4jClient, logger, handle_exception
# Breadcrumbs: Performance Evaluation -> Text Retrieval -> Content Analysis

def get_requirement_texts(source_ids, target_ids, neo4j_client):
    """
    Look up the text content of source and target requirements in Neo4j.

    Two queries are issued, each in its own session: one for nodes labelled
    Requirement with type 'SOURCE', one for type 'TARGET'. IDs that have no
    matching node are simply absent from the returned dictionaries.

    Args:
        source_ids (list): List of source requirement IDs
        target_ids (list): List of target requirement IDs
        neo4j_client (Neo4jClient): Existing Neo4j client instance

    Returns:
        tuple: (source_texts dict, target_texts dict), each mapping
            requirement ID -> content

    Raises:
        Exception: Any error from the driver is logged, passed to
            handle_exception and re-raised.
    """
    logger.debug("Fetching requirement texts from Neo4j")

    def _collect(cypher, id_field, text_field, **params):
        # Run one query in a fresh session and fold its records into {id: text}.
        with neo4j_client.driver.session() as session:
            records = session.run(cypher, **params)
            return {rec[id_field]: rec[text_field] for rec in records}

    try:
        # Source requirement texts
        source_query = """
        MATCH (s:Requirement {type: 'SOURCE'})
        WHERE s.id IN $source_ids
        RETURN s.id as source_id, s.content as source_text
        """
        source_texts = _collect(
            source_query, 'source_id', 'source_text', source_ids=source_ids
        )

        # Target requirement texts
        target_query = """
        MATCH (t:Requirement {type: 'TARGET'})
        WHERE t.id IN $target_ids
        RETURN t.id as target_id, t.content as target_text
        """
        target_texts = _collect(
            target_query, 'target_id', 'target_text', target_ids=target_ids
        )

        logger.debug(f"Successfully retrieved texts for {len(source_texts)} source and {len(target_texts)} target requirements")
        return source_texts, target_texts

    except Exception as e:
        logger.error("Failed to retrieve requirement texts", exc_info=True)
        handle_exception(e)
        raise

In [None]:
# Cell [8] - False Negative Analysis
# Purpose: Analyze false negative cases to understand model limitations and improve recall
# Dependencies: numpy, pandas, matplotlib, logger, RandomForestClassifier
# Breadcrumbs: Content Analysis -> Error Analysis -> False Negative Investigation

def analyze_false_negatives(rf, X_full, y_full, threshold=0.5):
    """
    Analyze all cases where model predicted negative but actual was positive.

    Args:
        rf: Random Forest model (must expose predict_proba)
        X_full: Full feature set (DataFrame, one column per similarity model)
        y_full: Full ground truth labels (Series; 1 marks an actual positive)
        threshold: Classification threshold (default 0.5)

    Returns:
        DataFrame: One row per false negative, sorted by descending
            probability, with columns 'Actual', 'Predicted', 'Probability',
            'Source ID', 'Target ID' and one 'Model <n>' column per column
            of X_full.

    Raises:
        Exception: Any error is logged, passed to handle_exception and re-raised.

    Note:
        Source/target IDs are read by row position from the module-level
        `similarity_data` frame, so that frame is assumed to be row-aligned
        with X_full and y_full.
    """
    logger.debug(f"Starting false negative analysis with threshold {threshold}")
    try:
        # Get predictions using the specified threshold
        y_prob = rf.predict_proba(X_full)[:, 1]
        y_pred = (y_prob >= threshold).astype(int)

        # Positional indices of rows predicted 0 whose label is 1
        fn_indices = np.where((y_pred == 0) & (y_full == 1))[0]
        total_fn = len(fn_indices)

        # Create DataFrame with false negatives
        fn_pairs = similarity_data.iloc[fn_indices]
        fn_data = pd.DataFrame({
            'Actual': y_full.iloc[fn_indices],
            'Predicted': y_pred[fn_indices],
            'Probability': y_prob[fn_indices],
            'Source ID': fn_pairs['source_id'],
            'Target ID': fn_pairs['target_id'],
        })

        # Get model columns from X_full
        model_columns = X_full.columns
        logger.debug(f"Available model columns: {model_columns}")

        # Add model scores using actual column names from X_full
        fn_features = X_full.iloc[fn_indices]
        for number, model_col in enumerate(model_columns, start=1):
            fn_data[f'Model {number}'] = fn_features[model_col]
            logger.debug(f"Added scores from {model_col} as Model {number}")

        # Sort by probability
        fn_data = fn_data.sort_values('Probability', ascending=False)

        # False negative rate = FN / actual positives (FN + TP), not FN / all samples
        actual_positives = int((y_full == 1).sum())
        fn_rate = total_fn / actual_positives if actual_positives else 0.0

        # Log statistics
        logger.info("\nFalse Negative Analysis Results:")
        logger.info(f"Total false negatives: {total_fn}")
        logger.info(f"False negative rate: {fn_rate:.2%}")

        # Log probability distribution.
        # A false negative always has probability < threshold, so only the
        # 0.1-wide buckets that start below the threshold can hold cases.
        logger.info("\nProbability Distribution of False Negatives:")
        probabilities = fn_data['Probability']
        for bucket in range(10):
            low, high = bucket / 10, (bucket + 1) / 10
            if low >= threshold:
                break
            # The last bucket is closed on the right so that 1.0 is counted
            below_high = probabilities <= high if bucket == 9 else probabilities < high
            count = int(((probabilities >= low) & below_high).sum())
            share = count / total_fn if total_fn else 0.0
            logger.info(f"Probability {low:.1f}-{high:.1f}: {count} cases ({share:.1%})")

        # Log model score statistics
        logger.info("\nModel Score Statistics for False Negatives:")
        for number, model_col in enumerate(model_columns, start=1):
            scores = fn_data[f'Model {number}']
            logger.info(f"Model {number} ({model_col}):")
            logger.info(f"  Mean: {scores.mean():.3f}")
            logger.info(f"  Std: {scores.std():.3f}")
            logger.info(f"  Min: {scores.min():.3f}")
            logger.info(f"  Max: {scores.max():.3f}")

        # Plot probability distribution
        logger.debug("Generating probability distribution plot")
        plt.figure(figsize=(10, 6))
        plt.hist(fn_data['Probability'], bins=20)
        plt.xlabel('Prediction Probability')
        plt.ylabel('Count')
        plt.title('Distribution of False Negative Probabilities')
        plt.grid(True)
        plt.show()

        logger.debug("False negative analysis completed successfully")
        return fn_data

    except Exception as e:
        logger.error("Error during false negative analysis", exc_info=True)
        handle_exception(e)
        raise

# Driver for the false negative analysis: runs it at threshold 0.3 and logs
# the five highest-probability misses together with their requirement texts.
try:
    logger.info("Starting false negative analysis")
    fn_analysis = analyze_false_negatives(rf_model, X_full, y_full, threshold=0.3)

    # Get requirement texts for example cases
    source_texts, target_texts = get_requirement_texts(
        fn_analysis['Source ID'].head().tolist(),
        fn_analysis['Target ID'].head().tolist(),
        neo4j_client
    )

    # Log example cases
    logger.info("Example False Negative Cases (Top 5 by probability):")
    # iterrows() yields the row's index label (its position in the original
    # data), not its rank, so the case number comes from enumerate().
    for case_number, (idx, row) in enumerate(fn_analysis.head().iterrows(), start=1):
        logger.info(f"Case {case_number}:")
        logger.info(f"Source ID: {row['Source ID']}")
        logger.info(f"Target ID: {row['Target ID']}")
        logger.info(f"Prediction Probability: {row['Probability']:.3f}")
        for model_number in range(1, len(X_full.columns) + 1):
            logger.info(f"Model {model_number} Score: {row[f'Model {model_number}']:.3f}")

        # Log requirement texts at DEBUG level
        logger.debug("Requirement Texts:")
        logger.debug(f"Source Text: {source_texts.get(row['Source ID'], 'Not found')}")
        logger.debug(f"Target Text: {target_texts.get(row['Target ID'], 'Not found')}")

    logger.debug("False negative analysis completed")

except Exception as e:
    logger.error("Failed to complete false negative analysis", exc_info=True)
    handle_exception(e)
    raise

In [None]:
# Cell [9] - False Positive Analysis
# Purpose: Analyze false positive cases to understand precision limitations and reduce false alarms
# Dependencies: numpy, pandas, matplotlib, logger, RandomForestClassifier
# Breadcrumbs: False Negative Investigation -> Error Analysis -> False Positive Investigation

def analyze_false_positives(rf, X_full, y_full, threshold=0.5):
    """
    Analyze all cases where model predicted positive but actual was negative.

    Args:
        rf: Random Forest model (must expose predict_proba)
        X_full: Full feature set (DataFrame, one column per similarity model)
        y_full: Full ground truth labels (Series; 0 marks an actual negative)
        threshold: Classification threshold (default 0.5)

    Returns:
        DataFrame: One row per false positive, sorted by descending
            probability, with columns 'Actual', 'Predicted', 'Probability',
            'Source ID', 'Target ID' and one 'Model <n>' column per column
            of X_full.

    Raises:
        Exception: Any error is logged, passed to handle_exception and re-raised.

    Note:
        Source/target IDs are read by row position from the module-level
        `similarity_data` frame, so that frame is assumed to be row-aligned
        with X_full and y_full.
    """
    logger.debug(f"Starting false positive analysis with threshold {threshold}")
    try:
        # Get predictions using the specified threshold
        y_prob = rf.predict_proba(X_full)[:, 1]
        y_pred = (y_prob >= threshold).astype(int)

        # Positional indices of rows predicted 1 whose label is 0
        fp_indices = np.where((y_pred == 1) & (y_full == 0))[0]
        total_fp = len(fp_indices)

        # Create DataFrame with false positives
        fp_pairs = similarity_data.iloc[fp_indices]
        fp_data = pd.DataFrame({
            'Actual': y_full.iloc[fp_indices],
            'Predicted': y_pred[fp_indices],
            'Probability': y_prob[fp_indices],
            'Source ID': fp_pairs['source_id'],
            'Target ID': fp_pairs['target_id'],
        })

        # Get model columns from X_full
        model_columns = X_full.columns
        logger.debug(f"Available model columns: {model_columns}")

        # Add model scores using actual column names from X_full
        fp_features = X_full.iloc[fp_indices]
        for number, model_col in enumerate(model_columns, start=1):
            fp_data[f'Model {number}'] = fp_features[model_col]
            logger.debug(f"Added scores from {model_col} as Model {number}")

        # Sort by probability
        fp_data = fp_data.sort_values('Probability', ascending=False)

        # False positive rate = FP / actual negatives (FP + TN), not FP / all samples
        actual_negatives = int((y_full == 0).sum())
        fp_rate = total_fp / actual_negatives if actual_negatives else 0.0

        # Log statistics
        logger.info("\nFalse Positive Analysis Results:")
        logger.info(f"Total false positives: {total_fp}")
        logger.info(f"False positive rate: {fp_rate:.2%}")

        # Log probability distribution.
        # A false positive always has probability >= threshold, so every
        # 0.1-wide bucket that ends above the threshold is reported; fixed
        # 0.5-1.0 buckets would drop cases when the threshold is below 0.5.
        logger.info("\nProbability Distribution of False Positives:")
        probabilities = fp_data['Probability']
        for bucket in range(10):
            low, high = bucket / 10, (bucket + 1) / 10
            if high <= threshold:
                continue
            # The last bucket is closed on the right so that 1.0 is counted
            below_high = probabilities <= high if bucket == 9 else probabilities < high
            count = int(((probabilities >= low) & below_high).sum())
            share = count / total_fp if total_fp else 0.0
            logger.info(f"Probability {low:.1f}-{high:.1f}: {count} cases ({share:.1%})")

        # Log model score statistics
        logger.info("\nModel Score Statistics for False Positives:")
        for number, model_col in enumerate(model_columns, start=1):
            scores = fp_data[f'Model {number}']
            logger.info(f"Model {number} ({model_col}):")
            logger.info(f"  Mean: {scores.mean():.3f}")
            logger.info(f"  Std: {scores.std():.3f}")
            logger.info(f"  Min: {scores.min():.3f}")
            logger.info(f"  Max: {scores.max():.3f}")

        # Plot probability distribution
        logger.debug("Generating probability distribution plot")
        plt.figure(figsize=(10, 6))
        plt.hist(fp_data['Probability'], bins=20)
        plt.xlabel('Prediction Probability')
        plt.ylabel('Count')
        plt.title('Distribution of False Positive Probabilities')
        plt.grid(True)
        plt.show()

        logger.debug("False positive analysis completed successfully")
        return fp_data

    except Exception as e:
        logger.error("Error during false positive analysis", exc_info=True)
        handle_exception(e)
        raise

# Driver for the false positive analysis: runs it at threshold 0.3 and logs
# the five highest-probability false alarms together with their requirement texts.
try:
    logger.info("Starting false positive analysis")
    fp_analysis = analyze_false_positives(rf_model, X_full, y_full, threshold=0.3)

    # Get requirement texts for example cases
    source_texts, target_texts = get_requirement_texts(
        fp_analysis['Source ID'].head().tolist(),
        fp_analysis['Target ID'].head().tolist(),
        neo4j_client
    )

    # Log example cases
    logger.info("Example False Positive Cases (Top 5 by probability):")
    # iterrows() yields the row's index label (its position in the original
    # data), not its rank, so the case number comes from enumerate().
    for case_number, (idx, row) in enumerate(fp_analysis.head().iterrows(), start=1):
        logger.info(f"Case {case_number}:")
        logger.info(f"Source ID: {row['Source ID']}")
        logger.info(f"Target ID: {row['Target ID']}")
        logger.info(f"Prediction Probability: {row['Probability']:.3f}")
        for model_number in range(1, len(X_full.columns) + 1):
            logger.info(f"Model {model_number} Score: {row[f'Model {model_number}']:.3f}")

        # Log requirement texts at DEBUG level
        logger.debug("Requirement Texts:")
        logger.debug(f"Source Text: {source_texts.get(row['Source ID'], 'Not found')}")
        logger.debug(f"Target Text: {target_texts.get(row['Target ID'], 'Not found')}")

    logger.debug("False positive analysis completed")

except Exception as e:
    logger.error("Failed to complete false positive analysis", exc_info=True)
    handle_exception(e)
    raise

In [None]:
# Cell [10] - True Positive Analysis
# Purpose: Analyze true positive cases to understand what makes good predictions and validate model strength
# Dependencies: numpy, pandas, matplotlib, logger, RandomForestClassifier
# Breadcrumbs: False Positive Investigation -> Success Analysis -> True Positive Investigation

def analyze_true_positives(rf, X_full, y_full, threshold=0.5):
    """
    Analyze all cases where model correctly predicted positive matches.

    Args:
        rf: Random Forest model (must expose predict_proba)
        X_full: Full feature set (DataFrame, one column per similarity model)
        y_full: Full ground truth labels (Series; 1 marks an actual positive)
        threshold: Classification threshold (default 0.5)

    Returns:
        DataFrame: One row per true positive, sorted by descending
            probability, with columns 'Actual', 'Predicted', 'Probability',
            'Source ID', 'Target ID' and one 'Model <n>' column per column
            of X_full.

    Raises:
        Exception: Any error is logged, passed to handle_exception and re-raised.

    Note:
        Source/target IDs are read by row position from the module-level
        `similarity_data` frame, so that frame is assumed to be row-aligned
        with X_full and y_full.
    """
    logger.debug(f"Starting true positive analysis with threshold {threshold}")
    try:
        # Get predictions using the specified threshold
        y_prob = rf.predict_proba(X_full)[:, 1]
        y_pred = (y_prob >= threshold).astype(int)

        # Positional indices of rows predicted 1 whose label is 1
        tp_indices = np.where((y_pred == 1) & (y_full == 1))[0]
        total_tp = len(tp_indices)

        # Create DataFrame with true positives
        tp_pairs = similarity_data.iloc[tp_indices]
        tp_data = pd.DataFrame({
            'Actual': y_full.iloc[tp_indices],
            'Predicted': y_pred[tp_indices],
            'Probability': y_prob[tp_indices],
            'Source ID': tp_pairs['source_id'],
            'Target ID': tp_pairs['target_id'],
        })

        # Get model columns from X_full
        model_columns = X_full.columns
        logger.debug(f"Available model columns: {model_columns}")

        # Add model scores using actual column names from X_full
        tp_features = X_full.iloc[tp_indices]
        for number, model_col in enumerate(model_columns, start=1):
            tp_data[f'Model {number}'] = tp_features[model_col]
            logger.debug(f"Added scores from {model_col} as Model {number}")

        # Sort by probability
        tp_data = tp_data.sort_values('Probability', ascending=False)

        # True positive rate = TP / actual positives (TP + FN), not TP / all samples
        actual_positives = int((y_full == 1).sum())
        tp_rate = total_tp / actual_positives if actual_positives else 0.0

        # Log statistics
        logger.info("\nTrue Positive Analysis Results:")
        logger.info(f"Total true positives: {total_tp}")
        logger.info(f"True positive rate: {tp_rate:.2%}")

        # Log probability distribution.
        # A true positive always has probability >= threshold, so every
        # 0.1-wide bucket that ends above the threshold is reported; fixed
        # 0.5-1.0 buckets would drop cases when the threshold is below 0.5.
        logger.info("\nProbability Distribution of True Positives:")
        probabilities = tp_data['Probability']
        for bucket in range(10):
            low, high = bucket / 10, (bucket + 1) / 10
            if high <= threshold:
                continue
            # The last bucket is closed on the right so that 1.0 is counted
            below_high = probabilities <= high if bucket == 9 else probabilities < high
            count = int(((probabilities >= low) & below_high).sum())
            share = count / total_tp if total_tp else 0.0
            logger.info(f"Probability {low:.1f}-{high:.1f}: {count} cases ({share:.1%})")

        # Log model score statistics
        logger.info("\nModel Score Statistics for True Positives:")
        for number, model_col in enumerate(model_columns, start=1):
            scores = tp_data[f'Model {number}']
            logger.info(f"Model {number} ({model_col}):")
            logger.info(f"  Mean: {scores.mean():.3f}")
            logger.info(f"  Std: {scores.std():.3f}")
            logger.info(f"  Min: {scores.min():.3f}")
            logger.info(f"  Max: {scores.max():.3f}")

        # Plot probability distribution
        logger.debug("Generating probability distribution plot")
        plt.figure(figsize=(10, 6))
        plt.hist(tp_data['Probability'], bins=20)
        plt.xlabel('Prediction Probability')
        plt.ylabel('Count')
        plt.title('Distribution of True Positive Probabilities')
        plt.grid(True)
        plt.show()

        logger.debug("True positive analysis completed successfully")
        return tp_data

    except Exception as e:
        logger.error("Error during true positive analysis", exc_info=True)
        handle_exception(e)
        raise

# Driver for the true positive analysis: runs it at threshold 0.3 and logs
# the five highest-probability hits together with their requirement texts.
try:
    logger.info("Starting true positive analysis")
    tp_analysis = analyze_true_positives(rf_model, X_full, y_full, threshold=0.3)

    # Get requirement texts for example cases
    source_texts, target_texts = get_requirement_texts(
        tp_analysis['Source ID'].head().tolist(),
        tp_analysis['Target ID'].head().tolist(),
        neo4j_client
    )

    # Log example cases
    logger.info("Example True Positive Cases (Top 5 by probability):")
    # iterrows() yields the row's index label (its position in the original
    # data), not its rank, so the case number comes from enumerate().
    for case_number, (idx, row) in enumerate(tp_analysis.head().iterrows(), start=1):
        logger.info(f"Case {case_number}:")
        logger.info(f"Source ID: {row['Source ID']}")
        logger.info(f"Target ID: {row['Target ID']}")
        logger.info(f"Prediction Probability: {row['Probability']:.3f}")
        for model_number in range(1, len(X_full.columns) + 1):
            logger.info(f"Model {model_number} Score: {row[f'Model {model_number}']:.3f}")

        # Log requirement texts at DEBUG level
        logger.debug("Requirement Texts:")
        logger.debug(f"Source Text: {source_texts.get(row['Source ID'], 'Not found')}")
        logger.debug(f"Target Text: {target_texts.get(row['Target ID'], 'Not found')}")

    logger.debug("True positive analysis completed")

except Exception as e:
    logger.error("Failed to complete true positive analysis", exc_info=True)
    handle_exception(e)
    raise

In [None]:
# Cell [11] - True Negative Analysis
# Purpose: Analyze true negative cases to validate model specificity and understand correct rejections
# Dependencies: numpy, pandas, matplotlib, logger, RandomForestClassifier
# Breadcrumbs: True Positive Investigation -> Specificity Analysis -> True Negative Investigation

def analyze_true_negatives(rf, X_full, y_full, threshold=0.5):
    """
    Analyze all cases where model correctly predicted negative matches.

    Args:
        rf: Random Forest model (must expose predict_proba)
        X_full: Full feature set (DataFrame, one column per similarity model)
        y_full: Full ground truth labels (Series; 0 marks an actual negative)
        threshold: Classification threshold (default 0.5)

    Returns:
        DataFrame: One row per true negative, sorted by descending
            probability, with columns 'Actual', 'Predicted', 'Probability',
            'Source ID', 'Target ID' and one 'Model <n>' column per column
            of X_full.

    Raises:
        Exception: Any error is logged, passed to handle_exception and re-raised.

    Note:
        Source/target IDs are read by row position from the module-level
        `similarity_data` frame, so that frame is assumed to be row-aligned
        with X_full and y_full.
    """
    logger.debug(f"Starting true negative analysis with threshold {threshold}")
    try:
        # Get predictions using the specified threshold
        y_prob = rf.predict_proba(X_full)[:, 1]
        y_pred = (y_prob >= threshold).astype(int)

        # Positional indices of rows predicted 0 whose label is 0
        tn_indices = np.where((y_pred == 0) & (y_full == 0))[0]
        total_tn = len(tn_indices)

        # Create DataFrame with true negatives
        tn_pairs = similarity_data.iloc[tn_indices]
        tn_data = pd.DataFrame({
            'Actual': y_full.iloc[tn_indices],
            'Predicted': y_pred[tn_indices],
            'Probability': y_prob[tn_indices],
            'Source ID': tn_pairs['source_id'],
            'Target ID': tn_pairs['target_id'],
        })

        # Get model columns from X_full
        model_columns = X_full.columns
        logger.debug(f"Available model columns: {model_columns}")

        # Add model scores using actual column names from X_full
        tn_features = X_full.iloc[tn_indices]
        for number, model_col in enumerate(model_columns, start=1):
            tn_data[f'Model {number}'] = tn_features[model_col]
            logger.debug(f"Added scores from {model_col} as Model {number}")

        # Sort by probability
        tn_data = tn_data.sort_values('Probability', ascending=False)

        # True negative rate = TN / actual negatives (TN + FP), not TN / all samples
        actual_negatives = int((y_full == 0).sum())
        tn_rate = total_tn / actual_negatives if actual_negatives else 0.0

        # Log statistics
        logger.info("\nTrue Negative Analysis Results:")
        logger.info(f"Total true negatives: {total_tn}")
        logger.info(f"True negative rate: {tn_rate:.2%}")

        # Log probability distribution.
        # A true negative always has probability < threshold, so only the
        # 0.1-wide buckets that start below the threshold can hold cases.
        logger.info("\nProbability Distribution of True Negatives:")
        probabilities = tn_data['Probability']
        for bucket in range(10):
            low, high = bucket / 10, (bucket + 1) / 10
            if low >= threshold:
                break
            # The last bucket is closed on the right so that 1.0 is counted
            below_high = probabilities <= high if bucket == 9 else probabilities < high
            count = int(((probabilities >= low) & below_high).sum())
            share = count / total_tn if total_tn else 0.0
            logger.info(f"Probability {low:.1f}-{high:.1f}: {count} cases ({share:.1%})")

        # Log model score statistics
        logger.info("\nModel Score Statistics for True Negatives:")
        for number, model_col in enumerate(model_columns, start=1):
            scores = tn_data[f'Model {number}']
            logger.info(f"Model {number} ({model_col}):")
            logger.info(f"  Mean: {scores.mean():.3f}")
            logger.info(f"  Std: {scores.std():.3f}")
            logger.info(f"  Min: {scores.min():.3f}")
            logger.info(f"  Max: {scores.max():.3f}")

        # Plot probability distribution
        logger.debug("Generating probability distribution plot")
        plt.figure(figsize=(10, 6))
        plt.hist(tn_data['Probability'], bins=20)
        plt.xlabel('Prediction Probability')
        plt.ylabel('Count')
        plt.title('Distribution of True Negative Probabilities')
        plt.grid(True)
        plt.show()

        logger.debug("True negative analysis completed successfully")
        return tn_data

    except Exception as e:
        logger.error("Error during true negative analysis", exc_info=True)
        handle_exception(e)
        raise

# Driver for the true negative analysis: runs it at threshold 0.3 and logs
# the five highest-probability correct rejections together with their
# requirement texts.
try:
    logger.info("Starting true negative analysis")
    tn_analysis = analyze_true_negatives(rf_model, X_full, y_full, threshold=0.3)

    # Get requirement texts for example cases
    source_texts, target_texts = get_requirement_texts(
        tn_analysis['Source ID'].head().tolist(),
        tn_analysis['Target ID'].head().tolist(),
        neo4j_client
    )

    # Log example cases
    logger.info("\nExample True Negative Cases (Top 5 by probability):")
    # iterrows() yields the row's index label (its position in the original
    # data), not its rank, so the case number comes from enumerate().
    for case_number, (idx, row) in enumerate(tn_analysis.head().iterrows(), start=1):
        logger.info(f"\nCase {case_number}:")
        logger.info(f"Source ID: {row['Source ID']}")
        logger.info(f"Target ID: {row['Target ID']}")
        logger.info(f"Prediction Probability: {row['Probability']:.3f}")
        for model_number in range(1, len(X_full.columns) + 1):
            logger.info(f"Model {model_number} Score: {row[f'Model {model_number}']:.3f}")

        # Log requirement texts at DEBUG level
        logger.debug("\nRequirement Texts:")
        logger.debug(f"Source Text: {source_texts.get(row['Source ID'], 'Not found')}")
        logger.debug(f"Target Text: {target_texts.get(row['Target ID'], 'Not found')}")

    logger.debug("True negative analysis completed")

except Exception as e:
    logger.error("Failed to complete true negative analysis", exc_info=True)
    handle_exception(e)
    raise

In [None]:
# Cell [12] - Impact Analysis on Project Effort Estimation
# Purpose: Analyze requirement complexity metrics to understand impact on project effort estimation
# Dependencies: pandas, logger, get_requirement_texts, neo4j_client
# Breadcrumbs: True Negative Investigation -> Impact Assessment -> Complexity Analysis

def analyze_requirement_complexity(df_analysis, category):
    """
    Analyze complexity metrics for requirements in different prediction categories
    (TP, FP, FN, or TN) to understand potential impact on effort estimation.

    Requirement texts are fetched through get_requirement_texts using the
    module-level `neo4j_client`.

    Args:
        df_analysis: DataFrame containing the analysis results; must provide
            'Source ID', 'Target ID', 'Model 1', 'Model 2' and 'Model 9'
            columns. It is modified in place: 'Source Text' and 'Target Text'
            columns are added.
        category: String naming the category being analyzed; used only in
            log messages

    Returns:
        dict: 'avg_text_length' ({'source', 'target'}), 'requirement_count',
            'avg_similarity_scores' and 'similarity_score_variance'
            (each {'model1', 'model2', 'tfidf'}).

    Raises:
        Exception: Any error is logged, passed to handle_exception and re-raised.
    """
    logger.debug(f"Starting complexity analysis for {category} requirements")
    try:
        # Get requirement texts for analysis
        source_texts, target_texts = get_requirement_texts(
            df_analysis['Source ID'].tolist(),
            df_analysis['Target ID'].tolist(),
            neo4j_client
        )

        # Add texts to dataframe (IDs without a text become NaN)
        df_analysis['Source Text'] = df_analysis['Source ID'].map(source_texts)
        df_analysis['Target Text'] = df_analysis['Target ID'].map(target_texts)

        def _mean_text_length(column):
            # When no text was found the mapped column is all-NaN with a float
            # dtype, on which the .str accessor raises; casting to object keeps
            # the result NaN instead of aborting the analysis.
            return df_analysis[column].astype(object).str.len().mean()

        # Metric key -> score column in df_analysis (TFIDF is Model 9)
        score_columns = {'model1': 'Model 1', 'model2': 'Model 2', 'tfidf': 'Model 9'}

        # Calculate complexity metrics
        metrics = {
            'avg_text_length': {
                'source': _mean_text_length('Source Text'),
                'target': _mean_text_length('Target Text'),
            },
            'requirement_count': len(df_analysis),
            'avg_similarity_scores': {
                key: df_analysis[column].mean() for key, column in score_columns.items()
            },
            'similarity_score_variance': {
                key: df_analysis[column].var() for key, column in score_columns.items()
            },
        }

        # Log analysis results
        logger.info(f"=== {category} Impact Analysis ===")
        logger.info(f"Number of Requirements: {metrics['requirement_count']}")

        logger.info("Text Length Analysis:")
        logger.info(f"Average Source Text Length: {metrics['avg_text_length']['source']:.1f} characters")
        logger.info(f"Average Target Text Length: {metrics['avg_text_length']['target']:.1f} characters")

        logger.info("Similarity Score Analysis:")
        logger.info("Average Scores:")
        logger.info(f"  Model 1: {metrics['avg_similarity_scores']['model1']:.3f}")
        logger.info(f"  Model 2: {metrics['avg_similarity_scores']['model2']:.3f}")
        logger.info(f"  TF-IDF: {metrics['avg_similarity_scores']['tfidf']:.3f}")

        logger.info("Score Variance:")
        logger.info(f"  Model 1: {metrics['similarity_score_variance']['model1']:.3f}")
        logger.info(f"  Model 2: {metrics['similarity_score_variance']['model2']:.3f}")
        logger.info(f"  TF-IDF: {metrics['similarity_score_variance']['tfidf']:.3f}")

        # Log example requirements at DEBUG level.
        # iterrows() yields index labels, so the pair number comes from enumerate().
        logger.debug("Example Requirements (first 3):")
        for pair_number, (_, row) in enumerate(df_analysis.head(3).iterrows(), start=1):
            logger.debug(f"Requirement Pair {pair_number}:")
            logger.debug(f"Source ID: {row['Source ID']}")
            logger.debug(f"Source Text: {row['Source Text']}")
            logger.debug(f"Target ID: {row['Target ID']}")
            logger.debug(f"Target Text: {row['Target Text']}")
            logger.debug("Similarity Scores:")
            logger.debug(f"  Model 1: {row['Model 1']:.3f}")
            logger.debug(f"  Model 2: {row['Model 2']:.3f}")
            logger.debug(f"  TF-IDF: {row['Model 9']:.3f}")

        return metrics

    except Exception as e:
        logger.error(f"Error during complexity analysis for {category}", exc_info=True)
        handle_exception(e)
        raise

# Driver for the impact analysis: computes complexity metrics for the TP, FP
# and FN result frames, then logs them side by side.
try:
    logger.info("Starting impact analysis on project effort estimation")

    # True positives
    logger.debug("Analyzing true positive cases")
    tp_metrics = analyze_requirement_complexity(tp_analysis, "True Positives")

    # False positives
    logger.debug("Analyzing false positive cases")
    fp_metrics = analyze_requirement_complexity(fp_analysis, "False Positives")

    # False negatives
    logger.debug("Analyzing false negative cases")
    fn_metrics = analyze_requirement_complexity(fn_analysis, "False Negatives")

    # Side-by-side comparison of the three categories
    logger.info("Comparative Analysis:")

    # Text lengths first ...
    logger.info("Average Text Length Comparison:")
    categories = ["True Positives", "False Positives", "False Negatives"]
    metrics = [tp_metrics, fp_metrics, fn_metrics]

    for cat, met in zip(categories, metrics):
        text_lengths = met['avg_text_length']
        logger.info(f"{cat}:")
        logger.info(f"  Source Text: {text_lengths['source']:.1f} characters")
        logger.info(f"  Target Text: {text_lengths['target']:.1f} characters")

    # ... then mean similarity scores
    logger.info("Average Similarity Score Comparison:")
    for cat, met in zip(categories, metrics):
        mean_scores = met['avg_similarity_scores']
        logger.info(f"{cat}:")
        logger.info(f"  Model 1: {mean_scores['model1']:.3f}")
        logger.info(f"  Model 2: {mean_scores['model2']:.3f}")
        logger.info(f"  TF-IDF: {mean_scores['tfidf']:.3f}")

    logger.debug("Impact analysis completed")

except Exception as e:
    logger.error("Failed to complete impact analysis", exc_info=True)
    handle_exception(e)
    raise

In [None]:
# Cell [13] - Scatter Plot Analysis
# Purpose: Create scatter plots to visualize distribution patterns of TP, FP, FN, and TN cases across models
# Dependencies: matplotlib, numpy, seaborn, logger, pandas
# Breadcrumbs: Complexity Analysis -> Visualization -> Distribution Pattern Analysis

def create_scatter_plots():
    """
    Create scatter plots showing the distribution of TP, FP, FN, and TN cases
    with TP and FN on top layers for better visibility.

    Draws a 2x2 grid of model-vs-model scatter plots, logs per-category point
    counts, means and correlations, and saves the figure as a PNG in
    results['visualizations'].

    Reads the module-level names `tn_analysis`, `fp_analysis`, `fn_analysis`,
    `tp_analysis`, `project_name`, `timestamp` and `results`; all of them must
    exist before this function is called.

    Raises:
        Exception: Any error is logged, passed to handle_exception and
            re-raised. The figure is closed in every case.
    """
    logger.debug("Starting scatter plot analysis")
    fig = None
    try:
        # Prepare data for each category - order determines layer position (last items on top)
        categories = {
            'True Negative': (tn_analysis, 'blue', 'x'),
            'False Positive': (fp_analysis, 'grey', '^'),
            'False Negative': (fn_analysis, 'red', 's'),
            'True Positive': (tp_analysis, 'green', 'o')
        }

        # Create subplots for different model combinations
        fig, axes = plt.subplots(2, 2, figsize=(20, 20))
        fig.suptitle('Similarity Score Distribution by Prediction Category', fontsize=16)

        # Model combinations to plot - using correct column names
        plot_configs = [
            {
                'x': 'Model 1',
                'y': 'Model 2',
                'title': 'Model1 vs Model2',
                'pos': (0, 0)
            },
            {
                'x': 'Model 1',
                'y': 'Model 9',  # TFIDF is Model 9
                'title': 'Model1 vs TF-IDF',
                'pos': (0, 1)
            },
            {
                'x': 'Model 2',
                'y': 'Model 9',  # TFIDF is Model 9
                'title': 'Model2 vs TF-IDF',
                'pos': (1, 0)
            },
            {
                'x': 'Model 3',
                'y': 'Model 4',
                'title': 'Model3 vs Model4',
                'pos': (1, 1)
            }
        ]

        logger.debug("Creating scatter plots for model comparisons")
        # Create each subplot
        for config in plot_configs:
            x_col, y_col = config['x'], config['y']
            logger.debug(f"Creating plot for {config['title']}")
            ax = axes[config['pos'][0], config['pos'][1]]

            # Rows with both scores present, computed once per category and
            # reused for plotting and for the correlation log below.
            valid_by_category = {}

            # Plot each category (order matters for layering)
            for category, (data, color, marker) in categories.items():
                valid_mask = ~(np.isnan(data[x_col]) | np.isnan(data[y_col]))
                valid_data = data[valid_mask]
                valid_by_category[category] = valid_data

                if valid_data.empty:
                    logger.warning(f"No valid data points for {category} in {config['title']}")
                    continue

                logger.debug(f"Adding {len(valid_data)} {category} data points")

                ax.scatter(
                    valid_data[x_col],
                    valid_data[y_col],
                    c=color,
                    marker=marker,
                    label=category,
                    alpha=0.6,
                    s=50  # marker size
                )

                # Log statistics for this category and model combination
                logger.info(f"Statistics for {category} in {config['title']}:")
                logger.info(f"  Number of points: {len(valid_data)}")
                logger.info(f"  {config['x']} mean: {valid_data[x_col].mean():.3f}")
                logger.info(f"  {config['y']} mean: {valid_data[y_col].mean():.3f}")

            # Customize plot appearance
            ax.set_xlabel(x_col)
            ax.set_ylabel(y_col)
            ax.set_title(config['title'])
            ax.grid(True, linestyle='--', alpha=0.7)
            ax.legend()

            # Log the correlation between the two models for each category
            for category, valid_data in valid_by_category.items():
                if len(valid_data) > 1:  # Need at least 2 points for correlation
                    correlation = valid_data[x_col].corr(valid_data[y_col])
                    logger.info(f"Correlation for {category} between {config['x']} and {config['y']}: {correlation:.3f}")
                else:
                    logger.warning(f"Insufficient data for correlation calculation in {category}")

        fig.tight_layout()
        logger.debug("Scatter plots created successfully")

        # Save plot with proper path
        plot_filename = f"similarity_scatter_plots_{project_name}_{timestamp}.png"
        plot_path = os.path.join(results['visualizations'], plot_filename)
        fig.savefig(plot_path)
        logger.info(f"Scatter plots saved to {plot_path}")

    except Exception as e:
        logger.error("Error creating scatter plots", exc_info=True)
        handle_exception(e)
        raise

    finally:
        # Release the figure even when plotting or saving failed
        if fig is not None:
            plt.close(fig)

def create_density_plots():
    """
    Create density plots for each model's score distribution

    Draws a 3x3 grid with one panel per column 'Model 1' .. 'Model 9' and, in
    each panel, one KDE curve per prediction category. The data is read from
    the module-level frames tp_analysis, fp_analysis, fn_analysis and
    tn_analysis. The figure is written as a PNG under
    results['visualizations'].

    Categories with fewer than two valid scores, or with zero variance, are
    skipped with a warning because no density can be estimated for them.

    Raises:
        Exception: any failure is logged, forwarded to handle_exception and
            re-raised. The figure is closed in every case.
    """
    logger.debug("Starting density plot analysis")
    fig = None
    try:
        fig, axes = plt.subplots(3, 3, figsize=(20, 20))
        fig.suptitle('Score Distribution Density by Model and Category', fontsize=16)

        model_names = [
            'Model 1', 'Model 2', 'Model 3',
            'Model 4', 'Model 5', 'Model 6',
            'Model 7', 'Model 8', 'Model 9'  # Model 9 is TFIDF
        ]

        categories = {
            'True Positive': (tp_analysis, 'green'),
            'False Positive': (fp_analysis, 'grey'),
            'False Negative': (fn_analysis, 'red'),
            'True Negative': (tn_analysis, 'blue')
        }

        # axes.flat walks the grid row by row, matching the order of model_names
        for model, ax in zip(model_names, axes.flat):
            logger.debug(f"Creating density plot for {model}")

            for category, (data, color) in categories.items():
                # Filter out invalid values and check for variance
                valid_data = data[model].dropna()
                if len(valid_data) < 2:
                    logger.warning(f"Insufficient data for {category} in {model}")
                    continue

                if valid_data.var() == 0:
                    logger.warning(f"Zero variance in {category} for {model}")
                    continue

                # A failing curve must not abort the remaining panels
                try:
                    sns.kdeplot(
                        data=valid_data,
                        ax=ax,
                        label=category,
                        color=color,
                        warn_singular=False  # Suppress singular matrix warning
                    )

                    # Log statistics for this model and category
                    logger.info(f"Statistics for {category} in {model}:")
                    logger.info(f"  Mean: {valid_data.mean():.3f}")
                    logger.info(f"  Std: {valid_data.std():.3f}")
                    logger.info(f"  Min: {valid_data.min():.3f}")
                    logger.info(f"  Max: {valid_data.max():.3f}")
                except Exception as plot_error:
                    logger.warning(f"Could not create density plot for {category} in {model}: {str(plot_error)}")

            ax.set_title(f"{model} Score Distribution")
            ax.grid(True, linestyle='--', alpha=0.7)
            ax.legend()

        plt.tight_layout()
        logger.debug("Density plots created successfully")

        # Save plot with proper path
        plot_filename = f"similarity_density_plots_{project_name}_{timestamp}.png"
        plot_path = results['visualizations'] + "/" + plot_filename
        plt.savefig(plot_path)
        logger.info(f"Density plots saved to {plot_path}")

    except Exception as e:
        logger.error("Error creating density plots", exc_info=True)
        handle_exception(e)
        raise
    finally:
        # Close this specific figure on success AND on failure; otherwise a
        # failed run leaves it registered in pyplot's global state.
        if fig is not None:
            plt.close(fig)

# Cell driver: build the scatter plots first, then the density plots.
# Both helpers already log and re-raise their own errors; this outer handler
# records that the cell as a whole failed and re-raises so execution stops.
try:
    logger.info("Starting visualization analysis")
    logger.info("Creating scatter plots...")
    create_scatter_plots()
    logger.info("Creating density plots...")
    create_density_plots()
    logger.info("Visualization analysis completed")
except Exception as e:
    logger.error("Failed to complete visualization analysis", exc_info=True)
    handle_exception(e)
    raise

In [None]:
# Cell [14] - Score Transformation Analysis
# Purpose: Apply and analyze different mathematical transformations to improve score interpretation
# Dependencies: numpy, pandas, matplotlib, seaborn, logger
# Breadcrumbs: Distribution Pattern Analysis -> Transformation Analysis -> Score Optimization

def apply_transformations(scores):
    """
    Apply a fixed set of element-wise transformations to similarity scores

    Args:
        scores: numpy array of similarity scores
    Returns:
        dict: Maps the transformation name ('log', 'exp', 'squared', 'cubic',
            'sqrt', 'sigmoid', in that order) to the transformed scores

    Raises:
        Exception: any failure is logged, forwarded to handle_exception and
            re-raised.

    NOTE: np.sqrt returns NaN for negative inputs and np.log1p for inputs
    below -1, so scores outside [0, 1] propagate NaN into those entries.
    """
    logger.debug("Applying score transformations")
    try:
        # (name, function) pairs; the order defines the key order of the result
        recipes = (
            ('log', lambda s: np.log1p(s)),  # log1p to handle zeros
            ('exp', lambda s: np.exp(s) - 1),  # subtract 1 to maintain 0 baseline
            ('squared', lambda s: np.square(s)),
            ('cubic', lambda s: np.power(s, 3)),
            ('sqrt', lambda s: np.sqrt(s)),
            ('sigmoid', lambda s: 1 / (1 + np.exp(-10 * (s - 0.5)))),  # scaled sigmoid
        )
        transformations = {label: recipe(scores) for label, recipe in recipes}

        # Summarize every transformed array in the debug log
        for label, values in transformations.items():
            low, high = values.min(), values.max()
            logger.debug(f"{label.capitalize()} transformation stats:")
            logger.debug(f"  Mean: {values.mean():.3f}")
            logger.debug(f"  Std: {values.std():.3f}")
            logger.debug(f"  Range: [{low:.3f}, {high:.3f}]")

        return transformations

    except Exception as e:
        logger.error("Error applying transformations", exc_info=True)
        handle_exception(e)
        raise

def plot_transformed_scores(model_column='Model 1'):
    """
    Plot original vs transformed scores for different categories

    Collects the non-null scores of one model column from the module-level
    frames tp_analysis, fp_analysis, fn_analysis and tn_analysis, applies
    apply_transformations to them and draws one scatter panel per
    transformation (original score on x, transformed score on y), coloured by
    prediction category. The figure (PNG) and a per-category /
    per-transformation statistics table (CSV) are written under
    results['visualizations'].

    Args:
        model_column: Name of the score column to analyse. Defaults to
            'Model 1', the column this analysis has always used.

    Raises:
        ValueError: if no category contains a valid score in model_column.
        Exception: any failure is logged, forwarded to handle_exception and
            re-raised. The figure is closed in every case.
    """
    logger.debug("Starting transformed scores visualization")
    fig = None
    try:
        # Combine all datasets with their categories
        data_sources = {
            'True Positive': tp_analysis[model_column],
            'False Positive': fp_analysis[model_column],
            'False Negative': fn_analysis[model_column],
            'True Negative': tn_analysis[model_column]
        }

        frames = []
        for category, scores in data_sources.items():
            valid_scores = scores.dropna()
            if len(valid_scores) > 0:
                frames.append(pd.DataFrame({'score': valid_scores, 'category': category}))
            else:
                logger.warning(f"No valid scores for {category}")

        # pd.concat([]) would fail with an unhelpful "No objects to
        # concatenate"; state the actual problem instead.
        if not frames:
            raise ValueError(
                f"No valid '{model_column}' scores available for transformation analysis"
            )

        # ignore_index gives a clean positional index, so boolean masks line up
        # one-to-one with the numpy arrays returned by apply_transformations.
        all_data = pd.concat(frames, ignore_index=True)
        logger.info(f"Total samples for transformation analysis: {len(all_data)}")

        # Create subplots for each transformation
        fig, axes = plt.subplots(2, 3, figsize=(20, 15))
        fig.suptitle('Score Transformations by Category', fontsize=16)

        # Color mapping for categories
        colors = {
            'True Positive': 'green',
            'False Positive': 'grey',
            'False Negative': 'red',
            'True Negative': 'blue'
        }

        # One positional boolean mask per category, computed once and reused
        # for both the plots and the statistics table.
        category_masks = {
            category: (all_data['category'] == category).to_numpy()
            for category in colors
        }

        # Plot each transformation
        transformations = apply_transformations(all_data['score'].values)
        for (name, transformed), ax in zip(transformations.items(), axes.flat):
            logger.debug(f"Creating plot for {name} transformation")

            for category, mask in category_masks.items():
                if not mask.any():
                    continue

                original = all_data.loc[mask, 'score']
                transformed_scores = transformed[mask]

                ax.scatter(
                    original,
                    transformed_scores,
                    c=colors[category],
                    label=category,
                    alpha=0.6,
                    s=50
                )

                # Log correlation between original and transformed scores
                correlation = np.corrcoef(original, transformed_scores)[0, 1]
                logger.info(f"Correlation for {category} with {name} transformation: {correlation:.3f}")

            ax.set_xlabel('Original Score')
            ax.set_ylabel(f'{name.capitalize()} Score')
            ax.set_title(f'{name.capitalize()} Transformation')
            ax.grid(True, linestyle='--', alpha=0.7)
            ax.legend()

        plt.tight_layout()
        logger.debug("Transformation plots created successfully")

        # Save plot with proper path
        plot_filename = f"score_transformations_{project_name}_{timestamp}.png"
        plot_path = results['visualizations'] + "/" + plot_filename
        plt.savefig(plot_path)
        logger.info(f"Transformation plots saved to {plot_path}")

        # Save transformation statistics to CSV
        stats_data = []
        for category, mask in category_masks.items():
            if not mask.any():
                continue

            original_scores = all_data.loc[mask, 'score']
            for name, transformed in transformations.items():
                # Wrap in a Series so mean/std use the same estimator as the
                # original scores (pandas sample std, ddof=1); a raw ndarray
                # .std() would use ddof=0 and make the two columns incomparable.
                transformed_scores = pd.Series(transformed[mask])
                stats_data.append({
                    'Category': category,
                    'Transformation': name,
                    'Original_Mean': original_scores.mean(),
                    'Original_Std': original_scores.std(),
                    'Transformed_Mean': transformed_scores.mean(),
                    'Transformed_Std': transformed_scores.std(),
                    'Correlation': np.corrcoef(original_scores, transformed_scores)[0, 1]
                })

        stats_df = pd.DataFrame(stats_data)
        stats_filename = f"transformation_statistics_{project_name}_{timestamp}.csv"
        stats_path = results['visualizations'] + "/" + stats_filename
        stats_df.to_csv(stats_path, index=False)
        logger.info(f"Transformation statistics saved to {stats_path}")

    except Exception as e:
        logger.error("Error creating transformation plots", exc_info=True)
        handle_exception(e)
        raise
    finally:
        # Release the figure on success AND on failure
        if fig is not None:
            plt.close(fig)

# Cell driver: run the transformation analysis once.
# plot_transformed_scores logs and re-raises its own errors; this handler adds
# the cell-level failure message and re-raises so execution stops here.
try:
    logger.info("Starting score transformation analysis")
    plot_transformed_scores()
    logger.info("Score transformation analysis completed")
except Exception as e:
    logger.error("Failed to complete transformation analysis", exc_info=True)
    handle_exception(e)
    raise

In [None]:
# Cell [15] - Log Scale Model Comparison
# Purpose: Create log-scale visualizations to better understand model score distributions and relationships
# Dependencies: matplotlib, numpy, pandas, seaborn, logger
# Breadcrumbs: Score Optimization -> Scale Analysis -> Logarithmic Transformation

def create_log_scale_comparisons():
    """
    Create log-scale visualizations comparing different models

    Builds a 2x2 figure from the module-level frames tn_analysis, fp_analysis,
    fn_analysis and tp_analysis:
      * top-left:     log10('Model 1') vs log10('Model 2') scatter
      * top-right:    ln('Model 1') vs ln('Model 2') scatter
      * bottom-left:  density of log10('Model 1') per category
      * bottom-right: density of log10('Model 2') per category

    Only strictly positive scores are used, since the logarithm is undefined
    otherwise. The figure (PNG) and a per-category log10 statistics table
    (CSV) are written under results['visualizations'].

    Raises:
        Exception: any failure is logged, forwarded to handle_exception and
            re-raised. The figure is closed in every case.
    """
    logger.debug("Starting log scale comparison analysis")
    fig = None
    try:
        # Create figure with multiple subplots
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(20, 16))
        fig.suptitle('Model Comparisons with Different Scale Transformations', fontsize=16)

        categories = {
            'True Negative': (tn_analysis, 'blue', 'x'),
            'False Positive': (fp_analysis, 'grey', '^'),
            'False Negative': (fn_analysis, 'red', 's'),
            'True Positive': (tp_analysis, 'green', 'o')
        }

        # Rows where BOTH models have a strictly positive score. Computed once
        # and shared by the two scatter panels and the statistics table.
        positive_pairs = {
            category: data[(data['Model 1'] > 0) & (data['Model 2'] > 0)]
            for category, (data, _, _) in categories.items()
        }

        # log10 series of every category that is actually plotted, kept for
        # the statistics table so the logarithms are not recomputed.
        log10_scores = {}

        logger.debug("Creating log10 scale comparison plot")
        # Plot 1: Log scale scatter
        for category, (data, color, marker) in categories.items():
            if len(data) == 0:
                logger.warning(f"No data available for {category}")
                continue

            valid_data = positive_pairs[category]
            if valid_data.empty:
                logger.warning(f"No valid positive scores for {category}")
                continue

            log_m1 = np.log10(valid_data['Model 1'])
            log_m2 = np.log10(valid_data['Model 2'])
            log10_scores[category] = (log_m1, log_m2)

            ax1.scatter(
                log_m1,
                log_m2,
                c=color,
                marker=marker,
                label=category,
                alpha=0.6
            )
            logger.info(f"Log10 scale statistics for {category}:")
            logger.info(f"  Points plotted: {len(valid_data)}")
            logger.info(f"  Model 1 log10 range: [{log_m1.min():.3f}, {log_m1.max():.3f}]")
            logger.info(f"  Model 2 log10 range: [{log_m2.min():.3f}, {log_m2.max():.3f}]")

        ax1.set_title('Log10 Scale Comparison')
        ax1.set_xlabel('Log10(Model 1 Score)')
        ax1.set_ylabel('Log10(Model 2 Score)')
        ax1.grid(True, linestyle='--', alpha=0.7)
        ax1.legend()

        logger.debug("Creating natural log scale comparison plot")
        # Plot 2: Natural log scale scatter
        for category, (_, color, marker) in categories.items():
            valid_data = positive_pairs[category]
            if valid_data.empty:
                continue

            ln_m1 = np.log(valid_data['Model 1'])
            ln_m2 = np.log(valid_data['Model 2'])

            ax2.scatter(
                ln_m1,
                ln_m2,
                c=color,
                marker=marker,
                label=category,
                alpha=0.6
            )
            logger.info(f"Natural log scale statistics for {category}:")
            logger.info(f"  Points plotted: {len(valid_data)}")
            logger.info(f"  Model 1 ln range: [{ln_m1.min():.3f}, {ln_m1.max():.3f}]")
            logger.info(f"  Model 2 ln range: [{ln_m2.min():.3f}, {ln_m2.max():.3f}]")

        ax2.set_title('Natural Log Scale Comparison')
        ax2.set_xlabel('ln(Model 1 Score)')
        ax2.set_ylabel('ln(Model 2 Score)')
        ax2.grid(True, linestyle='--', alpha=0.7)
        ax2.legend()

        logger.debug("Creating density plots")
        # Plots 3 and 4: log-scaled density per model. Each panel filters on
        # its own column only, so a row needs just that one score positive.
        for model_col, axis in (('Model 1', ax3), ('Model 2', ax4)):
            for category, (data, color, _) in categories.items():
                valid_data = data[data[model_col] > 0]
                if len(valid_data) > 0:
                    # A failing curve must not abort the remaining ones
                    try:
                        sns.kdeplot(
                            data=np.log10(valid_data[model_col]),
                            ax=axis,
                            label=category,
                            color=color
                        )
                        logger.debug(f"Created density plot for {category} {model_col}")
                    except Exception as plot_error:
                        logger.warning(f"Could not create density plot for {category} {model_col}: {str(plot_error)}")

            axis.set_title(f'Log10 Score Distribution - {model_col}')
            axis.set_xlabel('Log10(Score)')
            axis.set_ylabel('Density')
            axis.grid(True, linestyle='--', alpha=0.7)
            axis.legend()

        plt.tight_layout()
        logger.debug("All plots created successfully")

        # Save plot with proper path
        plot_filename = f"log_scale_comparisons_{project_name}_{timestamp}.png"
        plot_path = results['visualizations'] + "/" + plot_filename
        plt.savefig(plot_path)
        logger.info(f"Log scale comparison plots saved to {plot_path}")

        # Save statistics to CSV
        stats_data = []
        for category, (log_m1, log_m2) in log10_scores.items():
            # A correlation needs at least two points; np.corrcoef on a single
            # point yields NaN plus RuntimeWarnings, so emit NaN directly.
            if len(log_m1) > 1:
                correlation = np.corrcoef(log_m1, log_m2)[0, 1]
            else:
                correlation = np.nan

            stats_data.append({
                'Category': category,
                'Count': len(log_m1),
                'Model1_Log10_Mean': log_m1.mean(),
                'Model1_Log10_Std': log_m1.std(),
                'Model2_Log10_Mean': log_m2.mean(),
                'Model2_Log10_Std': log_m2.std(),
                'Log10_Correlation': correlation
            })

        stats_df = pd.DataFrame(stats_data)
        stats_filename = f"log_scale_statistics_{project_name}_{timestamp}.csv"
        stats_path = results['visualizations'] + "/" + stats_filename
        stats_df.to_csv(stats_path, index=False)
        logger.info(f"Log scale statistics saved to {stats_path}")

    except Exception as e:
        logger.error("Error creating log scale comparisons", exc_info=True)
        handle_exception(e)
        raise
    finally:
        # Release the figure on success AND on failure
        if fig is not None:
            plt.close(fig)

# Cell driver: run the log-scale comparison once.
# create_log_scale_comparisons logs and re-raises its own errors; this handler
# adds the cell-level failure message and re-raises so execution stops here.
try:
    logger.info("Starting log scale model comparison analysis")
    create_log_scale_comparisons()
    logger.info("Log scale model comparison analysis completed")
except Exception as e:
    logger.error("Failed to complete log scale comparison analysis", exc_info=True)
    handle_exception(e)
    raise

In [None]:
# Cell [16] - Optimized Aggregate Score Calculation
# Purpose: Create optimized aggregate scoring system using feature importance weights for LLM analysis
# Dependencies: numpy, pandas, logger, feature_importance, tp_analysis, get_requirement_texts, neo4j_client
# Breadcrumbs: Logarithmic Transformation -> Aggregation Strategy -> Optimized Scoring

def create_optimized_aggregate(source_text, target_text, model_dict=None, feature_weights=None):
    """
    Create an optimized aggregate of similarity scores for LLM analysis

    Four similarity signals are collected (two "primary", two "secondary")
    and combined into a weighted score and an agreement score.

    Args:
        source_text: Source requirement text
        target_text: Target requirement text
        model_dict: Object exposing
            compute_similarity(source, target, model_name=...). When None,
            random similarities in [0, 1) are generated (testing only).
        feature_weights: DataFrame with 'feature' and 'importance' columns
            (feature importance values from random forest). A row overrides
            a default weight when its lower-cased feature name contains one
            of 'minilm', 'qa-distilbert', 'tfidf' or 'distilroberta'. When
            None, the built-in default weights are used.

    Returns:
        dict: Structured similarity analysis with the keys
            'primary_signals', 'secondary_signals' and 'confidence_metrics'
            ('weighted_score' and 'agreement_score' = 1 - std of the four
            signals).

    Raises:
        Exception: whatever model_dict.compute_similarity raises; it is
            logged and re-raised unchanged.
    """
    # Fallback weights; individual entries are overridden below when
    # feature_weights provides a matching feature.
    weights = {
        "miniLM_similarity": 0.1393,
        "distilbert_qa_similarity": 0.1254,
        "tfidf_similarity": 0.0998,
        "distilroberta_similarity": 0.0967
    }

    if feature_weights is None:
        logger.warning("No feature weights provided, using default weights")
    else:
        # Log the available feature importance values
        logger.debug("Available feature importance values:")
        for _, row in feature_weights.iterrows():
            logger.debug(f"Model: {row['feature']}, Importance: {row['importance']:.4f}")

        # Map feature importance values to weights
        model_name_mapping = {
            'minilm': 'miniLM_similarity',
            'qa-distilbert': 'distilbert_qa_similarity',
            'tfidf': 'tfidf_similarity',
            'distilroberta': 'distilroberta_similarity'
        }

        # Update weights with actual values from feature importance
        for _, row in feature_weights.iterrows():
            # str() so a non-string feature label cannot break .lower()
            model_name = str(row['feature']).lower()
            for key, weight_key in model_name_mapping.items():
                if key in model_name:
                    weights[weight_key] = row['importance']
                    logger.debug(f"Updated weight for {weight_key}: {row['importance']:.4f}")

        logger.debug(f"Final weights after mapping: {weights}")

    # Pick the similarity source once so the aggregate layout is written once
    if model_dict is None:
        # For testing without models, generate random similarities
        logger.warning("No models provided, using random similarities for testing")

        def similarity(model_name):
            return np.random.uniform(0, 1)
    else:
        def similarity(model_name):
            return model_dict.compute_similarity(source_text, target_text, model_name=model_name)

    try:
        aggregate = {
            "primary_signals": {
                "miniLM_similarity": similarity("all-MiniLM-L6-v2"),
                "distilbert_qa_similarity": similarity("multi-qa-distilbert-cos-v1")
            },
            "secondary_signals": {
                "tfidf_similarity": similarity("tfidf"),
                "distilroberta_similarity": similarity("all-distilroberta-v1")
            },
            "confidence_metrics": {
                "weighted_score": None,
                "agreement_score": None
            }
        }
    except Exception as e:
        logger.error(f"Error computing similarities: {str(e)}")
        raise

    def weighted_mean(signals, first, second):
        """Return (weighted mean of two signals, sum of their weights)."""
        weight_sum = weights[first] + weights[second]
        if weight_sum == 0:
            # Both importances are zero: a weighted mean is undefined, so fall
            # back to the plain mean instead of dividing by zero.
            return (signals[first] + signals[second]) / 2, weight_sum
        return (
            (signals[first] * weights[first] + signals[second] * weights[second]) / weight_sum,
            weight_sum
        )

    # Calculate weighted primary / secondary scores using actual feature importances
    weighted_primary, primary_weight = weighted_mean(
        aggregate["primary_signals"], "miniLM_similarity", "distilbert_qa_similarity"
    )
    weighted_secondary, secondary_weight = weighted_mean(
        aggregate["secondary_signals"], "tfidf_similarity", "distilroberta_similarity"
    )

    # Calculate final weighted score using the ratio of primary to secondary importance
    total_weight = primary_weight + secondary_weight
    if total_weight == 0:
        # No importance information at all: treat both groups equally
        primary_ratio = secondary_ratio = 0.5
    else:
        primary_ratio = primary_weight / total_weight
        secondary_ratio = secondary_weight / total_weight

    aggregate["confidence_metrics"]["weighted_score"] = (
        weighted_primary * primary_ratio +
        weighted_secondary * secondary_ratio
    )

    # Calculate agreement score (how much models agree with each other)
    scores = [
        aggregate["primary_signals"]["miniLM_similarity"],
        aggregate["primary_signals"]["distilbert_qa_similarity"],
        aggregate["secondary_signals"]["tfidf_similarity"],
        aggregate["secondary_signals"]["distilroberta_similarity"]
    ]
    aggregate["confidence_metrics"]["agreement_score"] = 1 - np.std(scores)

    # Log the weights used
    logger.debug(f"Primary weight ratio: {primary_ratio:.3f}")
    logger.debug(f"Secondary weight ratio: {secondary_ratio:.3f}")
    logger.debug(f"Final weighted score: {aggregate['confidence_metrics']['weighted_score']:.3f}")

    return aggregate

def create_llm_prompt(aggregate, source_text, target_text, threshold=0.3):
    """
    Build the text prompt that asks an LLM to judge two requirements

    Args:
        aggregate: Dict with 'primary_signals', 'secondary_signals' and
            'confidence_metrics' entries holding numeric scores
        source_text: Source requirement text
        target_text: Target requirement text
        threshold: Accepted for interface compatibility; not used when
            building the prompt

    Returns:
        str: The prompt, with every score rendered to three decimals
    """
    primary = aggregate['primary_signals']
    secondary = aggregate['secondary_signals']
    confidence = aggregate['confidence_metrics']

    # One entry per output line; empty strings produce the blank separator lines
    prompt_lines = [
        "Analyze the similarity between these two software requirements:",
        "",
        f"Source Requirement: {source_text}",
        "",
        f"Target Requirement: {target_text}",
        "",
        "Similarity Analysis:",
        f"- Primary Similarity (MiniLM): {primary['miniLM_similarity']:.3f}",
        f"- QA-Focused Similarity (DistilBERT): {primary['distilbert_qa_similarity']:.3f}",
        f"- Text Similarity (TF-IDF): {secondary['tfidf_similarity']:.3f}",
        f"- Semantic Similarity (DistilRoBERTa): {secondary['distilroberta_similarity']:.3f}",
        "",
        "Confidence Metrics:",
        f"- Weighted Score: {confidence['weighted_score']:.3f}",
        f"- Model Agreement: {confidence['agreement_score']:.3f}",
        "",
        "Based on these similarity scores and your analysis of the actual text content:",
        "1. Are these requirements related? Consider both semantic meaning and functional implications.",
        "2. What specific aspects make them similar or different?",
        "3. Confidence level in your assessment (high/medium/low)?",
        "",
        "Note: Our goal is to minimize false negatives (missing actual relationships) while maintaining reasonable precision.",
    ]
    return "\n".join(prompt_lines)

def evaluate_relationship(aggregate, llm_response, threshold=0.3):
    """
    Make final decision about requirement relationship

    Decision ladder, evaluated top to bottom:
      1. weighted_score > 0.6 and agreement_score > 0.8
         -> (True, "high_confidence"); the LLM response is not consulted.
      2. weighted_score > threshold
         -> (True, "llm_confirmed") when the LLM confidence contains "high",
            otherwise (True, "needs_review").
      3. weighted_score > 0.2 and the LLM decision says "related" without
         negating it -> (True, "llm_rescued").
      4. otherwise -> (False, "low_confidence").

    Args:
        aggregate: Dict whose 'confidence_metrics' entry holds
            'weighted_score' and 'agreement_score'
        llm_response: Mapping with the text fields 'decision' and
            'confidence'. A missing field, a None field or a None response is
            treated as empty text.
        threshold: Weighted score above which a pair counts as a likely match

    Returns:
        tuple: (is_related, confidence_label)
    """
    import re  # local import: the notebook's import cell does not provide re

    metrics = aggregate["confidence_metrics"]
    weighted_score = metrics["weighted_score"]
    agreement_score = metrics["agreement_score"]

    # High confidence positive match
    if weighted_score > 0.6 and agreement_score > 0.8:
        return True, "high_confidence"

    # LLM output is free text and may be incomplete; normalise it once
    response = llm_response or {}
    confidence = str(response.get("confidence") or "").lower()
    decision = str(response.get("decision") or "").lower()

    # Likely match needing review
    if weighted_score > threshold:
        if "high" in confidence:
            return True, "llm_confirmed"
        return True, "needs_review"

    # Potential false negative check.
    # A bare substring test would also accept "unrelated" or "not related",
    # turning an explicit negative verdict into a positive one, so negated
    # forms are rejected explicitly.
    negated = re.search(r"(?:\bun|\bnon[-\s]?|\bnot\s+|n't\s+)related", decision)
    if weighted_score > 0.2 and "related" in decision and not negated:
        return True, "llm_rescued"

    return False, "low_confidence"

# Cell driver: smoke-test the aggregate scoring pipeline on a single
# true-positive pair. The similarity values come from a mock that returns
# random numbers, so the logged scores only exercise the code path; they say
# nothing about the real models.
try:
    logger.info("Testing optimized aggregate score calculation")
    
    # Log feature importance from Cell 3
    logger.info("\nFeature Importance from Random Forest:")
    for _, row in feature_importance.iterrows():
        logger.info(f"Model: {row['feature']:<30} Importance: {row['importance']:.4f}")
    
    # Get a sample source and target text from our analysis results
    # (first row of tp_analysis; raises IndexError if that frame is empty)
    sample_source_id = tp_analysis['Source ID'].iloc[0]
    sample_target_id = tp_analysis['Target ID'].iloc[0]
    
    # Get the actual texts using our existing function
    source_texts, target_texts = get_requirement_texts(
        [sample_source_id], 
        [sample_target_id], 
        neo4j_client
    )
    
    # The returned objects are indexed by requirement id
    sample_source_text = source_texts[sample_source_id]
    sample_target_text = target_texts[sample_target_id]
    
    logger.debug("Sample texts retrieved:")
    logger.debug(f"Source: {sample_source_text[:100]}...")
    logger.debug(f"Target: {sample_target_text[:100]}...")
    
    # Create a mock model dictionary for testing
    class MockModelDict:
        # Stand-in for the real model wrapper: ignores its inputs and returns
        # a uniform random similarity in [0, 1).
        def compute_similarity(self, source, target, model_name=None):
            return np.random.uniform(0, 1)
    
    mock_model = MockModelDict()
    
    logger.info("Testing with mock model")
    aggregate_result = create_optimized_aggregate(
        source_text=sample_source_text,
        target_text=sample_target_text,
        model_dict=mock_model,  # Use mock model instead of None
        feature_weights=feature_importance
    )
    
    # Log the results
    logger.info("\nAggregate Score Analysis Results:")
    logger.info("Primary Signals:")
    for signal, value in aggregate_result["primary_signals"].items():
        logger.info(f"- {signal}: {value:.3f}")
    
    logger.info("\nSecondary Signals:")
    for signal, value in aggregate_result["secondary_signals"].items():
        logger.info(f"- {signal}: {value:.3f}")
    
    logger.info("\nConfidence Metrics:")
    logger.info(f"- Weighted Score: {aggregate_result['confidence_metrics']['weighted_score']:.3f}")
    logger.info(f"- Agreement Score: {aggregate_result['confidence_metrics']['agreement_score']:.3f}")
    
    # Create and log the LLM prompt
    prompt = create_llm_prompt(aggregate_result, sample_source_text, sample_target_text)
    logger.debug("\nGenerated LLM Prompt:")
    logger.debug(prompt)
    
    # Simulate an LLM response for demonstration
    # (no LLM is called; the prompt above is only logged)
    mock_llm_response = {
        "decision": "related",
        "confidence": "high"
    }
    
    # Evaluate the relationship
    is_related, confidence_level = evaluate_relationship(
        aggregate_result, 
        mock_llm_response
    )
    
    logger.info("\nFinal Evaluation:")
    logger.info(f"Related: {is_related}")
    logger.info(f"Confidence Level: {confidence_level}")
    
except Exception as e:
    # Record the cell-level failure, hand it to the shared handler, then
    # re-raise so notebook execution stops here.
    logger.error("Error testing optimized aggregate calculation", exc_info=True)
    handle_exception(e)
    raise

In [None]:
# Cell [17] - Aggregate Score Analysis and Validation
# Purpose: Analyze and validate the optimized aggregate scoring approach across all prediction categories
# Dependencies: pandas, matplotlib, seaborn, tqdm, numpy, logger
# Breadcrumbs: Optimized Scoring -> Validation Analysis -> Comprehensive Evaluation

# Colour/marker per prediction category. Dict order fixes the order in which
# categories are drawn, logged and written to the statistics CSV.
_CASE_STYLES = {
    'True Positive': ('green', 'o'),
    'False Positive': ('red', '^'),
    'False Negative': ('orange', 's'),
    'True Negative': ('blue', 'x')
}

# (statistic label, result column) pairs summarised per category, listed in
# the column order of the statistics CSV.
_SCORE_METRICS = [
    ('Weighted Score', 'weighted_score'),
    ('Agreement Score', 'agreement_score'),
    ('Primary Signals', 'primary_avg'),
    ('Secondary Signals', 'secondary_avg'),
]


def _score_case_pairs(data, case_type, model_dict, feature_importance):
    """Return one row of aggregate scores per (Source ID, Target ID) pair in ``data``."""
    logger.info(f"Processing {len(data)} {case_type} cases")
    rows = []
    if len(data) == 0:
        # Nothing to score; also avoids touching ID columns that an empty
        # frame may not even have.
        return pd.DataFrame(rows)

    # Read the two ID columns directly instead of DataFrame.iterrows():
    # iterrows() builds a Series per row and can upcast values (e.g. integer
    # IDs become floats when the other columns are float).
    id_pairs = zip(data['Source ID'], data['Target ID'])
    for source_id, target_id in tqdm(id_pairs, total=len(data), desc=f"Processing {case_type}"):
        # NOTE(review): one lookup per pair. get_requirement_texts takes ID
        # lists, so a single batched call per category looks possible -
        # confirm against its implementation before changing.
        source_texts, target_texts = get_requirement_texts(
            [source_id],
            [target_id],
            neo4j_client
        )

        aggregate = create_optimized_aggregate(
            source_text=source_texts[source_id],
            target_text=target_texts[target_id],
            model_dict=model_dict,
            feature_weights=feature_importance
        )
        confidence = aggregate['confidence_metrics']
        primary = aggregate['primary_signals']
        secondary = aggregate['secondary_signals']

        rows.append({
            'case_type': case_type,
            'source_id': source_id,
            'target_id': target_id,
            'weighted_score': confidence['weighted_score'],
            'agreement_score': confidence['agreement_score'],
            'primary_avg': np.mean([
                primary['miniLM_similarity'],
                primary['distilbert_qa_similarity']
            ]),
            'secondary_avg': np.mean([
                secondary['tfidf_similarity'],
                secondary['distilroberta_similarity']
            ])
        })

    return pd.DataFrame(rows)


def _plot_case_panels(groups):
    """Draw the 2x2 diagnostic figure (two scatter panels, two density panels) on a new pyplot figure."""
    plt.figure(figsize=(20, 15))

    # (subplot position, x column, y column, x label, y label, title)
    scatter_panels = [
        (1, 'weighted_score', 'agreement_score',
         'Weighted Score', 'Agreement Score', 'Weighted Score vs Agreement Score'),
        (2, 'primary_avg', 'secondary_avg',
         'Average Primary Signal', 'Average Secondary Signal', 'Primary vs Secondary Signals'),
    ]
    for position, x_col, y_col, x_label, y_label, title in scatter_panels:
        plt.subplot(2, 2, position)
        for case_type, (color, marker) in _CASE_STYLES.items():
            subset = groups[case_type]
            plt.scatter(
                subset[x_col],
                subset[y_col],
                label=f"{case_type} (n={len(subset)})",
                color=color,
                marker=marker,
                alpha=0.6
            )
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.title(title)
        plt.legend()
        plt.grid(True)

    # (subplot position, column, x label, title)
    density_panels = [
        (3, 'weighted_score', 'Weighted Score', 'Distribution of Weighted Scores'),
        (4, 'agreement_score', 'Agreement Score', 'Distribution of Agreement Scores'),
    ]
    for position, column, x_label, title in density_panels:
        plt.subplot(2, 2, position)
        for case_type, (color, _) in _CASE_STYLES.items():
            subset = groups[case_type]
            sns.kdeplot(
                data=subset[column],
                label=f"{case_type} (n={len(subset)})",
                color=color,
                fill=True,
                alpha=0.3
            )
        plt.xlabel(x_label)
        plt.ylabel('Density')
        plt.title(title)
        plt.legend()
        plt.grid(True)

    plt.tight_layout()


def _summarize_case_groups(groups):
    """Log mean/std of every score column per category and return them as a DataFrame."""
    logger.info("\nScore Statistics by Case Type:")
    stats_rows = []
    for case_type, case_data in groups.items():
        stats = {'Case Type': case_type, 'Count': len(case_data)}
        for label, column in _SCORE_METRICS:
            stats[f'{label} Mean'] = case_data[column].mean()
            stats[f'{label} Std'] = case_data[column].std()
        stats_rows.append(stats)

        logger.info(f"\n{case_type} (n={len(case_data)}):")
        for label, _ in _SCORE_METRICS:
            mean_value = stats[f'{label} Mean']
            std_value = stats[f'{label} Std']
            logger.info(f"{label}: mean={mean_value:.3f}, std={std_value:.3f}")

    return pd.DataFrame(stats_rows)


def analyze_aggregate_scores(tp_data, fp_data, fn_data, tn_data, feature_importance):
    """
    Analyze and visualize how well the aggregate scoring approach differentiates between different cases

    Every (Source ID, Target ID) pair of the four input frames is scored with
    create_optimized_aggregate, the scores are plotted in a 2x2 figure and
    per-category mean/std statistics are logged and written to CSV.

    Besides its arguments the function reads these module-level names:
    neo4j_client, results (its 'visualizations' entry is the output
    directory), project_name and timestamp.

    Args:
        tp_data: DataFrame containing true positive cases
        fp_data: DataFrame containing false positive cases
        fn_data: DataFrame containing false negative cases
        tn_data: DataFrame containing true negative cases
        feature_importance: DataFrame containing feature importance values

    Each case frame must provide 'Source ID' and 'Target ID' columns.

    Returns:
        DataFrame with one row per processed pair and the columns case_type,
        source_id, target_id, weighted_score, agreement_score, primary_avg
        and secondary_avg.

    Raises:
        ValueError: if all four case frames are empty.
        Exception: any other failure is logged, passed to handle_exception
            and re-raised unchanged.

    Side effects:
        Saves aggregate_score_analysis_<project>_<timestamp>.png and
        aggregate_score_statistics_<project>_<timestamp>.csv into
        results['visualizations'].
    """
    logger.info("Starting aggregate score analysis")

    try:
        # Create mock model for consistent testing
        # NOTE(review): the mock returns unseeded random values. If
        # create_optimized_aggregate derives its signals from
        # compute_similarity, the plots and statistics below describe random
        # noise rather than real model output - confirm this is intended.
        class MockModelDict:
            def compute_similarity(self, source, target, model_name=None):
                return np.random.uniform(0, 1)

        mock_model = MockModelDict()

        # Process all cases
        logger.info("Processing all cases:")
        logger.info(f"True Positives: {len(tp_data)}")
        logger.info(f"False Positives: {len(fp_data)}")
        logger.info(f"False Negatives: {len(fn_data)}")
        logger.info(f"True Negatives: {len(tn_data)}")

        case_inputs = [
            ('True Positive', tp_data),
            ('False Positive', fp_data),
            ('False Negative', fn_data),
            ('True Negative', tn_data),
        ]
        scored_frames = [
            _score_case_pairs(data, case_type, mock_model, feature_importance)
            for case_type, data in case_inputs
        ]

        # Drop empty categories before concatenating: an empty, column-less
        # frame adds no rows, and leaving it out keeps the numeric column
        # dtypes independent of how pandas treats empty concat inputs.
        non_empty_frames = [frame for frame in scored_frames if not frame.empty]
        if not non_empty_frames:
            # Previously this surfaced later as an opaque KeyError('case_type').
            raise ValueError("No cases to analyze: all prediction categories are empty")

        # Combine results
        all_results = pd.concat(non_empty_frames)
        logger.info(f"Total processed cases: {len(all_results)}")

        # Split once per category; plots and statistics share these slices.
        groups = {
            case_type: all_results[all_results['case_type'] == case_type]
            for case_type in _CASE_STYLES
        }

        # Create visualizations
        _plot_case_panels(groups)

        # Save plot
        plot_filename = f"aggregate_score_analysis_{project_name}_{timestamp}.png"
        plot_path = os.path.join(results['visualizations'], plot_filename)
        plt.savefig(plot_path)
        logger.info(f"Analysis plots saved to {plot_path}")

        # Calculate and log statistics, then save them to CSV
        stats_df = _summarize_case_groups(groups)
        stats_filename = f"aggregate_score_statistics_{project_name}_{timestamp}.csv"
        stats_path = os.path.join(results['visualizations'], stats_filename)
        stats_df.to_csv(stats_path, index=False)
        logger.info(f"\nDetailed statistics saved to {stats_path}")

        return all_results

    except Exception as e:
        logger.error("Error in aggregate score analysis", exc_info=True)
        handle_exception(e)
        raise

# Run the aggregate score analysis over the four prediction-category frames.
# Any failure is logged here, handed to handle_exception and re-raised so the
# cell still stops with the original error.
try:
    logger.info("Starting aggregate score analysis visualization")
    analysis_results = analyze_aggregate_scores(
        tp_data=tp_analysis,
        fp_data=fp_analysis,
        fn_data=fn_analysis,
        tn_data=tn_analysis,
        feature_importance=feature_importance,
    )
    logger.info("Aggregate score analysis visualization completed")

except Exception as exc:
    logger.error("Failed to complete aggregate score analysis", exc_info=True)
    handle_exception(exc)
    raise