# Similarity Score Performance Analysis
**Evaluates machine learning models on similarity scores from sentence transformers to optimize requirement matching accuracy and feature selection.**


In [None]:
# Cell [0] - Setup and Imports
# Purpose: Import all required libraries and configure environment settings for the similarity score analysis
# Dependencies: os, io, sys, pathlib, dotenv, pandas, numpy, sklearn, matplotlib, seaborn, tqdm, praxis_sentence_transformer
# Breadcrumbs: Setup -> Imports -> Environment Configuration

import os
import io
import sys
from pathlib import Path
from dotenv import load_dotenv
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, fbeta_score, roc_auc_score, confusion_matrix,
    roc_curve, auc, precision_recall_curve, precision_score, recall_score
)
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

# Import project modules (installed via pip)
from praxis_sentence_transformer import (
    setup_logging,
    handle_exception,
    DebugTimer,
    Neo4jClient
)

# Populate the process environment via python-dotenv before anything reads it
load_dotenv()

# Notebook-wide logger used by every cell below
logger = setup_logging("similarity-score-analysis-notebook")

try:
    logger.info("Initializing Similarity Score Analysis Notebook")

    # Each of these must resolve to a non-empty value
    required_env_vars = ['NEO4J_URI', 'NEO4J_USER', 'NEO4J_PASSWORD', 'PROJECT_NAME']
    missing_vars = list(filter(lambda name: not os.getenv(name), required_env_vars))

    # Guard clause: stop immediately when the configuration is incomplete
    if missing_vars:
        logger.error(f"Missing required environment variables: {missing_vars}")
        raise EnvironmentError(f"Missing required environment variables: {missing_vars}")

    logger.debug("All required environment variables loaded successfully")
    logger.info(f"Working with project: {os.getenv('PROJECT_NAME')}")

except Exception as exc:
    logger.error("Failed to initialize notebook", exc_info=True)
    handle_exception(exc)
    raise

In [None]:
# Cell [1] - Neo4j Connection and Data Loading
# Purpose: Establish database connection and load similarity data for analysis
# Dependencies: Neo4jClient, logger, os, pandas
# Breadcrumbs: Setup -> Database Connection -> Data Retrieval

def check_schema():
    """Fetch the Neo4j schema and log SIMILAR_TO statistics for the current project.

    Connects with the NEO4J_* environment variables, reads the schema through
    ``apoc.meta.schema()``, counts the SIMILAR_TO relationships tagged with
    PROJECT_NAME, and closes the client whether or not the queries succeed.

    Returns:
        The ``value`` field of the single record produced by the schema query.

    Raises:
        Exception: Any failure is logged, handed to ``handle_exception`` and
            re-raised unchanged.
    """
    logger.debug("Initializing Neo4j schema check")
    neo4j_client = None  # remains None when the constructor itself fails
    try:
        neo4j_client = Neo4jClient(
            uri=os.getenv('NEO4J_URI'),
            username=os.getenv('NEO4J_USER'),
            password=os.getenv('NEO4J_PASSWORD')
        )

        # Schema introspection query
        query = """
        CALL apoc.meta.schema()
        YIELD value
        RETURN value
        """

        with neo4j_client.driver.session() as db_session:
            schema = db_session.run(query).single()['value']

            # Keep the raw structure available for troubleshooting
            logger.debug(f"Raw schema structure: {schema}")

            # Count the similarity edges that belong to the current project
            project_name = os.getenv('PROJECT_NAME')
            count_query = """
            MATCH (p:Project {name: $project_name})<-[:CONTAINS]-(d:Document)-[:CONTAINS]->(r:Requirement)
            WITH r
            MATCH (r)-[rel:SIMILAR_TO]->(other:Requirement)
            WHERE rel.project = $project_name
            RETURN count(rel) as similar_count
            """

            count_record = db_session.run(count_query, project_name=project_name).single()
            similar_count = count_record['similar_count']

            logger.info("Database Schema Statistics:")
            logger.info(f"Number of SIMILAR_TO relationships in project {project_name}: {similar_count}")

            if 'SIMILAR_TO' in schema:
                logger.debug(f"SIMILAR_TO properties: {schema['SIMILAR_TO']['properties'].keys()}")

            return schema

    except Exception as exc:
        logger.error("Error checking Neo4j schema", exc_info=True)
        handle_exception(exc)
        raise
    finally:
        # Only close a client that was actually constructed
        if neo4j_client is not None:
            neo4j_client.close()
            logger.debug("Successfully closed Neo4j connection")

def get_similarity_data():
    """Load per-model similarity scores for the current project as one wide table.

    Queries every SIMILAR_TO relationship tagged with PROJECT_NAME, flags the
    pair as related when a matching GROUND_TRUTH relationship exists, and
    pivots the result so that each model's score becomes its own column.

    Returns:
        pandas.DataFrame: One row per (source_id, target_id, is_related)
        combination with one score column per model name (known names are
        renamed to ``<name>_score``). An empty DataFrame when the query
        returns no rows.

    Raises:
        Exception: Any failure is logged, handed to ``handle_exception`` and
            re-raised unchanged.
    """
    logger.debug("Initializing similarity data retrieval")
    neo4j_client = None  # bound before the try so the finally clause can test it
    try:
        neo4j_client = Neo4jClient(
            uri=os.getenv('NEO4J_URI'),
            username=os.getenv('NEO4J_USER'),
            password=os.getenv('NEO4J_PASSWORD')
        )

        project_name = os.getenv('PROJECT_NAME')
        query = """
        MATCH (r1:Requirement)-[s:SIMILAR_TO]->(r2:Requirement)
        WHERE s.project = $project_name
        OPTIONAL MATCH (r1)-[g:GROUND_TRUTH]->(r2)
        WHERE g.project = $project_name
        RETURN 
            r1.id as source_id,
            r2.id as target_id,
            s.similarity as similarity_score,
            s.model as model_name,
            CASE WHEN g IS NOT NULL THEN 1 ELSE 0 END as is_related
        """

        # Materialise the records while the session is still open
        with neo4j_client.driver.session() as session:
            result = session.run(query, project_name=project_name)
            records = [dict(record) for record in result]

        # Long format: one row per (pair, model)
        data = pd.DataFrame(records)
        if data.empty:
            logger.warning("No data found for the specified project")
            return pd.DataFrame()

        # Wide format: one row per pair, one column per model
        model_scores = data.pivot(
            index=['source_id', 'target_id', 'is_related'],
            columns='model_name',
            values='similarity_score'
        ).reset_index()

        # Rename the known model columns to the "<name>_score" convention
        model_mapping = {f'model{number}': f'model{number}_score' for number in range(1, 9)}
        model_mapping['tfidf'] = 'tfidf_score'
        model_scores.rename(columns=model_mapping, inplace=True)

        related_pairs = model_scores['is_related'].sum()
        logger.info("\nDataset Statistics:")
        logger.info(f"Total pairs: {len(model_scores)}")
        logger.info(f"Related pairs: {related_pairs}")
        logger.info(f"Unrelated pairs: {len(model_scores) - related_pairs}")

        if model_scores.isnull().values.any():
            logger.warning("Missing values found in the dataset")
        else:
            logger.debug("No missing values found in the dataset")

        logger.info("\nFirst 5 rows of dataset:")
        logger.info(f"\n{model_scores.head()}")
        logger.info("\nLast 5 rows of dataset:")
        logger.info(f"\n{model_scores.tail()}")

        # DataFrame.info() writes its summary to a stream and returns None,
        # so capture the text in a buffer instead of formatting the return value.
        info_buffer = io.StringIO()
        model_scores.info(buf=info_buffer)
        logger.info("\nDataset Info:")
        logger.info(f"\n{info_buffer.getvalue()}")

        return model_scores

    except Exception as e:
        logger.error("Error retrieving similarity data", exc_info=True)
        handle_exception(e)
        raise
    finally:
        # Only close a client that was actually constructed
        if neo4j_client is not None:
            neo4j_client.close()
            logger.debug("Successfully closed Neo4j connection")

# Inspect the schema first, then load the pair table that later cells consume
try:
    schema = check_schema()
    similarity_data = get_similarity_data()
except Exception as exc:
    logger.error("Failed to initialize data", exc_info=True)
    handle_exception(exc)
    raise

In [None]:
# Cell [2] - Data Preparation and Model Training
# Purpose: Prepare balanced dataset and train Random Forest classifier for similarity prediction
# Dependencies: pandas, sklearn, logger, os
# Breadcrumbs: Data Retrieval -> Data Preparation -> Model Training

def prepare_and_train_model(data):
    """Balance the dataset, train a Random Forest and report held-out metrics.

    The majority class is down-sampled (seeded) to the size of the minority
    class, the balanced data is split 80/20 with stratification, and a
    RandomForestClassifier is fitted on the training part.

    Args:
        data: DataFrame with ``source_id``, ``target_id``, ``is_related`` and
            one numeric column per feature; every other column is a feature.

    Returns:
        tuple: ``(model, feature_importance, X_test, y_test, y_pred)`` where
        ``feature_importance`` is a DataFrame with ``feature`` and
        ``importance`` columns sorted by descending importance.

    Raises:
        ValueError: If either class has no samples.
        Exception: Any failure is logged, handed to ``handle_exception`` and
            re-raised unchanged.
    """
    logger.info(f"Starting data preparation and model training for project: {os.getenv('PROJECT_NAME')}")
    try:
        # Prepare balanced dataset
        logger.debug("Preparing balanced dataset")
        positive_samples = data[data['is_related'] == 1]
        negative_samples = data[data['is_related'] == 0]

        samples_per_class = min(len(positive_samples), len(negative_samples))
        if samples_per_class == 0:
            raise ValueError(
                "Cannot build a balanced dataset: "
                f"{len(positive_samples)} related and {len(negative_samples)} unrelated pairs"
            )

        # Down-sample whichever class is larger; sampling more rows than a
        # class contains would raise, so the minority class sets the size.
        if len(positive_samples) > samples_per_class:
            positive_samples = positive_samples.sample(n=samples_per_class, random_state=42)
        negative_samples = negative_samples.sample(n=samples_per_class, random_state=42)
        balanced_data = pd.concat([positive_samples, negative_samples])

        logger.debug(f"Created balanced dataset with {len(balanced_data)} total samples")

        # Everything that is not an identifier or the label is a feature
        feature_columns = [col for col in balanced_data.columns
                           if col not in ['source_id', 'target_id', 'is_related']]
        X = balanced_data[feature_columns]
        y = balanced_data['is_related']

        # Stratify so both splits keep the 50/50 class ratio
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        logger.debug(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")

        # Initialize and train model
        logger.debug("Initializing RandomForestClassifier")
        model = RandomForestClassifier(
            n_estimators=100,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            random_state=42
        )

        logger.debug("Training RandomForestClassifier")
        model.fit(X_train, y_train)

        # Evaluate model
        logger.debug("Evaluating model performance")
        y_pred = model.predict(X_test)
        # ROC AUC is a ranking metric: score it on the positive-class
        # probability, not on the thresholded 0/1 predictions.
        y_prob = model.predict_proba(X_test)[:, 1]

        classification_rep = classification_report(y_test, y_pred)
        f2 = fbeta_score(y_test, y_pred, beta=2)
        roc_auc = roc_auc_score(y_test, y_prob)
        conf_matrix = confusion_matrix(y_test, y_pred)

        # Log results
        logger.info("\nClassification Report:")
        logger.info(f"\n{classification_rep}")
        logger.info(f"F2 Score: {f2:.3f}")
        logger.info(f"ROC AUC Score: {roc_auc:.3f}")
        logger.info("Confusion Matrix:")
        logger.info(f"\n{conf_matrix}")

        # Feature importance analysis
        feature_importance = pd.DataFrame({
            'feature': feature_columns,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)

        logger.info("\nFeature Importance:")
        logger.info(f"\n{feature_importance}")

        return model, feature_importance, X_test, y_test, y_pred

    except Exception as e:
        logger.error("Error in model preparation and training", exc_info=True)
        handle_exception(e)
        raise

# Train on the pair table loaded in Cell [1] and keep the held-out split for later cells
try:
    (model,
     feature_importance,
     X_test,
     y_test,
     y_pred) = prepare_and_train_model(similarity_data)
except Exception as exc:
    logger.error("Failed to prepare and train model", exc_info=True)
    handle_exception(exc)
    raise

In [None]:
# Cell [3] - Visualization Functions
# Purpose: Create comprehensive visualizations for model performance analysis and results interpretation
# Dependencies: matplotlib, seaborn, sklearn.metrics, numpy, pandas, datetime
# Breadcrumbs: Model Training -> Performance Analysis -> Visualization Generation

def create_visualizations(model, feature_importance, X_test, y_test, y_pred):
    """Create, save and display the model-analysis figures.

    Produces a 2x2 figure (confusion matrix, feature importance, ROC curve,
    precision-recall curve) followed by a second figure plotting F2,
    precision and recall against the classification threshold. Both figures
    are written as timestamped PNG files using a relative file name and then
    shown; a failed save is logged as a warning and does not abort the run.

    Args:
        model: Fitted classifier; must provide ``predict_proba``.
        feature_importance: DataFrame with ``feature`` and ``importance`` columns.
        X_test: Held-out feature matrix passed to ``predict_proba``.
        y_test: True labels for ``X_test``.
        y_pred: Hard predictions for ``X_test`` (used for the confusion matrix).

    Raises:
        Exception: Any failure outside the guarded save steps is logged,
            handed to ``handle_exception`` and re-raised unchanged.
    """
    project_name = os.getenv('PROJECT_NAME')
    logger.info(f"Creating visualizations for project: {project_name}")

    try:
        # Set up the figure with subplots
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(20, 16))
        fig.suptitle(f'Model Analysis Visualizations - Project: {project_name}', fontsize=16)

        # 1. Confusion Matrix Heatmap (top-left axes)
        logger.debug("Creating confusion matrix heatmap")
        conf_matrix = confusion_matrix(y_test, y_pred)
        sns.heatmap(
            conf_matrix, 
            annot=True, 
            fmt='d', 
            cmap='Blues',
            xticklabels=['Not Related', 'Related'],
            yticklabels=['Not Related', 'Related'],
            ax=ax1
        )
        ax1.set_title('Confusion Matrix')
        ax1.set_xlabel('Predicted')
        ax1.set_ylabel('Actual')

        # 2. Feature Importance Plot (top-right axes)
        logger.debug("Creating feature importance plot")
        # The returned axes object is not used afterwards; drawing happens on ax2
        feature_importance_plot = feature_importance.plot(
            kind='barh',
            x='feature',
            y='importance',
            ax=ax2,
            color='skyblue'
        )
        ax2.set_title('Feature Importance')
        ax2.set_xlabel('Importance Score')
        plt.setp(ax2.get_xticklabels(), rotation=45, ha='right')

        # 3. ROC Curve (bottom-left axes), built from positive-class probabilities
        logger.debug("Creating ROC curve")
        y_prob = model.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        roc_auc = auc(fpr, tpr)

        ax3.plot(
            fpr, 
            tpr, 
            color='darkorange',
            lw=2, 
            label=f'ROC curve (AUC = {roc_auc:.2f})'
        )
        # Dashed diagonal drawn as a visual reference line
        ax3.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        ax3.set_xlim([0.0, 1.0])
        ax3.set_ylim([0.0, 1.05])
        ax3.set_xlabel('False Positive Rate')
        ax3.set_ylabel('True Positive Rate')
        ax3.set_title('Receiver Operating Characteristic (ROC) Curve')
        ax3.legend(loc="lower right")

        # 4. Precision-Recall Curve (bottom-right axes)
        logger.debug("Creating precision-recall curve")
        precision, recall, _ = precision_recall_curve(y_test, y_prob)
        pr_auc = auc(recall, precision)

        ax4.plot(
            recall, 
            precision, 
            color='green',
            lw=2, 
            label=f'PR curve (AUC = {pr_auc:.2f})'
        )
        ax4.set_xlim([0.0, 1.0])
        ax4.set_ylim([0.0, 1.05])
        ax4.set_xlabel('Recall')
        ax4.set_ylabel('Precision')
        ax4.set_title('Precision-Recall Curve')
        ax4.legend(loc="lower left")

        plt.tight_layout()

        # Save the plot; a failed save is only a warning so the figure is still shown
        try:
            plot_filename = f"model_analysis_{project_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
            plt.savefig(plot_filename)
            logger.info(f"Saved visualization plot to {plot_filename}")
        except Exception as save_error:
            logger.warning(f"Could not save plot to file: {str(save_error)}")

        plt.show()

        # Additional Analysis: Model Performance at Different Thresholds
        logger.debug("Calculating threshold metrics")
        thresholds = np.arange(0.1, 1.0, 0.1)
        threshold_metrics = []

        for threshold in thresholds:
            # Re-threshold the probabilities instead of using the model's default cut-off
            y_pred_threshold = (y_prob >= threshold).astype(int)
            f2 = fbeta_score(y_test, y_pred_threshold, beta=2)
            # NOTE: `precision` and `recall` are rebound here to scalars; the
            # curve arrays of the same name above have already been plotted.
            precision = precision_score(y_test, y_pred_threshold)
            recall = recall_score(y_test, y_pred_threshold)

            threshold_metrics.append({
                'threshold': threshold,
                'f2_score': f2,
                'precision': precision,
                'recall': recall
            })

        threshold_df = pd.DataFrame(threshold_metrics)
        logger.info("\nThreshold Analysis:")
        logger.info(f"\n{threshold_df}")

        # Plot threshold analysis on a new, separate figure
        fig, ax = plt.subplots(figsize=(10, 6))
        threshold_df.plot(x='threshold', y=['f2_score', 'precision', 'recall'], ax=ax)
        plt.title(f'Model Metrics vs Threshold - Project: {project_name}')
        plt.xlabel('Threshold')
        plt.ylabel('Score')
        plt.grid(True)
        plt.legend()

        # Save the threshold analysis plot (same best-effort handling as above)
        try:
            threshold_plot_filename = f"threshold_analysis_{project_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
            plt.savefig(threshold_plot_filename)
            logger.info(f"Saved threshold analysis plot to {threshold_plot_filename}")
        except Exception as save_error:
            logger.warning(f"Could not save threshold analysis plot to file: {str(save_error)}")

        plt.show()

    except Exception as e:
        logger.error("Error creating visualizations", exc_info=True)
        handle_exception(e)
        raise

# Render the figures for the classifier and held-out split produced in Cell [2]
try:
    create_visualizations(
        model,
        feature_importance,
        X_test,
        y_test,
        y_pred,
    )
except Exception as exc:
    logger.error("Failed to create visualizations", exc_info=True)
    handle_exception(exc)
    raise

In [None]:
# Cell [4] - Dataset Distribution Analysis
# Purpose: Analyze distribution of related and unrelated requirement pairs for understanding data balance
# Dependencies: pandas, logger
# Breadcrumbs: Visualization Generation -> Dataset Analysis -> Distribution Statistics

try:
    # `similarity_data` is the pair table loaded in Cell [1]; the cells above
    # bind no other DataFrame of pairs, so read the labels from it directly.
    pair_labels = similarity_data['is_related']

    # Method 1: Using sum() - labels are 0/1, so the sum is the related count
    related_count = pair_labels.sum()
    logger.info("Dataset Distribution Analysis:")
    logger.info(f"Number of related pairs: {related_count}")

    # Method 2: Using value_counts() to see both related and unrelated counts
    distribution = pair_labels.value_counts()
    logger.info("Distribution of related/unrelated pairs:")
    for label, count in distribution.items():
        logger.info(f"Class {label}: {count} pairs")

    # Method 3: Using value_counts(normalize=True) to see percentages
    percentage_dist = pair_labels.value_counts(normalize=True) * 100
    logger.info("Percentage distribution:")
    for label, percentage in percentage_dist.items():
        logger.info(f"Class {label}: {percentage:.2f}%")

    # Additional statistics
    total_pairs = len(pair_labels)
    logger.info(f"Total number of pairs analyzed: {total_pairs}")
    if related_count:
        logger.info(f"Imbalance ratio (unrelated:related): {(total_pairs - related_count)/related_count:.2f}:1")
    else:
        # No related pairs: the ratio would divide by zero
        logger.warning("Imbalance ratio (unrelated:related): undefined, no related pairs in the dataset")

except Exception as e:
    logger.error("Failed to analyze dataset distribution", exc_info=True)
    handle_exception(e)
    raise

In [None]:
# Cell [5] - Feature Importance Analysis
# Purpose: Visualize and analyze feature importance from trained model for understanding model behavior
# Dependencies: pandas, matplotlib, seaborn
# Breadcrumbs: Dataset Analysis -> Feature Analysis -> Model Interpretation

def plot_feature_importance(rf, feature_names):
    """Draw a horizontal bar chart of the model's feature importances.

    Args:
        rf: Fitted estimator exposing ``feature_importances_``.
        feature_names: Display labels, one per importance value, in the same order.
    """
    # Pair each label with its importance, ordered from least to most important
    ranked = pd.DataFrame(
        {'feature': feature_names, 'importance': rf.feature_importances_}
    ).sort_values(by='importance')

    plt.figure(figsize=(12, 8))  # larger canvas leaves room for many labels
    sns.barplot(x='importance', y='feature', data=ranked)
    plt.title('Feature Importance in Predicting Related Requirements')
    plt.xlabel('Importance Score')
    plt.tight_layout()
    plt.show()

# Display labels keyed by the score column they describe
feature_labels = {
    'model1_score': 'Model 1 (multi-qa-mpnet-dot)',
    'model2_score': 'Model 2 (all-mpnet)',
    'model3_score': 'Model 3 (MiniLM)',
    'model4_score': 'Model 4 (distilroberta)',
    'model5_score': 'Model 5 (multi-qa-distilbert-cos)',
    'model6_score': 'Model 6 (multi-qa-MiniLM-L6-cos)',
    'model7_score': 'Model 7 (stsb-bert-large)',
    'model8_score': 'Model 8 (stsb-bert-base)',
    'tfidf_score': 'TF-IDF',
}

# Build the labels from the feature columns of the held-out split from Cell [2]
# so each label lines up with its importance value even when some models are
# absent; unknown columns fall back to their raw column name.
feature_names = [feature_labels.get(column, column) for column in X_test.columns]

# `model` is the classifier returned by prepare_and_train_model in Cell [2]
plot_feature_importance(model, feature_names)

In [None]:
# Cell [6] - Threshold Analysis
# Purpose: Analyze model performance at different probability thresholds including F2 score optimization
# Dependencies: numpy, pandas, sklearn.metrics, matplotlib, pyplot
# Breadcrumbs: Model Interpretation -> Threshold Optimization -> Performance Analysis
def analyze_thresholds(rf, X_test, y_test):
    """Evaluate precision, recall, F1 and F2 over thresholds 0.1 to 0.9.

    Args:
        rf: Fitted classifier providing ``predict_proba``.
        X_test: Feature matrix to score.
        y_test: True labels for ``X_test``.

    Returns:
        pandas.DataFrame: One row per threshold with the columns
        ``threshold``, ``precision``, ``recall``, ``f1`` and ``f2``.

    Raises:
        Exception: Any failure is logged, handed to ``handle_exception`` and
            re-raised unchanged.
    """
    logger.debug("Starting threshold analysis")
    try:
        positive_prob = rf.predict_proba(X_test)[:, 1]
        rows = []

        for cutoff in np.arange(0.1, 1.0, 0.1):
            labels_at_cutoff = (positive_prob >= cutoff).astype(int)
            # Metrics for the positive class come from the report's '1' entry
            positive_report = classification_report(
                y_test, labels_at_cutoff, output_dict=True
            )['1']
            f2_value = fbeta_score(y_test, labels_at_cutoff, beta=2)
            rows.append({
                'threshold': cutoff,
                'precision': positive_report['precision'],
                'recall': positive_report['recall'],
                'f1': positive_report['f1-score'],
                'f2': f2_value
            })
            logger.debug(f"Analyzed threshold {cutoff:.1f}: F2={f2_value:.3f}")

        results_df = pd.DataFrame(rows)

        # One line per metric across the threshold range
        logger.debug("Generating threshold analysis plot")
        plt.figure(figsize=(10, 6))
        for metric_name in ('precision', 'recall', 'f1', 'f2'):
            plt.plot(results_df['threshold'], results_df[metric_name], label=metric_name)
        plt.xlabel('Threshold')
        plt.ylabel('Score')
        plt.title('Metrics vs Classification Threshold')
        plt.legend()
        plt.grid(True)
        plt.show()

        # Pick the best row for every metric before logging any of them
        logger.info("Best Thresholds for Each Metric:")
        headline_metrics = (
            ("F1 Score", 'f1'),
            ("F2 Score", 'f2'),
            ("Precision", 'precision'),
            ("Recall", 'recall'),
        )
        best_rows = [
            (title, results_df.loc[results_df[column].idxmax()])
            for title, column in headline_metrics
        ]

        for title, best in best_rows:
            logger.info(f"Best {title} (threshold={best['threshold']:.3f}):")
            logger.info(f"  Precision: {best['precision']:.3f}")
            logger.info(f"  Recall: {best['recall']:.3f}")
            logger.info(f"  F1: {best['f1']:.3f}")
            logger.info(f"  F2: {best['f2']:.3f}")

        logger.debug("Threshold analysis completed successfully")
        return results_df

    except Exception as exc:
        logger.error("Error during threshold analysis", exc_info=True)
        handle_exception(exc)
        raise

try:
    logger.info("Starting threshold analysis for model performance")
    # `model` is the classifier returned by prepare_and_train_model in Cell [2];
    # the cells above bind no object named `rf`.
    threshold_results = analyze_thresholds(model, X_test, y_test)
    logger.debug("Threshold analysis completed")

except Exception as e:
    logger.error("Failed to complete threshold analysis", exc_info=True)
    handle_exception(e)
    raise

In [None]:
# Cell [7] - Get Requirement Texts
# Purpose: Retrieve requirement text content from Neo4j database for detailed analysis
# Dependencies: Neo4jClient, logger, os
# Breadcrumbs: Performance Analysis -> Data Retrieval -> Text Analysis
def get_requirement_texts(source_ids, target_ids):
    """Get requirement texts from Neo4j for given IDs.

    Args:
        source_ids: IDs of Requirement nodes with ``type: 'SOURCE'``.
        target_ids: IDs of Requirement nodes with ``type: 'TARGET'``.

    Returns:
        tuple: ``(source_texts, target_texts)``, two dicts mapping each
        requirement ID that was found to its ``content``. IDs without a
        matching node are simply absent.

    Raises:
        Exception: Any failure is logged, handed to ``handle_exception`` and
            re-raised unchanged.
    """
    logger.debug("Fetching requirement texts from Neo4j")
    neo4j_client = None  # bound before the try so the finally clause can test it
    try:
        neo4j_client = Neo4jClient(
            uri=os.getenv('NEO4J_URI'),
            username=os.getenv('NEO4J_USER'),
            password=os.getenv('NEO4J_PASSWORD')
        )

        # Query to get source requirement texts
        query = """
        MATCH (s:Requirement {type: 'SOURCE'})
        WHERE s.id IN $source_ids
        RETURN s.id as source_id, s.content as source_text
        """
        with neo4j_client.driver.session() as session:
            result = session.run(query, source_ids=source_ids)
            source_texts = {record['source_id']: record['source_text'] for record in result}

        # Query to get target requirement texts
        query = """
        MATCH (t:Requirement {type: 'TARGET'})
        WHERE t.id IN $target_ids
        RETURN t.id as target_id, t.content as target_text
        """
        with neo4j_client.driver.session() as session:
            result = session.run(query, target_ids=target_ids)
            target_texts = {record['target_id']: record['target_text'] for record in result}

        logger.debug(f"Successfully retrieved texts for {len(source_texts)} source and {len(target_texts)} target requirements")
        return source_texts, target_texts

    except Exception as e:
        logger.error("Failed to retrieve requirement texts", exc_info=True)
        handle_exception(e)
        raise
    finally:
        # Close on every path; previously a failing query left the client open
        if neo4j_client is not None:
            neo4j_client.close()
            logger.debug("Successfully closed Neo4j connection")

In [None]:
# Cell [8] - Analyze False Negatives
# Purpose: Identify and analyze cases where model failed to detect true positive relationships
# Dependencies: pandas, numpy, logger, matplotlib
# Breadcrumbs: Text Analysis -> Error Analysis -> False Negative Detection
def analyze_false_negatives(rf, X_full, y_full, threshold=0.5, pair_data=None):
    """Analyze all cases where the model predicted negative but the pair is related.

    Args:
        rf: Random Forest model providing ``predict_proba``.
        X_full: Full feature set.
        y_full: Full ground truth labels, positionally aligned with ``X_full``.
        threshold: Classification threshold (default 0.5).
        pair_data: DataFrame holding ``source_id``, ``target_id`` and the
            ``*_score`` columns, positionally aligned with ``X_full``. When
            omitted, the module-level ``df`` is used, as before.

    Returns:
        pandas.DataFrame: One row per false negative, sorted by descending
        probability, with the columns ``Actual``, ``Predicted``,
        ``Probability``, ``Source ID``, ``Target ID`` and ``Model 1`` to
        ``Model 9`` (``Model 9`` is the TF-IDF score).

    Raises:
        Exception: Any failure is logged, handed to ``handle_exception`` and
            re-raised unchanged.
    """
    logger.debug(f"Starting false negative analysis with threshold {threshold}")
    try:
        if pair_data is None:
            # Backward-compatible default for callers that rely on the global table
            pair_data = df

        score_columns = ('model1_score', 'model2_score', 'model3_score',
                         'model4_score', 'model5_score', 'model6_score',
                         'model7_score', 'model8_score', 'tfidf_score')

        # Get predictions using the specified threshold
        y_prob = rf.predict_proba(X_full)[:, 1]
        y_pred = (y_prob >= threshold).astype(int)

        # Compare positionally so differing index labels cannot misalign rows
        actual = np.asarray(y_full)
        fn_indices = np.where((y_pred == 0) & (actual == 1))[0]
        fn_rows = pair_data.iloc[fn_indices]

        # Plain arrays plus an explicit index keep every column aligned
        fn_data = pd.DataFrame({
            'Actual': actual[fn_indices],
            'Predicted': y_pred[fn_indices],
            'Probability': y_prob[fn_indices],
            'Source ID': fn_rows['source_id'].to_numpy(),
            'Target ID': fn_rows['target_id'].to_numpy(),
        }, index=fn_rows.index)

        # Add model scores
        for model_number, column in enumerate(score_columns, start=1):
            fn_data[f'Model {model_number}'] = fn_rows[column].to_numpy()

        # Sort by probability
        fn_data = fn_data.sort_values('Probability', ascending=False)

        # The false negative rate is FN / actual positives, not FN / all pairs
        total_false_negatives = len(fn_indices)
        total_positives = int((actual == 1).sum())
        fn_rate = total_false_negatives / total_positives if total_positives else 0.0

        logger.info("\nFalse Negative Analysis Results:")
        logger.info(f"Total false negatives: {total_false_negatives}")
        logger.info(f"False negative rate: {fn_rate:.2%}")

        if total_false_negatives == 0:
            # Nothing to bin, summarise or plot; avoids NaN percentages and statistics
            logger.info("No false negatives at this threshold; skipping distribution analysis")
            return fn_data

        # Log probability distribution in 0.1-wide bins covering [0, threshold),
        # so no false negative is left out when the threshold is above 0.5
        logger.info("\nProbability Distribution of False Negatives:")
        bin_count = int(np.ceil(threshold * 10 - 1e-9))
        for bin_index in range(bin_count):
            low = bin_index / 10
            high = min((bin_index + 1) / 10, threshold)
            count = ((fn_data['Probability'] >= low) & (fn_data['Probability'] < high)).sum()
            logger.info(f"Probability {low:.2f}-{high:.2f}: {count} cases ({count/total_false_negatives:.1%})")

        # Log model score statistics
        logger.info("\nModel Score Statistics for False Negatives:")
        for model_number in range(1, len(score_columns) + 1):
            scores = fn_data[f'Model {model_number}']
            logger.info(f"Model {model_number}:")
            logger.info(f"  Mean: {scores.mean():.3f}")
            logger.info(f"  Std: {scores.std():.3f}")
            logger.info(f"  Min: {scores.min():.3f}")
            logger.info(f"  Max: {scores.max():.3f}")

        # Plot probability distribution
        logger.debug("Generating probability distribution plot")
        plt.figure(figsize=(10, 6))
        plt.hist(fn_data['Probability'], bins=20)
        plt.xlabel('Prediction Probability')
        plt.ylabel('Count')
        plt.title('Distribution of False Negative Probabilities')
        plt.grid(True)
        plt.show()

        logger.debug("False negative analysis completed successfully")
        return fn_data

    except Exception as e:
        logger.error("Error during false negative analysis", exc_info=True)
        handle_exception(e)
        raise

try:
    logger.info("Starting false negative analysis")

    # Build the full-dataset inputs from the pair table loaded in Cell [1].
    # analyze_false_negatives looks rows up through the module-level name `df`,
    # which must be positionally aligned with X_full and y_full.
    # NOTE(review): these rows include the pairs the model was trained on.
    df = similarity_data
    X_full = df[list(X_test.columns)]  # same feature columns the model was fitted on
    y_full = df['is_related']

    # `model` is the classifier returned by prepare_and_train_model in Cell [2]
    fn_analysis = analyze_false_negatives(model, X_full, y_full, threshold=0.3)

    # Get requirement texts for example cases
    top_cases = fn_analysis.head()
    source_texts, target_texts = get_requirement_texts(
        top_cases['Source ID'].tolist(),
        top_cases['Target ID'].tolist()
    )

    # Log example cases; number them by rank, since the DataFrame index holds
    # the original row labels rather than 0..4 after sorting
    logger.info("Example False Negative Cases (Top 5 by probability):")
    for case_number, (_, row) in enumerate(top_cases.iterrows(), start=1):
        logger.info(f"Case {case_number}:")
        logger.info(f"Source ID: {row['Source ID']}")
        logger.info(f"Target ID: {row['Target ID']}")
        logger.info(f"Prediction Probability: {row['Probability']:.3f}")
        for i in range(1, 10):
            logger.info(f"Model {i} Score: {row[f'Model {i}']:.3f}")

        # Log requirement texts at DEBUG level
        logger.debug("Requirement Texts:")
        logger.debug(f"Source Text: {source_texts.get(row['Source ID'], 'Not found')}")
        logger.debug(f"Target Text: {target_texts.get(row['Target ID'], 'Not found')}")

    logger.debug("False negative analysis completed")

except Exception as e:
    logger.error("Failed to complete false negative analysis", exc_info=True)
    handle_exception(e)
    raise

In [None]:
# Cell [9] - Analyze False Positives
# Purpose: Identify and analyze cases where model incorrectly predicted positive relationships
# Dependencies: pandas, numpy, logger, matplotlib
# Breadcrumbs: False Negative Detection -> Error Analysis -> False Positive Detection
def analyze_false_positives(rf, X_full, y_full, threshold=0.5):
    """
    Analyze all cases where model predicted positive but actual was negative.

    Logs the number of false positives, their probability distribution and
    per-model score statistics, shows a histogram of their probabilities, and
    returns the cases as a DataFrame.

    Besides its arguments, this function reads the module-level ``df`` by
    position (``df.iloc``) to fetch IDs and model scores, so ``df`` is assumed
    to be row-aligned with ``X_full``/``y_full`` -- TODO confirm against the
    cell that builds them.

    Args:
        rf: Random Forest model (must provide ``predict_proba``)
        X_full: Full feature set
        y_full: Full ground truth labels (indexed with ``.iloc``)
        threshold: Classification threshold (default 0.5)

    Returns:
        DataFrame with columns 'Actual', 'Predicted', 'Probability',
        'Source ID', 'Target ID' and 'Model 1'..'Model 9', sorted by
        'Probability' in descending order.

    Raises:
        Exception: any error is logged, passed to ``handle_exception`` and
            re-raised.
    """
    logger.debug(f"Starting false positive analysis with threshold {threshold}")
    try:
        # Get predictions using the specified threshold
        # (column 1 of predict_proba; presumably the positive class for 0/1 labels)
        y_prob = rf.predict_proba(X_full)[:, 1]
        y_pred = (y_prob >= threshold).astype(int)

        # Find false positive indices (row positions)
        fp_indices = np.where((y_pred == 1) & (y_full == 0))[0]
        fp_total = len(fp_indices)
        fp_rows = df.iloc[fp_indices]

        # Create DataFrame with false positives
        fp_data = pd.DataFrame({
            'Actual': y_full.iloc[fp_indices],
            'Predicted': y_pred[fp_indices],
            'Probability': y_prob[fp_indices],
            'Source ID': fp_rows['source_id'],
            'Target ID': fp_rows['target_id'],
        })

        # Add model scores ('Model 9' is the TF-IDF score)
        score_columns = (
            'model1_score', 'model2_score', 'model3_score',
            'model4_score', 'model5_score', 'model6_score',
            'model7_score', 'model8_score', 'tfidf_score',
        )
        for number, column in enumerate(score_columns, start=1):
            fp_data[f'Model {number}'] = fp_rows[column]

        # Sort by probability
        fp_data = fp_data.sort_values('Probability', ascending=False)

        # Log statistics
        logger.info("False Positive Analysis Results:")
        logger.info(f"Total false positives: {fp_total}")
        # NOTE: this is the share of ALL samples, not FP / actual negatives.
        logger.info(f"False positive rate: {fp_total/len(y_full):.2%}")

        # Log probability distribution.
        # Bins start at the threshold actually used (every false positive has
        # probability >= threshold) and continue on the 0.1 grid up to 1.0, so
        # no case falls outside the table when threshold != 0.5.
        logger.info("Probability Distribution of False Positives:")
        lower = float(threshold)
        tenths = np.arange(0, 11) / 10.0
        edges = [lower] + [float(edge) for edge in tenths if edge > lower]
        if len(edges) < 2:
            # threshold >= 1.0: keep a single closed bin so p == threshold is counted
            edges.append(lower)
        fp_probs = fp_data['Probability']
        last_bin = len(edges) - 2
        for position, (low, high) in enumerate(zip(edges[:-1], edges[1:])):
            # The last bin is closed on the right so that p == 1.0 is included.
            below_high = (fp_probs <= high) if position == last_bin else (fp_probs < high)
            count = int(((fp_probs >= low) & below_high).sum())
            share = count / fp_total if fp_total else 0.0
            logger.info(f"Probability {low:.2f}-{high:.2f}: {count} cases ({share:.1%})")

        # Log model score statistics
        logger.info("Model Score Statistics for False Positives:")
        for i in range(1, 10):
            scores = fp_data[f'Model {i}']
            logger.info(f"Model {i}:")
            logger.info(f"  Mean: {scores.mean():.3f}")
            logger.info(f"  Std: {scores.std():.3f}")
            logger.info(f"  Min: {scores.min():.3f}")
            logger.info(f"  Max: {scores.max():.3f}")

        # Plot probability distribution
        logger.debug("Generating probability distribution plot")
        plt.figure(figsize=(10, 6))
        plt.hist(fp_data['Probability'], bins=20)
        plt.xlabel('Prediction Probability')
        plt.ylabel('Count')
        plt.title('Distribution of False Positive Probabilities')
        plt.grid(True)
        plt.show()

        logger.debug("False positive analysis completed successfully")
        return fp_data

    except Exception as e:
        logger.error("Error during false positive analysis", exc_info=True)
        handle_exception(e)
        raise

# Run the false positive analysis and log a handful of example cases.
try:
    logger.info("Starting false positive analysis")
    fp_analysis = analyze_false_positives(rf, X_full, y_full, threshold=0.3)

    # Take the example rows once so the text lookup and the log loop below
    # are guaranteed to describe the same cases.
    fp_examples = fp_analysis.head()

    # Get requirement texts for example cases
    source_texts, target_texts = get_requirement_texts(
        fp_examples['Source ID'].tolist(),
        fp_examples['Target ID'].tolist()
    )

    # Log example cases
    logger.info("Example False Positive Cases (Top 5 by probability):")
    # iterrows() yields the frame's index labels (row labels of the filtered,
    # sorted frame), not ranks, so the cases are numbered by position instead.
    for case_number, (_, row) in enumerate(fp_examples.iterrows(), start=1):
        logger.info(f"Case {case_number}:")
        logger.info(f"Source ID: {row['Source ID']}")
        logger.info(f"Target ID: {row['Target ID']}")
        logger.info(f"Prediction Probability: {row['Probability']:.3f}")
        for i in range(1, 10):
            logger.info(f"Model {i} Score: {row[f'Model {i}']:.3f}")

        # Log requirement texts at DEBUG level
        logger.debug("Requirement Texts:")
        logger.debug(f"Source Text: {source_texts.get(row['Source ID'], 'Not found')}")
        logger.debug(f"Target Text: {target_texts.get(row['Target ID'], 'Not found')}")

    logger.debug("False positive analysis completed")

except Exception as e:
    # Log with traceback, hand off to the project handler, then re-raise.
    logger.error("Failed to complete false positive analysis", exc_info=True)
    handle_exception(e)
    raise

In [None]:
# Cell [10] - Analyze True Positives
# Purpose: Analyze cases where model correctly identified positive relationships for validation
# Dependencies: pandas, numpy, logger, matplotlib
# Breadcrumbs: False Positive Detection -> Validation Analysis -> True Positive Analysis
def analyze_true_positives(rf, X_full, y_full, threshold=0.5):
    """
    Analyze all cases where model correctly predicted positive matches.

    Logs the number of true positives, their probability distribution and
    per-model score statistics, shows a histogram of their probabilities, and
    returns the cases as a DataFrame.

    Besides its arguments, this function reads the module-level ``df`` by
    position (``df.iloc``) to fetch IDs and model scores, so ``df`` is assumed
    to be row-aligned with ``X_full``/``y_full`` -- TODO confirm against the
    cell that builds them.

    Args:
        rf: Random Forest model (must provide ``predict_proba``)
        X_full: Full feature set
        y_full: Full ground truth labels (indexed with ``.iloc``)
        threshold: Classification threshold (default 0.5)

    Returns:
        DataFrame with columns 'Actual', 'Predicted', 'Probability',
        'Source ID', 'Target ID' and 'Model 1'..'Model 9', sorted by
        'Probability' in descending order.

    Raises:
        Exception: any error is logged, passed to ``handle_exception`` and
            re-raised.
    """
    logger.debug(f"Starting true positive analysis with threshold {threshold}")
    try:
        # Get predictions using the specified threshold
        # (column 1 of predict_proba; presumably the positive class for 0/1 labels)
        y_prob = rf.predict_proba(X_full)[:, 1]
        y_pred = (y_prob >= threshold).astype(int)

        # Find true positive indices (row positions)
        tp_indices = np.where((y_pred == 1) & (y_full == 1))[0]
        tp_total = len(tp_indices)
        tp_rows = df.iloc[tp_indices]

        # Create DataFrame with true positives
        tp_data = pd.DataFrame({
            'Actual': y_full.iloc[tp_indices],
            'Predicted': y_pred[tp_indices],
            'Probability': y_prob[tp_indices],
            'Source ID': tp_rows['source_id'],
            'Target ID': tp_rows['target_id'],
        })

        # Add model scores ('Model 9' is the TF-IDF score)
        score_columns = (
            'model1_score', 'model2_score', 'model3_score',
            'model4_score', 'model5_score', 'model6_score',
            'model7_score', 'model8_score', 'tfidf_score',
        )
        for number, column in enumerate(score_columns, start=1):
            tp_data[f'Model {number}'] = tp_rows[column]

        # Sort by probability
        tp_data = tp_data.sort_values('Probability', ascending=False)

        # Log statistics
        logger.info("True Positive Analysis Results:")
        logger.info(f"Total true positives: {tp_total}")
        # NOTE: this is the share of ALL samples, not TP / actual positives.
        logger.info(f"True positive rate: {tp_total/len(y_full):.2%}")

        # Log probability distribution.
        # Bins start at the threshold actually used (every true positive has
        # probability >= threshold) and continue on the 0.1 grid up to 1.0, so
        # no case falls outside the table when threshold != 0.5.
        logger.info("Probability Distribution of True Positives:")
        lower = float(threshold)
        tenths = np.arange(0, 11) / 10.0
        edges = [lower] + [float(edge) for edge in tenths if edge > lower]
        if len(edges) < 2:
            # threshold >= 1.0: keep a single closed bin so p == threshold is counted
            edges.append(lower)
        tp_probs = tp_data['Probability']
        last_bin = len(edges) - 2
        for position, (low, high) in enumerate(zip(edges[:-1], edges[1:])):
            # The last bin is closed on the right so that p == 1.0 is included.
            below_high = (tp_probs <= high) if position == last_bin else (tp_probs < high)
            count = int(((tp_probs >= low) & below_high).sum())
            share = count / tp_total if tp_total else 0.0
            logger.info(f"Probability {low:.2f}-{high:.2f}: {count} cases ({share:.1%})")

        # Log model score statistics
        logger.info("Model Score Statistics for True Positives:")
        for i in range(1, 10):
            scores = tp_data[f'Model {i}']
            logger.info(f"Model {i}:")
            logger.info(f"  Mean: {scores.mean():.3f}")
            logger.info(f"  Std: {scores.std():.3f}")
            logger.info(f"  Min: {scores.min():.3f}")
            logger.info(f"  Max: {scores.max():.3f}")

        # Plot probability distribution
        logger.debug("Generating probability distribution plot")
        plt.figure(figsize=(10, 6))
        plt.hist(tp_data['Probability'], bins=20)
        plt.xlabel('Prediction Probability')
        plt.ylabel('Count')
        plt.title('Distribution of True Positive Probabilities')
        plt.grid(True)
        plt.show()

        logger.debug("True positive analysis completed successfully")
        return tp_data

    except Exception as e:
        logger.error("Error during true positive analysis", exc_info=True)
        handle_exception(e)
        raise

# Run the true positive analysis and log a handful of example cases.
try:
    logger.info("Starting true positive analysis")
    tp_analysis = analyze_true_positives(rf, X_full, y_full, threshold=0.3)

    # Take the example rows once so the text lookup and the log loop below
    # are guaranteed to describe the same cases.
    tp_examples = tp_analysis.head()

    # Get requirement texts for example cases
    source_texts, target_texts = get_requirement_texts(
        tp_examples['Source ID'].tolist(),
        tp_examples['Target ID'].tolist()
    )

    # Log example cases
    logger.info("Example True Positive Cases (Top 5 by probability):")
    # iterrows() yields the frame's index labels (row labels of the filtered,
    # sorted frame), not ranks, so the cases are numbered by position instead.
    for case_number, (_, row) in enumerate(tp_examples.iterrows(), start=1):
        logger.info(f"Case {case_number}:")
        logger.info(f"Source ID: {row['Source ID']}")
        logger.info(f"Target ID: {row['Target ID']}")
        logger.info(f"Prediction Probability: {row['Probability']:.3f}")
        for i in range(1, 10):
            logger.info(f"Model {i} Score: {row[f'Model {i}']:.3f}")

        # Log requirement texts at DEBUG level
        logger.debug("Requirement Texts:")
        logger.debug(f"Source Text: {source_texts.get(row['Source ID'], 'Not found')}")
        logger.debug(f"Target Text: {target_texts.get(row['Target ID'], 'Not found')}")

    logger.debug("True positive analysis completed")

except Exception as e:
    # Log with traceback, hand off to the project handler, then re-raise.
    logger.error("Failed to complete true positive analysis", exc_info=True)
    handle_exception(e)
    raise

In [None]:
# Cell [11] - Analyze True Negatives
# Purpose: Analyze cases where model correctly identified negative relationships for completeness
# Dependencies: pandas, numpy, logger, matplotlib
# Breadcrumbs: True Positive Analysis -> Validation Analysis -> True Negative Analysis
def analyze_true_negatives(rf, X_full, y_full, threshold=0.5):
    """
    Analyze all cases where model correctly predicted negative matches.

    Logs the number of true negatives, their probability distribution and
    per-model score statistics, shows a histogram of their probabilities, and
    returns the cases as a DataFrame.

    Besides its arguments, this function reads the module-level ``df`` by
    position (``df.iloc``) to fetch IDs and model scores, so ``df`` is assumed
    to be row-aligned with ``X_full``/``y_full`` -- TODO confirm against the
    cell that builds them.

    Args:
        rf: Random Forest model (must provide ``predict_proba``)
        X_full: Full feature set
        y_full: Full ground truth labels (indexed with ``.iloc``)
        threshold: Classification threshold (default 0.5)

    Returns:
        DataFrame with columns 'Actual', 'Predicted', 'Probability',
        'Source ID', 'Target ID' and 'Model 1'..'Model 9', sorted by
        'Probability' in descending order.

    Raises:
        Exception: any error is logged, passed to ``handle_exception`` and
            re-raised.
    """
    logger.debug(f"Starting true negative analysis with threshold {threshold}")
    try:
        # Get predictions using the specified threshold
        # (column 1 of predict_proba; presumably the positive class for 0/1 labels)
        y_prob = rf.predict_proba(X_full)[:, 1]
        y_pred = (y_prob >= threshold).astype(int)

        # Find true negative indices (row positions)
        tn_indices = np.where((y_pred == 0) & (y_full == 0))[0]
        tn_total = len(tn_indices)
        tn_rows = df.iloc[tn_indices]

        # Create DataFrame with true negatives
        tn_data = pd.DataFrame({
            'Actual': y_full.iloc[tn_indices],
            'Predicted': y_pred[tn_indices],
            'Probability': y_prob[tn_indices],
            'Source ID': tn_rows['source_id'],
            'Target ID': tn_rows['target_id'],
        })

        # Add model scores ('Model 9' is the TF-IDF score)
        score_columns = (
            'model1_score', 'model2_score', 'model3_score',
            'model4_score', 'model5_score', 'model6_score',
            'model7_score', 'model8_score', 'tfidf_score',
        )
        for number, column in enumerate(score_columns, start=1):
            tn_data[f'Model {number}'] = tn_rows[column]

        # Sort by probability
        tn_data = tn_data.sort_values('Probability', ascending=False)

        # Log statistics
        logger.info("\nTrue Negative Analysis Results:")
        logger.info(f"Total true negatives: {tn_total}")
        # NOTE: this is the share of ALL samples, not TN / actual negatives.
        logger.info(f"True negative rate: {tn_total/len(y_full):.2%}")

        # Log probability distribution.
        # Every true negative has probability < threshold, so the bins run on
        # the 0.1 grid from 0.0 up to the threshold actually used. This avoids
        # both always-empty bins (threshold < 0.5) and uncounted cases
        # (threshold > 0.5). Half-open bins [low, high) are sufficient here.
        logger.info("\nProbability Distribution of True Negatives:")
        upper = float(threshold)
        tenths = np.arange(0, 11) / 10.0
        edges = [float(edge) for edge in tenths if edge < upper] + [upper]
        tn_probs = tn_data['Probability']
        for low, high in zip(edges[:-1], edges[1:]):
            count = int(((tn_probs >= low) & (tn_probs < high)).sum())
            share = count / tn_total if tn_total else 0.0
            logger.info(f"Probability {low:.2f}-{high:.2f}: {count} cases ({share:.1%})")

        # Log model score statistics
        logger.info("\nModel Score Statistics for True Negatives:")
        for i in range(1, 10):
            scores = tn_data[f'Model {i}']
            logger.info(f"Model {i}:")
            logger.info(f"  Mean: {scores.mean():.3f}")
            logger.info(f"  Std: {scores.std():.3f}")
            logger.info(f"  Min: {scores.min():.3f}")
            logger.info(f"  Max: {scores.max():.3f}")

        # Plot probability distribution
        logger.debug("Generating probability distribution plot")
        plt.figure(figsize=(10, 6))
        plt.hist(tn_data['Probability'], bins=20)
        plt.xlabel('Prediction Probability')
        plt.ylabel('Count')
        plt.title('Distribution of True Negative Probabilities')
        plt.grid(True)
        plt.show()

        logger.debug("True negative analysis completed successfully")
        return tn_data

    except Exception as e:
        logger.error("Error during true negative analysis", exc_info=True)
        handle_exception(e)
        raise

# Run the true negative analysis and log a handful of example cases.
try:
    logger.info("Starting true negative analysis")
    tn_analysis = analyze_true_negatives(rf, X_full, y_full, threshold=0.3)

    # Take the example rows once so the text lookup and the log loop below
    # are guaranteed to describe the same cases.
    tn_examples = tn_analysis.head()

    # Get requirement texts for example cases
    source_texts, target_texts = get_requirement_texts(
        tn_examples['Source ID'].tolist(),
        tn_examples['Target ID'].tolist()
    )

    # Log example cases
    logger.info("\nExample True Negative Cases (Top 5 by probability):")
    # iterrows() yields the frame's index labels (row labels of the filtered,
    # sorted frame), not ranks, so the cases are numbered by position instead.
    for case_number, (_, row) in enumerate(tn_examples.iterrows(), start=1):
        logger.info(f"\nCase {case_number}:")
        logger.info(f"Source ID: {row['Source ID']}")
        logger.info(f"Target ID: {row['Target ID']}")
        logger.info(f"Prediction Probability: {row['Probability']:.3f}")
        for i in range(1, 10):
            logger.info(f"Model {i} Score: {row[f'Model {i}']:.3f}")

        # Log requirement texts at DEBUG level
        logger.debug("\nRequirement Texts:")
        logger.debug(f"Source Text: {source_texts.get(row['Source ID'], 'Not found')}")
        logger.debug(f"Target Text: {target_texts.get(row['Target ID'], 'Not found')}")

    logger.debug("True negative analysis completed")

except Exception as e:
    # Log with traceback, hand off to the project handler, then re-raise.
    logger.error("Failed to complete true negative analysis", exc_info=True)
    handle_exception(e)
    raise

In [None]:
# Cell [12] - Impact Analysis on Project Effort Estimation
# Purpose: Evaluate complexity metrics and effort estimation impact based on prediction categories
# Dependencies: pandas, numpy, logger
# Breadcrumbs: True Negative Analysis -> Business Impact -> Effort Estimation
def analyze_requirement_complexity(df_analysis, category):
    """
    Analyze complexity metrics for requirements in different prediction categories
    (TP, FP, FN, or TN) to understand potential impact on effort estimation.

    Side effect: adds 'Source Text' and 'Target Text' columns to the
    ``df_analysis`` frame that was passed in (it is modified in place).

    Args:
        df_analysis: DataFrame containing the analysis results; must provide
            'Source ID', 'Target ID', 'Model 1', 'Model 2' and 'Model 9'
        category: String label for the category being analyzed; only used in
            log messages

    Returns:
        dict with keys 'avg_text_length' (source/target mean character counts),
        'requirement_count', 'avg_similarity_scores' and
        'similarity_score_variance' (each with model1/model2/tfidf entries).

    Raises:
        Exception: any error is logged, passed to ``handle_exception`` and
            re-raised.
    """
    logger.debug(f"Starting complexity analysis for {category} requirements")
    try:
        # Get requirement texts for analysis
        source_texts, target_texts = get_requirement_texts(
            df_analysis['Source ID'].tolist(),
            df_analysis['Target ID'].tolist()
        )

        # Add texts to dataframe (IDs without a text entry map to NaN)
        df_analysis['Source Text'] = df_analysis['Source ID'].map(source_texts)
        df_analysis['Target Text'] = df_analysis['Target ID'].map(target_texts)

        # Calculate complexity metrics
        metrics = {
            'avg_text_length': {
                'source': df_analysis['Source Text'].str.len().mean(),
                'target': df_analysis['Target Text'].str.len().mean()
            },
            'requirement_count': len(df_analysis),
            'avg_similarity_scores': {
                'model1': df_analysis['Model 1'].mean(),
                'model2': df_analysis['Model 2'].mean(),
                'tfidf': df_analysis['Model 9'].mean()  # TFIDF is Model 9
            },
            'similarity_score_variance': {
                'model1': df_analysis['Model 1'].var(),
                'model2': df_analysis['Model 2'].var(),
                'tfidf': df_analysis['Model 9'].var()
            }
        }

        # Log analysis results
        logger.info(f"=== {category} Impact Analysis ===")
        logger.info(f"Number of Requirements: {metrics['requirement_count']}")

        logger.info("Text Length Analysis:")
        logger.info(f"Average Source Text Length: {metrics['avg_text_length']['source']:.1f} characters")
        logger.info(f"Average Target Text Length: {metrics['avg_text_length']['target']:.1f} characters")

        logger.info("Similarity Score Analysis:")
        logger.info("Average Scores:")
        logger.info(f"  Model 1: {metrics['avg_similarity_scores']['model1']:.3f}")
        logger.info(f"  Model 2: {metrics['avg_similarity_scores']['model2']:.3f}")
        logger.info(f"  TF-IDF: {metrics['avg_similarity_scores']['tfidf']:.3f}")

        logger.info("Score Variance:")
        logger.info(f"  Model 1: {metrics['similarity_score_variance']['model1']:.3f}")
        logger.info(f"  Model 2: {metrics['similarity_score_variance']['model2']:.3f}")
        logger.info(f"  TF-IDF: {metrics['similarity_score_variance']['tfidf']:.3f}")

        # Log example requirements at DEBUG level
        logger.debug("Example Requirements (first 3):")
        # iterrows() yields index labels, which need not be 0, 1, 2 for a
        # filtered/sorted frame, so the pairs are numbered by position instead.
        for pair_number, (_, row) in enumerate(df_analysis.head(3).iterrows(), start=1):
            logger.debug(f"Requirement Pair {pair_number}:")
            logger.debug(f"Source ID: {row['Source ID']}")
            logger.debug(f"Source Text: {row['Source Text']}")
            logger.debug(f"Target ID: {row['Target ID']}")
            logger.debug(f"Target Text: {row['Target Text']}")
            logger.debug("Similarity Scores:")
            logger.debug(f"  Model 1: {row['Model 1']:.3f}")
            logger.debug(f"  Model 2: {row['Model 2']:.3f}")
            logger.debug(f"  TF-IDF: {row['Model 9']:.3f}")

        return metrics

    except Exception as e:
        logger.error(f"Error during complexity analysis for {category}", exc_info=True)
        handle_exception(e)
        raise

# Compute complexity metrics for TP, FP and FN cases, then log them side by side.
try:
    logger.info("Starting impact analysis on project effort estimation")

    # True positives
    logger.debug("Analyzing true positive cases")
    tp_metrics = analyze_requirement_complexity(tp_analysis, "True Positives")

    # False positives
    logger.debug("Analyzing false positive cases")
    fp_metrics = analyze_requirement_complexity(fp_analysis, "False Positives")

    # False negatives
    logger.debug("Analyzing false negative cases")
    fn_metrics = analyze_requirement_complexity(fn_analysis, "False Negatives")

    # Pair every category label with its metrics once; both comparison
    # sections below walk the same pairs in the same order.
    categories = ["True Positives", "False Positives", "False Negatives"]
    metrics = [tp_metrics, fp_metrics, fn_metrics]
    labelled_metrics = list(zip(categories, metrics))

    logger.info("Comparative Analysis:")

    # Section 1: average text lengths
    logger.info("Average Text Length Comparison:")
    for cat, met in labelled_metrics:
        text_lengths = met['avg_text_length']
        logger.info(f"{cat}:")
        logger.info(f"  Source Text: {text_lengths['source']:.1f} characters")
        logger.info(f"  Target Text: {text_lengths['target']:.1f} characters")

    # Section 2: average similarity scores
    logger.info("Average Similarity Score Comparison:")
    for cat, met in labelled_metrics:
        mean_scores = met['avg_similarity_scores']
        logger.info(f"{cat}:")
        logger.info(f"  Model 1: {mean_scores['model1']:.3f}")
        logger.info(f"  Model 2: {mean_scores['model2']:.3f}")
        logger.info(f"  TF-IDF: {mean_scores['tfidf']:.3f}")

    logger.debug("Impact analysis completed")

except Exception as e:
    # Log with traceback, hand off to the project handler, then re-raise.
    logger.error("Failed to complete impact analysis", exc_info=True)
    handle_exception(e)
    raise

In [None]:
# Cell [13] - Scatter Plot Analysis
# Purpose: Create comprehensive scatter plots and density visualizations for model comparison
# Dependencies: matplotlib, seaborn, numpy, pandas, logger
# Breadcrumbs: Effort Estimation -> Visualization -> Scatter Plot Analysis
def create_scatter_plots():
    """
    Draw a 2x2 grid of scatter plots comparing pairs of model scores, with one
    marker series per prediction category (TN, FP, FN, TP).

    Reads the module-level ``tn_analysis``, ``fp_analysis``, ``fn_analysis``
    and ``tp_analysis`` frames. Categories are drawn in the order listed, so
    TP and FN end up on the top layers. Per-category point counts, means and
    correlations are logged; the figure is saved to
    ``similarity_scatter_plots.png`` (best effort) and then shown.
    """
    logger.debug("Starting scatter plot analysis")

    def complete_pairs(frame, x_col, y_col):
        # Boolean mask of rows where neither of the two scores is NaN.
        return ~(np.isnan(frame[x_col]) | np.isnan(frame[y_col]))

    try:
        # Insertion order is the draw order: later entries are painted on top.
        categories = {
            'True Negative': (tn_analysis, 'blue', 'x'),
            'False Positive': (fp_analysis, 'grey', '^'),
            'False Negative': (fn_analysis, 'red', 's'),
            'True Positive': (tp_analysis, 'green', 'o')
        }

        fig, axes = plt.subplots(2, 2, figsize=(20, 20))
        fig.suptitle('Similarity Score Distribution by Prediction Category', fontsize=16)

        # One entry per subplot; 'Model 9' is the TF-IDF score.
        plot_configs = [
            {'x': 'Model 1', 'y': 'Model 2', 'title': 'Model1 vs Model2', 'pos': (0, 0)},
            {'x': 'Model 1', 'y': 'Model 9', 'title': 'Model1 vs TF-IDF', 'pos': (0, 1)},
            {'x': 'Model 2', 'y': 'Model 9', 'title': 'Model2 vs TF-IDF', 'pos': (1, 0)},
            {'x': 'Model 3', 'y': 'Model 4', 'title': 'Model3 vs Model4', 'pos': (1, 1)},
        ]

        logger.debug("Creating scatter plots for model comparisons")
        for config in plot_configs:
            x_col = config['x']
            y_col = config['y']
            title = config['title']
            grid_row, grid_col = config['pos']

            logger.debug(f"Creating plot for {title}")
            ax = axes[grid_row, grid_col]

            # Pass 1: draw every category and log its basic statistics.
            for category, (data, color, marker) in categories.items():
                keep = complete_pairs(data, x_col, y_col)
                if keep.sum() == 0:
                    logger.warning(f"No valid data points for {category} in {title}")
                    continue

                valid_data = data[keep]
                point_count = len(valid_data)
                logger.debug(f"Adding {point_count} {category} data points")

                ax.scatter(
                    valid_data[x_col],
                    valid_data[y_col],
                    c=color,
                    marker=marker,
                    label=category,
                    alpha=0.6,
                    s=50  # marker size
                )

                logger.info(f"Statistics for {category} in {title}:")
                logger.info(f"  Number of points: {point_count}")
                logger.info(f"  {x_col} mean: {valid_data[x_col].mean():.3f}")
                logger.info(f"  {y_col} mean: {valid_data[y_col].mean():.3f}")

            # Axis cosmetics
            ax.set_xlabel(x_col)
            ax.set_ylabel(y_col)
            ax.set_title(title)
            ax.grid(True, linestyle='--', alpha=0.7)
            ax.legend()

            # Pass 2: log the per-category correlation (nothing is drawn here).
            for category, (data, _, _) in categories.items():
                keep = complete_pairs(data, x_col, y_col)
                if keep.sum() > 1:  # a correlation needs at least two points
                    valid_data = data[keep]
                    correlation = valid_data[x_col].corr(valid_data[y_col])
                    logger.info(f"Correlation for {category} between {x_col} and {y_col}: {correlation:.3f}")
                else:
                    logger.warning(f"Insufficient data for correlation calculation in {category}")

        plt.tight_layout()
        logger.debug("Scatter plots created successfully")

        # Saving is best effort: a failure is logged and the plot is still shown.
        try:
            plot_path = "similarity_scatter_plots.png"
            plt.savefig(plot_path)
            logger.info(f"Scatter plots saved to {plot_path}")
        except Exception as save_error:
            logger.warning(f"Could not save plot to file: {str(save_error)}")

        plt.show()

    except Exception as e:
        logger.error("Error creating scatter plots", exc_info=True)
        handle_exception(e)
        raise

def create_density_plots():
    """
    Draw a 3x3 grid of kernel density plots, one subplot per model score
    ('Model 1'..'Model 9'), with one curve per prediction category.

    Reads the module-level ``tp_analysis``, ``fp_analysis``, ``fn_analysis``
    and ``tn_analysis`` frames. Categories with fewer than two values or zero
    variance for a model are skipped with a warning. The figure is saved to
    ``similarity_density_plots.png`` (best effort) and then shown.
    """
    logger.debug("Starting density plot analysis")
    try:
        fig, axes = plt.subplots(3, 3, figsize=(20, 20))
        fig.suptitle('Score Distribution Density by Model and Category', fontsize=16)

        # 'Model 1' .. 'Model 9' (Model 9 is TFIDF)
        model_names = [f'Model {number}' for number in range(1, 10)]

        categories = {
            'True Positive': (tp_analysis, 'green'),
            'False Positive': (fp_analysis, 'grey'),
            'False Negative': (fn_analysis, 'red'),
            'True Negative': (tn_analysis, 'blue')
        }

        for idx, model in enumerate(model_names):
            logger.debug(f"Creating density plot for {model}")
            grid_row, grid_col = divmod(idx, 3)  # fill the grid row by row
            ax = axes[grid_row, grid_col]

            for category, (data, color) in categories.items():
                valid_data = data[model].dropna()

                # A density estimate needs at least two distinct values.
                if len(valid_data) < 2:
                    logger.warning(f"Insufficient data for {category} in {model}")
                    continue
                if valid_data.var() == 0:
                    logger.warning(f"Zero variance in {category} for {model}")
                    continue

                try:
                    sns.kdeplot(
                        data=valid_data,
                        ax=ax,
                        label=category,
                        color=color,
                        warn_singular=False  # Suppress singular matrix warning
                    )

                    logger.info(f"Statistics for {category} in {model}:")
                    logger.info(f"  Mean: {valid_data.mean():.3f}")
                    logger.info(f"  Std: {valid_data.std():.3f}")
                    logger.info(f"  Min: {valid_data.min():.3f}")
                    logger.info(f"  Max: {valid_data.max():.3f}")
                except Exception as plot_error:
                    # One failing curve must not abort the rest of the grid.
                    logger.warning(f"Could not create density plot for {category} in {model}: {str(plot_error)}")

            ax.set_title(f"{model} Score Distribution")
            ax.grid(True, linestyle='--', alpha=0.7)
            ax.legend()

        plt.tight_layout()
        logger.debug("Density plots created successfully")

        # Saving is best effort: a failure is logged and the plot is still shown.
        try:
            plot_path = "similarity_density_plots.png"
            plt.savefig(plot_path)
            logger.info(f"Density plots saved to {plot_path}")
        except Exception as save_error:
            logger.warning(f"Could not save plot to file: {str(save_error)}")

        plt.show()

    except Exception as e:
        logger.error("Error creating density plots", exc_info=True)
        handle_exception(e)
        raise

# Build the scatter plots first, then the density plots.
try:
    logger.info("Starting visualization analysis")
    plot_steps = (
        ("Creating scatter plots...", create_scatter_plots),
        ("Creating density plots...", create_density_plots),
    )
    for announcement, build_plots in plot_steps:
        logger.info(announcement)
        build_plots()
    logger.info("Visualization analysis completed")
except Exception as viz_error:
    # Log with traceback, hand off to the project handler, then re-raise.
    logger.error("Failed to complete visualization analysis", exc_info=True)
    handle_exception(viz_error)
    raise

In [None]:
# Cell [14] - Score Transformation Analysis
# Purpose: Apply and analyze various mathematical transformations on similarity scores
# Dependencies: numpy, pandas, matplotlib, logger
# Breadcrumbs: Scatter Plot Analysis -> Data Transformation -> Score Transformation
def apply_transformations(scores):
    """
    Compute six element-wise transformations of the similarity scores.

    Args:
        scores: numpy array of similarity scores
    Returns:
        dict: transformed scores keyed, in this order, by 'log', 'exp',
        'squared', 'cubic', 'sqrt' and 'sigmoid'
    """
    logger.debug("Applying score transformations")
    try:
        log_scores = np.log1p(scores)  # log(1 + x), so zeros are handled
        exp_scores = np.exp(scores) - 1  # shifted so that 0 maps to 0
        squared_scores = np.square(scores)
        cubic_scores = np.power(scores, 3)
        sqrt_scores = np.sqrt(scores)
        # Sigmoid centred on 0.5 with slope factor 10
        sigmoid_scores = 1 / (1 + np.exp(-10 * (scores - 0.5)))

        transformations = dict(
            log=log_scores,
            exp=exp_scores,
            squared=squared_scores,
            cubic=cubic_scores,
            sqrt=sqrt_scores,
            sigmoid=sigmoid_scores,
        )

        # Summary statistics per transformation, at DEBUG level only
        for label, values in transformations.items():
            heading = label.capitalize()
            logger.debug(f"{heading} transformation stats:")
            logger.debug(f"  Mean: {values.mean():.3f}")
            logger.debug(f"  Std: {values.std():.3f}")
            logger.debug(f"  Range: [{values.min():.3f}, {values.max():.3f}]")

        return transformations

    except Exception as e:
        logger.error("Error applying transformations", exc_info=True)
        handle_exception(e)
        raise

def plot_transformed_scores(model_column='Model 1'):
    """
    Plot original vs transformed scores for different categories.

    Collects one score column from the module-level ``tp_analysis``,
    ``fp_analysis``, ``fn_analysis`` and ``tn_analysis`` frames, applies every
    transformation from ``apply_transformations`` and draws one subplot per
    transformation (2x3 grid), colored by prediction category. The correlation
    between original and transformed scores is logged per category. The figure
    is saved to ``score_transformations.png`` (best effort) and then shown.

    Args:
        model_column: Name of the score column to analyze (default 'Model 1')

    Returns:
        None. If no category has any valid score, a warning is logged and the
        function returns without plotting.

    Raises:
        Exception: any error is logged, passed to ``handle_exception`` and
            re-raised.
    """
    logger.debug("Starting transformed scores visualization")
    try:
        # Combine all datasets with their categories
        data_sources = {
            'True Positive': tp_analysis[model_column],
            'False Positive': fp_analysis[model_column],
            'False Negative': fn_analysis[model_column],
            'True Negative': tn_analysis[model_column]
        }

        # Named 'category_frames' rather than 'df' so the notebook-wide 'df'
        # is not shadowed inside this function.
        category_frames = []
        for category, scores in data_sources.items():
            valid_scores = scores.dropna()
            if len(valid_scores) > 0:
                category_frames.append(
                    pd.DataFrame({'score': valid_scores, 'category': category})
                )
            else:
                logger.warning(f"No valid scores for {category}")

        # pd.concat raises on an empty list; report the real cause instead.
        if not category_frames:
            logger.warning("No valid scores in any category; skipping transformation plots")
            return

        # ignore_index gives a unique 0..n-1 index, so the boolean masks below
        # line up position by position with the transformed numpy arrays.
        all_data = pd.concat(category_frames, ignore_index=True)
        logger.info(f"Total samples for transformation analysis: {len(all_data)}")

        # Create subplots for each transformation
        fig, axes = plt.subplots(2, 3, figsize=(20, 15))
        fig.suptitle('Score Transformations by Category', fontsize=16)

        # Color mapping for categories
        colors = {
            'True Positive': 'green',
            'False Positive': 'grey',
            'False Negative': 'red',
            'True Negative': 'blue'
        }

        # Plot each transformation
        transformations = apply_transformations(all_data['score'].values)
        for (name, transformed), ax in zip(transformations.items(), axes.flat):
            logger.debug(f"Creating plot for {name} transformation")

            for category, color in colors.items():
                category_mask = (all_data['category'] == category).to_numpy()
                point_count = int(category_mask.sum())
                if point_count == 0:
                    continue

                original = all_data.loc[category_mask, 'score']
                transformed_scores = transformed[category_mask]

                ax.scatter(
                    original,
                    transformed_scores,
                    c=color,
                    label=category,
                    alpha=0.6,
                    s=50
                )

                # Log correlation between original and transformed scores;
                # a correlation needs at least two points.
                if point_count > 1:
                    correlation = np.corrcoef(original, transformed_scores)[0, 1]
                    logger.info(f"Correlation for {category} with {name} transformation: {correlation:.3f}")
                else:
                    logger.warning(f"Insufficient data for correlation calculation in {category}")

            ax.set_xlabel('Original Score')
            ax.set_ylabel(f'{name.capitalize()} Score')
            ax.set_title(f'{name.capitalize()} Transformation')
            ax.grid(True, linestyle='--', alpha=0.7)
            ax.legend()

        plt.tight_layout()
        logger.debug("Transformation plots created successfully")

        # Saving is best effort: a failure is logged and the plot is still shown.
        try:
            plot_path = "score_transformations.png"
            plt.savefig(plot_path)
            logger.info(f"Transformation plots saved to {plot_path}")
        except Exception as save_error:
            logger.warning(f"Could not save plot to file: {str(save_error)}")

        plt.show()

    except Exception as e:
        logger.error("Error creating transformation plots", exc_info=True)
        handle_exception(e)
        raise

# Run the score transformation plots for this cell.
try:
    logger.info("Starting score transformation analysis")
    plot_transformed_scores()
    logger.info("Score transformation analysis completed")
except Exception as transform_error:
    # Log with traceback, hand off to the project handler, then re-raise.
    logger.error(
        "Failed to complete transformation analysis",
        exc_info=True,
    )
    handle_exception(transform_error)
    raise

In [None]:
# Cell [15] - Log Scale Model Comparison
# Purpose: Create logarithmic scale visualizations for enhanced model comparison and analysis
# Dependencies: matplotlib, seaborn, numpy, pandas, logger
# Breadcrumbs: Score Transformation -> Advanced Visualization -> Log Scale Analysis
def _scatter_log_scores(ax, categories, log_fn, scale_name, range_label, warn_on_skip=False):
    """
    Scatter log-transformed 'Model 1' vs 'Model 2' scores for each category.

    Rows are kept only when both scores are strictly positive, because the
    logarithm is undefined otherwise. Categories with no data, or with no
    strictly positive score pairs, are skipped.

    Args:
        ax: Matplotlib axes to draw on.
        categories: Mapping of category label -> (data, color, marker), where
            data is indexed by the 'Model 1' and 'Model 2' columns
            (presumably a pandas DataFrame - confirm against the producer).
        log_fn: Element-wise logarithm applied to both columns
            (e.g. np.log10 or np.log).
        scale_name: Prefix used in the per-category statistics log line.
        range_label: Scale name used in the per-model range log lines.
        warn_on_skip: When True, log a warning for every skipped category.
            Only one caller should enable this so the same warning is not
            repeated for every subplot.
    """
    for category, (data, color, marker) in categories.items():
        if len(data) == 0:
            if warn_on_skip:
                logger.warning(f"No data available for {category}")
            continue

        valid_mask = (data['Model 1'] > 0) & (data['Model 2'] > 0)
        if not valid_mask.any():
            if warn_on_skip:
                logger.warning(f"No valid positive scores for {category}")
            continue

        valid_data = data[valid_mask]
        # Transform once and reuse for both the scatter and the range logging
        log_model_1 = log_fn(valid_data['Model 1'])
        log_model_2 = log_fn(valid_data['Model 2'])

        ax.scatter(
            log_model_1,
            log_model_2,
            c=color,
            marker=marker,
            label=category,
            alpha=0.6
        )
        logger.info(f"{scale_name} scale statistics for {category}:")
        logger.info(f"  Points plotted: {len(valid_data)}")
        logger.info(f"  Model 1 {range_label} range: [{log_model_1.min():.3f}, {log_model_1.max():.3f}]")
        logger.info(f"  Model 2 {range_label} range: [{log_model_2.min():.3f}, {log_model_2.max():.3f}]")


def _plot_log10_density(ax, categories, model_column):
    """
    Draw one log10-score KDE curve per category for a single model column.

    Args:
        ax: Matplotlib axes to draw on.
        categories: Mapping of category label -> (data, color, marker); the
            marker is not used for density curves.
        model_column: Name of the score column to plot ('Model 1' or 'Model 2').
    """
    for category, (data, color, _) in categories.items():
        # Same empty-input guard as the scatter plots, so an empty category
        # never reaches the column lookup below
        if len(data) == 0:
            continue

        valid_data = data[data[model_column] > 0]
        if len(valid_data) == 0:
            continue

        try:
            sns.kdeplot(
                data=np.log10(valid_data[model_column]),
                ax=ax,
                label=category,
                color=color
            )
            logger.debug(f"Created density plot for {category} {model_column}")
        except Exception as e:
            # Best effort: a failed curve for one category must not abort the figure
            logger.warning(f"Could not create density plot for {category} {model_column}: {str(e)}")


def _finalize_axes(ax, title, xlabel, ylabel):
    """
    Apply title, axis labels and grid to ``ax``; add a legend if anything labelled was drawn.
    """
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid(True, linestyle='--', alpha=0.7)
    # Calling legend() on axes without labelled artists only produces a warning
    handles, _ = ax.get_legend_handles_labels()
    if handles:
        ax.legend()


def create_log_scale_comparisons():
    """
    Create log-scale visualizations comparing different models

    Builds a 2x2 figure from the module-level tn_analysis, fp_analysis,
    fn_analysis and tp_analysis data:
        - top left: log10 scatter of 'Model 1' vs 'Model 2' scores
        - top right: natural-log scatter of the same scores
        - bottom left: log10 score density for 'Model 1'
        - bottom right: log10 score density for 'Model 2'

    The figure is saved to "log_scale_comparisons.png" (a failed save is
    logged as a warning and does not abort) and then shown.

    Raises:
        Exception: Any error raised while building the figure is logged,
            passed to handle_exception, and re-raised.
    """
    logger.debug("Starting log scale comparison analysis")
    try:
        # Create figure with multiple subplots
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(20, 16))
        fig.suptitle('Model Comparisons with Different Scale Transformations', fontsize=16)

        categories = {
            'True Negative': (tn_analysis, 'blue', 'x'),
            'False Positive': (fp_analysis, 'grey', '^'),
            'False Negative': (fn_analysis, 'red', 's'),
            'True Positive': (tp_analysis, 'green', 'o')
        }

        logger.debug("Creating log10 scale comparison plot")
        # Plot 1: Log scale scatter (the only plot that reports skipped categories)
        _scatter_log_scores(ax1, categories, np.log10, "Log10", "log10", warn_on_skip=True)
        _finalize_axes(ax1, 'Log10 Scale Comparison', 'Log10(Model 1 Score)', 'Log10(Model 2 Score)')

        logger.debug("Creating natural log scale comparison plot")
        # Plot 2: Natural log scale scatter
        _scatter_log_scores(ax2, categories, np.log, "Natural log", "ln")
        _finalize_axes(ax2, 'Natural Log Scale Comparison', 'ln(Model 1 Score)', 'ln(Model 2 Score)')

        logger.debug("Creating density plots")
        # Plot 3: Log-scaled density plot for Model 1
        _plot_log10_density(ax3, categories, 'Model 1')
        _finalize_axes(ax3, 'Log10 Score Distribution - Model 1', 'Log10(Score)', 'Density')

        # Plot 4: Log-scaled density plot for Model 2
        _plot_log10_density(ax4, categories, 'Model 2')
        _finalize_axes(ax4, 'Log10 Score Distribution - Model 2', 'Log10(Score)', 'Density')

        plt.tight_layout()
        logger.debug("All plots created successfully")

        # Save plot if needed
        try:
            plot_path = "log_scale_comparisons.png"
            # Save through the figure handle rather than the implicit current figure
            fig.savefig(plot_path)
            logger.info(f"Log scale comparison plots saved to {plot_path}")
        except Exception as save_error:
            logger.warning(f"Could not save plot to file: {str(save_error)}")

        plt.show()

    except Exception as e:
        logger.error("Error creating log scale comparisons", exc_info=True)
        handle_exception(e)
        raise

# Driver for the log scale comparison: run the plotting routine and bracket
# it with start/finish log lines.
try:
    logger.info("Starting log scale model comparison analysis")
    create_log_scale_comparisons()
    logger.info("Log scale model comparison analysis completed")
except Exception as comparison_error:
    # Log with traceback, hand the exception to handle_exception, then
    # re-raise so the failure is not swallowed.
    logger.error("Failed to complete log scale comparison analysis", exc_info=True)
    handle_exception(comparison_error)
    raise