# Multi-Stage LLM Error Analysis
**Calculates error rates and performance metrics across TF-IDF, Claude 2.1, and Claude 3.5 processing stages for requirements analysis pipelines.**


In [None]:
# Cell [0] - Setup and Imports
# Purpose: Import required libraries and define the stage-metric and multi-stage analysis classes for Multi-LLM testing
# Dependencies: numpy, pandas, matplotlib, seaborn, scipy, sklearn, typing, dataclasses
# Breadcrumbs: Setup -> Imports -> Environment Configuration

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.metrics import confusion_matrix, precision_recall_curve
from typing import Dict, Tuple, List
from dataclasses import dataclass

@dataclass
class ProcessingStageResults:
    """Confusion-matrix counts for one processing stage, with derived rates.

    Each rate is a ratio of the stored counts. When a ratio's denominator is
    zero (for example, precision for a stage that predicted no positives) the
    rate is reported as 0.0 instead of raising ZeroDivisionError. Note that in
    the "no actual positives" case both ``recall`` and ``fnr`` are therefore
    0.0, so they do not sum to 1 there.
    """
    tp: int  # True Positives
    fp: int  # False Positives
    tn: int  # True Negatives
    fn: int  # False Negatives
    name: str  # Stage identifier

    @staticmethod
    def _ratio(numerator: int, denominator: int) -> float:
        """Return ``numerator / denominator``, or 0.0 if the denominator is zero."""
        return numerator / denominator if denominator else 0.0

    @property
    def precision(self) -> float:
        """Fraction of predicted positives that are correct: tp / (tp + fp)."""
        return self._ratio(self.tp, self.tp + self.fp)

    @property
    def recall(self) -> float:
        """Fraction of actual positives that were found: tp / (tp + fn)."""
        return self._ratio(self.tp, self.tp + self.fn)

    @property
    def fnr(self) -> float:
        """False Negative Rate: fn / (fn + tp)."""
        return self._ratio(self.fn, self.fn + self.tp)

    @property
    def fpr(self) -> float:
        """False Positive Rate: fp / (fp + tn)."""
        return self._ratio(self.fp, self.fp + self.tn)

class MultiStageLLMAnalysis:
    """Compare error metrics across a TF-IDF stage and two Claude stages.

    ``self.stages`` maps the keys 'tfidf', 'claude21' and 'claude35' to one
    ProcessingStageResults each. ``self.fnr_combined_21`` and
    ``self.fnr_combined_35`` hold the combined false negative rates of the
    TF-IDF → Claude 2.1 and TF-IDF → Claude 3.5 pipelines.
    """

    def __init__(self,
                 tfidf_results: Dict[str, int],
                 claude21_results: Dict[str, int],
                 claude35_results: Dict[str, int]):
        """
        Initialize analysis with results from all processing stages.

        Each dict is expanded into ProcessingStageResults keyword arguments,
        so it must supply the count fields 'tp', 'fp', 'tn' and 'fn'.

        Args:
            tfidf_results: Results from TF-IDF + Transformer stage
            claude21_results: Results from Claude 2.1
            claude35_results: Results from Claude 3.5
        """
        self.stages = {
            'tfidf': ProcessingStageResults(**tfidf_results, name='TF-IDF + Transformer'),
            'claude21': ProcessingStageResults(**claude21_results, name='Claude 2.1'),
            'claude35': ProcessingStageResults(**claude35_results, name='Claude 3.5')
        }

        # Combined pipeline rates are derived once, up front, from the stages.
        self.calculate_combined_metrics()

    def calculate_combined_metrics(self):
        """Compute the combined FNR of each TF-IDF → LLM pipeline.

        The combined rate is ``1 - (1 - fnr_tfidf) * (1 - fnr_llm)``.
        NOTE(review): this product form presumably assumes the two stages
        miss items independently — confirm against how the counts were
        collected.
        """
        tfidf_pass_rate = 1 - self.stages['tfidf'].fnr

        # TF-IDF → Claude 2.1
        self.fnr_combined_21 = 1 - tfidf_pass_rate * (1 - self.stages['claude21'].fnr)

        # TF-IDF → Claude 3.5
        self.fnr_combined_35 = 1 - tfidf_pass_rate * (1 - self.stages['claude35'].fnr)

    def _success_rate_estimate(self,
                               stage_key: str,
                               confidence: float) -> Tuple[float, float]:
        """Return ``(p_hat, margin_of_error)`` for a stage's tp / (tp + fn) rate.

        Raises:
            KeyError: if ``stage_key`` is not a key of ``self.stages``.
            ZeroDivisionError: if the stage has no positive samples.
        """
        stage = self.stages[stage_key]
        n = stage.tp + stage.fn
        if n == 0:
            # Same exception type as the bare division, but with an
            # explanation of what went wrong.
            raise ZeroDivisionError(
                f"stage {stage_key!r} has tp + fn == 0; success rate is undefined"
            )
        # Two-sided critical value of the standard normal distribution.
        z_score = stats.norm.ppf((1 + confidence) / 2)
        p_hat = stage.tp / n
        margin_of_error = z_score * np.sqrt((p_hat * (1 - p_hat)) / n)
        return p_hat, margin_of_error

    def calculate_confidence_interval(self,
                                   stage_key: str,
                                   confidence: float = 0.95) -> Tuple[float, float]:
        """
        Calculate confidence interval for success rate of a specific stage.

        The success rate is tp / (tp + fn) and the interval is the
        normal-approximation one, ``p_hat ± z * sqrt(p_hat * (1 - p_hat) / n)``.
        The bounds are not clipped, so they can fall outside [0, 1].

        Args:
            stage_key: Key identifying the processing stage
            confidence: Confidence level (default: 0.95)

        Returns:
            Tuple of (lower_bound, upper_bound)

        Raises:
            KeyError: if ``stage_key`` is not a known stage.
            ZeroDivisionError: if the stage has no positive samples.
        """
        p_hat, margin_of_error = self._success_rate_estimate(stage_key, confidence)
        return (p_hat - margin_of_error, p_hat + margin_of_error)

    def plot_comparative_metrics(self):
        """Create a comprehensive comparison of metrics across all stages.

        Draws two stacked bar charts, precision/recall on top and FNR/FPR
        below, with one group of bars per stage.

        Returns:
            The matplotlib figure holding both charts.
        """
        stage_list = list(self.stages.values())
        metrics = {
            'Precision': [stage.precision for stage in stage_list],
            'Recall': [stage.recall for stage in stage_list],
            'FNR': [stage.fnr for stage in stage_list],
            'FPR': [stage.fpr for stage in stage_list]
        }
        stage_names = [stage.name for stage in stage_list]

        fig, axes = plt.subplots(2, 1, figsize=(12, 10))

        x = np.arange(len(stage_list))
        width = 0.35

        # (axis, title, left series, right series); each series is
        # (legend label, key into ``metrics``).
        panels = [
            (axes[0], 'Precision and Recall by Processing Stage',
             ('Precision', 'Precision'), ('Recall', 'Recall')),
            (axes[1], 'Error Rates by Processing Stage',
             ('False Negative Rate', 'FNR'), ('False Positive Rate', 'FPR')),
        ]
        for ax, title, (left_label, left_key), (right_label, right_key) in panels:
            # The two series sit side by side, centred on each stage's tick.
            ax.bar(x - width/2, metrics[left_key], width, label=left_label)
            ax.bar(x + width/2, metrics[right_key], width, label=right_label)
            ax.set_ylabel('Rate')
            ax.set_title(title)
            ax.set_xticks(x)
            ax.set_xticklabels(stage_names)
            ax.legend()

        plt.tight_layout()
        return fig

    def plot_pipeline_comparison(self):
        """Compare the performance of different processing pipelines.

        Draws one bar per pipeline showing its combined false negative rate,
        labelled with the value as a percentage.

        Returns:
            The matplotlib figure holding the chart.
        """
        pipeline_data = {
            'TF-IDF → Claude 2.1': self.fnr_combined_21,
            'TF-IDF → Claude 3.5': self.fnr_combined_35
        }

        fig, ax = plt.subplots(figsize=(10, 6))

        # Pass plain lists rather than dict views to the plotting call.
        bars = ax.bar(list(pipeline_data.keys()), list(pipeline_data.values()))
        ax.set_ylabel('Combined False Negative Rate')
        ax.set_title('Pipeline Performance Comparison')

        # Add value labels on bars
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height,
                   f'{height:.1%}',
                   ha='center', va='bottom')

        plt.tight_layout()
        return fig

    def calculate_buffer_requirements(self,
                                   cost_ratio: float,
                                   confidence: float = 0.95) -> Dict[str, float]:
        """
        Calculate required buffers for different pipeline configurations.

        Each buffer is ``combined_fnr * cost_ratio * (1 + margin)``, where
        ``margin`` is the margin of error (confidence-interval half-width) of
        the pipeline's LLM stage success rate.

        Args:
            cost_ratio: Cost ratio of missing requirement vs review
            confidence: Confidence level

        Returns:
            Dictionary of pipeline names and their required buffers

        Raises:
            ZeroDivisionError: if an LLM stage has no positive samples.
        """
        # Use the interval's half-width here, not its upper bound: the upper
        # bound is p_hat + margin and would inflate the buffer by the success
        # rate itself.
        _, margin_21 = self._success_rate_estimate('claude21', confidence)
        _, margin_35 = self._success_rate_estimate('claude35', confidence)

        return {
            'TF-IDF → Claude 2.1': self.fnr_combined_21 * cost_ratio * (1 + margin_21),
            'TF-IDF → Claude 3.5': self.fnr_combined_35 * cost_ratio * (1 + margin_35),
        }

def main():
    """Run the multi-stage analysis on sample counts and report the results.

    Saves the two comparison charts as PNG files in the working directory,
    then prints the buffer requirement of each pipeline at several cost
    ratios followed by the per-stage precision, recall, FNR and FPR.
    """
    # Sample confusion-matrix counts for each stage.
    sample_tfidf = dict(tp=318, fp=10156, tn=41183, fn=43)
    sample_claude21 = dict(tp=108, fp=443, tn=50896, fn=253)
    sample_claude35 = dict(tp=152, fp=566, tn=50773, fn=209)

    analysis = MultiStageLLMAnalysis(sample_tfidf, sample_claude21, sample_claude35)

    # Render each chart and write it to disk before building the next one.
    analysis.plot_comparative_metrics().savefig('stage_comparison.png')
    analysis.plot_pipeline_comparison().savefig('pipeline_comparison.png')

    divider = "-" * 50

    # Buffer requirements per pipeline at each cost ratio.
    print("\nBuffer Requirements at Different Cost Ratios:")
    print(divider)
    for ratio in (5, 10, 15, 20):
        requirements = analysis.calculate_buffer_requirements(ratio)
        print(f"\nCost Ratio: {ratio}x")
        for pipeline in requirements:
            print(f"{pipeline}: {requirements[pipeline]:.1%}")

    # Per-stage metrics; only the stage objects are needed, not their keys.
    print("\nDetailed Stage Metrics:")
    print(divider)
    for stage in analysis.stages.values():
        print(f"\n{stage.name}:")
        report = (
            ("Precision", stage.precision),
            ("Recall", stage.recall),
            ("False Negative Rate", stage.fnr),
            ("False Positive Rate", stage.fpr),
        )
        for label, value in report:
            print(f"{label}: {value:.3f}")

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()