<a href="https://colab.research.google.com/github/micah-shull/AI_Agents/blob/main/324_EaaS_Stats.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

def load_historical_evaluations(
    reports_dir: str,
    max_history: int = 10
) -> List[Dict[str, Any]]:
    """
    Load historical evaluation summaries from previous reports.

    Reads ``summary_*.json`` files from ``<reports_dir>/history``. Files are
    visited in reverse filename order, which is newest-first when the names
    embed a sortable timestamp.

    Args:
        reports_dir: Directory that contains the ``history`` subdirectory.
        max_history: Maximum number of summaries to return. Values <= 0
            yield an empty list.

    Returns:
        Up to ``max_history`` parsed summaries (JSON objects), most recent
        first. A file that cannot be read or parsed, or whose top-level JSON
        value is not an object, is skipped with a printed warning and does
        not count toward ``max_history``. Returns ``[]`` when the history
        directory does not exist.
    """
    import json
    from pathlib import Path

    history_dir = Path(reports_dir) / "history"
    if max_history <= 0 or not history_dir.is_dir():
        return []

    historical_summaries: List[Dict[str, Any]] = []

    # Look for summary JSON files, newest filename first
    for summary_file in sorted(history_dir.glob("summary_*.json"), reverse=True):
        if len(historical_summaries) >= max_history:
            break

        try:
            with open(summary_file, 'r', encoding='utf-8') as f:
                summary = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            # One bad file must not hide the rest of the history.
            print(f"    Warning: Skipping unreadable history file {summary_file.name}: {exc}")
            continue

        if not isinstance(summary, dict):
            # Callers treat every entry as a mapping; valid JSON that is not
            # an object would break them later.
            print(f"    Warning: Skipping history file {summary_file.name}: top-level JSON value is not an object")
            continue

        historical_summaries.append(summary)

    return historical_summaries


def save_evaluation_summary(
    summary: Dict[str, Any],
    reports_dir: str
) -> str:
    """Save current evaluation summary to history for future comparisons.

    The summary is written as indented JSON to
    ``<reports_dir>/history/summary_<YYYYMMDD_HHMMSS>.json``; the ``history``
    directory is created if needed. If a file with that name already exists
    (for example, two saves within the same second), a ``_001``, ``_002``, ...
    suffix is appended instead of overwriting the earlier file.

    Args:
        summary: JSON-serializable summary to persist.
        reports_dir: Directory under which the ``history`` folder lives.

    Returns:
        Path of the file that was written, as a string.

    Raises:
        TypeError / ValueError: If ``summary`` cannot be serialized to JSON.
            No summary file is created in that case.
        OSError: If the directory or file cannot be created or written.
    """
    import json
    from pathlib import Path
    from datetime import datetime

    history_dir = Path(reports_dir) / "history"
    history_dir.mkdir(parents=True, exist_ok=True)

    # Serialize before creating the file so a non-serializable summary raises
    # without leaving an empty or truncated file in the history.
    payload = json.dumps(summary, indent=2)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    summary_file = history_dir / f"summary_{timestamp}.json"

    attempt = 0
    while True:
        try:
            # Mode 'x' refuses to open an existing file, so an earlier summary
            # with the same timestamp is never overwritten.
            with open(summary_file, 'x', encoding='utf-8') as f:
                f.write(payload)
            break
        except FileExistsError:
            attempt += 1
            # Zero-padded suffix keeps filename order equal to write order.
            summary_file = history_dir / f"summary_{timestamp}_{attempt:03d}.json"

    return str(summary_file)


In [None]:
def _historical_roi_for_agent(
    historical_data: List[Dict[str, Any]],
    agent_id: Any
) -> List[Any]:
    """Collect the recorded ``net_roi`` of one agent from each historical summary."""
    roi_values = []
    for hist_summary in historical_data:
        for hist_agent in hist_summary.get('agent_performance_summary', []):
            if hist_agent.get('agent_id') == agent_id:
                # No default here: a run that recorded no ROI for this agent
                # must be skipped, not counted as a 0.0 sample.
                hist_roi = hist_agent.get('net_roi')
                if hist_roi is not None:
                    roi_values.append(hist_roi)
                break  # only the first matching entry per historical run is used
    return roi_values


def _build_evaluation_summary(
    total_scenarios: int,
    agents_evaluated: int,
    evaluations: List[Dict[str, Any]],
    scores: List[Dict[str, Any]],
    agent_summaries: List[Dict[str, Any]]
) -> Dict[str, Any]:
    """Aggregate run-wide pass/fail, health-status and cost/ROI totals into one dict."""
    total_evaluations = len(evaluations)
    total_passed = sum(1 for s in scores if s.get('passed', False))
    total_failed = len(scores) - total_passed
    overall_pass_rate = total_passed / len(scores) if scores else 0.0
    average_score = sum(s.get('overall_score', 0.0) for s in scores) / len(scores) if scores else 0.0

    def count_health(status: str) -> int:
        return sum(1 for s in agent_summaries if s.get('health_status') == status)

    # Calculate total costs and ROI
    total_cost = sum(s.get('total_cost', 0.0) for s in agent_summaries)
    total_revenue_impact = sum(s.get('revenue_impact', 0.0) for s in agent_summaries)
    total_net_roi = sum(s.get('net_roi', 0.0) for s in agent_summaries)
    overall_roi_percent = ((total_revenue_impact - total_cost) / total_cost * 100) if total_cost > 0 else 0.0
    agents_with_positive_roi = sum(1 for s in agent_summaries if s.get('roi_percent', 0) > 0)
    # An infinite ratio never compares below 2.0, so no separate inf check is needed.
    agents_needing_optimization = sum(1 for s in agent_summaries if s.get('roi_ratio', 0) < 2.0)

    return {
        "total_scenarios": total_scenarios,
        "total_evaluations": total_evaluations,
        "total_passed": total_passed,
        "total_failed": total_failed,
        "overall_pass_rate": overall_pass_rate,
        "average_score": average_score,
        "agents_evaluated": agents_evaluated,
        "healthy_agents": count_health('healthy'),
        "degraded_agents": count_health('degraded'),
        "critical_agents": count_health('critical'),
        "total_cost": round(total_cost, 2),
        "total_revenue_impact": round(total_revenue_impact, 2),
        "total_net_roi": round(total_net_roi, 2),
        "overall_roi_percent": round(overall_roi_percent, 2),
        "agents_with_positive_roi": agents_with_positive_roi,
        "agents_needing_optimization": agents_needing_optimization,
        "cost_per_evaluation": round(total_cost / total_evaluations, 2) if total_evaluations > 0 else 0.0
    }


def performance_analysis_node(
    state: EvalAsServiceOrchestratorState,
    config: EvalAsServiceOrchestratorConfig
) -> Dict[str, Any]:
    """Analyze agent performance and generate summaries with statistical significance.

    Steps, in order:
      1. Load prior summaries from ``config.reports_dir``.
      2. Build one summary per agent via ``calculate_agent_performance_summary``
         and, when that summary reports historical data, attach KPI and ROI
         significance assessments. A failure here is recorded on the summary
         and does not abort the run.
      3. Aggregate pass/fail, health-status and cost/ROI totals.
      4. Run workflow-health analysis if ``config.enable_workflow_analysis``
         and assemble performance metrics if ``config.enable_performance_tracking``.
      5. Save the current summary to history; a failure only prints a warning.

    Args:
        state: Orchestrator state, read via ``.get`` for ``specialist_agents``,
            ``executed_evaluations``, ``evaluation_scores`` and
            ``journey_scenarios``.
        config: Provides ``reports_dir``, ``health_thresholds``,
            ``enable_workflow_analysis`` and ``enable_performance_tracking``.

    Returns:
        Dict with keys ``agent_performance_summary``, ``evaluation_summary``,
        ``workflow_analysis``, ``performance_metrics`` and
        ``statistical_assessments``.
    """
    # Imported locally (as the history helpers in this file do) so the
    # timestamp below does not depend on a module-level import. A missing name
    # would otherwise be swallowed by the save step's except and history
    # would silently never be written.
    from datetime import datetime

    print("  [Performance Analysis Node] Starting...")
    agents = state.get('specialist_agents', {})
    evaluations = state.get('executed_evaluations', [])
    scores = state.get('evaluation_scores', [])

    # Load historical data for statistical significance testing
    print("    Loading historical data for statistical analysis...")
    historical_data = load_historical_evaluations(config.reports_dir)
    if historical_data:
        print(f"    Loaded {len(historical_data)} historical evaluation(s)")
    else:
        print("    No historical data available (this is normal for first run)")

    agent_performance_summaries = []
    statistical_assessments = {}

    # Calculate performance for each agent (with ROI and statistical significance)
    for agent_id in agents:
        summary = calculate_agent_performance_summary(
            agent_id,
            evaluations,
            scores,
            config.health_thresholds,
            include_roi=True,
            historical_data=historical_data
        )

        # Add statistical significance testing if historical data exists
        if historical_data and summary.get('statistical_assessment', {}).get('has_historical_data'):
            try:
                # Extract historical scores for this agent
                historical_scores = summary['statistical_assessment']['historical_scores']
                current_score = summary.get('average_score', 0.0)

                # KPI significance test
                kpi_assessment = assess_kpi_with_significance(
                    current_value=current_score,
                    historical_values=historical_scores,
                    target_value=config.health_thresholds.get('healthy', 0.85),
                    confidence_level=0.95
                )

                # ROI significance test (if we have historical ROI data)
                if 'revenue_impact' in summary and 'total_cost' in summary:
                    historical_roi = _historical_roi_for_agent(historical_data, agent_id)
                    if historical_roi:
                        summary['roi_statistical_assessment'] = assess_roi_with_significance(
                            roi_estimate=summary.get('net_roi', 0.0),
                            cost=summary.get('total_cost', 0.0),
                            historical_roi=historical_roi,
                            confidence_level=0.95,
                            positive_threshold=0.0
                        )

                summary['kpi_statistical_assessment'] = kpi_assessment
                statistical_assessments[agent_id] = {
                    'kpi': kpi_assessment,
                    'has_roi': 'roi_statistical_assessment' in summary
                }

            except Exception as e:
                # Best effort: keep the agent's summary and record why the
                # statistical part is missing.
                print(f"    Warning: Statistical assessment failed for {agent_id}: {str(e)}")
                summary['statistical_assessment']['error'] = str(e)

        agent_performance_summaries.append(summary)

    # Calculate overall evaluation summary
    evaluation_summary = _build_evaluation_summary(
        total_scenarios=len(state.get('journey_scenarios', [])),
        agents_evaluated=len(agents),
        evaluations=evaluations,
        scores=scores,
        agent_summaries=agent_performance_summaries
    )
    overall_pass_rate = evaluation_summary["overall_pass_rate"]

    # Workflow analysis (using toolshed)
    workflow_analysis = []
    if config.enable_workflow_analysis:
        workflow_thresholds = {
            "healthy": 10.0,    # <= 10% failure rate
            "degraded": 30.0,   # 10-30% failure rate
            "critical": 30.0    # > 30% failure rate
        }

        for summary in agent_performance_summaries:
            # Use failure rate as metric for workflow health. An agent with
            # zero evaluations has no failures; avoid dividing by zero.
            evaluated = summary.get('total_evaluations', 1)
            failure_rate = (summary.get('failed_count', 0) / evaluated) * 100 if evaluated else 0.0

            workflow = {
                "workflow_id": f"eval_{summary.get('agent_id')}",
                "agent_id": summary.get('agent_id'),
                "failure_rate_7d": failure_rate
            }

            # Pass a fresh copy so each call gets its own thresholds dict.
            analysis = analyze_workflow_health(workflow, dict(workflow_thresholds))
            workflow_analysis.append(analysis)

    # Performance metrics (using toolshed)
    performance_metrics = {}
    if config.enable_performance_tracking:
        metrics_definitions = [
            {
                "name": "evaluation_time",
                "description": "Average evaluation execution time",
                "unit": "seconds",
                "thresholds": {
                    "healthy": 1.0,      # <= 1 second
                    "degraded": 2.0,    # 1-2 seconds
                    "critical": 2.0     # > 2 seconds
                },
                "weight": 0.5
            },
            {
                "name": "pass_rate",
                "description": "Overall evaluation pass rate",
                "unit": "ratio",
                "thresholds": {
                    "healthy": 0.90,    # >= 90%
                    "degraded": 0.70,   # 70-90%
                    "critical": 0.0    # < 70%
                },
                "weight": 0.5
            }
        ]

        metrics_config = create_metrics_config(metrics_definitions)

        avg_eval_time = sum(e.get('execution_time_seconds', 0.0) for e in evaluations) / len(evaluations) if evaluations else 0.0

        performance_metrics = {
            "average_evaluation_time": avg_eval_time,
            "overall_pass_rate": overall_pass_rate,
            "metrics_config": metrics_config
        }

    # Save current summary to history for future comparisons
    try:
        summary_to_save = {
            "timestamp": datetime.now().isoformat(),
            "evaluation_summary": evaluation_summary,
            "agent_performance_summary": agent_performance_summaries
        }
        save_evaluation_summary(summary_to_save, config.reports_dir)
        print("    Saved evaluation summary to history")
    except Exception as e:
        # Persisting history is best effort; the analysis result is still returned.
        print(f"    Warning: Could not save evaluation summary: {str(e)}")

    print(f"  [Performance Analysis Node] Analyzed {len(agent_performance_summaries)} agents")
    if statistical_assessments:
        print(f"    Statistical significance calculated for {len(statistical_assessments)} agent(s)")

    return {
        "agent_performance_summary": agent_performance_summaries,
        "evaluation_summary": evaluation_summary,
        "workflow_analysis": workflow_analysis,
        "performance_metrics": performance_metrics,
        "statistical_assessments": statistical_assessments
    }

