<a href="https://colab.research.google.com/github/micah-shull/AI_Agents/blob/main/319_EaaS_Nodes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Nodes for the Evaluation-as-a-Service (EaaS) Orchestrator Agent

In [None]:
"""Nodes for EaaS Orchestrator Agent

Orchestration logic for the evaluation workflow.
"""

from typing import Dict, Any
from datetime import datetime
from toolshed.progress import calculate_progress, calculate_elapsed_time, estimate_remaining_time
from toolshed.performance import create_metrics_config, track_execution_time
from toolshed.workflows import analyze_workflow_health
from toolshed.validation import validate_data_structure
from agents.eval_as_service.utilities import (
    load_journey_scenarios,
    load_specialist_agents,
    load_supporting_data,
    load_decision_rules,
    build_agent_lookup,
    build_scenario_lookup,
    simulate_agent_execution,
    score_evaluation,
    calculate_agent_performance_summary
)
from config import EvalAsServiceOrchestratorState, EvalAsServiceOrchestratorConfig


def goal_node(state: EvalAsServiceOrchestratorState) -> Dict[str, Any]:
    """Build the evaluation goal from the scope requested in ``state``.

    Reads ``scenario_id`` and ``target_agent_id`` from the state. A truthy
    scenario id takes precedence over the agent id when wording the
    description; when neither is set the goal covers all agents and scenarios.

    Returns:
        A state update with a single ``goal`` entry holding the description,
        the fixed type ``"evaluation"`` and the scope values exactly as read.
    """
    scope = {
        "scenario_id": state.get('scenario_id'),
        "target_agent_id": state.get('target_agent_id'),
    }

    # Most specific wording wins: scenario first, then agent, then everything.
    if scope["scenario_id"]:
        description = f"Evaluate agent performance for scenario {scope['scenario_id']}"
    elif scope["target_agent_id"]:
        description = f"Evaluate agent {scope['target_agent_id']} across all scenarios"
    else:
        description = "Evaluate all agents across all scenarios"

    return {"goal": {"description": description, "type": "evaluation", "scope": scope}}


def planning_node(state: EvalAsServiceOrchestratorState) -> Dict[str, Any]:
    """Return the fixed five-step plan for an evaluation run.

    The plan does not depend on ``state``; steps are numbered from 1 in the
    order listed below.

    Returns:
        A state update whose ``plan`` entry is a list of dicts with the keys
        ``step``, ``task`` and ``description``.
    """
    # (task, description) pairs in execution order.
    stages = (
        ("Load evaluation data", "Load scenarios, agents, and supporting data"),
        ("Execute evaluations", "Run scenarios through target agents"),
        ("Score evaluations", "Compare actual outputs to expected outcomes"),
        ("Analyze performance", "Calculate agent performance summaries"),
        ("Generate report", "Create comprehensive evaluation report"),
    )

    numbered_steps = [
        {"step": number, "task": task, "description": description}
        for number, (task, description) in enumerate(stages, start=1)
    ]
    return {"plan": numbered_steps}


def data_loading_node(
    state: EvalAsServiceOrchestratorState,
    config: EvalAsServiceOrchestratorConfig
) -> Dict[str, Any]:
    """Load scenarios, agents, supporting data and decision rules.

    Scenarios are narrowed to ``state['scenario_id']`` and agents to
    ``state['target_agent_id']`` when those values are truthy. When
    ``config.enable_validation`` is set, a validation failure is recorded as
    a warning in ``errors`` and does not abort loading.

    Args:
        state: Orchestrator state; ``scenario_id``, ``target_agent_id`` and
            ``errors`` are read.
        config: Orchestrator configuration supplying ``data_dir``, the data
            file names and ``enable_validation``.

    Returns:
        On success, a state update with ``journey_scenarios``,
        ``specialist_agents``, ``supporting_data``, ``decision_rules`` and
        ``errors``. If any loading step raises, only ``errors`` is returned,
        extended with a ``"Data loading error: ..."`` entry.
    """
    # Work on a copy so the list held by the incoming state is never mutated
    # in place; `or []` also tolerates an explicit None stored under 'errors'.
    errors = list(state.get('errors') or [])

    try:
        # Load scenarios
        scenarios = load_journey_scenarios(config.data_dir, config.journey_scenarios_file)

        # Filter by scenario_id if specified
        scenario_id = state.get('scenario_id')
        if scenario_id:
            scenarios = [s for s in scenarios if s.get('scenario_id') == scenario_id]

        # Load agents
        agents_dict = load_specialist_agents(config.data_dir, config.specialist_agents_file)

        # Filter by target_agent_id if specified
        target_agent_id = state.get('target_agent_id')
        if target_agent_id:
            agents_dict = {k: v for k, v in agents_dict.items() if k == target_agent_id}

        # Load supporting data
        supporting_data = load_supporting_data(
            config.data_dir,
            config.customers_file,
            config.orders_file,
            config.logistics_file,
            config.marketing_signals_file
        )

        # Load decision rules
        decision_rules = load_decision_rules(config.data_dir, config.decision_rules_file)

        # The lookup results are not used by this node. The calls are kept so
        # that any exception they raise still surfaces as a data loading error.
        # NOTE(review): confirm whether these calls can be dropped entirely.
        build_agent_lookup(agents_dict)
        build_scenario_lookup(scenarios)

        # Validate data structure if enabled; a failure is only a warning.
        if config.enable_validation:
            try:
                validate_data_structure(scenarios, required_fields=['scenario_id', 'customer_id', 'order_id'])
            except Exception as e:
                errors.append(f"Validation warning: {str(e)}")

        return {
            "journey_scenarios": scenarios,
            "specialist_agents": agents_dict,
            "supporting_data": supporting_data,
            "decision_rules": decision_rules,
            "errors": errors
        }
    except Exception as e:
        # Best-effort node: report the failure instead of aborting the workflow.
        errors.append(f"Data loading error: {str(e)}")
        return {"errors": errors}


In [None]:
def evaluation_execution_node(
    state: EvalAsServiceOrchestratorState,
    config: EvalAsServiceOrchestratorConfig
) -> Dict[str, Any]:
    """Run every scenario through the agents on its expected resolution path.

    For each scenario in ``state['journey_scenarios']`` and each agent id in
    its ``expected_resolution_path``, ``simulate_agent_execution`` is called
    and its result is recorded together with the scenario's expected values
    and the measured duration of the call. An agent id missing from
    ``state['specialist_agents']`` is reported in ``errors`` and skipped, so
    it does not count towards the totals. An exception raised while
    evaluating is reported in ``errors`` and recorded as a ``"failed"``
    evaluation.

    Args:
        state: Orchestrator state; ``journey_scenarios``,
            ``specialist_agents``, ``supporting_data``, ``errors`` and
            ``evaluation_start_time`` are read.
        config: Orchestrator configuration (not read by this node).

    Returns:
        A state update with ``executed_evaluations``, the completed/total
        counters, ``progress_percentage``, ``elapsed_time_seconds``,
        ``estimated_remaining_seconds``, ``evaluation_start_time`` and
        ``errors``.
    """
    scenarios = state.get('journey_scenarios', [])
    agents = state.get('specialist_agents', {})
    supporting_data = state.get('supporting_data', {})

    executed_evaluations = []
    # Work on a copy so the list held by the incoming state is never mutated
    # in place; `or []` also tolerates an explicit None stored under 'errors'.
    errors = list(state.get('errors') or [])

    # Reuse an existing start time so progress spans repeated invocations.
    start_time = state.get('evaluation_start_time')
    if not start_time:
        start_time = datetime.now().isoformat()

    for scenario in scenarios:
        scenario_id = scenario.get('scenario_id')
        expected_resolution_path = scenario.get('expected_resolution_path', [])

        # Evaluate each agent in the expected resolution path
        for agent_id in expected_resolution_path:
            if agent_id not in agents:
                errors.append(f"Agent {agent_id} not found for scenario {scenario_id}")
                continue

            try:
                # Time only the simulated execution itself.
                execution_start = datetime.now()

                result = simulate_agent_execution(
                    agent_id,
                    scenario,
                    supporting_data,
                    agents
                )

                execution_end = datetime.now()
                execution_time = (execution_end - execution_start).total_seconds()

                evaluation = {
                    "scenario_id": scenario_id,
                    "target_agent_id": agent_id,
                    "input": {
                        "customer_message": scenario.get('customer_message'),
                        "customer_id": scenario.get('customer_id'),
                        "order_id": scenario.get('order_id')
                    },
                    "actual_output": result.get('output'),
                    "expected_output": {
                        "expected_issue_type": scenario.get('expected_issue_type'),
                        "expected_resolution_path": expected_resolution_path,
                        "expected_outcome": scenario.get('expected_outcome')
                    },
                    "execution_time_seconds": execution_time,
                    # A result without a status is treated as a failure.
                    "status": result.get('status', 'failed'),
                    "error": result.get('error')
                }

                executed_evaluations.append(evaluation)

            except Exception as e:
                # Keep going: one failing evaluation must not stop the batch.
                errors.append(f"Evaluation error for scenario {scenario_id}, agent {agent_id}: {str(e)}")
                executed_evaluations.append({
                    "scenario_id": scenario_id,
                    "target_agent_id": agent_id,
                    "status": "failed",
                    "error": str(e)
                })

    # Update progress
    total = len(executed_evaluations)
    completed = sum(1 for e in executed_evaluations if e.get('status') == 'completed')

    progress = calculate_progress(completed=completed, total=total) if total > 0 else 0.0

    # The elapsed value is divided by 60 before being passed as minutes and the
    # estimate is multiplied by 60 on the way out.
    # NOTE(review): this presumes calculate_elapsed_time returns seconds and
    # estimate_remaining_time returns minutes - confirm against toolshed.
    elapsed = calculate_elapsed_time(start_time)
    remaining = estimate_remaining_time(
        completed=completed,
        total=total,
        elapsed_minutes=elapsed / 60.0
    ) if total > 0 else 0.0

    return {
        "executed_evaluations": executed_evaluations,
        "evaluations_completed": completed,
        "evaluations_total": total,
        "progress_percentage": progress,
        "elapsed_time_seconds": elapsed,
        "estimated_remaining_seconds": remaining * 60.0,
        "evaluation_start_time": start_time,
        "errors": errors
    }



In [None]:
def scoring_node(
    state: EvalAsServiceOrchestratorState,
    config: EvalAsServiceOrchestratorConfig
) -> Dict[str, Any]:
    """Score each executed evaluation against its scenario's expectations.

    Every entry of ``state['executed_evaluations']`` is matched to its
    scenario through ``build_scenario_lookup`` and passed to
    ``score_evaluation`` with ``config.scoring_weights`` and
    ``config.pass_threshold``. Evaluations whose scenario cannot be found, or
    whose scoring raises, are reported in ``errors`` and produce no score.

    Args:
        state: Orchestrator state; ``executed_evaluations``,
            ``journey_scenarios`` and ``errors`` are read.
        config: Orchestrator configuration supplying ``scoring_weights`` and
            ``pass_threshold``.

    Returns:
        A state update with ``evaluation_scores`` (each score tagged with
        ``scenario_id`` and ``target_agent_id``) and ``errors``.
    """
    evaluations = state.get('executed_evaluations', [])
    scenarios = state.get('journey_scenarios', [])
    scenario_lookup = build_scenario_lookup(scenarios)

    evaluation_scores = []
    # Work on a copy so the list held by the incoming state is never mutated
    # in place; `or []` also tolerates an explicit None stored under 'errors'.
    errors = list(state.get('errors') or [])

    for evaluation in evaluations:
        scenario_id = evaluation.get('scenario_id')
        scenario = scenario_lookup.get(scenario_id)

        if not scenario:
            errors.append(f"Scenario {scenario_id} not found for scoring")
            continue

        expected_outcome = scenario.get('expected_outcome')
        expected_resolution_path = scenario.get('expected_resolution_path', [])

        try:
            score = score_evaluation(
                evaluation,
                expected_outcome,
                expected_resolution_path,
                config.scoring_weights,
                config.pass_threshold
            )

            # Tag the score so later steps can group it by scenario and agent.
            score['scenario_id'] = scenario_id
            score['target_agent_id'] = evaluation.get('target_agent_id')

            evaluation_scores.append(score)

        except Exception as e:
            # Keep going: one unscorable evaluation must not stop the batch.
            errors.append(f"Scoring error for scenario {scenario_id}: {str(e)}")

    return {
        "evaluation_scores": evaluation_scores,
        "errors": errors
    }

In [None]:
def performance_analysis_node(
    state: EvalAsServiceOrchestratorState,
    config: EvalAsServiceOrchestratorConfig
) -> Dict[str, Any]:
    """Summarize per-agent and overall evaluation performance.

    Builds one summary per agent in ``state['specialist_agents']`` via
    ``calculate_agent_performance_summary``, aggregates pass/fail counts and
    scores over ``state['evaluation_scores']``, and optionally adds workflow
    health analysis and performance metrics depending on the config flags.

    Args:
        state: Orchestrator state; ``specialist_agents``,
            ``executed_evaluations``, ``evaluation_scores`` and
            ``journey_scenarios`` are read.
        config: Orchestrator configuration supplying ``health_thresholds``,
            ``enable_workflow_analysis`` and ``enable_performance_tracking``.

    Returns:
        A state update with ``agent_performance_summary``,
        ``evaluation_summary``, ``workflow_analysis`` (empty list when
        disabled) and ``performance_metrics`` (empty dict when disabled).
    """
    agents = state.get('specialist_agents', {})
    evaluations = state.get('executed_evaluations', [])
    scores = state.get('evaluation_scores', [])

    # Calculate performance for each agent
    agent_performance_summaries = [
        calculate_agent_performance_summary(
            agent_id,
            evaluations,
            scores,
            config.health_thresholds
        )
        for agent_id in agents
    ]

    # Calculate overall evaluation summary; ratios fall back to 0.0 when
    # nothing was scored.
    total_scenarios = len(state.get('journey_scenarios', []))
    total_evaluations = len(evaluations)
    total_passed = sum(1 for s in scores if s.get('passed', False))
    total_failed = len(scores) - total_passed
    overall_pass_rate = total_passed / len(scores) if scores else 0.0
    average_score = sum(s.get('overall_score', 0.0) for s in scores) / len(scores) if scores else 0.0

    healthy_agents = sum(1 for s in agent_performance_summaries if s.get('health_status') == 'healthy')
    degraded_agents = sum(1 for s in agent_performance_summaries if s.get('health_status') == 'degraded')
    critical_agents = sum(1 for s in agent_performance_summaries if s.get('health_status') == 'critical')

    evaluation_summary = {
        "total_scenarios": total_scenarios,
        "total_evaluations": total_evaluations,
        "total_passed": total_passed,
        "total_failed": total_failed,
        "overall_pass_rate": overall_pass_rate,
        "average_score": average_score,
        "agents_evaluated": len(agents),
        "healthy_agents": healthy_agents,
        "degraded_agents": degraded_agents,
        "critical_agents": critical_agents
    }

    # Workflow analysis (using toolshed)
    workflow_analysis = []
    if config.enable_workflow_analysis:
        for summary in agent_performance_summaries:
            # Use failure rate (percent) as the metric for workflow health.
            # The .get default only applies when the key is missing, so an
            # agent with zero evaluations must be guarded explicitly to
            # avoid a ZeroDivisionError.
            evaluated_count = summary.get('total_evaluations', 1)
            if evaluated_count:
                failure_rate = (summary.get('failed_count', 0) / evaluated_count) * 100
            else:
                failure_rate = 0.0

            workflow = {
                "workflow_id": f"eval_{summary.get('agent_id')}",
                "agent_id": summary.get('agent_id'),
                "failure_rate_7d": failure_rate
            }

            # Built per call so each analysis receives its own dict.
            thresholds = {
                "healthy": 10.0,    # <= 10% failure rate
                "degraded": 30.0,   # 10-30% failure rate
                "critical": 30.0    # > 30% failure rate
            }

            analysis = analyze_workflow_health(workflow, thresholds)
            workflow_analysis.append(analysis)

    # Performance metrics (using toolshed)
    performance_metrics = {}
    if config.enable_performance_tracking:
        metrics_config = create_metrics_config(
            metrics={
                "evaluation_time": {"threshold": 2.0, "unit": "seconds"},
                "pass_rate": {"threshold": 0.80, "unit": "ratio"}
            }
        )

        # Evaluations without a recorded time count as 0.0 seconds.
        avg_eval_time = sum(e.get('execution_time_seconds', 0.0) for e in evaluations) / len(evaluations) if evaluations else 0.0

        performance_metrics = {
            "average_evaluation_time": avg_eval_time,
            "overall_pass_rate": overall_pass_rate,
            "metrics_config": metrics_config
        }

    return {
        "agent_performance_summary": agent_performance_summaries,
        "evaluation_summary": evaluation_summary,
        "workflow_analysis": workflow_analysis,
        "performance_metrics": performance_metrics
    }


In [None]:
def report_generation_node(
    state: EvalAsServiceOrchestratorState,
    config: EvalAsServiceOrchestratorConfig
) -> Dict[str, Any]:
    """Render the evaluation results as a Markdown report and optionally save it.

    The report has an executive summary, one section per agent summary and a
    results table limited to the first 20 scores. When
    ``config.enable_reporting`` is set the report is written through
    ``toolshed.reporting.save_report``; a failure while saving is recorded in
    the returned ``errors`` instead of being raised.

    Args:
        state: Orchestrator state; ``evaluation_summary``,
            ``agent_performance_summary``, ``evaluation_scores`` and
            ``errors`` are read.
        config: Orchestrator configuration supplying ``enable_reporting`` and
            ``reports_dir``.

    Returns:
        A state update with ``evaluation_report`` (the Markdown text),
        ``report_file_path`` (``None`` when nothing was saved) and ``errors``.
    """
    summary = state.get('evaluation_summary', {})
    agent_summaries = state.get('agent_performance_summary', [])
    scores = state.get('evaluation_scores', [])
    # Collected in a copy and returned, so a save failure is never lost when
    # the state has no 'errors' list yet and the state is not mutated in place.
    errors = list(state.get('errors') or [])

    # Build report sections
    report_sections = []

    # Executive Summary
    report_sections.append("## Executive Summary\n\n")
    report_sections.append(f"- **Total Scenarios Evaluated:** {summary.get('total_scenarios', 0)}\n")
    report_sections.append(f"- **Total Evaluations:** {summary.get('total_evaluations', 0)}\n")
    report_sections.append(f"- **Overall Pass Rate:** {summary.get('overall_pass_rate', 0.0):.1%}\n")
    report_sections.append(f"- **Average Score:** {summary.get('average_score', 0.0):.2f}\n")
    report_sections.append(f"- **Healthy Agents:** {summary.get('healthy_agents', 0)}\n")
    report_sections.append(f"- **Degraded Agents:** {summary.get('degraded_agents', 0)}\n")
    report_sections.append(f"- **Critical Agents:** {summary.get('critical_agents', 0)}\n\n")

    # Agent Performance Details
    report_sections.append("## Agent Performance Details\n\n")
    for agent_summary in agent_summaries:
        report_sections.append(f"### {agent_summary.get('agent_id')}\n\n")
        report_sections.append(f"- **Status:** {agent_summary.get('health_status', 'unknown')}\n")
        report_sections.append(f"- **Total Evaluations:** {agent_summary.get('total_evaluations', 0)}\n")
        report_sections.append(f"- **Passed:** {agent_summary.get('passed_count', 0)}\n")
        report_sections.append(f"- **Failed:** {agent_summary.get('failed_count', 0)}\n")
        report_sections.append(f"- **Average Score:** {agent_summary.get('average_score', 0.0):.2f}\n")
        report_sections.append(f"- **Average Response Time:** {agent_summary.get('average_response_time', 0.0):.2f}s\n\n")

    # Evaluation Results
    report_sections.append("## Evaluation Results\n\n")
    report_sections.append("| Scenario | Agent | Score | Passed | Issues |\n")
    report_sections.append("|----------|-------|-------|--------|--------|\n")

    for score in scores[:20]:  # Limit to first 20 for readability
        scenario_id = score.get('scenario_id', 'N/A')
        agent_id = score.get('target_agent_id', 'N/A')
        overall_score = score.get('overall_score', 0.0)
        passed = "✓" if score.get('passed', False) else "✗"
        issues = ", ".join(score.get('issues', []))[:50]  # Truncate long issues
        report_sections.append(f"| {scenario_id} | {agent_id} | {overall_score:.2f} | {passed} | {issues} |\n")

    if len(scores) > 20:
        report_sections.append(f"\n*Showing first 20 of {len(scores)} evaluations*\n\n")

    # Combine report
    report = "# Evaluation-as-a-Service Report\n\n"
    report += f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
    report += "".join(report_sections)

    # Save report if enabled
    report_file_path = None
    if config.enable_reporting:
        # Imported only when needed so building the report text does not
        # depend on the reporting helper being importable.
        from toolshed.reporting import save_report

        try:
            report_file_path = save_report(
                report_content=report,
                reports_dir=config.reports_dir,
                report_name="evaluation_report"
            )
        except Exception as e:
            # Best effort: the report text is still returned to the caller.
            errors.append(f"Report saving error: {str(e)}")

    return {
        "evaluation_report": report,
        "report_file_path": report_file_path,
        "errors": errors
    }