# Supervisor Agent

The Supervisor Agent is the final decision-making component of the RCA workflow. It receives and aggregates all the individual diagnostic reports from the parallel RCA Workers. Its core objective is to synthesize these fragmented findings, correlate information across different investigation tasks, and formulate a single, comprehensive root cause diagnosis for the entire incident. 

**TODO**: If the findings are inconclusive, the Supervisor should also be able to trigger a new planning cycle (not yet implemented).

In [None]:
from dotenv import load_dotenv
import os
# Resolve the repository root: two directory levels above the current working directory
root_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Populate the process environment from the .env file that sits in the repository root
load_dotenv(dotenv_path=os.path.join(root_dir, '.env'), verbose=True)

In [None]:
import sys

# Put the MCP-server directory (two levels above the working directory) first on sys.path
mcp_server_path = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir, 'MCP-server')
)
sys.path.insert(0, mcp_server_path)

## Build the agent

### Objects

In [None]:
from pydantic import BaseModel, Field
from typing import List, Literal

# Structured description of one observed problem. supervisor_agent reads the
# potential_symptom, affected_resource, resource_type and evidence attributes
# when it renders the "Symptoms Identified" section of its prompt.
class Symptom(BaseModel):
    """A symptom observed in the Kubernetes cluster"""
    # Short label for what was observed
    potential_symptom: str = Field(..., description="Type of symptom observed")
    # Restricted to the two literal values "pod" and "service"
    resource_type: Literal["pod", "service"] = Field(..., description="Type of resource experiencing the issue")
    # Name of the pod or service the symptom was seen on
    affected_resource: str = Field(..., description="Name of the resource experiencing the issue")
    # Free-text justification for the symptom
    evidence: str = Field(..., description="Evidence supporting this symptom identification")

In [None]:
# Output schema of the supervisor. It is bound to the LLM through
# with_structured_output(FinalReport) and is also constructed directly by
# supervisor_agent for the "no data" fallback report.
# NOTE(review): the docstring and Field descriptions are presumably forwarded to
# the model as part of the structured-output schema - treat them as prompt text
# and confirm before rewording.
class FinalReport(BaseModel):
    """The Final report created by the supervisor agent"""
    # Single root cause statement for the whole incident
    root_cause: str = Field(..., description="The identified root cause of the incident")
    # Resource names; display_final_report renders them as a markdown list
    affected_resources: List[str] = Field(..., description="List of all resources affected by the incident")
    # Condensed evidence across all RCA worker findings
    evidence_summary: str = Field(..., description="Summary of evidence from all RCA workers")
    # Narrative of how the investigation went and what it found
    investigation_summary: str = Field(..., description="Overview of the investigation process and findings")

In [None]:
from typing import TypedDict

# State carried through the supervisor graph; passed to StateGraph as its schema.
class TriageAgentState(TypedDict):
    # Application name, shown in the prompt header
    app_name: str
    # Free-text description of the application, shown in the prompt header
    app_summary: str
    # Symptoms to analyze; each one becomes its own section of the prompt
    symptoms: List[Symptom]
    # RCA worker outputs as plain dicts; each one is dumped as JSON into the prompt
    rca_analysis: List[dict]
    # Written by the supervisor node; run_supervisor_agent seeds it with None
    final_report: FinalReport

In [None]:
from langchain_openai import ChatOpenAI

# Base chat model used by the supervisor
gpt5mini = ChatOpenAI(model="gpt-5-mini")

# Same model wrapped with with_structured_output, so the chain result is expected
# to be a FinalReport (display_final_report reads its attributes)
llm_for_final_report = gpt5mini.with_structured_output(FinalReport)

In [None]:
import json
from langchain_core.prompts import ChatPromptTemplate

# Chat prompt for the supervisor: a fixed system message that sets the SRE role
# and the analysis steps, followed by a human message filled from the single
# `human_input` variable that supervisor_agent builds.
supervisor_prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are an expert Site Reliability Engineer analyzing RCA findings to determine the root cause of an incident.

Analyze all symptoms and investigation findings to:
1. Identify patterns and correlations across findings
2. Determine the primary root cause
3. List all affected resources
4. Summarize key evidence

Provide a clear, specific root cause statement that explains what caused the incident.""",
        ),
        # Only template variable; receives the markdown incident summary
        ("human", "{human_input}"),
    ]
)

In [None]:
def _build_supervisor_input(app_name, app_summary, symptoms: list, rca_analysis: list) -> str:
    """Render the incident context as the markdown text sent in the human message."""
    parts = [
        "# Incident Analysis Summary\n\n",
        f"- **Application**: {app_name}\n",
        f"- **Summary**: {app_summary}\n\n",
        "---\n\n",
    ]

    # One section per symptom
    if symptoms:
        parts.append("# Symptoms Identified\n\n")
        for i, symptom in enumerate(symptoms, 1):
            parts.extend((
                f"## Symptom {i}\n\n",
                f"**Type**: {symptom.potential_symptom}\n\n",
                f"**Resource**: `{symptom.affected_resource}` ({symptom.resource_type})\n\n",
                f"**Evidence**: {symptom.evidence}\n\n",
            ))
        parts.append("---\n\n")

    # One fenced JSON block per RCA worker result
    if rca_analysis:
        parts.append("# RCA Investigation Findings\n\n")
        for i, analysis in enumerate(rca_analysis, 1):
            # default=str is deliberate: this text is only read by the LLM, so a
            # value that is not JSON-native (datetime, custom object, ...) is
            # better shown as its string form than allowed to abort the run.
            findings_json = json.dumps(analysis, indent=2, default=str)
            parts.append(f"## Investigation {i}\n\n")
            parts.append(f"```json\n{findings_json}\n```\n\n")
        parts.append("---\n\n")

    parts.append("\n\nBased on all the above information, provide a comprehensive root cause diagnosis.")
    return "".join(parts)


def supervisor_agent(state: TriageAgentState):
    """Analyze all RCA findings and produce the final root cause diagnosis.

    Args:
        state: Graph state. Reads ``app_name``, ``app_summary``, ``symptoms``
            and ``rca_analysis``; missing keys fall back to empty values.

    Returns:
        A partial state update ``{"final_report": ...}``. When neither symptoms
        nor RCA findings are present, a placeholder ``FinalReport`` is returned
        without calling the LLM; otherwise the value is whatever the
        prompt-plus-structured-output chain returns.
    """
    symptoms = state.get("symptoms", [])
    rca_analysis = state.get("rca_analysis", [])

    # Nothing to reason about: skip the model call entirely
    if not rca_analysis and not symptoms:
        return {"final_report": FinalReport(
            root_cause="No analysis data available",
            affected_resources=[],
            evidence_summary="No symptoms or RCA analysis provided",
            investigation_summary="Investigation incomplete - insufficient data"
        )}

    human_input = _build_supervisor_input(
        app_name=state.get("app_name", ""),
        app_summary=state.get("app_summary", ""),
        symptoms=symptoms,
        rca_analysis=rca_analysis,
    )

    # Pipe the prompt into the structured-output model and run it once
    supervisor_chain = supervisor_prompt_template | llm_for_final_report
    final_report = supervisor_chain.invoke({"human_input": human_input})

    return {"final_report": final_report}

In [None]:
from langgraph.graph import START, END, StateGraph
from IPython.display import Image, display

# Build the supervisor graph: a single "supervisor" node wired START -> supervisor -> END
builder = StateGraph(TriageAgentState)
builder.add_node("supervisor", supervisor_agent)
builder.add_edge(START, "supervisor")
builder.add_edge("supervisor", END)

# Compile the builder into the graph object that the example cell passes to run_supervisor_agent
supervisor_graph = builder.compile()

# Visualize: render the graph as a Mermaid PNG and show it inline in the notebook
display(Image(supervisor_graph.get_graph(xray=True).draw_mermaid_png()))

In [None]:
import time
from IPython.display import Markdown

def run_supervisor_agent(graph, app_name: str, app_summary: str, symptoms: List[Symptom], rca_analysis: List[dict], trace_name: str | None = None):
    """Execute the supervisor graph once and time the run.

    Args:
        graph: Object exposing ``invoke(state, config)``, e.g. the compiled
            supervisor graph.
        app_name: Application name placed in the initial state.
        app_summary: Application description placed in the initial state.
        symptoms: Symptoms placed in the initial state.
        rca_analysis: RCA worker results placed in the initial state.
        trace_name: Optional run name; when truthy it is passed to the graph
            as ``run_name`` in the config.

    Returns:
        A ``(result, execution_time)`` tuple: whatever ``graph.invoke`` returned
        and the elapsed time of that call in seconds.
    """
    initial_state = {
        "app_name": app_name,
        "app_summary": app_summary,
        "symptoms": symptoms,
        "rca_analysis": rca_analysis,
        "final_report": None
    }

    # Configuration for the graph execution
    config: dict = {"recursion_limit": 50}
    if trace_name:
        config["run_name"] = trace_name

    # perf_counter is monotonic, so the measured duration cannot be skewed (or
    # go negative) if the system clock is adjusted while the graph is running.
    started = time.perf_counter()
    result = graph.invoke(initial_state, config)
    execution_time = time.perf_counter() - started

    return result, execution_time


def display_final_report(result):
    """Render the final RCA report as a Markdown object for notebook display.

    Args:
        result: Mapping with a ``"final_report"`` entry whose value exposes
            ``root_cause``, ``affected_resources``, ``evidence_summary`` and
            ``investigation_summary`` attributes.

    Returns:
        An ``IPython.display.Markdown`` object wrapping the rendered report.
    """
    report = result["final_report"]

    # Every resource needs its own "- " marker: with a single leading marker the
    # second and later lines are parsed as continuation text of the first item.
    resource_lines = "\n".join(f"- `{resource}`" for resource in report.affected_resources)
    if not resource_lines:
        # Avoid rendering a stray empty bullet when no resources were reported
        resource_lines = "_None identified_"

    md = f"""
# 📊 Final Root Cause Analysis Report

## Root Cause
{report.root_cause}

### Affected Resources
{resource_lines}

### Evidence Summary
{report.evidence_summary}

### Investigation Summary
{report.investigation_summary}
"""

    return Markdown(md)

In [None]:
# Example: Run supervisor agent with sample data

# Sample symptoms: two hand-written Symptom fixtures, one per affected pod
sample_symptoms = [
    Symptom(
        potential_symptom="Container repeatedly crashing (terminated with error)",
        resource_type="pod",
        affected_resource="geo-6b4b89b5f5-rsrh7",
        evidence="Container 'hotel-reserv-geo' reported 'Terminated With Error' (reason: Error) with exit_code=2 and restart_count=2 (pod_phase: Running)."
    ),
    Symptom(
        potential_symptom="High memory usage",
        resource_type="pod",
        affected_resource="recommendation-5f8d9c2b1a-xyz12",
        evidence="Memory usage at 92% of limit (1.84 GB / 2 GB)"
    )
]

# Sample RCA analysis data: one dict per investigation, targeting the same two
# pods as sample_symptoms. supervisor_agent dumps each dict as JSON into the prompt.
sample_rca_analysis = [
    {
        "task": "Investigate geo service pod crash",
        "target_resource": "geo-6b4b89b5f5-rsrh7",
        "resource_type": "pod",
        "findings": {
            "logs": "OutOfMemory killer triggered",
            "metrics": "Memory leak detected in service",
            "traces": "Unbounded query results accumulating in memory"
        }
    },
    {
        "task": "Investigate recommendation service memory",
        "target_resource": "recommendation-5f8d9c2b1a-xyz12",
        "resource_type": "pod",
        "findings": {
            "logs": "Repeated cache miss errors",
            "metrics": "Memory gradually increasing over time",
            "traces": "Cache invalidation not working properly"
        }
    }
]

# Run the supervisor agent
# Application description that ends up in the "Summary" line of the prompt
app_summary = """
The application implements a hotel reservation service, built with Go and gRPC. 
The service includes a geo service for location queries and a recommendation system.
"""

# Ask interactively for a name; an empty answer falls back to 'Supervisor analysis'.
# The value is forwarded as trace_name, which becomes run_name in the graph config.
experiment_name = input("Enter experiment name (or press Enter for 'Supervisor analysis'): ") or "Supervisor analysis"

result, exec_time = run_supervisor_agent(
    graph=supervisor_graph,
    app_name="Hotel Reservation Service",
    app_summary=app_summary,
    symptoms=sample_symptoms,
    rca_analysis=sample_rca_analysis,
    trace_name=experiment_name
)

print(f"✅ Analysis completed in {exec_time:.2f} seconds")

# Display the report: render result["final_report"] as markdown in the notebook output
display(display_final_report(result))