# RCA Agent

The RCA agent takes as input the affected resource and the symptom discovered by the triage agent, and performs the Root Cause Analysis.

In [None]:
from dotenv import load_dotenv
import os
# Resolve the repository root: two directory levels above the current working directory
root_dir = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))

# Read the variables declared in the .env file located at the repository root
load_dotenv(dotenv_path=os.path.join(root_dir, ".env"), verbose=True)

In [None]:
import sys

# Put the MCP-server directory (two levels up) first on the module search path
mcp_server_path = os.path.abspath(os.path.join(os.getcwd(), "..", "..", "MCP-server"))
sys.path.insert(0, mcp_server_path)

## MCP Server

List of available MCP servers:
- kubernetes: provides an interface to the Kubernetes CLI (kubectl)
- cluster_api: custom MCP server developed to interact with the cluster and inspect dependencies, traces, and metrics


In [None]:
from langchain_mcp_adapters.client import MultiServerMCPClient

# Client configured with the two MCP servers used by the agent
mcp_client = MultiServerMCPClient(
    {
        # Kubernetes MCP server: started with `npx mcp-server-kubernetes` and reached over stdio
        "kubernetes" : {
            "command": "npx",
            "args": ["mcp-server-kubernetes"],
            "transport": "stdio",
            "env": {
                # NOTE(review): the flag name suggests only read-only tools are exposed — confirm in the server's docs
                "ALLOW_ONLY_NON_DESTRUCTIVE_TOOLS": "true"
            }
        },
        # Custom cluster API server: reached over streamable HTTP on localhost:8000
        "cluster_api" : {
            "url": "http://localhost:8000/mcp",
            "transport": "streamable_http"
        }
    }
)

# Fetch the tools exposed by the configured servers (filtered against an allow-list in a later cell)
mcp_tools = await mcp_client.get_tools()

In [None]:
# Names of the tools the agent is allowed to use
k8s_tools_allowed = [
    "kubectl_get",
    "kubectl_describe",
    "explain_resource",
    "list_api_resources",
    "ping",
]
custom_tools_allowed = [
    "get_metrics",
    "get_metrics_range",
    "get_pods_from_service",
    "get_cluster_pods_and_services",
    "get_services_used_by",
    "get_dependencies",
    "get_logs",
    "get_traces",
    "get_trace",
]

tools_allowed = k8s_tools_allowed + custom_tools_allowed

# Keep only the MCP tools whose name is on the allow-list, preserving their original order
tools = [candidate for candidate in mcp_tools if candidate.name in tools_allowed]

In [None]:
# Print every allowed tool with its description.
# The prefix is the wrench emoji; the original literal was a mis-decoded copy of it.
for tool in tools:
    print(f"🔧 {tool.name}: {tool.description}")

## Build the agent

In [None]:
from langchain_openai import ChatOpenAI

# Chat model used both for the agent's tool-calling step and for the structured summariser
gpt5mini = ChatOpenAI(model="gpt-5-mini")

In [None]:
from pydantic import BaseModel, Field
from typing import TypedDict, Literal, Annotated, List
import operator

class RCATask(BaseModel):
    """A RCA task to be performed by the RCA agent"""
    # What the investigation has to determine; inserted into the agent prompt as the goal
    investigation_goal: str = Field(..., description="Goal of the investigation")
    # Name of the resource under investigation; inserted into the agent prompt as the target
    target_resource: str = Field(..., description="Name of the resource to investigate")
    # Restricted by the Literal type to either "pod" or "service"
    resource_type: Literal["pod", "service"] = Field(..., description="Type of resource being investigated")
    # Optional tool names to try first; defaults to an empty list when omitted
    suggested_tools: List[str] = Field(default_factory=list, description="List of tools suggested for the investigation")


In [None]:
from typing import List
from langgraph.graph.message import add_messages, AnyMessage

class RcaAgentState(TypedDict):
    """State carried between the nodes of the RCA agent graph."""
    # Conversation history; updates are merged with the add_messages reducer
    messages: Annotated[list[AnyMessage], add_messages]
    # Free-text description of the application, inserted into the agent prompt
    app_summary: str
    # The investigation task the agent is working on
    rca_task: RCATask
    # Findings accumulated so far; updates are concatenated (operator.add)
    insights: Annotated[list[str], operator.add]
    # Descriptions of the actions already taken; updates are concatenated (operator.add)
    prev_steps: Annotated[list[str], operator.add]
    # Final diagnosis data; set by submit_final_diagnosis and extended by format_response
    rca_analysis: dict

In [None]:
# Structured-output schema bound to the model with with_structured_output and used by the summarise node.
# NOTE(review): the docstring below is presumably forwarded to the model as the schema description,
# so it is kept verbatim — confirm before rewording it.
class UpdateAgentData(BaseModel):
    """
    Represents a step performed by the SRE agent.
    """
    # Appended to the state's `insights` list by the summarise node
    insight: str = Field(..., description="Most important new finding")
    # Appended to the state's `prev_steps` list by the summarise node
    prev_step: str = Field(..., description="Concise description of the most recent action taken")

In [None]:
# Prompt template for the rca-agent node. The placeholders ({app_summary}, {investigation_goal},
# {resource_type}, {target_resource}, {suggested_tools}, {prev_steps}, {insights}) are filled
# with str.format inside rcaAgent.
rca_agent_prompt = """
    You are an expert DevOps engineer performing Root Cause Analysis on a Kubernetes service.

    Service: {app_summary}

    Investigation Task:
    - **Goal**: {investigation_goal}
    - **Target**: {resource_type} named '{target_resource}'
    - **Suggested Tools**: {suggested_tools}

    Your objective is to ACCOMPLISH the investigation goal by identifying the root cause that directly answers it.

    Investigation Context:
    *Previous Steps:* {prev_steps}
    *Insights:* {insights}

    Instructions:
    1. Use available tools (kubectl, metrics, logs, traces, dependencies) to investigate the target resource
    2. Follow the evidence trail to identify the root cause that explains the investigation goal
    3. When you have sufficient evidence, call submit_final_diagnosis with:
       - diagnosis: The root cause (must directly address the investigation goal)
       - reasoning: How your diagnosis answers the investigation goal

    IMPORTANT: Do not submit until your diagnosis directly explains what the investigation goal asked you to determine.
"""

In [None]:
# Prompt template for the summarise node. The placeholders ({insights}, {prev_steps},
# {last_messages}) are filled with str.format inside summarise; the reply is parsed into
# UpdateAgentData (one insight and one step description).
summarise_prompt = """
    You are an autonomous SRE agent performing Root Cause Analysis on a Kubernetes incident.

    Context:

    Previous Insights: 
    {insights}
    
    Previous Steps:
    {prev_steps}

    Below are the latest messages (tool calls and/or tool responses - may include parallel executions):
    {last_messages}

    Instructions:
    1. **Extract the key insight**: Identify the most important NEW finding from all the latest messages that helps diagnose the root cause. Focus on:
       - Anomalies or unusual patterns
       - Resource states that could cause issues
       - Dependencies or relationships discovered
       - Error messages or failure indicators
       - Patterns across multiple tool responses (in case of parallel calls)
       If the tool calls failed or returned no useful data, note this as the insight.
    
    2. **Describe the actions taken**: Write a concise description of what tools were called and what resources were examined.
       Format: "Checked [resource/metric] using [tool_name]" (list all tools if multiple parallel calls)
       Example for parallel: "Checked pod logs and dependencies using get_logs and get_dependencies"

    Keep both responses under 150 characters each. Be specific and actionable.
"""

In [None]:
def get_insights_str(state):
    """Format the insights gathered so far as a bulleted string, or a placeholder when there are none."""
    collected = state["insights"]
    if not collected:
        return "No insights yet"
    # The leading empty item makes the result start with "\n- ", so every insight gets its own bullet
    return "\n- ".join(["", *collected])
    
def get_prev_steps_str(state):
    """Format the steps performed so far as a bulleted string, or a placeholder when there are none."""
    performed = state["prev_steps"]
    if not performed:
        return "No previous steps yet"
    # The leading empty item makes the result start with "\n- ", so every step gets its own bullet
    return "\n- ".join(["", *performed])

In [None]:
# Model wrapper that returns its answer as an UpdateAgentData instance; used by the summarise node
llm_with_strct_output = gpt5mini.with_structured_output(UpdateAgentData)

In [None]:
from langchain_core.messages import HumanMessage, AIMessage

# Node used to summarise the infos given the latest messages (handles parallel tool calls)
async def summarise(state: RcaAgentState):
    """Condense the latest tool interaction into one insight and one step description.

    The messages from the most recent AIMessage onwards (the tool call plus every
    tool response that followed it) are placed in ``summarise_prompt`` together
    with the insights and steps collected so far, and the structured-output model
    is asked for an ``UpdateAgentData``.

    Args:
        state: Current graph state; reads ``messages``, ``insights`` and ``prev_steps``.

    Returns:
        A state update with a one-element ``insights`` list and a one-element
        ``prev_steps`` list.
    """
    messages = state["messages"]

    # Index of the last AI message (the one carrying the tool calls), searching from the end
    last_ai_idx = next(
        (i for i in reversed(range(len(messages))) if isinstance(messages[i], AIMessage)),
        None,
    )

    # Take everything from that AI message onwards so all parallel tool responses are included
    if last_ai_idx is not None:
        last_messages = messages[last_ai_idx:]
    else:
        last_messages = messages[-2:]  # Fallback to last 2 messages

    prompt = HumanMessage(content=summarise_prompt.format(
        prev_steps=get_prev_steps_str(state),
        insights=get_insights_str(state),
        last_messages=last_messages,
    ))

    # Await the async variant: the blocking `.invoke` would stall the event loop inside this async node
    data = await llm_with_strct_output.ainvoke([prompt])

    return {"insights": [data.insight], "prev_steps": [data.prev_step]}  # type: ignore

In [None]:
from typing import Annotated
from langgraph.types import Command
from langchain_core.messages import ToolMessage
from langchain_core.tools import tool, InjectedToolCallId

# Tool used to submit the final response.
# NOTE(review): with @tool the docstring below is presumably used as the tool description
# shown to the model, so it is kept verbatim — confirm before rewording it.
@tool
def submit_final_diagnosis(
    diagnosis: str, 
    reasoning: str,
    tool_call_id: Annotated[str, InjectedToolCallId]
) -> Command:
    """
    Submit the final diagnosis when investigation is complete.
    
    Args:
        diagnosis: The issue you have identified (without fixing it)
        reasoning: Your reasoning and thought process behind the diagnosis (keep it concise)
    
    Returns:
        Command to update state and end workflow
    """
    # Payload stored in the state's `rca_analysis` field
    final_response = {
        "diagnosis" : diagnosis,
        "reasoning" : reasoning
    }
    
    return Command(
        update={
            "rca_analysis": final_response,
            # Tool response tied to the originating call through tool_call_id
            "messages": [
                ToolMessage(
                    content="Final diagnosis submitted successfully. Investigation complete.",
                    tool_call_id=tool_call_id
                )
            ]
        },
        goto="format-output" # End the loop cycle
    )

In [None]:
# Extend the filtered MCP tools with the submission tool so the agent can end the investigation
completion_tool = submit_final_diagnosis
tools_with_completion = [*tools, completion_tool]

In [None]:
async def rcaAgent(state: RcaAgentState):
    """Ask the model for the next investigation action.

    Fills ``rca_agent_prompt`` with the task details, the application summary and
    the insights and steps gathered so far, then calls the chat model with the MCP
    tools and the submission tool bound to it (parallel tool calls disabled).

    Args:
        state: Current graph state; reads ``rca_task``, ``app_summary``,
            ``insights`` and ``prev_steps``.

    Returns:
        A state update whose ``messages`` list contains the model's reply.
    """
    task = state["rca_task"]
    # Give the model an explicit instruction when the task suggests no tools
    suggested_tools_str = ", ".join(task.suggested_tools) if task.suggested_tools else "Use your best judgment"

    prompt = HumanMessage(content=rca_agent_prompt.format(
        prev_steps=get_prev_steps_str(state),
        insights=get_insights_str(state),
        app_summary=state["app_summary"],
        investigation_goal=task.investigation_goal,
        resource_type=task.resource_type,
        target_resource=task.target_resource,
        suggested_tools=suggested_tools_str,
    ))

    llm_with_completion_tools = gpt5mini.bind_tools(tools_with_completion, parallel_tool_calls=False)

    # Await the async variant: the blocking `.invoke` would stall the event loop inside this async node
    response = await llm_with_completion_tools.ainvoke([prompt])
    return {"messages": [response]}

In [None]:
from langchain_core.messages import AIMessage
from collections import Counter

def count_tool_calls(messages):
    """Count how many times each tool was requested across the AI messages.

    Only ``AIMessage`` instances are inspected. Tool names are read from the raw
    ``additional_kwargs["tool_calls"]`` payload (entries shaped like
    ``{"function": {"name": ...}}``). When a message carries no such payload, the
    message's ``tool_calls`` attribute (entries with a ``"name"`` key) is used
    instead, so calls are still counted when the raw payload is absent.

    Args:
        messages: Iterable of conversation messages, e.g. ``state["messages"]``.

    Returns:
        dict mapping each tool name to the number of times it was called.
    """
    counts = Counter()

    for msg in messages:
        if not isinstance(msg, AIMessage):
            continue

        # `or` guards against a missing attribute as well as an explicit None value
        raw_calls = (getattr(msg, "additional_kwargs", None) or {}).get("tool_calls") or []

        if raw_calls:
            names = [
                call["function"]["name"]
                for call in raw_calls
                if "function" in call and "name" in call["function"]
            ]
        else:
            # Fallback: parsed tool calls exposed directly on the message
            parsed_calls = getattr(msg, "tool_calls", None) or []
            names = [call["name"] for call in parsed_calls if "name" in call]

        counts.update(names)

    return dict(counts)

In [None]:
async def format_response(state: RcaAgentState):
    """Assemble the final RCA report from the submitted diagnosis and the run history.

    The report is built as a new dict: the existing ``rca_analysis`` entries are
    copied rather than modified in place, so the dict held in the incoming state
    is not mutated by this node.

    Args:
        state: Current graph state; reads ``rca_analysis``, ``rca_task``,
            ``insights``, ``prev_steps`` and ``messages``.

    Returns:
        A state update whose ``rca_analysis`` holds the existing entries plus
        ``task``, ``insights``, ``steps_performed`` and ``tools_stats``.
    """
    task = state["rca_task"]

    final_report = {
        **state["rca_analysis"],
        "task": {
            "investigation_goal": task.investigation_goal,
            "target_resource": task.target_resource,
            "resource_type": task.resource_type,
            "suggested_tools": task.suggested_tools,
        },
        "insights": state["insights"],
        "steps_performed": state["prev_steps"],
        # Per-tool call counts derived from the conversation history
        "tools_stats": count_tool_calls(state["messages"]),
    }

    return {"rca_analysis": final_report}

In [None]:
from langgraph.graph import START, END, StateGraph
from langgraph.prebuilt import tools_condition, ToolNode
from IPython.display import Image, display

# Build the graph
builder = StateGraph(RcaAgentState)

# Add nodes
builder.add_node("rca-agent", rcaAgent) # Node that asks the model for the next action
builder.add_node("tools", ToolNode(tools_with_completion)) # Tool node is executing the tool called in the previous message
builder.add_node("summarise", summarise) # Node to reduce the raw data into a schema
builder.add_node("format-output", format_response) # Node that assembles the final report

# Add edges
builder.add_edge(START, "rca-agent")

# Conditional edge from rca-agent
builder.add_conditional_edges(
    "rca-agent",
    # tools_condition routes to the ToolNode if the last message has tool calls. Otherwise, it routes to the end.
    tools_condition,
)

# After tools, decide whether to summarise or end
def after_tools_condition(state: RcaAgentState):
    """Choose the node that follows "tools": "format-output" once a diagnosis is recorded, else "summarise"."""
    analysis = state.get("rca_analysis")

    # A missing or empty analysis means no diagnosis has been submitted yet
    if not analysis:
        return "summarise"

    # The investigation is complete only when both fields are present
    is_complete = "diagnosis" in analysis and "reasoning" in analysis
    return "format-output" if is_complete else "summarise"

# Route out of "tools" using after_tools_condition; the mapping ties each returned label to a node name
builder.add_conditional_edges(
    "tools",
    after_tools_condition,
    {
        "summarise": "summarise",
        "format-output": "format-output"
    }
)

# After summarise, continue investigation (go to rca-agent)
builder.add_edge("summarise", "rca-agent")
# The formatted report is the last step of the workflow
builder.add_edge("format-output", END)

# Compile the graph
structured_graph = builder.compile()

# Show the graph (rendered as a Mermaid PNG)
display(Image(structured_graph.get_graph(xray=True).draw_mermaid_png()))

In [None]:
import time

async def test_structured_graph(graph, app_summary: str, rca_task: RCATask, human_message: str = "", trace_name: str | None = None):
    """Run the graph once from a fresh RcaAgentState and return the final state.

    Args:
        graph: Compiled graph exposing ``ainvoke``.
        app_summary: Description of the application, stored under ``app_summary``.
        rca_task: Task to investigate, stored under ``rca_task``.
        human_message: Optional text; when non-empty it seeds ``messages`` with a
            single HumanMessage, otherwise ``messages`` starts empty.
        trace_name: Optional value for the ``run_name`` entry of the run config.

    Returns:
        The value returned by ``graph.ainvoke`` for the initial state.
    """
    # Initial state: empty history and accumulators, no analysis yet
    initial_state = {
        "messages": [HumanMessage(content=human_message)] if human_message else [],
        "rca_task": rca_task,
        "insights": [],
        "prev_steps": [],
        "rca_analysis": {},
        "app_summary": app_summary,
    }

    # Configuration for the graph execution
    config = {"recursion_limit": 50}
    if trace_name:
        config["run_name"] = trace_name # type: ignore

    # Invoke the graph asynchronously
    return await graph.ainvoke(initial_state, config)

In [None]:
# Ask for an experiment name; a blank (or whitespace-only) answer falls back to a default
experiment_name = input("Enter experiment name: ")

if not experiment_name.strip():
    experiment_name = "RCA agent"

In [None]:
# Description of the application under investigation; inserted into the agent prompt as the service summary
app_summary = """
    The application implements a hotel reservation service, build with Go and gRPC. The initial project is extended in several ways, including adding back-end in-memory and persistent databases, adding a recommender system for obtaining hotel recommendations, and adding the functionality to place a hotel reservation. 
"""

In [None]:
# Example task: investigate why the geo pod's container exits with code 2 and restarts
rca_task = RCATask(
    investigation_goal="Collect recent stdout/stderr logs from the 'hotel-reserv-geo' container in pod geo-6b4b89b5f5-rsrh7 to identify the runtime error(s) that produced exit_code=2 and triggered restarts.",
    target_resource="geo-6b4b89b5f5-rsrh7",
    resource_type="pod",
    # Tools suggested to the agent for this task
    suggested_tools=["get_logs", "get_traces", "get_metrics_range"]
)

In [None]:
human = ""
rca_agent_output = await test_structured_graph(structured_graph, app_summary, rca_task, trace_name=experiment_name)

In [None]:
# Show the raw report dict produced by the run
rca_agent_output["rca_analysis"]

In [None]:
from IPython.display import Markdown, display

def display_rca_analysis(rca_analysis):
    """Render an RCA report dict as a Markdown object.

    Args:
        rca_analysis: Report dict with the keys ``task`` (containing
            ``investigation_goal``, ``resource_type``, ``target_resource`` and
            ``suggested_tools``), ``diagnosis``, ``reasoning``,
            ``steps_performed``, ``insights`` and ``tools_stats``.

    Returns:
        A ``Markdown`` object containing the formatted report.
    """
    task = rca_analysis['task']
    steps = rca_analysis['steps_performed']
    tools_stats = rca_analysis['tools_stats']
    suggested = ', '.join(task['suggested_tools']) if task['suggested_tools'] else 'None'

    # Header section. The heading emoji replace mis-decoded characters in the original literal.
    # The two trailing spaces after the Goal and Resource Type lines are Markdown hard line breaks.
    header = f"""
# 🔍 Root Cause Analysis Report

---

## 📋 Investigation Task

**Goal:** {task['investigation_goal']}  
**Resource Type:** {task['resource_type']}  
**Target Resource:** `{task['target_resource']}`

**Suggested Tools:**
- {suggested}

---

## 🎯 Diagnosis

{rca_analysis['diagnosis']}

---

## 💡 Reasoning

{rca_analysis['reasoning']}

---

## 🔎 Investigation Details

### Steps Performed ({len(steps)} steps)

"""

    # Collect the fragments and join once instead of growing a string with repeated +=
    parts = [header]
    parts.extend(f"{i}. {step}\n" for i, step in enumerate(steps, 1))

    parts.append("\n### Key Insights\n\n")
    parts.extend(f"{i}. {insight}\n" for i, insight in enumerate(rca_analysis['insights'], 1))

    parts.append("\n---\n\n## 📊 Tool Usage Statistics\n\n")
    total_calls = sum(tools_stats.values())
    parts.append(f"**Total Tool Calls:** {total_calls}\n\n")

    # Most frequently used tools first
    ranked = sorted(tools_stats.items(), key=lambda item: item[1], reverse=True)
    parts.extend(f"- `{name}`: ({count})\n" for name, count in ranked)

    return Markdown("".join(parts))

In [None]:
# Render the formatted Markdown report in the notebook
display(display_rca_analysis(rca_agent_output["rca_analysis"]))

In [None]:
# Print every message of the run, in order
for m in rca_agent_output['messages']:
    m.pretty_print()