In [3]:
import os
import requests
import json
import uuid
from datetime import datetime, timedelta

from crewai import Agent, Task, Crew, Process
from crewai.flow import Flow, start, listen, router
from crewai.tools import BaseTool
from pydantic import BaseModel, Field, ValidationError
from typing import List, Dict, Optional, Any, Tuple, Type
from dataclasses import dataclass
from config import (
    OPENROUTER_API_KEY,
    OPENROUTER_MODEL,
    AGENT_VERBOSE,
    MAX_AGENT_ITERATIONS,
)
from langchain_openai import ChatOpenAI

In [6]:
MAX_RETRIES = MAX_AGENT_ITERATIONS  # Max retries for validation failures
PROMETHEUS_BASE_URL = os.environ.get("PROMETHEUS_URL", "http://localhost:9090")

llm = ChatOpenAI(
    model=OPENROUTER_MODEL,
    openai_api_key=OPENROUTER_API_KEY,
    openai_api_base="https://openrouter.ai/api/v1",
)

In [7]:
class PromParams(BaseModel):
    start: Optional[int] = None  # Unix timestamp
    end: Optional[int] = None    # Unix timestamp
    step: Optional[str] = "1m"  # Step duration like '1m', '5m'

class PromQLPlan(BaseModel):
    query: str
    query_type: str = "query_range"
    parameters: PromParams = None

class PromQLWorkflowState(BaseModel):
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    original_query: str = ""
    promql_plan: PromQLPlan = None
    validation_result: bool = None
    validation_feedback: Optional[str] = None
    final_prometheus_result: Optional[Dict] = None
    final_answer: Optional[str] = None
    retry_count: int = 0
    next_task: Optional[str] = None


In [12]:
def PrometheusQueryTool(plan: Dict) -> Dict:
    try:
        # Use Pydantic model for robust parsing and validation of the plan
        query_plan = PromQLPlan(**plan)
        print(
            f"--- TOOL: Executing PromQL plan: {query_plan.model_dump_json()} ---"
        )

        url = f"{PROMETHEUS_BASE_URL}/api/v1/{query_plan.query_type}"

        # Build params, including dynamic start/end for range queries
        params = {"query": query_plan.query}
        if query_plan.query_type == "query_range":
            if query_plan.parameters:
                # A real implementation would parse timeframes like "15m"
                # For simplicity, we'll assume they are pre-calculated.
                params.update(query_plan.parameters)
            else:  # Default to last 15 minutes if not specified
                end_time = datetime.now(datetime.timezone.utc)
                start_time = end_time - timedelta(minutes=15)
                params["start"] = start_time.isoformat() + "Z"
                params["end"] = end_time.isoformat() + "Z"
                params["step"] = "1m"
        
        print(f"--- TOOL: Querying Prometheus at {url} with params {params} ---")

        response = requests.get(url, params=params, timeout=60)
        response.raise_for_status()
        return response.json()
    except Exception as e:
        import traceback
        return {"error": True, "message": f"failed: {e} {traceback.format_exc()}"}

In [23]:
from datetime import datetime, timezone, timedelta

plan = PromQLPlan(
    query="node_memory_MemFree_bytes",
    query_type="query_range",
    parameters= PromParams(
        start= int((datetime.now(timezone.utc) - timedelta(minutes=15)).timestamp()),
        end= int(datetime.now(timezone.utc).timestamp()),
        step="1m"
    ),
).model_dump()

tool_result = PrometheusQueryTool(plan)
print(tool_result)

--- TOOL: Executing PromQL plan: {"query":"node_memory_MemFree_bytes","query_type":"query_range","parameters":{"start":1756913823,"end":1756914723,"step":"1m"}} ---
--- TOOL: Querying Prometheus at http://localhost:9090/api/v1/query_range with params {'query': 'node_memory_MemFree_bytes', 'start': 1756913823, 'end': 1756914723, 'step': '1m'} ---
{'status': 'success', 'data': {'resultType': 'matrix', 'result': [{'metric': {'__name__': 'node_memory_MemFree_bytes', 'instance': 'node-exporter:9100', 'job': 'node-exporter'}, 'values': [[1756914363, '458301440'], [1756914423, '445927424'], [1756914483, '445669376'], [1756914543, '445218816'], [1756914603, '463351808'], [1756914663, '462897152'], [1756914723, '458563584']]}]}}


In [None]:
PlanningAgent = Agent(
    verbose = AGENT_VERBOSE,
    role="planner",
    goal="Convert natural language query from user into detailed PromQL",
    llm=llm,
    backstory="The PlanningAgent is designed to bridge the gap "
    "between human intent and machine execution. "
    "It interprets user queries, identifies the underlying requirements, "
    "and formulates a precise PromQL query. ",
    max_iter= MAX_AGENT_ITERATIONS
)

AnalyzerAgent = Agent(
    verbose = AGENT_VERBOSE,
    role="analyzer",
    goal="Analyze Prometheus query results, extract meaningful insights, "
    "and provide actionable recommendations based on the data.",
    llm=llm,
    # tools=[FileStorageTool()],
    backstory="The AnalyzerAgent is the system's intelligence layer. "
    "It processes the raw results from Prometheus, identifies patterns, "
    "and generates insights that are both actionable and easy to understand. "
    "This agent ensures that users can make informed decisions based on the data.",
    max_iter= MAX_AGENT_ITERATIONS
)
# --- 6. Task and Flow Definition ---
# Define tasks with unique names
PlanTask = Task(
    description ="""
    Plan PromQL query based on user's natural language input {original_query}. 
    This task involves interpreting the user's intent,
    identifying the appropriate PromQL constructs.
    STRICTLY use the unix timestamp for any time parameters. 
    In the output json, if the query_type is "query_range",
    include "start", "end" (both as unix timestamps) and "step" in parameters.
    """,
    agent=PlanningAgent,
    output_json=PromQLPlan,
    expected_output="The output should be a JSON object containing "
    "the PromQL query string with parameters, the query type (e.g., 'query', 'query_range'), " 
    "parameters are optional and different per query_type like 'start', 'end', 'step', 'time', etc "
    "and an optional step size if applicable."
)

AnalyzeTask = Task(
    description=""" For given user's query: {original_query}, 
    Analyze the results of the PromQL query. 
    ```json
    {prometheus_results}
    ```
    This task processes the raw data, identifies key insights, and
    generates a summary that includes statistical analysis and actionable recommendations. The goal is to
    provide users with a clear understanding of the data and its implications.
    """,
    agent=AnalyzerAgent,
    expected_output="The output should be in easy to understand language " 
    "Provide some statistics bullet points, "
    "and a list of actionable recommendations. "
    "Do not give too long answers, keep it concise and to the point."
)

In [None]:
original_query = "Get me the CPU usage for 1st Sept 2025, 7 to 8 AM IST"

def generate_query() -> Dict:
    """Start by analyzing the user query."""
    coding_task, coding_agent = PlanTask, PlanningAgent
    # print(f"{'='*20}",coding_task, coding_agent, f"{'='*20}")
    global original_query
    print(f"--- FLOW: Starting analysis for query: {original_query} ---")
    
    # Create crew for this task
    generate_query_crew = Crew(
        agents=[coding_agent],
        tasks=[coding_task],
        process=Process.sequential,
        verbose=True,
    )
    print("created crew ")
    result = generate_query_crew.kickoff(inputs={"original_query": original_query})
    print(result)
    return result

result = generate_query()

--- FLOW: Starting analysis for query: Get me the CPU usage for Sept 1 2025, 7 to 8 AM IST ---
created crew 


{'query': '(1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100', 'query_type': 'query_range', 'parameters': {'start': 1756776600, 'end': 1756780200, 'step': '60s'}}


In [45]:
# promql_plan
promql_plan = PromQLPlan(**json.loads(result.raw))
promql_plan

PromQLPlan(query='(1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100', query_type='query_range', parameters=PromParams(start=1756776600, end=1756780200, step='60s'))

In [47]:
plan = promql_plan.model_dump_json()
print(plan, type(plan))

{"query":"(1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100","query_type":"query_range","parameters":{"start":1756776600,"end":1756780200,"step":"60s"}} <class 'str'>


In [66]:
pm_result = PrometheusQueryTool(json.loads(plan))

print(pm_result, type(pm_result))

--- TOOL: Executing PromQL plan: {"query":"(1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100","query_type":"query_range","parameters":{"start":1756776600,"end":1756780200,"step":"60s"}} ---
--- TOOL: Querying Prometheus at http://localhost:9090/api/v1/query_range with params {'query': '(1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100', 'start': 1756776600, 'end': 1756780200, 'step': '60s'} ---
{'status': 'success', 'data': {'resultType': 'matrix', 'result': [{'metric': {'instance': 'node-exporter:9100'}, 'values': [[1756779180, '94.785745'], [1756779240, '74.69500861904761'], [1756779300, '54.97508810256411'], [1756779360, '35.44477830555491'], [1756779420, '15.690193626666659'], [1756779480, '1.556648666542515'], [1756779540, '1.5564655172413677'], [1756779600, '1.4711206896551587'], [1756779660, '1.30741830144242'], [1756779720, '1.3171506305150515'], [1756779780, '1.3086206896551489'], [1756779840, '1.504153491243032'], [175

In [64]:
pm_result={"success": False, "message": "failed"}

In [65]:
if pm_result.get("success", True):
    print("analyze_results")
else:
    print("handle_failure")

handle_failure


In [74]:
analysis_task, analysis_agent = AnalyzeTask, AnalyzerAgent
prometheus_results = pm_result
analysis_crew = Crew(
            agents=[analysis_agent],
            tasks=[analysis_task],
            process=Process.sequential,
            verbose=True,
        )
an_result = analysis_crew.kickoff(
    inputs={
        "prometheus_results": prometheus_results, 
        "original_query": original_query,
        }
    )

print(an_result, type(an_result))

**CPU Usage Analysis for September 1, 2025 (7:00–8:00 AM IST)**  
*Note: Data covers 6:43–7:00 AM IST (17 minutes) due to query constraints. Timestamps align with UTC+5:30 (IST).*  

**Key Statistics**:  
- **Peak Usage**: 94.79% (7:13 AM IST) – critical threshold exceeded.  
- **Lowest Usage**: 1.31% (7:23 AM IST) – stable baseline.  
- **Average Usage**: 16.67% – skewed by initial spike.  
- **Trend**: Sharp decline from 94.79% → 1.31% within 10 minutes, followed by gradual rise to 2.74% by 7:30 AM IST.  
- **Duration of High Load**: 5 minutes (7:13–7:18 AM IST) above 15% usage.  

**Critical Observations**:  
1. **Critical Spike**: CPU usage surged to 94.79% at the start of the window, indicating a resource-intensive process (e.g., batch job, backup, or unoptimized service).  
2. **Rapid Recovery**: Drop to 1.5% within 10 minutes suggests the process completed or was terminated abruptly.  
3. **Stable Baseline**: Sustained sub-3% usage for 12 minutes confirms normal idle state.  
4.

In [75]:
from IPython.display import display, Markdown, Latex
display(Markdown(an_result.raw))

**CPU Usage Analysis for September 1, 2025 (7:00–8:00 AM IST)**  
*Note: Data covers 6:43–7:00 AM IST (17 minutes) due to query constraints. Timestamps align with UTC+5:30 (IST).*  

**Key Statistics**:  
- **Peak Usage**: 94.79% (7:13 AM IST) – critical threshold exceeded.  
- **Lowest Usage**: 1.31% (7:23 AM IST) – stable baseline.  
- **Average Usage**: 16.67% – skewed by initial spike.  
- **Trend**: Sharp decline from 94.79% → 1.31% within 10 minutes, followed by gradual rise to 2.74% by 7:30 AM IST.  
- **Duration of High Load**: 5 minutes (7:13–7:18 AM IST) above 15% usage.  

**Critical Observations**:  
1. **Critical Spike**: CPU usage surged to 94.79% at the start of the window, indicating a resource-intensive process (e.g., batch job, backup, or unoptimized service).  
2. **Rapid Recovery**: Drop to 1.5% within 10 minutes suggests the process completed or was terminated abruptly.  
3. **Stable Baseline**: Sustained sub-3% usage for 12 minutes confirms normal idle state.  
4. **Late Increase**: Gradual rise to 2.74% may indicate background services ramping up (e.g., scheduled tasks).  

**Actionable Recommendations**:  
1. **Investigate Root Cause**:  
   - Check system logs (e.g., `journalctl`, `top` history) at **7:13 AM IST** for processes consuming 94.79% CPU. Prioritize identifying if this was a one-time anomaly or recurring task (e.g., cron job).  
2. **Optimize High-Load Processes**:  
   - If the spike is intentional (e.g., nightly backup), reschedule it to off-peak hours or throttle CPU allocation using `cpulimit` or Kubernetes resource limits.  
3. **Implement Alerting**:  
   - Configure Prometheus alerts for CPU >90% for >2 minutes to detect future anomalies. Example rule: `alert: HighCPU, expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90`.  
4. **Validate System Resilience**:  
   - Test failover mechanisms during high-CPU scenarios to ensure service continuity (e.g., auto-scaling groups, redundant nodes).  
5. **Extend Monitoring Window**:  
   - Refine the PromQL query to capture the full 7:00–8:00 AM IST window (e.g., adjust `start`/`end` parameters) to rule out missing data.  

**Urgency Assessment**:  
- **High Priority**: The 94.79% spike risks system instability. Immediate log review is critical to prevent recurrence.  
- **Low Risk**: Post-spike stability (1.3–2.7% usage) confirms no ongoing degradation.  

*Recommend closing the loop within 24 hours by validating logs and adjusting schedules if needed.*

In [None]:
import json
raw_result = """{"query": "(node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes) / node_memory_MemTotal_bytes * 100", "query_type": "query_range", "parameters": {"start": 1718698500, "end": 1718700000, "step": "1m"}}"""
json.loads(raw_result)

dict

In [None]:
import pickle
# fs = open("final_state.pkl", 'rb')
with open("final_state.pkl", 'rb') as fs:
    final_state = pickle.load(fs)

In [10]:
final_state_raw = "{\"id\":\"e682fbb8-aaef-48d2-8b4f-4039b708ae6f\",\"original_query\":\"Analyze the Memory usage for the last 15 minutes. current epoch is 1756921654\",\"promql_plan\":{\"query\":\"node_memory_MemTotal_bytes - node_memory_MemFree_bytes\",\"query_type\":\"query_range\",\"parameters\":{\"start\":1756920754,\"end\":1756921654,\"step\":\"15s\"}},\"validation_result\":true,\"validation_feedback\":\"Query executed successfully.\",\"final_prometheus_result\":{\"status\":\"success\",\"data\":{\"resultType\":\"matrix\",\"result\":[{\"metric\":{\"instance\":\"node-exporter:9100\",\"job\":\"node-exporter\"},\"values\":[[1756920754,\"1599045632\"],[1756920769,\"1603207168\"],[1756920784,\"1603465216\"],[1756920799,\"1603825664\"],[1756920814,\"1603825664\"],[1756920829,\"1603710976\"],[1756920844,\"1604911104\"],[1756920859,\"1605726208\"],[1756920874,\"1605726208\"],[1756920889,\"1607254016\"],[1756920904,\"1606582272\"],[1756920919,\"1606725632\"],[1756920934,\"1603305472\"],[1756920949,\"1603305472\"],[1756920964,\"1603641344\"],[1756920979,\"1608609792\"],[1756920994,\"1609478144\"],[1756921009,\"1612660736\"],[1756921024,\"1612660736\"],[1756921039,\"1612660736\"],[1756921054,\"1610813440\"],[1756921069,\"1607344128\"],[1756921084,\"1605992448\"],[1756921099,\"1599381504\"],[1756921114,\"1594347520\"],[1756921129,\"1600790528\"],[1756921144,\"1601224704\"],[1756921159,\"1601998848\"],[1756921174,\"1602088960\"],[1756921189,\"1600659456\"],[1756921204,\"1600409600\"],[1756921219,\"1600409600\"],[1756921234,\"1602244608\"],[1756921249,\"1602412544\"],[1756921264,\"1601986560\"],[1756921279,\"1603215360\"],[1756921294,\"1603076096\"],[1756921309,\"1606705152\"],[1756921324,\"1605992448\"],[1756921339,\"1607192576\"],[1756921354,\"1607081984\"],[1756921369,\"1611538432\"],[1756921384,\"1612242944\"],[1756921399,\"1614049280\"],[1756921414,\"1614565376\"],[1756921429,\"1611370496\"],[1756921444,\"1603612672\"],[1756921459,\"1606074368\"],[1756921474,\"1606332416\"],[1756921489,\"1603600384\"],[1756921504,\"1604632576\"],[1756921519,\"1604505600\"],[1756921534,\"1604423680\"],[1756921549,\"1599897600\"],[1756921564,\"1599897600\"],[1756921579,\"1595408384\"],[1756921594,\"1590022144\"],[1756921609,\"1585934336\"],[1756921624,\"1590128640\"],[1756921639,\"1590218752\"],[1756921654,\"1590476800\"]]}]}},\"final_answer\":\"**Key Insights:**  \\n- Memory usage peaked at ~1.505 GB (1,614,565,376 bytes) at 2025-09-05 10:03:34 (UTC) and dipped to a minimum of ~1.479 GB (1,585,934,336 bytes) at 2025-09-05 10:10:09.  \\n- Fluctuations are minor (~26 MB range) with no sustained upward or downward trend, indicating stable memory usage over the 15-minute window.  \\n- Recent values (~1.48 GB) show a slight decrease from the peak but remain within normal variation.  \\n\\n**Actionable Recommendations:**  \\n1. Investigate processes active during the peak (10:03:34) to confirm if the spike was expected (e.g., batch jobs, deployments).  \\n2. Ensure system memory capacity is sufficient to accommodate brief peaks, especially if close to hardware limits.  \\n3. Monitor for recurring spikes at similar intervals; schedule resource-intensive tasks during off-peak hours if patterns emerge.  \\n4. Cross-reference with CPU and disk metrics during the peak period to identify correlated resource stress.  \\n5. If memory usage trends upward in future data, consider optimizing memory-heavy applications or scaling infrastructure.\",\"retry_count\":1,\"next_task\":null}"

In [11]:
import json
final_state_json = json.loads(final_state_raw)
print(final_state_raw, type(final_state_raw))

{"id":"e682fbb8-aaef-48d2-8b4f-4039b708ae6f","original_query":"Analyze the Memory usage for the last 15 minutes. current epoch is 1756921654","promql_plan":{"query":"node_memory_MemTotal_bytes - node_memory_MemFree_bytes","query_type":"query_range","parameters":{"start":1756920754,"end":1756921654,"step":"15s"}},"validation_result":true,"validation_feedback":"Query executed successfully.","final_prometheus_result":{"status":"success","data":{"resultType":"matrix","result":[{"metric":{"instance":"node-exporter:9100","job":"node-exporter"},"values":[[1756920754,"1599045632"],[1756920769,"1603207168"],[1756920784,"1603465216"],[1756920799,"1603825664"],[1756920814,"1603825664"],[1756920829,"1603710976"],[1756920844,"1604911104"],[1756920859,"1605726208"],[1756920874,"1605726208"],[1756920889,"1607254016"],[1756920904,"1606582272"],[1756920919,"1606725632"],[1756920934,"1603305472"],[1756920949,"1603305472"],[1756920964,"1603641344"],[1756920979,"1608609792"],[1756920994,"1609478144"],[175

In [None]:
metrics = final_state_json.get("final_prometheus_result")['data']["result"][0]["values"]

[[1756920754, '1599045632'],
 [1756920769, '1603207168'],
 [1756920784, '1603465216'],
 [1756920799, '1603825664'],
 [1756920814, '1603825664'],
 [1756920829, '1603710976'],
 [1756920844, '1604911104'],
 [1756920859, '1605726208'],
 [1756920874, '1605726208'],
 [1756920889, '1607254016'],
 [1756920904, '1606582272'],
 [1756920919, '1606725632'],
 [1756920934, '1603305472'],
 [1756920949, '1603305472'],
 [1756920964, '1603641344'],
 [1756920979, '1608609792'],
 [1756920994, '1609478144'],
 [1756921009, '1612660736'],
 [1756921024, '1612660736'],
 [1756921039, '1612660736'],
 [1756921054, '1610813440'],
 [1756921069, '1607344128'],
 [1756921084, '1605992448'],
 [1756921099, '1599381504'],
 [1756921114, '1594347520'],
 [1756921129, '1600790528'],
 [1756921144, '1601224704'],
 [1756921159, '1601998848'],
 [1756921174, '1602088960'],
 [1756921189, '1600659456'],
 [1756921204, '1600409600'],
 [1756921219, '1600409600'],
 [1756921234, '1602244608'],
 [1756921249, '1602412544'],
 [1756921264, 