In [1]:
import sys
sys.path.append("..")  # Add the project root to Python path

import requests
from datetime import datetime, timedelta
from typing import List, Dict, Optional

from notebooks.plan_optimization_chat import get_task_answer, update_plan, execute_task_using_new_plan, stackvm_host
from app.core.plan.evaluator import evaulate_answer

def get_evaluation_pending_tasks(
    start_time: Optional[datetime] = None,
    end_time: Optional[datetime] = None,
    evaluation_statuses: Optional[List[str]] = None,
    timeout: int = 60,
) -> List[Dict]:
    """
    Fetches the list of tasks pending evaluation from the API.

    Args:
        start_time (Optional[datetime]): Lower time bound; sent as an ISO-8601 string when given.
        end_time (Optional[datetime]): Upper time bound; sent as an ISO-8601 string when given.
        evaluation_statuses (Optional[List[str]]): Evaluation statuses to filter by.
            Sent as one comma-separated value. Defaults to 'NOT_EVALUATED' when
            omitted or empty.
        timeout (int): Timeout in seconds for the API request.

    Returns:
        List[Dict]: The decoded JSON list returned by the endpoint.

    Raises:
        requests.exceptions.RequestException: If the request fails or the server
            answers with a 4XX/5XX status.
        ValueError: If the response cannot be decoded or is not a list.
    """
    endpoint = f"{stackvm_host}/api/tasks/evaluation"
    params = {}

    if start_time:
        params['start_time'] = start_time.isoformat()
    if end_time:
        params['end_time'] = end_time.isoformat()
    # Join multiple statuses with commas; fall back to NOT_EVALUATED when none are given
    params['evaluation_status'] = (
        ','.join(evaluation_statuses) if evaluation_statuses else 'NOT_EVALUATED'
    )

    try:
        # Without a timeout a stalled server would block this call indefinitely
        response = requests.get(endpoint, params=params, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4XX or 5XX)
        data = response.json()

        if not isinstance(data, list):
            raise ValueError("Unexpected response format: Expected a list of tasks.")

        return data
    except requests.exceptions.RequestException as e:
        # Handle network-related errors
        print(f"An error occurred while making the request: {e}")
        raise
    except ValueError as ve:
        # Handle JSON decoding errors or unexpected data formats
        print(f"An error occurred while processing the response: {ve}")
        raise

def _post_evaluation(endpoint: str, payload: Dict, timeout: int) -> Dict:
    """
    POSTs an evaluation payload as JSON and returns the validated response body.

    Shared by record_evaluation and record_human_evaluation, which differ only
    in the endpoint they call and the payload they send.

    Args:
        endpoint (str): Full URL to POST to.
        payload (Dict): JSON-serializable request body.
        timeout (int): Timeout in seconds for the API request.

    Returns:
        Dict: The decoded JSON object returned by the API.

    Raises:
        requests.exceptions.RequestException: If the request fails.
        ValueError: If the response is not a JSON object or does not report success.
    """
    headers = {
        "Content-Type": "application/json"
    }

    try:
        response = requests.post(endpoint, json=payload, headers=headers, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        if not isinstance(data, dict):
            raise ValueError("Unexpected response format: Expected a JSON object.")

        # A body without a truthy "success" field is treated as an API-level failure
        if not data.get("success", False):
            error_message = data.get("error", "Unknown error occurred.")
            raise ValueError(f"API Error: {error_message}")

        return data

    except requests.exceptions.RequestException as e:
        print(f"An error occurred while making the request: {e}")
        raise
    except ValueError as ve:
        print(f"An error occurred while processing the response: {ve}")
        raise


def record_evaluation(
    task_id: str,
    evaluation_status: str,
    evaluation_reason: Optional[str] = "",
    timeout: int = 60
) -> Dict:
    """
    Records the evaluation result of a specific task by calling the API endpoint.

    Args:
        task_id (str): The ID of the task to be evaluated.
        evaluation_status (str): The evaluation status (e.g., "APPROVED", "REJECTED").
        evaluation_reason (Optional[str]): The reason for the evaluation decision.
        timeout (int): Timeout in seconds for the API request.

    Returns:
        Dict: The JSON response from the API indicating success.

    Raises:
        requests.exceptions.RequestException: If the request fails.
        ValueError: If the response cannot be decoded or contains an error.
    """
    endpoint = f"{stackvm_host}/api/tasks/{task_id}/evaluation"
    payload = {
        "evaluation_status": evaluation_status,
        "evaluation_reason": evaluation_reason
    }
    return _post_evaluation(endpoint, payload, timeout)


def record_human_evaluation(
    task_id: str,
    evaluation_status: str,
    feedback: Optional[str] = "",
    timeout: int = 60
) -> Dict:
    """
    Records the human-evaluation state of a specific task by calling the API endpoint.

    Args:
        task_id (str): The ID of the task to be evaluated.
        evaluation_status (str): The human-evaluation status to store.
        feedback (Optional[str]): Free-text feedback stored with the status.
        timeout (int): Timeout in seconds for the API request.

    Returns:
        Dict: The JSON response from the API indicating success.

    Raises:
        requests.exceptions.RequestException: If the request fails.
        ValueError: If the response cannot be decoded or contains an error.
    """
    endpoint = f"{stackvm_host}/api/tasks/{task_id}/human_evaluation"
    payload = {
        "evaluation_status": evaluation_status,
        "feedback": feedback
    }
    return _post_evaluation(endpoint, payload, timeout)




In [6]:
import json
from app.utils.json import extract_json
from app.llm.interface import LLMInterface
from app.config.settings import REASON_LLM_PROVIDER, REASON_LLM_MODEL

# LLM client that optimize_plan hands to evaulate_answer(); the provider and
# model come from the REASON_LLM_* settings imported above.
eval_llm = LLMInterface(REASON_LLM_PROVIDER, REASON_LLM_MODEL)

def optimize_plan(task_id:str, branch_name:Optional[str]="main", max_iteration=2):
    """
    Evaluates a task's answer and, while it is rejected, revises and re-runs its plan.

    Each round fetches the task detail for the current branch, asks the LLM
    evaluator to judge the final answer, and records the verdict. A rejected
    answer triggers a plan revision that is executed on the task; the branch
    returned by that execution is evaluated in the next round. If the answer is
    never accepted, or any step fails, the task is handed over to human
    evaluation with the failure reason as feedback.

    Args:
        task_id (str): The ID of the task to evaluate.
        branch_name (Optional[str]): The branch whose answer is evaluated first.
        max_iteration (int): Maximum number of plan revisions to attempt.

    Returns:
        None. Returns as soon as an answer is accepted; otherwise returns after
        the human-evaluation record has been written.

    Raises:
        requests.exceptions.RequestException, ValueError: Propagated from
            record_evaluation / record_human_evaluation when recording fails.
    """
    current_branch_name = branch_name
    error_message = None
    iteration_round = 0

    while True:
        print(f"Start to evaluate plan for task(id={task_id},branch={current_branch_name})")
        detail = get_task_answer(task_id, current_branch_name)

        if detail is None:
            # Nothing changes between iterations here, so retrying would spin forever
            error_message = f"Failed to get task answer for branch {current_branch_name}"
            break

        goal = detail.get("goal")
        final_answer = detail.get("final_answer")
        plan = detail.get("plan")
        metadata = detail.get("metadata")

        eval_res = evaulate_answer(eval_llm, goal, metadata, final_answer, plan)
        # One strict check drives both the recorded status and the control flow,
        # so a truthy non-boolean "accept" can no longer be recorded as APPROVED
        # while the loop still treats the answer as rejected.
        accepted = eval_res.get("accept", False) is True
        eval_status = "APPROVED" if accepted else "REJECTED"
        eval_reason = json.dumps(eval_res, indent=2)

        record_evaluation(task_id, eval_status, eval_reason)

        if accepted:
            print(f"Goal Pass! {goal}, evaluation result:{eval_reason}")
            return

        print(f"Goal Not Pass! {goal}, the evaluation result:{eval_reason}")

        if iteration_round >= max_iteration:
            break

        revised_plan = update_plan(goal, metadata, plan, eval_reason)
        print("revised plan:", revised_plan)

        try:
            updated_result = execute_task_using_new_plan(task_id, revised_plan)
            print(f"Revised plan execution result {updated_result}")
        except Exception as e:
            error_message = f"Failed to execute task using new plan {e}"
            break

        # A None/empty result is handled like a result with no branch or answer
        execution_result = updated_result or {}
        current_branch_name = execution_result.get("branch_name", None)
        current_final_answer = execution_result.get("final_answer", None)
        if current_branch_name is None or current_final_answer is None:
            error_message = "Failed to execut task using new plan, get empty answer"
            break

        iteration_round += 1

    if error_message is None:
        # Reached only when the revision budget is exhausted without acceptance
        error_message = f"Still failed after {iteration_round} plan revision round(s)."
    print(f"Failed to evaluate plan for task(id={task_id}): {error_message}")
    # NOTE(review): the status string is sent exactly as spelled here; confirm
    # against the server's accepted values before changing the spelling.
    record_human_evaluation(task_id, "WAITTING_FOR_EVALUATION", error_message)


In [7]:
from app.core.labels.classifier import LabelClassifier

# optimize_plan("d89cab55-9930-41bb-b3ee-2de93d99246b", "main")

# NOTE(review): classifier is not referenced again in the visible cells —
# confirm whether it is still needed.
classifier = LabelClassifier()

# Look-back window: the two hours before the current (naive) UTC time
end_time = datetime.utcnow()
start_time = end_time - timedelta(hours=2)

# Only start_time is passed: no upper bound is sent (end_time is used solely to
# derive start_time) and the status filter falls back to the function's
# default, NOT_EVALUATED.
pending_tasks = get_evaluation_pending_tasks(
    start_time=start_time
)

# Evaluate the "main" branch of each pending task, one after another
for task in pending_tasks:
    task_id = task["id"]
    optimize_plan(task_id, "main")

Start to evaluate plan for task(id=e182ae0f-7ec3-4202-882d-d76af8338824,branch=main)


2025-02-07 17:38:14,406 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Goal Not Pass! Does TiDB Support CRDB?, the evaluation result:{
  "accept": false,
  "answer_quality_assessment_explanation": "The final answer does not effectively resolve the user's goal. The user asked whether TiDB supports CRDB, which likely refers to CockroachDB. The final answer states that there is no clear evidence of support based on the information and document search results, but it does not provide a definitive answer or any specific findings from the knowledge graph or vector search. The answer suggests checking official documentation or contacting support, which indicates a lack of confidence in the provided answer. This suggests that the Plan did not successfully retrieve or synthesize relevant information to provide a clear and direct answer.",
  "plan_adjustment_suggestion": "To improve the Plan, the following adjustments are suggested:\n1. **Clarify the Goal**: Ensure that the term 'CRDB' is clearly understood as referring to CockroachDB. This should be explicitly con

2025-02-07 17:38:45,015 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


revised plan: [{'seq_no': 0, 'type': 'reasoning', 'parameters': {'chain_of_thoughts': "To determine if TiDB supports CRDB, we will follow a structured approach:\n\n1. **Clarify the Goal**:\n   - Confirm that 'CRDB' refers to CockroachDB, a distributed SQL database.\n\n2. **Overall Strategy**:\n   - Step 1: Use the knowledge graph to gather information about TiDB's compatibility and support for CockroachDB.\n   - Step 2: Use vector search to find detailed documentation or discussions about TiDB's support for CockroachDB.\n   - Step 3: Synthesize the information gathered to provide a clear answer.\n\n3. **Key Decision Points**:\n   - Using both knowledge graph and vector search ensures we cover both structured data and detailed documentation.\n\n4. **Assumptions**:\n   - The term 'CRDB' is confirmed to refer to CockroachDB.\n\n5. **Compliance Checks**:\n   - ✓ No user-specific queries planned.\n   - ✓ All responses will maintain consistent language (Chinese).\n   - ✓ Final answer will be

2025-02-07 17:39:15,428 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Goal Pass! Does TiDB Support CRDB?, evaluation result:{
  "accept": true,
  "answer_quality_assessment_explanation": "The Final Answer effectively resolves the Goal by clearly stating that there is no direct support for CockroachDB (CRDB) in TiDB, based on the information retrieved. The Plan demonstrates a sufficient understanding of the user's question by confirming the meaning of 'CRDB' as CockroachDB and using a structured approach to gather information. The Plan includes steps to retrieve data from a knowledge graph and perform a vector search, ensuring comprehensive coverage of available information. The synthesis of this information into a clear answer aligns with the user's query, and the response is provided in Chinese, as specified in the supplementary goal information.",
  "plan_adjustment_suggestion": null,
  "goal_classification": "Direct Problem Resolution"
}
