In [1]:
import sys
sys.path.append("..")  # Add the project root to Python path

import requests
import json
import os

from app.services.llm_interface import LLMInterface
from app.config.settings import LLM_MODEL, LLM_PROVIDER

# Host prefix used to build StackVM API URLs. It must come from the environment;
# the notebook stops here when the variable is missing.
stackvm_host = os.environ.get("STACKVM_HOST")
assert stackvm_host is not None, "STACKVM_HOST environment variable is not set."

def get_task_branch_answer_detail(task_id: str, branch_name: str, timeout: float = 30) -> "dict | None":
    """
    Retrieves the answer detail for a specific task and branch using the API.

    The request is sent to the host stored in the module-level ``stackvm_host``.
    Failures are reported with ``print`` and turned into a ``None`` return
    instead of being raised, so callers must check the result before using it.

    Args:
        task_id: The ID of the task.
        branch_name: The name of the branch.
        timeout: Seconds to wait for the server before the request is abandoned.

    Returns:
        The decoded JSON body of the response, or None if the request failed,
        timed out, returned a 4xx/5xx status, or the body was not valid JSON.
    """
    url = f"{stackvm_host}/api/tasks/{task_id}/branches/{branch_name}/answer_detail"
    try:
        # requests applies no timeout by default; without one a stalled server
        # would block the notebook cell indefinitely.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error during request: {e}")
        return None
    except ValueError as e:
        # json.JSONDecodeError is a ValueError; catching the base class also covers
        # decode errors that are not instances of the stdlib exception.
        print(f"Error decoding JSON response: {e}")
        return None
    

# Shared LLM client, configured with the provider and model imported from
# app.config.settings; evaluation_task sends its prompt through this object.
llm_client = LLMInterface(LLM_PROVIDER, LLM_MODEL)




In [11]:
def evaluation_task(goal, answer, plan):
    """
    Ask the LLM to judge whether an answer resolves a goal and to critique the plan.

    The three arguments are interpolated verbatim into a fixed prompt that
    requests a JSON object with the keys ``accept``,
    ``answer_quality_assessment_explaination`` and ``plan_adjustment_suggestion``.
    The misspelled key name is kept as-is because it is part of the response
    schema the prompt establishes.

    Args:
        goal: The goal the workflow was supposed to achieve.
        answer: The final answer produced by the workflow.
        plan: The plan that was executed to produce the answer.

    Returns:
        Whatever ``llm_client.generate`` returns for the prompt. The response is
        not parsed here; the caller is expected to extract the JSON object.
    """
    # Literal braces in the example output are doubled ({{ / }}) because this is an
    # f-string; a bare {...} would be evaluated and rendered as the text "Ellipsis".
    evaluation_prompt = f"""You are tasked with evaluating and improving the effectiveness of a problem-solving workflow. Below is a description of a Goal, a Plan used to address it, and the Final Answer generated. Your task is to evaluate the quality of the answer and diagnose whether the plan sufficiently aligned with the goal. If issues are present (e.g., the answer does not fully meet the goal or contains irrelevant information), you must:
1. Analyze the Plan to identify weaknesses or misalignments with the Goal.
2. Provide detailed suggestions to adjust or rewrite the Plan to improve the answer quality.

Your output must include:
1. Answer Quality Assessment: Clearly state whether the final answer resolves the goal. If not, explain why and identify any irrelevant or missing elements.
2. Plan Analysis: Examine the steps in the plan, identify where they failed or could be improved, and explain why adjustments are necessary.
3. Plan Adjustment Suggestions: Provide a revised or improved version of the plan to address the identified shortcomings.

Here are the inputs:

## Goal 
{goal}

## Answer
{answer}

## plan
{plan}

Your Output Format:
You must return a JSON object with the following keys:
- accept: Boolean value (true or false) indicating whether the final answer effectively resolves the goal.
- answer_quality_assessment_explaination: A detailed explanation justifying why the final answer does or does not meet the goal, highlighting key points or missing elements.
- plan_adjustment_suggestion: If answer is not accepted, please provide a comprehensive analysis of the plan and recommendations for how to adjust or improve it to better achieve the goal.

Example Output:
{{
  "accept": true/false,
  "answer_quality_assessment_explaination": "...",
  "plan_adjustment_suggestion": {{...}}
}}
"""

    return llm_client.generate(prompt=evaluation_prompt)


In [15]:
import json

from app.utils.json import extract_json


# Evaluate the plan and final answer recorded for one task branch.
# task_id = 'c3382869-e2b2-4244-b971-d00a14701681'
task_id = 'fe605c06-1fc5-47d8-a728-25f1a025befd'
branch_name = 'main'

print(f"Start to evaluate plan for task(id={task_id},branch={branch_name})")

# Bind every name that later cells read up front, so a skipped evaluation
# leaves None behind instead of an undefined variable.
goal_completed = False
final_answer = None
plan = None
goal = None
eval_res = None

detail = get_task_branch_answer_detail(task_id, branch_name)
# The helper returns None when the request or JSON decoding failed.
state = detail.get('vm_state') if detail is not None else None

if state is not None:
    plan = state.get("current_plan", None)
    goal_completed = state.get("goal_completed", False)
    goal = state.get("goal", None)
    variables = state.get("variables", None)
    if variables is not None:
        final_answer = variables.get("final_answer", None)

    # Only a finished task with a goal, a plan and a final answer can be evaluated.
    if goal is not None and goal_completed is True and plan is not None and final_answer is not None:
        response = evaluation_task(goal, final_answer, plan)
        eval_res_str = extract_json(response)
        eval_res = json.loads(eval_res_str)
        accept = eval_res.get("accept", None)
        if accept is not None and not accept:
            # TODO(review): the original cell stopped at this branch with no body;
            # for now the rejection feedback is only reported.
            print("Answer was not accepted.")
            print(eval_res.get("answer_quality_assessment_explaination"))
            print("Suggested plan adjustment:")
            print(json.dumps(eval_res.get("plan_adjustment_suggestion"), indent=2, ensure_ascii=False))




Start to evaluate plan for task(id=fe605c06-1fc5-47d8-a728-25f1a025befd,branch=main)


In [16]:
# Show the parsed evaluation result produced by the evaluation cell.
eval_res

{'accept': True,
 'answer_quality_assessment_explaination': 'The final answer effectively resolves the goal by providing a comprehensive list of potential reasons why TiDB Drainer might fail to start, along with corresponding solutions for each identified issue. The answer is well-structured and covers a range of possible causes, including configuration issues, resource limitations, network problems, and version compatibility. It also provides actionable solutions, such as using specific tools and checking configurations, which are relevant and practical for addressing the startup issues. There are no irrelevant elements, and the answer is complete in addressing the goal.',
 'plan_adjustment_suggestion': None}