In [1]:
import sys
sys.path.append("..")  # Add the project root to Python path

import requests
import json
import os
from typing import List, Dict, Any, Optional

from app.services.llm_interface import LLMInterface
from app.config.settings import LLM_MODEL, LLM_PROVIDER

# Base URL of the StackVM API, taken from the environment; the request helpers
# in this notebook prefix every endpoint with it, so fail fast when it is absent.
stackvm_host = os.environ.get("STACKVM_HOST")
assert stackvm_host is not None, "STACKVM_HOST environment variable is not set."

def get_task_branch_answer_detail(
    task_id: str,
    branch_name: Optional[str] = "main",
    timeout: int = 10,
) -> dict:
    """
    Retrieves the answer detail for a specific task and branch using the API.

    Args:
        task_id: The ID of the task.
        branch_name: The name of the branch. ``None`` falls back to "main".
        timeout: Timeout in seconds for the API request.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server answers with a 4xx/5xx status.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    # The annotation allows None; without this guard the URL would contain
    # the literal text "None" as the branch name.
    if branch_name is None:
        branch_name = "main"

    url = f"{stackvm_host}/api/tasks/{task_id}/branches/{branch_name}/answer_detail"
    try:
        # Always bound the wait: without a timeout a stalled server blocks the
        # notebook cell indefinitely.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error during request: {e}")
        raise  # bare raise keeps the original traceback intact
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON response: {e}")
        raise

def re_execute_task(
    task_id: str,
    plan: List[Dict[str, Any]],
    timeout: Optional[float] = None,
) -> dict:
    """
    Posts a plan to the task's ``re_execute`` endpoint and returns the reply.

    Args:
        task_id: The ID of the task to re-execute.
        plan: The plan sent to the API as the ``plan`` field of the JSON body.
        timeout: Optional timeout in seconds for the request. ``None`` (the
            default) waits indefinitely, matching the previous behaviour.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        ValueError: If the server answers with HTTP 400, 404 or 500.
        requests.exceptions.RequestException: For any other request failure,
            including errors raised before a response exists (connection
            refused, timeout, ...).
    """
    url = f"{stackvm_host}/api/tasks/{task_id}/re_execute"
    payload = {"plan": plan}

    try:
        response = requests.post(url, json=payload, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        # Read the response from the exception, never from the local variable:
        # when post() itself fails, `response` was never assigned.
        failed_response = getattr(e, "response", None)
        # Compare against None explicitly; a requests Response is falsy for
        # 4xx/5xx statuses, so a truthiness check would skip real errors.
        status_code = failed_response.status_code if failed_response is not None else None

        if status_code == 400:
            raise ValueError("Missing required parameters") from e
        if status_code == 404:
            raise ValueError(f"Task with ID {task_id} not found") from e
        if status_code == 500:
            raise ValueError("Failed to re-execute task") from e
        raise
    

# Shared LLM client built from the configured provider and model; the prompt
# helpers in this notebook send their prompts through its generate() method.
llm_client = LLMInterface(LLM_PROVIDER, LLM_MODEL)




In [2]:
def evaluation_task(goal, answer, plan):
    """
    Asks the LLM to judge whether `answer` resolves `goal` and to critique `plan`.

    Parameters:
    - goal: The goal the plan was meant to achieve; interpolated into the prompt.
    - answer: The final answer produced for the goal; interpolated into the prompt.
    - plan: The plan that produced the answer; interpolated into the prompt.

    Returns:
    - Whatever `llm_client.generate` returns for the evaluation prompt. The
      prompt requests a JSON object with the keys `accept`,
      `answer_quality_assessment_explaination` and `plan_adjustment_suggestion`.
    """
    # NOTE: this is an f-string, so every literal brace in the example output
    # must be doubled. A bare `{...}` is an expression and renders as "Ellipsis".
    # The misspelled key `answer_quality_assessment_explaination` is kept on
    # purpose: it is part of the output contract the prompt asks for.
    evaluation_prompt = f"""You are tasked with evaluating and improving the effectiveness of a problem-solving workflow. Below is a description of a Goal, a Plan used to address it, and the Final Answer generated. Your task is to evaluate the quality of the answer and diagnose whether the plan sufficiently aligned with the goal. If issues are present (e.g., the answer does not fully meet the goal or contains irrelevant information), you must:
1. Analyze the Plan to identify weaknesses or misalignments with the Goal.
2. Provide detailed suggestions to adjust or rewrite the Plan to improve the answer quality.

Your output must include:
1. Answer Quality Assessment: Clearly state whether the final answer resolves the goal. If not, explain why and identify any irrelevant or missing elements.
2. Plan Analysis: Examine the steps in the plan, identify where they failed or could be improved, and explain why adjustments are necessary.
3. Plan Adjustment Suggestions: Provide a revised or improved version of the plan to address the identified shortcomings.

Here are the inputs:

## Goal 
{goal}

## Answer
{answer}

## plan
{plan}

Your Output Format:
You must return a JSON object with the following keys:
- accept: Boolean value (true or false) indicating whether the final answer effectively resolves the goal.
- answer_quality_assessment_explaination: A detailed explanation justifying why the final answer does or does not meet the goal, highlighting key points or missing elements.
- plan_adjustment_suggestion: If answer is not accepted, please provide a comprehensive analysis of the plan and recommendations for how to adjust or improve it to better achieve the goal.

Example Output:
{{
  "accept": true/false,
  "answer_quality_assessment_explaination": "...",
  "plan_adjustment_suggestion": {{...}}
}}
"""

    return llm_client.generate(prompt=evaluation_prompt)


In [3]:
def generate_new_plan(goal, answer, plan, adjustment_suggestion):
    """
    Ask the LLM for a minimally revised plan that applies the evaluation feedback.

    Parameters:
    - goal (str): The goal the plan is meant to achieve.
    - answer (str): The answer the original plan produced.
    - plan (str): The original plan.
    - adjustment_suggestion (str): Feedback describing how the plan should change.

    Returns:
    - The value returned by `llm_client.generate` for the revision prompt.
    """
    # All four inputs are interpolated verbatim into their own prompt section.
    revision_prompt = f"""You are tasked with revising an existing plan to better achieve a specified goal based on evaluation feedback. The revisions should be minimal, only addressing the issues identified without overhauling the entire plan.

## Goal:
{goal}

## Original Plan:
{plan}

## Original Answer:
{answer}

## Evaluation Feedback:
{adjustment_suggestion}

## Requirements for the New Plan:
- Modify only the necessary parts of the original plan.
- Incorporate the suggestions from the evaluation feedback.
- Ensure the revised plan is coherent and aligned with the goal.
- **Information Retrieval Enhancement:** When performing information retrieval, use both knowledge graph search and vector search to ensure the richness of retrieved information. Note that knowledge graph search is a powerful retrieval function.
- **Selective Plan Modification:** If parts of the original answer meet the expected outcomes, identify and retain the corresponding information retrieval steps from the original plan. This approach ensures that only necessary modifications are made, preventing unpredictable performance fluctuations in the revised plan.

Now, let's update the plan.

**Output**:
1. Provide the complete updated plan in JSON format, ensuring it adheres to the VM specification.
2. Provide a summary of the changes made to the plan, including a diff with the previous plan.
"""

    return llm_client.generate(prompt=revision_prompt)

In [7]:
from datetime import datetime

def get_evaluation_pending_tasks(
    start_time: Optional[datetime] = None,
    end_time: Optional[datetime] = None,
    timeout: int = 10,
) -> List[Dict]:
    """
    Fetches the list of tasks pending evaluation from the API.

    Args:
        start_time (Optional[datetime]): The start time to filter tasks; sent
            as an ISO-8601 string when given.
        end_time (Optional[datetime]): The end time to filter tasks; sent as
            an ISO-8601 string when given.
        timeout (int): Timeout in seconds for the API request.

    Returns:
        List[Dict]: A list of tasks pending evaluation.

    Raises:
        requests.exceptions.RequestException: If the request fails.
        ValueError: If the response cannot be decoded or is not a JSON list.
    """
    endpoint = f"{stackvm_host}/api/tasks/evaluation_pending"

    # Only send the filters the caller actually supplied.
    params = {}
    if start_time is not None:
        params['start_time'] = start_time.isoformat()
    if end_time is not None:
        params['end_time'] = end_time.isoformat()

    try:
        # Bound the wait so a stalled server cannot hang the notebook cell.
        response = requests.get(endpoint, params=params, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4XX or 5XX)
        data = response.json()

        if not isinstance(data, list):
            raise ValueError("Unexpected response format: Expected a list of tasks.")

        return data
    except requests.exceptions.RequestException as e:
        # Handle network-related errors
        print(f"An error occurred while making the request: {e}")
        raise
    except ValueError as ve:
        # Handle JSON decoding errors or unexpected data formats
        print(f"An error occurred while processing the response: {ve}")
        raise

def record_evaluation(
    task_id: str,
    evaluation_status: str,
    evaluation_reason: Optional[str] = "",
    timeout: int = 10
) -> Dict:
    """
    Posts an evaluation verdict for one task to the API and returns the reply.

    Args:
        task_id (str): The ID of the task being evaluated.
        evaluation_status (str): The verdict to store (e.g., "APPROVED", "REJECTED").
        evaluation_reason (Optional[str]): Free-text justification for the verdict.
        timeout (int): Timeout in seconds for the API request.

    Returns:
        Dict: The decoded JSON object returned by the API.

    Raises:
        requests.exceptions.RequestException: If the request fails.
        ValueError: If the reply is not a JSON object, or its "success" field
            is missing or falsy.
    """
    endpoint = f"{stackvm_host}/api/tasks/{task_id}/evaluation"

    try:
        reply = requests.post(
            endpoint,
            json={
                "evaluation_status": evaluation_status,
                "evaluation_reason": evaluation_reason,
            },
            headers={"Content-Type": "application/json"},
            timeout=timeout,
        )
        reply.raise_for_status()
        body = reply.json()

        if not isinstance(body, dict):
            raise ValueError("Unexpected response format: Expected a JSON object.")

        # Happy path first: a truthy "success" flag means the verdict was stored.
        if body.get("success", False):
            return body

        reported_error = body.get("error", "Unknown error occurred.")
        raise ValueError(f"API Error: {reported_error}")

    except requests.exceptions.RequestException as request_error:
        print(f"An error occurred while making the request: {request_error}")
        raise
    except ValueError as value_error:
        print(f"An error occurred while processing the response: {value_error}")
        raise

In [6]:
import json

from app.utils.json import extract_json

# Walk the pending tasks, evaluate the first one whose main branch holds a
# completed goal with a plan and a final answer, and record the verdict.
pending_tasks = get_evaluation_pending_tasks()

for task in pending_tasks:
    task_id = task["id"]
    print(f"Start to evaluate plan for task(id={task_id},branch=main)")
    detail = get_task_branch_answer_detail(task_id)

    state = detail.get('vm_state')

    # Reset the per-task fields so values from a previous iteration never leak.
    goal, plan, final_answer = None, None, None
    goal_completed = False

    if state is None:
        continue

    plan = state.get("current_plan")
    goal_completed = state.get("goal_completed", False)
    goal = state.get("goal")
    if state.get("variables") is not None:
        final_answer = state['variables'].get("final_answer")

    # Skip tasks that are unfinished or lack any input the evaluator needs.
    if goal is None or goal_completed is not True or plan is None or final_answer is None:
        continue

    response = evaluation_task(goal, final_answer, plan)
    eval_res_str = extract_json(response)
    eval_res = json.loads(eval_res_str)

    if eval_res.get("accept", False):
        eval_status = "APPROVED"
    else:
        eval_status = "REJECTED"
    eval_reason = json.dumps(eval_res)
    print(eval_res)

    record_evaluation(task_id, eval_status, eval_reason)
    # NOTE(review): the loop stops after the first evaluated task; confirm
    # this is intentional and not a leftover from debugging.
    break

Start to evaluate plan for task(id=0340dd64-41c2-452a-a6ae-587f698a3126,branch=main)
{'accept': False, 'answer_quality_assessment_explaination': "The final answer does not effectively resolve the goal. The goal was to answer the question 'Who are you?' with a clear self-identification of the AI assistant. However, the provided answer is a generic response that does not address the question directly. It lacks specific information about the identity and capabilities of the AI assistant, which were intended to be included according to the plan.", 'plan_adjustment_suggestion': {'Plan Analysis': "The plan intended to use the LLM to generate a response that includes the identity and capabilities of the AI assistant. However, the final answer did not reflect this intention. The plan's execution failed to ensure that the generated response was aligned with the goal. The prompt used in the plan was appropriate, but the final step did not correctly assign the generated content to the final answe