In [1]:
import sys
sys.path.append("..")  # Add the project root to Python path

import json
import os
from typing import Any, Dict, List, Optional

import requests

from app.services.llm_interface import LLMInterface
from app.config.settings import LLM_MODEL, LLM_PROVIDER

# Host prefix for every StackVM API URL built in this notebook; it must be
# supplied through the environment before the helpers below can be used.
stackvm_host = os.environ.get("STACKVM_HOST")
assert stackvm_host is not None, "STACKVM_HOST environment variable is not set."

def get_task_branch_answer_detail(
    task_id: str, branch_name: str, timeout: Optional[float] = None
) -> dict:
    """
    Retrieves the answer detail for a specific task and branch using the API.

    The request goes to the host held in the module-level ``stackvm_host``
    variable.

    Args:
        task_id: The ID of the task.
        branch_name: The name of the branch.
        timeout: Seconds to wait for the server before giving up. The default
            (None) waits indefinitely, which is what this helper did before
            the parameter existed.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server answers with a 4xx/5xx status. The error is printed and
            then re-raised.
        json.JSONDecodeError: If the response body is not valid JSON. The
            error is printed and then re-raised.
    """
    url = f"{stackvm_host}/api/tasks/{task_id}/branches/{branch_name}/answer_detail"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error during request: {e}")
        raise  # bare raise re-raises the active exception unchanged
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON response: {e}")
        raise

import requests
from typing import Optional

def re_execute_task(
    task_id: str,
    plan: List[Dict[str, Any]],
    timeout: Optional[float] = None,
) -> dict:
    """
    Updates a task with a new plan and sets the task to be re-run from scratch.

    The plan is POSTed as ``{"plan": plan}`` to the task's ``re_execute``
    endpoint on the host held in the module-level ``stackvm_host`` variable.

    Args:
        task_id: The ID of the task to re-execute.
        plan: The plan to send, as a list of step dictionaries.
        timeout: Seconds to wait for the server before giving up. The default
            (None) waits indefinitely, which is what this helper did before
            the parameter existed.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        ValueError: If the server answers with status 400, 404 or 500.
        requests.exceptions.RequestException: For every other request failure,
            including failures where no response was received at all.
    """
    url = f"{stackvm_host}/api/tasks/{task_id}/re_execute"
    payload = {"plan": plan}

    try:
        response = requests.post(url, json=payload, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        # Read the status from the response attached to the exception rather
        # than from a local variable: when the POST itself fails (connection
        # refused, timeout, ...) no local response exists, and touching it
        # would raise UnboundLocalError and hide the real error.
        failed_response = getattr(e, "response", None)
        status_code = getattr(failed_response, "status_code", None)

        if status_code == 400:
            raise ValueError("Missing required parameters") from e
        if status_code == 404:
            raise ValueError(f"Task with ID {task_id} not found") from e
        if status_code == 500:
            raise ValueError("Failed to re-execute task") from e
        raise
    

# Shared LLM client, constructed from the provider and model imported from
# app.config.settings; evaluation_task and generate_new_plan send their
# prompts through its generate() method.
llm_client = LLMInterface(LLM_PROVIDER, LLM_MODEL)




In [2]:
def evaluation_task(goal, answer, plan):
    """
    Asks the LLM to judge whether an answer resolves its goal and to suggest plan fixes.

    The goal, answer and plan are interpolated into a fixed evaluation prompt
    that requests a JSON object with the keys ``accept``,
    ``answer_quality_assessment_explaination`` and ``plan_adjustment_suggestion``.

    Parameters:
    - goal: The goal the plan was meant to achieve.
    - answer: The final answer that was produced.
    - plan: The plan that produced the answer.

    Returns:
    - Whatever ``llm_client.generate`` returns for the prompt; the response is
      not parsed here.
    """
    # Literal braces in the example output are doubled: inside an f-string a
    # bare `{...}` is evaluated as the Ellipsis object and would be rendered
    # into the prompt as the word "Ellipsis".
    evaluation_prompt = f"""You are tasked with evaluating and improving the effectiveness of a problem-solving workflow. Below is a description of a Goal, a Plan used to address it, and the Final Answer generated. Your task is to evaluate the quality of the answer and diagnose whether the plan sufficiently aligned with the goal. If issues are present (e.g., the answer does not fully meet the goal or contains irrelevant information), you must:
1. Analyze the Plan to identify weaknesses or misalignments with the Goal.
2. Provide detailed suggestions to adjust or rewrite the Plan to improve the answer quality.

Your output must include:
1. Answer Quality Assessment: Clearly state whether the final answer resolves the goal. If not, explain why and identify any irrelevant or missing elements.
2. Plan Analysis: Examine the steps in the plan, identify where they failed or could be improved, and explain why adjustments are necessary.
3. Plan Adjustment Suggestions: Provide a revised or improved version of the plan to address the identified shortcomings.

Here are the inputs:

## Goal 
{goal}

## Answer
{answer}

## plan
{plan}

Your Output Format:
You must return a JSON object with the following keys:
- accept: Boolean value (true or false) indicating whether the final answer effectively resolves the goal.
- answer_quality_assessment_explaination: A detailed explanation justifying why the final answer does or does not meet the goal, highlighting key points or missing elements.
- plan_adjustment_suggestion: If answer is not accepted, please provide a comprehensive analysis of the plan and recommendations for how to adjust or improve it to better achieve the goal.

Example Output:
{{
  "accept": False/True,
  "answer_quality_assessment_explaination": "...",
  "plan_adjustment_suggestion": {{...}}
}}
"""

    return llm_client.generate(prompt=evaluation_prompt)


In [3]:
def generate_new_plan(goal, answer, plan, adjustment_suggestion):
    """
    Asks the LLM for a minimally revised plan that applies the evaluation feedback.

    All four arguments are interpolated as-is into a fixed revision prompt,
    which is then sent through the shared ``llm_client``.

    Parameters:
    - goal: The original goal.
    - answer: The final answer the original plan produced.
    - plan: The original plan used to achieve the goal.
    - adjustment_suggestion: The plan adjustment suggestions from the evaluation.

    Returns:
    - The response of ``llm_client.generate`` for the revision prompt, returned
      unparsed.
    """
    revision_prompt = f"""You are tasked with revising an existing plan to better achieve a specified goal based on evaluation feedback. The revisions should be minimal, only addressing the issues identified without overhauling the entire plan.

## Goal:
{goal}

## Original Plan:
{plan}

## Original Answer:
{answer}

## Evaluation Feedback:
{adjustment_suggestion}

## Requirements for the New Plan:
- Modify only the necessary parts of the original plan.
- Incorporate the suggestions from the evaluation feedback.
- Ensure the revised plan is coherent and aligned with the goal.
- **Information Retrieval Enhancement:** When performing information retrieval, use both knowledge graph search and vector search to ensure the richness of retrieved information. Note that knowledge graph search is a powerful retrieval function.
- **Selective Plan Modification:** If parts of the original answer meet the expected outcomes, identify and retain the corresponding information retrieval steps from the original plan. This approach ensures that only necessary modifications are made, preventing unpredictable performance fluctuations in the revised plan.

Now, let's update the plan.

**Output**:
1. Provide the complete updated plan in JSON format, ensuring it adheres to the VM specification.
2. Provide a summary of the changes made to the plan, including a diff with the previous plan.
"""

    # Hand the prompt to the LLM and pass its reply straight back to the caller.
    return llm_client.generate(prompt=revision_prompt)

In [4]:
import json

from app.utils.json import extract_json

# Task and branch under evaluation. Other task ids that were tried:
#   'c3382869-e2b2-4244-b971-d00a14701681'
#   'fe605c06-1fc5-47d8-a728-25f1a025befd'
task_id = '5575fe37-6b39-493a-929a-42112416a86f'
branch_name = 'main'

print(f"Start to evaluate plan for task(id={task_id},branch={branch_name})")

detail = get_task_branch_answer_detail(task_id, branch_name)
state = detail.get('vm_state')

# Defaults; they are only overwritten when the VM state supplies a value.
goal = plan = final_answer = updated_plan = None
goal_completed = False

if state is not None:
    goal = state.get("goal")
    plan = state.get("current_plan")
    goal_completed = state.get("goal_completed", False)
    if state.get("variables") is not None:
        final_answer = state["variables"].get("final_answer")

    # Evaluate only a finished run that has a goal, a plan and a final answer.
    if goal_completed is True and all(
        item is not None for item in (goal, plan, final_answer)
    ):
        response = evaluation_task(goal, final_answer, plan)
        eval_res_str = extract_json(response)
        eval_res = json.loads(eval_res_str)
        accept = eval_res.get("accept")

        if accept is True:
            print("The final answer is accepted")
        else:
            explanation = eval_res.get("answer_quality_assessment_explaination")
            suggestion = eval_res.get("plan_adjustment_suggestion")

            if suggestion is None:
                print("No suggestion found")
            else:
                print(f"Update plan based on the suggestion: {suggestion}")
                print(f"Explanation: {explanation}")
                # Ask the LLM for a revised plan, then parse the JSON part of its reply.
                updated_plan_response = generate_new_plan(goal, final_answer, plan, suggestion)
                print(f"Updated plan: {updated_plan_response}")
                updated_plan_str = extract_json(updated_plan_response)
                updated_plan = json.loads(updated_plan_str)

Start to evaluate plan for task(id=5575fe37-6b39-493a-929a-42112416a86f,branch=main)
Update plan based on the suggestion: {'Plan Analysis': 'The plan has several weaknesses. First, it does not clearly separate the tasks of configuring the `region-split-size` and verifying its persistence. The inclusion of unrelated configuration examples suggests a lack of focus in the information retrieval and generation steps. Additionally, the plan does not ensure that the generated content is concise and directly relevant to the goal.', 'Plan Adjustment Suggestions': ['1. Refine the initial reasoning step to emphasize the specific goal of ensuring `region-split-size` persistence, avoiding unrelated configuration examples.', '2. In the information retrieval steps (seq_no 1 and 2), focus queries specifically on `region-split-size` persistence, rather than general configuration persistence.', '3. In the LLM generation step (seq_no 3), ensure the prompt explicitly requests steps for configuring and ver

In [None]:
# `updated_plan` keeps its None default when the answer was accepted or no
# suggestion was produced; re-executing then would POST a request without a
# plan, so the call is skipped in that case.
if updated_plan is not None:
    updated_res = re_execute_task(task_id, updated_plan)
else:
    updated_res = None
    print("No updated plan available; skipping re-execution")
updated_res