In [1]:
import json
from typing import Optional

from app.core.plan.evaluator import evaulate_answer
from app.llm.interface import LLMInterface
from app.config.settings import EVALUATION_LLM_PROVIDER, EVALUATION_LLM_MODEL

from notebooks.plan_chat_optimizer import get_task_answer, update_plan, execute_task_using_new_plan
from notebooks.tasks import get_evaluation_pending_tasks, record_evaluation

# Shared LLM client handed to the evaluator in optimize_plan; the provider and
# model are taken from the evaluation settings imported above.
eval_llm = LLMInterface(EVALUATION_LLM_PROVIDER, EVALUATION_LLM_MODEL)

def optimize_plan(task_id:str, branch_name:Optional[str]="main", max_iteration=3):
    """Evaluate a task's answer and, while it is rejected, revise and re-run its plan.

    Each round loads the task detail for the current branch, asks the
    evaluation LLM to judge the final answer and records the verdict with
    ``record_evaluation``. If the answer is rejected and rounds remain, the
    plan is revised via ``update_plan`` and re-executed via
    ``execute_task_using_new_plan``; the next round then evaluates the branch
    returned by that execution.

    Args:
        task_id: Identifier of the task to evaluate.
        branch_name: Branch whose answer is evaluated in the first round.
        max_iteration: Maximum number of evaluation rounds. The round limit is
            checked after an evaluation, so the first round runs even for
            values below 1.

    Returns:
        None. Outcomes are reported through ``print`` and ``record_evaluation``.
    """
    current_branch_name = branch_name
    error_message = None
    iteration_round = 0

    while True:
        iteration_round += 1
        print(f"Start to evaluate plan for task(id={task_id},branch={current_branch_name})")
        detail = get_task_answer(task_id, current_branch_name)

        if detail is None:
            # The round limit is only checked after an evaluation, so a missing
            # detail must end the loop here or it would spin forever.
            error_message = f"No task detail found for branch {current_branch_name}"
            break

        goal = detail.get("goal")
        final_answer = detail.get("final_answer")
        plan = detail.get("plan")
        metadata = detail.get("metadata")

        if plan is None:
            record_evaluation(task_id, "REJECTED", "No plan found")
            return

        eval_res = evaulate_answer(eval_llm, goal, metadata, final_answer, plan)
        # Decide acceptance once and strictly: a truthy non-boolean such as the
        # string "false" must not be recorded as APPROVED.
        accepted = eval_res.get("accept", False) is True
        eval_status = "APPROVED" if accepted else "WAITING_FOR_EVALUATION"
        eval_reason = json.dumps(eval_res, indent=2)

        record_evaluation(task_id, eval_status, eval_reason)

        if accepted:
            print(f"Goal Pass! {goal}, evaluation result:{eval_reason}")
            return

        print(f"Goal Not Pass! {goal}, the evaluation result:{eval_reason}")

        if iteration_round >= max_iteration:
            break

        revised_plan = update_plan(goal, metadata, eval_reason, plan)
        print("revised plan:", revised_plan)
        reasoning = revised_plan.get("reasoning", None)
        revised_plan = revised_plan.get("plan", None)

        try:
            updated_result = execute_task_using_new_plan(task_id, revised_plan, reasoning)
            print(f"Revised plan execution result {updated_result}")
        except Exception as e:
            error_message = f"Failed to execute task using new plan {e}"
            break

        # Treat a missing result like an empty one so it is reported through
        # the error path below instead of raising AttributeError.
        if updated_result is None:
            updated_result = {}

        current_branch_name = updated_result.get("branch_name", None)
        current_final_answer = updated_result.get("final_answer", None)
        if current_branch_name is None or current_final_answer is None:
            error_message = "Failed to execut task using new plan, get empty answer"
            break

    if error_message is None:
        # Report the number of rounds actually run rather than a fixed count.
        error_message = f"Still failed after {iteration_round} evaluation round(s)."
    print(f"Failed to evaluate plan for task(id={task_id}): {error_message}")


2025-02-18 17:39:47,657 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
2025-02-18 17:39:47,659 - apscheduler.scheduler - INFO - Added job "SimpleCache.refresh_cache" to job store "default"
2025-02-18 17:39:47,659 - apscheduler.scheduler - INFO - Scheduler started
2025-02-18 17:39:47,660 - app.core.task.simple_cache - INFO - Started cache refresh scheduler to run every 24 hours.
2025-02-18 17:39:47,660 - app.core.task.simple_cache - INFO - Starting cache refresh...
2025-02-18 17:39:48,102 - app.core.task.simple_cache - INFO - Cache refresh completed successfully.


In [2]:
from datetime import datetime, timedelta, timezone

# How far back to look for tasks that still need an evaluation.
LOOKBACK_HOURS = 2

# datetime.utcnow() is deprecated since Python 3.12. Derive the same naive UTC
# value from an aware "now" so the query receives the same kind of timestamp.
# NOTE(review): presumably the task store compares against naive UTC
# datetimes (the logged query window carries no offset) - confirm.
end_time = datetime.now(timezone.utc).replace(tzinfo=None)
start_time = end_time - timedelta(hours=LOOKBACK_HOURS)

pending_tasks = get_evaluation_pending_tasks(
    start_time=start_time
)
pending_tasks

2025-02-18 17:39:52,644 - app.core.task.manager - INFO - Retrieved 3 tasks with evaluation statuses [<EvaluationStatus.NOT_EVALUATED: 'NOT_EVALUATED'>] between 2025-02-18 07:39:52.418279 and 2025-02-18 09:39:52.418331.


[{'id': '3a01484c-7916-4f49-b91c-913a0c303fb8',
  'goal': 'Can you understand the Arabic language?',
  'status': 'completed',
  'created_at': datetime.datetime(2025, 2, 18, 9, 38, 56),
  'updated_at': datetime.datetime(2025, 2, 18, 9, 39, 6),
  'logs': 'Plan execution completed.',
  'best_plan': None,
  'metadata': {'label_path': [{'label': 'Other Topics'},
    {'label': 'Unclear Context'}],
   'response_format': {'Annotations': 'The question is about the language comprehension abilities of the TiDB Docs Bot, not directly related to TiDB features or functionalities.',
    'Background': 'The user is inquiring about the language capabilities of the TiDB Docs Bot, specifically whether it can understand and respond in Arabic',
    'Format': 'text',
    'Lang': 'Arabic'}},
  'evaluation_status': 'NOT_EVALUATED',
  'human_evaluation_status': 'NOT_EVALUATED'},
 {'id': '87a91c09-ea08-4898-90e7-24cfbb78e5d2',
  'goal': 'What does "hh" mean in the context of TiDB or how can I assist you with TiD

In [3]:
# Evaluate every pending task once on its "main" branch. With a single
# iteration the answer is judged and recorded but the plan is never revised.
for task in pending_tasks:
    task_id = task["id"]
    optimize_plan(task_id=task_id, branch_name="main", max_iteration=1)

2025-02-18 17:39:57,661 - apscheduler.executors.default - INFO - Running job "SimpleCache.refresh_cache (trigger: interval[1 day, 0:00:00], next run at: 2025-02-18 17:39:57 +08)" (scheduled at 2025-02-18 17:39:57.656878+08:00)
2025-02-18 17:39:57,664 - app.core.task.simple_cache - INFO - Starting cache refresh...
2025-02-18 17:39:58,423 - app.core.task.simple_cache - INFO - Cache refresh completed successfully.
2025-02-18 17:39:58,423 - apscheduler.executors.default - INFO - Job "SimpleCache.refresh_cache (trigger: interval[1 day, 0:00:00], next run at: 2025-02-19 17:39:57 +08)" executed successfully


Start to evaluate plan for task(id=3a01484c-7916-4f49-b91c-913a0c303fb8,branch=main)


2025-02-18 17:40:00,942 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Goal Pass! Can you understand the Arabic language?, evaluation result:{
  "accept": true,
  "plan_adjustment_suggestion": "The Plan successfully generated an appropriate response to the user's inquiry about language comprehension. However, the Plan could be improved by explicitly including a step to verify the language capabilities of the TiDB Docs Bot, ensuring that the response is not only generated in Arabic but also accurately reflects the bot's capabilities. Additionally, the Plan could benefit from a dual retrieval process to confirm the language capabilities before generating the final answer.",
  "goal_classification": "Direct Problem Resolution"
}
Start to evaluate plan for task(id=87a91c09-ea08-4898-90e7-24cfbb78e5d2,branch=main)


2025-02-18 17:40:04,553 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Goal Pass! What does "hh" mean in the context of TiDB or how can I assist you with TiDB-related questions?, evaluation result:{
  "accept": true,
  "plan_adjustment_suggestion": "The Plan could be improved by incorporating a dual retrieval process to ensure that any potential meanings of 'hh' related to TiDB are explored more thoroughly. This would involve using both retrieve_knowledge_graph and vector_search tools to gather any relevant information before generating the LLM response. Additionally, the Plan could include a step to explicitly ask the user for more context or clarification if the initial interpretation is uncertain.",
  "goal_classification": "Direct Problem Resolution"
}
Start to evaluate plan for task(id=b02a74d7-6d76-4ed8-939c-55fede235891,branch=main)


2025-02-18 17:40:07,704 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Goal Pass! What are the potential errors that might be encountered when using TiUP?, evaluation result:{
  "accept": true,
  "plan_adjustment_suggestion": "The Plan is generally well-structured, but it could be improved by ensuring that the dual retrieval process is explicitly followed by immediate processing through the LLM generation tool. This would ensure that the combined insights are synthesized effectively. Additionally, the Plan could include a step to verify the relevance and applicability of the retrieved information to avoid any potential inclusion of irrelevant data.",
  "goal_classification": "Direct Problem Resolution"
}
