In [1]:
import json
from typing import Optional

from app.core.plan.evaluator import evaulate_answer
from app.llm.interface import LLMInterface
from app.config.settings import EVALUATION_LLM_PROVIDER, EVALUATION_LLM_MODEL

from notebooks.plan_chat_optimizer import get_task_answer, update_plan, execute_task_using_new_plan
from notebooks.tasks import get_evaluation_pending_tasks, record_evaluation

eval_llm = LLMInterface(EVALUATION_LLM_PROVIDER, EVALUATION_LLM_MODEL)

def optimize_plan(task_id:str, branch_name:Optional[str]="main", max_iteration=3):
    """Evaluate a task's answer and, while it is rejected, revise and re-run its plan.

    Each round fetches the task detail for the current branch via
    ``get_task_answer``, passes goal/metadata/final answer/plan to
    ``evaulate_answer`` together with ``eval_llm``, and stores the verdict with
    ``record_evaluation``. An accepted answer ends the function. A rejected one
    triggers ``update_plan`` + ``execute_task_using_new_plan`` and the next
    round evaluates the branch reported by that execution.

    Args:
        task_id: Identifier of the task to evaluate.
        branch_name: Branch evaluated in the first round.
        max_iteration: Maximum number of evaluation rounds. One round always
            runs; with a value of 1 the plan is evaluated but never revised.

    Returns:
        None. Outcomes are reported through ``record_evaluation`` and stdout.
    """
    current_branch_name = branch_name
    error_message = None
    iteration_round = 0

    while True:
        iteration_round += 1
        print(f"Start to evaluate plan for task(id={task_id},branch={current_branch_name})")
        detail = get_task_answer(task_id, current_branch_name)

        if detail is None:
            # Without this exit a missing detail would skip every break/return
            # below and the loop would poll get_task_answer forever.
            error_message = f"No task detail found on branch {current_branch_name}"
            break

        goal = detail.get("goal")
        final_answer = detail.get("final_answer")
        plan = detail.get("plan")
        metadata = detail.get("metadata")

        if plan is None:
            record_evaluation(task_id, "REJECTED", "No plan found")
            return

        eval_res = evaulate_answer(eval_llm, goal, metadata, final_answer, plan)
        # Single source of truth for the verdict: only a literal True counts as
        # acceptance, so the recorded status and the control flow cannot disagree.
        accepted = eval_res.get("accept", False) is True
        eval_status = "APPROVED" if accepted else "WAITING_FOR_EVALUATION"
        eval_reason = json.dumps(eval_res, indent=2)

        record_evaluation(task_id, eval_status, eval_reason)

        if accepted:
            print(f"Goal Pass! {goal}, evaluation result:{eval_reason}")
            return

        print(f"Goal Not Pass! {goal}, the evaluation result:{eval_reason}")

        # Round budget exhausted: stop before spending effort on another revision.
        if iteration_round >= max_iteration:
            break

        revision = update_plan(goal, metadata, eval_reason, plan)
        print("revised plan:", revision)
        if revision is None:
            # Report the failure instead of raising AttributeError on `.get`.
            error_message = "Failed to generate a revised plan, get empty result"
            break
        reasoning = revision.get("reasoning", None)
        revised_plan = revision.get("plan", None)

        try:
            updated_result = execute_task_using_new_plan(task_id, revised_plan, reasoning)
            print(f"Revised plan execution result {updated_result}")
        except Exception as e:
            error_message = f"Failed to execute task using new plan {e}"
            break

        # A None result is handled like one that lacks branch/answer below.
        if updated_result is None:
            updated_result = {}

        current_branch_name = updated_result.get("branch_name", None)
        current_final_answer = updated_result.get("final_answer", None)
        if current_branch_name is None or current_final_answer is None:
            error_message = "Failed to execut task using new plan, get empty answer"
            break

    if error_message is None:
        # Report the number of rounds actually run rather than a fixed count.
        error_message = f"Still failed after {iteration_round} evaluation round(s)."
    print(f"Failed to evaluate plan for task(id={task_id}): {error_message}")


2025-02-18 19:38:04,193 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
2025-02-18 19:38:04,194 - apscheduler.scheduler - INFO - Added job "SimpleCache.refresh_cache" to job store "default"
2025-02-18 19:38:04,195 - apscheduler.scheduler - INFO - Scheduler started
2025-02-18 19:38:04,195 - app.core.task.simple_cache - INFO - Started cache refresh scheduler to run every 24 hours.
2025-02-18 19:38:04,195 - app.core.task.simple_cache - INFO - Starting cache refresh...
2025-02-18 19:38:14,195 - apscheduler.executors.default - INFO - Running job "SimpleCache.refresh_cache (trigger: interval[1 day, 0:00:00], next run at: 2025-02-18 19:38:14 +08)" (scheduled at 2025-02-18 19:38:14.192442+08:00)
2025-02-18 19:38:14,197 - app.core.task.simple_cache - INFO - Starting cache refresh...
2025-02-18 19:38:23,368 - app.core.task.simple_cache - INFO - Cache refresh completed successfully.


In [2]:
from datetime import datetime, timedelta, timezone

# Look back two hours from the current UTC time.
# `datetime.utcnow()` is deprecated since Python 3.12; taking an aware "now"
# and dropping tzinfo yields the same naive-UTC value as before.
end_time = datetime.now(timezone.utc).replace(tzinfo=None)
start_time = end_time - timedelta(hours=2)

# Only the lower bound is passed; `end_time` is not forwarded to the query.
pending_tasks = get_evaluation_pending_tasks(
    start_time=start_time
)
pending_tasks

2025-02-18 19:38:28,360 - app.core.task.manager - INFO - Retrieved 18 tasks with evaluation statuses [<EvaluationStatus.NOT_EVALUATED: 'NOT_EVALUATED'>] between 2025-02-18 09:38:27.071674 and 2025-02-18 11:38:27.071725.


[{'id': '2dc87c69-fd03-4c62-93d8-97a3a368e0d8',
  'goal': 'What are the specific advantages and challenges of deploying the PD component in microservices mode in TiDB?',
  'status': 'failed',
  'created_at': datetime.datetime(2025, 2, 18, 11, 5, 57),
  'updated_at': datetime.datetime(2025, 2, 18, 11, 6, 23),
  'logs': 'Execution was interrupted by the client.',
  'best_plan': None,
  'metadata': {'label_path': [{'label': 'Complex Task Planning'},
    {'label': 'Research & Analysis'},
    {'label': 'Technical Design'}],
   'response_format': {'Background': "TiDB's PD component and its role in the architecture, including its functions such as metadata management, scheduling, and high availability. The microservices mode for PD was introduced experimentally in version 8.0.0 to reduce module interference and support larger workloads, as discussed in the previous conversation. Annotations: User is seeking detailed insights into the benefits and potential difficulties associated with deployi

In [3]:
# Evaluation-only pass: a budget of one round means each task is judged once
# on its "main" branch and its plan is never revised.
for task in pending_tasks:
    task_id = task["id"]
    optimize_plan(task_id=task_id, branch_name="main", max_iteration=1)

Start to evaluate plan for task(id=2dc87c69-fd03-4c62-93d8-97a3a368e0d8,branch=main)


2025-02-18 19:38:33,560 - app.core.task.simple_cache - INFO - Cache refresh completed successfully.
2025-02-18 19:38:33,561 - apscheduler.executors.default - INFO - Job "SimpleCache.refresh_cache (trigger: interval[1 day, 0:00:00], next run at: 2025-02-19 19:38:14 +08)" executed successfully


2025-02-18 19:38:35,259 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Goal Not Pass! What are the specific advantages and challenges of deploying the PD component in microservices mode in TiDB?, the evaluation result:{
  "accept": false,
  "plan_adjustment_suggestion": "The Plan lacks dual retrieval, which could enhance the quality of the information gathered. It should incorporate both retrieve_knowledge_graph and vector_search tools to ensure comprehensive data collection. Additionally, the Plan should include immediate processing of combined results through the LLM generation tool to extract key insights and present a coherent narrative. The absence of a Final Answer suggests a possible execution error or incomplete data processing, which needs to be addressed.",
  "goal_classification": "Direct Problem Resolution"
}
Failed to evaluate plan for task(id=2dc87c69-fd03-4c62-93d8-97a3a368e0d8): Still failed after two evaluations round.
Start to evaluate plan for task(id=55ac031c-68a6-429d-8976-a16ca9f3512b,branch=main)


2025-02-18 19:38:40,444 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Goal Pass! What is the command to scale out a TiDB cluster using TiUP?, evaluation result:{
  "accept": true,
  "plan_adjustment_suggestion": "While the answer is acceptable and provides a comprehensive guide on scaling out a TiDB cluster using TiUP, the Plan could be improved by ensuring that the retrieval steps are more focused on directly addressing the user's primary query about the specific command. The Plan currently includes extensive information on topology preparation, risks, and verification, which, while useful, may not be necessary for a user solely interested in the command itself. Streamlining the Plan to prioritize the retrieval and generation of the specific command could enhance efficiency.",
  "goal_classification": "Direct Problem Resolution"
}
Start to evaluate plan for task(id=55bb0c88-2763-40b0-83b0-570a820496fe,branch=main)


2025-02-18 19:38:44,931 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Goal Pass! What are the best practices for performance optimization in TiDB?, evaluation result:{
  "accept": true,
  "plan_adjustment_suggestion": "The Final Answer effectively addresses the Goal by providing a comprehensive overview of best practices for performance optimization in TiDB. However, the Plan could be improved by ensuring dual retrieval is consistently followed by immediate processing through the LLM generation tool to extract key insights and present a coherent narrative. Additionally, the Plan should ensure that the retrieval results are not passed as raw data to non-LLM tools, which could lead to inefficiencies.",
  "goal_classification": "Direct Problem Resolution"
}
Start to evaluate plan for task(id=59ed101b-fbcf-4bd3-aa95-c4e289f5c232,branch=main)


2025-02-18 19:38:51,295 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Goal Pass! If the `id` column is the primary key in the proposed SQL for partitioning a table in TiDB, how should the partition definition be structured?, evaluation result:{
  "accept": true,
  "plan_adjustment_suggestion": "The Plan successfully retrieves relevant information and generates an appropriate SQL example in Japanese. However, it could be improved by implementing dual retrieval using both retrieve_knowledge_graph and vector_search tools to ensure comprehensive coverage of TiDB partitioning features and primary key constraints. Additionally, the Plan should immediately process combined results through the LLM generation tool to extract key insights and present a coherent narrative. This would enhance the robustness and reliability of the Final Answer.",
  "goal_classification": "Direct Problem Resolution"
}
Start to evaluate plan for task(id=5fab14aa-83b1-4e67-951f-0dc3932427c1,branch=main)


2025-02-18 19:38:56,698 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Goal Not Pass! Are there any syntax issues with the following table definition in TiDB?

```sql
CREATE TABLE `swap` (
    `id` BIGINT UNSIGNED AUTO_RANDOM NOT NULL,
    `network` VARCHAR(10) NOT NULL COMMENT '所属网络',
    `pool_address` VARCHAR(80) NOT NULL COMMENT '交易池地址',
    `block_number` BIGINT UNSIGNED NOT NULL COMMENT '区块号',
    `tx_hash` VARCHAR(100) NOT NULL COMMENT '交易哈希',
    `tx_index` INT UNSIGNED NOT NULL COMMENT '交易偏移量',
    `created` BIGINT UNSIGNED NOT NULL COMMENT '交易时间戳',
    `side_type` TINYINT UNSIGNED NOT NULL COMMENT '交易类型',
    `amount_in` VARCHAR(64) NOT NULL COMMENT '交易输入数量',
    `amount_out` VARCHAR(64) NOT NULL COMMENT '交易输出数量',
    `wallet` VARCHAR(80) NOT NULL COMMENT '支付钱包地址',
    `amount_in_token` VARCHAR(80) NOT NULL COMMENT '输入代币',
    `amount_out_token` VARCHAR(80) NOT NULL COMMENT '输出代币',
    `stable_usd_price` VARCHAR(64) NOT NULL COMMENT '稳定币美元价格',
    `token_usd_price` VARCHAR(64) NOT NULL COMMENT '兑换货币美元价格',
    `usd_volume` VARCHAR(64) NOT NULL CO

2025-02-18 19:39:02,967 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Goal Pass! What is MySQL?, evaluation result:{
  "accept": true,
  "plan_adjustment_suggestion": "The Plan effectively generates a comprehensive answer by using two LLM generation steps: one for introducing MySQL and another for comparing MySQL with TiDB. However, the Plan could be improved by incorporating a dual retrieval step to ensure that the information is up-to-date and accurate, especially when discussing technical aspects and comparisons. This would involve using both retrieve_knowledge_graph and vector_search tools to gather the latest data before generating the final answer.",
  "goal_classification": "Direct Problem Resolution"
}
Start to evaluate plan for task(id=6cf09abd-8910-45e2-9842-524214490ca4,branch=main)


2025-02-18 19:39:08,090 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Goal Pass! tiup mirror merge 命令的作用是什么？, evaluation result:{
  "accept": true,
  "plan_adjustment_suggestion": "While the answer is acceptable, the Plan could be improved by ensuring that the retrieval steps are more focused on extracting the most relevant and concise information. The Plan should also ensure that the dual retrieval process is immediately followed by processing through the LLM generation tool to avoid any potential inclusion of irrelevant data. Additionally, the Plan could benefit from explicitly checking for any missing steps that might overlook critical aspects of the Goal.",
  "goal_classification": "Direct Problem Resolution"
}
Start to evaluate plan for task(id=81a9e7e9-f18a-44cd-8ac2-ebfd4781e02d,branch=main)


2025-02-18 19:39:13,218 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Goal Pass! NUMA binding failure in TiDB, evaluation result:{
  "accept": true,
  "plan_adjustment_suggestion": "The Final Answer effectively addresses the user's Goal by providing a comprehensive analysis of potential causes and solutions for NUMA binding failures in TiDB. However, the Plan could be improved by ensuring that each retrieval step is immediately followed by processing through the LLM generation tool to extract key insights and present a coherent narrative. This would enhance the efficiency of the Plan and ensure that the information is synthesized effectively at each stage.",
  "goal_classification": "Direct Problem Resolution"
}
Start to evaluate plan for task(id=871b70e7-c036-498c-8a9b-ea6dc4b9c65b,branch=main)


2025-02-18 19:39:18,484 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Goal Pass! What are the specifications for scaling out using TiUP?, evaluation result:{
  "accept": true,
  "plan_adjustment_suggestion": "The Final Answer effectively resolves the user's Goal by providing a comprehensive and detailed guide on how to scale out a TiDB cluster using TiUP. However, the Plan could be improved by ensuring that each retrieval step is immediately followed by processing through the LLM generation tool to extract key insights and present a coherent narrative. This would prevent any potential loss of context or information between retrieval and generation steps. Additionally, the Plan should ensure that all retrievals are necessary and directly contribute to the Final Answer, avoiding any redundant steps.",
  "goal_classification": "Direct Problem Resolution"
}
Start to evaluate plan for task(id=949c9928-8c07-45df-8edb-3773319c4be0,branch=main)


2025-02-18 19:39:23,282 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Goal Pass! Compare TiDB and Citus in detail, and provide guidance on how to choose between them for a project., evaluation result:{
  "accept": true,
  "plan_adjustment_suggestion": "The Final Answer effectively resolves the Goal by providing a detailed comparison of TiDB and Citus, focusing on their features, performance, scalability, and use cases. However, the Plan could be improved by ensuring that after each dual retrieval (using both vector_search and retrieve_knowledge_graph), the results are immediately processed through the LLM generation tool to extract key insights and present a coherent narrative. This would prevent the potential risk of including raw data or irrelevant information. Additionally, the Plan should ensure that the context is properly set for each LLM generation step to maintain coherence and relevance throughout the answer generation process.",
  "goal_classification": "Direct Problem Resolution"
}
Start to evaluate plan for task(id=a301ba2a-1639-456b-8c89-8d5

2025-02-18 19:39:28,365 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Goal Not Pass! How to generate the DDL statements for all tables in TiDB?, the evaluation result:{
  "accept": false,
  "plan_adjustment_suggestion": "The Final Answer does not fully resolve the user's Goal. The answer provides example DDL statements for specific tables in the INFORMATION_SCHEMA, but it does not explain how to generate DDL statements for all user-defined tables in a TiDB database. The Plan should include steps to retrieve and process information on how to extract DDL statements for all tables in a TiDB database, not just the INFORMATION_SCHEMA tables. Additionally, the Plan should ensure that the answer includes a method or SQL query that can be executed to achieve the user's goal. The Plan should also implement dual retrieval using both retrieve_knowledge_graph and vector_search tools to ensure comprehensive information gathering.",
  "goal_classification": "Direct Problem Resolution"
}
Failed to evaluate plan for task(id=a301ba2a-1639-456b-8c89-8d5942436ac4): Still f

2025-02-18 19:39:32,933 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Goal Pass! How to address the error related to assertion failure in TiDB, specifically the "assertion failure" in prewrite.rs, and what steps can be taken to resolve it?, evaluation result:{
  "accept": true,
  "plan_adjustment_suggestion": "The Final Answer effectively resolves the user's Goal by providing a comprehensive analysis of the prewrite phase in TiDB and offering actionable solutions to address the assertion failure. However, the Plan could be improved by ensuring dual retrieval is consistently applied. Specifically, after retrieving information from both the knowledge graph and vector search, the Plan should immediately process these combined results through the LLM generation tool to extract key insights and present a coherent narrative. This would enhance the efficiency and coherence of the answer generation process.",
  "goal_classification": "Direct Problem Resolution"
}
Start to evaluate plan for task(id=b3b537af-ef4b-4fc9-a612-fcc448aa5f57,branch=main)


2025-02-18 19:39:38,102 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Goal Pass! What is the ticdc_kvclient_cached_region metric in TiDB?, evaluation result:{
  "accept": true,
  "plan_adjustment_suggestion": "While the answer is acceptable, the Plan could be improved by ensuring that the retrieval steps are more focused on the specific metric rather than general TiCDC metrics. Additionally, the Plan should ensure that the retrieval process includes a step to verify the relevance and accuracy of the retrieved information before synthesis. Implementing dual retrieval consistently and processing results immediately through the LLM generation tool could enhance the quality and relevance of the final answer.",
  "goal_classification": "Direct Problem Resolution"
}
Start to evaluate plan for task(id=c255c42b-c85d-450f-aa2b-b101f37adef8,branch=main)


2025-02-18 19:39:43,069 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Goal Not Pass! Who are you?, the evaluation result:{
  "accept": false,
  "plan_adjustment_suggestion": "The Final Answer does not directly address the user's Goal of understanding the identity and capabilities of the TiDB Docs Bot. The Plan should include a step to explicitly state the identity and capabilities of the TiDB Docs Bot, as outlined in the supplementary goal information. This could involve directly incorporating the information from the prompt used in the 'llm_generate' tool into the Final Answer. Additionally, the Plan should ensure that the generated response clearly communicates the bot's role and functions to the user.",
  "goal_classification": "Direct Problem Resolution"
}
Failed to evaluate plan for task(id=c255c42b-c85d-450f-aa2b-b101f37adef8): Still failed after two evaluations round.
Start to evaluate plan for task(id=c5c0bc70-8b3f-437e-9c4b-4563e4c79be4,branch=main)


2025-02-18 19:39:47,538 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Goal Pass! When does disk space get reclaimed for row deletions in TiDB?, evaluation result:{
  "accept": true,
  "plan_adjustment_suggestion": "While the answer is acceptable, the Plan could be improved by ensuring that the retrieval process explicitly checks for the most recent and relevant documentation to avoid potential outdated information. Additionally, the Plan could benefit from a step that verifies the accuracy of the retrieved information against the latest TiDB updates or release notes to ensure the answer remains current.",
  "goal_classification": "Direct Problem Resolution"
}
Start to evaluate plan for task(id=cf211e26-2517-4264-8354-d2989f594093,branch=main)


2025-02-18 19:39:52,540 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Goal Pass! What is the maximum number of nodes that a TiDB cluster can have in a real-world scenario?, evaluation result:{
  "accept": true,
  "plan_adjustment_suggestion": "The answer effectively resolves the user's goal by providing detailed information on the scalability limits of TiDB, TiKV, and PD nodes. However, the Plan could be improved by ensuring that the retrieval process includes a step to verify the most current and comprehensive data from both the knowledge graph and vector search. Additionally, the Plan should ensure that the retrieval tools are used in a complementary manner to cover any potential gaps in information. This could involve a more explicit integration of the results from both retrieval methods before generating the final answer.",
  "goal_classification": "Direct Problem Resolution"
}
Start to evaluate plan for task(id=d86ef57b-946a-4d69-b3b1-d9e04beb0bd2,branch=main)


2025-02-18 19:39:57,578 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Goal Pass! How to evaluate the impact of future expansion plans on the choice between TiDB and Citus? What specific measures do TiDB and Citus have for handling data security and privacy? How is Citus's sharding capability specifically implemented in multi-tenant application scenarios? What are some successful cases of TiDB's HTAP capabilities in practical applications? In scenarios involving geographically distributed data, how does Citus ensure data consistency and access speed?, evaluation result:{
  "accept": true,
  "plan_adjustment_suggestion": "The Final Answer effectively addresses the user's Goal by providing a comprehensive comparison of TiDB and Citus across various dimensions such as scalability, security, sharding, HTAP capabilities, and geographic distribution. However, the Plan could be improved by ensuring that each retrieval step is immediately followed by processing through the LLM generation tool to extract key insights and present a coherent narrative. Additionally,

2025-02-18 19:40:02,235 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Goal Pass! Why do we need the PD (Placement Driver) microservice in TiDB, and in what scenarios or data volume should it be enabled?, evaluation result:{
  "accept": true,
  "plan_adjustment_suggestion": "The Final Answer effectively addresses the user's Goal by explaining the role of the PD component in TiDB and the scenarios where it is essential. However, the Plan could be improved by ensuring that the retrieval steps are more focused on the specific aspects of the Goal, such as the data volume thresholds for enabling PD. Additionally, the Plan could benefit from a more streamlined approach by reducing the number of retrieval steps and ensuring that each retrieval directly contributes to the final answer. Implementing dual retrieval for each query and immediately processing the results through the LLM generation tool could enhance efficiency and coherence.",
  "goal_classification": "Direct Problem Resolution"
}
