In [2]:
import pandas as pd
from typing import List
from sqlalchemy.orm import joinedload

from app.llm.interface import LLMInterface
from app.core.task.utils import describe_goal
from app.config.database import SessionLocal
from app.storage.models import Task, TaskStatus
from notebooks.plan_mcts_optimizer import get_task_commit_tree

eval_client = LLMInterface("openai", "o3-mini")

def get_answer_commits(root_commit, commit_tree) -> List:
    """Collect the leaf commits below ``root_commit`` that produced a final answer.

    The tree is walked depth-first in ``children`` order. A commit without
    children is a leaf; it is reported only when
    ``vm_state["variables"]["final_answer"]`` is truthy.

    Args:
        root_commit: Commit dict to start from. ``None`` or an empty dict
            yields no answers.
        commit_tree: Mapping of commit hash to commit dict, used to resolve
            the hashes listed under each commit's ``children``.

    Returns:
        One dict per answering leaf with the keys ``commit_hash``,
        ``committed_at``, ``description``, ``seq_no``, ``commit_type``,
        ``parent_hash``, ``final_answer``, ``reasoning`` and ``plan``.
        The list is empty when no leaf carries a final answer.

    Raises:
        KeyError: If an answering leaf lacks one of the identifying fields.
    """
    if not root_commit:
        return []

    children = root_commit.get("children")
    if not children:
        # `or {}` also covers keys that are present but explicitly None.
        vm_state = root_commit.get("vm_state") or {}
        variables = vm_state.get("variables") or {}
        final_answer = variables.get("final_answer")
        if not final_answer:
            return []
        return [{
            "commit_hash": root_commit["commit_hash"],
            "committed_at": root_commit["committed_at"],
            "description": root_commit["description"],
            "seq_no": root_commit["seq_no"],
            "commit_type": root_commit["commit_type"],
            "parent_hash": root_commit["parent_hash"],
            "final_answer": final_answer,
            "reasoning": vm_state.get("reasoning"),
            "plan": vm_state.get("current_plan"),
        }]

    leaves = []
    for child_hash in children:
        child_commit = commit_tree.get(child_hash)
        if child_commit is None:
            # A dangling child reference is skipped instead of aborting the walk.
            continue
        leaves.extend(get_answer_commits(child_commit, commit_tree))
    return leaves

def find_plan_commit(plan, answers):
    """Return the first entry of ``answers`` whose ``'plan'`` equals ``plan``.

    Args:
        plan: Plan value to look for (compared with ``==``).
        answers: Iterable of answer dicts, each holding a ``'plan'`` key.

    Returns:
        The first matching answer dict, or ``None`` when nothing matches.
    """
    matches = (candidate for candidate in answers if candidate['plan'] == plan)
    return next(matches, None)


def list_tasks(filter_out_ids=None):
    """Fetch non-deleted tasks that have a best plan, most recently updated first.

    Args:
        filter_out_ids: Optional collection of task IDs to exclude. A falsy
            value (``None`` or empty) applies no exclusion.

    Returns:
        The matching ``Task`` rows, with their ``namespace`` relationship
        joined-loaded, ordered by ``updated_at`` descending.
    """
    with SessionLocal() as session:
        task_query = session.query(Task).options(joinedload(Task.namespace))
        task_query = task_query.filter(Task.status != TaskStatus.deleted)
        # `!= None` is deliberate: it is a column expression, not a Python identity test.
        task_query = task_query.filter(Task.best_plan != None)

        # The exclusion clause is only added when there is something to exclude.
        if filter_out_ids:
            task_query = task_query.filter(Task.id.not_in(filter_out_ids))

        newest_first = task_query.order_by(Task.updated_at.desc())
        return newest_first.all()

In [3]:
import os

# Column layout of the locally cached sample table.
columns = [
    'id', 'goal', 'metadata', 'reasoning', 'plan',
    'plan_commit_hash', 'answers', 'qualified_answers',
    'is_valid', 'language',
]
local_sample_file = "notebooks/local_samples.pkl"

# Start from an empty table; it is replaced below when a cached pickle exists.
local_sample_df = pd.DataFrame(columns=columns)

try:
    if os.path.exists(local_sample_file):
        local_sample_df = pd.read_pickle(local_sample_file)
    # IDs already present in the cache are passed to list_tasks as the exclusion list.
    task_ids_list = local_sample_df['id'].tolist()
    tasks = list_tasks(task_ids_list)
except Exception as e:
    # Any failure in this block (pickle read or task query) leaves no tasks to process.
    print(f"Error reading local sample file: {e}")
    tasks = []

In [4]:
def _build_sample(task, answer_commit=None, answers=None):
    """Assemble one sample record for ``task``.

    Args:
        task: Task row; ``id``, ``goal``, ``meta`` and ``best_plan`` are read.
        answer_commit: Answer dict whose plan matched the task's best plan,
            or ``None`` when there is no such commit.
        answers: Answer dicts found in the task's commit tree, or ``None``
            when the task has no commit tree.

    Returns:
        A dict with the keys ``id``, ``goal``, ``metadata``, ``plan``,
        ``reasoning``, ``plan_commit_hash``, ``answers``,
        ``qualified_answers``, ``is_valid`` and ``language``.
    """
    # A task without metadata is treated as having empty metadata.
    metadata = task.meta or {}
    response_format = metadata.get("response_format") or {}
    return {
        "id": task.id,
        "goal": task.goal,
        "metadata": metadata,
        "plan": task.best_plan,
        "reasoning": answer_commit.get("reasoning") if answer_commit else None,
        "plan_commit_hash": answer_commit.get("commit_hash") if answer_commit else None,
        "answers": answers if answers is not None else [],
        "qualified_answers": [],
        "is_valid": True,
        # Both spellings of the language key are accepted.
        "language": response_format.get("Lang") or response_format.get("lang"),
    }


samples = []

for index, task in enumerate(tasks, start=1):
    print(index, "Processing task to get plan samples", task.id)
    commit_tree = get_task_commit_tree(task.id)

    if not commit_tree:
        # Without a commit tree there are no answers, but the best plan is still recorded.
        print(f"No commit tree found for task {task.id}, recording best plan without answers")
        sample = _build_sample(task)
        print(sample)
        samples.append(sample)
        continue

    # The root is the commit that has no parent.
    root_commit = next(
        (commit for commit in commit_tree.values() if not commit["parent_hash"]),
        None,
    )
    if root_commit is None:
        print(f"No root commit found for task {task.id}")
        continue

    leaves = get_answer_commits(root_commit, commit_tree)
    try:
        answer_commit = find_plan_commit(task.best_plan, leaves)
        sample = _build_sample(task, answer_commit, leaves)
    except Exception as e:
        print(f"Error building sample for task {task.id}: {e}. Data {task.meta}")
        raise
    print(sample)
    samples.append(sample)


1 Processing task to get plan samples 35690183-70c9-401e-9617-435f1acb09c0
{'id': '35690183-70c9-401e-9617-435f1acb09c0', 'goal': 'How does TiDB handle case sensitivity in schema names when querying the `information_schema.columns` table, and how does this behavior differ between TiDB versions 6.5 and 5.1?', 'metadata': {'label_path': [{'label': 'Complex Task Planning'}, {'label': 'Research & Analysis'}, {'label': 'Comparative Analysis'}, {'label': 'Version Comparison'}], 'response_format': {'Annotations': 'User is comparing the behavior of schema name case sensitivity in TiDB versions 6.5 and 5.1, as evidenced by the provided SQL query results', 'Background': "TiDB's handling of case sensitivity in schema and table names, particularly in the `information_schema` database, and how this behavior may vary between different TiDB versions", 'Format': 'text', 'Include SQLs Example Section': 'If (and only if) the answer contains SQL operations, please feel free to provide an example section 

In [5]:
# Turn the freshly built sample dicts into a frame and append them to the cached table.
new_samples_df = pd.DataFrame(samples)
local_sample_df = pd.concat(
    [local_sample_df, new_samples_df],
    ignore_index=True,  # rebuild a contiguous 0..n-1 row index
)

# Write the combined table back to the pickle file.
local_sample_df.to_pickle(local_sample_file)

print(f"Added {len(samples)} new samples. Total samples: {len(local_sample_df)}")

Added 89 new samples. Total samples: 264


In [9]:
from app.utils.json import extract_json

def _evaluate_criteria(llm_client, prompt_template: str, context: dict) -> tuple[int, str]:
    """Evaluate a single quality criteria using LLM.
    
    Args:
        llm_client: Initialized LLM client
        prompt_template: Prompt template with placeholders
        context: Dictionary containing template variables
        
    Returns:
        Tuple of (score, explanation) with fail-safes for errors
    """
    try:
        formatted_prompt = prompt_template.format(**context)
        response = llm_client.generate(formatted_prompt)
        json_obj = extract_json(response)
        return json_obj.get("score", 0), json_obj.get("explanation", "No explanation provided")
    except Exception as e:
        print(f"Evaluation failed: {str(e)}, data: {response}")
        raise e

def evaluate_and_filter_answers(llm_client, row, threshold: int = 7) -> list[dict]:
    """Score each answer of a sample on two criteria and keep the ones that pass.

    Every answer is scored for language consistency and for the logical flow
    of its plan. The scores and explanations are written back onto the answer
    dict itself (``lang_score``, ``lang_explanation``, ``logic_score``,
    ``logic_explanation``), whether or not the answer passes.

    Args:
        llm_client: Client handed through to ``_evaluate_criteria``.
        row: Sample row providing ``goal``, ``metadata`` and ``answers``.
        threshold: Minimum score both criteria must reach for an answer to be kept.

    Returns:
        The answers whose two scores are both ``>= threshold``, in input order.
    """
    # Both templates are the same for every answer, so they are spelled out once.
    lang_prompt = """Evaluate language consistency between goal and answer:
Goal: {goal_desc} (Response Language: {response_lang})
Answer: {final_answer}

Checklist:
✓ Language matches specified response language
✓ Consistent terminology usage
✓ No unintentional code-switching
✓ Grammar/syntax correctness

Provide your evaluation in this JSON format:

{{
    "score": [0-10], # 10 is the highest score means the plan and answer are perfect.
    "explanation": "Detailed explanation of the score..."
}}"""

    logic_prompt = """Analyze logical consistency of this plan for '{goal_desc}':
Plan: {plan}

Checklist:
✓ Step-by-step progression without gaps
✓ No contradictory statements
✓ Clear termination condition
✓ Appropriate error handling
✓ Unnecessary repeated steps (Note: Different tools for the same query are intentional design choices and should NOT be considered redundant.)
✓ Redundant variables
✓ Over-engineering indicators

Note: Different query methods for the same target are intentional design choices and should NOT be considered redundant.


Provide your evaluation in this JSON format:

{{
    "score": [0-10], # 10 is the highest score means the plan is perfect.
    "explanation": "Detailed explanation of the score..."
}}"""

    kept_answers = []
    goal_desc = describe_goal(row['goal'], row['metadata'])

    for answer in row['answers']:
        print(f"Evaluating answer: {answer['commit_hash']}")

        # Values substituted into both templates
        context = dict(
            goal_desc=goal_desc,
            final_answer=answer.get('final_answer', ''),
            plan=str(answer.get('plan', '')),
            response_lang=row['metadata'].get('response_language', 'match goal language'),
        )

        # Criterion 1: language consistency; criterion 2: logical flow of the plan
        lang_score, lang_expl = _evaluate_criteria(llm_client, lang_prompt, context)
        logic_score, logic_expl = _evaluate_criteria(llm_client, logic_prompt, context)

        # The evaluation metadata is attached to the answer dict in place
        evaluation_res = {
            'lang_score': lang_score,
            'lang_explanation': lang_expl,
            'logic_score': logic_score,
            'logic_explanation': logic_expl,
        }
        answer.update(evaluation_res)

        passed = lang_score >= threshold and logic_score >= threshold
        if not passed:
            print(f"Answer {answer['commit_hash']} failed one or more checks:{evaluation_res}")
            continue
        kept_answers.append(answer)

    return kept_answers

In [None]:
import pandas as pd
from tqdm.auto import tqdm

# Score every sample that does not hold qualified answers yet
sample_rows = tqdm(local_sample_df.iterrows(), total=len(local_sample_df), desc="Evaluating answers")
for index, row in sample_rows:
    # A non-empty qualified_answers means the row was already evaluated
    if row['qualified_answers'] and len(row['qualified_answers']) > 0:
        continue

    # Nothing to score when the row carries no answers
    if row['answers'] is None or len(row['answers']) == 0:
        continue

    try:
        qualified_answers = evaluate_and_filter_answers(eval_client, row)
        local_sample_df.at[index, 'qualified_answers'] = qualified_answers

        # Valid only if something qualified AND the best plan's commit is among it
        qualified_hashes = [ans['commit_hash'] for ans in qualified_answers]
        local_sample_df.at[index, 'is_valid'] = (
            len(qualified_answers) > 0 and row['plan_commit_hash'] in qualified_hashes
        )

        # Checkpoint whenever the row label is a multiple of 10
        if index % 10 == 0:
            local_sample_df.to_pickle(local_sample_file)

    except Exception as e:
        # A failing row is reported and left as it was; the loop moves on
        print(f"Error processing row {index}: {str(e)}")
        continue

# Persist the final state
local_sample_df.to_pickle(local_sample_file)

# Summary of rows currently flagged as valid
total_evaluated = int((local_sample_df['is_valid'] == True).sum())
print(f"Evaluation complete. {total_evaluated} samples are valid (have qualified answers and valid best plan).")

In [49]:
def evaluate_best_plan(llm_client, row) -> "str | None":
    """Ask the LLM to pick the best plan among a sample's candidate answers.

    The row is read with legacy column names first and the current sample
    table's names as fallback: ``filtered_answers`` / ``qualified_answers``,
    ``goal_description`` / ``goal`` and ``task_id`` / ``id``. A column that is
    missing, ``None`` or NaN counts as absent.

    Args:
        llm_client: Client object exposing ``generate(prompt)``.
        row: Mapping-like sample row (a dict or a pandas Series; only
            ``row.get`` is used).

    Returns:
        The ``best_commit`` value from the LLM's JSON reply, or ``None`` when
        there are no candidates or the evaluation fails.
    """
    def _first_usable(*keys):
        """Return the first value among ``keys`` that is neither None nor NaN."""
        for key in keys:
            value = row.get(key)
            if value is None:
                continue
            if isinstance(value, float) and value != value:  # NaN never equals itself
                continue
            return value
        return None

    candidates = _first_usable('filtered_answers', 'qualified_answers')
    if not candidates:
        return None

    goal_text = _first_usable('goal_description', 'goal')
    # Resolved before the try block so the error handler cannot fail on a missing key.
    task_ref = _first_usable('task_id', 'id')

    # Prepare evaluation prompt
    sections = [f"""Evaluate these plan candidates for the goal: {goal_text}

Evaluation Criteria:
1. Answer Quality: Correctness, completeness, and clarity of final answer
2. Plan Logic: Logical flow, step progression, and error handling
3. Efficiency: Minimal redundant steps while maintaining intentional design choices
4. Documentation: Clear reasoning and plan documentation

Plans to Evaluate:
"""]
    for idx, answer in enumerate(candidates, 1):
        sections.append(f"\nPlan {idx} (Commit: {answer['commit_hash']}):\n")
        sections.append(f"- Final Answer: {answer['final_answer']}\n")
        sections.append(f"- Plan: {answer['plan']}\n")
        sections.append(f"- Reasoning: {answer['reasoning']}\n")

    sections.append("""
Output MUST be JSON format:
{
    "best_commit": "commit_hash",
    "explanation": "Detailed comparison analysis..."
}""")
    prompt = "".join(sections)

    try:
        response = llm_client.generate(prompt)
        result = extract_json(response)
        return result.get('best_commit')
    except Exception as e:
        print(f"Best plan evaluation failed for task {task_ref}: {str(e)}")
        return None

In [7]:

def generate_reasoning(llm_client, row):
    """Ask the LLM for a reasoning narrative that leads up to a sample's best plan.

    The final answer is taken from the entry of ``row['answers']`` whose
    ``commit_hash`` equals ``row['plan_commit_hash']``. When no entry matches
    (or ``answers`` is not a list/tuple), a plan-only prompt that never
    mentions a final answer is used.

    Args:
        llm_client: Client object exposing ``generate(prompt)``.
        row: Sample row providing ``goal``, ``plan``, ``plan_commit_hash``
            and ``answers``.

    Returns:
        Whatever ``llm_client.generate`` returns for the assembled prompt.
    """
    goal = row['goal']
    best_plan = row['plan']

    # A missing/NaN answers cell is treated as "no answers".
    answers = row['answers'] if isinstance(row['answers'], (list, tuple)) else []
    final_answer = None
    for answer in answers:
        if row['plan_commit_hash'] == answer['commit_hash']:
            final_answer = answer['final_answer']
            break

    # Fragments shared verbatim by both prompt variants.
    task_header = 'Your task is to write a reasoning narrative that:\n'
    approach_bullets = (
        '- Starts with the goal and explores how to approach it.\n'
        '- Includes logical, detailed steps of thinking, such as identifying what needs to be done, breaking it into smaller sub-goals, considering possible methods or tools, reflecting on potential challenges or alternatives, and systematically narrowing down the approach.\n'
        '- Uses a natural, human-like thought process (e.g., "I need to... First, I\'ll... Then considering... What if... Maybe I should also check... This makes sense because... Therefore...").\n'
    )
    quality_bullets = (
        '- Remains a general thought process that could precede the creation of the plan and answer.\n'
        '- Must be detailed and thorough, providing a rich exploration of the problem with high-quality insights (e.g., justifying choices, weighing pros and cons, considering edge cases), while avoiding irrelevant tangents or redundant filler.\n'
        '- Should feel like a coherent, step-by-step intellectual journey that demonstrates deep engagement with the goal.\n\n'
    )
    example_intro = (
        'Output the reasoning as a concise yet detailed paragraph or a short sequence of thoughts. Here\'s an example for reference:\n\n'
        'Example Goal: "Find the value of x in the equation x + 3 = 7"\n'
        'Example Reference Plan: "Step 1: Subtract 3 from both sides. Step 2: Simplify to find x."\n'
    )
    example_reasoning = (
        'Example Reasoning: "I need to figure out what x equals in this equation. Looking at it, I see x is being added to 3, and that total is 7. My goal is to isolate x, so I need to think about how to get rid of the 3. What’s happening here mathematically? Addition is tying x and 3 together, so maybe I can reverse that operation. If I subtract something, though, won’t that change the equation? Wait—I remember equations need balance, so if I adjust one side, I have to adjust the other too. Let’s explore: if I take 3 away from 7, I’d get a number, but I need x alone on the other side. So, subtracting 3 from both sides might work. Could there be another way, like multiplying or dividing? No, that doesn’t fit here—addition’s the key. This feels promising; I’ll proceed by simplifying after balancing both sides, and that should reveal x."\n\n'
    )
    closing = ' without mentioning their specific details. Provide a detailed, high-quality thought process that deeply engages with the goal.'

    if final_answer is not None:
        parts = [
            'You are tasked with generating a reasoning process based on a given goal, which should lead to a specific plan and final answer. However, the reasoning must represent the detailed thought process that occurs *before* the plan is created and the final answer is determined. Here\'s the information:\n\n',
            f'Goal: "{goal}"\n',
            f'Reference Plan (do not mention in output): "{best_plan}"\n',
            f'Reference Final Answer (do not mention in output): "{final_answer}"\n\n',
            task_header,
            approach_bullets,
            '- Does NOT explicitly mention the specific steps of the reference plan or the reference final answer, but ensures the reasoning logically and naturally builds toward them.\n',
            quality_bullets,
            example_intro,
            'Example Reference Final Answer: "4"\n',
            example_reasoning,
            f'Now, generate the reasoning for the given goal: "{goal}", ensuring it aligns with the reference plan "{best_plan}" and reference final answer "{final_answer}"',
            closing,
        ]
    else:
        parts = [
            'You are tasked with generating a reasoning process based on a given goal, which should lead to a specific plan. However, the reasoning must represent the detailed thought process that occurs *before* the plan is created. Here\'s the information:\n\n',
            f'Goal: "{goal}"\n',
            f'Reference Plan (do not mention in output): "{best_plan}"\n',
            task_header,
            approach_bullets,
            '- Does NOT explicitly mention the specific steps of the reference plan, but ensures the reasoning logically and naturally builds toward them.\n',
            quality_bullets,
            example_intro,
            example_reasoning,
            # The plan is closed with a single quote; a doubled one used to leak into this prompt.
            f'Now, generate the reasoning for the given goal: "{goal}", ensuring it aligns with the reference plan "{best_plan}"',
            closing,
        ]

    prompt = ''.join(parts)
    return llm_client.generate(prompt)


In [8]:
from tqdm.auto import tqdm

# Fill in reasoning for every row that does not have any yet
reasoning_rows = tqdm(local_sample_df.iterrows(), total=len(local_sample_df), desc="Generating reasoning")
for index, row in reasoning_rows:
    # Rows holding a non-blank reasoning string are left alone
    existing_reasoning = row['reasoning']
    if existing_reasoning and isinstance(existing_reasoning, str) and len(existing_reasoning.strip()) > 0:
        continue

    try:
        reasoning = generate_reasoning(eval_client, row)
        local_sample_df.at[index, 'reasoning'] = reasoning

        # Checkpoint whenever the row label is a multiple of 10
        if index % 10 == 0:
            local_sample_df.to_pickle(local_sample_file)

    except Exception as e:
        # A failing row is reported and skipped; the loop moves on
        print(f"Error generating reasoning for row {index}: {str(e)}")
        continue

# Persist the final state
local_sample_df.to_pickle(local_sample_file)

# Summary of rows whose reasoning cell is not null
total_with_reasoning = int(local_sample_df['reasoning'].notna().sum())
print(f"Processing complete. {total_with_reasoning} samples now have reasoning.")

  from .autonotebook import tqdm as notebook_tqdm
Generating reasoning: 100%|██████████| 264/264 [00:17<00:00, 14.90it/s]

Processing complete. 264 samples now have reasoning.





In [10]:
# Show the sample table as this cell's output (bare last expression).
local_sample_df

Unnamed: 0,id,goal,metadata,reasoning,plan,plan_commit_hash,answers,filtered_answers,is_valid,language,qualified_answers
0,55f17399-fa2f-4697-a4b1-562925fad20d,"How does a composite index work in TiDB, and h...","{'label_path': [{'label': 'Basic Knowledge'}, ...","Okay, so I need to figure out how to update th...",[{'parameters': {'output_vars': ['composite_in...,27b1f6846ff642f0b90e9f30a2c98cd1,[{'commit_hash': 'b0d212533bb74644a1e7943e1943...,,True,English,[{'commit_hash': 'b0d212533bb74644a1e7943e1943...
1,509b857b-9cac-4553-8e2f-ec27fbdb1ba2,Can I upload a file to TiDB?,"{'label_path': [{'label': 'Operation Guide'}, ...",To address the goal of determining whether a f...,[{'parameters': {'output_vars': ['import_metho...,c6fe82e85e634e8abf7c09c5192b2919,[{'commit_hash': 'c6fe82e85e634e8abf7c09c5192b...,,True,English,[{'commit_hash': 'c6fe82e85e634e8abf7c09c5192b...
2,50011e03-74a4-4b3b-ac9a-8697de6358f6,Is it possible to configure the `readpool.stor...,"{'label_path': [{'label': 'Basic Knowledge'}, ...","Okay, I need to update the plan based on the u...",[{'parameters': {'chain_of_thoughts': 'To dete...,0e310465d0854fda8d7b4fe73e55cc82,[{'commit_hash': 'c763984c7ed249a88f0bfc2cfe90...,,True,English,[{'commit_hash': 'c763984c7ed249a88f0bfc2cfe90...
3,4edfca50-ee50-4ac2-b642-5f89ccc8bede,What does the ERROR 9004 (HY000): Resolve lock...,"{'label_path': [{'label': 'Troubleshooting'}, ...",好的，我现在需要帮助用户更新他们的计划，以更好地解决TiDB中的ERROR 9004（HY0...,[{'parameters': {'chain_of_thoughts': 'The ERR...,a4457d735b4a4fc6a61db8aa5961adf2,[{'commit_hash': '12bd88f85a4b484fb6c3ced3ffc5...,,True,Chinese,[{'commit_hash': '12bd88f85a4b484fb6c3ced3ffc5...
4,4c158355-6263-4b60-ae9a-ed4187be4ffc,Under what circumstances would the TiDB Docs B...,"{'label_path': [{'label': 'Other Topics'}, {'l...","Okay, so I need to update the plan based on th...",[{'parameters': {'chain_of_thoughts': 'The goa...,2f8173ca765443b09ac6c95d179f5551,[{'commit_hash': '2f8173ca765443b09ac6c95d179f...,,True,Chinese,[{'commit_hash': '2f8173ca765443b09ac6c95d179f...
...,...,...,...,...,...,...,...,...,...,...,...
259,00285472-0333-4451-bf7a-d3c4cc079717,What are some scenarios where disabling automa...,{'label_path': [{'label': 'Complex Task Planni...,"To address the user's goal, we need to explore...",[{'parameters': {'output_vars': ['compaction_s...,ebc20405f04e4f50bde604127003a3a1,[{'commit_hash': 'ebc20405f04e4f50bde604127003...,,True,English,[]
260,5bda95aa-57ee-4693-9d3b-506c04e3c769,"What does the error message ""ddl puller resolv...","{'label_path': [{'label': 'Troubleshooting'}, ...","Alright, so I'm trying to help the user update...",[{'parameters': {'chain_of_thoughts': 'The err...,f4841938fd1440b9915ca0b30615cc2e,[{'commit_hash': 'f253080e34b9418d86295b3cd6b5...,,True,English,[]
261,5b61e0b0-cb3a-407a-955c-d99df60e0e2b,Compare the stability of TiDB and MySQL.,{'label_path': [{'label': 'Complex Task Planni...,"To compare the stability of TiDB and MySQL, we...",[{'parameters': {'output_vars': ['architecture...,db30247532e94f2ab44ac5941d3e5b2d,[{'commit_hash': 'db30247532e94f2ab44ac5941d3e...,,True,Chinese,[]
262,5a3de4fd-b937-40b0-b727-1bc553759c84,ERROR 1221 (HY000): Incorrect usage of DB GRAN...,"{'label_path': [{'label': 'Troubleshooting'}, ...",好的，我现在需要分析用户的问题，并根据提供的建议和当前的执行结果来更新计划。首先，用户的问题...,[{'parameters': {'error_message': 'ERROR 1221 ...,e4d701b5ed2b4bf4a38d3479db720865,[{'commit_hash': 'a6da2e65b09d43aea7e4852d66e9...,,True,Chinese,[]


In [12]:
# Write the in-memory sample table to the pickle file at `local_sample_file`.
local_sample_df.to_pickle(local_sample_file)

In [18]:
def adjust_plan_steps(plan):
    """Drop a leading reasoning step from a plan and renumber the remaining steps.

    The input is never modified: the steps are deep-copied, so nested values
    of the returned steps are independent of the originals.

    Args:
        plan: List of step dicts. Any falsy or non-list value is returned
            unchanged.

    Returns:
        A new list of step dicts. When the first step has ``type ==
        'reasoning'`` and ``seq_no == 0`` it is removed and the remaining
        steps get ``seq_no`` 0..n-1; otherwise the copied steps are returned
        as they are.

    Raises:
        ValueError: If the plan contains non-dict entries, which makes the
            resulting step count differ from the expected one.
    """
    import copy  # local import keeps this block self-contained

    if not plan or not isinstance(plan, list):
        return plan

    origin_length = len(plan)

    # Non-dict entries are left out here and reported by the length check below.
    new_plan = [copy.deepcopy(step) for step in plan if isinstance(step, dict)]

    if new_plan and new_plan[0].get('type') == 'reasoning' and new_plan[0].get('seq_no') == 0:
        new_plan.pop(0)
        expected_length = origin_length - 1
        # NOTE(review): only `seq_no` is rewritten; if steps can reference
        # other steps by sequence number elsewhere, those references are not
        # updated — confirm against the plan schema.
        for idx, step in enumerate(new_plan):
            step['seq_no'] = idx
    else:
        expected_length = origin_length

    if len(new_plan) != expected_length:
        raise ValueError(f"The number of steps in the adjusted plan does not match the original plan: {len(new_plan)} != {expected_length}")

    return new_plan

In [19]:
from tqdm.auto import tqdm
import pandas as pd

# Reload the sample table from the pickle file
local_sample_df = pd.read_pickle("notebooks/local_samples.pkl")

# Replace each row's plan with its adjusted version
plan_rows = tqdm(local_sample_df.iterrows(), total=len(local_sample_df), desc="Adjusting plans")
for index, row in plan_rows:
    try:
        new_plan = adjust_plan_steps(row['plan'])
        local_sample_df.at[index, 'plan'] = new_plan
    except Exception as e:
        # A failing row keeps its current plan; the error is only reported
        print(f"Error adjusting plan for row {index}: {str(e)}")
        continue

# Keep only the rows flagged as valid and show them
is_valid_mask = local_sample_df['is_valid'] == True
valid_rows = local_sample_df[is_valid_mask]
valid_rows

Adjusting plans: 100%|██████████| 264/264 [00:00<00:00, 30337.71it/s]


Unnamed: 0,id,goal,metadata,reasoning,plan,plan_commit_hash,answers,is_valid,language,qualified_answers
0,55f17399-fa2f-4697-a4b1-562925fad20d,"How does a composite index work in TiDB, and h...","{'label_path': [{'label': 'Basic Knowledge'}, ...","Okay, so I need to figure out how to update th...",[{'parameters': {'output_vars': ['composite_in...,27b1f6846ff642f0b90e9f30a2c98cd1,[{'commit_hash': 'b0d212533bb74644a1e7943e1943...,True,English,[{'commit_hash': 'b0d212533bb74644a1e7943e1943...
1,509b857b-9cac-4553-8e2f-ec27fbdb1ba2,Can I upload a file to TiDB?,"{'label_path': [{'label': 'Operation Guide'}, ...",To address the goal of determining whether a f...,[{'parameters': {'output_vars': ['import_metho...,c6fe82e85e634e8abf7c09c5192b2919,[{'commit_hash': 'c6fe82e85e634e8abf7c09c5192b...,True,English,[{'commit_hash': 'c6fe82e85e634e8abf7c09c5192b...
2,50011e03-74a4-4b3b-ac9a-8697de6358f6,Is it possible to configure the `readpool.stor...,"{'label_path': [{'label': 'Basic Knowledge'}, ...","Okay, I need to update the plan based on the u...",[{'parameters': {'output_vars': ['tikv_config_...,0e310465d0854fda8d7b4fe73e55cc82,[{'commit_hash': 'c763984c7ed249a88f0bfc2cfe90...,True,English,[{'commit_hash': 'c763984c7ed249a88f0bfc2cfe90...
3,4edfca50-ee50-4ac2-b642-5f89ccc8bede,What does the ERROR 9004 (HY000): Resolve lock...,"{'label_path': [{'label': 'Troubleshooting'}, ...",好的，我现在需要帮助用户更新他们的计划，以更好地解决TiDB中的ERROR 9004（HY0...,[{'parameters': {'error_message': 'ERROR 9004 ...,a4457d735b4a4fc6a61db8aa5961adf2,[{'commit_hash': '12bd88f85a4b484fb6c3ced3ffc5...,True,Chinese,[{'commit_hash': '12bd88f85a4b484fb6c3ced3ffc5...
4,4c158355-6263-4b60-ae9a-ed4187be4ffc,Under what circumstances would the TiDB Docs B...,"{'label_path': [{'label': 'Other Topics'}, {'l...","Okay, so I need to update the plan based on th...",[{'parameters': {'output_vars': ['pricing_know...,2f8173ca765443b09ac6c95d179f5551,[{'commit_hash': '2f8173ca765443b09ac6c95d179f...,True,Chinese,[{'commit_hash': '2f8173ca765443b09ac6c95d179f...
...,...,...,...,...,...,...,...,...,...,...
259,00285472-0333-4451-bf7a-d3c4cc079717,What are some scenarios where disabling automa...,{'label_path': [{'label': 'Complex Task Planni...,"To address the user's goal, we need to explore...",[{'parameters': {'output_vars': ['compaction_s...,ebc20405f04e4f50bde604127003a3a1,[{'commit_hash': 'ebc20405f04e4f50bde604127003...,True,English,[]
260,5bda95aa-57ee-4693-9d3b-506c04e3c769,"What does the error message ""ddl puller resolv...","{'label_path': [{'label': 'Troubleshooting'}, ...","Alright, so I'm trying to help the user update...",[{'parameters': {'output_vars': ['ddl_puller_k...,f4841938fd1440b9915ca0b30615cc2e,[{'commit_hash': 'f253080e34b9418d86295b3cd6b5...,True,English,[]
261,5b61e0b0-cb3a-407a-955c-d99df60e0e2b,Compare the stability of TiDB and MySQL.,{'label_path': [{'label': 'Complex Task Planni...,"To compare the stability of TiDB and MySQL, we...",[{'parameters': {'output_vars': ['architecture...,db30247532e94f2ab44ac5941d3e5b2d,[{'commit_hash': 'db30247532e94f2ab44ac5941d3e...,True,Chinese,[]
262,5a3de4fd-b937-40b0-b727-1bc553759c84,ERROR 1221 (HY000): Incorrect usage of DB GRAN...,"{'label_path': [{'label': 'Troubleshooting'}, ...",好的，我现在需要分析用户的问题，并根据提供的建议和当前的执行结果来更新计划。首先，用户的问题...,[{'parameters': {'error_message': 'ERROR 1221 ...,e4d701b5ed2b4bf4a38d3479db720865,[{'commit_hash': 'a6da2e65b09d43aea7e4852d66e9...,True,Chinese,[]


In [20]:
# Columns carried over into the dataset frame
selected_columns = ['id', 'goal', 'metadata', 'reasoning', 'plan']
# Copied so that later edits to dataset_df do not touch valid_rows
dataset_df = valid_rows.loc[:, selected_columns].copy()
dataset_df.columns

Index(['id', 'goal', 'metadata', 'reasoning', 'plan'], dtype='object')

In [21]:
import json
from datasets import Dataset


def _as_json_text(value):
    """Serialize dict/list values to a JSON string; return anything else unchanged."""
    if isinstance(value, (dict, list)):
        return json.dumps(value)
    return value


# The nested columns are stored as JSON text in the dataset
for col in ("metadata", "plan"):
    dataset_df[col] = dataset_df[col].apply(_as_json_text)

# Rebuild a contiguous 0..n-1 index before converting the frame
dataset_df = dataset_df.reset_index(drop=True)
dataset = Dataset.from_pandas(dataset_df)
dataset


Dataset({
    features: ['id', 'goal', 'metadata', 'reasoning', 'plan'],
    num_rows: 235
})

In [22]:
# Upload the dataset to the Hub repository "ianthereal-z/tidb_bot".
dataset.push_to_hub("ianthereal-z/tidb_bot")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 99.77ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:05<00:00,  5.08s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/ianthereal-z/tidb_bot/commit/f879b1d5e21fa1db6fa772c19cf9a369d549f807', commit_message='Upload dataset', commit_description='', oid='f879b1d5e21fa1db6fa772c19cf9a369d549f807', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/ianthereal-z/tidb_bot', endpoint='https://huggingface.co', repo_type='dataset', repo_id='ianthereal-z/tidb_bot'), pr_revision=None, pr_num=None)