In [None]:
!pip install -U datasets huggingface_hub fsspec

In [None]:
from itertools import islice

from datasets import load_dataset, Dataset

# Number of Natural Questions training examples to sample.
NUM_SAMPLES = 1500

# streaming=True iterates over the split lazily instead of downloading it first.
dataset = load_dataset("natural_questions", split="train", streaming=True)

# islice stops after NUM_SAMPLES items, replacing the manual counter/break loop.
sample_dataset = list(islice(dataset, NUM_SAMPLES))


def _question_text(question):
    """Return the question as a plain string.

    The Hugging Face ``natural_questions`` schema stores ``question`` as a
    mapping with ``text`` and ``tokens`` fields. Unwrapping it here keeps the
    dict repr out of the LLM prompt and out of the saved JSONL. Values that
    are already plain strings are returned unchanged.
    """
    if isinstance(question, dict):
        return question["text"]
    return question


queries = [_question_text(item['question']) for item in sample_dataset]

In [None]:
# Prompt skeleton in ``str.format`` syntax: ``{query}`` is the only placeholder,
# and doubled braces render as the literal braces of the JSON demonstrations.
_QUERY_ANALYSIS_TEMPLATE = """
You are an assistant that performs step-by-step analysis of user queries.

**Instructions for Query Analysis:**
When given a query, please **understand the query intents**, and classify the query as either **[Local]** or **[Global]**.
- **[Global]**: The query requires a broad or vague range of knowledge (e.g., summary or open-ended questions), and may require a comprehensive understanding of the document.
- **[Local]**: The query has a clear and fixed answer with a narrow scope of knowledge (e.g., factual questions), and only a small amount of text fragments are needed to answer.

**Output Format:**
Please present the results in JSON format with the following keys:
**query_type**: [Local] or [Global]

**Demonstration**
Query: What is the summary of the movie "Inception"?
Results: {{"query_type": "[Global]"}}
Query: Who directed the movie "Inception"?
Results: {{"query_type": "[Local]"}}

Query: {query}
Results:
"""


def create_prompt_for_query_analysis(query):
    """Build the Local/Global classification prompt for a single query.

    Args:
        query: The user query to classify; it is interpolated verbatim.

    Returns:
        The full prompt: task instructions, the expected JSON output format,
        two worked demonstrations, and finally the query followed by an open
        ``Results:`` line for the model to complete.
    """
    return _QUERY_ANALYSIS_TEMPLATE.format(query=query)

In [None]:

def _extract_json_payload(text):
    """Return the outermost ``{...}`` span in *text*, or the stripped text if none exists."""
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end > start:
        return text[start:end + 1]
    return text.strip()


def get_query_type_from_llm(query):
    """Call the LLM through the configured API client and return the query type.

    Relies on two names being defined at module level before the call:
    ``client`` (an OpenAI-compatible client) and
    ``create_prompt_for_query_analysis``.

    Args:
        query: The user query to classify.

    Returns:
        ``"[Local]"`` or ``"[Global]"`` on success. ``None`` when the request
        fails, the reply is empty, the reply is not a JSON object, or the
        ``query_type`` value is not one of the two labels. Every failure is
        reported with ``print`` rather than raised, so a batch loop keeps going.
    """
    # Imported locally so the function does not depend on another cell having
    # imported json first.
    import json

    prompt = create_prompt_for_query_analysis(query)
    # Pre-bind so the JSON error handler can always reference it.
    response_content = None
    try:
        completion = client.chat.completions.create(
            model="qwen3-30b-a3b",
            messages=[
                {'role': 'user', 'content': prompt}
            ],
            temperature=0.2,  # low temperature keeps the classification stable
            # Provider-specific flag forwarded in the request body; presumably
            # turns off the model's thinking mode -- confirm in provider docs.
            extra_body={"enable_thinking": False}
        )

        # Text of the first (and only requested) completion choice.
        response_content = completion.choices[0].message.content
        if not response_content:
            print(f"Warning: Empty response for query: '{query}'")
            return None

        # Models often wrap JSON in markdown code fences or add prose around
        # it; parse only the outermost {...} span.
        result_json = json.loads(_extract_json_payload(response_content))
        if not isinstance(result_json, dict):
            print(f"Warning: Response is not a JSON object for query: '{query}'. Response: '{response_content}'")
            return None

        query_type = result_json.get("query_type")
        if query_type not in ("[Local]", "[Global]"):
            print(f"Warning: Invalid query_type '{query_type}' for query: '{query}'")
            return None

        return query_type

    except json.JSONDecodeError:
        print(f"Error: Failed to decode JSON from response for query '{query}'. Response: '{response_content}'")
        return None
    except Exception as e:
        # Best-effort: network/API/shape errors are logged and the query skipped.
        print(f"Error processing query '{query}': {e}")
        return None

In [None]:
# --- 5. Main loop and data saving ---
from tqdm import tqdm
from openai import OpenAI
import os
import json
training_data = []
print("Starting data generation...")

# Never commit credentials: read the API key from the environment instead.
# Any key that was previously hardcoded in this notebook should be treated as
# leaked and revoked.
api_key = os.environ.get("DASHSCOPE_API_KEY")
if not api_key:
    raise RuntimeError(
        "DASHSCOPE_API_KEY is not set; export it before running this cell."
    )

# OpenAI-compatible client pointed at the DashScope endpoint; it is read as a
# module-level global by get_query_type_from_llm.
client = OpenAI(
        api_key=api_key,
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
    )


# Classify every query; queries whose classification failed (None) are skipped.
for query in tqdm(queries, desc="Generating Query Types"):
    query_type = get_query_type_from_llm(query)
    if query_type:
        training_data.append({"query": query, "query_type": query_type})

print(f"\nSuccessfully generated data for {len(training_data)} queries.")

# Save as a JSON Lines file: one JSON object per line.
output_filename = "step1_data.jsonl"
print(f"Saving data to '{output_filename}'...")
try:
    with open(output_filename, "w", encoding="utf-8") as f:
        for item in training_data:
            # ensure_ascii=False keeps non-ASCII query text readable in the file.
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    print("Data generation complete!")
except Exception as e:
    print(f"Error saving data to file: {e}")

Starting data generation...


Generating Query Types: 100%|██████████| 1500/1500 [24:26<00:00,  1.02it/s]


Successfully generated data for 1500 queries.
Saving data to 'query_analysis_data.jsonl'...
Data generation complete!



