In [1]:
import json
import re
from datasets import load_dataset
from tqdm import tqdm
import wikipediaapi
import itertools

# --- 1. 初始化工具 (无变化) ---
print("Initializing Wikipedia API...")
wiki_api = wikipediaapi.Wikipedia(
    language='en',
    extract_format=wikipediaapi.ExtractFormat.WIKI,
    user_agent="MyKILTDataGenerator/1.0 (contact@example.com)"
)

# --- 2. 辅助函数 (无变化) ---
def clean_text(text):
    """移除维基百科文本中的常见噪声"""
    # ... (代码与之前相同) ...
    text = re.sub(r'\{\{.*?\}\}', '', text)
    text = re.sub(r'\[\[File:.*?\]\]', '', text)
    text = re.sub(r"''+", "", text)
    return text.strip()

def truncate_paragraph(paragraph_text, k=30):
    """对段落进行截断"""
    # ... (代码与之前相同) ...
    words = paragraph_text.split()
    if len(words) > 2 * k:
        return ' '.join(words[:k]) + ' <skip> ' + ' '.join(words[-k:])
    else:
        return paragraph_text

def process_wiki_section(section, level=2):
    """递归处理维基百科页面章节"""
    # ... (代码与之前相同) ...
    plain_texts = []
    xml_parts = []
    paragraphs = [p for p in section.text.split('\n') if p.strip()]
    for p_text in paragraphs:
        cleaned_p = clean_text(p_text)
        if len(cleaned_p.split()) < 5:
            continue
        plain_texts.append(cleaned_p)
        truncated_p = truncate_paragraph(cleaned_p)
        xml_parts.append(truncated_p + "<br>")
    for subsection in section.sections:
        sub_plain, sub_xml = process_wiki_section(subsection, level + 1)
        tag = "subsection" if level >= 2 else "section"
        title = clean_text(subsection.title)
        plain_texts.append(title)
        plain_texts.extend(sub_plain)
        xml_parts.append(f"<{tag}: {title}>")
        xml_parts.extend(sub_xml)
        xml_parts.append(f"</{tag.split(':')[0]}>")
    return plain_texts, xml_parts

def generate_training_pair(wiki_page):
    """为单个维基百科页面生成训练数据对"""
    # ... (代码与之前相同) ...
    if not wiki_page.exists():
        return None, None
    summary_text = clean_text(wiki_page.summary)
    if not summary_text:
        return None, None
    plain_texts = [summary_text]
    xml_parts = [f"<abstract>{truncate_paragraph(summary_text)}</abstract>"]
    for section in wiki_page.sections:
        sec_plain, sec_xml = process_wiki_section(section)
        title = clean_text(section.title)
        plain_texts.append(title)
        plain_texts.extend(sec_plain)
        xml_parts.append(f"<section: {title}>")
        xml_parts.extend(sec_xml)
        xml_parts.append("</section>")
    input_text = "\n".join(plain_texts)
    output_xml = re.sub(r'(\s*<br>\s*)+', '<br>', "".join(xml_parts)).strip()
    if len(input_text.split()) < 50 or len(output_xml.split()) < 20:
        return None, None
    return input_text, output_xml

# --- 4. 主循环和数据加载/保存 (已修复和优化) ---

def extract_titles_from_kilt(dataset):
    """
    一个更健壮的函数，用于从KILT数据集中提取维基百科标题。
    """
    all_titles = []
    
    # KILT数据集的每个样本都是一个独立的字典
    for sample in dataset:
        # 'output' 键的值可能是一个字典或列表
        output_field = sample.get('output')
        if not output_field:
            continue

        # 确保我们处理的是一个列表，即使它只有一个元素
        outputs_to_check = output_field if isinstance(output_field, list) else [output_field]

        for item in outputs_to_check:
            # 检查 'provenance' 键
            if isinstance(item, dict) and 'provenance' in item:
                provenance_list = item['provenance']
                
                # 再次确保 provenance_list 是一个列表
                if isinstance(provenance_list, list):
                    for prov in provenance_list:
                        if isinstance(prov, dict) and 'wikipedia_title' in prov:
                            all_titles.append(prov['wikipedia_title'])
                            
    # 去重并排序
    return sorted(list(set(all_titles)))


if __name__ == "__main__":
    print("Loading KILT dataset...")
    unique_titles = []
    
    try:
        # 使用 kilt_tasks 的 hotpotqa 子集作为示例
        kilt_dataset = load_dataset("kilt_tasks", "hotpotqa", split="train[:500]")
        
        # 调试: 打印第一个样本的结构
        if len(kilt_dataset) > 0:
            print("\n--- Debug: First sample structure ---")
            print(kilt_dataset[0])
            print("-------------------------------------\n")
            
        # 使用新的健壮函数来提取标题
        unique_titles = extract_titles_from_kilt(kilt_dataset)
        
        print(f"Found {len(unique_titles)} unique Wikipedia articles to process.")
        if len(unique_titles) > 0:
            print("Sample titles:", unique_titles[:5])
        
    except Exception as e:
        print(f"Failed to load and parse KILT dataset: {e}")
        exit()

    if not unique_titles:
        print("No unique titles found in the dataset. Exiting.")
        exit()

    training_data = []
    output_filename = "document_structuring_data.jsonl"

    print("\nStarting data generation for document structuring...")
    for title in tqdm(unique_titles, desc="Processing Wikipedia Pages"):
        try:
            page = wiki_api.page(title)
            input_text, output_xml = generate_training_pair(page)
            if input_text and output_xml:
                training_data.append({
                    "input_text": input_text,
                    "output_xml": output_xml
                })
        except Exception as e:
            print(f"Skipping article '{title}' due to an unexpected error: {e}")
            continue

    print(f"\nSuccessfully generated {len(training_data)} training pairs.")

    print(f"Saving data to '{output_filename}'...")
    try:
        with open(output_filename, "w", encoding="utf-8") as f:
            for item in training_data:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")
        print("Stage 2 data generation complete!")
        if training_data:
            print("\n--- Sample Generated Data ---")
            print(json.dumps(training_data[0], indent=2, ensure_ascii=False))
            print("---------------------------\n")

    except Exception as e:
        print(f"Error saving data to file: {e}")

Initializing Wikipedia API...
Loading KILT dataset...

--- Debug: First sample structure ---
{'id': '5a7a06935542990198eaf050', 'input': "Which magazine was started first Arthur's Magazine or First for Women?", 'meta': {'left_context': '', 'mention': '', 'right_context': '', 'partial_evidence': [], 'obj_surface': [], 'sub_surface': [], 'subj_aliases': [], 'template_questions': []}, 'output': [{'answer': "Arthur's Magazine", 'meta': {'score': -1}, 'provenance': [{'bleu_score': 1.0, 'start_character': 0, 'start_paragraph_id': 1, 'end_character': 112, 'end_paragraph_id': 1, 'meta': {'fever_page_id': '', 'fever_sentence_id': -1, 'annotation_id': '-1', 'yes_no_answer': '', 'evidence_span': []}, 'section': 'Section::::Abstract.', 'title': "Arthur's Magazine", 'wikipedia_id': '27290714'}, {'bleu_score': 1.0, 'start_character': 0, 'start_paragraph_id': 1, 'end_character': 80, 'end_paragraph_id': 1, 'meta': {'fever_page_id': '', 'fever_sentence_id': -1, 'annotation_id': '-1', 'yes_no_answer': '

Processing Wikipedia Pages: 0it [00:00, ?it/s]


Successfully generated 0 training pairs.
Saving data to 'document_structuring_data.jsonl'...
Stage 2 data generation complete!





In [14]:
import json
import re
from datasets import load_dataset
from tqdm import tqdm
import wikipediaapi
import itertools


print("Initializing Wikipedia API...")
wiki_api = wikipediaapi.Wikipedia(
    language='en',
    extract_format=wikipediaapi.ExtractFormat.WIKI,
    user_agent="MyKILTDataGenerator/1.0 (contact@example.com)"
)


def clean_text(text):
    text = re.sub(r'\{\{.*?\}\}', '', text)
    text = re.sub(r'\[\[File:.*?\]\]', '', text)
    text = re.sub(r"''+", "", text)
    return text.strip()

def truncate_paragraph(paragraph_text, k=15):
    words = paragraph_text.split()
    if len(words) > 2 * k:
        return ' '.join(words[:k]) + ' <skip> ' + ' '.join(words[-k:])
    else:
        return paragraph_text

def process_wiki_section(section, level=2):
    plain_texts = []
    xml_parts = []
    paragraphs = [p for p in section.text.split('\n') if p.strip()]
    for p_text in paragraphs:
        cleaned_p = clean_text(p_text)
        if len(cleaned_p.split()) < 5:
            continue
        plain_texts.append(cleaned_p)
        truncated_p = truncate_paragraph(cleaned_p)
        xml_parts.append(truncated_p + "<br>")
    for subsection in section.sections:
        sub_plain, sub_xml = process_wiki_section(subsection, level + 1)
        tag = "subsection" if level >= 2 else "section"
        title = clean_text(subsection.title)
        plain_texts.append(title)
        plain_texts.extend(sub_plain)
        xml_parts.append(f"<{tag}: {title}>")
        xml_parts.extend(sub_xml)
        xml_parts.append(f"</{tag.split(':')[0]}>")
    return plain_texts, xml_parts

def generate_training_pair(wiki_page):
    if not wiki_page.exists():
        return None, None
    summary_text = clean_text(wiki_page.summary)
    if not summary_text:
        return None, None
    plain_texts = [summary_text]
    xml_parts = [f"<abstract>{truncate_paragraph(summary_text)}</abstract>"]
    for section in wiki_page.sections:
        sec_plain, sec_xml = process_wiki_section(section)
        title = clean_text(section.title)
        plain_texts.append(title)
        plain_texts.extend(sec_plain)
        xml_parts.append(f"<section: {title}>")
        xml_parts.extend(sec_xml)
        xml_parts.append("</section>")
    input_text = "\n".join(plain_texts)
    output_xml = re.sub(r'(\s*<br>\s*)+', '<br>', "".join(xml_parts)).strip()
    if len(input_text.split()) < 50 or len(output_xml.split()) < 20:
        return None, None
    return input_text, output_xml



def extract_titles_from_kilt(dataset):
    """
    一个严格按照KILT HotpotQA样本结构来提取标题的函数。
    """
    all_titles = []
    
    for sample in dataset:
        # 'output' 键的值是一个列表
        output_list = sample.get('output')
        if not isinstance(output_list, list):
            continue

        for output_item in output_list:
            # 每个元素是一个字典，包含 'provenance'
            if not isinstance(output_item, dict):
                continue
            
            provenance_list = output_item.get('provenance')
            if not isinstance(provenance_list, list):
                continue

            for prov_item in provenance_list:
                if not isinstance(prov_item, dict):
                    continue
                
                # KILT数据集的键名可能是 'title' 或 'wikipedia_title'
                # 我们两个都检查，以提高兼容性
                title = prov_item.get('title') or prov_item.get('wikipedia_title')
                
                if title:
                    all_titles.append(title)
                            
    # 去重并排序
    return sorted(list(set(all_titles)))


if __name__ == "__main__":
    print("Loading KILT dataset...")
    unique_titles = []
    
    try:
        kilt_dataset = load_dataset("kilt_tasks", "hotpotqa", split="train[500:1500]")
        
        
        unique_titles = extract_titles_from_kilt(kilt_dataset)
        
        print(f"Found {len(unique_titles)} unique Wikipedia articles to process.")
        if len(unique_titles) > 0:
            print("Sample titles:", unique_titles[:10]) 
        
    except Exception as e:
        print(f"An error occurred during dataset loading or parsing: {e}")
        exit()

    if not unique_titles:
        print("No unique titles found in the dataset. This might be normal for a small sample. Exiting.")
        exit()

    training_data = []
    output_filename = "document_structuring_data.jsonl"

    print("\nStarting data generation for document structuring...")
    for title in tqdm(unique_titles, desc="Processing Wikipedia Pages"):
        try:
            page = wiki_api.page(title)
            input_text, output_xml = generate_training_pair(page)
            if input_text and output_xml:
                training_data.append({
                    "input_text": input_text,
                    "output_xml": output_xml
                })
        except Exception as e:
            print(f"Skipping article '{title}' due to an unexpected error: {e}")
            continue

    print(f"\nSuccessfully generated {len(training_data)} training pairs.")

    print(f"Saving data to '{output_filename}'...")
    try:
        with open(output_filename, "w", encoding="utf-8") as f:
            for item in training_data:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")
        print("Stage 2 data generation complete!")
        if training_data:
            print("\n--- Sample Generated Data ---")
            
            if training_data:
                print(json.dumps(training_data[0], indent=2, ensure_ascii=False))
            print("---------------------------\n")

    except Exception as e:
        print(f"Error saving data to file: {e}")

Initializing Wikipedia API...
Loading KILT dataset...
Found 769 unique Wikipedia articles to process.
Sample titles: ['101 Park Avenue', '12th Lumières Awards', '1895–96 Everton F.C. season', '1963 Pan American Games', '1976 German Grand Prix', '1979 (song)', "1999 French Open – Women's Doubles", '2000–01 Utah Jazz season', '2001 Asian Junior Athletics Championships', '2003–04 VfL Wolfsburg season']

Starting data generation for document structuring...


Processing Wikipedia Pages: 100%|██████████| 769/769 [11:51<00:00,  1.08it/s]


Successfully generated 751 training pairs.
Saving data to 'document_structuring_data.jsonl'...
Stage 2 data generation complete!

--- Sample Generated Data ---
{
  "input_text": "101 Park Avenue is a 629-foot (192 m) tall skyscraper at 41st Street and Park Avenue in the Murray Hill neighborhood of Manhattan, New York.\nEli Attia Architects designed the tower. The building contains various tenants, as well as several attractions and amenities such as Convene, Five Iron Golf, and Museum of the Dog. \nThe building is assigned its own ZIP Code, 10178; it was one of 41 buildings in Manhattan that had their own ZIP Codes as of 2019.\nIn popular culture\nIt features in the 1983 film Escape from the Bronx as President Clark's headquarters of the General Construction (GC) Corporation. It was used as the facade of the fictional \"Pemrose building\" in the 1987 film The Secret of My Success, as well as the fictional \"Clamp Center\" in the 1990 film Gremlins 2: The New Batch. The building featur




# training_data

In [16]:
type(training_data)

list

In [20]:
# 写入 JSONL 文件
with open("step2_data.jsonl", "w", encoding="utf-8") as f:
    for item in training_data:
        json_line = json.dumps(item, ensure_ascii=False)
        f.write(json_line + "\n")

In [22]:
import os
import json
import re
from openai import OpenAI
from datasets import load_dataset
from tqdm import tqdm
import wikipediaapi

# --- 1. 初始化工具 ---

print("Initializing OpenAI and Wikipedia APIs...")
# 初始化你的LLM Client
try:
    client = OpenAI(
        api_key="sk-5b02f81c8ebb45db837eb74acc16437e",
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
    )
    if not client.api_key or client.api_key == "YOUR_DASHSCOPE_API_KEY":
        raise ValueError("API Key not found. Please set the environment variable.")
except Exception as e:
    print(f"Error initializing OpenAI client: {e}")
    exit()

# 初始化维基百科API
wiki_api = wikipediaapi.Wikipedia(
    language='en',
    extract_format=wikipediaapi.ExtractFormat.WIKI,
    user_agent="MyStage3DataGenerator/1.0 (contact@example.com)"
)

# --- 2. 辅助函数 ---

def clean_text(text):
    text = re.sub(r'\{\{.*?\}\}', '', text)
    text = re.sub(r'\[\[File:.*?\]\]', '', text)
    text = re.sub(r"''+", "", text)
    return text.strip()

def get_document_outline(wiki_page):
    """从维基百科页面对象中解析出摘要和大纲。"""
    if not wiki_page.exists():
        return None, None
    
    # 摘要是页面开头的第一段非空文本
    abstract = clean_text(wiki_page.summary)
    
    # 大纲是所有章节的标题列表
    outline = []
    for section in wiki_page.sections:
        outline.append(clean_text(section.title))
        # 也可以包含子章节，但为简化起见，我们先只用顶级章节
        for subsection in section.sections:
            outline.append(f"  - {clean_text(subsection.title)}")
            
    if not abstract or not outline:
        return None, None
        
    return abstract, outline

def create_prompt_for_global_selection(query, abstract, outline):
    """根据论文附录 Prompt B 构建标注指令。"""
    outline_str = "\n".join(f"- {title}" for title in outline)
    
    prompt = f"""
You are an expert assistant. Your task is to identify which sections of a document are relevant for answering a given question.

You will be provided with three inputs:
1. A question.
2. The abstract of a document.
3. An outline of the document, containing its section titles.

Based on these inputs, select all the section titles from the outline that are helpful for answering the question. Always include 'abstract' in your selection if the abstract itself contains relevant information.

**Output Format:**
Provide the output as a JSON object with a single key "selected_titles", which is a list of strings. Each string must be an exact title from the outline, or the word 'abstract'.

**Example:**
Question: Who was the first man to walk on the moon, and what was his role in the Apollo program?
Document abstract: Neil Armstrong was an American astronaut and the first person to walk on the Moon. He was a key figure in the Apollo 11 mission...
Document outline:
- Early life
- NASA career
- Apollo 11 mission
- Later life
- Legacy
Output:
{{
  "selected_titles": ["abstract", "NASA career", "Apollo 11 mission"]
}}

**Your Task:**
Question: {query}
Document abstract: {abstract}
Document outline:
{outline_str}

Output:
"""
    return prompt

def get_global_selection_from_llm(query, abstract, outline):
    """调用 LLM API 获取相关的章节标题。"""
    prompt = create_prompt_for_global_selection(query, abstract, outline)
    try:
        completion = client.chat.completions.create(
            model="qwen3-30b-a3b", # 使用你的模型
            messages=[{'role': 'user', 'content': prompt}],
            temperature=0.0,
            response_format={"type": "json_object"}, # 尝试使用JSON模式
            extra_body={"enable_thinking": False}
        )
        response_content = completion.choices[0].message.content
        result_json = json.loads(response_content)
        
        # 验证返回的数据结构
        selected_titles = result_json.get("selected_titles")
        if isinstance(selected_titles, list):
            return selected_titles
        else:
            print(f"Warning: LLM returned invalid format for query '{query}'. Response: {response_content}")
            return None

    except Exception as e:
        print(f"Error calling LLM for query '{query}': {e}")
        return None

# --- 3. 主处理流程 ---

if __name__ == "__main__":
    print("Loading KILT dataset...")
    try:
        # 加载少量数据作为示例
        kilt_dataset = load_dataset("kilt_tasks", "hotpotqa", split="train[600:800]")
    except Exception as e:
        print(f"Failed to load KILT dataset: {e}")
        exit()

    training_data = []
    output_filename = "global_selection_data3.jsonl"
    
    print("Starting Stage 3 data generation...")
    for sample in tqdm(kilt_dataset, desc="Processing Samples"):
        query = sample['input']
        
        # 一个查询可能关联多个文档，我们只处理第一个
        provenance_list = sample['output'][0].get('provenance')
        if not provenance_list:
            continue
        
        # 以第一个证据来源作为文档
        doc_title = provenance_list[0].get('title') or provenance_list[0].get('wikipedia_title')
        if not doc_title:
            continue
            
        try:
            # 获取文档大纲
            wiki_page = wiki_api.page(doc_title)
            abstract, outline = get_document_outline(wiki_page)
            if not abstract or not outline:
                # print(f"Skipping '{doc_title}' due to missing abstract or outline.")
                continue

            # 调用LLM进行标注
            selected_titles = get_global_selection_from_llm(query, abstract, outline)
            
            if selected_titles is not None:
                # 构造最终的训练样本
                input_prompt = f"Question: {query}\nAbstract: {abstract}\nOutline:\n" + "\n".join(outline)
                training_data.append({
                    "input": input_prompt,
                    "output": selected_titles
                })

        except Exception as e:
            print(f"An unexpected error occurred for doc '{doc_title}': {e}")
            continue

    print(f"\nSuccessfully generated {len(training_data)} training samples.")

    print(f"Saving data to '{output_filename}'...")
    try:
        with open(output_filename, "w", encoding="utf-8") as f:
            for item in training_data:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")
        print("Stage 3 data generation complete!")
        if training_data:
            print("\n--- Sample Generated Data ---")
            print(json.dumps(training_data[0], indent=2, ensure_ascii=False))
            print("---------------------------\n")

    except Exception as e:
        print(f"Error saving data to file: {e}")

Initializing OpenAI and Wikipedia APIs...
Loading KILT dataset...
Starting Stage 3 data generation...


Processing Samples: 100%|██████████| 200/200 [06:18<00:00,  1.89s/it]


Successfully generated 159 training samples.
Saving data to 'global_selection_data3.jsonl'...
Stage 3 data generation complete!

--- Sample Generated Data ---
{
  "input": "Question: What piece was composed in 1937 for the American Bandmasters Association and has a melody similar to Green Bushes?\nAbstract: Green Bushes is an English folk song (Roud #1040, Laws P2) which is featured in the second movement of Vaughan Williams's English Folk Song Suite, in Percy Grainger's Green Bushes (Passacaglia on an English Folksong), and in George Butterworth's The Banks of Green Willow. The melody is very similar to that of the \"Lost Lady Found\" movement of Percy Grainger's Lincolnshire Posy, and to \"Cutty Wren\".\nAccording to Roud and Bishop\n\nThis was an immensely popular song, collected many times across England, although not so often elsewhere. It was also very popular with nineteenth-century broadside printers.\nThe song first appears in broadsides of the 1820s or 1830s. Its popularity 


