In [2]:
import csv
import json

def csv_to_jsonl(csv_file_path, jsonl_file_path):
    """Convert a CSV file to JSON Lines, renaming selected column headers.

    Every CSV row becomes one JSON object on its own output line. Headers
    listed in ``key_mapping`` are replaced by their mapped name; any other
    header is kept unchanged. Values are written exactly as
    ``csv.DictReader`` returns them.

    Args:
        csv_file_path: Path of the CSV file to read (UTF-8, with or without
            a leading byte-order mark).
        jsonl_file_path: Path of the JSONL file to create or overwrite.
    """
    key_mapping = {
        "ID": "id",
        "Question": "problem",
        "Year": "year",
        "Problem Number": "problem number",
        "Answer": "answer",
        "Part": "part",
        # Add further columns to rename here.
    }

    # "utf-8-sig" drops a leading BOM so the first header still matches
    # key_mapping (plain "utf-8" would yield "\ufeffID"). newline='' hands raw
    # line endings to the csv module so newlines inside quoted fields survive.
    with open(csv_file_path, mode='r', encoding='utf-8-sig', newline='') as csv_file, \
         open(jsonl_file_path, mode='w', encoding='utf-8') as jsonl_file:

        reader = csv.DictReader(csv_file)
        for row in reader:
            new_row = {key_mapping.get(k, k): v for k, v in row.items()}
            json_line = json.dumps(new_row, ensure_ascii=False)
            jsonl_file.write(json_line + '\n')

    print(f"转换完成：{csv_file_path} → {jsonl_file_path}")

# Example usage: convert the AIME CSV in the working directory to JSONL.
if __name__ == "__main__":
    csv_to_jsonl(
        csv_file_path="AIME_Dataset_1983_2024.csv",
        jsonl_file_path="AIME_Dataset_1983_2024.jsonl",
    )


转换完成：AIME_Dataset_1983_2024.csv → AIME_Dataset_1983_2024.jsonl


In [3]:
import json
import random

def sample_and_rename_key(input_file, output_file, sample_size=600, exclude_year='2024', seed=None):
    """Sample records from a JSONL file, dropping one year and renaming a key.

    Records whose ``"year"`` equals ``exclude_year`` (compared as strings, so
    ``2024`` and ``"2024"`` both match) are discarded. In every remaining
    record the key ``"problem"`` is renamed to ``"question"``. A random sample
    of ``sample_size`` records is then written to ``output_file``, one JSON
    object per line. If fewer records are available, all of them are written
    (in random order) and a warning is printed.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to create or overwrite.
        sample_size: Number of records to draw without replacement.
        exclude_year: Year value to filter out.
        seed: Optional seed for a private random generator, making the sample
            reproducible. When ``None`` the module-level ``random`` state is
            used, as before.
    """
    filtered_data = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank / whitespace-only lines instead of failing in json.loads.
            if not line.strip():
                continue
            item = json.loads(line)
            # str() on both sides handles integer and string year values alike.
            if str(item.get("year")) == str(exclude_year):
                continue
            # Rename key: problem -> question.
            if "problem" in item:
                item["question"] = item.pop("problem")
            filtered_data.append(item)

    if len(filtered_data) < sample_size:
        print(f"⚠️ 可用数据不足 {sample_size} 条，仅有 {len(filtered_data)} 条。将全部保存。")
        sample_size = len(filtered_data)

    # A dedicated generator keeps a seeded run from disturbing global random state.
    rng = random if seed is None else random.Random(seed)
    sampled_data = rng.sample(filtered_data, sample_size)

    with open(output_file, 'w', encoding='utf-8') as out_f:
        for item in sampled_data:
            out_f.write(json.dumps(item, ensure_ascii=False) + '\n')

    print(f"✅ 已保存 {sample_size} 条数据到 {output_file}")

# Example call.
# NOTE(review): the input path is absolute and machine-specific; consider a
# path relative to a configurable data directory — confirm the intended location.
if __name__ == "__main__":
    sample_and_rename_key(
        input_file="/home/syji/code/cot/train/eval/omni-math-rule-main/evaluation/data/aime1983_2024/AIME_Dataset_1983_2024.jsonl",
        output_file="AIME_Dataset_sampled_600.jsonl",
    )


✅ 已保存 600 条数据到 AIME_Dataset_sampled_600.jsonl


In [None]:
import json
import random

def load_and_tag_jsonl(file_path, source_name):
    """Load a JSON Lines file and tag every record with its origin.

    Args:
        file_path: Path of the UTF-8 JSONL file to read.
        source_name: Value stored under the ``"source"`` key of each record.
            It replaces any ``"source"`` key already present in a record.

    Returns:
        A list of dicts, one per non-blank line, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return [
            {**json.loads(line), "source": source_name}
            for line in f
            # Skip blank / whitespace-only lines rather than failing in json.loads.
            if line.strip()
        ]

def merge_and_shuffle(file1, source1, file2, source2, output_file):
    """Combine two JSONL files into one randomly ordered JSONL file.

    Both inputs are read through ``load_and_tag_jsonl`` with their respective
    source label, concatenated, shuffled with the module-level ``random``
    state, and written to ``output_file`` one JSON object per line.

    Args:
        file1: Path of the first JSONL input.
        source1: Source label passed along with ``file1``.
        file2: Path of the second JSONL input.
        source2: Source label passed along with ``file2``.
        output_file: Path of the JSONL file to create or overwrite.
    """
    # Read both inputs, labelling each with its source name.
    first_records = load_and_tag_jsonl(file1, source1)
    second_records = load_and_tag_jsonl(file2, source2)

    # Concatenate into a fresh list, then shuffle it in place.
    merged = [*first_records, *second_records]
    random.shuffle(merged)

    # Serialise one record per line.
    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + '\n'
            for record in merged
        )

    print(f"✅ 合并完成，共 {len(merged)} 条记录，已保存到 {output_file}")

# Example usage: merge the GSM8K and AIME samples into one shuffled file.
if __name__ == "__main__":
    merge_and_shuffle(
        file1="gsm8k_train_sampled_600.jsonl",
        source1="gsm8k",
        file2="AIME_Dataset_sampled_600.jsonl",
        source2="aime",
        output_file="merged_gsm8k_aime.jsonl",
    )


✅ 合并完成，共 400 条记录，已保存到 merged_gsm8k_aime.jsonl
