In [27]:
"""Data preprocess for GPQA
Data Link: https://huggingface.co/datasets/Idavidrein/gpqa
"""
import csv
import json
import random
from tqdm import tqdm

# Paths to data
split = 'diamond'  # diamond, main, extended
data_path = f'./GPQA/original_data/gpqa_{split}.csv'
output_path = f'./GPQA/{split}.json'

# Fixed seed so the A-D ordering (and therefore 'Correct Choice') is identical
# on every run. A private Random instance is used so the global `random`
# state seen by other cells is left untouched.
seed = 42
rng = random.Random(seed)

# Define the keys we want to keep
keys_to_keep = [
    'id',
    'Question',
    'Subdomain',
    'High-level domain',
    'Correct Answer',
    'Incorrect Answer 1',
    'Incorrect Answer 2',
    'Incorrect Answer 3'
]

# CSV columns that hold the four answer options of each question
answer_columns = [
    'Correct Answer',
    'Incorrect Answer 1',
    'Incorrect Answer 2',
    'Incorrect Answer 3'
]
# Letters assigned to the shuffled options, in order
choices = ['A', 'B', 'C', 'D']

filtered_data = []
# newline='' is what the csv documentation asks for when handing a file to a
# reader: without it, line breaks embedded in quoted fields may be mis-parsed.
with open(data_path, mode='r', encoding='utf-8', newline='') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    for idx, row in enumerate(tqdm(csv_reader)):
        # Add a 0-based id field, then keep only the desired keys
        row['id'] = idx
        filtered_row = {key: row[key] for key in keys_to_keep}

        # Pair each option with its source column so the correct one can
        # still be identified after shuffling
        answers = [(column, filtered_row[column]) for column in answer_columns]
        rng.shuffle(answers)

        # Assign A, B, C, D in shuffled order and remember which letter
        # landed on the correct answer
        formatted_answers = []
        correct_choice = None
        for choice, (label, answer) in zip(choices, answers):
            formatted_answers.append((choice, answer))
            if label == 'Correct Answer':
                correct_choice = choice

        # Append the lettered options to the question text
        formatted_choices = "\n".join(f"({choice}) {answer}" for choice, answer in formatted_answers)
        filtered_row['Question'] = f"{filtered_row['Question']} Choices:\n{formatted_choices}\n"

        # Record the letter of the correct option
        filtered_row['Correct Choice'] = correct_choice

        filtered_data.append(filtered_row)

# Write the updated data to JSON (UTF-8, non-ASCII characters left unescaped)
with open(output_path, mode='w', encoding='utf-8') as json_file:
    json.dump(filtered_data, json_file, indent=4, ensure_ascii=False)


0it [00:00, ?it/s]

198it [00:00, 3792.39it/s]


In [25]:
"""Data preprocess for MATH500
Data Link: https://huggingface.co/datasets/HuggingFaceH4/MATH-500
"""
import csv
import json
from tqdm import tqdm

test_path = './MATH500/original_data/test.jsonl'
output_path = './MATH500/test.json'

# Convert the JSONL input (one JSON object per line) into a list of records
# with a sequential 0-based 'id' and the problem text under 'Question'.
data_list = []
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding (the output below is written as UTF-8 as well).
with open(test_path, 'r', encoding='utf-8') as file:
    # Iterate the file lazily instead of materialising every line first
    for raw_line in file:
        # Tolerate blank lines, e.g. a trailing empty line at the end of the file
        if not raw_line.strip():
            continue
        record = json.loads(raw_line)
        data_list.append({
            # Sequential id; avoids a loop variable named `id`, which would
            # shadow the builtin for the rest of the kernel session
            'id': len(data_list),
            'Question': record['problem'],
            'solution': record['solution'],
            'answer': record['answer'],
            'subject': record['subject'],
            'level': record['level'],
            'unique_id': record['unique_id'],
        })

# Write the updated data to JSON
with open(output_path, mode='w', encoding='utf-8') as json_file:
    json.dump(data_list, json_file, indent=4, ensure_ascii=False)


FileNotFoundError: [Errno 2] No such file or directory: './MATH500/original_data/test.jsonl'

In [None]:
"""Convert Parquet to JSON (AIME)
Data Link: https://huggingface.co/datasets/AI-MO/aimo-validation-aime
"""
import pandas as pd

# Input Parquet shard and the JSON file that will receive the filtered rows
parquet_file = "./AIME/original_data/train-00000-of-00001.parquet"
json_file = "./AIME/original_data/aime_2024.json"

# Load the whole Parquet file into a DataFrame
df = pd.read_parquet(parquet_file)

# Select the rows whose 'url' contains '2024_AIME'; rows with a missing url
# count as non-matching (na=False)
is_aime_2024 = df['url'].str.contains('2024_AIME', na=False)
filtered_df = df[is_aime_2024]

# Preview the first rows of the selection as a sanity check
print(filtered_df.head())

# Dump the selection as an indented JSON list of records, leaving non-ASCII
# characters unescaped
filtered_df.to_json(path_or_buf=json_file, orient='records', force_ascii=False, indent=4)

print(f"Filtered data has been saved to {json_file}")

In [None]:
"""Data preprocess for AIME
"""
import csv
import json
from tqdm import tqdm

test_path = './AIME/original_data/aime_2024.json'
output_path = './AIME/test.json'

# The input is the JSON array produced by the conversion cell, which writes it
# with force_ascii=False. Decode it explicitly as UTF-8 instead of relying on
# the platform's default locale encoding.
with open(test_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Re-shape every problem into the common schema with a sequential 0-based id.
# The file is already closed here; only the parsed data is needed.
data_list = []
for idx, item in enumerate(tqdm(data)):
    data_list.append({
        'id': idx,
        'Question': item['problem'],
        'Solution': item['solution'],
        # int() followed by str() canonicalises the answer to a plain integer
        # string (for instance "033" or 33.0 both become "33")
        'answer': str(int(item['answer'])),
    })

# Write the updated data to JSON
with open(output_path, mode='w', encoding='utf-8') as json_file:
    json.dump(data_list, json_file, indent=4, ensure_ascii=False)


In [None]:
"""Convert Parquet to JSON (AMC)
Data Link: https://huggingface.co/datasets/AI-MO/aimo-validation-amc
"""
import pandas as pd

# Input Parquet shard and the JSON file it is converted to
parquet_file = "./AMC/original_data/train-00000-of-00001.parquet"
json_file = "./AMC/original_data/amc_2022_2023.json"

# Read the Parquet file with pandas
df = pd.read_parquet(parquet_file)

# Print the first few rows of the DataFrame to inspect the data
print(df.head())

# Export every row as an indented JSON list of records, leaving non-ASCII
# characters unescaped
df.to_json(path_or_buf=json_file, orient='records', force_ascii=False, indent=4)

In [None]:
"""Data preprocess for AMC
"""
import csv
import json
from tqdm import tqdm

test_path = './AMC/original_data/amc_2022_2023.json'
output_path = './AMC/test.json'

# The input is the JSON array produced by the conversion cell, which writes it
# with force_ascii=False. Decode it explicitly as UTF-8 instead of relying on
# the platform's default locale encoding.
with open(test_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Keep only the problems whose URL mentions 2023 and number them 0, 1, 2, ...
data_list = []
for item in tqdm(data):
    if '2023' not in item['url']:
        continue
    data_list.append({
        # Ids count kept rows only, so they stay contiguous after filtering.
        # Using len(data_list) avoids a counter named `id`, which would shadow
        # the builtin for the rest of the kernel session.
        'id': len(data_list),
        'Question': item['problem'],
        # int() followed by str() canonicalises the answer to a plain integer
        # string (for instance 142.0 becomes "142")
        'answer': str(int(item['answer'])),
        'url': item['url'],
    })

# Write the updated data to JSON
with open(output_path, mode='w', encoding='utf-8') as json_file:
    json.dump(data_list, json_file, indent=4, ensure_ascii=False)


In [None]:
"""Data preprocess for LiveCodeBench
Data Link: https://huggingface.co/datasets/livecodebench/code_generation_lite
"""
import json
from tqdm import tqdm
from datetime import datetime

def is_valid_date(date_str):
    """
    Check if the given date string is within the range from August 1, 2024, to November 30, 2024 (both days inclusive).

    Args:
        date_str (str): The date string in the format "%Y-%m-%dT%H:%M:%S".

    Returns:
        bool: True if the timestamp falls on any day from August 1, 2024
            through November 30, 2024, at any time of day. False if it is
            outside that range, or if ``date_str`` is not a string in the
            expected format.
    """
    try:
        # Parse the date string into a datetime object
        parsed = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S")
    except (ValueError, TypeError):
        # ValueError: wrong format; TypeError: not a string at all (e.g. None).
        # Either way the value cannot be a valid contest date.
        return False

    # Half-open interval [Aug 1 00:00, Dec 1 00:00). Comparing against
    # datetime(2024, 11, 30) would stop at midnight at the START of Nov 30 and
    # wrongly reject any later time on that last day.
    range_start = datetime(2024, 8, 1)
    range_end_exclusive = datetime(2024, 12, 1)

    return range_start <= parsed < range_end_exclusive

# Input JSONL files, processed in this order
test_paths = [
    './LiveCodeBench/test.jsonl',
    './LiveCodeBench/test2.jsonl',
    './LiveCodeBench/test3.jsonl',
    './LiveCodeBench/test4.jsonl'
]

# Destination of the merged JSON output
output_path = './LiveCodeBench/test.json'

data_list = []          # accepted problems, in the order they were encountered
seen_questions = set()  # 'question_content' values already accepted (for de-duplication)
current_id = 0          # next id to hand out; keeps counting across all files

for test_path in test_paths:
    try:
        with open(test_path, 'r', encoding='utf-8') as file:
            # Stream the file line by line with a progress bar
            for line in tqdm(file, desc=f'Processing {test_path}'):
                # Lines that are not valid JSON are skipped silently
                try:
                    line_data = json.loads(line)
                except json.JSONDecodeError:
                    continue

                # Require a non-empty 'contest_date' that is_valid_date accepts
                contest_date = line_data.get('contest_date')
                if not (contest_date and is_valid_date(contest_date)):
                    continue

                # Require non-empty question text that has not been accepted before
                question_content = line_data.get('question_content')
                if not question_content or question_content in seen_questions:
                    continue
                seen_questions.add(question_content)

                # Build the output record; optional fields fall back to defaults
                record = {
                    'id': current_id,
                    'Question': question_content,
                    'question_title': line_data.get('question_title', ''),
                    'contest_date': contest_date,
                    'difficulty': line_data.get('difficulty', ''),
                    'public_test_cases': line_data.get('public_test_cases', [])
                }
                data_list.append(record)
                current_id += 1

    except FileNotFoundError:
        # A missing input file is reported and the remaining files are still processed
        print(f"File not found: {test_path}")
    except Exception as e:
        # Any other failure is reported; records collected so far are kept
        print(f"An error occurred while processing {test_path}: {e}")

# Write the aggregated and deduplicated data to the output JSON file
try:
    with open(output_path, mode='w', encoding='utf-8') as json_file:
        json.dump(data_list, json_file, indent=4, ensure_ascii=False)
    print(f"Data successfully written to {output_path}")
except Exception as e:
    print(f"Failed to write data to {output_path}: {e}")


In [None]:
"""Data preprocess for FlashRAG ODQA datasets
Data Link: https://huggingface.co/datasets/RUC-NLPIR/FlashRAG_datasets
"""
import csv
import json
from tqdm import tqdm

dataset_name = 'bamboogle'
split = 'test'
data_num = 500  # maximum number of examples to keep

test_path = f'./FlashRAG_datasets/{dataset_name}/{split}.jsonl'
output_path = f'./QA_Datasets/{dataset_name}.json'

# Convert at most `data_num` lines of the JSONL input into records with a
# sequential 0-based 'id', the question text and its list of gold answers.
data_list = []
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding (the output below is written as UTF-8 as well).
with open(test_path, 'r', encoding='utf-8') as file:
    # Stream the file instead of loading every line up front: only the first
    # `data_num` examples are needed.
    for raw_line in tqdm(file):
        # Check the cap before appending so that data_num <= 0 yields an
        # empty list rather than one stray record
        if len(data_list) >= data_num:
            break
        # Tolerate blank lines, e.g. a trailing empty line at the end of the file
        if not raw_line.strip():
            continue
        record = json.loads(raw_line)
        data_list.append({
            # Sequential id; avoids a loop variable named `id`, which would
            # shadow the builtin for the rest of the kernel session
            'id': len(data_list),
            'Question': record['question'],
            'answer': record["golden_answers"],
        })

# Write the updated data to JSON
with open(output_path, mode='w', encoding='utf-8') as json_file:
    json.dump(data_list, json_file, indent=4, ensure_ascii=False)
