# 分野の割合を確認して、取得件数を検討

In [None]:
# Load the HLE test split and keep only rows whose image field is an empty string.
dataset = load_dataset("cais/hle", split="test").filter(
    lambda item: item['image'] == ""
)

In [None]:
# Convert the dataset to a pandas DataFrame so the category column can be analysed.
df = dataset.to_pandas()

In [None]:
# Share of questions per category: per-category row counts divided by the
# total number of rows. Values are fractions in [0, 1], not percentages.
category_percentage = df["category"].value_counts() / len(df)
category_percentage

category
Math                         0.452271
Computer Science/AI          0.103800
Biology/Medicine             0.102873
Physics                      0.093605
Humanities/Social Science    0.089435
Other                        0.081557
Chemistry                    0.046803
Engineering                  0.029657
Name: count, dtype: float64

In [None]:
# Scale each category share to a 500-question sample and round to whole counts.
# NOTE: each category is rounded independently, so the counts are not
# guaranteed to sum to exactly 500 — verify the total before using them.
selected_counts = (category_percentage * 500).round().astype(int).to_dict()

In [None]:
# Display the per-category sample counts.
selected_counts

{'Math': 226,
 'Computer Science/AI': 52,
 'Biology/Medicine': 51,
 'Physics': 47,
 'Humanities/Social Science': 45,
 'Other': 41,
 'Chemistry': 23,
 'Engineering': 15}

# ランダムでHLEと同じ分布で500問取得するコード

In [None]:
import random
from collections import defaultdict
from datasets import load_dataset, Dataset
import csv
from huggingface_hub import login
# NOTE(review): the token is an empty string here — presumably redacted. Supply a
# valid Hugging Face token (ideally not hard-coded in the notebook) before running.
login(token="")

In [None]:
def get_questions_by_category():
    """
    Load the HLE test split and group question IDs by category.

    Rows whose ``image`` field is not an empty string are dropped before
    grouping.

    Returns:
        A ``defaultdict(list)`` mapping each category name to the list of
        question IDs in that category, in dataset order.
    """
    text_only = load_dataset("cais/hle", split="test").filter(
        lambda item: item['image'] == ""
    )

    grouped = defaultdict(list)
    for row in text_only:
        bucket = grouped[row["category"]]
        bucket.append(row["id"])
    return grouped

def get_random_questions(questions_by_category, category_counts=None, seed=None):
    """
    Randomly pick question IDs per category according to a count table.

    Args:
        questions_by_category: Mapping of category name to a list of
            question IDs.
        category_counts: Optional mapping of category name to the number of
            questions to draw from it. Defaults to the built-in
            500-question table.
        seed: Optional seed. When given, a dedicated ``random.Random(seed)``
            is used so the selection is reproducible; when ``None`` the
            module-level ``random`` generator is used.

    Returns:
        A list of ``{"id": ..., "category": ...}`` dicts, grouped by
        category in the iteration order of ``category_counts``.

    Raises:
        ValueError: If a category holds fewer questions than requested.
    """
    if category_counts is None:
        category_counts = {
            'Math': 226,
            'Computer Science/AI': 52,
            'Biology/Medicine': 51,
            'Physics': 47,
            'Humanities/Social Science': 45,
            'Other': 41,
            'Chemistry': 23,
            'Engineering': 15,
        }

    # A private generator keeps seeded runs from disturbing global random state.
    rng = random if seed is None else random.Random(seed)
    expected_total = sum(category_counts.values())

    selected_ids = []

    for category, count in category_counts.items():
        if category not in questions_by_category:
            # Best effort: a missing category is reported and skipped.
            print(f"Warning: Category '{category}' not found in the dataset.")
            continue

        pool = questions_by_category[category]
        if count > len(pool):
            # random.sample would raise ValueError too, but without saying
            # which category ran short.
            raise ValueError(
                f"Category '{category}' has only {len(pool)} questions; "
                f"cannot sample {count}."
            )

        selected_ids.extend(
            {"id": q_id, "category": category}
            for q_id in rng.sample(pool, count)
        )

    if len(selected_ids) != expected_total:
        print(f"Error: The total number of selected questions is {len(selected_ids)}, not {expected_total}.")

    return selected_ids

def get_full_questions_by_ids(selected_ids):
    """
    Fetch the full dataset records for the selected question IDs.

    Args:
        selected_ids: Iterable of dicts, each carrying a question ID under
            the ``"id"`` key.

    Returns:
        A list of the records from the HLE test split whose ``id`` is among
        the selected IDs, in dataset order.
    """
    # Build the lookup set once, outside the scan, so it is not reconstructed
    # for every dataset row.
    wanted_ids = {q["id"] for q in selected_ids}

    dataset = load_dataset("cais/hle", split="test")
    return [item for item in dataset if item["id"] in wanted_ids]

def push_to_hf(data):
    """
    Upload the selected questions to the Hugging Face Hub.

    Builds a ``Dataset`` from ``data``, drops the image, image preview,
    author name and rationale image columns, and pushes the result to the
    ``suzakuteam/HLE500x1`` repository. When ``data`` is empty, only a
    message is printed and nothing is uploaded.

    Args:
        data: List of record dicts to upload.
    """
    if data:
        columns_to_drop = ["image", "image_preview", "author_name", "rationale_image"]
        trimmed = Dataset.from_list(data).remove_columns(columns_to_drop)
        trimmed.push_to_hub("suzakuteam/HLE500x1", commit_message="HLE500_random_select")
    else:
        print("データが空です。Hugging Face Hubにプッシュされません。")

if __name__ == "__main__":
    try:
        # Step 1: group question IDs by category.
        questions_by_category = get_questions_by_category()

        # Step 2: randomly select 500 questions following the category distribution.
        selected_question_ids = get_random_questions(questions_by_category)

        # Step 3: fetch the full records for the selected IDs.
        full_selected_questions = get_full_questions_by_ids(selected_question_ids)

        # Step 4: push the result to the Hugging Face Hub.
        push_to_hf(full_selected_questions)

    except Exception as e:
        # Any failure in the steps above is reported on stdout and not re-raised.
        print(f"An error occurred: {e}")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  81%|########1 |  528kB /  651kB            

README.md:   0%|          | 0.00/699 [00:00<?, ?B/s]