In [None]:
import io
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import datasets
from datasets import Dataset, DatasetDict, load_dataset
from more_itertools import chunked
from PIL import Image
from tqdm.auto import tqdm
import json

# Setting the limit to None turns off Pillow's decompression-bomb size check,
# so arbitrarily large images can be opened. Only do this for trusted data.
Image.MAX_IMAGE_PIXELS = None


def load_json(file_path: Path, is_jsonl: bool = False):
    """Load JSON or JSON Lines content from a UTF-8 text file.

    Args:
        file_path: Path of the file to read.
        is_jsonl: When True, parse the file as JSON Lines (one JSON document
            per line) and return a list with one entry per non-blank line.
            When False, parse the whole file as a single JSON document.

    Returns:
        A list of parsed records in JSONL mode, otherwise whatever object the
        JSON document decodes to.

    Raises:
        json.JSONDecodeError: If the file (or any non-blank line) is not valid JSON.
        OSError: If the file cannot be opened.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        if is_jsonl:
            # Whitespace-only lines (e.g. a trailing empty line) carry no
            # record; feeding them to json.loads would raise.
            return [json.loads(line) for line in f if line.strip()]
        return json.load(f)


def write_json(data: list, file_path: Path, is_jsonl: bool = False, **kwargs):
    """Serialise ``data`` to a UTF-8 text file as JSON or JSON Lines.

    Args:
        data: In JSONL mode, an iterable of JSON-serialisable records (one per
            output line). In JSON mode, any JSON-serialisable object (callers
            in this notebook also pass dicts).
        file_path: Destination path; an existing file is overwritten.
        is_jsonl: Write one compact JSON document per line when True,
            otherwise a single JSON document.
        **kwargs: Extra options for the ``json`` encoder (e.g. ``indent``,
            ``sort_keys``, ``default``). ``ensure_ascii`` defaults to False in
            JSONL mode and True in JSON mode, and may be overridden here.
            ``indent`` is ignored in JSONL mode.

    Raises:
        TypeError: If ``data`` contains objects the encoder cannot serialise.
        OSError: If the file cannot be opened for writing.
    """
    with open(file_path, "w", encoding="utf-8") as f:
        if is_jsonl:
            line_kwargs = {"ensure_ascii": False, **kwargs}
            # An indent would spread a record over several lines and break
            # the one-record-per-line format.
            line_kwargs["indent"] = None
            for item in data:
                f.write(json.dumps(item, **line_kwargs) + "\n")
        else:
            # Merge the default first so a caller-supplied ensure_ascii
            # overrides it instead of colliding as a duplicate keyword.
            json.dump(data, f, **{"ensure_ascii": True, **kwargs})

In [None]:
# Pull the benchmark from the Hugging Face Hub. If a copy was previously stored
# with `save_to_disk`, `datasets.load_from_disk(<local dir>)` can be used instead.
ds = load_dataset(path="initiacms/XLRS-Bench-lite")
ds

In [None]:
import gc

# Materialise the train split as a list of plain Python dicts.
data = ds["train"].to_list()

# Gather the encoded payload of every image reference, then deduplicate:
# the same image may be attached to several samples.
all_bytes = [img["bytes"] for it in data for img in it["image"]]
set_all_bytes = set(all_bytes)

# Capture the counts before the list is dropped so they can be shown below
# (a bare expression in the middle of a cell is evaluated but never displayed).
image_counts = (len(all_bytes), len(set_all_bytes))

del all_bytes
gc.collect()

# Cell output: (total image references, unique images)
image_counts

In [None]:
img_base = Path("/path/to/save/XLRS-imgs")
# Saving into a missing directory fails, so create the output folder up front.
img_base.mkdir(parents=True, exist_ok=True)
bytes2idx = dict()


def func(bytes, path):
    """Decode an in-memory encoded image and write it to ``path``.

    Args:
        bytes: Encoded image file content. The parameter name is kept for
            backward compatibility; it shadows the builtin inside this function.
        path: Destination file path; the output format follows its extension.
    """
    # The context manager releases the decoder's resources once the file is written.
    with Image.open(io.BytesIO(bytes)) as img:
        img.save(path)


# `bytes` objects are hashed with a per-process random salt, so the iteration
# order of a plain set changes between kernel restarts. Together with the
# "skip if the file already exists" shortcut below, that would silently pair
# indices with the wrong image files on a re-run. Sorting makes the
# image -> index assignment reproducible for the same dataset.
# NOTE: a directory filled by an earlier, unsorted run should be emptied first.
with ThreadPoolExecutor(max_workers=16) as executor:
    futs = []
    for idx, img_bytes in tqdm(enumerate(sorted(set_all_bytes)), total=len(set_all_bytes)):
        bytes2idx[img_bytes] = idx
        # using jpg or png according to your needs
        # which may affect evaluation results, but jpg is smaller in size
        path = img_base / f"{idx}.jpg"
        if path.exists():
            continue
        futs.append(executor.submit(func, img_bytes, str(path)))
        # Drain in batches of 128 so the number of pending tasks stays bounded;
        # .result() re-raises any exception from a worker.
        if len(futs) >= 128:
            for fut in as_completed(futs):
                fut.result()
            futs = []
    # Wait for the final partial batch.
    for fut in as_completed(futs):
        fut.result()

In [None]:
from copy import deepcopy

# Build a copy of every record in which each embedded image payload is
# replaced by the path of the file it was saved to.
new_data = []
for record in data:
    converted = deepcopy(record)
    converted["image"] = [
        str(img_base / f"{bytes2idx[payload['bytes']]}.jpg")
        for payload in converted["image"]
    ]
    new_data.append(converted)

In [None]:
# Re-wrap the path-based records as a dataset with a single "train" split and
# export that split to disk.
# NOTE(review): per the `datasets` docs, `to_json` emits JSON Lines
# (one record per line) unless `lines=False` is passed.
train_split = Dataset.from_list(new_data)
ds = DatasetDict({"train": train_split})
ds["train"].to_json("XLRS-Bench.json")

In [None]:
# `Dataset.to_json` writes JSON Lines by default, so the export has to be read
# record by record; parsing it as one JSON document fails with "Extra data"
# as soon as the file holds more than one row.
data = load_json(Path("XLRS-Bench.json"), is_jsonl=True)

target_dir = Path("/path/to/converted_xlrs_data/")
# Make sure the output folder exists before anything is written into it.
target_dir.mkdir(parents=True, exist_ok=True)

categories = {it["category"] for it in data}
# Sorted keys keep categories.json identical from run to run.
write_json({k: [] for k in sorted(categories)}, target_dir / "categories.json")
target_dir

In [None]:
# Write one annotation JSON file per sample into `target_dir`.
for sample in tqdm(data):
    # "<category>/<index>", flattened into a file-name-safe identifier.
    raw_id = f"{sample['category']}/{sample['index']}"
    unique_id = raw_id.replace("/", "__").replace(" ", "_")

    # The "l2-category" (subcategory) field was dropped and is not exported.
    annotation = dict(
        [
            ("question", sample["question"]),
            ("options", sample["multi-choice options"]),
            ("answer", sample["answer"]),
            ("category", sample["category"]),
            ("original_path", sample["path"]),
            ("original_index", sample["index"]),
            ("unique_id", unique_id),
        ]
    )

    # Keep only the last two components of each image path
    # (containing folder + file name).
    rel_image_paths = []
    for full_path in sample["image"]:
        rel_image_paths.append("/".join(full_path.split("/")[-2:]))

    if len(rel_image_paths) != 1:
        # Multiple images
        annotation.update({"image_paths": rel_image_paths, "is_multi_image": True})
    else:
        # Single image
        annotation.update({"image_path": rel_image_paths[0]})

    write_json(annotation, target_dir / f"{unique_id}.json")