In [1]:
import os
import sys

sys.path.append(os.path.join(os.getcwd(), "data"))
from data.loader import DataLoader, get_paths

from pathlib import Path

from datasets import Dataset, DatasetDict, DatasetInfo, Features, Array3D, Array4D, Sequence, Value

In [2]:
# Every location is derived from the directory the notebook is executed in.
primary_directory = Path().resolve()
provided_data_dir = primary_directory.joinpath("data")

# One sub-folder of the provided data per split.
training_data_dir, validation_data_dir, testing_data_dir = (
    provided_data_dir.joinpath(folder)
    for folder in ("train-pats", "validation-pats", "test-pats")
)

In [3]:
def gen(path: Path):
    """Yield one dataset record per patient found under ``path``.

    The plan paths returned by ``get_paths(path)`` are fed to a ``DataLoader``
    switched to ``"training_model"`` mode, and every batch it produces is
    unpacked into per-patient records.

    Earlier revisions emitted a single record per batch (``patient_list[-1]``
    plus a bare ``.squeeze()`` on the whole batch). That only works when each
    batch holds exactly one patient: with larger batches all but the last
    patient id were dropped and the arrays kept a leading batch axis.

    NOTE(review): this assumes every array attribute on the batch is indexed
    by patient along its first axis, in the same order as
    ``batch.patient_list`` -- TODO confirm against ``data.loader``.

    :param path: directory containing the patient plan folders of one split.
    :return: generator of dicts with the keys ``ct``, ``dose``,
        ``voxel_dimensions``, ``patient``, ``possible_dose_mask``,
        ``structure_masks`` and ``structure_mask_names``.
    """
    # obtain the paths to the plan files
    plan_paths = get_paths(path)
    dataloader = DataLoader(plan_paths)

    # ensure that the dataloader is in the correct mode
    dataloader.set_mode("training_model")

    for batch in dataloader.get_batches():
        # Slice the batch axis explicitly so the remaining squeeze() only
        # removes singleton axes that belong to the individual sample.
        for index, patient in enumerate(batch.patient_list):
            yield {
                "ct": batch.ct[index].squeeze(),
                "dose": batch.dose[index].squeeze(),
                "voxel_dimensions": batch.voxel_dimensions[index].squeeze(),
                "patient": patient,
                "possible_dose_mask": batch.possible_dose_mask[index].squeeze(),
                "structure_masks": batch.structure_masks[index].squeeze(),
                # the mask names are shared by all patients of the batch
                "structure_mask_names": batch.structure_mask_names,
            }


# Schema of the records yielded by `gen`: one entry per key of the yielded dict.
_volume_shape = (128, 128, 128)
_mask_count = 10

features = Features(
    {
        "ct": Array3D(dtype="float32", shape=_volume_shape),
        "dose": Array3D(dtype="float32", shape=_volume_shape),
        "voxel_dimensions": Sequence(feature=Value(dtype="float32"), length=3),
        "patient": Value(dtype="string"),
        "possible_dose_mask": Array3D(dtype="float32", shape=_volume_shape),
        # the volume shape with one trailing channel per structure mask
        "structure_masks": Array4D(dtype="float32", shape=(*_volume_shape, _mask_count)),
        "structure_mask_names": Sequence(feature=Value(dtype="string"), length=_mask_count),
    }
)

In [6]:
# Arguments shared by every split. The writer batch size is lowered from the
# default of 1000 because, with ct and dose arrays this large, the default
# makes the arrow writer crash.
gen_config = dict(
    generator=gen,
    features=features,
    writer_batch_size=50,
)

# Build the splits in the order train -> validation -> test, each from its own
# directory of plans.
dataset = DatasetDict(
    {
        split: Dataset.from_generator(
            **gen_config,
            gen_kwargs={"path": split_dir},
        )
        for split, split_dir in (
            ("train", training_data_dir),
            ("validation", validation_data_dir),
            ("test", testing_data_dir),
        )
    },
)

Generating train split: 0 examples [00:00, ? examples/s]

DatasetGenerationError: An error occurred while generating the dataset

In [None]:
# Print the summary of the training split as a quick sanity check.
train_split = dataset["train"]
print(train_split)

In [None]:
# Metadata describing the mirrored dataset and where it originates from.
# NOTE(review): no later cell visible here references `info` -- confirm it is
# actually attached to the dataset before it is pushed.
info = DatasetInfo(
    license="MIT",
    homepage="https://github.com/ababier/open-kbp",
    description="🤗 mirror of the Open Knowledge-Based Planning (OpenKBP) dataset.",
)

In [None]:
# Upload all splits to the Hub with an explicit shard count per split.
dataset.push_to_hub(
    num_shards=dict(train=20, validation=4, test=10),
    commit_message="clean up shards",
    repo_id="oxkitsune/open-kbp",
)