Import modules.


In [None]:
import os
import re

import h5py
import numpy as np
import pandas as pd

import chiron


Detect and sort `.mat` files.


In [None]:
# Locate the dataset and collect every numbered ``<N>.mat`` file beneath it.
root_dir = os.path.join("..", "data", "brain-tumor-public-dataset")
mat_dir = os.path.join(root_dir, "mat")

# The dot is escaped and the whole filename must match, so names such as
# "12xmat" or "12.mat.bak" are rejected.
mat_pattern = re.compile(r"([0-9]+)\.mat")

data = []
for dirpath, _, filenames in os.walk(mat_dir):
    for filename in filenames:
        match = mat_pattern.fullmatch(filename)
        if match:
            data.append(
                {
                    "image_index": int(match.group(1)),
                    "path": os.path.join(dirpath, filename),
                }
            )

# Naming the columns explicitly keeps the sort below from raising KeyError
# when no file was found (os.walk yields nothing for a missing directory).
df = pd.DataFrame(data, columns=["image_index", "path"])
# Sort numerically by image index; the walk order is arbitrary.
df.sort_values("image_index", inplace=True)
df


Load the cross-validation fold index of each image.


In [None]:
# Read the first row of the "cvind" dataset and attach it as a new column.
# The array is assigned positionally, so it lines up with df's current row
# order (sorted by image_index) -- presumably cvind is stored in that same
# order; confirm against the dataset's README.
cvind_path = os.path.join(mat_dir, "cvind.mat")
with h5py.File(cvind_path, "r") as f:
    fold_indices = f["cvind"][0]
    df["fold_index"] = fold_indices.astype(int)
df


Make output directories.


In [None]:
# Output layout: <root_dir>/tfrecord/train and <root_dir>/tfrecord/val.
tfrecord_dir = os.path.join(root_dir, "tfrecord")
train_dir, val_dir = (
    os.path.join(tfrecord_dir, split) for split in ("train", "val")
)


def mkdir(path):
    """Create directory ``path`` and any missing parents.

    Does nothing if the directory already exists. ``exist_ok=True`` replaces
    the former check-then-create sequence, which could fail if another
    process created the directory between the check and the call.

    Raises:
        FileExistsError: if ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)


# Ensure both output directories exist before any TFRecord is written.
for out_dir in (train_dir, val_dir):
    mkdir(out_dir)


Save TFRecord files.


In [None]:
# Maps the numeric label stored in each .mat file to a tumor class name.
tumor_id_map = {1: "meningioma", 2: "glioma", 3: "pituitary"}


def generate(df):
    """Yield an ``(image, label)`` pair for every file listed in ``df.path``.

    Each file is opened with h5py and its ``cjdata`` group is read:

    * ``image`` is loaded in full, given a trailing axis of length 1 and
      cast to ``float32``.
    * ``label`` is taken from element ``[0, 0]`` of the ``label`` dataset,
      converted to ``int`` and translated through ``tumor_id_map``.

    The file is closed before the pair is yielded.

    Raises:
        KeyError: if a stored label id is not a key of ``tumor_id_map``.
    """
    for mat_path in df.path:
        with h5py.File(mat_path, "r") as mat_file:
            cjdata = mat_file["cjdata"]
            # ``[()]`` reads the whole dataset into memory as a NumPy array,
            # so it stays usable after the file is closed.
            pixels = cjdata["image"][()]
            label_id = int(cjdata["label"][0, 0])
        image = pixels[:, :, np.newaxis].astype("float32")
        label = tumor_id_map[label_id]
        yield image, label


# For each fold, write one file to train_dir with every row outside the fold
# and one file of the same name to val_dir with the rows inside it.
# generate() is a generator, so the .mat files are only read as each
# generator is consumed -- presumably while chiron.save_tfrecord iterates it.
for fold_index in df.fold_index.unique():
    filename = f"fold-{fold_index}.tfrecord"
    cond = df.fold_index == fold_index
    df_val, df_train = df[cond], df[~cond]
    gen_train, gen_val = generate(df_train), generate(df_val)
    for out_dir, gen in ((train_dir, gen_train), (val_dir, gen_val)):
        chiron.save_tfrecord(os.path.join(out_dir, filename), gen)
