In [1]:
import os

# Set before the remaining imports so that any library reading these variables
# at import time sees them. TF_CPP_MIN_LOG_LEVEL=3 is TensorFlow's setting for
# suppressing its C++ log output, and an empty CUDA_VISIBLE_DEVICES hides all
# GPUs from CUDA. NOTE(review): presumably this targets TensorFlow pulled in
# through `chiron` -- confirm.
os.environ.update({"TF_CPP_MIN_LOG_LEVEL": "3", "CUDA_VISIBLE_DEVICES": ""})

import re

import h5py
import numpy as np
import pandas as pd

import chiron


In [2]:
data_dir = os.path.join("..", "data", "cheng-et-al")


def find_mat_files(root):
    """Collect the numbered ``<digits>.mat`` files found anywhere under ``root``.

    Args:
        root: Directory that is walked recursively.

    Returns:
        A list of ``{"image_index": int, "path": str}`` dicts, one per file
        whose name is exactly a run of ASCII digits followed by ``.mat``.
        The list is in ``os.walk`` order (unsorted) and is empty when nothing
        matches or ``root`` does not exist.
    """
    records = []
    for dirpath, _, filenames in os.walk(root):
        for filename in filenames:
            # fullmatch with an escaped dot: the whole name must be
            # "<digits>.mat". A bare "." would match any character and an
            # unanchored end would accept names such as "12.mat.bak".
            match = re.fullmatch(r"([0-9]+)\.mat", filename)
            if match:
                records.append(
                    {
                        "image_index": int(match.group(1)),
                        "path": os.path.join(dirpath, filename),
                    }
                )
    return records


data = find_mat_files(data_dir)
# Explicit columns keep the frame well-formed (and sortable) even when no file
# was found. sort_values returns a new frame that keeps the original row
# labels, so rows end up ordered by image_index.
df = pd.DataFrame(data, columns=["image_index", "path"]).sort_values(
    "image_index"
)
df


Unnamed: 0,image_index,path
1351,1,../data/cheng-et-al/brainTumorDataPublic_1-766...
883,2,../data/cheng-et-al/brainTumorDataPublic_1-766...
1249,3,../data/cheng-et-al/brainTumorDataPublic_1-766...
831,4,../data/cheng-et-al/brainTumorDataPublic_1-766...
1155,5,../data/cheng-et-al/brainTumorDataPublic_1-766...
...,...,...
2757,3060,../data/cheng-et-al/brainTumorDataPublic_2299-...
2485,3061,../data/cheng-et-al/brainTumorDataPublic_2299-...
2486,3062,../data/cheng-et-al/brainTumorDataPublic_2299-...
2801,3063,../data/cheng-et-al/brainTumorDataPublic_2299-...


In [3]:
# Read the first row of the "cvind" dataset and store it as the fold id of
# each row. The values are assigned by position, so this relies on `df`
# already being in the intended row order.
cvind_path = os.path.join(data_dir, "cvind.mat")
with h5py.File(cvind_path, "r") as cvind_file:
    fold_indices = cvind_file["cvind"][0].astype(int)
df["fold_index"] = fold_indices
df


Unnamed: 0,image_index,path,fold_index
1351,1,../data/cheng-et-al/brainTumorDataPublic_1-766...,5
883,2,../data/cheng-et-al/brainTumorDataPublic_1-766...,5
1249,3,../data/cheng-et-al/brainTumorDataPublic_1-766...,5
831,4,../data/cheng-et-al/brainTumorDataPublic_1-766...,5
1155,5,../data/cheng-et-al/brainTumorDataPublic_1-766...,5
...,...,...,...
2757,3060,../data/cheng-et-al/brainTumorDataPublic_2299-...,4
2485,3061,../data/cheng-et-al/brainTumorDataPublic_2299-...,4
2486,3062,../data/cheng-et-al/brainTumorDataPublic_2299-...,4
2801,3063,../data/cheng-et-al/brainTumorDataPublic_2299-...,2


In [4]:
# Numeric label stored in each file -> tumor class name.
tumor_id_map = {
    1: "meningioma",
    2: "glioma",
    3: "pituitary",
}


def generate(df):
    """Lazily yield one ``(image, label)`` pair for every path in ``df.path``.

    Each path is opened read-only with h5py and its ``cjdata`` group is read:
    ``image`` is loaded fully into memory, given a trailing axis of length 1
    and cast to ``float32``; ``label[0, 0]`` is converted to ``int`` and mapped
    to a class name through ``tumor_id_map``.

    Args:
        df: Object exposing an iterable ``path`` attribute of file paths
            (a DataFrame with a ``path`` column in this notebook).

    Yields:
        Tuples ``(image, label)`` where ``image`` is a ``float32`` array and
        ``label`` is one of the values of ``tumor_id_map``.

    Raises:
        KeyError: If a file stores a label id missing from ``tumor_id_map``.
    """
    for mat_path in df.path:
        with h5py.File(mat_path, "r") as mat_file:
            record = mat_file["cjdata"]
            # [()] copies the dataset into memory, so the array remains
            # usable after the file has been closed.
            pixels = record["image"][()]
            label_id = int(record["label"][0, 0])
        image = pixels[:, :, np.newaxis].astype(np.float32)
        yield image, tumor_id_map[label_id]


# Export one train/validation TFRecord pair per cross-validation fold: rows
# whose fold_index equals the current fold form the validation split, all
# remaining rows form the training split.
for fold_index in df.fold_index.unique():
    cond = df.fold_index == fold_index
    train_df = df[~cond]
    val_df = df[cond]
    output_dir = os.path.join(
        "..", "tfrecord", "cheng-et-al", f"fold-{fold_index}"
    )
    # exist_ok=True replaces the check-then-create pair: it is a no-op when
    # the directory already exists and has no window between check and create.
    os.makedirs(output_dir, exist_ok=True)
    for split_name, split_df in (("train", train_df), ("val", val_df)):
        # generate() is lazy; the files are read as save_tfrecord consumes
        # the generator.
        chiron.tfrecord.save_tfrecord(
            os.path.join(output_dir, f"{split_name}.tfrecord"),
            generate(split_df),
        )
