In [None]:
import os

# These are set before TensorFlow is imported below so they are already in the
# environment when the library initialises.
# TF_CPP_MIN_LOG_LEVEL=3 silences TensorFlow's native (C++) log output.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
# An empty CUDA_VISIBLE_DEVICES hides every GPU, so the notebook runs on CPU.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import tensorflow as tf

import chiron


In [None]:
def load_kaggle(filename):
    """Load one split of the Kaggle TFRecords, keeping only "notumor" examples.

    Args:
        filename: Name of the TFRecord file inside ``../tfrecord/kaggle``.

    Returns:
        The dataset produced by ``chiron.tfrecord.load_tfrecord``, filtered to
        elements whose label equals ``"notumor"`` and then mapped through
        ``chiron.preprocessing.MinMaxScaler()``.
    """
    record_path = os.path.join("..", "tfrecord", "kaggle", filename)
    records = chiron.tfrecord.load_tfrecord(record_path)

    # Elements are (image, label) pairs; only the label decides what is kept.
    no_tumor_only = records.filter(lambda _, label: label == "notumor")

    # NOTE(review): MinMaxScaler presumably rescales image intensities to a
    # fixed range - confirm against chiron.preprocessing.
    return no_tumor_only.map(chiron.preprocessing.MinMaxScaler())


# Build the Kaggle training and validation datasets (train first, then val).
kaggle_train, kaggle_val = (
    load_kaggle(split_file) for split_file in ("train.tfrecord", "val.tfrecord")
)


In [None]:
def load_cheng_et_al(filename):
    """Load one split of fold 1 of the Cheng et al. TFRecords.

    Args:
        filename: Name of the TFRecord file inside
            ``../tfrecord/cheng-et-al/fold-1``.

    Returns:
        The dataset produced by ``chiron.tfrecord.load_tfrecord`` after it has
        been mapped, in order, through ``MinMaxScaler()``,
        ``ConvertImageDtype(tf.uint8)`` and ``ConvertImageDtype(tf.float32)``.
    """
    record_path = os.path.join(
        "..", "tfrecord", "cheng-et-al", "fold-1", filename
    )
    dataset = chiron.tfrecord.load_tfrecord(record_path)

    # NOTE(review): the uint8 -> float32 round trip presumably quantises the
    # images to 8-bit levels so they match the Kaggle data - TODO confirm.
    preprocessing_steps = (
        chiron.preprocessing.MinMaxScaler(),
        chiron.preprocessing.ConvertImageDtype(tf.uint8),
        chiron.preprocessing.ConvertImageDtype(tf.float32),
    )
    for step in preprocessing_steps:
        dataset = dataset.map(step)
    return dataset


# Build the Cheng et al. training and validation datasets (train, then val).
cheng_et_al_train, cheng_et_al_val = (
    load_cheng_et_al(split_file)
    for split_file in ("train.tfrecord", "val.tfrecord")
)


In [None]:
def get_length(dataset):
    """Count the elements of ``dataset`` by iterating over it once.

    Args:
        dataset: Any iterable; it is consumed from start to end.

    Returns:
        The number of elements yielded, as a Python ``int`` (0 when empty).
    """
    return sum(1 for _ in dataset)


# Each count iterates the corresponding training dataset once from start to
# end (Kaggle first, then Cheng et al.).
kaggle_length, cheng_et_al_length = (
    get_length(train_split) for train_split in (kaggle_train, cheng_et_al_train)
)

# Total number of training examples across both sources.
combined_length = sum((kaggle_length, cheng_et_al_length))


In [None]:
# Fixed seed for the random interleaving below. Without it the order of the
# examples written to train.tfrecord differs on every run of the notebook.
SAMPLING_SEED = 42

# Randomly interleave the two training sets. Each source is drawn with a
# probability proportional to its size, so neither source is expected to run
# out much earlier than the other.
# NOTE(review): the result is only reproducible if the two source datasets
# themselves iterate in a fixed order - confirm for chiron's loader.
combined_train = tf.data.experimental.sample_from_datasets(
    [kaggle_train, cheng_et_al_train],
    weights=[
        kaggle_length / combined_length,
        cheng_et_al_length / combined_length,
    ],
    seed=SAMPLING_SEED,
)


In [None]:
# Validation data is appended rather than randomly mixed: all Kaggle validation
# examples come first, followed by all Cheng et al. validation examples.
combined_val = kaggle_val.concatenate(cheng_et_al_val)


In [None]:
def generate(dataset):
    """Lazily convert (image, label) tensor pairs into plain Python values.

    Args:
        dataset: Iterable of ``(image, label)`` pairs whose members expose a
            ``.numpy()`` method; the label's ``.numpy()`` result must support
            ``.decode()`` (i.e. be a bytes object).

    Yields:
        ``(image.numpy(), label.numpy().decode())`` for each pair, in order.
    """
    for example in dataset:
        image_tensor, label_tensor = example
        pixels = image_tensor.numpy()
        # The label arrives as bytes; decode it to a str.
        label_text = label_tensor.numpy().decode()
        yield pixels, label_text


# Directory that receives the combined train/val TFRecord files.
output_dir = os.path.join("..", "tfrecord", "combined")

# exist_ok=True creates the directory (and any missing parents) if needed and
# is a no-op when it already exists, avoiding the check-then-create race of a
# separate os.path.isdir() test.
os.makedirs(output_dir, exist_ok=True)

# Write each combined split to its own TFRecord file: train first, then val.
# generate() turns the tensor pairs into plain (array, str) examples.
for split_file, split_dataset in (
    ("train.tfrecord", combined_train),
    ("val.tfrecord", combined_val),
):
    chiron.tfrecord.save_tfrecord(
        os.path.join(output_dir, split_file), generate(split_dataset)
    )
