In [22]:
import os
# import pprint
import numpy as np
import random
import pandas as pd
import tensorflow as tf

`train_edited.csv` は以下手順で作成。

```python
ROOT_DIR = '../input/rsna-miccai-brain-tumor-radiogenomic-classification'
TRAIN_DIR = os.path.join(ROOT_DIR, 'train')
TRAIN_CSV = os.path.join(ROOT_DIR, 'train_labels.csv')

train_df = pd.read_csv(TRAIN_CSV)
TRAIN_NPY_DIR = "/kaggle/input/create-3d-npz-rsna-radiogenomic-classification/test"
TRAIN_FLAIR_NPY_DIR = os.path.join(TRAIN_NPY_DIR, "FLAIR")
train_df["path_to_flair_dir"] = train_df.BraTS21ID.apply(lambda x: os.path.join(TRAIN_DIR, f"{x:>05}", "FLAIR"))
train_df["flair_image_count"] = train_df.path_to_flair_dir.apply(lambda x: len(os.listdir(x)))
train_df["path_to_flair_npz"] = train_df.BraTS21ID.apply(lambda x: os.path.join(TRAIN_FLAIR_NPY_DIR, f"{x:>05}.npz"))
```

In [24]:
train_df = pd.read_csv("train_edited.csv")

dataset_storage = {}
use_col = "path_to_flair_npz"
col_identifier = use_col.split("_")[2] # flair
dataset_storage[col_identifier] = {}

# npz ファイルとその MGMT_value を紐付け
dataset_storage[col_identifier]["train_ds_df"] = train_df[train_df[use_col].apply(lambda x: os.path.isfile(x))][["MGMT_value", use_col]]
# 上記 MGMT_value をリスト化
dataset_storage[col_identifier]["train_lbl_list"] = dataset_storage[col_identifier]["train_ds_df"].MGMT_value.to_list()
# 上記 "path_to_flair_npz" をリスト化
dataset_storage[col_identifier]["train_npz_file_list"] = dataset_storage[col_identifier]["train_ds_df"][use_col].to_list()

# ↑と同じことを test ディレクトリにも適用
# dataset_storage[col_identifier]["test_ds_df"] = ss_df[ss_df[use_col].apply(lambda x: True if os.path.isfile(x) else False)][["BraTS21ID", use_col]]
# dataset_storage[col_identifier]["test_id_list"] = dataset_storage[col_identifier]["test_ds_df"].BraTS21ID.to_list()
# dataset_storage[col_identifier]["test_npz_file_list"] = dataset_storage[col_identifier]["test_ds_df"][use_col].to_list()

# This is for splitting

# npz ファイルの数
dataset_storage[col_identifier]["N_EX"] = len(dataset_storage[col_identifier]["train_lbl_list"])
dataset_storage[col_identifier]["VAL_FRAC"] = 0.1

# npz ファイルの数 * (1 - 0.9) (訓練用データ)
dataset_storage[col_identifier]["N_TRAIN"] = int(dataset_storage[col_identifier]["N_EX"]*(1-dataset_storage[col_identifier]["VAL_FRAC"]))
# npz ファイルの数 * 0.1 (バリデーション用データ)
dataset_storage[col_identifier]["N_VAL"] = dataset_storage[col_identifier]["N_EX"]-dataset_storage[col_identifier]["N_TRAIN"]

# 重複なしで N_EX 個のサンプルをシャッフルされた状態で抽出
dataset_storage[col_identifier]["RANDOM_INDICES"] = random.sample(range(dataset_storage[col_identifier]["N_EX"]), dataset_storage[col_identifier]["N_EX"])
# ↑から N_EX * 0.9 個抽出
dataset_storage[col_identifier]["TRAIN_INDICES"] = dataset_storage[col_identifier]["RANDOM_INDICES"][:dataset_storage[col_identifier]["N_TRAIN"]]
# ↑から N_EX * 0.1 個抽出
dataset_storage[col_identifier]["VAL_INDICES"] = dataset_storage[col_identifier]["RANDOM_INDICES"][dataset_storage[col_identifier]["N_TRAIN"]:]

In [None]:
# TF Data Integration
dataset_storage[col_identifier]["lbl_train_ds"] = tf.data.Dataset.from_tensor_slices(np.array(dataset_storage[col_identifier]["train_lbl_list"])[dataset_storage[col_identifier]["TRAIN_INDICES"]])
dataset_storage[col_identifier]["npz_file_train_ds"] = tf.data.Dataset.from_tensor_slices(np.array(dataset_storage[col_identifier]["train_npz_file_list"])[dataset_storage[col_identifier]["TRAIN_INDICES"]])
dataset_storage[col_identifier]["raw_train_ds"] = tf.data.Dataset.zip((dataset_storage[col_identifier]["npz_file_train_ds"], dataset_storage[col_identifier]["lbl_train_ds"]))

dataset_storage[col_identifier]["lbl_val_ds"] = tf.data.Dataset.from_tensor_slices(np.array(dataset_storage[col_identifier]["train_lbl_list"])[dataset_storage[col_identifier]["VAL_INDICES"]])
dataset_storage[col_identifier]["npz_file_val_ds"] = tf.data.Dataset.from_tensor_slices(np.array(dataset_storage[col_identifier]["train_npz_file_list"])[dataset_storage[col_identifier]["VAL_INDICES"]])
dataset_storage[col_identifier]["raw_val_ds"] = tf.data.Dataset.zip((dataset_storage[col_identifier]["npz_file_val_ds"], dataset_storage[col_identifier]["lbl_val_ds"]))

dataset_storage[col_identifier]["id_test_ds"] = tf.data.Dataset.from_tensor_slices(dataset_storage[col_identifier]["test_id_list"])
dataset_storage[col_identifier]["npz_file_test_ds"] = tf.data.Dataset.from_tensor_slices(dataset_storage[col_identifier]["test_npz_file_list"])
dataset_storage[col_identifier]["raw_test_ds"] = tf.data.Dataset.zip((dataset_storage[col_identifier]["npz_file_test_ds"], dataset_storage[col_identifier]["id_test_ds"]))

if INPUT_SHAPE[-1]==3:
    dataset_storage[col_identifier]["train_ds"] = dataset_storage[col_identifier]["raw_train_ds"].map(lambda x,y: (tf.repeat(tf.reshape(tf.py_function(
        load_npz, [x, True], (tf.uint8,)
    ), (*INPUT_SHAPE[:-1], 1))/255, axis=-1, repeats=3), tf.cast(y, tf.uint8)), num_parallel_calls=tf.data.AUTOTUNE)
    dataset_storage[col_identifier]["train_ds"] = dataset_storage[col_identifier]["train_ds"].shuffle(BATCH_SIZE*5).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

    dataset_storage[col_identifier]["val_ds"] = dataset_storage[col_identifier]["raw_val_ds"].map(lambda x,y: (tf.repeat(tf.reshape(tf.py_function(
        load_npz, [x, True], (tf.uint8,)
    ), (*INPUT_SHAPE[:-1], 1))/255, axis=-1, repeats=3), tf.cast(y, tf.uint8)), num_parallel_calls=tf.data.AUTOTUNE)
    dataset_storage[col_identifier]["val_ds"] = dataset_storage[col_identifier]["val_ds"].batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

    dataset_storage[col_identifier]["test_ds"] = dataset_storage[col_identifier]["raw_test_ds"].map(lambda x,y: (tf.repeat(tf.reshape(tf.py_function(
        load_npz, [x, True], (tf.uint8,)
    ), (*INPUT_SHAPE[:-1], 1))/255, axis=-1, repeats=3), tf.cast(y, tf.int32)), num_parallel_calls=tf.data.AUTOTUNE)
    dataset_storage[col_identifier]["test_ds"] = dataset_storage[col_identifier]["test_ds"].batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
else:
    dataset_storage[col_identifier]["train_ds"] = dataset_storage[col_identifier]["raw_train_ds"].map(lambda x,y: (tf.reshape(tf.py_function(
        load_npz, [x, True], (tf.uint8,)
    ), INPUT_SHAPE)/255, tf.cast(y, tf.uint8)), num_parallel_calls=tf.data.AUTOTUNE)
    dataset_storage[col_identifier]["train_ds"] = dataset_storage[col_identifier]["train_ds"].shuffle(BATCH_SIZE*5).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

    dataset_storage[col_identifier]["val_ds"] = dataset_storage[col_identifier]["raw_val_ds"].map(lambda x,y: (tf.reshape(tf.py_function(
        load_npz, [x, True], (tf.uint8,)
    ), INPUT_SHAPE)/255, tf.cast(y, tf.uint8)), num_parallel_calls=tf.data.AUTOTUNE)
    dataset_storage[col_identifier]["val_ds"] = dataset_storage[col_identifier]["val_ds"].batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

    dataset_storage[col_identifier]["test_ds"] = dataset_storage[col_identifier]["raw_test_ds"].map(lambda x,y: (tf.reshape(tf.py_function(
        load_npz, [x, True], (tf.uint8,)
    ), INPUT_SHAPE)/255, tf.cast(y, tf.int32)), num_parallel_calls=tf.data.AUTOTUNE)
    dataset_storage[col_identifier]["test_ds"] = dataset_storage[col_identifier]["test_ds"].batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [25]:
tf.data.Dataset.from_tensor_slices(np.array(dataset_storage[col_identifier]["train_lbl_list"])[dataset_storage[col_identifier]["TRAIN_INDICES"]])

2021-09-04 18:03:31.117151: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


<TensorSliceDataset shapes: (), types: tf.float64>

In [29]:
np.array([1, 0, 1, 1, 0])[[0, 2, 3]]

array([1, 1, 1])