[🧠🧬 EDA+3D-Baseline – RSNA – Glioma Radiogenomics](https://www.kaggle.com/dschettler8845/eda-3d-baseline-rsna-glioma-radiogenomics/notebook#modelling) の
`6.2 CREATE THE DATASETS` 部分を解読。`train_edited.csv` は以下手順で作成。

```python
# Competition input locations: the label CSV and the per-case train folders.
ROOT_DIR = '../input/rsna-miccai-brain-tumor-radiogenomic-classification'
TRAIN_CSV = os.path.join(ROOT_DIR, 'train_labels.csv')
TRAIN_DIR = os.path.join(ROOT_DIR, 'train')

# Location of the pre-built .npz volumes; FLAIR files sit in a "FLAIR" subfolder.
# NOTE(review): the directory used for the *train* table ends in "test" --
# confirm against the source notebook that this is the intended folder.
TRAIN_NPY_DIR = "/kaggle/input/create-3d-npz-rsna-radiogenomic-classification/test"
TRAIN_FLAIR_NPY_DIR = os.path.join(TRAIN_NPY_DIR, "FLAIR")

train_df = pd.read_csv(TRAIN_CSV)
brats_ids = train_df["BraTS21ID"]

# <TRAIN_DIR>/<id zero-padded to 5 characters>/FLAIR for every case.
train_df["path_to_flair_dir"] = brats_ids.apply(
    lambda case_id: os.path.join(TRAIN_DIR, f"{case_id:>05}", "FLAIR")
)
# Number of directory entries found in each FLAIR folder.
train_df["flair_image_count"] = train_df["path_to_flair_dir"].apply(
    lambda flair_dir: len(os.listdir(flair_dir))
)
# <TRAIN_FLAIR_NPY_DIR>/<id zero-padded to 5 characters>.npz for every case.
train_df["path_to_flair_npz"] = brats_ids.apply(
    lambda case_id: os.path.join(TRAIN_FLAIR_NPY_DIR, f"{case_id:>05}.npz")
)
```

### まとめ

評価用データ、テスト用データの処理も含まれているので煩雑に見える。  
訓練データの部分を抜き出すと以下のようになる。

In [1]:
import os
import pprint
import numpy as np
import random
import pandas as pd
import tensorflow as tf

def load_npz(np_file_path, is_tf=False):
    """Load the array stored under the key ``"arr_0"`` in an ``.npz`` archive.

    Args:
        np_file_path: Location of the ``.npz`` file. When ``is_tf`` is truthy
            the value must provide ``.numpy()`` returning the path as bytes
            (this is how the function is invoked via ``tf.py_function``);
            otherwise it is handed to ``np.load`` unchanged.
        is_tf: Truthy when ``np_file_path`` has to be decoded as described
            above before loading.

    Returns:
        The array saved under ``"arr_0"``.

    Raises:
        KeyError: If the archive has no ``"arr_0"`` entry.
    """
    path = np_file_path.numpy().decode() if is_tf else np_file_path
    # np.load on an .npz returns an NpzFile that keeps the file open; the
    # context manager closes it once the array has been read, so loading
    # many examples does not leak file handles.
    with np.load(path) as archive:
        return archive["arr_0"]

# Shape each loaded volume is reshaped to, and the number of examples per batch.
INPUT_SHAPE = (128, 128, 32, 1)
BATCH_SIZE = 8

train_df = pd.read_csv("train_edited.csv")

# Column holding the npz paths; its third "_"-separated token ("flair") is the
# key under which everything for this sequence is stored.
use_col = "path_to_flair_npz"
col_identifier = use_col.split("_")[2]  # "flair"
store = {}
dataset_storage = {col_identifier: store}

# Pair each npz path with its MGMT_value, keeping only rows whose file exists.
npz_exists = train_df[use_col].apply(os.path.isfile)
store["train_ds_df"] = train_df[npz_exists][["MGMT_value", use_col]]
# Labels (lbl = label) and npz paths of those rows as plain lists.
store["train_lbl_list"] = store["train_ds_df"]["MGMT_value"].to_list()
store["train_npz_file_list"] = store["train_ds_df"][use_col].to_list()

# Number of usable npz files and the fraction reserved for validation.
store["N_EX"] = len(store["train_lbl_list"])
store["VAL_FRAC"] = 0.1

# Training share: N_EX * (1 - VAL_FRAC), i.e. 90% of the files, truncated to int.
store["N_TRAIN"] = int(store["N_EX"] * (1 - store["VAL_FRAC"]))

# A shuffled list of all N_EX indices, drawn without replacement ...
store["RANDOM_INDICES"] = random.sample(range(store["N_EX"]), store["N_EX"])
# ... of which the first N_TRAIN are used for training.
store["TRAIN_INDICES"] = store["RANDOM_INDICES"][:store["N_TRAIN"]]

# Labels and npz paths of the training rows, picked out by index.
train_idx = store["TRAIN_INDICES"]
train_labels = np.array(store["train_lbl_list"])[train_idx]
train_paths = np.array(store["train_npz_file_list"])[train_idx]

# One tf.data.Dataset of labels, one of npz paths, zipped into (path, label) pairs.
store["lbl_train_ds"] = tf.data.Dataset.from_tensor_slices(train_labels)
store["npz_file_train_ds"] = tf.data.Dataset.from_tensor_slices(train_paths)
store["raw_train_ds"] = tf.data.Dataset.zip(
    (store["npz_file_train_ds"], store["lbl_train_ds"])
)


def _npz_to_example(npz_path, label):
    # 1. Load the volume from the npz file through tf.py_function (declared as tf.uint8).
    volume = tf.py_function(load_npz, [npz_path, True], (tf.uint8,))
    # 2. Reshape to INPUT_SHAPE and 3. scale by dividing by 255.
    volume = tf.reshape(volume, INPUT_SHAPE) / 255
    return volume, tf.cast(label, tf.uint8)


# 4. map -> shuffle -> batch -> prefetch (sizes tuned by tf.data.AUTOTUNE).
store["train_ds"] = (
    store["raw_train_ds"]
    .map(_npz_to_example, num_parallel_calls=tf.data.AUTOTUNE)
    .shuffle(BATCH_SIZE * 5)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

2021-09-05 10:38:12.477650: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### 解読用

In [2]:
import os
import pprint
import numpy as np
import random
import pandas as pd
import tensorflow as tf

In [3]:
train_df = pd.read_csv("train_edited.csv")

# Column holding the npz paths; its third "_"-separated token ("flair") is the
# key under which everything for this sequence is stored.
use_col = "path_to_flair_npz"
col_identifier = use_col.split("_")[2]  # "flair"
store = {}
dataset_storage = {col_identifier: store}

# Pair each npz path with its MGMT_value, keeping only rows whose file exists.
npz_exists = train_df[use_col].apply(os.path.isfile)
store["train_ds_df"] = train_df[npz_exists][["MGMT_value", use_col]]
# Labels (lbl = label) and npz paths of those rows as plain lists.
store["train_lbl_list"] = store["train_ds_df"]["MGMT_value"].to_list()
store["train_npz_file_list"] = store["train_ds_df"][use_col].to_list()

# The same steps applied to the test directory (left disabled for reference):
# dataset_storage[col_identifier]["test_ds_df"] = ss_df[ss_df[use_col].apply(lambda x: True if os.path.isfile(x) else False)][["BraTS21ID", use_col]]
# dataset_storage[col_identifier]["test_id_list"] = dataset_storage[col_identifier]["test_ds_df"].BraTS21ID.to_list()
# dataset_storage[col_identifier]["test_npz_file_list"] = dataset_storage[col_identifier]["test_ds_df"][use_col].to_list()

# --- Train / validation split ---

# Number of usable npz files and the fraction reserved for validation.
store["N_EX"] = len(store["train_lbl_list"])
store["VAL_FRAC"] = 0.1

# Training share: N_EX * (1 - VAL_FRAC), i.e. 90% of the files, truncated to int.
store["N_TRAIN"] = int(store["N_EX"] * (1 - store["VAL_FRAC"]))
# Validation share: the remaining ~10% (left disabled for reference):
# dataset_storage[col_identifier]["N_VAL"] = dataset_storage[col_identifier]["N_EX"]-dataset_storage[col_identifier]["N_TRAIN"]

# A shuffled list of all N_EX indices, drawn without replacement ...
store["RANDOM_INDICES"] = random.sample(range(store["N_EX"]), store["N_EX"])
# ... of which the first N_TRAIN are used for training ...
store["TRAIN_INDICES"] = store["RANDOM_INDICES"][:store["N_TRAIN"]]
# ... and the rest would be used for validation (left disabled for reference):
# dataset_storage[col_identifier]["VAL_INDICES"] = dataset_storage[col_identifier]["RANDOM_INDICES"][dataset_storage[col_identifier]["N_TRAIN"]:]

pprint.pprint(dataset_storage)

{'flair': {'N_EX': 0,
           'N_TRAIN': 0,
           'RANDOM_INDICES': [],
           'TRAIN_INDICES': [],
           'VAL_FRAC': 0.1,
           'train_ds_df': Empty DataFrame
Columns: [MGMT_value, path_to_flair_npz]
Index: [],
           'train_lbl_list': [],
           'train_npz_file_list': []}}


In [4]:
# Everything for the selected sequence lives under dataset_storage[col_identifier].
store = dataset_storage[col_identifier]
train_idx = store["TRAIN_INDICES"]

# Labels and npz paths of the training rows, picked out by index.
train_labels = np.array(store["train_lbl_list"])[train_idx]
train_paths = np.array(store["train_npz_file_list"])[train_idx]

# Training labels as a tf.data.Dataset.
store["lbl_train_ds"] = tf.data.Dataset.from_tensor_slices(train_labels)
# Training npz file paths as a tf.data.Dataset.
store["npz_file_train_ds"] = tf.data.Dataset.from_tensor_slices(train_paths)
# Both zipped into (npz path, label) pairs.
store["raw_train_ds"] = tf.data.Dataset.zip(
    (store["npz_file_train_ds"], store["lbl_train_ds"])
)

# The same three steps for the validation data (left disabled for reference):
# dataset_storage[col_identifier]["lbl_val_ds"] = tf.data.Dataset.from_tensor_slices(np.array(dataset_storage[col_identifier]["train_lbl_list"])[dataset_storage[col_identifier]["VAL_INDICES"]])
# dataset_storage[col_identifier]["npz_file_val_ds"] = tf.data.Dataset.from_tensor_slices(np.array(dataset_storage[col_identifier]["train_npz_file_list"])[dataset_storage[col_identifier]["VAL_INDICES"]])
# dataset_storage[col_identifier]["raw_val_ds"] = tf.data.Dataset.zip((dataset_storage[col_identifier]["npz_file_val_ds"], dataset_storage[col_identifier]["lbl_val_ds"]))

# The same steps for the test data, zipping paths with ids (left disabled for reference):
# dataset_storage[col_identifier]["id_test_ds"] = tf.data.Dataset.from_tensor_slices(dataset_storage[col_identifier]["test_id_list"])
# dataset_storage[col_identifier]["npz_file_test_ds"] = tf.data.Dataset.from_tensor_slices(dataset_storage[col_identifier]["test_npz_file_list"])
# dataset_storage[col_identifier]["raw_test_ds"] = tf.data.Dataset.zip((dataset_storage[col_identifier]["npz_file_test_ds"], dataset_storage[col_identifier]["id_test_ds"]))

def load_npz(np_file_path, is_tf=False):
    """Load the array stored under the key ``"arr_0"`` in an ``.npz`` archive.

    Args:
        np_file_path: Location of the ``.npz`` file. When ``is_tf`` is truthy
            the value must provide ``.numpy()`` returning the path as bytes
            (this is how the function is invoked via ``tf.py_function``);
            otherwise it is handed to ``np.load`` unchanged.
        is_tf: Truthy when ``np_file_path`` has to be decoded as described
            above before loading.

    Returns:
        The array saved under ``"arr_0"``.

    Raises:
        KeyError: If the archive has no ``"arr_0"`` entry.
    """
    path = np_file_path.numpy().decode() if is_tf else np_file_path
    # np.load on an .npz returns an NpzFile that keeps the file open; the
    # context manager closes it once the array has been read, so loading
    # many examples does not leak file handles.
    with np.load(path) as archive:
        return archive["arr_0"]

# Shape each loaded volume is reshaped to, and the number of examples per batch.
INPUT_SHAPE = (128, 128, 32, 1)
BATCH_SIZE = 8


def _npz_to_example(npz_path, label):
    # 1. Load the volume from the npz file through tf.py_function (declared as tf.uint8).
    volume = tf.py_function(load_npz, [npz_path, True], (tf.uint8,))
    # 2. Reshape to INPUT_SHAPE and 3. scale by dividing by 255.
    volume = tf.reshape(volume, INPUT_SHAPE) / 255
    return volume, tf.cast(label, tf.uint8)


# 4. map -> shuffle -> batch -> prefetch (sizes tuned by tf.data.AUTOTUNE).
store = dataset_storage[col_identifier]
store["train_ds"] = (
    store["raw_train_ds"]
    .map(_npz_to_example, num_parallel_calls=tf.data.AUTOTUNE)
    .shuffle(BATCH_SIZE * 5)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation counterpart: same mapping, no shuffle (left disabled for reference):
# dataset_storage[col_identifier]["val_ds"] = dataset_storage[col_identifier]["raw_val_ds"].map(lambda x,y: (tf.reshape(tf.py_function(
#     load_npz, [x, True], (tf.uint8,)
# ), INPUT_SHAPE)/255, tf.cast(y, tf.uint8)), num_parallel_calls=tf.data.AUTOTUNE)
# dataset_storage[col_identifier]["val_ds"] = dataset_storage[col_identifier]["val_ds"].batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# Test counterpart: second element is cast to tf.int32 (left disabled for reference):
# dataset_storage[col_identifier]["test_ds"] = dataset_storage[col_identifier]["raw_test_ds"].map(lambda x,y: (tf.reshape(tf.py_function(
#     load_npz, [x, True], (tf.uint8,)
# ), INPUT_SHAPE)/255, tf.cast(y, tf.int32)), num_parallel_calls=tf.data.AUTOTUNE)
# dataset_storage[col_identifier]["test_ds"] = dataset_storage[col_identifier]["test_ds"].batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [5]:
# Check the last entry of the input shape tuple; as the final expression of
# the cell its value is what gets displayed.
INPUT_SHAPE = (128, 128, 32, 1)
INPUT_SHAPE[-1]

1