## 0. Setup and preparation

### Auto-Reload

In [1]:
%load_ext autoreload
%autoreload 2
# Always reload all modules before executing Python code

### Add a module path

In [2]:
import os
import sys

In [None]:
# Locate the helper-module folder relative to the directory the kernel is running in.
jupyter_run_dir = os.getcwd()
print(f'The current active folder is "{jupyter_run_dir}".\n')

# module path
module_dir = os.path.join(jupyter_run_dir, 'Splitting-data')

# Guard the append so that re-running this cell does not stack duplicate
# entries onto sys.path.
if module_dir not in sys.path:
    sys.path.append(module_dir)
# Echo the registered path itself; sys.path[-1] would be a different entry
# when module_dir was already present further up the list.
print(module_dir)

The current active folder is "/mnt/4d055040/lab".

/mnt/4d055040/lab/Splitting-data


### Fix the random seeds so the experiment runs under the same conditions.

In [4]:
import random
import numpy as np

def seed(seed = 42):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator with
    the same value, then stores that value in the ``PYTHONHASHSEED``
    environment variable.

    Args:
        seed: Seed value handed to both generators (default 42).
    """
    # Same seed for every generator, stdlib first and NumPy second.
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
    # NOTE: the interpreter reads PYTHONHASHSEED at start-up, so this
    # assignment does not change hashing in the already-running kernel; it is
    # only visible to processes that inherit this environment.
    os.environ["PYTHONHASHSEED"] = str(seed)


# Apply the default seed as soon as the cell runs.
seed()

## 1. Splitting

In [None]:
# Share of the data assigned to the training, validation and test splits.
train_ratio, valid_ratio, test_ratio = 0.6, 0.2, 0.2

### Loading an npz dataset.
> Load the npz file that was saved by `0-image_to_NPZ.ipynb`.

In [5]:
# Archive to read; it must contain the 'class_names', 'data' and 'label' arrays.
npz_path = '/mnt/7977asdasf/npz/img2npz.npz'

# Pull the arrays out while the archive is open; the context manager closes
# the file once the block exits.
with np.load(npz_path) as f:
    class_names = f['class_names']
    dataset = {key: f[key] for key in ('data', 'label')}

# Display the class names next to the distinct label values and their counts.
(class_names, np.unique(dataset['label'], return_counts=True))

(array(['TNBC_baseline', 'Normal'], dtype='<U13'),
 (array([0, 1]), array([609, 277])))

### Splitting into a test dataset and a training dataset.

In [6]:
from utils.utils import make_a_splitset

In [8]:
# Split off the test portion first, sized by test_ratio.
# NOTE(review): save=False presumably skips writing the split to disk - confirm in utils.utils.
x_train, x_test, y_train, y_test = make_a_splitset(
    dataset,
    class_names,
    splitset_size=test_ratio,
    save=False,
)

Creating training, validation, and test datasets. -> test0.2.npz
data shape : (708, 800, 400), (178, 800, 400)
label shape : (708,), (178,)


### Make a Validation dataset.
> The validation ratio is defined relative to the entire dataset, but the validation split is taken from the training dataset, so the ratio has to be rescaled.   
> e.g., There are 100 data points in total: 80 of them in the training dataset and 20 in the test dataset.    
> &emsp;&emsp;You want to use 20% of the entire dataset (20 data points) as the validation dataset.    
> &emsp;&emsp;The ratio applied to the training dataset is therefore 20 / 80 = 0.25, not 0.2.

In [9]:
from utils.utils import make_a_validationset_static_test

In [32]:
# Number of samples that valid_ratio represents when measured against the
# whole dataset (not only the training portion).
valid_data_count = valid_ratio * len(dataset['label'])
round(valid_data_count, 2)

177.2

In [28]:
# Folder handed to the helper as the save location for the resulting npz file.
save_dir = '/mnt/7977asdasf/npz'

# Ratios are passed in (train, validation, test) order.
split_ratios = (train_ratio, valid_ratio, test_ratio)
make_a_validationset_static_test(
    x_train, x_test, y_train, y_test, class_names,
    train_val_test_size=split_ratios,
    save_dir=save_dir,
)

train + val = 0.8
val = 0.25
data shape : (531, 800, 400), (177, 800, 400), (178, 800, 400)
label shape : (531,), (177,), (178,)
save... /mnt/7977asdasf/npz/train0.6val0.2test0.2.npz

