In [None]:
import pydicom
import os
import numpy as np
import glob
import h5py
from tqdm import tqdm
import pandas as pd
import json
import cv2 as cv
import matplotlib.pyplot as plt

In [None]:
# Directory holding the train/val/test split listings (one entry per line).
SPLIT_DIR = "/home/ncp/workspace/seung-ah"


def _read_split(name):
    """Return the lines of ``<SPLIT_DIR>/<name>.txt`` as a list of strings.

    Line terminators are stripped (``str.splitlines``).
    """
    # The context manager closes the handle as soon as the file is read,
    # instead of leaving it open until garbage collection.
    with open(os.path.join(SPLIT_DIR, f"{name}.txt"), 'r') as split_file:
        return split_file.read().splitlines()


train = _read_split("train")
val = _read_split("val")
test = _read_split("test")

# Every entry of the three splits, in train, val, test order.
dataset = train + val + test

In [None]:
'''
Save hdf5 files by single cpu processor
'''
class hdf5_Converter:
    """Convert DICOM images referenced by JSON annotation files into HDF5 files.

    One ``<identifier>.hdf5`` file is written per annotation under
    ``<save_dir>/<mode>``. Each file holds two datasets: ``input`` (the resized
    2-D image) and ``class_id`` (a one-element list holding the label).

    Args:
        dataset: iterable of paths to JSON annotation files.
        save_dir: base output directory.
        mode: name of the sub-directory created under ``save_dir``.
        root_dir: directory that the DICOM paths in the annotations are joined to.
        image_size: side length, in pixels, of the square output image.
    """

    def __init__(self, dataset, save_dir, mode, root_dir='/home/ncp/workspace/data/train',
                 image_size=256):
        self.save_dir = save_dir
        self.root_dir = root_dir

        self.dataset = dataset
        self.mode = mode
        self.image_size = image_size
        # Filled by _get_infarct_data with [dicom_path, label, file_name] entries.
        self.data_list = []

    def start_parsing(self):
        """Read every annotation, then write one HDF5 file per entry."""
        self._get_infarct_data()
        print(len(self.data_list))
        print('Finish Getting DICOM datas...\n')
        self._save_image_to_hdf5()

    def _get_infarct_data(self):
        """Populate ``self.data_list`` from the JSON files in ``self.dataset``.

        Each entry is ``[dicom_path, label, file_name]``.

        Raises:
            KeyError: if an annotation lacks one of the keys read below.
            ValueError: if the diagnosis value cannot be converted to ``int``.
        """
        # Start from an empty list so that re-running (e.g. re-executing a
        # notebook cell) does not append duplicate entries.
        self.data_list = []

        for data in tqdm(self.dataset):
            # Read the annotation; the `with` block closes the file on exit.
            with open(data, 'r', encoding='UTF8') as f:
                content = json.load(f)

            file_name = content['identifier']
            # The first four characters of the stored path are dropped before
            # joining - presumably a fixed prefix; TODO confirm against the data.
            dicom_path = os.path.join(self.root_dir, content['mask_image']['org_dicom_file'][4:])

            label = int(content['patient']['diagnosis'])
            # Diagnosis code 9 is stored as label 7.
            # NOTE(review): presumably keeps label ids contiguous - confirm.
            if label == 9:
                label = 7

            self.data_list.append([dicom_path, label, file_name])

    def _save_image_to_hdf5(self):
        """Write each entry of ``self.data_list`` to ``<save_dir>/<mode>/<file_name>.hdf5``."""
        save_path = os.path.join(self.save_dir, self.mode)
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(save_path, exist_ok=True)

        dim = (self.image_size, self.image_size)

        for dicom_path, class_id, file_name in tqdm(self.data_list):
            output_path = os.path.join(save_path, f'{file_name}.hdf5')

            # Load and validate the image BEFORE creating the output file, so a
            # skipped or unreadable DICOM never leaves an empty .hdf5 behind.
            image = pydicom.dcmread(dicom_path).pixel_array
            if not isinstance(image, (np.ndarray, np.generic)):
                continue

            # Resize first, then collapse to a single channel (same order as before,
            # so the resulting pixel values are unchanged).
            image = cv.resize(image, dim, interpolation=cv.INTER_AREA)
            if len(image.shape) == 3:
                # NOTE(review): assumes BGR channel order; DICOM colour data may
                # be RGB, which would swap the red/blue weights - confirm.
                image = cv.cvtColor(image, cv.COLOR_BGR2GRAY)

            height, width = image.shape

            # The context manager closes the file; no explicit close() is needed.
            with h5py.File(output_path, 'w') as hf:
                hf.create_dataset(
                    name='input',
                    data=image,
                    shape=(height, width),
                    maxshape=(height, width),
                    compression="gzip",
                    compression_opts=9)

                hf.create_dataset(name='class_id', data=[class_id])

In [None]:
# Convert every split with its OWN file list. Previously all three runs were
# given `train`, so the 'val' and 'test' folders were filled with training data.
HDF5_DIR = "/home/ncp/workspace/seung-ah/hdf5"

for mode, split in (('train', train), ('val', val), ('test', test)):
    convert_hdf5 = hdf5_Converter(split, HDF5_DIR, mode)
    convert_hdf5.start_parsing()