In [None]:
import data.factory as data_factory
import utils.config as config
import data.util as data_util
import data.dcm_util as dcm_util

import os
import cv2
import pydicom
import numpy as np
import pandas as pd

from tqdm import tqdm
from joblib import Parallel, delayed

### Data folders config

In [None]:
# Central config object; all file and folder names below are joined onto cfg.path.
cfg = config.Config()
cfg.path = '../data/'

# Training set: image folder plus the CSV file names that live under cfg.path.
# NOTE(review): attribute assignment on cfg.train relies on config.Config
# converting the plain dict into an attribute-style container — confirm in utils.config.
cfg.train = dict()
cfg.train.folder = 'stage_2_train_images/'
cfg.train.df_name = 'stage_2_train.csv'
cfg.train.meta = 'stage_2_train_metadata.csv'
cfg.train.path = cfg.path + cfg.train.folder

# Test set: image folder and the sample-submission CSV used as the image list.
cfg.test = dict()
cfg.test.folder = 'stage_2_test_images/'
cfg.test.df_name = 'stage_2_sample_submission.csv'
cfg.test.path = cfg.path + cfg.test.folder

### Load data frames

In [None]:
# load_train_dataframes returns three values; only the first is used in this notebook.
train_df, _, _ = data_factory.load_train_dataframes(cfg)

In [None]:
# Test rows come from the sample-submission CSV configured in cfg.test.df_name.
test_df = data_util.load_csv(cfg.path + cfg.test.df_name)

In [None]:
def save_img(folder, name, img):
    """Write ``img`` to ``folder + name + '.png'``.

    Args:
        folder: output folder prefix, including the trailing separator.
        name: file name without extension.
        img: image array accepted by ``cv2.imwrite``.

    Raises:
        IOError: if ``cv2.imwrite`` reports that nothing was written.
    """
    out_path = folder + name + '.png'
    # cv2.imwrite signals most failures (e.g. a missing folder) by returning
    # False instead of raising, so the result has to be checked explicitly.
    if not cv2.imwrite(out_path, img):
        raise IOError('cv2.imwrite failed for ' + out_path)

def save_np(folder, name, data):
    """Store ``data`` as int16 under the key ``data`` in an ``.npz`` archive.

    The archive is written to ``folder + name``; ``np.savez`` appends the
    ``.npz`` extension when the name does not already carry it.
    """
    as_int16 = np.int16(data)
    archive_path = folder + name
    np.savez(archive_path, data=as_int16)
    
def save_img_16(folder, name, img):
    """Shift pixel values by +2000 and write them as a 16-bit PNG.

    Args:
        folder: output folder prefix, including the trailing separator.
        name: file name without extension.
        img: numeric image array; values are offset by 2000 before the
            cast to ``uint16``.

    Raises:
        IOError: if ``cv2.imwrite`` reports that nothing was written.
    """
    shifted = np.asarray(img) + 2000
    # Saturate instead of letting out-of-range values wrap around (or, for
    # negative floats, hit an undefined cast) when converting to uint16.
    shifted = np.clip(shifted, 0, np.iinfo(np.uint16).max)
    out_path = folder + name + '.png'
    # cv2.imwrite returns False on failure rather than raising.
    if not cv2.imwrite(out_path, shifted.astype(np.uint16)):
        raise IOError('cv2.imwrite failed for ' + out_path)

def prep_and_save(path, target, row, img_sz):
    """Preprocess one image and save it as a PNG in ``target``.

    The windowing settings are read from the module-level ``window_type`` and
    ``window_conf`` at call time. Any failure is printed together with the
    image id so that one bad file does not abort the whole batch.
    """
    image_id = row['Image']
    try:
        processed = dcm_util.load_and_preprocess(
            path, image_id, img_sz, window_type, window_conf
        )
        save_img(target, image_id, processed)
    except Exception as err:
        print(image_id, err)


In [None]:
# Short aliases for the source image folders used by every export cell below.
path_train = cfg.train.path
path_test = cfg.test.path

### Normal window data 256

In [None]:
# Settings for this export; prep_and_save reads window_conf and window_type as globals.
# NOTE(review): presumably one pair of window parameters per output channel —
# the exact meaning is defined in dcm_util.load_and_preprocess; confirm there.
window_conf = [[-1, -1], [80, 200], [600, 2800]]
window_type = 0  # 0 - normal, 1 - sigmoid
img_sz = 256

In [None]:
# Output folder for the training PNGs at the current img_sz.
path_train_out = cfg.path + 'stage_2_train_images_{}_png/'.format(img_sz)

# exist_ok=True keeps the cell re-runnable; os.mkdir raised FileExistsError on a second run.
os.makedirs(path_train_out, exist_ok=True)

# iterrows() is a generator, so tqdm needs total= to show progress and an ETA.
# The trailing ';' stops the notebook from echoing the returned list of None.
Parallel(n_jobs=-1)(
    delayed(prep_and_save)(path_train, path_train_out, row, img_sz)
    for _, row in tqdm(train_df.iterrows(), total=len(train_df))
);

In [None]:
# Output folder for the test PNGs at the current img_sz.
path_test_out  = cfg.path + 'stage_2_test_images_{}_png/'.format(img_sz)

# exist_ok=True keeps the cell re-runnable; os.mkdir raised FileExistsError on a second run.
os.makedirs(path_test_out, exist_ok=True)

# iterrows() is a generator, so tqdm needs total= to show progress and an ETA.
# The trailing ';' stops the notebook from echoing the returned list of None.
Parallel(n_jobs=-1)(
    delayed(prep_and_save)(path_test, path_test_out, row, img_sz)
    for _, row in tqdm(test_df.iterrows(), total=len(test_df))
);

### Sigmoid window data 378

In [None]:
# Settings for the sigmoid-window export; prep_and_save reads window_conf and
# window_type as globals, so this cell must run before the export cells below.
window_conf = [[-1, -1], [80, 200], [600, 2800]]
window_type = 1  # 0 - normal, 1 - sigmoid
img_sz = 378

In [None]:
# NOTE(review): the folder is prefixed "stage_1_" although the source images are the
# stage 2 training set; the name is kept as-is because other code may expect this path.
path_train_out = cfg.path + 'stage_1_train_sig_images_{}_png/'.format(img_sz)

# exist_ok=True keeps the cell re-runnable; os.mkdir raised FileExistsError on a second run.
os.makedirs(path_train_out, exist_ok=True)

# iterrows() is a generator, so tqdm needs total= to show progress and an ETA.
# The trailing ';' stops the notebook from echoing the returned list of None.
Parallel(n_jobs=-1)(
    delayed(prep_and_save)(path_train, path_train_out, row, img_sz)
    for _, row in tqdm(train_df.iterrows(), total=len(train_df))
);

In [None]:
# Output folder for the sigmoid-window test PNGs at the current img_sz.
path_test_out  = cfg.path + 'stage_2_test_sig_images_{}_png/'.format(img_sz)

# exist_ok=True keeps the cell re-runnable; os.mkdir raised FileExistsError on a second run.
os.makedirs(path_test_out, exist_ok=True)

# iterrows() is a generator, so tqdm needs total= to show progress and an ETA.
# The trailing ';' stops the notebook from echoing the returned list of None.
Parallel(n_jobs=-1)(
    delayed(prep_and_save)(path_test, path_test_out, row, img_sz)
    for _, row in tqdm(test_df.iterrows(), total=len(test_df))
);

### Normal window data 378 v2

In [None]:
# Settings for the "v2" normal-window export: the third pair is [40, 380] here.
# prep_and_save reads window_conf and window_type as globals, so this cell
# must run before the export cells below.
window_conf = [[-1, -1], [80, 200], [40, 380]]
window_type = 0  # 0 - normal, 1 - sigmoid
img_sz = 378

In [None]:
# Output folder for the v2 normal-window training PNGs at the current img_sz.
path_train_out = cfg.path + 'stage_1_train_images_{}_png_v2/'.format(img_sz)

# exist_ok=True keeps the cell re-runnable; os.mkdir raised FileExistsError on a second run.
os.makedirs(path_train_out, exist_ok=True)

# iterrows() is a generator, so tqdm needs total= to show progress and an ETA.
# The trailing ';' stops the notebook from echoing the returned list of None.
Parallel(n_jobs=-1)(
    delayed(prep_and_save)(path_train, path_train_out, row, img_sz)
    for _, row in tqdm(train_df.iterrows(), total=len(train_df))
);

In [None]:
# Output folder for the v2 normal-window test PNGs at the current img_sz.
path_test_out  = cfg.path + 'stage_1_test_images_{}_png_v2/'.format(img_sz)

# exist_ok=True keeps the cell re-runnable; os.mkdir raised FileExistsError on a second run.
os.makedirs(path_test_out, exist_ok=True)

# iterrows() is a generator, so tqdm needs total= to show progress and an ETA.
# The trailing ';' stops the notebook from echoing the returned list of None.
Parallel(n_jobs=-1)(
    delayed(prep_and_save)(path_test, path_test_out, row, img_sz)
    for _, row in tqdm(test_df.iterrows(), total=len(test_df))
);

### Full data 256 with adj images

In [None]:
def load_dcm_data(path, img_id):
    """Read ``path + img_id + '.dcm'`` and return its ``pixel_array``.

    The array is returned exactly as pydicom exposes it; no rescaling or
    windowing is applied here.
    """
    dcm_file = path + img_id + '.dcm'
    return pydicom.dcmread(dcm_file).pixel_array


def load_and_preprocess_v2(path, index, img_sz, meta):
    """Stack a slice with its previous and next neighbours as three channels.

    Channel 0 holds the previous row's pixels, channel 1 the current row's and
    channel 2 the next row's. A neighbour is only used when it shares both
    ``PatientID`` and ``StudyInstanceUID`` with the current row; otherwise
    (including at either end of the table) the current slice is repeated.

    Args:
        path: folder prefix that contains the ``.dcm`` files.
        index: 0-based row position in ``meta`` (equal to the row label when
            ``meta`` has a default 0..n-1 index).
        img_sz: if truthy, the result is resized to ``(img_sz, img_sz)``.
        meta: DataFrame with ``Image``, ``PatientID`` and ``StudyInstanceUID``
            columns, ordered so that adjacent rows are adjacent slices.

    Returns:
        float32 array with a trailing channel axis of length 3.
    """
    # Rows are addressed by position throughout; the previous mix of iloc,
    # loc and meta.index comparisons only agreed for a 0..n-1 index.
    curr = meta.iloc[index]
    data = load_dcm_data(path, curr['Image'])

    def _neighbour_pixels(pos):
        """Pixels of row ``pos`` if it belongs to the same study, else ``data``."""
        if pos < 0 or pos >= len(meta):
            return data
        other = meta.iloc[pos]
        same_study = (curr['PatientID'] == other['PatientID']
                      and curr['StudyInstanceUID'] == other['StudyInstanceUID'])
        return load_dcm_data(path, other['Image']) if same_study else data

    img = np.zeros((*np.shape(data), 3), dtype=np.float32)
    img[..., 0] = _neighbour_pixels(index - 1)
    img[..., 1] = data
    img[..., 2] = _neighbour_pixels(index + 1)

    if img_sz:
        img = cv2.resize(img, (img_sz, img_sz))

    return img

In [None]:
def prep_and_save_v2(path, target, index, row, img_sz, meta):
    """Build the three-slice image for row ``index`` and save it as a 16-bit PNG.

    The output file is named after ``row['Image']``. Any failure is printed
    with that id so that one bad file does not abort the whole batch.
    """
    image_id = row['Image']
    try:
        stacked = load_and_preprocess_v2(path, index, img_sz, meta)
        save_img_16(target, image_id, stacked)
    except Exception as err:
        print(image_id, err)

In [None]:
# Resolution used by the three-slice export below.
img_sz = 256

# Load the per-image metadata tables and rename "ID" to "Image" so they use
# the same column name that the export helpers read.
id_to_image = {"ID": "Image"}
train_meta_df = pd.read_csv(cfg.path + 'stage_2_train_metadata.csv').rename(columns=id_to_image)
test_meta_df  = pd.read_csv(cfg.path + 'stage_2_test_metadata.csv').rename(columns=id_to_image)

In [None]:
# pos_Z is the last comma-separated component of ImagePositionPatient
# (assumes a bracketed string; the outer characters and quotes are stripped).
train_meta_df['pos_Z'] = train_meta_df['ImagePositionPatient'].apply(
    lambda pos: float(pos[1:-1].replace('\'', '').split(',')[-1])
)

# Order slices within each patient/study by pos_Z and renumber rows 0..n-1,
# so that neighbouring rows are neighbouring slices.
train_meta_df = (
    train_meta_df
    .sort_values(by=['PatientID', 'StudyInstanceUID', 'pos_Z'])
    .reset_index(drop=True)
)

In [None]:
# pos_Z is the last comma-separated component of ImagePositionPatient
# (assumes a bracketed string; the outer characters and quotes are stripped).
test_meta_df['pos_Z'] = test_meta_df['ImagePositionPatient'].apply(
    lambda pos: float(pos[1:-1].replace('\'', '').split(',')[-1])
)

# Order slices within each patient/study by pos_Z and renumber rows 0..n-1,
# so that neighbouring rows are neighbouring slices.
test_meta_df = (
    test_meta_df
    .sort_values(by=['PatientID', 'StudyInstanceUID', 'pos_Z'])
    .reset_index(drop=True)
)

In [None]:
# Sanity check: number of rows in the test metadata table.
print(len(test_meta_df))

In [None]:
# Output folder for the three-slice 16-bit training PNGs at the current img_sz.
path_train_out = cfg.path + 'stage_2_train_images_{}_full/'.format(img_sz)

# exist_ok=True keeps the cell re-runnable; os.mkdir raised FileExistsError on a second run.
os.makedirs(path_train_out, exist_ok=True)

# The index from iterrows() is what load_and_preprocess_v2 uses to find the
# neighbouring rows in train_meta_df. total= lets tqdm show progress, and the
# trailing ';' stops the notebook from echoing the returned list of None.
Parallel(n_jobs=-1)(
    delayed(prep_and_save_v2)(path_train, path_train_out, index, row, img_sz, train_meta_df)
    for index, row in tqdm(train_meta_df.iterrows(), total=len(train_meta_df))
);

In [None]:
# NOTE(review): the folder suffix is "_np" although PNG files are written (the
# train export uses "_full"); the name is kept as-is because other code may expect it.
path_test_out  = cfg.path + 'stage_2_test_images_{}_np/'.format(img_sz)

# exist_ok=True keeps the cell re-runnable; os.mkdir raised FileExistsError on a second run.
os.makedirs(path_test_out, exist_ok=True)

# Iterate test_meta_df itself: load_and_preprocess_v2 looks rows up in
# test_meta_df by this index, so iterating test_df (as before) paired each
# output file name with the pixels of an unrelated metadata row.
Parallel(n_jobs=-1)(
    delayed(prep_and_save_v2)(path_test, path_test_out, index, row, img_sz, test_meta_df)
    for index, row in tqdm(test_meta_df.iterrows(), total=len(test_meta_df))
);

### Check generated data

In [None]:
# Build the three-slice image for the first metadata row in memory and print
# its value range (before the +2000 shift that save_img_16 applies on disk).
I = load_and_preprocess_v2(path_train, 0, img_sz, train_meta_df)
print(np.min(I), np.max(I))

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Read one exported PNG back; flag -1 loads the file unchanged, keeping its 16-bit depth.
sample_path = '../data/stage_2_train_images_256_full/' + 'ID_033b3ee42' + '.png'
I = cv2.imread(sample_path, -1)

print(np.min(I), np.max(I))

# Show the three channels side by side, scaled down by 8000 for display.
f, ax = plt.subplots(ncols=3, figsize=(20,4))
for i, axis in enumerate(ax):
    axis.imshow(I[...,i] / 8000)

In [None]:
# Number of rows in the training metadata table.
len(train_meta_df)

### Move data
A way to move a large number of files (run from the data folder):

In [None]:
# Count the entries currently in the source folder of the move command below.
x = os.listdir('../data/stage_1_test_sig_images_378_png/')
len(x)

find stage_1_test_sig_images_378_png -name '*.*' -exec mv {} stage_1_train_sig_images_378_png \;