# Data Prep

In [23]:
import numpy as np
import pandas as pd
import os
import shutil
import pydicom
import png
from sklearn.model_selection import GroupKFold
pd.set_option('display.max_colwidth',None)

In [89]:
# --- Input locations -------------------------------------------------------
dicom_train_path = './data/images/dicom/train/'
dicom_test_path = './data/images/dicom/test/'
train_csv_path = './data/train.csv'

# --- Output locations ------------------------------------------------------
# NOTE(review): img_train_path is the only path rooted at '../' instead of
# './' -- confirm this is intentional and not a typo.
img_train_path = '../data/images/train/'
img_val_path = './data/images/val/'
img_test_path = './data/images/test/'
lbl_train_path = './data/labels/train/'
lbl_val_path = './data/labels/val/'
label_dir = './data/yolo-labels'
new_csv_path = './data/new_train.csv'

# Create every image/label output folder up front (no-op when it already exists).
for output_dir in (lbl_train_path, lbl_val_path, img_val_path, img_train_path, img_test_path):
    os.makedirs(output_dir, exist_ok=True)

In [90]:
# Reset: move every file currently in the validation image folder back into
# the training image folder (undoes the moves performed when paths are assigned).
# Optional check of a previously saved split:
#df = pd.read_csv(new_csv_path).drop_duplicates('image_id')
#df = df[df['fold'] == 4]
#df
for file in os.listdir(img_val_path):
    shutil.move(f'{img_val_path}{file}', img_train_path)

In [95]:
# Load the raw annotation table: one row per (image, annotation) pair.
train_df = pd.read_csv(filepath_or_buffer=train_csv_path)
train_df.head(n=5)

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max
0,50a418190bc3fb1ef1633bf9678929b3,No finding,14,R11,,,,
1,21a10246a5ec7af151081d0cd6d65dc9,No finding,14,R7,,,,
2,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,691.0,1375.0,1653.0,1831.0
3,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,1264.0,743.0,1611.0,1019.0
4,063319de25ce7edb9b1c6b8881290140,No finding,14,R10,,,,


In [96]:
# Number of distinct images in the annotation table.
len(pd.unique(train_df['image_id']))

15000

# Keep only the 14 finding classes (drop "No finding", class_id 14)

In [97]:
# class_id 14 is "No finding": those rows carry no bounding box, so drop them.
has_finding = train_df['class_id'].ne(14)
train_df = train_df.loc[has_finding].reset_index(drop=True)

In [98]:
# Number of distinct images that still have at least one annotation.
len(pd.unique(train_df['image_id']))

4394

# Split

In [99]:
# Index of the fold held out for validation; read by the later cells that
# route images and label files into the train/val folders.
fold = 4

gkf = GroupKFold(n_splits = 5)

# One row per image, in a deterministic order, so the split is per image
# rather than per annotation.
images = train_df.drop_duplicates('image_id')[['image_id']].sort_values('image_id').reset_index(drop = True)
images['fold'] = -1

# The loop variable must NOT be called `fold`: that would overwrite the
# validation-fold choice above with the last split index (which only happens
# to equal 4 because n_splits is 5).
for fold_idx, (_, val_idx) in enumerate(gkf.split(images, groups = images.image_id.tolist())):
    images.loc[val_idx, 'fold'] = fold_idx
images.head()

Unnamed: 0,image_id,fold
0,0005e8e3701dfb1dd93d53e2ff537b6e,3
1,0007d316f756b3fa0baea2ff514ce945,0
2,000d68e42b71d3eac10ccc077aba07c1,4
3,00150343289f317a0ad5629d5b7d9ef9,3
4,001d127bad87592efe45a5c7678f8b8d,2


In [100]:
# Images per fold.
images['fold'].value_counts()

0    879
1    879
2    879
3    879
4    878
Name: fold, dtype: int64

In [101]:
def set_file_paths(row):
    """Return the PNG path for one image row, moving validation images as a side effect.

    Parameters
    ----------
    row : mapping with ``image_id`` and ``fold`` entries (a row of ``images``).

    Returns
    -------
    str
        ``"MISSING"`` when the source DICOM file is absent (a message is also
        printed); otherwise the PNG path under the validation folder when the
        row's fold equals the module-level ``fold``, else under the training
        folder.

    Side effects
    ------------
    For validation rows, a PNG still sitting in the training folder is moved
    into the validation folder.
    """
    image_id = row.image_id
    dicom_file = dicom_train_path + image_id + '.dicom'
    if not os.path.exists(dicom_file):
        print(f'MISSING: {dicom_file}')
        return "MISSING"

    png_name = image_id + '.png'
    train_png = img_train_path + png_name

    # Training rows: nothing to move, just report where the PNG lives.
    if row['fold'] != fold:
        return train_png

    # Validation rows: relocate the PNG if it has not been moved yet.
    if os.path.exists(train_png):
        shutil.move(train_png, img_val_path)
    return img_val_path + png_name

In [102]:
# Resolve each image's PNG path (validation PNGs are moved as a side effect).
images['image_path'] = images.apply(func=set_file_paths, axis='columns')
images.head(n=5)

MISSING: ./data/images/dicom/train/31901b2d1d2eb92c235f47f498b907c0.dicom
MISSING: ./data/images/dicom/train/347180362348e522905047dde655b6d7.dicom
MISSING: ./data/images/dicom/train/3c98c90840a9290511ae8192107f8250.dicom
MISSING: ./data/images/dicom/train/4016b176eaf40d514b0559a0aeb48f21.dicom
MISSING: ./data/images/dicom/train/415f0f58066bad3d69bd5fac2c80574b.dicom
MISSING: ./data/images/dicom/train/42c049fd05428f7d606e9da4a95a8c3b.dicom
MISSING: ./data/images/dicom/train/47ed17dcb2cbeec15182ed335a8b5a9e.dicom
MISSING: ./data/images/dicom/train/4d01d09027d1d1e0513de4c8b4fc20e1.dicom
MISSING: ./data/images/dicom/train/6833c509c10b569d6167db70b83ee5d5.dicom
MISSING: ./data/images/dicom/train/6ce61a39f1e1bff629566de047ab8775.dicom
MISSING: ./data/images/dicom/train/7043451da60cb89537435829965759c3.dicom
MISSING: ./data/images/dicom/train/7653a1c4431f1929ae8c73588e39b8d5.dicom
MISSING: ./data/images/dicom/train/7c6f191b5d28bc1992e491d906f0d1a5.dicom
MISSING: ./data/images/dicom/train/924

Unnamed: 0,image_id,fold,image_path
0,0005e8e3701dfb1dd93d53e2ff537b6e,3,../data/images/train/0005e8e3701dfb1dd93d53e2ff537b6e.png
1,0007d316f756b3fa0baea2ff514ce945,0,../data/images/train/0007d316f756b3fa0baea2ff514ce945.png
2,000d68e42b71d3eac10ccc077aba07c1,4,./data/images/val/000d68e42b71d3eac10ccc077aba07c1.png
3,00150343289f317a0ad5629d5b7d9ef9,3,../data/images/train/00150343289f317a0ad5629d5b7d9ef9.png
4,001d127bad87592efe45a5c7678f8b8d,2,../data/images/train/001d127bad87592efe45a5c7678f8b8d.png


In [103]:
def dim_from_dicom(image_id):
    """Return ``(width, height)`` of a training DICOM image.

    Parameters
    ----------
    image_id : str
        File stem; the file read is ``<dicom_train_path>/<image_id>.dicom``.

    Returns
    -------
    tuple
        ``(Columns, Rows)`` as stored in the DICOM header. An element is
        ``None`` when the corresponding tag is absent.
    """
    path = os.path.join(dicom_train_path, image_id) + '.dicom'
    # Only header tags are needed, so stop before the (large) pixel data
    # instead of loading the whole file for every image.
    ds = pydicom.dcmread(path, stop_before_pixels=True)
    return ds.get('Columns'), ds.get('Rows')

In [104]:
# Drop images whose DICOM file was not found. Take an explicit copy so the
# column assignments that follow operate on an independent frame instead of a
# slice of the old one (avoids SettingWithCopyWarning).
images = images[images.image_path != 'MISSING'].copy()

In [105]:
# Read (width, height) from each DICOM header and attach them as two columns.
image_dims = images['image_id'].map(dim_from_dicom)
widths, heights = zip(*image_dims)
images['width'] = widths
images['height'] = heights

images.head(n=5)

Unnamed: 0,image_id,fold,image_path,width,height
0,0005e8e3701dfb1dd93d53e2ff537b6e,3,../data/images/train/0005e8e3701dfb1dd93d53e2ff537b6e.png,3072,3072
1,0007d316f756b3fa0baea2ff514ce945,0,../data/images/train/0007d316f756b3fa0baea2ff514ce945.png,2304,2880
2,000d68e42b71d3eac10ccc077aba07c1,4,./data/images/val/000d68e42b71d3eac10ccc077aba07c1.png,2304,2880
3,00150343289f317a0ad5629d5b7d9ef9,3,../data/images/train/00150343289f317a0ad5629d5b7d9ef9.png,2525,2508
4,001d127bad87592efe45a5c7678f8b8d,2,../data/images/train/001d127bad87592efe45a5c7678f8b8d.png,3072,3072


In [106]:
# https://github.com/pydicom/pydicom/issues/352#issuecomment-467595258
from skimage.transform import resize, rescale
def dicom2png(source_folder, output_folder):
    """Convert every DICOM file in ``source_folder`` to an 8-bit greyscale PNG.

    Each output is written to ``output_folder`` under the input's file name
    with its extension replaced by ``.png``. Pixel values are clipped at 0 and
    scaled so the brightest pixel maps to 255.

    A file that cannot be converted is reported (with the reason) and skipped;
    the remaining files are still processed.

    Parameters
    ----------
    source_folder : str
        Folder whose entries are read with ``pydicom.dcmread``.
    output_folder : str
        Existing folder that receives the PNG files.
    """
    for file in os.listdir(source_folder):
        try:
            # Swap only the final extension, so the PNG stem matches the DICOM
            # stem even if the name itself contains dots.
            filename = os.path.splitext(file)[0]

            ds = pydicom.dcmread(os.path.join(source_folder, file))

            # Convert to float to avoid overflow or underflow losses.
            image_2d = ds.pixel_array.astype(float)
            shape = image_2d.shape

            # Rescale grey levels to 0-255. An image with no positive pixel
            # would divide by zero (or by a negative maximum), so emit an
            # all-black image for it instead.
            peak = image_2d.max()
            if peak > 0:
                image_2d_scaled = ((np.maximum(image_2d, 0) / peak) * 255.0).astype(np.uint8)
            else:
                image_2d_scaled = np.zeros(shape, dtype=np.uint8)

            # Write the PNG file
            with open(os.path.join(output_folder, filename) + '.png', 'wb') as png_file:
                w = png.Writer(shape[1], shape[0], greyscale=True)
                w.write(png_file, image_2d_scaled)

        except Exception as exc:
            # Best effort: keep converting the other files, but say why this
            # one failed. KeyboardInterrupt is deliberately not swallowed.
            print('Could not convert: ', file, repr(exc))

In [107]:
%%time
#dicom2png(dicom_train_path, img_train_path)
#dicom2png(dicom_test_path, img_test_path)

Wall time: 0 ns


In [108]:
import cv2
def resizePng(source_folder, max_dim):
    """Resize every image in ``source_folder`` in place so its longer side is ``max_dim``.

    The aspect ratio is preserved and each file is overwritten with the
    resized image. Entries that OpenCV cannot decode (non-image files,
    sub-directories, corrupt images) are reported and skipped.

    Parameters
    ----------
    source_folder : str
        Folder containing the images to resize.
    max_dim : int
        Target length, in pixels, of the longer image side.
    """
    for file in os.listdir(source_folder):
        path = os.path.join(source_folder, file)
        img = cv2.imread(path, cv2.IMREAD_UNCHANGED)

        # cv2.imread signals failure by returning None rather than raising.
        if img is None:
            print('Could not read: ', file)
            continue

        # Only the two spatial axes count; a channel axis (if any) must not
        # take part in the "longest side" computation.
        src_height, src_width = img.shape[:2]
        scale = max_dim / max(src_height, src_width)

        # Truncation can yield 0 for very elongated images; keep at least 1 px.
        width = max(1, int(src_width * scale))
        height = max(1, int(src_height * scale))

        resized = cv2.resize(img, (width, height), interpolation = cv2.INTER_AREA)
        cv2.imwrite(path, resized)

In [109]:
%%time
#resizePng(img_test_path, 640)
#resizePng(img_train_path, 640)

Wall time: 0 ns


In [110]:
# Attach fold, width and height to every annotation row. Annotations whose
# image was dropped from `images` get NaN in those columns.
image_info = images.drop(columns='image_path')
train_df = train_df.merge(image_info, how='left', on='image_id')
train_df.head(n=5)

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,fold,width,height
0,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,691.0,1375.0,1653.0,1831.0,4.0,2080.0,2336.0
1,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,1264.0,743.0,1611.0,1019.0,0.0,2304.0,2880.0
2,1c32170b4af4ce1a3030eb8167753b06,Pleural thickening,11,R9,627.0,357.0,947.0,433.0,4.0,2540.0,3072.0
3,0c7a38f293d5f5e4846aa4ca6db4daf1,ILD,5,R17,1347.0,245.0,2188.0,2169.0,4.0,2285.0,2555.0
4,47ed17dcb2cbeec15182ed335a8b5a9e,Nodule/Mass,8,R9,557.0,2352.0,675.0,2484.0,,,


In [111]:
# Remove every annotation row that has a missing value in any column
# (e.g. rows whose image had no fold/width/height after the merge).
train_df = train_df.dropna(axis=0, how='any')
train_df.head(n=5)

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,fold,width,height
0,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,691.0,1375.0,1653.0,1831.0,4.0,2080.0,2336.0
1,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,1264.0,743.0,1611.0,1019.0,0.0,2304.0,2880.0
2,1c32170b4af4ce1a3030eb8167753b06,Pleural thickening,11,R9,627.0,357.0,947.0,433.0,4.0,2540.0,3072.0
3,0c7a38f293d5f5e4846aa4ca6db4daf1,ILD,5,R17,1347.0,245.0,2188.0,2169.0,4.0,2285.0,2555.0
5,d3637a1935a905b3c326af31389cb846,Aortic enlargement,0,R10,1329.0,743.0,1521.0,958.0,3.0,2304.0,2880.0


In [112]:
# Convert pixel box corners to fractions of the image size, then derive the
# centre/size form used for the label files. Plain column arithmetic gives the
# same values as a row-wise apply without a Python-level loop over every row.
# NOTE: this cell is not idempotent -- re-running it divides the already
# normalised corners by width/height a second time.
train_df['x_min'] = train_df['x_min'] / train_df['width']
train_df['y_min'] = train_df['y_min'] / train_df['height']

train_df['x_max'] = train_df['x_max'] / train_df['width']
train_df['y_max'] = train_df['y_max'] / train_df['height']

# Box centre.
train_df['x_mid'] = (train_df['x_max'] + train_df['x_min']) / 2
train_df['y_mid'] = (train_df['y_max'] + train_df['y_min']) / 2

# Box size.
train_df['w'] = train_df['x_max'] - train_df['x_min']
train_df['h'] = train_df['y_max'] - train_df['y_min']

train_df['area'] = train_df['w'] * train_df['h']
train_df.head()

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,fold,width,height,x_mid,y_mid,w,h,area
0,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,0.332212,0.588613,0.794712,0.783818,4.0,2080.0,2336.0,0.563462,0.686216,0.4625,0.195205,0.090283
1,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,0.548611,0.257986,0.699219,0.353819,0.0,2304.0,2880.0,0.623915,0.305903,0.150608,0.095833,0.014433
2,1c32170b4af4ce1a3030eb8167753b06,Pleural thickening,11,R9,0.24685,0.116211,0.372835,0.140951,4.0,2540.0,3072.0,0.309843,0.128581,0.125984,0.02474,0.003117
3,0c7a38f293d5f5e4846aa4ca6db4daf1,ILD,5,R17,0.589497,0.09589,0.957549,0.848924,4.0,2285.0,2555.0,0.773523,0.472407,0.368053,0.753033,0.277156
5,d3637a1935a905b3c326af31389cb846,Aortic enlargement,0,R10,0.576823,0.257986,0.660156,0.332639,3.0,2304.0,2880.0,0.61849,0.295312,0.083333,0.074653,0.006221


# Make Labels

In [114]:
# Write one label file per image: one line per box in the form
# "<class_id> <x_mid> <y_mid> <w> <h>". Files for the validation fold go to
# lbl_val_path, all others to lbl_train_path.

# Group the annotations once so each image's boxes are looked up directly
# instead of re-filtering the whole of train_df for every image.
# sort=False keeps the original row order inside each group.
annotations_by_image = {
    image_id: boxes for image_id, boxes in train_df.groupby('image_id', sort=False)
}

for image in images.itertuples(index=False):
    boxes = annotations_by_image.get(image.image_id)

    # The text is built in a dedicated variable; calling it `str` would shadow
    # the builtin for every cell executed afterwards.
    if boxes is None:
        label_text = ''  # image without annotations still gets an (empty) file
    else:
        label_text = ''.join(
            f'{box.class_id} {box.x_mid} {box.y_mid} {box.w} {box.h}\n'
            for box in boxes.itertuples(index=False)
        )

    target_dir = lbl_val_path if image.fold == fold else lbl_train_path
    with open(target_dir + image.image_id + ('.txt'), 'w') as f:
        f.write(label_text)

# Save to CSV

In [115]:
# Persist the processed annotations, with image_id written as the index column.
train_df.set_index(keys='image_id').to_csv(path_or_buf=new_csv_path)