In [1]:
# from https://www.kaggle.com/code/xhlulu/siim-covid-19-convert-to-jpg-256px?scriptVersionId=63196459
import os

from PIL import Image
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut


In [2]:
image_size = 512

In [5]:
def read_xray(path, voi_lut = True, fix_monochrome = True):
    dicom = pydicom.read_file(path)

    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array
               
    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data
        
    data = data - np.min(data)
    data = data / np.max(data)
    data = (data * 255).astype(np.uint8)
        
    return data

In [4]:
def resize(array, size, keep_ratio=False, resample=Image.LANCZOS):
    # Original from: https://www.kaggle.com/xhlulu/vinbigdata-process-and-resize-to-image
    im = Image.fromarray(array)
    
    if keep_ratio:
        im.thumbnail((size, size), resample)
    else:
        im = im.resize((size, size), resample)
    
    return im

In [7]:
image_id = []
dim0 = []
dim1 = []
splits = []
study_id = []
for split in ['test', 'train']:
    save_dir = f'./tmp_{image_size}/{split}/'

    # os.makedirs(save_dir, exist_ok=True)
    
    for dirname, _, filenames in tqdm(os.walk(f'./siim-covid19-detection/{split}')):
        for file in filenames:
            # set keep_ratio=True to have original aspect ratio
            xray = read_xray(os.path.join(dirname, file))
            # im = resize(xray, size=image_size)  
            # im.save(os.path.join(save_dir, file.replace('dcm', 'png')))

            image_id.append(file.replace('.dcm', ''))
            dim0.append(xray.shape[0])
            dim1.append(xray.shape[1])
            splits.append(split)
            study_id.append(dirname.split('/')[-2])

0it [00:00, ?it/s]

0it [00:00, ?it/s]



In [7]:
os.path.join(dirname, file)

'./siim-covid19-detection/train/4d7a2748dfb1/49f74cf4cbad/0049814626c8.dcm'

In [9]:
df = pd.DataFrame.from_dict({'image_id': image_id, 'dim0': dim0, 'dim1': dim1, 'split': splits,'study_id': study_id})
df.to_csv(f'meta_{image_size}.csv', index=False)

In [9]:
df = pd.read_csv('meta.csv')
from numpy import NaN
train_label = pd.read_csv('./siim-covid19-detection/train_image_level.csv')

In [10]:
train_label.id = train_label.id.str.replace('_image','')
train_label

Unnamed: 0,id,boxes,label,StudyInstanceUID
0,000a312787f2,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75
1,000c3a3f293f,,none 1 0 0 1 1,ff0879eb20ed
2,0012ff7358bc,"[{'x': 677.42216, 'y': 197.97662, 'width': 867...",opacity 1 677.42216 197.97662 1545.21983 1197....,9d514ce429a7
3,001398f4ff4f,"[{'x': 2729, 'y': 2181.33331, 'width': 948.000...",opacity 1 2729 2181.33331 3677.00012 2785.33331,28dddc8559b2
4,001bd15d1891,"[{'x': 623.23328, 'y': 1050, 'width': 714, 'he...",opacity 1 623.23328 1050 1337.23328 2156 opaci...,dfd9fdd85a3e
...,...,...,...,...
6329,ffcc6edd9445,,none 1 0 0 1 1,7e6c68462e06
6330,ffd91a2c4ca0,,none 1 0 0 1 1,8332bdaddb6e
6331,ffd9b6cf2961,"[{'x': 2197.38566, 'y': 841.07361, 'width': 31...",opacity 1 2197.38566 841.07361 2513.80265 1292...,7eed9af03814
6332,ffdc682f7680,"[{'x': 2729.27083, 'y': 332.26044, 'width': 14...",opacity 1 2729.27083 332.26044 4225.52099 2936...,a0cb0b96fb3d


In [11]:
train_label = train_label.merge(df, left_on='id',right_on='image_id', how='left')
train_label.head()

Unnamed: 0,id,boxes,label,StudyInstanceUID,image_id,dim0,dim1,split
0,000a312787f2,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75,000a312787f2,3488,4256,train
1,000c3a3f293f,,none 1 0 0 1 1,ff0879eb20ed,000c3a3f293f,2320,2832,train
2,0012ff7358bc,"[{'x': 677.42216, 'y': 197.97662, 'width': 867...",opacity 1 677.42216 197.97662 1545.21983 1197....,9d514ce429a7,0012ff7358bc,2544,3056,train
3,001398f4ff4f,"[{'x': 2729, 'y': 2181.33331, 'width': 948.000...",opacity 1 2729 2181.33331 3677.00012 2785.33331,28dddc8559b2,001398f4ff4f,3520,4280,train
4,001bd15d1891,"[{'x': 623.23328, 'y': 1050, 'width': 714, 'he...",opacity 1 623.23328 1050 1337.23328 2156 opaci...,dfd9fdd85a3e,001bd15d1891,2800,3408,train


In [12]:
IMG_SIZE = image_size

def get_bbox(row):
    bboxes = []
    bbox = []
    labels  = row.label.split(' ')
    bboxes = []
    for i in range(0,len(labels), 6):
        # print(labels[i+2:i+6])
        bboxes.append(list(map(float,labels[i+2:i+6])))
    return np.array(bboxes)

        
def scale(row,bboxes):
    scalex = IMG_SIZE/row.dim1
    scaley = IMG_SIZE/row.dim0
    scaled_bboxes = np.zeros_like(bboxes)
    scaled_bboxes[:,[0,2]] = bboxes[:,[0,2]]*scalex
    scaled_bboxes[:,[1,3]] = bboxes[:,[1,3]]*scaley
    return scaled_bboxes

#boxes boundaries may be float
def boxes_to_image(bboxes):
    img = np.zeros((IMG_SIZE,IMG_SIZE),dtype=int)
    for box in bboxes:
        x1 = int(box[0])
        x2 = int(box[2])
        y1 = int(box[1])
        y2 = int(box[3])
        img[y1:y2,x1:x2] = 255
    return img

In [13]:
import matplotlib.pyplot as plt
from PIL import ImageDraw
save_dir = f'./tmp_{IMG_SIZE}/train_mask/'
def gen_mask_image(df):
    os.makedirs(f'./tmp_{IMG_SIZE}/train_mask/',exist_ok=True)
    for i in tqdm(range(len(df))):
        row = df.loc[i]
        label = row.label.split(' ')[0]
        if label == 'opacity':
            bboxes = get_bbox(row)
            bboxes = scale(row,bboxes)
            # print(bboxes)
            # mask = boxes_to_image(bboxes)
            # mask = resize(mask, size=IMG_SIZE)
            mask = np.zeros((IMG_SIZE,IMG_SIZE),dtype=int)
            # print(mask[mask.nonzero()])
            # break
            
            im = Image.fromarray(mask,mode='L')
            # print(np.array(list(im.getdata(0))).nonzero())
            im = im.convert('RGB')
            for box in bboxes:
                # print(list(box))
                ImageDraw.Draw(im).rectangle(list(box),fill=0xffffff)
            # plt.imshow(im)
            im.save(save_dir+row.id.split('_')[0]+'.png')
            # break
        else:
            mask = np.zeros((IMG_SIZE,IMG_SIZE),dtype=int)
            im = Image.fromarray(mask,mode='L')
            im = im.convert('RGB')
            im.save(save_dir+row.id.split('_')[0]+'.png')

In [14]:
gen_mask_image(train_label)

  0%|          | 0/6334 [00:00<?, ?it/s]

NameError: name 'im' is not defined

In [28]:
train_label

Unnamed: 0,id,boxes,label,StudyInstanceUID
0,000a312787f2_image,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75
1,000c3a3f293f_image,,none 1 0 0 1 1,ff0879eb20ed
2,0012ff7358bc_image,"[{'x': 677.42216, 'y': 197.97662, 'width': 867...",opacity 1 677.42216 197.97662 1545.21983 1197....,9d514ce429a7
3,001398f4ff4f_image,"[{'x': 2729, 'y': 2181.33331, 'width': 948.000...",opacity 1 2729 2181.33331 3677.00012 2785.33331,28dddc8559b2
4,001bd15d1891_image,"[{'x': 623.23328, 'y': 1050, 'width': 714, 'he...",opacity 1 623.23328 1050 1337.23328 2156 opaci...,dfd9fdd85a3e
...,...,...,...,...
6329,ffcc6edd9445_image,,none 1 0 0 1 1,7e6c68462e06
6330,ffd91a2c4ca0_image,,none 1 0 0 1 1,8332bdaddb6e
6331,ffd9b6cf2961_image,"[{'x': 2197.38566, 'y': 841.07361, 'width': 31...",opacity 1 2197.38566 841.07361 2513.80265 1292...,7eed9af03814
6332,ffdc682f7680_image,"[{'x': 2729.27083, 'y': 332.26044, 'width': 14...",opacity 1 2729.27083 332.26044 4225.52099 2936...,a0cb0b96fb3d


In [30]:
df

Unnamed: 0,image_id,dim0,dim1,split
0,ab15692269fc,2800,3408,test
1,bc72a19f5d87,2800,2874,test
2,05435da60872,2320,2832,test
3,7e3c0527ddb7,1958,1960,test
4,c5a76564ef83,3480,4240,test
...,...,...,...,...
7592,12c435a26db5,2800,3408,train
7593,a06bc359b5fb,2800,3408,train
7594,4ed22347ea95,3480,4248,train
7595,f071f42fe94e,3198,3056,train
