In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import cv2
import glob
import time
import pydicom
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

import dask as dd
import dask.array as da
from dask.distributed import Client, progress

# Show what the sample-data directory contains (DICOM files plus the RLE csv)
sample_dir_contents = os.listdir("../input/sample images")
print(sample_dir_contents)

['1.2.276.0.7230010.3.1.4.8323329.1000.1517875165.878027.dcm', '1.2.276.0.7230010.3.1.4.8323329.10002.1517875220.939397.dcm', '1.2.276.0.7230010.3.1.4.8323329.4982.1517875185.837576.dcm', '1.2.276.0.7230010.3.1.4.8323329.12743.1517875241.599591.dcm', '1.2.276.0.7230010.3.1.4.8323329.4440.1517875182.865105.dcm', '1.2.276.0.7230010.3.1.4.8323329.10003.1517875220.942420.dcm', '1.2.276.0.7230010.3.1.4.8323329.4904.1517875185.355709.dcm', '1.2.276.0.7230010.3.1.4.8323329.1314.1517875167.222290.dcm', 'train-rle-sample.csv', '1.2.276.0.7230010.3.1.4.8323329.10001.1517875220.930580.dcm', '1.2.276.0.7230010.3.1.4.8323329.10000.1517875220.938530.dcm']


In [2]:
# Directory that holds the sample DICOM images
data_dir = Path('../input/sample images/')

# Walk the directory tree and collect every file ending in .dcm
dcm_pattern = "**/*.dcm"
all_files = [dcm_path for dcm_path in data_dir.glob(dcm_pattern)]

print("Number of dcm files found: ", len(all_files))

Number of dcm files found:  10


In [3]:
# Define the path to output directory
outdir = "./processed_images/"

# Make the directory. exist_ok=True makes this cell safe to re-run and avoids
# the check-then-create race of a separate os.path.exists() test; makedirs
# also creates any missing parent directories.
os.makedirs(outdir, exist_ok=True)

In [4]:
# Convert DICOM to JPG/PNG via openCV
def convert_images(filename, img_type='jpg'):
    """Read a dcm file and save its pixel data as a jpg/png image.

    The image is written into the module-level ``outdir`` directory. The
    output keeps the input file's name with a trailing ``.dcm`` extension
    swapped for ``.jpg``/``.png``; if the input has no ``.dcm`` extension the
    image extension is appended to the full name instead.

    Args:
        filename: path to the dcm file (``pathlib.Path`` or ``str``)
        img_type: format of the processed file; ``"jpg"`` writes a jpg,
            any other value writes a png

    Raises:
        IOError: if OpenCV reports that the image could not be written

    """
    # accept plain strings as well as Path objects
    filename = Path(filename)

    # read the dcm file (dcmread is the supported name; read_file is a
    # deprecated alias)
    ds = pydicom.dcmread(str(filename))
    img = ds.pixel_array

    # Strip only a trailing ".dcm" extension. Names without it (e.g. bare
    # DICOM UIDs) are kept whole so their last dotted component is not lost.
    if filename.suffix.lower() == '.dcm':
        base_name = filename.stem
    else:
        base_name = filename.name

    # save the image as jpg/png
    extension = '.jpg' if img_type == "jpg" else '.png'
    out_path = os.path.join(outdir, base_name + extension)

    # cv2.imwrite reports failure through its boolean return value rather
    # than an exception, so check it instead of silently losing the image
    if not cv2.imwrite(out_path, img):
        raise IOError("Could not write image to " + out_path)

In [5]:
# Repeat the file list 1000x just to get a workload big enough to time.
# Note: re-running this cell multiplies the list by 1000 again.
all_files *= 1000
print("Total number of files: ", len(all_files))

Total number of files:  10000


In [6]:
# Baseline: convert the files one after another in a plain for loop
t = time.time()
for dcm_path in all_files:
    convert_images(dcm_path)
elapsed = time.time() - t
print("Time taken : ", elapsed)

Time taken :  153.49075484275818


In [7]:
# Using dask: wrap each conversion in a lazy task, then execute them together.
# NOTE(review): no scheduler is specified here, so dask presumably uses its
# default scheduler for delayed objects -- confirm which one before reading
# too much into the "all cores" label below.
all_images = [dd.delayed(convert_images)(dcm_path) for dcm_path in all_files]

t = time.time()
dd.compute(all_images)
elapsed = time.time() - t
print("Time taken when using all cores: ", elapsed)

Time taken when using all cores:  61.7934250831604


In [8]:
# Confirm that all 10 original images were saved (only 10 files exist because every repeated copy of a file is written to the same output name)
! ls ./processed_images/* | wc -l

10


There is still plenty of room left to make this even faster!