In [None]:
import SimpleITK as sitk
import os
import multiprocessing

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm

# Create one output directory per target width, relative to the current
# working directory. exist_ok=True keeps this cell re-runnable: a plain
# `mkdir` reports an error when the directories are already there.
for _width in (256, 512, 1024):
    os.makedirs(f"img_{_width}", exist_ok=True)
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Run this cell first: it imports the libraries and creates the output directories.

In [None]:
def convert_image(input_file_name, output_file_name, new_width=None):
    """Convert one DICOM file to a regular image file, optionally resized.

    The file is read through SimpleITK's GDCM image IO. When ``new_width`` is
    given, the image is linearly resampled to that width with isotropic
    output spacing, so the height follows from the physical extent of the
    original. Single-component images are rescaled to the 0-255 range,
    inverted when the photometric interpretation is MONOCHROME1, and cast to
    8-bit unsigned before being written.

    Args:
        input_file_name: Path of the DICOM file to read.
        output_file_name: Path of the image to write; it is passed straight to
            ``sitk.WriteImage``.
        new_width: Target width in pixels. ``None`` (or any falsy value)
            keeps the original size.

    Returns:
        ``True`` when the image was written, ``False`` when any step raised
        an ordinary exception. The failure is reported on stderr.

    Raises:
        KeyboardInterrupt, SystemExit: These are deliberately not swallowed,
            so an interrupted run actually stops.
    """
    import sys  # local import: only needed for the failure report below

    try:
        image_file_reader = sitk.ImageFileReader()
        image_file_reader.SetImageIO("GDCMImageIO")
        image_file_reader.SetFileName(input_file_name)
        image_file_reader.ReadImageInformation()

        # A 3D image with a single slice is read as 2D: an extract size of 0
        # on the third axis asks the reader to drop that dimension.
        image_size = list(image_file_reader.GetSize())
        if len(image_size) == 3 and image_size[2] == 1:
            image_size[2] = 0
        image_file_reader.SetExtractSize(image_size)
        image = image_file_reader.Execute()

        if new_width:
            original_size = image.GetSize()
            original_spacing = image.GetSpacing()
            # Same spacing on both axes, chosen so that new_width pixels span
            # the original physical width.
            new_spacing = [
                (original_size[0] - 1) * original_spacing[0] / (new_width - 1)
            ] * 2
            new_size = [
                new_width,
                int(
                    (original_size[1] - 1)
                    * original_spacing[1]
                    / new_spacing[1]
                ),
            ]
            image = sitk.Resample(
                image1=image,
                size=new_size,
                transform=sitk.Transform(),
                interpolator=sitk.sitkLinear,
                outputOrigin=image.GetOrigin(),
                outputSpacing=new_spacing,
                outputDirection=image.GetDirection(),
                defaultPixelValue=0,
                outputPixelType=image.GetPixelID(),
            )

        # Only single-component images are intensity-normalised;
        # multi-component images are written unchanged.
        if image.GetNumberOfComponentsPerPixel() == 1:
            image = sitk.RescaleIntensity(image, 0, 255)
            # Tag 0028|0004 is the photometric interpretation. MONOCHROME1
            # images are inverted here.
            if (
                image_file_reader.GetMetaData("0028|0004").strip() == "MONOCHROME1"
            ):
                image = sitk.InvertIntensity(image, maximum=255)
            image = sitk.Cast(image, sitk.sitkUInt8)

        sitk.WriteImage(image, output_file_name)
        print(output_file_name)
        return True
    except Exception as exc:
        # Best effort: one bad file must not abort a batch, but the failure
        # should be visible. Catching Exception rather than BaseException lets
        # KeyboardInterrupt/SystemExit propagate.
        print(
            f"convert_image failed for {input_file_name}: {exc!r}",
            file=sys.stderr,
        )
        return False

In [None]:
def preprocess(row, scale=512):
    """Convert the training image described by one ``DataFrame.iterrows()`` item.

    Args:
        row: An ``(index, record)`` pair; ``record`` must support lookup of
            ``"patient_id"`` and ``"image_id"``.
        scale: Target image width in pixels. It also selects the output
            directory ``/kaggle/working/img_{scale}/``, which must already
            exist. Defaults to 512.

    Returns:
        The status returned by ``convert_image``: ``True`` when the JPEG was
        written, ``False`` when the conversion failed.
    """
    record = row[1]  # drop the index half of the (index, record) pair
    patient_id = record["patient_id"]
    image_id = record["image_id"]
    input_file_name = f"/kaggle/input/rsna-breast-cancer-detection/train_images/{patient_id}/{image_id}.dcm"
    output_file_name = f"/kaggle/working/img_{scale}/{patient_id}_{image_id}.jpg"
    # Hand the status back so the caller can count failed conversions.
    return convert_image(input_file_name, output_file_name, scale)

In [None]:
# Training metadata; preprocess() reads the patient_id and image_id columns
# of each row to locate the matching DICOM file.
train_csv_path = "/kaggle/input/rsna-breast-cancer-detection/train.csv"
df = pd.read_csv(train_csv_path)

In [None]:
# Convert every row of df in parallel, one worker process per CPU core.
# Results arrive in completion order; they are only drained so that tqdm
# can advance the progress bar.
n_workers = multiprocessing.cpu_count()
with multiprocessing.Pool(n_workers) as pool:
    finished = pool.imap_unordered(preprocess, df.iterrows())
    for _ in tqdm(finished, total=len(df)):
        pass

In [None]:
!zip -r img.zip img

In [None]:
nan