In [1]:
import os
import cv2
import glob
import gdcm
import pydicom
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json

from tqdm.notebook import tqdm
from joblib import Parallel, delayed

# Load data

In [2]:
# Windows-style glob pattern: one wildcard directory level below dicom13,
# then every .dcm file inside it.
path = r"D:\Breast_Cancer_Detection\train_images\dicom13\*\*.dcm"
# glob.glob returns matches in arbitrary (filesystem-dependent) order; sorting
# makes positional slices such as train_images[5:7] select the same files on
# every run and on every machine.
train_images = sorted(glob.glob(path))
len(train_images)  # original note said 54706; the saved output shows 2538 -- TODO confirm the expected count

2538

# Crop image

In [4]:
def crop_image(img, show=True):
    """Keep only the largest bright region of ``img`` and crop to it.

    The image is binarized with a fixed threshold of 20, the external contour
    with the largest area is selected, every pixel outside that contour is set
    to zero, and the result is cropped to the contour's bounding box. The
    lower bounds of the box are multiplied by 0.99 and the upper bounds by
    1.01 (then truncated to int) to leave a small margin.

    Parameters
    ----------
    img : numpy.ndarray
        Image to process. Presumably a single-channel ``uint8`` array, which
        is what the callers in this notebook pass -- TODO confirm for any
        other caller.
    show : bool, default True
        When true, the cropped image is drawn with ``plt.imshow``.

    Returns
    -------
    numpy.ndarray
        The masked, cropped image. If no contour is found (no pixel above the
        threshold), ``img`` is returned unchanged.
    """
    # Binarize the image
    bin_pixels = cv2.threshold(img, 20, 255, cv2.THRESH_BINARY)[1]

    # Make contours around the binarized image
    contours, _ = cv2.findContours(bin_pixels, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    if not contours:
        # Nothing is brighter than the threshold, so there is nothing to crop
        # to. Without this guard max() below raises ValueError on the empty
        # sequence.
        if show:
            plt.imshow(img, cmap="gray")
        return img

    # Keep only the largest contour
    contour = max(contours, key=cv2.contourArea)

    # Create a mask from the largest contour
    mask = np.zeros(img.shape, np.uint8)
    cv2.drawContours(mask, [contour], -1, 255, cv2.FILLED)

    # Use bitwise_and to get masked part of the original image
    out = cv2.bitwise_and(img, mask)

    # Bounding box of the contour; contour points are stored as (x, y)
    y1, y2 = np.min(contour[:, :, 1]), np.max(contour[:, :, 1])
    x1, x2 = np.min(contour[:, :, 0]), np.max(contour[:, :, 0])

    # Widen the box slightly. Upper bounds that end up past the image edge
    # are harmless because NumPy slicing clamps them.
    x1 = int(0.99 * x1)
    x2 = int(1.01 * x2)
    y1 = int(0.99 * y1)
    y2 = int(1.01 * y2)

    # Slice once and reuse the result for both display and return
    cropped = out[y1:y2, x1:x2]

    if show:
        plt.imshow(cropped, cmap="gray")

    return cropped

In [5]:
# Try the cropping on two sample files.
for f in tqdm(train_images[5:7]):
    print(90*"=")
    # os.path copes with either path separator, unlike splitting on "\\"
    patient = os.path.basename(os.path.dirname(f))
    image = os.path.splitext(os.path.basename(f))[0]

    print(f"patient {patient}\n")

    dicom = pydicom.dcmread(f)
    # Work in float64 so the min/max arithmetic cannot overflow the stored
    # integer type.
    img = dicom.pixel_array.astype(np.float64)

    # Min-max scale to 0..255. A constant image has zero range; map it to all
    # zeros instead of dividing by zero (which would produce NaNs).
    low, high = img.min(), img.max()
    if high > low:
        img = (img - low) / (high - low)
    else:
        img = np.zeros(img.shape, dtype=np.float64)
    img = (img * 255).astype(np.uint8)

    if dicom.PhotometricInterpretation == "MONOCHROME1":
        # img is uint8 in 0..255 at this point, so the inversion must be
        # 255 - img; "1 - img" wraps around in uint8 and scrambles the
        # intensities instead of inverting them.
        img = 255 - img

    #plt.figure(figsize=(5, 5))
    #plt.imshow(img, cmap="gray")
    #plt.title(f"original image for {patient} {image}")
    #plt.show()

    img = crop_image(img, show=False)

    #plt.figure(figsize=(5, 5))
    #plt.imshow(img, cmap="gray")
    #plt.title(f"after text removal / cropping {patient} {image}")
    #plt.show()

    # Second pass over the already-cropped image; the result is not kept.
    crop_image(img, show=False)

  0%|          | 0/2 [00:00<?, ?it/s]

patient 10011

patient 10011



In [42]:
##### Resize 512,256

In [3]:
# Load every DICOM file, scale it to [0, 1] and normalise its polarity.
# The result is only consumed by the (currently disabled) plotting lines.
for f in tqdm(train_images):
    # os.path copes with either path separator, unlike splitting on "\\"
    patient = os.path.basename(os.path.dirname(f))
    image = os.path.splitext(os.path.basename(f))[0]

    dicom = pydicom.dcmread(f)
    # Work in float64 so the min/max arithmetic cannot overflow the stored
    # integer type.
    img = dicom.pixel_array.astype(np.float64)

    # Min-max scale to [0, 1]. A constant image has zero range; map it to all
    # zeros instead of dividing by zero (which would produce NaNs).
    low, high = img.min(), img.max()
    if high > low:
        img = (img - low) / (high - low)
    else:
        img = np.zeros(img.shape, dtype=np.float64)

    if dicom.PhotometricInterpretation == "MONOCHROME1":
        # img is a float in [0, 1] here, so 1 - img is a proper inversion
        img = 1 - img

    #plt.figure(figsize=(15, 15))
    #plt.imshow(img, cmap="gray")
    #plt.title(f"{patient} {image}")
    #plt.show()

  0%|          | 0/9220 [00:00<?, ?it/s]

KeyboardInterrupt: 

### Remove burned-in letters (text annotations)

In [3]:
def remove_charater(f,size = (512,512),save_folder=""):
    """Mask one DICOM image to its largest bright region and save it as PNG.

    Only the file ``f`` is processed. (Previously the argument was shadowed by
    a loop over the whole ``train_images`` list, so every call redid the full
    dataset.)

    Steps: read the DICOM, invert it when its photometric interpretation is
    ``MONOCHROME1``, rescale to ``uint8`` 0..255, binarize with a threshold of
    20, keep the external contour with the largest area, zero every pixel
    outside that contour, resize and write the result to
    ``save_folder + "<patient>_<image>.png"``.

    Parameters
    ----------
    f : str
        Path of the ``.dcm`` file. The parent directory name is used as the
        patient id and the file name without extension as the image id.
    size : tuple of int, default (512, 512)
        Target size handed to ``cv2.resize`` as ``dsize`` (width, height).
    save_folder : str, default ""
        Prefix prepended to the output file name by plain string
        concatenation; its directory part is created if missing.

    Returns
    -------
    numpy.ndarray
        The masked ``uint8`` image before resizing.

    Raises
    ------
    OSError
        If ``cv2.imwrite`` reports that the file could not be written.
    """
    # os.path copes with either path separator, unlike splitting on "\\"
    patient = os.path.basename(os.path.dirname(f))
    image = os.path.splitext(os.path.basename(f))[0]

    dicom = pydicom.dcmread(f)
    pixels = dicom.pixel_array

    # Both branches leave the minimum at 0; MONOCHROME1 is inverted as well.
    if dicom.PhotometricInterpretation == "MONOCHROME1":
        pixels = np.amax(pixels) - pixels
    else:
        pixels = pixels - np.min(pixels)

    peak = np.max(pixels)
    if peak != 0:
        pixels = (pixels / peak * 255).astype(np.uint8)
    else:
        # Constant image: still hand OpenCV a uint8 array, since the raw
        # DICOM dtype is not guaranteed to be accepted by the calls below.
        pixels = np.zeros(pixels.shape, np.uint8)

    # Binarize the image
    bin_pixels = cv2.threshold(pixels, 20, 255, cv2.THRESH_BINARY)[1]

    # Make contours around the binarized image, keep only the largest contour
    contours, _ = cv2.findContours(bin_pixels, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    if contours:
        contour = max(contours, key=cv2.contourArea)

        # Create a mask from the largest contour
        mask = np.zeros(pixels.shape, np.uint8)
        cv2.drawContours(mask, [contour], -1, 255, cv2.FILLED)

        # Use bitwise_and to get masked part of the original image
        out = cv2.bitwise_and(pixels, mask)
    else:
        # Nothing above the threshold: there is no region to isolate, so the
        # image is kept as it is (max() would raise on the empty sequence).
        out = pixels

    target = save_folder + f"{patient}_{image}.png"
    # cv2.imwrite does not create directories and only signals failure through
    # its return value, so create the folder and check the result explicitly.
    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)
    if not cv2.imwrite(target, cv2.resize(out, size)):
        raise OSError(f"cv2.imwrite could not write {target!r}")

    return out

In [4]:
# Output settings for the export step.
SAVE_FOLDER = "output/"   # prefix prepended to every output file name
SIZE = (512,512)          # target size of the exported images
EXTENSION = "png"         # NOTE(review): not read by any cell visible here

# cv2.imwrite does not create missing directories; it just returns False and
# writes nothing. Make sure the output folder exists before the export runs.
os.makedirs(SAVE_FOLDER, exist_ok=True)

In [None]:
# Spread the export over four joblib workers, one task per DICOM path.
# The return values are collected into `_` only to keep the cell output quiet.
pool = Parallel(n_jobs=4)
_ = pool(
    delayed(remove_charater)(dicom_path, size=SIZE, save_folder=SAVE_FOLDER)
    for dicom_path in tqdm(train_images[:])
)

  0%|          | 0/2538 [00:00<?, ?it/s]