In [109]:
import os
from typing import List, Tuple

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.ndimage import binary_fill_holes
from skimage import color, io, measure, morphology
from skimage.filters import threshold_otsu
from tqdm import tqdm

In [110]:
def get_padding(image, size) -> Tuple[Tuple[int, int], Tuple[int, int]]:
    """Return the ``np.pad`` widths that centre ``image`` on a canvas of ``size``.

    Args:
        image: Array whose first two ``shape`` entries are compared with ``size``.
        size: Target extent as ``(length along axis 0, length along axis 1)``.

    Returns:
        ``((before_0, after_0), (before_1, after_1))``. When the amount to add
        along an axis is odd, the extra pixel goes to the "before" side.

    Raises:
        ValueError: If ``image`` is larger than ``size`` along either axis.
            Negative pad widths are not meaningful to ``np.pad``, so this is
            reported here with a message that names the offending shapes.
    """
    widths = []
    for axis in (0, 1):
        missing = size[axis] - image.shape[axis]
        if missing < 0:
            raise ValueError(
                f"image shape {tuple(image.shape[:2])} exceeds target size "
                f"{tuple(size)}; cannot pad"
            )
        # Floor half goes after, the remainder (ceil half) goes before.
        after = int(missing // 2)
        before = int(missing - after)
        widths.append((before, after))

    return tuple(widths)

In [111]:
def process_and_crop_image(image_path, output_folder, margin=10, pad_image_to_size=(256, 256)):
    """Crop an image to its largest above-threshold region, pad it, and save it.

    The image is read as grayscale and thresholded with Otsu's method; small
    objects are removed and holes filled. The bounding box of the largest
    remaining connected region is grown by ``margin`` pixels on every side
    (clipped to the image bounds) and cropped out. The crop is zero-padded
    with the widths from ``get_padding`` and written as uint8 to
    ``output_folder`` under the same file name as the input.

    Args:
        image_path: Path of the image to read.
        output_folder: Directory the cropped image is written to.
        margin: Number of pixels added around the region's bounding box.
        pad_image_to_size: Target ``(rows, cols)`` passed to ``get_padding``.

    Returns:
        None. Any exception is caught and reported with ``print`` so that a
        batch run over many files keeps going.
    """
    try:
        image = io.imread(image_path, as_gray=True)

        # Build a foreground mask: Otsu threshold, drop specks, fill holes.
        thresh = threshold_otsu(image)
        binary = image > thresh
        cleaned = morphology.remove_small_objects(binary, min_size=150)
        filled_image = binary_fill_holes(cleaned)
        label_img = measure.label(filled_image)
        regions = measure.regionprops(label_img)

        if not regions:
            raise ValueError("Could not find any regions")

        region_max = max(regions, key=lambda r: r.area)

        # bbox is (min_row, min_col, max_row, max_col); shape is (rows, cols).
        minr, minc, maxr, maxc = region_max.bbox
        n_rows, n_cols = image.shape
        # Grow the box by `margin` on all four sides, clipped to the image.
        minr = max(0, minr - margin)
        minc = max(0, minc - margin)
        maxr = min(n_rows, maxr + margin)
        maxc = min(n_cols, maxc + margin)

        cropped_image = image[minr:maxr, minc:maxc]
        cropped_image = np.pad(cropped_image, get_padding(cropped_image, pad_image_to_size), 'constant')

        # as_gray=True converts colour inputs to floats in [0, 1]; a bare uint8
        # cast would truncate those to 0, so rescale them to 0..255 first.
        # NOTE(review): integer inputs wider than 8 bits are still cast as-is
        # (values wrap modulo 256) — confirm the source PNGs are 8-bit.
        if cropped_image.dtype.kind == 'f' and cropped_image.max() <= 1.0:
            cropped_image = (cropped_image * 255).round()

        filename = os.path.basename(image_path)
        cropped_image_path = os.path.join(output_folder, f"{filename}")
        io.imsave(cropped_image_path, cropped_image.astype(np.uint8))
    except Exception as e:
        # Best-effort batch processing: report the failure and move on.
        print(f"Error processing {image_path}: {e}")

In [None]:
# Crop every image listed in train.csv and write the result to output_folder.
output_folder = '../data/cropped/'
input_folder = '../data/rsna_breast_cancer'

processed_images_paths = []
os.makedirs(output_folder, exist_ok=True)

df = pd.read_csv('../train.csv')

# Iterate the two id columns directly rather than via iterrows(): iterrows()
# builds one Series per row and does not preserve column dtypes, so integer
# ids can come back as floats and yield names like "123.0_456.0.png".
# Passing total= gives tqdm a length, so the bar shows progress and an ETA.
for patient_id, image_id in tqdm(zip(df['patient_id'], df['image_id']), total=len(df)):
    image_name = f"{patient_id}_{image_id}.png"
    image_path = os.path.join(input_folder, image_name)
    process_and_crop_image(image_path, output_folder)