# RSNA: Cut Off Empty Space from Images

This notebook generates images with all empty space removed. 
This magnifies important parts of the data and helps to train better models with the same compute. 

Note that extra artifacts such as labels and noise are also removed from the images!

It uses pregenerated PNGs from https://www.kaggle.com/code/theoviel/dicom-resized-png-jpg as input.

So if you want to use this preprocessing at inference time:
1. Run the code from https://www.kaggle.com/code/theoviel/dicom-resized-png-jpg on the test data first.
2. Run the code from this notebook on the output of step 1.

See https://www.kaggle.com/vslaykovsky/infer-effnetv2-aux-targets-weighted-loss-thres for an example use in inference. 


**UPD**



**Upvote if you think this is awesome!**

In [None]:
from concurrent.futures import ProcessPoolExecutor
import cv2
import glob
import numpy as np
from tqdm import tqdm
import os
from PIL import Image
import matplotlib.pyplot as plt
import re


# When True, the batch-processing cell crops only a random sample of 100 input images
# instead of the whole dataset.
DEBUG = False
 

In [None]:
def fit_image(fname, frame=5, threshold=20):
    """Crop an image to its largest non-empty region and save the crop.

    The image is read with OpenCV, a thin border is trimmed, pixels brighter
    than ``threshold`` are grouped into connected regions, and the bounding
    box of the region with the largest area is cut out. Only channel 0 is
    used for detection and written to disk. The result is saved to
    ``<patient_id>/<image_id>.png`` relative to the current working directory.

    Args:
        fname: Path to a source image named ``<patient_id>_<image_id>.png``.
        frame: Number of pixels trimmed from every side before detection.
        threshold: Pixel values above this are treated as non-empty.

    Raises:
        ValueError: If the file name does not contain ``<digits>_<digits>.png``.
        FileNotFoundError: If OpenCV cannot read the image.
        OSError: If the cropped image cannot be written.
    """
    # Validate the name first so a bad path fails before any image work is done.
    match = re.search(r'(\d+)_(\d+)\.png', os.path.basename(fname))
    if match is None:
        raise ValueError(f'Expected a file named <patient_id>_<image_id>.png, got: {fname}')
    patient_id, im_id = match.groups()

    X = cv2.imread(fname)
    if X is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f'Could not read image: {fname}')

    # Some images have narrow exterior "frames" that complicate selection of the main data.
    # Explicit end indices (rather than -frame) keep frame=0 from producing an empty slice.
    height, width = X.shape[:2]
    X = X[frame: height - frame, frame: width - frame]

    # Regions of non-empty pixels.
    mask = (X[:, :, 0] > threshold).astype(np.uint8)
    output = cv2.connectedComponentsWithStats(mask, 8, cv2.CV_32S)

    # stats.shape == (N, 5), where N is the number of regions (row 0 is the background);
    # the 5 columns are: left, top, width, height, area_size.
    stats = output[2]

    if stats.shape[0] > 1:
        # The largest foreground region is assumed to be the breast.
        idx = stats[1:, cv2.CC_STAT_AREA].argmax() + 1
        x1, y1, w, h = stats[idx][:4]
        X_fit = X[y1: y1 + h, x1: x1 + w]
    else:
        # No pixel exceeded the threshold: keep the frame-trimmed image uncropped so
        # that every input still produces an output file.
        X_fit = X

    os.makedirs(patient_id, exist_ok=True)
    out_path = f'{patient_id}/{im_id}.png'
    if not cv2.imwrite(out_path, X_fit[:, :, 0]):
        raise OSError(f'Could not write image: {out_path}')

def fit_all_images(all_images, max_workers=4):
    """Run ``fit_image`` over every path in ``all_images`` using a process pool.

    Args:
        all_images: Sized collection of source image paths (``len()`` is used
            for the progress bar total).
        max_workers: Number of worker processes; defaults to 4.

    An exception raised by ``fit_image`` in a worker is re-raised here when
    its result is consumed.
    """
    with ProcessPoolExecutor(max_workers) as pool:
        # Results are None; the loop only drains the iterator so the progress bar
        # advances and worker exceptions surface.
        for _ in tqdm(pool.map(fit_image, all_images), total=len(all_images)):
            pass


# Smoke test on one image: crop it, then read the written file back as an array.
# The trailing bare expression is what the notebook cell displays.
sample_png = '/kaggle/input/rsna-breast-cancer-512-pngs/10006_462822612.png'
fit_image(sample_png)
np.array(Image.open('10006/462822612.png'))

In [None]:

# Collect the source PNGs only: fit_image requires <patient_id>_<image_id>.png names,
# so any other file in the input directory would make it fail.
all_images = glob.glob('/kaggle/input/rsna-breast-cancer-512-pngs/*.png')
if DEBUG:
    # Sample without replacement so no image is processed (and written) twice,
    # and cap the sample size at the number of available images.
    all_images = np.random.choice(all_images, size=min(100, len(all_images)), replace=False)
fit_all_images(all_images)

In [None]:
# Fixed seed so the same sample is drawn on every run.
np.random.seed(123)

# Show randomly chosen cropped outputs side by side with their source images.
for fname in np.random.choice(glob.glob('*/*.png'), size=100):
    # Output files are laid out as <patient_id>/<image_id>.png.
    patient_id, im_id = re.findall(r'(\d+)/(\d+)\.png', fname)[0]

    # Keep explicit axes handles instead of calling plt.subplot() repeatedly.
    fig, (ax_out, ax_src) = plt.subplots(1, 2, figsize=(20, 10))
    fig.suptitle(f'[{fname}]')

    # convert() returns a fully loaded copy, so the file handle can be closed right away.
    with Image.open(fname) as cropped_file:
        im1 = cropped_file.convert('F')
    ax_out.imshow(im1)
    ax_out.set_title(f'Output image {im1.size}')

    with Image.open(f'/kaggle/input/rsna-breast-cancer-512-pngs/{patient_id}_{im_id}.png') as source_file:
        im2 = source_file.convert('F')
    ax_src.imshow(im2)
    ax_src.set_title(f'Source image {im2.size}')

In [None]:
import pandas as pd
# One row per generated crop: 'y' and 'x' are the first two entries of the array shape
# returned by cv2.imread (height and width), 'c' is the third (channels).
crop_paths = glob.glob('*/*.png')
df = pd.DataFrame(
    [dict(zip(('y', 'x', 'c'), cv2.imread(path).shape)) for path in crop_paths]
)

In [None]:
# Overlaid histograms of the crop widths ('x') and heights ('y').
df[['x', 'y']].plot(kind='hist', bins=100, alpha=0.7)