In [None]:
# Install offline wheels: dicomsdl (DICOM reader), a pinned numpy build, and two libraries for handling jpeg2000 files (pylibjpeg and python-gdcm)
!pip install --no-index --no-deps /kaggle/input/extrapackages-dicomsdl-gdcm-pylibjpeg/wheelhouse/dicomsdl-0.109.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
!pip install --no-index --no-deps /kaggle/input/extrapackages-dicomsdl-gdcm-pylibjpeg/wheelhouse/numpy-1.21.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
!pip install --no-index --no-deps /kaggle/input/extrapackages-dicomsdl-gdcm-pylibjpeg/wheelhouse/pylibjpeg-1.4.0-py3-none-any.whl
!pip install --no-index --no-deps /kaggle/input/extrapackages-dicomsdl-gdcm-pylibjpeg/wheelhouse/python_gdcm-3.0.21-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for making plots
from tqdm.notebook import tqdm
import sys
from joblib import Parallel, delayed
from multiprocessing import cpu_count
import dicomsdl

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

import glob

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the train and test metadata tables from the competition CSV files as pandas DataFrames.
csv_train = pd.read_csv("/kaggle/input/rsna-breast-cancer-detection/train.csv")
csv_test = pd.read_csv("/kaggle/input/rsna-breast-cancer-detection/test.csv")

# Root folders of the train / test image sets (same dataset directory as the CSVs).
train_images_folder = "/kaggle/input/rsna-breast-cancer-detection/train_images"
test_images_folder = "/kaggle/input/rsna-breast-cancer-detection/test_images"

In [None]:
# Work on a copy of the test metadata; rows whose age is missing receive the mean age
# of the test table.
data_test = csv_test.assign(
    age=lambda frame: frame['age'].fillna(frame['age'].mean())
)

In [None]:
# Collect the path of every test DICOM file (one per row of data_test) for later iteration.
# Files are laid out as <image dir>/<patient_id>/<image_id>.dcm.

train_im_dir = '/kaggle/input/rsna-breast-cancer-detection/train_images/'
test_im_dir = '/kaggle/input/rsna-breast-cancer-detection/test_images/'

path = [
    f"{test_im_dir}{patient_id}/{image_id}.dcm"
    for patient_id, image_id in zip(data_test['patient_id'], data_test['image_id'])
]

In [None]:
import cv2
import time
import ray

def _read_pixels(im):
    """Return ``(pixel_array, photometric_interpretation)`` for an opened DICOM object.

    The ``pixelData()`` / ``getPixelDataInfo()`` accessors are tried first. If they fail,
    the attribute-style accessors ``pixel_array`` / ``PhotometricInterpretation`` are used
    instead (presumably for objects opened with a different DICOM reader -- TODO confirm).
    """
    try:
        return im.pixelData(), im.getPixelDataInfo()['PhotometricInterpretation']
    except Exception:
        # `Exception` rather than a bare except, so KeyboardInterrupt / SystemExit
        # are not swallowed by the fallback.
        return im.pixel_array, im.PhotometricInterpretation


def process_im_ray(im, image_size = 256, show = False):
    """Turn an opened DICOM image into a cropped, normalised, square float image.

    Steps: read the pixel array, invert MONOCHROME1 images, crop to the largest bright
    region with ``cut_empty_space_ray``, scale to [0, 1] using the min / max of the
    uncropped image, and resize to ``image_size`` x ``image_size``.

    Parameters
    ----------
    im : opened DICOM object (see ``_read_pixels`` for the accessors that are used).
    image_size : int, side length of the returned square image.
    show : bool, when True the raw and the processed image are plotted with matplotlib.

    Returns
    -------
    numpy.ndarray of shape ``(image_size, image_size)`` with float values in [0, 1].
    """
    out, phototype = _read_pixels(im)

    if show:
        print('begin process')
        plt.imshow(out, cmap='gray')
        plt.show(); plt.close()

    # MONOCHROME1 pictures use a backward brightness-darkness scheme from MONOCHROME2.
    # We reverse the MONOCHROME1 pictures to make this consistent.
    if phototype == 'MONOCHROME1':
        out = out.max() - out

    minval = out.min()
    maxval = out.max()

    # Crop image to interesting part.
    Threshold = maxval/5
    out = cut_empty_space_ray(out, T = Threshold, show = show)

    # Scale pixel values so that they lie between 0 and 1.
    # The arithmetic is done in float64: in the image's native integer dtype, the
    # background that the crop masked to 0 would wrap around when minval > 0 (unsigned
    # data), and maxval - minval could overflow for signed 16-bit data.
    span = float(maxval) - float(minval)
    out = out.astype(np.float64)
    if span > 0:
        out = np.clip((out - float(minval)) / span, 0.0, 1.0)
    else:
        # Constant image: avoid 0/0 and return an all-zero picture instead of NaNs.
        out = np.zeros_like(out)

    # Resize the photo.
    out = cv2.resize(out, (image_size, image_size))

    if show:
        plt.imshow(out, cmap='gray')
        plt.show(); plt.close()

    return out


def cut_empty_space_ray(im, T = 100, cutedge = 10, show = False):
    """Crop a 2-D image to the bounding box of its largest above-threshold region.

    Parameters
    ----------
    im : 2-D numpy array of pixel values.
    T : threshold handed to ``cv2.threshold``; pixels above it count as foreground.
    cutedge : number of pixels trimmed from every side before thresholding, because
        stray pixels at the border interfere with the cropping. ``0`` disables trimming.
    show : bool, when True the cropped image is plotted and 'end crop' is printed.

    Returns
    -------
    The trimmed image with everything outside the largest external contour set to 0,
    cropped to that contour's bounding rectangle. If no pixel exceeds ``T`` (no contour
    is found), the trimmed image is returned unmasked and uncropped.
    """
    # Ignore the border of images since some stray pixels there interfere with cropping.
    # Explicit end indices are used because `im[0:-0]` would be an empty slice.
    n_rows, n_cols = im.shape[:2]
    impx_cv2_raw = im[cutedge:n_rows - cutedge, cutedge:n_cols - cutedge]

    # Convert image foreground & background to black and white.
    _, impx_cv2 = cv2.threshold(impx_cv2_raw, T, 255, cv2.THRESH_BINARY)

    # Get edge detection contours (findContours is given an 8-bit copy of the binary image).
    contours, _ = cv2.findContours(impx_cv2.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    if len(contours) == 0:
        # Nothing above the threshold: there is no region to crop to, so hand back the
        # trimmed image instead of failing on max() of an empty sequence.
        return impx_cv2_raw
    boundary = max(contours, key=cv2.contourArea) # Get the contour enclosing the most space.

    # Make a mask so background is set to "color=0."
    # This eliminates image defects/non tissue such as labels, other random devices, etc
    impx_mask = np.zeros(impx_cv2.shape, dtype="uint8")
    cv2.drawContours(impx_mask, [boundary], -1, 255, cv2.FILLED)
    # Apply the mask on top of the original image.
    impx_cv2_masked = cv2.bitwise_and(impx_cv2_raw, impx_cv2_raw, mask = impx_mask)

    # Get coordinates bounding the rectangle of the interesting parts of the image.
    x, y, w, h = cv2.boundingRect(boundary)

    out = impx_cv2_masked[y:y+h, x:x+w]
    if show:
        plt.imshow(out)
        plt.show(); plt.close()
        print('end crop')
    return out

In [None]:
# Shut down any ray session that is still running, then start a fresh one for the
# parallel image processing below: one worker slot per CPU core, one GPU declared,
# and worker logs not forwarded to the driver.
ray.shutdown()
ray.init(
    num_cpus=cpu_count(),
    num_gpus=1,
    log_to_driver=False,
)

# Progress bar solution for ray taken from: 
# https://github.com/ray-project/ray/issues/5554#issuecomment-615477207

# `to_iterator` is the parallel-processing iterator: it runs "ray.get()" on each ray future
# as it becomes ready. The images are then iterated over and saved as PNGs with a more
# polished version of the above algorithm.
# PNG saving solution taken & adapted from 
# https://www.kaggle.com/code/remekkinas/fast-dicom-processing-1-6-2x-faster

# I use the ray package to make processing these images in parallel efficient.
# Ray parallel processing solution was taken & adapted from 
# https://www.kaggle.com/code/remekkinas/ray-parallel-processing-dicom-files-and

def to_iterator(obj_ids):
    """Yield the results of ray futures one by one, in the order ``ray.wait`` reports them ready.

    obj_ids: list of ray object references. After each ``ray.wait`` call the not-yet-ready
    remainder is waited on again, until nothing is left.
    """
    pending = obj_ids
    while pending:
        finished, pending = ray.wait(pending)
        first_ready = finished[0]
        yield ray.get(first_ready)

@ray.remote
def get_images_new(imagepath, image_size = 256):
    """Ray task: open the DICOM file at ``imagepath``, preprocess it with
    ``process_im_ray`` and return the processed image flattened to one dimension."""
    dicom = dicomsdl.open(imagepath)
    processed = process_im_ray(dicom, image_size)
    # Drop the reference to the opened file before handing back the result.
    del dicom
    return processed.flatten()
    
@ray.remote
def save_images(imagepath, image_size = 256, savedir = None):
    """Ray task: preprocess one DICOM file and store it as an 8-bit PNG.

    Parameters
    ----------
    imagepath : str, path of the form ``.../<patient_id>/<image_id>.dcm``.
    image_size : int, side length passed on to ``process_im_ray``.
    savedir : str or None. The PNG is written to ``<savedir>/<patient_id>/<image_id>.png``
        (folders are created as needed). With ``None`` the image is processed but
        nothing is written.

    Returns
    -------
    None

    Raises
    ------
    IOError if OpenCV reports that the PNG could not be written.
    """
    im = dicomsdl.open(imagepath)
    image = process_im_ray(im, image_size)

    # The patient id is the name of the containing folder, the image id is the file
    # name up to its first dot.
    patient_dir, file_name = os.path.split(imagepath)
    patient_id = os.path.basename(patient_dir)
    image_id = file_name.split('.')[0]

    if savedir is not None:
        newfolder = os.path.join(savedir, patient_id)
        fname = os.path.join(newfolder, image_id + '.png')
        os.makedirs(newfolder, exist_ok=True)
        # Clip to [0, 1] before scaling so out-of-range values saturate instead of
        # wrapping around in the uint8 cast.
        image_save = (np.clip(image, 0.0, 1.0) * 255).astype(np.uint8)
        # cv2.imwrite signals failure through its return value rather than an exception.
        if not cv2.imwrite(fname, image_save):
            raise IOError('could not write ' + fname)
    
# Settings for the DICOM -> PNG conversion run.
start = time.time()        # reference point; the elapsed seconds are printed at the end
image_size = 512           # side length of the saved PNGs
run_dcm_to_png = True      # set to False to skip the conversion
show_prog_bar = False      # wrap the result iterator in a tqdm progress bar
endpoint = 100             # not referenced by the code in this cell

if run_dcm_to_png:
    workdir = '/kaggle/working/test/'
    save_dir = f"{workdir}processed_{image_size}"
    os.makedirs(save_dir, exist_ok=True)

    # Launch one ray task per DICOM path, then block until every task has finished.
    Save_Im_futures = [
        save_images.remote(p, image_size = image_size, savedir = save_dir)
        for p in path
    ]
    Save_Im = (
        list(tqdm(to_iterator(Save_Im_futures), total=len(Save_Im_futures)))
        if show_prog_bar
        else list(to_iterator(Save_Im_futures))
    )

ray.shutdown()
print(time.time() - start)

In [None]:
def onehot(df, encode):
    """Return a new frame in which column ``encode`` is replaced by its indicator columns.

    The indicator columns come from ``pd.get_dummies`` (one per distinct value) and are
    appended after the remaining columns of ``df``. ``df`` itself is left unchanged.
    """
    indicators = pd.get_dummies(df[encode])
    remaining = df.drop(columns=[encode])
    return pd.concat([remaining, indicators], axis=1)


# One-hot encode the categorical columns of both tables.
# The training table is encoded cumulatively (each pass starts from the previous result);
# restarting from csv_train on every pass would leave only the last column encoded.
onehotcols = ['laterality', 'view']
data_train_relevant = csv_train
for col in onehotcols:
    data_test = onehot(data_test, col)
    data_train_relevant = onehot(data_train_relevant, col)

# Give the test table every column the encoded training table has, filled with 0, so
# indicator columns for categories absent from the test set still exist.
for col in data_train_relevant.columns:
    if col not in data_test.columns:
        data_test[col] = 0

# Implant column is a int64 for some reason; let's convert it to uint8 so it's lighter.
data_test['implant'] = data_test['implant'].astype('uint8')
data_test.head()

In [None]:
# Preview the first rows of the test table.
data_test.head()

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator


pngfolder = '/kaggle/working/test/processed_512'


def _png_path(row):
    """Path of the PNG belonging to one row: <pngfolder>/<patient_id>/<image_id>.png."""
    return f"{pngfolder}/{int(row['patient_id'])}/{int(row['image_id'])}.png"


# Store every row's PNG path in a `file` column, for use with flow_from_dataframe.
data_test['file'] = data_test.apply(_png_path, axis=1)

#print(data_train_relevant.head)

# To create the correct shape of inputs for the network, we need to get both the
# image data and the csv data -- have all the columns be the "target y" for flow_from_dataframe.
def test_from_dataframe(directory, generator, subset='training', batch_size = 64,
                        data = data_test, columns = ['cancer'], seed = None,
                        shuffle = False):
    """Yield ``[images, column_values]`` batches for a model with an image and a tabular input.

    The batches come from ``generator.flow_from_dataframe`` with ``class_mode='multi_output'``:
    the image files are taken from the ``file`` column of ``data`` and the values of
    ``columns`` are requested as the "targets", so that they travel with each image.

    Parameters
    ----------
    directory : passed through to ``flow_from_dataframe``.
    generator : object providing ``flow_from_dataframe`` (a Keras ImageDataGenerator).
    subset : subset name passed through to ``flow_from_dataframe``.
    batch_size : int, number of rows per batch.
    data : DataFrame with a ``file`` column and the columns listed in ``columns``.
    columns : list of column names to deliver alongside the images.
    seed : random seed passed through to ``flow_from_dataframe`` (only matters when shuffling).
    shuffle : bool, default False. Rows are served in dataframe order by default, so
        predictions made from these batches can be matched to the rows of ``data`` by
        position; pass True only when that correspondence is not needed.

    Yields
    ------
    ``[x_im, x_info]``: both float16 arrays. ``x_im`` holds the image batch; ``x_info``
    is the per-column values stacked and transposed (presumably one row per image and
    one column per entry of ``columns`` -- TODO confirm). Batches are yielded until at
    least ``gendat.n`` rows have been served.
    """
    gendat = generator.flow_from_dataframe(data, directory=directory, shuffle = shuffle,
                                           seed = seed,
                                           target_size = (ImageSize, ImageSize),
                                           subset = subset, batch_size = batch_size,
                                           x_col = 'file', y_col = columns,
                                           class_mode = 'multi_output',
                                           color_mode = 'grayscale',
                                           validate_filenames=False)
    n_rows = gendat.n
    served = 0
    while served < n_rows:
        # Named `batch` so the `data` parameter is not shadowed.
        batch = next(gendat)
        x_im = np.array(batch[0]).astype(np.float16) # reference to image
        x_info = np.array(batch[1][:]).T.astype(np.float16) # data columns
        served += batch_size
        yield [x_im, x_info]
# Settings for the inference generator.
ImageSize = 512   # side length used as target_size when loading (TODO: confirm whether resizing is wanted)
val_split = 0.0
batch_size = 64   # TODO: not tuned; picked close to commonly used values

# Non-image columns delivered with each image: age, implant flag, then the
# laterality and view indicator columns.
cols = ['age', 'implant'] + ['L', 'R'] + ['AT', 'CC', 'LM', 'LMO', 'ML', 'MLO']

# Keras ImageDataGenerator that rescales pixel values by 1/255.
imagegen = ImageDataGenerator(validation_split = 0, rescale = 1./255.)

test_gen = test_from_dataframe(
    None, imagegen,
    subset='training', columns = cols, data = data_test, batch_size = batch_size,
)

In [None]:
# Preview the test table again, now that the `file` column has been added.
data_test.head()

In [None]:
# Load the saved Keras model from the attached dataset folder.
model_import_path = '/kaggle/input/breastcancerpredictor-v2/breastcancerpredictor_v2/'
ML = load_model(model_import_path)

In [None]:
# Run the model over the batches produced by test_gen.
# NOTE(review): the scores are matched to table rows by position further down, so the
# generator must serve rows in dataframe order -- confirm.
test_pred = ML.predict(test_gen, steps=None, batch_size = batch_size)

In [None]:
# First output column of the model = cancer score per image.
test_pred_series = pd.Series(test_pred.T[0])
cancer_scores = test_pred_series.tolist()

# Image-level table: image id next to its score.
predictions = pd.DataFrame({
    'image_id': data_test['image_id'].tolist(),
    'cancer': cancer_scores,
})

# Copy of the original test metadata with the scores attached by row position.
predict_test = csv_test.copy()
predict_test['cancer'] = cancer_scores
print(predict_test)

In [None]:
# Collapse the per-image scores to one score per (patient, breast side) by keeping the
# maximum over that breast's images.
predict_merge = (
    predict_test.groupby(['patient_id', 'laterality'])['cancer']
    .max()
    .reset_index()
)

# prediction_id is "<patient_id>_<laterality>"; built column-wise so it also works on
# an empty table.
predict_merge['prediction_id'] = (
    predict_merge['patient_id'].astype(str) + '_' + predict_merge['laterality']
)

# Keep only the two output columns, in this order.
output_cols = ['prediction_id', 'cancer']
predict_merge = predict_merge.reindex(columns=output_cols)
print(predict_merge)

In [None]:
# Write the per-breast predictions to submission.csv, without the index column.
predict_merge.to_csv('submission.csv', index=False)