In [1]:
# Extract the image archive stored on Google Drive into /content/temp/.
# `unrar x` recreates the folder structure stored in the archive (see the
# "Creating ..." lines in the output below).
# NOTE(review): re-running this cell on an already extracted folder may prompt
# about overwriting existing files - confirm before relying on "Run all".
!unrar x '/content/drive/MyDrive/Colab Notebooks/assets/data.rar' '/content/temp/'


UNRAR 5.50 freeware      Copyright (c) 1993-2017 Alexander Roshal


Extracting from /content/drive/MyDrive/Colab Notebooks/assets/data.rar

Creating    /content/temp                                             OK
Creating    /content/temp/skin cancer                                 OK
Creating    /content/temp/skin cancer/SET_D                           OK
Extracting  /content/temp/skin cancer/SET_D/D1.BMP                         0%  OK 
Extracting  /content/temp/skin cancer/SET_D/D10.BMP                        0%  OK 
Extracting  /content/temp/skin cancer/SET_D/D100.BMP                       0%  OK 
Extracting  /content/temp/skin cancer/SET_D/D1000.BMP                      0%  OK 
Extracting  /content/temp/skin cancer/SET_D/D101.BMP                       0%  OK 
Extracting  /content/temp/skin cancer/SET_D/D102.BMP                       0%  OK 
Extracting  /content/temp/skin cancer/SET_D/D103.BMP                       0%  

In [2]:
from pathlib import Path
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import cv2
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from math import ceil

from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Activation, Flatten, Dropout
from keras.layers import Conv2D
from keras.models import Sequential
from keras.optimizers import Adam
from keras.applications.vgg16 import VGG16

In [3]:
# Folder holding the notebook assets on Google Drive, and the folder the
# archive was extracted to (pathlib lets us join with "/" on any platform).
assets_folder = Path('/content/drive/MyDrive/Colab Notebooks/assets')
temp_folder = Path('/content/temp')

# Full path of every file found anywhere below the extraction folder.
original_rar_file_list = [
    os.path.join(root, filename)
    for root, _subdirs, filenames in os.walk(temp_folder)
    for filename in filenames
]

# Load CLIN_DIA.xlsx and keep only the rows/columns that contain data:
# rows and columns that are completely empty are discarded.
images_diag_df = (
    pd.read_excel(assets_folder / 'CLIN_DIA.xlsx')
    .dropna(axis=0, how='all')
    .dropna(axis=1, how='all')
)

# filter out non-existent images and rows where the diagnosis wasn't finished yet
def construct_image_path(image_diag_row, file_list=None):
  """Return the extracted file path of the image referenced by a diagnosis row.

  The image file name is the upper-cased value of the row's 'id' field plus the
  '.BMP' extension. The file name of each candidate path must match exactly;
  a plain substring test would also accept e.g. 'AD1.BMP' for id 'D1'.

  Parameters
  ----------
  image_diag_row : mapping (e.g. a pandas Series)
      Row of the diagnosis table; must provide the key 'id'.
  file_list : iterable of str, optional
      Candidate file paths. Defaults to the module-level
      `original_rar_file_list`.

  Returns
  -------
  str or float
      The first matching path, or NaN when the id is not a string (e.g. a
      missing value) or when no file with that name exists.
  """
  if file_list is None:
    file_list = original_rar_file_list

  image_id = image_diag_row['id']
  # missing ids arrive as float NaN and cannot be turned into a file name
  if not isinstance(image_id, str):
    return np.nan

  wanted_name = image_id.upper() + '.BMP'
  for path in file_list:
    if os.path.basename(path) == wanted_name:
      return path
  return np.nan

# resolve the file path of every row (NaN when the image does not exist)
images_diag_df['filepath'] = images_diag_df.apply(construct_image_path, axis = 1)

# keep only rows with an existing image and a diagnosis category of 1, 2 or 3.
# The explicit .copy() makes the filtered frame independent of the frame it was
# sliced from, so adding the 'label' column below is a plain column assignment
# and not a write into a slice (which raises SettingWithCopyWarning).
images_diag_df = images_diag_df.dropna(axis = 0, subset=['filepath'])
images_diag_df = images_diag_df[images_diag_df['kat.Diagnose'].isin([1,2,3])].copy()

# create binary label based on kat.Diagnose
# 1    ==> no_doctor
# 2/3  ==> doctor
images_diag_df['label'] = np.where(images_diag_df['kat.Diagnose'] == 1, 'no_doctor', 'doctor')

# split images_diag_df 90/10 into train and test; stratifying on the label keeps
# the share of "doctor" cases (about 22%, see the printed values) equal in both
images_diag_train, images_diag_test = train_test_split(
    images_diag_df,
    test_size=0.1,
    random_state=101,
    stratify=images_diag_df['label'])
print('% see doctor cases - train:', images_diag_train[images_diag_train['label'] == 'doctor'].shape[0]/len(images_diag_train))
print('% see doctor cases - test:', images_diag_test[images_diag_test['label'] == 'doctor'].shape[0]/len(images_diag_test))

% see doctor cases - train: 0.2168582375478927
% see doctor cases - test: 0.21724137931034482


In [4]:
def selma_secret_sauce(image, advanced = True, bigger_border = False, replicate_border = False, for_training = True, model_size = (256, 256)):
    '''Pre-process one image: crop the thin border, optionally clean it up, pad it to a square.

    Parameters
    ----------
    image : numpy.ndarray
        Colour image in BGR channel order, as returned by cv2.imread.
    advanced : bool
        False: only crop and pad to a square.
        True: additionally remove hair (black-hat + threshold + inpainting),
        apply a median blur and enhance contrast with CLAHE on the L channel
        of the LAB representation before padding.
    bigger_border : bool
        True: pad to a square whose side is 1.1 times the image diagonal, to
        leave room for rotation/shift augmentation. False: pad to the longer
        image side.
    replicate_border : bool
        True: fill the padding by replicating the edge pixels and blurring
        them. False: fill the padding with grey (125, 125, 125).
    for_training : bool
        False: resize the square to `model_size` (prediction use).
        True: return the square at its padded size.
    model_size : tuple of int
        Target size handed to cv2.resize when `for_training` is False.

    Returns
    -------
    numpy.ndarray
        Square image in RGB channel order.

    Raises
    ------
    ValueError
        If `image` is None (cv2.imread returns None when a file cannot be read).
    '''

    def crop(image):
        '''Cut the small black border found on the training images, so that
        expanding to a square with pixel replication does not replicate black
        pixels. The same number of pixels is removed on all four sides.'''
        cut = 2
        # Slice ends are exclusive, so "size - cut" removes exactly `cut`
        # pixels. An extra "+ 1" here would leave one border row/column
        # behind on the bottom/right side.
        return image[cut:image.shape[0] - cut, cut:image.shape[1] - cut]

    def make_square(image, skip_cropping = False):
        '''Pad the image to a square, centring the original content.'''
        if not skip_cropping:
            image = crop(image)
        height, width = image.shape[0], image.shape[1]
        if bigger_border:
            # enlarge border size to allow more room for augmentation rotation/shifting
            square_dimension = ceil((width**2 + height**2)**0.5*1.1)
        else:
            square_dimension = max(width, height)
        # split the padding evenly; an odd remainder goes to the bottom/right
        top_border = (square_dimension - height)//2
        bottom_border = square_dimension - height - top_border
        left_border = (square_dimension - width)//2
        right_border = square_dimension - width - left_border
        if replicate_border:
            square = cv2.copyMakeBorder(image,
                                        top_border,
                                        bottom_border,
                                        left_border,
                                        right_border,
                                        cv2.BORDER_REPLICATE)
            # blur the border (and the whole image). The third argument of
            # GaussianBlur is sigmaX; 4 is the value that was effectively in
            # use when cv2.BORDER_DEFAULT (== 4) was passed in that position.
            square = cv2.GaussianBlur(square, (19, 19), sigmaX=4)
            # impose unblurred image on blurred square
            square[top_border:top_border + height, left_border:left_border + width] = image
        else:
            # fill the border with grey instead
            square = cv2.copyMakeBorder(image,
                                        top_border,
                                        bottom_border,
                                        left_border,
                                        right_border,
                                        cv2.BORDER_CONSTANT,
                                        value = (125, 125, 125))
        return square

    if image is None:
        raise ValueError('image is None - cv2.imread returns None when the file cannot be read')

    # cv2.imread delivers BGR; everything below works in RGB
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # basic mode: crop + square only, resized to the model dimension when predicting
    if not advanced:
        square = make_square(image)
        if not for_training:
            square = cv2.resize(square, model_size, interpolation = cv2.INTER_AREA)
        return square

    # crop once and reuse the result for both the hair mask and the inpainting
    cropped = crop(image)

    # hair detection: black-hat on the grey image highlights thin dark
    # structures, thresholding turns them into a binary mask
    grayScale = cv2.cvtColor(cropped, cv2.COLOR_RGB2GRAY)
    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (17, 17))
    blackhat = cv2.morphologyEx(grayScale, cv2.MORPH_BLACKHAT, kernel)
    _, threshold = cv2.threshold(blackhat, 10, 255, cv2.THRESH_BINARY)

    # paint over the masked pixels, then smooth the result
    hair_removal_image = cv2.inpaint(cropped, threshold, 1, cv2.INPAINT_TELEA)
    hair_removal_image = cv2.medianBlur(hair_removal_image, 5)

    # contrast enhancement: CLAHE on the L channel only, a and b stay untouched
    lab = cv2.cvtColor(hair_removal_image, cv2.COLOR_RGB2LAB)
    l, a, b = cv2.split(lab)
    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8))
    cl = clahe.apply(l)
    limg = cv2.merge((cl, a, b))
    final = cv2.cvtColor(limg, cv2.COLOR_LAB2RGB)

    # the image is already cropped, so only pad; resize in case of prediction
    final = make_square(final, skip_cropping = True)
    if not for_training:
        final = cv2.resize(final, model_size, interpolation = cv2.INTER_AREA)
    return final

In [5]:
# Target layout: <temp_folder>/<split>/<class folder>/<id>.png
# Every label other than 'no_doctor' is stored in 'b_doctor'.
# NOTE(review): the a_/b_ prefixes presumably fix the alphabetical class order
# used later by flow_from_directory (no_doctor first) - confirm.
label_folders = {'no_doctor': 'a_no_doctor', 'doctor': 'b_doctor'}
splits = {'train': images_diag_train, 'test': images_diag_test}

# create directory structure (also creates temp_folder itself if it is missing,
# and does nothing for folders that already exist)
for split_name in splits:
  for folder_name in label_folders.values():
    (temp_folder / split_name / folder_name).mkdir(parents=True, exist_ok=True)

total_images = sum(len(split_df) for split_df in splits.values())
progress_counter = 0

for split_name, split_df in splits.items():
  for index, row in split_df.iterrows():
    # pre-processing part
    image = cv2.imread(f"{row['filepath']}")
    if image is None:
      # cv2.imread does not raise on unreadable files, it returns None
      raise FileNotFoundError(f"could not read image file: {row['filepath']}")
    image = selma_secret_sauce(image, advanced = True)
    # save to image tree (cv2.imwrite expects BGR, the pre-processing returns RGB)
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    folder_name = label_folders.get(row['label'], 'b_doctor')
    target_path = temp_folder / split_name / folder_name / f"{row['id']}.png"
    if not cv2.imwrite(str(target_path), image):
      # cv2.imwrite reports failure through its return value
      raise OSError(f"could not write image file: {target_path}")
    progress_counter += 1
    if progress_counter % 200 == 0:
      print(f'{progress_counter/total_images:.1%}')

6.9%
13.8%
20.7%
27.6%
34.5%
41.4%
48.3%
55.2%
62.1%
69.0%
75.9%
82.8%
89.7%
96.6%


In [6]:
# Training images are augmented on the fly: random rotation up to 90 degrees,
# up to 10% zoom, horizontal and vertical flips. Pixel values are rescaled by 1/255.
# NOTE(review): cval is documented by Keras as the fill value for
# fill_mode='constant'; with fill_mode='nearest' it presumably has no effect -
# confirm which of the two settings was intended.
train_datagen = ImageDataGenerator(
    rescale=1/255,
    rotation_range=90,
    zoom_range=0.1,
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode='nearest',
    cval=125,
)

# Validation images are only rescaled by 1/255, never augmented.
val_datagen = ImageDataGenerator(rescale=1/255)

# Options shared by both generators: every image is resized to 64 x 64,
# delivered in batches of 20, with binary labels (as needed for a
# binary_crossentropy loss).
flow_options = dict(
    target_size=(64, 64),
    batch_size=20,
    class_mode='binary',
)

# Training batches come from <temp_folder>/train ...
train_generator = train_datagen.flow_from_directory(temp_folder / 'train', **flow_options)

# ... and validation batches from <temp_folder>/test.
validation_generator = val_datagen.flow_from_directory(temp_folder / 'test', **flow_options)

Found 2610 images belonging to 2 classes.
Found 290 images belonging to 2 classes.
