## Feature creation/engineering

### Setup environment

In [1]:
from skimage import io
import os

# local root dataset folder: scanned below for images and metadata files
dataset_root = 'datasets/'
# local folder that receives the processed (cropped and resized) images
process_root = 'data_scaled_wip/'

### Helper functions

#### ISIC images dataset

In [None]:
# recursive *.jpg image generator
def get_images(root_dir):
    """Yield the path of every ``*.jpg`` file found under ``root_dir``.

    Sub-directories are walked recursively.

    Parameters
    ----------
    root_dir : str or path-like
        Directory to start scanning from.

    Yields
    ------
    str
        Path of each regular file whose name ends with ``.jpg``.
    """
    # the context manager releases the directory handle even when the
    # consumer stops iterating before the listing is exhausted
    with os.scandir(root_dir) as entries:
        for entry in entries:
            if entry.name.endswith(".jpg") and entry.is_file():
                # yield (rather than print) so callers actually receive the path
                yield entry.path
            elif entry.is_dir():
                yield from get_images(entry.path)
# retrieve image paths: print each value produced by get_images(dataset_root)
for image in get_images(dataset_root):
    print(image)

#### ISIC metadata dataset

In [3]:
# generator over the immediate sub-directories of a folder
def get_dataset_path(root_dir):
    """Yield the path of each directory located directly inside ``root_dir``.

    Only a single level is inspected; nested directories are not descended
    into, and non-directory entries are skipped.
    """
    yield from (child.path for child in os.scandir(root_dir) if child.is_dir())
# print the path of every *.csv file sitting directly inside each dataset folder
for path in get_dataset_path(dataset_root):
    for entry in os.scandir(path):
        is_csv_file = entry.name.endswith(".csv") and entry.is_file()
        if not is_csv_file:
            continue
        print(path + os.sep + entry.name)

datasets/MSK-4/metadata.csv
datasets/UDA-2/metadata.csv
datasets/MSK-2/metadata.csv
datasets/UDA-1/metadata.csv
datasets/MSK-1/metadata.csv
datasets/MSK-3/metadata.csv
datasets/MSK-5/metadata.csv


### Data transformation

In [2]:
import numpy as np
from skimage.util import img_as_ubyte
from skimage.transform import resize

# helper: bounds of the largest centred square inside a height x width image
def _center_square_box(height, width):
    """Return ``(top, bottom, left, right)`` of the largest centred square.

    Pure integer arithmetic is used so the box is always exactly
    ``min(height, width)`` pixels on each side, including for odd sizes.
    """
    side = min(height, width)
    top = (height - side) // 2
    left = (width - side) // 2
    return top, top + side, left, left + side


# function for data augmentation and processing
def image_process(image_name, output_folder, output_size=(128,128)):
    """Centre-crop an image to a square, resize it and save it.

    Parameters
    ----------
    image_name : str
        Path of the image file to read.
    output_folder : str
        Folder the processed image is written to, under the same file name
        as the input. It is created when missing.
    output_size : tuple of int, optional
        Target (rows, cols) passed to ``resize``; defaults to ``(128, 128)``.

    Returns
    -------
    None
        The result is only written to disk.
    """
    # crop images to max square area available
    image = io.imread(image_name)
    top, bottom, left, right = _center_square_box(image.shape[0], image.shape[1])
    # resize image w/ anti_alias filter
    image = resize(image[top:bottom, left:right], output_size, anti_aliasing=True)
    # convert to ubyte scale
    image = img_as_ubyte(image)
    # store scaled_image; make sure the destination exists first
    os.makedirs(output_folder, exist_ok=True)
    # basename() handles whichever path separator the platform uses
    io.imsave(os.path.join(output_folder, os.path.basename(image_name)), image)

In [6]:
import os
import pandas as pd
from tqdm import tqdm

# specify image dataset folder to process
dataset_folder = 'datasets/UDA-2/'

# field to classify images
dataset_split = 'clinical.benign_malignant'
split_positive = 'benign'
split_negative = 'malignant'

# create output folders for classifier; makedirs(exist_ok=True) also creates
# the class sub-folders when process_root is already present (e.g. left over
# from an earlier run), which a plain "create only if the root is missing" skips
for split_label in (split_positive, split_negative):
    os.makedirs(os.path.join(process_root, split_label), exist_ok=True)

# get metadata.csv for splitting benign and/or malignant
# NOTE(review): the variable keeps its original name `image` although it holds
# the metadata table -- confirm no other cell relies on it before renaming
image = pd.read_csv(os.path.join(dataset_folder, 'metadata.csv'), usecols=['image_name', dataset_split])
# image output stats
n_benign = n_malignant = n_unknown = n_nonexist = n_total = 0
# read field to classify and process images; the context manager closes the
# progress bar even when processing one of the images raises
with tqdm(total=len(image)) as pbar:
    for _, row in image.iterrows():
        # update progress bar
        pbar.update(1)
        # file_name check
        file_name = os.path.join(dataset_folder, row['image_name'] + '.jpg')
        if not os.path.exists(file_name):
            n_nonexist += 1
        # image processing & store
        elif row[dataset_split] == split_positive:
            image_process(file_name, os.path.join(process_root, split_positive))
            n_benign += 1
        elif row[dataset_split] == split_negative:
            image_process(file_name, os.path.join(process_root, split_negative))
            n_malignant += 1
        else:
            # label is neither the positive nor the negative class
            n_unknown += 1
# stats
n_total = n_benign + n_malignant + n_unknown + n_nonexist
# an empty metadata file gives n_total == 0; divide by 1 instead so the
# percentages print as 0.00% rather than raising ZeroDivisionError
n_denominator = max(n_total, 1)
# Images processed according to metadata dataset stats
print("Total {} images: Benign = {}({:2.2%}), Malignant = {}({:2.2%}), Unknown = {}({:2.2%}), Non-Exist = {}({:2.2%})"
      .format(n_total,
              n_benign, n_benign / n_denominator,
              n_malignant, n_malignant / n_denominator,
              n_unknown, n_unknown / n_denominator,
              n_nonexist, n_nonexist / n_denominator))

100%|██████████| 60/60 [01:36<00:00,  1.61s/it]

Total 60 images: Benign = 23(38.33%), Malignant = 37(61.67%), Unknown = 0(0.00%, Non-Exist = 0(0.00%))



