In [1]:
from prototypes.deeplearning.dataloader.IsicDataLoader import IsicDataLoaderFolders, over_under_sample, load_val_images, create_folds, AugmentationWrapper
from prototypes.utility.data import ProjectConfiguration
import pandas as pd

In [2]:
# Load the project settings from "../config.json" (the path is relative to the
# kernel's working directory). Every later cell reads its paths and
# hyper-parameters through this object.
config = ProjectConfiguration("../config.json")

In [3]:
# Display the names of all settings exposed by the loaded configuration.
config.get_keys()

dict_keys(['DATASET_PATH', 'TRAIN_IMAGES_PATH', 'TRAIN_METADATA', 'TEST_METADATA', 'SAMPLE_SUBMISSION', 'IMAGE_WIDTH', 'IMAGE_HEIGHT', 'TARGET_COLUMNS', 'VECTORS_PATH', 'BATCH_SIZE', 'K_FOLDS', 'NUM_CLASSES', 'ALPHA', 'TRAIN_DEVICE', 'USE_SAMPLER', 'TRAIN_SAMPLE_SIZE', 'VAL_SAMPLE_SIZE', 'TRAIN_SPLIT', 'NUM_EPOCHS', 'NUM_WORKERS', 'PREFETCH_FACTOR', 'MODEL', 'DROPOUT_LAST_LAYER', 'CLASS_BALANCE_PERCENTAGE', 'ENABLED_EXPONENTIAL_LR', 'USING_MIXUP', 'TOTAL_TRAIN_SAMPLES', 'PER_SAMPLE_AUGMENTATION', 'METADATA_COLUMNS', 'LEARNING_RATE', 'TOLERANCE_EARLY_STOPPING', 'SAMPLE_PERCENTAGE', 'HYPER_PARAMETERS_PATH', 'VERSION'])

In [4]:
# Read the training metadata table from the path configured as TRAIN_METADATA.
metadata_df = pd.read_csv(config.get_value("TRAIN_METADATA"), engine="python")

# METADATA_COLUMNS is stored as a single tab-separated string of column names.
columns = config.get_value("METADATA_COLUMNS").split("\t")

# Extract the three parallel arrays consumed by create_folds:
# image ids, the selected metadata columns, and the "target" labels.
isic_id = metadata_df["isic_id"].values
metadata_array = metadata_df[columns].values
labels = metadata_df["target"].values

In [5]:
# Partition the ids, metadata and labels into folds; the result is a dict keyed
# by fold index whose entries hold "train" and "val" sub-dicts.
folds_config_dict = create_folds(
    isic_id=isic_id,
    metadata=metadata_array,
    labels=labels,
    config=config,
)

In [6]:
# Display the fold dictionary to inspect its structure.
# NOTE(review): this dumps every array in every fold; consider showing only the
# per-fold/per-split sizes instead to keep the notebook output small.
folds_config_dict

{'1': {'train': {'isic_id': array(['ISIC_0015670', 'ISIC_0015845', 'ISIC_0015864', ...,
          'ISIC_9999960', 'ISIC_9999964', 'ISIC_9999967'], dtype=object),
   'metadata': array([[ 2.02444223e+01,  1.62619752e+01,  2.69224474e+01, ...,
           -1.82703552e+02,  6.13493652e+02, -4.24279480e+01],
          [ 3.17125700e+01,  2.53647400e+01,  2.63310000e+01, ...,
           -7.83081100e-02,  1.57568700e+03,  5.71745000e+01],
          [ 2.25758300e+01,  1.71281700e+01,  3.79704600e+01, ...,
            1.23649700e+02,  1.47201000e+03,  2.32908900e+02],
          ...,
          [ 1.73325667e+01,  1.23643974e+01,  2.98453261e+01, ...,
            5.46222458e+01,  1.06526367e+03, -1.06833740e+02],
          [ 2.22885700e+01,  9.56472100e+00,  2.84312000e+01, ...,
           -9.86155700e+00,  8.77527000e+02, -7.69821200e+01],
          [ 1.67929000e+01,  1.14347300e+01,  2.14572200e+01, ...,
           -2.58872300e+02,  1.48606200e+02,  1.07710600e+02]]),
   'target': array([0, 0, 0, 

# K-Folds

In [7]:
from PIL import Image
import os
import copy
import shutil
from tqdm.auto import tqdm
import albumentations as A
import numpy as np


# Augmentation pipeline handed to over_under_sample as its augmentation_transform.
oversampling_steps = [
    # Always crop to the configured image size; cropping away the border avoids
    # multiple instances.
    A.RandomCrop(width=config.get_value("IMAGE_WIDTH"), height=config.get_value("IMAGE_HEIGHT"), p=1),
    # Arbitrary rotation on half of the samples, keeping the full frame.
    A.Rotate(limit=(-360, 360), p=0.50, crop_border=False),
    A.VerticalFlip(p=0.25),
    A.HorizontalFlip(p=0.25),
    A.RandomBrightnessContrast(p=0.5),
    # Water-reflection-like effect (local magnification and minification).
    A.GridDistortion(p=0.25),
    # A.Resize(height=config.get_value("IMAGE_HEIGHT"), width=config.get_value("IMAGE_WIDTH"))
]

augmentation_oversampling = A.Compose(oversampling_steps, p=1)

# Materialise every fold on disk as
#   <DATASET_PATH>/splits/fold_<k>/{train,val}/{0,1}/<isic_id>.jpg
# Training images are produced by over_under_sample; validation images are
# written by this cell.
dataset_path = config.get_value("DATASET_PATH")
splits_root = os.path.join(dataset_path, "splits")

# Remove any previous run so stale images never mix with the new folds.
shutil.rmtree(splits_root, ignore_errors=True)

# Not populated in this cell; kept because other cells may refer to them.
train_metadata = []
val_metadata = []

image_source_path = config.get_value("TRAIN_IMAGES_PATH")
for fold_index in tqdm(folds_config_dict.keys()):
    print(f"Fold: {fold_index}")

    fold_root = os.path.join(splits_root, f"fold_{fold_index}")
    # One directory per split and per class label.
    for split_name in ("train", "val"):
        for class_label in ("0", "1"):
            os.makedirs(os.path.join(fold_root, split_name, class_label), exist_ok=True)

    train_split = folds_config_dict[fold_index]["train"]
    val_split = folds_config_dict[fold_index]["val"]

    # Split the training ids by class using boolean masks over the targets.
    normal_images_ids = train_split["isic_id"][train_split["target"] == 0]
    anomaly_images_ids = train_split["isic_id"][train_split["target"] == 1]

    print(f"total images: {len(normal_images_ids) + len(anomaly_images_ids)}")
    print("Over and under sampling...")
    # NOTE(review): in the captured run this call raised
    # "ValueError: could not broadcast input array from shape (128,128,3) into
    # shape (354,128,3)"; the cause is inside over_under_sample, not in this cell.
    over_under_sample(normal_images_ids=normal_images_ids,
                      anomaly_images_ids=anomaly_images_ids,
                      augmentation_transform=augmentation_oversampling,
                      root_path=fold_root,
                      config=config)

    print("Saving validation images to disk...")
    # The loop variable is deliberately NOT called `isic_id`: that name holds the
    # full id array built when the metadata was loaded, and overwriting it here
    # would break a re-run of the create_folds cell.
    for val_isic_id, target in tqdm(zip(val_split["isic_id"], val_split["target"]),
                                    total=len(val_split["isic_id"])):
        source_file = os.path.join(image_source_path, f"{val_isic_id}.jpg")
        destination_file = os.path.join(fold_root, "val", f"{int(target)}", f"{val_isic_id}.jpg")
        # The context manager closes the source file handle after each image;
        # the image is still re-saved through Pillow, as before.
        with Image.open(source_file) as img:
            img.save(destination_file)

  0%|          | 0/10 [00:00<?, ?it/s]

Fold: 1
total images: 360953
Over and under sampling...
Target: Normal samples count: 216572 | Target: Anomally samples count: 144382 
Iterations needed to reach target count for anomaly samples: 408 | Anomaly original count: 354


  0%|          | 0/408 [00:00<?, ?it/s]

ValueError: could not broadcast input array from shape (128,128,3) into shape (354,128,3)

In [None]:
import glob
import os

In [None]:
# Inspect the training split of a single fold written by the K-Folds cell.
fold_index = 1

dataset_path = config.get_value("DATASET_PATH")

root = os.path.join(dataset_path, "splits", f"fold_{fold_index}", "train")

# The K-Folds cell creates per-class sub-folders ("0" and "1") under `root`, so
# a flat "*.jpg" pattern would miss any image stored inside them. The recursive
# pattern matches files directly under `root` as well as in its sub-folders;
# sorting makes the listing deterministic.
file_list = sorted(glob.glob(os.path.join(root, "**", "*.jpg"), recursive=True))

In [None]:
# Show the entries directly under the fold's training directory.
os.listdir(root)

In [None]:
# Preview the first five matched image paths.
file_list[:5]

In [None]:
# Build a loader over the fold's training directory.
# NOTE(review): presumably expects one sub-folder per class under `root` — confirm
# against IsicDataLoaderFolders.
dataloader = IsicDataLoaderFolders(root=root)

In [None]:
# Fetch the first item the loader yields, as a quick smoke test.
next(iter(dataloader))