In [2]:
from prototypes.deeplearning.dataloader.IsicDataLoader import IsicDataLoaderFolders, over_under_sample, load_val_images, create_folds, AugmentationWrapper
from prototypes.utility.data import ProjectConfiguration
import pandas as pd

In [3]:
config = ProjectConfiguration("../config.json")

In [4]:
config.get_keys()

dict_keys(['DATASET_PATH', 'TRAIN_IMAGES_PATH', 'TRAIN_METADATA', 'TEST_METADATA', 'SAMPLE_SUBMISSION', 'IMAGE_WIDTH', 'IMAGE_HEIGHT', 'TARGET_COLUMNS', 'VECTORS_PATH', 'BATCH_SIZE', 'K_FOLDS', 'NUM_CLASSES', 'ALPHA', 'TRAIN_DEVICE', 'USE_SAMPLER', 'TRAIN_SAMPLE_SIZE', 'VAL_SAMPLE_SIZE', 'TRAIN_SPLIT', 'NUM_EPOCHS', 'NUM_WORKERS', 'PREFETCH_FACTOR', 'MODEL', 'DROPOUT_LAST_LAYER', 'CLASS_BALANCE_PERCENTAGE', 'ENABLED_EXPONENTIAL_LR', 'USING_MIXUP', 'TOTAL_TRAIN_SAMPLES', 'PER_SAMPLE_AUGMENTATION', 'METADATA_COLUMNS', 'LEARNING_RATE', 'TOLERANCE_EARLY_STOPPING', 'SAMPLE_PERCENTAGE', 'HYPER_PARAMETERS_PATH', 'VERSION'])

In [5]:
metadata_df = pd.read_csv(config.get_value("TRAIN_METADATA"), engine="python")
columns = config.get_value("METADATA_COLUMNS").split("\t")

isic_id, metadata_array, labels = metadata_df["isic_id"].values, metadata_df[columns].values, metadata_df[
    "target"].values

In [6]:
folds_config_dict = create_folds(isic_id=isic_id, metadata=metadata_array, labels=labels, config=config)

In [7]:
folds_config_dict

{'1': {'train': {'isic_id': array(['ISIC_0015902', 'ISIC_0024200', 'ISIC_0035502', ...,
          'ISIC_9999960', 'ISIC_9999964', 'ISIC_9999967'], dtype=object),
   'metadata': array([[  14.24232888,   12.16475727,   21.44814427, ..., -141.02478027,
           1442.18579102,   58.35980225],
          [  24.72552   ,   20.05747   ,   26.4649    , ...,  -72.31564   ,
           1488.72      ,   21.42896   ],
          [  22.12918339,   19.91142436,   24.10661432, ...,  -11.95996094,
           1436.02746582,  101.33465576],
          ...,
          [  17.33256672,   12.36439742,   29.84532611, ...,   54.62224579,
           1065.26367188, -106.83374023],
          [  22.28857   ,    9.564721  ,   28.4312    , ...,   -9.861557  ,
            877.527     ,  -76.98212   ],
          [  16.7929    ,   11.43473   ,   21.45722   , ..., -258.8723    ,
            148.6062    ,  107.7106    ]]),
   'target': array([0, 0, 0, ..., 0, 0, 0])},
  'val': {'isic_id': array(['ISIC_0015670', 'ISIC_00158

# K - Folds 

In [8]:
from PIL import Image
import os
import copy
import shutil
from tqdm.auto import tqdm
import albumentations as A
import numpy as np


augmentation_oversampling = A.Compose([
    # crop border avois multiple instances
    A.Rotate(limit=(-360, 360), p=0.5, crop_border=True),
    A.VerticalFlip(p=0.25),
    A.HorizontalFlip(p=0.25),
    A.RandomBrightnessContrast(p=0.25),
    # water reflexion effect magnification and min
    A.GridDistortion(p=0.25),
    # Only with crop_border = True
    A.Resize(height=config.get_value("IMAGE_HEIGHT"), width=config.get_value("IMAGE_WIDTH"))
])

dataset_path = config.get_value("DATASET_PATH")
shutil.rmtree(os.path.join(dataset_path, "splits"), ignore_errors=True)

train_metadata = []
val_metadata = []

image_source_path = config.get_value("TRAIN_IMAGES_PATH")
for fold_index in tqdm(folds_config_dict.keys()):
    print(f"Fold: {fold_index}")
    
    os.makedirs(os.path.join(dataset_path, "splits", f"fold_{fold_index}", "train", "0"), exist_ok=True)
    os.makedirs(os.path.join(dataset_path, "splits", f"fold_{fold_index}", "train", "1"), exist_ok=True)
    os.makedirs(os.path.join(dataset_path, "splits", f"fold_{fold_index}", "val", "0"), exist_ok=True)
    os.makedirs(os.path.join(dataset_path, "splits", f"fold_{fold_index}", "val", "1"), exist_ok=True)
    
    normal_images_ids = folds_config_dict[fold_index]["train"]["isic_id"][np.where(folds_config_dict[fold_index]["train"]["target"]==0)]
    anomaly_images_ids = folds_config_dict[fold_index]["train"]["isic_id"][np.where(folds_config_dict[fold_index]["train"]["target"]==1)]
    
    print(f"total images: {len(normal_images_ids) + len(anomaly_images_ids)}")
    print("Over and under sampling...")
    over_under_sample(normal_images_ids=normal_images_ids,
                      anomaly_images_ids=anomaly_images_ids,
                      augmentation_transform=augmentation_oversampling,
                      root_path=os.path.join(dataset_path, "splits", f"fold_{fold_index}"),
                      config=config)
    
    print("Saving validation images to disk...")
    for isic_id, target in tqdm(zip(folds_config_dict[fold_index]["val"]["isic_id"], folds_config_dict[fold_index]["val"]["target"]), total=len(folds_config_dict[fold_index]["val"]["isic_id"])):
        img = copy.deepcopy(Image.open(os.path.join(image_source_path, f"{isic_id}.jpg")))
        img.save(os.path.join(dataset_path, "splits", f"fold_{fold_index}", "val", f"{int(target)}", f"{isic_id}.jpg"))

  0%|          | 0/5 [00:00<?, ?it/s]

Fold: 1
total images: 320847
Over and under sampling...
Target: Normal samples count: 192509 | Target: Anomally samples count: 128339 
Iterations needed to reach target count for anomaly samples: 408 | Anomaly original count: 315


  0%|          | 0/408 [00:00<?, ?it/s]

Sampled normal images: 192509 | unique numbers: 192509


  0%|          | 0/192509 [00:00<?, ?it/s]

Saving validation images to disk...


  0%|          | 0/80212 [00:00<?, ?it/s]

Fold: 2
total images: 320847
Over and under sampling...
Target: Normal samples count: 192509 | Target: Anomally samples count: 128339 
Iterations needed to reach target count for anomaly samples: 409 | Anomaly original count: 314


  0%|          | 0/409 [00:00<?, ?it/s]

Sampled normal images: 192509 | unique numbers: 192509


  0%|          | 0/192509 [00:00<?, ?it/s]

Saving validation images to disk...


  0%|          | 0/80212 [00:00<?, ?it/s]

Fold: 3
total images: 320847
Over and under sampling...
Target: Normal samples count: 192509 | Target: Anomally samples count: 128339 
Iterations needed to reach target count for anomaly samples: 409 | Anomaly original count: 314


  0%|          | 0/409 [00:00<?, ?it/s]

Sampled normal images: 192509 | unique numbers: 192509


  0%|          | 0/192509 [00:00<?, ?it/s]

Saving validation images to disk...


  0%|          | 0/80212 [00:00<?, ?it/s]

Fold: 4
total images: 320847
Over and under sampling...
Target: Normal samples count: 192509 | Target: Anomally samples count: 128339 
Iterations needed to reach target count for anomaly samples: 409 | Anomaly original count: 314


  0%|          | 0/409 [00:00<?, ?it/s]

Sampled normal images: 192509 | unique numbers: 192509


  0%|          | 0/192509 [00:00<?, ?it/s]

Saving validation images to disk...


  0%|          | 0/80212 [00:00<?, ?it/s]

Fold: 5
total images: 320848
Over and under sampling...
Target: Normal samples count: 192509 | Target: Anomally samples count: 128340 
Iterations needed to reach target count for anomaly samples: 408 | Anomaly original count: 315


  0%|          | 0/408 [00:00<?, ?it/s]

Sampled normal images: 192509 | unique numbers: 192509


  0%|          | 0/192509 [00:00<?, ?it/s]

Saving validation images to disk...


  0%|          | 0/80211 [00:00<?, ?it/s]

In [9]:
import glob
import os

In [10]:
fold_index = 1

dataset_path = config.get_value("DATASET_PATH")

root = os.path.join(dataset_path, "splits", f"fold_{fold_index}", "train")

file_list = glob.glob(os.path.join(root, "*.jpg"))

In [11]:
os.listdir(root)

['1', '0']

In [12]:
file_list[:5]

[]

In [13]:
dataloader = IsicDataLoaderFolders(root=root)

In [14]:
next(iter(dataloader))

(tensor([[[0.6706, 0.6706, 0.6667,  ..., 0.7020, 0.7216, 0.7255],
          [0.6706, 0.6745, 0.6706,  ..., 0.6824, 0.6902, 0.6941],
          [0.6706, 0.6706, 0.6667,  ..., 0.6549, 0.6510, 0.6627],
          ...,
          [0.5961, 0.5961, 0.6000,  ..., 0.6078, 0.6196, 0.6275],
          [0.6118, 0.6078, 0.6078,  ..., 0.6078, 0.6196, 0.6235],
          [0.6314, 0.6235, 0.6157,  ..., 0.6157, 0.6235, 0.6314]],
 
         [[0.4784, 0.4784, 0.4745,  ..., 0.5255, 0.5451, 0.5490],
          [0.4784, 0.4824, 0.4784,  ..., 0.5059, 0.5137, 0.5176],
          [0.4784, 0.4784, 0.4745,  ..., 0.4784, 0.4745, 0.4863],
          ...,
          [0.4078, 0.4078, 0.4118,  ..., 0.4157, 0.4275, 0.4353],
          [0.4235, 0.4196, 0.4196,  ..., 0.4157, 0.4275, 0.4314],
          [0.4431, 0.4353, 0.4275,  ..., 0.4235, 0.4314, 0.4392]],
 
         [[0.4118, 0.4118, 0.4078,  ..., 0.4431, 0.4627, 0.4667],
          [0.4118, 0.4157, 0.4118,  ..., 0.4235, 0.4314, 0.4353],
          [0.4118, 0.4118, 0.4078,  ...,