In [1]:
from prototypes.deeplearning.dataloader.IsicDataLoader import IsicDataLoaderFolders, over_under_sample, load_val_images, create_folds, AugmentationWrapper
from prototypes.utility.data import ProjectConfiguration
import pandas as pd

In [2]:
config = ProjectConfiguration("../config.json")

In [3]:
config.get_keys()

dict_keys(['DATASET_PATH', 'TRAIN_IMAGES_PATH', 'TRAIN_METADATA', 'TEST_METADATA', 'SAMPLE_SUBMISSION', 'IMAGE_WIDTH', 'IMAGE_HEIGHT', 'TARGET_COLUMNS', 'VECTORS_PATH', 'BATCH_SIZE', 'K_FOLDS', 'NUM_CLASSES', 'ALPHA', 'TRAIN_DEVICE', 'USE_SAMPLER', 'TRAIN_SAMPLE_SIZE', 'VAL_SAMPLE_SIZE', 'TRAIN_SPLIT', 'NUM_EPOCHS', 'NUM_WORKERS', 'PREFETCH_FACTOR', 'MODEL', 'DROPOUT_LAST_LAYER', 'CLASS_BALANCE_PERCENTAGE', 'ENABLED_EXPONENTIAL_LR', 'USING_MIXUP', 'TOTAL_TRAIN_SAMPLES', 'PER_SAMPLE_AUGMENTATION', 'METADATA_COLUMNS', 'LEARNING_RATE', 'TOLERANCE_EARLY_STOPPING', 'SAMPLE_PERCENTAGE', 'HYPER_PARAMETERS_PATH', 'VERSION'])

In [4]:
metadata_df = pd.read_csv(config.get_value("TRAIN_METADATA"), engine="python")
columns = config.get_value("METADATA_COLUMNS").split("\t")

isic_id, metadata_array, labels = metadata_df["isic_id"].values, metadata_df[columns].values, metadata_df[
    "target"].values

In [5]:
folds_config_dict = create_folds(isic_id=isic_id, metadata=metadata_array, labels=labels, config=config)

In [6]:
folds_config_dict

{'1': {'train': {'isic_id': array(['ISIC_0015670', 'ISIC_0015845', 'ISIC_0015902', ...,
          'ISIC_9999951', 'ISIC_9999960', 'ISIC_9999967'], dtype=object),
   'metadata': array([[ 2.02444223e+01,  1.62619752e+01,  2.69224474e+01, ...,
           -1.82703552e+02,  6.13493652e+02, -4.24279480e+01],
          [ 3.17125700e+01,  2.53647400e+01,  2.63310000e+01, ...,
           -7.83081100e-02,  1.57568700e+03,  5.71745000e+01],
          [ 1.42423289e+01,  1.21647573e+01,  2.14481443e+01, ...,
           -1.41024780e+02,  1.44218579e+03,  5.83598022e+01],
          ...,
          [ 1.99776400e+01,  1.60268700e+01,  3.41588400e+01, ...,
            5.23497400e+01,  1.39318700e+03,  1.27261700e+02],
          [ 1.73325667e+01,  1.23643974e+01,  2.98453261e+01, ...,
            5.46222458e+01,  1.06526367e+03, -1.06833740e+02],
          [ 1.67929000e+01,  1.14347300e+01,  2.14572200e+01, ...,
           -2.58872300e+02,  1.48606200e+02,  1.07710600e+02]]),
   'target': array([0, 0, 0, 

# K - Folds 

In [7]:
from PIL import Image
import os
import copy
import shutil
from tqdm.auto import tqdm
import albumentations as A
import numpy as np


augmentation_oversampling = A.Compose([
    # crop border avois multiple instances
    A.Rotate(limit=(-360, 360), p=0.9, crop_border=False),
    A.VerticalFlip(p=0.5),
    A.HorizontalFlip(p=0.5),
    A.RandomBrightnessContrast(p=1.0),
    # water reflexion effect magnification and min
    A.GridDistortion(p=0.5),
    # Only with crop_border = True
    A.Resize(height=config.get_value("IMAGE_HEIGHT"), width=config.get_value("IMAGE_WIDTH"))
])

dataset_path = config.get_value("DATASET_PATH")
shutil.rmtree(os.path.join(dataset_path, "splits"), ignore_errors=True)

train_metadata = []
val_metadata = []

image_source_path = config.get_value("TRAIN_IMAGES_PATH")
for fold_index in tqdm(folds_config_dict.keys()):
    print(f"Fold: {fold_index}")
    
    os.makedirs(os.path.join(dataset_path, "splits", f"fold_{fold_index}", "train", "0"), exist_ok=True)
    os.makedirs(os.path.join(dataset_path, "splits", f"fold_{fold_index}", "train", "1"), exist_ok=True)
    os.makedirs(os.path.join(dataset_path, "splits", f"fold_{fold_index}", "val", "0"), exist_ok=True)
    os.makedirs(os.path.join(dataset_path, "splits", f"fold_{fold_index}", "val", "1"), exist_ok=True)
    
    normal_images_ids = folds_config_dict[fold_index]["train"]["isic_id"][np.where(folds_config_dict[fold_index]["train"]["target"]==0)]
    anomaly_images_ids = folds_config_dict[fold_index]["train"]["isic_id"][np.where(folds_config_dict[fold_index]["train"]["target"]==1)]
    
    print(f"total images: {len(normal_images_ids) + len(anomaly_images_ids)}")
    print("Over and under sampling...")
    over_under_sample(normal_images_ids=normal_images_ids,
                      anomaly_images_ids=anomaly_images_ids,
                      augmentation_transform=augmentation_oversampling,
                      root_path=os.path.join(dataset_path, "splits", f"fold_{fold_index}"),
                      config=config)
    
    print("Saving validation images to disk...")
    for isic_id, target in tqdm(zip(folds_config_dict[fold_index]["val"]["isic_id"], folds_config_dict[fold_index]["val"]["target"]), total=len(folds_config_dict[fold_index]["val"]["isic_id"])):
        img = copy.deepcopy(Image.open(os.path.join(image_source_path, f"{isic_id}.jpg")))
        img.save(os.path.join(dataset_path, "splits", f"fold_{fold_index}", "val", f"{int(target)}", f"{isic_id}.jpg"))

  0%|          | 0/5 [00:00<?, ?it/s]

Fold: 1
total images: 320847
Over and under sampling...
Target: Normal samples count: 208551 | Target: Anomally samples count: 112297 
Iterations needed to reach target count for anomaly samples: 357 | Anomaly original count: 315


  0%|          | 0/357 [00:00<?, ?it/s]

Sampled normal images: 208551 | unique numbers: 208551


  0%|          | 0/208551 [00:00<?, ?it/s]

Saving validation images to disk...


  0%|          | 0/80212 [00:00<?, ?it/s]

Fold: 2
total images: 320847
Over and under sampling...
Target: Normal samples count: 208551 | Target: Anomally samples count: 112297 
Iterations needed to reach target count for anomaly samples: 358 | Anomaly original count: 314


  0%|          | 0/358 [00:00<?, ?it/s]

Sampled normal images: 208551 | unique numbers: 208551


  0%|          | 0/208551 [00:00<?, ?it/s]

Saving validation images to disk...


  0%|          | 0/80212 [00:00<?, ?it/s]

Fold: 3
total images: 320847
Over and under sampling...
Target: Normal samples count: 208551 | Target: Anomally samples count: 112297 
Iterations needed to reach target count for anomaly samples: 358 | Anomaly original count: 314


  0%|          | 0/358 [00:00<?, ?it/s]

Sampled normal images: 208551 | unique numbers: 208551


  0%|          | 0/208551 [00:00<?, ?it/s]

Saving validation images to disk...


  0%|          | 0/80212 [00:00<?, ?it/s]

Fold: 4
total images: 320847
Over and under sampling...
Target: Normal samples count: 208551 | Target: Anomally samples count: 112297 
Iterations needed to reach target count for anomaly samples: 358 | Anomaly original count: 314


  0%|          | 0/358 [00:00<?, ?it/s]

Sampled normal images: 208551 | unique numbers: 208551


  0%|          | 0/208551 [00:00<?, ?it/s]

Saving validation images to disk...


  0%|          | 0/80212 [00:00<?, ?it/s]

Fold: 5
total images: 320848
Over and under sampling...
Target: Normal samples count: 208552 | Target: Anomally samples count: 112297 
Iterations needed to reach target count for anomaly samples: 357 | Anomaly original count: 315


  0%|          | 0/357 [00:00<?, ?it/s]

Sampled normal images: 208552 | unique numbers: 208552


  0%|          | 0/208552 [00:00<?, ?it/s]

Saving validation images to disk...


  0%|          | 0/80211 [00:00<?, ?it/s]

In [8]:
import glob
import os

In [9]:
fold_index = 1

dataset_path = config.get_value("DATASET_PATH")

root = os.path.join(dataset_path, "splits", f"fold_{fold_index}", "train")

file_list = glob.glob(os.path.join(root, "*.jpg"))

In [10]:
os.listdir(root)

['1', '0']

In [11]:
file_list[:5]

[]

In [12]:
dataloader = IsicDataLoaderFolders(root=root)

In [13]:
next(iter(dataloader))

(tensor([[[0.4745, 0.4706, 0.4667,  ..., 0.4510, 0.4392, 0.4353],
          [0.4706, 0.4706, 0.4667,  ..., 0.4588, 0.4471, 0.4431],
          [0.4706, 0.4667, 0.4627,  ..., 0.4588, 0.4471, 0.4431],
          ...,
          [0.4824, 0.4745, 0.4667,  ..., 0.4118, 0.4196, 0.4275],
          [0.4745, 0.4745, 0.4745,  ..., 0.4118, 0.4196, 0.4275],
          [0.4706, 0.4784, 0.4863,  ..., 0.4118, 0.4196, 0.4275]],
 
         [[0.3020, 0.2980, 0.2941,  ..., 0.2941, 0.2941, 0.2902],
          [0.2980, 0.2980, 0.2941,  ..., 0.3020, 0.3020, 0.2980],
          [0.2980, 0.2941, 0.2902,  ..., 0.3020, 0.3020, 0.2980],
          ...,
          [0.3176, 0.3098, 0.3020,  ..., 0.2392, 0.2471, 0.2549],
          [0.3098, 0.3098, 0.3098,  ..., 0.2392, 0.2471, 0.2549],
          [0.3059, 0.3137, 0.3216,  ..., 0.2392, 0.2471, 0.2549]],
 
         [[0.2039, 0.2000, 0.1961,  ..., 0.1961, 0.1922, 0.1882],
          [0.2000, 0.2000, 0.1961,  ..., 0.2039, 0.2000, 0.1961],
          [0.2000, 0.1961, 0.1922,  ...,