In [2]:
import getting_data

In [2]:
# This cell takes a long time to run. Files are downloaded in chunks to keep
# memory usage down.
# NOTE(review): the repeated `200` lines in the output are presumably one HTTP
# status code per downloaded file — confirm in `getting_data`.
getting_data.download_all_files()

200
200
200
200
200


In [3]:
# Unzip the two zipped folders; the extraction itself is implemented in
# `getting_data.unzip_files`.
getting_data.unzip_files()

In [10]:
# Training images only: move each image into a subdirectory named after its label.
# (A later cell reads them back as isic_data/ISIC_2019_Training_Input/<label>/<image>.jpg.)
getting_data.move_train_images_into_subdir()

In [None]:
# Delete large zip files once done
# getting_data.delete_zip_files()  # Let's leave the zip files in place for now

In [None]:
# Visualize how many training images we have for each skin class
import data

# Sum every column except the "image" identifier to get one count per class
# (assumes the remaining columns are 0/1 class indicators — confirm in `data`).
is_label_column = data.y_train.columns != "image"
train_image_count = data.y_train.loc[:, is_label_column].sum()
train_image_count.plot(kind="bar")

# UNK is empty, so it is left out of the summaries that follow.
train_image_count = train_image_count.drop("UNK")

In [None]:
# Per-class training image counts, largest class first.
train_image_count.sort_values(ascending=False)

In [None]:
# Report the smallest and the largest class together with their image counts.
smallest_class = train_image_count.idxmin()
smallest_class_size = int(train_image_count.min())
largest_class = train_image_count.idxmax()
largest_class_size = int(train_image_count.max())

print(f"{smallest_class} has the fewest images in class {smallest_class_size}")
print(f"{largest_class} has the most images in class {largest_class_size}")

In [None]:
# Expected size of the training set after undersampling: every class is capped
# at `undersample_cap` images and smaller classes keep all of their images.
# Computing this from `train_image_count` (instead of hand-copying the
# per-class counts) keeps the total in sync with the data.
undersample_cap = 3500
int(train_image_count.clip(upper=undersample_cap).sum())

Mike: I think some strategies are:
- **Undersample** the NV and MEL classes to almost match BCC's count, say 3500 images each.
  - Undersampling has the downside of discarding some valuable data and carries a slightly higher risk of poor generalizability, BUT we have so many images anyway that this will help us get trainable models.
- Use [class weights](https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#train_a_model_with_class_weights), which applies only during fitting, not preprocessing.

Total images after under-sampling: 14934 -- still a lot, but slightly more manageable? We can still focus on getting a playground for setting up models.

In [None]:
# Undersampling NV and MEL
import copy
import numpy as np

# Fixed seed so the same images are kept on every run. The legacy global RNG is
# kept on purpose: switching generators would change which images get selected.
np.random.seed(0)

# Maximum number of images to keep for each undersampled class.
max_images_per_class = 3500

# Start from a copy of the full training metadata.
y_train_undersampled = copy.copy(data.metadata_train)
for class_to_undersample in ["NV", "MEL"]:
    in_class = y_train_undersampled[class_to_undersample] == 1
    all_img_ids_in_class = y_train_undersampled.loc[in_class, "image"].to_list()

    # Never request more images than the class holds: np.random.choice raises
    # ValueError when sampling without replacement beyond the population size.
    n_to_keep = min(max_images_per_class, len(all_img_ids_in_class))
    sampled_img_ids_in_class = np.random.choice(all_img_ids_in_class, n_to_keep, replace=False)

    img_ids_to_get_rid = set(all_img_ids_in_class) - set(sampled_img_ids_in_class)

    # Vectorized filter instead of a Python-level iterrows() scan over every
    # row; rows keep their original index labels.
    is_discarded = y_train_undersampled["image"].isin(img_ids_to_get_rid)
    y_train_undersampled = y_train_undersampled[~is_discarded]

y_train_undersampled.to_csv("isic_data/ISIC_2019_Training_GroundTruth_undersampled.csv", index=False)

print("Shape of new undersampled dataframe:", y_train_undersampled.shape)
y_train_undersampled.head()

In [None]:
# Copying undersampled set into a new folder
import os
import shutil

original_dir = "isic_data/ISIC_2019_Training_Input"
undersampled_dir = "isic_data/undersampled"

# The destination root plus one subdirectory per non-"image" column (each such
# column is treated as a skin class).
dirs_to_create = [
    undersampled_dir,
    *[
        os.path.join(undersampled_dir, skin_class)
        for skin_class in y_train_undersampled.columns
        if skin_class != "image"
    ],
]

# exist_ok=True makes the cell safe to re-run; makedirs also creates any
# missing parent directory instead of failing.
for dirpath in dirs_to_create:
    os.makedirs(dirpath, exist_ok=True)

for _, row in y_train_undersampled.iterrows():
    img_id = row["image"]
    # The first column whose value equals 1 is taken as the row's class label;
    # a row without any such column raises IndexError here.
    label = row.index[row == 1][0]
    filename = img_id + ".jpg"
    shutil.copyfile(
        os.path.join(original_dir, label, filename),
        os.path.join(undersampled_dir, label, filename),
    )