> This notebook separates the images by label into folders that can be read during the augmentation and model-fitting stages. Please note that because I could not upload the data to GitHub, you will need to download the data from Kaggle and create folders with the same names as mine to run this code as written.

In [None]:
import os

import pandas as pd
import PIL
from PIL import Image
from matplotlib import image

> Image metadata (HAM10000_metadata.csv):

In [None]:
# Load the HAM10000 metadata CSV into a DataFrame.
metadata_path = "skin-cancer-mnist-ham10000/HAM10000_metadata.csv"
data = pd.read_csv(metadata_path)

In [None]:
# Preview the first five rows of the metadata.
data.head(n=5)

> Dataframe containing labels

In [None]:
# Read hmnist_28_28_RGB.csv; the next cell pulls its "label" column.
hmnist_path = "./skin-cancer-mnist-ham10000/pre_processed_data_from_isic/hmnist_28_28_RGB.csv"
data_2 = pd.read_csv(hmnist_path)

In [None]:
# Keep only the label column as a Series.
label = data_2.loc[:, "label"]

> New dataframe with both image titles and labels:

In [None]:
# Attach the numeric label to each metadata row with an index-on-index merge.
# Any "label" column left over from a previous run of this cell is dropped first;
# otherwise re-running the cell would produce label_x / label_y and break every
# later data["label"] lookup.
# NOTE(review): this assumes the rows of the two CSVs are in the same order --
# confirm by comparing the dx and label value counts in the next two cells.
data = pd.merge(
    data.drop(columns=["label"], errors="ignore"),
    label,
    left_index=True,
    right_index=True,
)

In [None]:
# Number of rows per diagnosis code.
data.loc[:, "dx"].value_counts()

In [None]:
# Number of rows per numeric label.
data.loc[:, "label"].value_counts()

> Separating the pictures into train/test/validation sets. The train set has 50% of the overall data, the test set has 25% of the overall data and the validation set has 25% of the overall data.

In [None]:
#taking 50% of overall data, by category, for the training df
# A fixed seed makes the split reproducible: without it every run draws different rows,
# and re-running the notebook would write a different set of images into the
# train/validation/test folders on disk.
TRAIN_SEED = 42

(df0_train, df1_train, df2_train, df3_train,
 df4_train, df5_train, df6_train) = (
    data[data["label"] == class_id].sample(frac = .5, random_state = TRAIN_SEED)
    for class_id in range(7)
)

In [None]:
#the rows not drawn for training are kept for the validation/test split
val_test_df = data
for train_part in (df4_train, df6_train, df2_train, df1_train,
                   df0_train, df5_train, df3_train):
    # drop() returns a new frame each time, so `data` itself is left untouched
    val_test_df = val_test_df.drop(train_part.index)

In [None]:
#taking 50% of the data that is left for the val df
# Seeded for the same reason as the training sample: an unseeded draw would change
# which images count as validation vs. test on every run.
VAL_SEED = 42

(df0_val, df1_val, df2_val, df3_val,
 df4_val, df5_val, df6_val) = (
    val_test_df[val_test_df["label"] == class_id].sample(frac = .5, random_state = VAL_SEED)
    for class_id in range(7)
)

In [None]:
#dropping the val indices from the val/test remainder
#what is left over is the test df
test_df = val_test_df
for val_part in (df4_val, df6_val, df2_val, df1_val,
                 df0_val, df5_val, df3_val):
    # drop() returns a new frame, so val_test_df is not modified
    test_df = test_df.drop(val_part.index)

In [None]:
# Move the original row index into a column and renumber the rows from 0.
test_df = test_df.reset_index()

In [None]:
#combine all val classes
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the per-class frames
# in the same order and works on both older and newer pandas versions.
val_df = pd.concat(
    [df4_val, df6_val, df2_val, df1_val, df0_val, df5_val, df3_val]
).reset_index()

In [None]:
#combine all train classes
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the per-class frames
# in the same order and works on both older and newer pandas versions.
train_df = pd.concat(
    [df4_train, df6_train, df2_train, df1_train, df0_train, df5_train, df3_train]
).reset_index()

In [None]:
# Rows per label in the test split.
test_df.loc[:, "label"].value_counts()

In [None]:
# Rows per label in the validation split.
val_df.loc[:, "label"].value_counts()

In [None]:
# Rows per label in the training split.
train_df.loc[:, "label"].value_counts()

In [None]:
from keras.preprocessing.image import ImageDataGenerator

# Random augmentation settings used for the augmented training folder:
# any rotation angle, shifts/shear/zoom up to 0.2, and flips along both axes.
augmentation_options = dict(
    rotation_range = 360,
    width_shift_range = 0.2,
    height_shift_range = 0.2,
    shear_range = 0.2,
    zoom_range = 0.2,
    horizontal_flip = True,
    vertical_flip = True,
)

im_gen = ImageDataGenerator(**augmentation_options)

> Code to separate into labels and then save to folders:

In [None]:
#this iterates through each image in the training dataset created above by label and adds the image to the directory
for i in range(7):
    df = train_df[train_df["label"] == i]

    # Output folders for this label; created up front so that saving does not fail
    # with FileNotFoundError when the folders have not been made by hand.
    train_dir = f"./skin-cancer-mnist-ham10000/image_data_train/type_{i}"
    augmented_dir = f"./skin-cancer-mnist-ham10000/image_data_train_augmented/type_{i}"
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(augmented_dir, exist_ok=True)

    # An empty class has nothing to copy and would make the balancing ratio below
    # divide by zero.
    if df.empty:
        continue

    # Resolve every image's source file once: ids whose numeric part is below 29_306
    # are read from HAM10000_images_part_1, all others from HAM10000_images_part_2.
    sources = []
    for j in df["image_id"].values:
        part = 1 if int(j.replace("ISIC_00","")) < 29_306 else 2
        sources.append((j, f"./skin-cancer-mnist-ham10000/HAM10000_images_part_{part}/{j}.jpg"))

    # Plain copy of the originals into the per-label training folder.
    for j, source_path in sources:
        # the context manager closes the source file as soon as it has been saved
        with Image.open(source_path) as image1:
            image1.save(f"{train_dir}/{j}.jpg")

    #below code was added to create an augmented training data folder
    # balancing the classes: each pass writes one augmented copy of every image in the
    # class, and the number of passes grows as the class gets smaller relative to the
    # whole training set.
    passes_needed = train_df["label"].shape[0] / df.shape[0]
    count = 0
    while count < passes_needed:
        str_count = str(count)
        for s, source_path in sources:
            image2 = image.imread(source_path)
            # draw the random transformation for this image's actual shape
            im_dict = im_gen.get_random_transform(image2.shape)
            new_im = im_gen.apply_transform(image2, im_dict)
            # the pass number is appended to the image id in the file name
            Image.fromarray(new_im).save(f"{augmented_dir}/{s}{str_count}.jpg")
        count += 1

In [None]:
#creating the test directory
for i in range(7):
    df = test_df[test_df["label"] == i]

    # Create the per-label folder so saving does not fail when it is missing.
    out_dir = f"./skin-cancer-mnist-ham10000/image_data_test//type_{i}"
    os.makedirs(out_dir, exist_ok=True)

    for j in df["image_id"].values:
        # ids whose numeric part is below 29_306 are read from HAM10000_images_part_1,
        # all others from HAM10000_images_part_2
        part = 1 if int(j.replace("ISIC_00","")) < 29_306 else 2
        # Bound to `img`, not `image`: rebinding the global `image` would shadow the
        # matplotlib `image` module that the training cell needs for image.imread,
        # so that cell would fail if it were run again afterwards.
        with Image.open(f"./skin-cancer-mnist-ham10000/HAM10000_images_part_{part}/{j}.jpg") as img:
            img.save(f"{out_dir}/{j}.jpg")

In [None]:
#creating the validation directory
for i in range(7):
    df = val_df[val_df["label"] == i]

    # Create the per-label folder so saving does not fail when it is missing.
    out_dir = f"./skin-cancer-mnist-ham10000/image_data_validation//type_{i}"
    os.makedirs(out_dir, exist_ok=True)

    for j in df["image_id"].values:
        # ids whose numeric part is below 29_306 are read from HAM10000_images_part_1,
        # all others from HAM10000_images_part_2
        part = 1 if int(j.replace("ISIC_00","")) < 29_306 else 2
        # Bound to `img`, not `image`: rebinding the global `image` would shadow the
        # matplotlib `image` module that the training cell needs for image.imread,
        # so that cell would fail if it were run again afterwards.
        with Image.open(f"./skin-cancer-mnist-ham10000/HAM10000_images_part_{part}/{j}.jpg") as img:
            img.save(f"{out_dir}/{j}.jpg")

> It took about three minutes for the above cells to run.