In [26]:
import os
import shutil

import pandas as pd

from sklearn.model_selection import train_test_split

In [33]:
# Locate the ISIC 2019 ground-truth table relative to the notebook directory.
base_dir = os.path.join('..', '..', 'data2')
csv_path = os.path.join(base_dir, 'archive', 'ISIC_2019_Training_GroundTruth.csv')
metadata = pd.read_csv(csv_path)

# Every column except the 'image' identifier is treated as a class column.
# The count for a class is the number of rows whose cell in that column equals
# 1.0; comparing column-wise gives the same totals as visiting every row with
# `iterrows`, without the per-row Python overhead.
label_columns = [category for category in metadata.columns if category != 'image']
category_counts = {
    category: int(count)  # plain int keeps the printed dict free of numpy reprs
    for category, count in metadata[label_columns].eq(1.0).sum().items()
}

print(category_counts)

{'MEL': 4522, 'NV': 12875, 'BCC': 3323, 'AK': 867, 'BKL': 2624, 'DF': 239, 'VASC': 253, 'SCC': 628, 'UNK': 0}


The `UNK` category has no samples (its count above is 0), so we drop that column altogether and rename the remaining class columns to lowercase short labels.

In [34]:
# Map each retained ground-truth column to the lowercase label used from here
# on. Selecting by these keys keeps only 'image' plus the eight listed classes,
# which is what drops 'UNK'.
label_renames = dict(
    MEL='mel',
    NV='nv',
    BCC='bcc',
    AK='akiec',
    BKL='bkl',
    DF='df',
    VASC='vasc',
    SCC='scc',
)
metadata = metadata[['image', *label_renames]].rename(columns=label_renames)

# Persist the trimmed and renamed table next to the original CSV, without
# writing the positional index as an extra column.
corrected_csv_path = os.path.join(
    base_dir, 'archive', 'ISIC_2019_Training_GroundTruth_corrected.csv')
metadata.to_csv(corrected_csv_path, index=False)

In [35]:
# Source images and the three destination roots for the categorized copies.
images_dir = os.path.join(base_dir, 'extended_and_resized')
categorized_dir = os.path.join(
    base_dir,
    'images_original_inception_resnet_v2_200x150_categorized')
training_dir = os.path.join(categorized_dir, 'training')
test_dir = os.path.join(categorized_dir, 'test')
validation_dir = os.path.join(categorized_dir, 'validation')


def get_paths(path: str) -> list[str]:
    """Return the path of every file found anywhere below `path`."""
    return [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(path)
        for file in files]


def get_name(path: str) -> str:
    """Return the final component (the file name) of `path`."""
    return os.path.basename(path)


# os.walk yields entries in whatever order the file system returns them, so
# sort before splitting; otherwise the fixed random_state values below would
# not pin the same split on another machine or file system.
original_paths = sorted(get_paths(images_dir))

# Hold out 15% for test, then 20% of the remainder for validation
# (roughly 68% / 17% / 15% train / validation / test overall).
train_files, test_files = train_test_split(original_paths, test_size=0.15, random_state=57)
train_files, valid_files = train_test_split(train_files, test_size=.2, random_state=123)

In [36]:
# Index the table by image name so a row can be looked up from a file name.
# The guard keeps this cell safe to re-run: once 'image' is the index it is no
# longer a column, and a second set_index('image') would raise KeyError.
if metadata.index.name != 'image':
    metadata = metadata.set_index('image')


def categorize(target_dir: str, files: list[str]) -> None:
    """Copy each file into a per-category subdirectory of `target_dir`.

    The category of a file is looked up in the module-level `metadata` frame,
    which must be indexed by image name (the file name without its extension):
    the column holding the largest value in that row names the subdirectory.

    Args:
        target_dir: Root directory that receives one subdirectory per category.
        files: Paths of the files to copy; the originals are left in place.

    Raises:
        KeyError: If a file's name (without extension) is not in the
            `metadata` index.
    """
    for path in files:
        full_name = os.path.basename(path)
        # splitext strips only the final extension, so names that contain
        # extra dots, or have no extension at all, still resolve to the right
        # index key (splitting on '.' and taking [-2] does not).
        name, _extension = os.path.splitext(full_name)
        # idxmax returns the first column holding the row maximum.
        # NOTE(review): a row with no positive label would therefore land in
        # the first column's directory -- confirm every image has a label.
        category = metadata.loc[name].idxmax()
        directory = os.path.join(target_dir, category)

        # exist_ok replaces the separate exists() check.
        os.makedirs(directory, exist_ok=True)

        shutil.copy(path, os.path.join(directory, full_name))

In [37]:
# Materialize the three splits on disk, in the order training, validation, test.
split_jobs = (
    (training_dir, train_files),
    (validation_dir, valid_files),
    (test_dir, test_files),
)
for split_dir, split_files in split_jobs:
    categorize(split_dir, split_files)