In [3]:
import os
import shutil
import pandas as pd

To process a different set of images, change the `IMAGES_FOLDER` constant (the source folder) together with the `TARGET_DATA_FOLDER` constant (the folder that receives the categorized copies).

In [7]:
# Source folder with the images to categorize (alternative: 'images_original').
IMAGES_FOLDER = 'images_original_inception_resnet_v2_200x150'
# Destination folder that receives the per-class copies.
TARGET_DATA_FOLDER = 'images_original_inception_resnet_v2_200x150_categorized'

# Every input/output location sits under ../../data relative to the working directory.
metadata_path, images_path, baseline_data_path = (
    os.path.join('..', '..', 'data', leaf)
    for leaf in ('HAM10000_metadata.csv', IMAGES_FOLDER, TARGET_DATA_FOLDER)
)

# Only the image identifier and the diagnosis label are used downstream.
metadata_frame = pd.read_csv(metadata_path)[['image_id', 'dx']]

In [8]:
def stratified_split(df: pd.DataFrame, test_size=0.2, random_state=42):
    """Split ``df`` into training and validation frames, stratified by ``dx``.

    Each ``dx`` group contributes ``int(len(group) * test_size)`` rows to the
    validation frame, but never fewer than one. A group with a single row
    therefore ends up entirely in the validation frame.

    Args:
        df: Frame with a ``dx`` column. Rows are removed from the training
            part by index label, so the index is expected to be unique.
        test_size: Fraction of every group sent to validation; must lie in
            ``[0, 1]``.
        random_state: Forwarded to ``DataFrame.sample`` for every group. The
            fixed default makes repeated calls return the same split; pass
            ``None`` to get a different split on each call.

    Returns:
        A ``(train_df, valid_df)`` tuple. Both are empty when ``df`` yields
        no groups (for example, when it has no rows).

    Raises:
        ValueError: If ``test_size`` is outside ``[0, 1]``.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f'test_size must be between 0 and 1, got {test_size!r}')

    train_dfs = []
    valid_dfs = []

    for _, group in df.groupby('dx'):
        # Keep at least one validation sample per class, however small it is.
        valid_size = max(int(len(group) * test_size), 1)
        valid_df = group.sample(n=valid_size, random_state=random_state)

        train_dfs.append(group.drop(valid_df.index))
        valid_dfs.append(valid_df)

    if not train_dfs:
        # pd.concat refuses an empty list, so return empty frames directly.
        empty = df.iloc[:0]
        return empty, empty.copy()

    return pd.concat(train_dfs), pd.concat(valid_dfs)

In [9]:
# Split the metadata per diagnosis class ('dx') into training and validation
# frames, using the function's default test_size of 0.2.
train_frame, valid_frame = stratified_split(metadata_frame)

In [10]:
def get_paths(path):
    """Return the path of every file found under ``path``, searched recursively.

    Args:
        path: Directory to walk.

    Returns:
        A list of file paths, each built by joining the containing directory
        with the file name. The list is empty when ``path`` contains no files
        or does not exist (``os.walk`` yields nothing in that case).
    """
    return [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(path)
        for file in files
    ]
# Collect the path of every file found under images_path (recursively).
# NOTE(review): image_paths is not referenced by any other cell visible here —
# confirm it is still needed.
image_paths = get_paths(images_path)

In [13]:
def categorize_files(_type: str, frame: pd.DataFrame) -> None:
    """Copy the images listed in ``frame`` into one folder per ``dx`` class.

    Every row's image is copied from ``images_path/<image_id>.jpg`` to
    ``../../data/<TARGET_DATA_FOLDER>/<_type>/<dx>/<image_id>.jpg``. Class
    folders are created on demand. Files already present in the target tree
    are not removed, so copies left by an earlier run stay in place.

    Args:
        _type: Name of the sub-folder under the target folder (the notebook
            uses ``'training'`` and ``'validation'``).
        frame: Frame with ``image_id`` and ``dx`` columns.

    Raises:
        OSError: If a class folder cannot be created or an image cannot be
            copied (for example, the source file is missing).
    """
    type_path = os.path.join('..', '..', 'data', TARGET_DATA_FOLDER, _type)
    prepared_dirs = set()

    for category, image_name in zip(frame['dx'], frame['image_id']):
        category_path = os.path.join(type_path, category)

        # Create each class folder once; exist_ok avoids the check-then-create
        # race and lets a genuine failure surface here rather than in the copy.
        if category_path not in prepared_dirs:
            os.makedirs(category_path, exist_ok=True)
            prepared_dirs.add(category_path)

        file_name = f'{image_name}.jpg'
        shutil.copy(
            os.path.join(images_path, file_name),
            os.path.join(category_path, file_name),
        )

In [14]:
# Copy the images of each split into the 'training' and 'validation'
# sub-folders of the target folder, one sub-folder per 'dx' class.
categorize_files('training', train_frame)
categorize_files('validation', valid_frame)