In [1]:
# Download the HAM10000 dataset from Kaggle; later cells draw a 300-image sample from it.

import kagglehub

# The value returned by dataset_download is printed below as the location of the dataset files.
DATASET_HANDLE = "kmader/skin-cancer-mnist-ham10000"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/skin-cancer-mnist-ham10000


In [2]:
import os
import pandas as pd

# Preview the first rows of the metadata table; the diagnosis code lives in the `dx` column.
# NOTE(review): HAM10000's documented dx codes are akiec, bcc, bkl, df, mel, nv and vasc —
# confirm against df['dx'].unique().
metadata_csv = os.path.join(path, "HAM10000_metadata.csv")
df = pd.read_csv(metadata_csv)
df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


This iteration of the dataset contains 3 classes and 300 images in total (100 images per class; the train/test distribution is described below).

In [3]:
# First iteration uses 3 classes (bkl, bcc and nv), chosen for their visual differences
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the metadata table again
metadata_file = os.path.join(path, "HAM10000_metadata.csv")
metadata = pd.read_csv(metadata_file)

# Keep only the rows whose diagnosis is one of the target classes
target_classes = ['bkl', 'bcc', 'nv']
in_target = metadata['dx'].isin(target_classes)
filtered_metadata = metadata.loc[in_target]

print("Filtered dataset size:", len(filtered_metadata))
print("Class distribution:\n", filtered_metadata['dx'].value_counts())

Filtered dataset size: 8318
Class distribution:
 dx
nv     6705
bkl    1099
bcc     514
Name: count, dtype: int64


In [4]:
# Build the filtered metadata table from which 100 images per class are drawn for an initial test
import os
import shutil

import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# 1. Load metadata
metadata_path = os.path.join(path, "HAM10000_metadata.csv")
metadata = pd.read_csv(metadata_path)

# 2. Keep only the rows belonging to the 3 target classes
target_classes = ['bkl', 'bcc', 'nv']
keep_row = metadata['dx'].isin(target_classes)
filtered_metadata = metadata.loc[keep_row]

# 3. Limit each class to 100 images by sampling whole lesions, so that all
#    images of a selected lesion are kept together.
limits = {'nv': 100, 'bkl': 100, 'bcc': 100}
SAMPLE_SEED = 42


def pick_lesions(class_df, limit, seed=SAMPLE_SEED):
    """Greedily choose shuffled lesions whose images total at most `limit`.

    Args:
        class_df: metadata rows of a single class; only `lesion_id` is read.
        limit: maximum number of images to select.
        seed: random_state used to shuffle the lesion order.

    Returns:
        (lesion_ids, image_count): the chosen lesion IDs and the number of
        images they cover (may be below `limit` if it cannot be reached).
    """
    # Count images per lesion once instead of re-filtering the frame per lesion.
    lesion_sizes = class_df['lesion_id'].value_counts()
    shuffled = class_df['lesion_id'].drop_duplicates().sample(frac=1, random_state=seed)

    chosen = []
    image_count = 0
    for lesion in shuffled:
        size = int(lesion_sizes[lesion])
        # Skip a lesion that would overshoot the limit; a smaller one may still fit.
        if image_count + size > limit:
            continue
        chosen.append(lesion)
        image_count += size
        if image_count >= limit:
            break
    return chosen, image_count


selected_metadata = []

for dx, limit in limits.items():
    class_df = filtered_metadata[filtered_metadata['dx'] == dx]
    selected_lesions, selected_count = pick_lesions(class_df, limit)

    # Make an under-filled class visible instead of silently returning fewer images.
    if selected_count < limit:
        print(f"Warning: class '{dx}' has only {selected_count} of {limit} requested images")

    subset = class_df[class_df['lesion_id'].isin(selected_lesions)]
    selected_metadata.append(subset)

limited_metadata = pd.concat(selected_metadata).reset_index(drop=True)

print("Limited class distribution:\n", limited_metadata['dx'].value_counts())


Limited class distribution:
 dx
nv     100
bkl    100
bcc    100
Name: count, dtype: int64


In [5]:
# Split at the lesion level, so every image of a lesion lands on the same side.
# The split is not stratified, so per-class counts can differ between the two sides.
lesions = limited_metadata['lesion_id'].unique()
train_lesions, test_lesions = train_test_split(lesions, test_size=0.2, random_state=42)

# Boolean masks selecting the images of each side's lesions
in_train = limited_metadata['lesion_id'].isin(train_lesions)
in_test = limited_metadata['lesion_id'].isin(test_lesions)
train_metadata = limited_metadata.loc[in_train]
test_metadata = limited_metadata.loc[in_test]

print("Train images:", len(train_metadata))
print("Test images:", len(test_metadata))

print("Train class distribution:\n", train_metadata['dx'].value_counts())
print("\nTest class distribution:\n", test_metadata['dx'].value_counts())

Train images: 228
Test images: 72
Train class distribution:
 dx
nv     78
bkl    77
bcc    73
Name: count, dtype: int64

Test class distribution:
 dx
bcc    27
bkl    23
nv     22
Name: count, dtype: int64


In [None]:
# Mount Google Drive (Colab only); the sampled images are later written under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Image folders inside the kagglehub download. Reuse `path` returned by
# dataset_download instead of repeating the directory, so this cell keeps
# working when kagglehub stores the dataset somewhere else.
image_dir1 = os.path.join(path, "HAM10000_images_part_1")
image_dir2 = os.path.join(path, "HAM10000_images_part_2")

# Map image IDs (file name without the .jpg extension) to full paths.
# Entries from part 2 overwrite entries from part 1 if an ID appears in both.
image_paths = {}
for image_dir in (image_dir1, image_dir2):
    for img in os.listdir(image_dir):
        image_id, extension = os.path.splitext(img)
        if extension == '.jpg':
            image_paths[image_id] = os.path.join(image_dir, img)

# Output folder on the mounted Google Drive
output_root = os.path.join("/content/drive/MyDrive/BME450/Group Project/", "tiny_dataset")

os.makedirs(output_root, exist_ok=True)

def copy_images(df, split):
    """Copy the images listed in `df` to `<output_root>/<split>/<dx>/<image_id>.jpg`.

    Args:
        df: metadata frame; its `dx` and `image_id` columns are read.
        split: name of the sub-folder created under `output_root`.

    Source files are looked up in the module-level `image_paths` mapping;
    image IDs without an entry are reported and skipped.
    """
    labelled_ids = zip(df['dx'], df['image_id'])
    for label, img_id in tqdm(labelled_ids, total=len(df)):
        src = image_paths.get(img_id)
        if src:
            # One folder per class label inside the split folder
            dst_dir = os.path.join(output_root, split, label)
            os.makedirs(dst_dir, exist_ok=True)
            shutil.copy(src, os.path.join(dst_dir, f"{img_id}.jpg"))
        else:
            print(f"Missing image: {img_id}")

# Copy the images of each split into its own sub-folder, train first
for split_name, split_metadata in (("train", train_metadata), ("test", test_metadata)):
    copy_images(split_metadata, split_name)

100%|██████████| 228/228 [00:03<00:00, 74.98it/s]
100%|██████████| 72/72 [00:00<00:00, 75.98it/s]
