In [10]:
import os
import json
import pandas as pd
from pathlib import Path

# Build a CSV index of the dataset: one row per annotation file that has a
# known diagnosis label and a matching image on disk. Each row stores the
# image path, the raw label, and the coarser 3-class group.
folder_path = "data/ham10000/"

# --- Step 0: Define paths ---
ann_dir = Path(folder_path, "ds/ann")  # folder with annotation JSONs
img_dir = Path(folder_path, "ds/img")  # folder with images

output_csv = "ham10000_dataset.csv"

# --- Step 1: Define 3-class mapping ---
# Raw diagnosis label (annotation "classTitle") -> training group.
group_map = {
    "melanoma": "melanoma",
    "basal cell carcinoma": "suspicious",
    "actinic keratoses": "suspicious",
    "melanocytic nevi": "benign",
    "benign keratosis-like lesions": "benign",
    "dermatofibroma": "benign",
    "vascular lesions": "benign",
}

# --- Step 2: Iterate over annotation files and extract info ---
rows = []

# Tally of annotation files dropped by each filter below, so that skipped
# files are not lost silently; inspect `skip_counts` after running the cell.
skip_counts = {"no_objects": 0, "unknown_label": 0, "missing_image": 0}

# os.listdir() returns entries in arbitrary order. Sorting makes the row order
# (and therefore any seeded split computed from df) reproducible across machines.
for ann_file in sorted(os.listdir(ann_dir)):
    if not ann_file.endswith(".json"):
        continue

    # Explicit encoding: do not depend on the platform's locale default.
    with open(os.path.join(ann_dir, ann_file), "r", encoding="utf-8") as f:
        ann = json.load(f)

    # Get the diagnosis label from classTitle
    if "objects" not in ann or len(ann["objects"]) == 0:
        skip_counts["no_objects"] += 1
        continue  # skip if no objects

    # Only the first annotated object is consulted for the label.
    label = ann["objects"][0]["classTitle"]

    if label not in group_map:
        skip_counts["unknown_label"] += 1
        continue  # skip unknown labels

    group = group_map[label]

    # Match annotation to image file: the image is expected to be named like
    # the annotation file with the trailing ".json" removed.
    img_file = os.path.splitext(ann_file)[0]
    img_path = os.path.join(img_dir, img_file)

    # Skip if image file does not exist
    if not os.path.exists(img_path):
        skip_counts["missing_image"] += 1
        continue

    # Save row
    rows.append({"image_path": img_path, "label": label, "group": group})

# --- Step 3: Create DataFrame ---
# Explicit columns keep the schema stable even when no row was collected
# (otherwise the column lookups below would raise KeyError on an empty frame).
df = pd.DataFrame(rows, columns=["image_path", "label", "group"])

# --- Step 4: Quick inspection ---
print("Number of images:", len(df))
print("Counts per raw label:\n", df["label"].value_counts())
print("\nCounts per 3-class group:\n", df["group"].value_counts())

# --- Step 5: Optional: save CSV ---
df.to_csv(output_csv, index=False)
print(f"\nDataset CSV saved to {output_csv}")

Number of images: 10013
Counts per raw label:
 melanocytic nevi                 6705
melanoma                         1112
benign keratosis-like lesions    1098
basal cell carcinoma              514
actinic keratoses                 327
vascular lesions                  142
dermatofibroma                    115
Name: label, dtype: int64

Counts per 3-class group:
 benign        8060
melanoma      1112
suspicious     841
Name: group, dtype: int64

Dataset CSV saved to ham10000_dataset.csv


In [None]:
from sklearn.model_selection import train_test_split

# Split the dataset into train / validation / test (70% / 15% / 15%),
# stratified on the 3-class "group" column so each split keeps the class mix.
# Assume df is your dataset DataFrame with columns: image_path, label, group
# NOTE(review): the split is per image row. If several rows can belong to the
# same lesion or patient, related images may end up in different splits;
# confirm against the dataset metadata and use a group-aware split if so.

SPLIT_SEED = 42  # one seed for both splits so the partition is reproducible
HOLDOUT_FRACTION = 0.3  # share of df held out for validation + test
TEST_FRACTION_OF_HOLDOUT = 0.5  # half of the hold-out -> test, half -> val

# Step 1: Split train + temp (val+test)
train_df, temp_df = train_test_split(
    df,
    test_size=HOLDOUT_FRACTION,
    stratify=df["group"],  # keep class balance
    random_state=SPLIT_SEED,
)

# Step 2: Split temp into validation and test
val_df, test_df = train_test_split(
    temp_df,
    test_size=TEST_FRACTION_OF_HOLDOUT,
    stratify=temp_df["group"],  # keep class balance
    random_state=SPLIT_SEED,
)

# Sanity checks: the three splits must cover df exactly once.
# The index check assumes df has unique index labels (true for a default
# RangeIndex).
assert len(train_df) + len(val_df) + len(test_df) == len(df)
assert train_df.index.intersection(val_df.index).empty
assert train_df.index.intersection(test_df.index).empty
assert val_df.index.intersection(test_df.index).empty

# Step 3: Check the distribution
print("Train distribution:\n", train_df["group"].value_counts())
print("\nValidation distribution:\n", val_df["group"].value_counts())
print("\nTest distribution:\n", test_df["group"].value_counts())

# Step 4: Optionally save CSVs
train_df.to_csv("train.csv", index=False)
val_df.to_csv("val.csv", index=False)
test_df.to_csv("test.csv", index=False)

Train distribution:
 benign        5642
melanoma       778
suspicious     589
Name: group, dtype: int64

Validation distribution:
 benign        1209
melanoma       167
suspicious     126
Name: group, dtype: int64

Test distribution:
 benign        1209
melanoma       167
suspicious     126
Name: group, dtype: int64


: 