# Let's understand the HAM10000 dataset!

In [5]:
import pandas as pd
import json
import os
from pathlib import Path

folder_path = "data/ham10000/"
print(Path(folder_path, "meta.json"))
# Read HAM10000 metadata
with open(Path(folder_path, "meta.json")) as f:
    ham_metadata = json.load(f)

# check unique classes in metadata
unique_titles = []
for i in ham_metadata["classes"]:
    unique_titles.append(i["title"])

unique_titles

data\ham10000\meta.json


['actinic keratoses',
 'basal cell carcinoma',
 'benign keratosis-like lesions',
 'dermatofibroma',
 'melanocytic nevi',
 'melanoma',
 'vascular lesions']

In [6]:
len(ham_metadata["classes"])

7

# Let's create the mapping dict

In [7]:
group_map = {
    "melanoma": "melanoma",
    "basal cell carcinoma": "suspicious",
    "actinic keratoses": "suspicious",
    "melanocytic nevi": "benign",
    "benign keratosis-like lesions": "benign",
    "dermatofibroma": "benign",
    "vascular lesions": "benign",
}

In [8]:
ann_dir = Path(folder_path, "ds/ann")
img_dir = Path(folder_path, "ds/img")

rows = []
error_cnt = 0

for fname in os.listdir(ann_dir):
    if not fname.endswith(".json"):
        continue

    with open(os.path.join(ann_dir, fname), "r") as f:
        ann = json.load(f)

    # Extract label
    try:

        label = ann["objects"][0]["classTitle"]
    except IndexError:
        error_cnt += 1
        print(f"Skipping {fname} due to no annotations.")
        continue
    group = group_map[label]

    # Match to image (same name but .jpg or .png)
    img_name = fname.replace(".json", ".jpg")  # or ".png" if that’s your case

    rows.append({"image": img_name, "label": label, "group": group})

df = pd.DataFrame(rows)
print(df["group"].value_counts())
print(f"Skipped {error_cnt} files due to no annotations.")

Skipping ISIC_0026042.jpg.json due to no annotations.
Skipping ISIC_0029819.jpg.json due to no annotations.
Skipping ISIC_0029819.jpg.json due to no annotations.
benign        8060
melanoma      1112
suspicious     841
Name: group, dtype: int64
Skipped 2 files due to no annotations.
benign        8060
melanoma      1112
suspicious     841
Name: group, dtype: int64
Skipped 2 files due to no annotations.


In [32]:
df

Unnamed: 0,image,label,group
0,ISIC_0024306.jpg.jpg,melanocytic nevi,benign
1,ISIC_0024307.jpg.jpg,melanocytic nevi,benign
2,ISIC_0024308.jpg.jpg,melanocytic nevi,benign
3,ISIC_0024309.jpg.jpg,melanocytic nevi,benign
4,ISIC_0024310.jpg.jpg,melanoma,melanoma
...,...,...,...
10008,ISIC_0034316.jpg.jpg,melanoma,melanoma
10009,ISIC_0034317.jpg.jpg,melanoma,melanoma
10010,ISIC_0034318.jpg.jpg,benign keratosis-like lesions,benign
10011,ISIC_0034319.jpg.jpg,melanocytic nevi,benign


In [9]:
from sklearn.model_selection import train_test_split

# Get all image files
all_files = df["image"].tolist()
print(f"Total images: {len(all_files)}")

# Split into train/val/test (same as in 6_model_improvement.ipynb)
# 80% train+val, 20% test
train_files, test_files = train_test_split(all_files, test_size=0.2, random_state=42)
# Split train into 80% train, 20% val (resulting in ~64% train, 16% val, 20% test)
train_files, val_files = train_test_split(train_files, test_size=0.2, random_state=42)

print(f"\nDataset splits:")
print(f"  Training:   {len(train_files):>5} images ({len(train_files)/len(all_files)*100:.1f}%)")
print(f"  Validation: {len(val_files):>5} images ({len(val_files)/len(all_files)*100:.1f}%)")
print(f"  Test:       {len(test_files):>5} images ({len(test_files)/len(all_files)*100:.1f}%)")


# Count class distribution in each split
def get_split_distribution(file_list, dataframe):
    """Get class distribution for a specific split."""
    split_df = dataframe[dataframe["image"].isin(file_list)]
    return split_df["group"].value_counts()


print("\n" + "=" * 60)
print("TRAINING SET CLASS DISTRIBUTION:")
print("=" * 60)
train_dist = get_split_distribution(train_files, df)
for label, count in train_dist.items():
    print(f"  {label:12s}: {count:>5} ({count/len(train_files)*100:.1f}%)")

print("\n" + "=" * 60)
print("VALIDATION SET CLASS DISTRIBUTION:")
print("=" * 60)
val_dist = get_split_distribution(val_files, df)
for label, count in val_dist.items():
    print(f"  {label:12s}: {count:>5} ({count/len(val_files)*100:.1f}%)")

print("\n" + "=" * 60)
print("TEST SET CLASS DISTRIBUTION:")
print("=" * 60)
test_dist = get_split_distribution(test_files, df)
for label, count in test_dist.items():
    print(f"  {label:12s}: {count:>5} ({count/len(test_files)*100:.1f}%)")

Total images: 10013

Dataset splits:
  Training:    6408 images (64.0%)
  Validation:  1602 images (16.0%)
  Test:        2003 images (20.0%)

TRAINING SET CLASS DISTRIBUTION:
  benign      :  5154 (80.4%)
  melanoma    :   716 (11.2%)
  suspicious  :   538 (8.4%)

VALIDATION SET CLASS DISTRIBUTION:
  benign      :  1299 (81.1%)
  melanoma    :   159 (9.9%)
  suspicious  :   144 (9.0%)

TEST SET CLASS DISTRIBUTION:
  benign      :  1607 (80.2%)
  melanoma    :   237 (11.8%)
  suspicious  :   159 (7.9%)


In [None]:
ann["objects"][0]

[{'id': 13034747,
  'classId': 20169,
  'description': '',
  'geometryType': 'bitmap',
  'labelerLogin': 'iw@datasetninja.com',
  'createdAt': '2023-07-05T09:03:26.371Z',
  'updatedAt': '2023-07-05T09:03:26.371Z',
  'tags': [],
  'classTitle': 'melanocytic nevi',
  'bitmap': {'data': 'eJwBDQXy+olQTkcNChoKAAAADUlIRFIAAAF2AAABwAEDAAAAjAPRTgAAAAZQTFRFAAAA////pdmf3QAAAAF0Uk5TAEDm2GYAAAS1SURBVHic7dxNcusqEAVguzzIkCWwFJYGs7ctlsISGHqgsu6L/yRQ090HG5dc9zJMvopN9+nYsYIOh8ay8zxfWt/cruN8XUHrzc3PWu/uPiv9nWu3cHr4Oar8z9OfQa/b8coH0CeNN4ufNN4uXlVRt3jVBlZcs4Hj2is6cFp7xYbN2is2bNde0eGCywU6lT5CT19R0PLpyzPmSy8V9FhysaDVdsWC/oDe1D7w3tY+8t7VPoE+897XXpiAmsOeb3DdXqnBm/YKftNeocHmdR84b7c+ct6/7hPnt5wN0LZdfCDe4bkAUZ4LxDY+M9tg0gfQR9CnL/e57YlxbPnTf4kcr0aAzPUbVu39/JssR3kqcNeNXqhxoT35g++LCCiVY86TdWS84TwxABb0DvSe9WHjWb71ZCwZb3gfsadTeyvwesAc6CVeDZhQnI1ns3Zd55e8EX05YBb0TvTlwHjQi7wcAHbUO32A2lt6uV1loM0An6D24j5DccD9OtBe4acXvILD/tLvNfFcB1rnw9Nr4oz7JaCa+K8Dh3qj8nmwXwJqVX7q9k7llwD5wV7FYf8MtC7OS0B18VwCinpdPJeAav0jcAb0FvRO6R+B9kr/CK

In [None]:
ann

{'description': '',
 'tags': [],
 'size': {'height': 450, 'width': 600},
 'objects': [{'id': 13034747,
   'classId': 20169,
   'description': '',
   'geometryType': 'bitmap',
   'labelerLogin': 'iw@datasetninja.com',
   'createdAt': '2023-07-05T09:03:26.371Z',
   'updatedAt': '2023-07-05T09:03:26.371Z',
   'tags': [],
   'classTitle': 'melanocytic nevi',
   'bitmap': {'data': 'eJwBDQXy+olQTkcNChoKAAAADUlIRFIAAAF2AAABwAEDAAAAjAPRTgAAAAZQTFRFAAAA////pdmf3QAAAAF0Uk5TAEDm2GYAAAS1SURBVHic7dxNcusqEAVguzzIkCWwFJYGs7ctlsISGHqgsu6L/yRQ090HG5dc9zJMvopN9+nYsYIOh8ay8zxfWt/cruN8XUHrzc3PWu/uPiv9nWu3cHr4Oar8z9OfQa/b8coH0CeNN4ufNN4uXlVRt3jVBlZcs4Hj2is6cFp7xYbN2is2bNde0eGCywU6lT5CT19R0PLpyzPmSy8V9FhysaDVdsWC/oDe1D7w3tY+8t7VPoE+897XXpiAmsOeb3DdXqnBm/YKftNeocHmdR84b7c+ct6/7hPnt5wN0LZdfCDe4bkAUZ4LxDY+M9tg0gfQR9CnL/e57YlxbPnTf4kcr0aAzPUbVu39/JssR3kqcNeNXqhxoT35g++LCCiVY86TdWS84TwxABb0DvSe9WHjWb71ZCwZb3gfsadTeyvwesAc6CVeDZhQnI1ns3Zd55e8EX05YBb0TvTlwHjQi7wcAHbUO32A2lt6uV1loM0An6D24j5DccD9OtBe4acXvILD/tLvNfFc

: 