# HAM10000 preparation

```bash
# Install (or upgrade to the newest release of) the ISIC command-line client.
pip install -U isic-cli
# Download target; this is the same directory the notebook later uses as HAM_DIR.
DEST="/workspace/clip_xai/dermaCAP/HAM10000"
mkdir -p "$DEST"
# NOTE(review): collection 212 is presumably the HAM10000 collection and
# "--limit 0" presumably means "no limit" — confirm against the isic-cli docs.
isic image download --collections 212 --limit 0 "$DEST"
```

In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# Root folder that holds the downloaded dataset and the prepared output.
BASE_DIR = Path("/workspace/clip_xai/dermaCAP")

# Folder the download step writes the HAM10000 images into.
HAM_DIR = BASE_DIR.joinpath("HAM10000")

# Per-image metadata table, loaded with pandas further down.
META_CSV = HAM_DIR.joinpath("metadata.csv")

# Destination of the prepared (img_path, caption) table.
OUT_CSV = BASE_DIR.joinpath("ham10000_prepared.csv")

# File extension appended to each isic_id to form the image file name.
IMG_EXT = ".jpg"

In [7]:
# Metadata columns that are never written into a caption.
EXCLUDE = {
    "isic_id",
    "lesion_id",
    "copyright_license",
    "attribution",
}

# Column name -> human-readable label used as the caption field name.
# Columns without an entry get a label derived from their name by lbl().
LABELS = dict(
    age_approx="Age (approx)",
    anatom_site_general="Anatomic site (general)",
    anatom_site_special="Anatomic site (special)",
    concomitant_biopsy="Concomitant biopsy",
    diagnosis_1="Diagnosis 1",
    diagnosis_2="Diagnosis 2",
    diagnosis_3="Diagnosis 3",
    diagnosis_confirm_type="Diagnosis confirm type",
    image_manipulation="Image manipulation",
    image_type="Image type",
    melanocytic="Melanocytic",
    pixels_x="Pixels X",
    pixels_y="Pixels Y",
    sex="Sex",
)

# Lower-cased textual spellings of a boolean -> caption wording.
# Each (truthy, falsy) pair is mapped onto ("Yes", "No").
YESNO = {
    token: answer
    for pair in (("true", "false"), ("yes", "no"), ("1", "0"))
    for token, answer in zip(pair, ("Yes", "No"))
}

In [8]:
# Read the per-image metadata table; the bare name below renders it as the cell output.
df = pd.read_csv(filepath_or_buffer=META_CSV)

df

Unnamed: 0,isic_id,attribution,copyright_license,age_approx,anatom_site_general,anatom_site_special,concomitant_biopsy,diagnosis_1,diagnosis_2,diagnosis_3,diagnosis_confirm_type,image_manipulation,image_type,lesion_id,melanocytic,pixels_x,pixels_y,sex
0,ISIC_0024306,MILK study team,CC-BY-NC,45.0,,,False,Benign,Benign melanocytic proliferations,Nevus,serial imaging showing no change,,dermoscopic,IL_7252831,True,600,450,male
1,ISIC_0024307,MILK study team,CC-BY-NC,50.0,lower extremity,,False,Benign,Benign melanocytic proliferations,Nevus,serial imaging showing no change,,dermoscopic,IL_6125741,True,600,450,male
2,ISIC_0024308,MILK study team,CC-BY-NC,55.0,,,False,Benign,Benign melanocytic proliferations,Nevus,serial imaging showing no change,,dermoscopic,IL_3692653,True,600,450,female
3,ISIC_0024309,MILK study team,CC-BY-NC,40.0,,,False,Benign,Benign melanocytic proliferations,Nevus,serial imaging showing no change,,dermoscopic,IL_0959663,True,600,450,male
4,ISIC_0024310,MILK study team,CC-BY-NC,60.0,anterior torso,,True,Malignant,Malignant melanocytic proliferations (Melanoma),"Melanoma, NOS",histopathology,,dermoscopic,IL_8194852,True,600,450,male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11715,ISIC_0036060,MILK study team,CC-BY-NC,,,,False,Benign,Benign epidermal proliferations,Pigmented benign keratosis,single image expert consensus,,dermoscopic,IL_5218936,False,600,450,
11716,ISIC_0036061,MILK study team,CC-BY-NC,,,,False,Benign,Benign melanocytic proliferations,Nevus,single image expert consensus,,dermoscopic,IL_5893216,True,600,450,
11717,ISIC_0036062,MILK study team,CC-BY-NC,70.0,head/neck,,True,Indeterminate,Indeterminate epidermal proliferations,Solar or actinic keratosis,histopathology,,dermoscopic,IL_9294619,False,600,450,female
11718,ISIC_0036063,MILK study team,CC-BY-NC,,,,False,Benign,Benign epidermal proliferations,Pigmented benign keratosis,single image expert consensus,,dermoscopic,IL_6088695,False,600,450,


In [9]:
def lbl(col: str) -> str:
    """Return the caption label for a metadata column.

    Columns listed in ``LABELS`` use their hand-written label. Any other
    column gets a label derived from its name: underscores become spaces
    and the result is title-cased.
    """
    derived = col.replace("_", " ").title()
    return LABELS[col] if col in LABELS else derived

def norm_bool(x):
    """Map a boolean-like value to "Yes"/"No", or return None.

    The value is converted to text, stripped and lower-cased, then looked
    up in ``YESNO``. Values with no entry there yield ``None``.
    """
    key = str(x).strip().lower()
    if key in YESNO:
        return YESNO[key]
    return None

def fmt(col, v):
    """Render one metadata value as caption text.

    The rules are tried in this order:

    1. Missing values (as judged by ``pd.isna``) become ``"Unknown"``.
    2. Values that ``norm_bool`` recognises become ``"Yes"`` / ``"No"``.
    3. Values ``float()`` can parse are printed as numbers, without a
       fractional part when it is zero (``45.0`` -> ``"45"``).
    4. In the ``"sex"`` column, ``male``/``m`` and ``female``/``f``
       (any case, surrounding whitespace ignored) become ``"Male"`` /
       ``"Female"``.
    5. Anything else is returned as ``str(v)``.

    Note: because rule 2 runs before rule 3, a value whose text is exactly
    ``"1"`` or ``"0"`` is rendered as Yes/No in every column.

    Args:
        col: Name of the column the value comes from.
        v: The raw cell value.

    Returns:
        The caption text for the value.
    """
    if pd.isna(v):
        return "Unknown"

    as_bool = norm_bool(v)
    if as_bool is not None:
        return as_bool

    # Only a failed numeric conversion is tolerated here; a bare ``except``
    # would also hide KeyboardInterrupt/SystemExit and genuine bugs.
    try:
        f = float(v)
    except (TypeError, ValueError, OverflowError):
        pass
    else:
        return str(int(f)) if f.is_integer() else str(f)

    if col == "sex":
        s = str(v).strip().lower()
        if s in {"male", "m"}:
            return "Male"
        if s in {"female", "f"}:
            return "Female"
    return str(v)

# Full path of every image: <HAM_DIR>/<isic_id><IMG_EXT>.
df["img_path"] = HAM_DIR.as_posix() + "/" + df["isic_id"].astype(str) + IMG_EXT

# Columns this cell adds to df itself. Both must stay out of the caption:
# without "caption" in this set, running the cell a second time would fold
# the previous caption text into the new one.
derived_cols = {"img_path", "caption"}
cols_for_caption = [
    c for c in df.columns if c not in EXCLUDE and c not in derived_cols
]

# One caption per row: "Label: value" pairs joined by ", ", in column order.
df["caption"] = df.apply(
    lambda r: ", ".join(f"{lbl(c)}: {fmt(c, r[c])}" for c in cols_for_caption),
    axis=1
)

# Independent two-column copy that gets written to disk.
final = df[["img_path", "caption"]].copy()

final

Unnamed: 0,img_path,caption
0,/workspace/clip_xai/dermaCAP/HAM10000/ISIC_002...,"Age (approx): 45, Anatomic site (general): Unk..."
1,/workspace/clip_xai/dermaCAP/HAM10000/ISIC_002...,"Age (approx): 50, Anatomic site (general): low..."
2,/workspace/clip_xai/dermaCAP/HAM10000/ISIC_002...,"Age (approx): 55, Anatomic site (general): Unk..."
3,/workspace/clip_xai/dermaCAP/HAM10000/ISIC_002...,"Age (approx): 40, Anatomic site (general): Unk..."
4,/workspace/clip_xai/dermaCAP/HAM10000/ISIC_002...,"Age (approx): 60, Anatomic site (general): ant..."
...,...,...
11715,/workspace/clip_xai/dermaCAP/HAM10000/ISIC_003...,"Age (approx): Unknown, Anatomic site (general)..."
11716,/workspace/clip_xai/dermaCAP/HAM10000/ISIC_003...,"Age (approx): Unknown, Anatomic site (general)..."
11717,/workspace/clip_xai/dermaCAP/HAM10000/ISIC_003...,"Age (approx): 70, Anatomic site (general): hea..."
11718,/workspace/clip_xai/dermaCAP/HAM10000/ISIC_003...,"Age (approx): Unknown, Anatomic site (general)..."


In [10]:
# Write the prepared table as UTF-8 CSV without the index column, then report the destination and row count.
final.to_csv(path_or_buf=OUT_CSV, encoding="utf-8", index=False)
print(f"OK → {OUT_CSV} | rows: {len(final)}")

OK → /workspace/clip_xai/dermaCAP/ham10000_prepared.csv | rows: 11720
