# PAD-UFES-20 preparation

In [9]:
from pathlib import Path
import requests, zipfile, pandas as pd, io
import os
import shutil

In [2]:
# URL of the zip archive that the download cell fetches and extracts into BASE_DIR.
PAD_ZIP_URL   = "https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/zr7vgbcyr2-1.zip"

In [20]:
# Directory the downloaded archive is extracted into (created if missing by the download cell).
BASE_DIR      = Path("/workspace/clip_xai/dermaCAP/pad-ufes-20")
# Destination of the final (img_path, caption) CSV written by the export cell.
OUT_CSV       = Path("/workspace/clip_xai/dermaCAP") / "pad_ufes_prepared.csv"

# File name of the metadata table, looked up inside BASE_DIR.
META_NAME     = "metadata.csv"
# Sub-directory of BASE_DIR that is searched for `imgs_part_*` archives and used as the image path prefix.
IMAGES_DIRNAME= "images"
# Metadata column the image path is derived from.
KEY_IMAGE_COL = "img_id"

In [None]:
# Download the dataset archive, unpack it into BASE_DIR, then unpack the nested
# `imgs_part_*.zip` archives found in the images directory.
BASE_DIR.mkdir(parents=True, exist_ok=True)

# `stream=True` was dropped: `.content` reads the whole body into memory anyway.
# The timeout is (connect, read) in seconds so a stalled connection cannot hang the cell.
response = requests.get(PAD_ZIP_URL, timeout=(10, 300))
# Fail loudly on HTTP errors instead of handing an error page to zipfile,
# which would surface as a confusing BadZipFile.
response.raise_for_status()
zip_bytes = response.content
with zipfile.ZipFile(io.BytesIO(zip_bytes), "r") as zf:
    zf.extractall(BASE_DIR)

images_dir = BASE_DIR / IMAGES_DIRNAME
# Materialise the listing first: archives are deleted from the very directory
# being scanned, so a lazy glob would be iterating a changing directory.
for z in sorted(images_dir.glob("imgs_part_*.zip")):
    with zipfile.ZipFile(z, "r") as part:
        part.extractall(images_dir)
    # The nested archive is no longer needed once its contents are on disk.
    z.unlink()

In [None]:
# Flatten the extracted `imgs_part_*` folders: move every regular file up into
# `images_dir`, then remove the part folder itself.
for part_dir in images_dir.glob("imgs_part_*"):
    if not part_dir.is_dir():
        continue

    for entry in part_dir.glob("*"):
        if not entry.is_file():
            continue
        shutil.move(str(entry), str(images_dir / entry.name))

    # rmdir only succeeds on an empty directory, so this raises if anything
    # other than regular files was left behind in the part folder.
    part_dir.rmdir()

In [None]:
# Load the metadata table and turn every image id into a path under `images_dir`.
meta_path = BASE_DIR / META_NAME
df = pd.read_csv(meta_path)

# Reduce the id to a bare file name:
#   1. trim surrounding whitespace;
#   2. remove leading "./" or "/" *prefixes*. The previous `.str.lstrip("./")`
#      stripped a character set, so it would also eat the leading dots of names
#      such as ".hidden.png" or "..foo.png";
#   3. remove an optional leading "image/" or "images/" directory component.
rel = (
    df[KEY_IMAGE_COL]
    .astype(str)
    .str.strip()
    .str.replace(r"^(?:\./|/)+", "", regex=True)
    .str.replace(r"^(images?/)?", "", regex=True)
)
df["img_path"] = images_dir.as_posix() + "/" + rel
# Drop the identifier columns; the image id column is referenced through
# KEY_IMAGE_COL so it stays in sync with the lookup above.
df = df.drop(columns=["patient_id", "lesion_id", KEY_IMAGE_COL])
df

Unnamed: 0,smoke,drink,background_father,background_mother,age,pesticide,gender,skin_cancer_history,cancer_history,has_piped_water,...,diameter_2,diagnostic,itch,grew,hurt,changed,bleed,elevation,biopsed,img_path
0,,,,,8,,,,,,...,,NEV,False,False,False,False,False,False,False,/workspace/clip_xai/dermaCAP/pad-ufes-20/image...
1,False,False,POMERANIA,POMERANIA,55,False,FEMALE,True,True,True,...,5.0,BCC,True,True,False,True,True,True,True,/workspace/clip_xai/dermaCAP/pad-ufes-20/image...
2,,,,,77,,,,,,...,,ACK,True,False,False,False,False,False,False,/workspace/clip_xai/dermaCAP/pad-ufes-20/image...
3,,,,,75,,,,,,...,,ACK,True,False,False,False,False,False,False,/workspace/clip_xai/dermaCAP/pad-ufes-20/image...
4,False,True,POMERANIA,POMERANIA,79,False,MALE,True,False,False,...,5.0,BCC,True,True,False,False,True,True,True,/workspace/clip_xai/dermaCAP/pad-ufes-20/image...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2293,,,,,73,,,,,,...,,ACK,True,False,False,False,False,False,False,/workspace/clip_xai/dermaCAP/pad-ufes-20/image...
2294,False,False,POMERANIA,POMERANIA,55,False,FEMALE,True,True,True,...,12.0,BCC,True,True,False,True,False,False,True,/workspace/clip_xai/dermaCAP/pad-ufes-20/image...
2295,,,,,74,,,,,,...,,SEK,False,False,False,False,False,False,False,/workspace/clip_xai/dermaCAP/pad-ufes-20/image...
2296,False,False,POMERANIA,POMERANIA,58,True,FEMALE,True,True,False,...,4.0,BCC,True,False,False,False,False,True,True,/workspace/clip_xai/dermaCAP/pad-ufes-20/image...


In [15]:
# Print, for every column, the first non-missing value as a quick sanity check
# (or a placeholder message when the whole column is empty).
for column_name in df.columns:
    example = "Все значения пустые"
    for candidate in df[column_name]:
        if pd.notna(candidate):
            example = candidate
            break
    print(f"Столбец: {column_name}, Пример: {example}")

Столбец: smoke, Пример: False
Столбец: drink, Пример: False
Столбец: background_father, Пример: POMERANIA
Столбец: background_mother, Пример: POMERANIA
Столбец: age, Пример: 8
Столбец: pesticide, Пример: False
Столбец: gender, Пример: FEMALE
Столбец: skin_cancer_history, Пример: True
Столбец: cancer_history, Пример: True
Столбец: has_piped_water, Пример: True
Столбец: has_sewage_system, Пример: True
Столбец: fitspatrick, Пример: 3.0
Столбец: region, Пример: ARM
Столбец: diameter_1, Пример: 6.0
Столбец: diameter_2, Пример: 5.0
Столбец: diagnostic, Пример: NEV
Столбец: itch, Пример: False
Столбец: grew, Пример: False
Столбец: hurt, Пример: False
Столбец: changed, Пример: False
Столбец: bleed, Пример: False
Столбец: elevation, Пример: False
Столбец: biopsed, Пример: False
Столбец: img_path, Пример: /workspace/clip_xai/dermaCAP/pad-ufes-20/images/PAT_1516_1765_530.png


In [16]:
# Human-readable caption label for each metadata column.
# Columns missing from this mapping fall back to their raw name at the call site.
LABELS = dict([
    ("smoke", "Smokes"),
    ("drink", "Alcohol use"),
    ("background_father", "Father background"),
    ("background_mother", "Mother background"),
    ("age", "Age"),
    ("pesticide", "Pesticide exposure"),
    ("gender", "Gender"),
    ("skin_cancer_history", "Skin cancer history"),
    ("cancer_history", "Cancer history (any)"),
    ("has_piped_water", "Piped water"),
    ("has_sewage_system", "Sewage system"),
    ("fitspatrick", "Fitzpatrick"),
    ("region", "Region"),
    ("diameter_1", "Diameter 1"),
    ("diameter_2", "Diameter 2"),
    ("diagnostic", "Diagnosis"),
    ("itch", "Itch"),
    ("grew", "Growth"),
    ("hurt", "Pain"),
    ("changed", "Change"),
    ("bleed", "Bleeding"),
    ("elevation", "Elevation"),
    ("biopsed", "Biopsied"),
])

# Columns whose values are rendered as Yes / No / Unknown.
BOOL_COLS = set((
    "smoke",
    "drink",
    "pesticide",
    "skin_cancer_history",
    "cancer_history",
    "has_piped_water",
    "has_sewage_system",
    "itch",
    "grew",
    "hurt",
    "changed",
    "bleed",
    "elevation",
    "biopsed",
))

# Numeric value (1-6) -> Roman numeral used when rendering the "fitspatrick" column.
ROMAN = dict(zip(range(1, 7), ("I", "II", "III", "IV", "V", "VI")))

def yesno(x):
    """Map a boolean-like value to ``"Yes"``, ``"No"`` or ``"Unknown"``.

    The value is compared by its stripped, lower-cased string form, so real
    booleans, 0/1 integers and their textual spellings are all accepted.
    Float-encoded flags (``1.0`` / ``0.0``) are recognised as well; without
    that they would be reported as "Unknown".

    Anything else (including missing values and markers such as "UNK")
    yields ``"Unknown"``.
    """
    s = str(x).strip().lower()
    if s in {"1", "1.0", "true", "yes", "y"}:
        return "Yes"
    if s in {"0", "0.0", "false", "no", "n"}:
        return "No"
    return "Unknown"

def _compact_number(v):
    """Return ``float(v)`` as an ``int`` when it is integral, otherwise as a ``float``."""
    f = float(v)
    return int(f) if f.is_integer() else f


def fmt(col, v):
    """Render one metadata value as caption text.

    Parameters
    ----------
    col : str
        Name of the metadata column the value comes from; selects the format.
    v : object
        The raw cell value.

    Returns
    -------
    str
        * ``"Unknown"`` for missing values (``pd.isna``);
        * Yes / No / Unknown for columns listed in ``BOOL_COLS``;
        * ``"Female"`` / ``"Male"`` for ``gender`` (other values pass through);
        * a Roman numeral for ``fitspatrick`` values found in ``ROMAN``;
        * a number without a trailing ``.0`` for ``age``;
        * the same with an ``" mm"`` suffix for ``diameter_1`` / ``diameter_2``;
        * ``str(v)`` otherwise (integral floats lose their ``.0``).

    Values that cannot be parsed as numbers fall back to ``str(v)``. Only
    conversion errors are caught, so interrupts such as ``KeyboardInterrupt``
    are no longer swallowed by a bare ``except``.
    """
    if pd.isna(v):
        return "Unknown"
    if col in BOOL_COLS:
        return yesno(v)
    if col == "gender":
        return {"FEMALE": "Female", "MALE": "Male"}.get(str(v).upper(), str(v))
    if col == "fitspatrick":
        try:
            n = int(float(v))
        except (TypeError, ValueError, OverflowError):
            # Not numeric (or infinite): keep the raw text.
            return str(v)
        return ROMAN.get(n, str(n))
    if col == "age":
        try:
            return str(_compact_number(v))
        except (TypeError, ValueError, OverflowError):
            return str(v)
    if col in {"diameter_1", "diameter_2"}:
        try:
            return f"{_compact_number(v)} mm"
        except (TypeError, ValueError, OverflowError):
            return str(v)
    if isinstance(v, float) and v.is_integer():
        return str(int(v))
    return str(v)

# Columns that feed the caption: everything except the image path and the
# caption itself. Excluding "caption" keeps this cell idempotent; without it
# a second run would embed the previous caption inside the new one.
cols = [c for c in df.columns if c not in {"img_path", "caption"}]

# One "Label: value" pair per column, comma-separated, built row by row.
df["caption"] = df.apply(
    lambda r: ", ".join(f"{LABELS.get(c, c)}: {fmt(c, r[c])}" for c in cols),
    axis=1
)

In [17]:
# Display the frame to inspect the result, including the `caption` column.
df

Unnamed: 0,smoke,drink,background_father,background_mother,age,pesticide,gender,skin_cancer_history,cancer_history,has_piped_water,...,diagnostic,itch,grew,hurt,changed,bleed,elevation,biopsed,img_path,caption
0,,,,,8,,,,,,...,NEV,False,False,False,False,False,False,False,/workspace/clip_xai/dermaCAP/pad-ufes-20/image...,"Smokes: Unknown, Alcohol use: Unknown, Father ..."
1,False,False,POMERANIA,POMERANIA,55,False,FEMALE,True,True,True,...,BCC,True,True,False,True,True,True,True,/workspace/clip_xai/dermaCAP/pad-ufes-20/image...,"Smokes: No, Alcohol use: No, Father background..."
2,,,,,77,,,,,,...,ACK,True,False,False,False,False,False,False,/workspace/clip_xai/dermaCAP/pad-ufes-20/image...,"Smokes: Unknown, Alcohol use: Unknown, Father ..."
3,,,,,75,,,,,,...,ACK,True,False,False,False,False,False,False,/workspace/clip_xai/dermaCAP/pad-ufes-20/image...,"Smokes: Unknown, Alcohol use: Unknown, Father ..."
4,False,True,POMERANIA,POMERANIA,79,False,MALE,True,False,False,...,BCC,True,True,False,False,True,True,True,/workspace/clip_xai/dermaCAP/pad-ufes-20/image...,"Smokes: No, Alcohol use: Yes, Father backgroun..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2293,,,,,73,,,,,,...,ACK,True,False,False,False,False,False,False,/workspace/clip_xai/dermaCAP/pad-ufes-20/image...,"Smokes: Unknown, Alcohol use: Unknown, Father ..."
2294,False,False,POMERANIA,POMERANIA,55,False,FEMALE,True,True,True,...,BCC,True,True,False,True,False,False,True,/workspace/clip_xai/dermaCAP/pad-ufes-20/image...,"Smokes: No, Alcohol use: No, Father background..."
2295,,,,,74,,,,,,...,SEK,False,False,False,False,False,False,False,/workspace/clip_xai/dermaCAP/pad-ufes-20/image...,"Smokes: Unknown, Alcohol use: Unknown, Father ..."
2296,False,False,POMERANIA,POMERANIA,58,True,FEMALE,True,True,False,...,BCC,True,False,False,False,False,True,True,/workspace/clip_xai/dermaCAP/pad-ufes-20/image...,"Smokes: No, Alcohol use: No, Father background..."


In [21]:
# Export only the image path and its caption; the index is not written.
final = df.loc[:, ["img_path", "caption"]].copy()
final.to_csv(
    OUT_CSV,
    encoding="utf-8",
    index=False,
)
print(f"OK → {OUT_CSV} | rows: {len(final)}")

OK → /workspace/clip_xai/dermaCAP/pad_ufes_prepared.csv | rows: 2298
