# Stanford AIMI Datasets preparation

Запросите доступ к датасетам MRA-MIDAS, DDI и DDI2 на [сайте Stanford AIMI](https://stanfordaimi.azurewebsites.net/).

Для регистрации потребуется указать свои персональные данные. Датасеты скачиваются утилитой AzCopy; установить её можно так:

```bash
# Download AzCopy v10 and install it as /usr/local/bin/azcopy.
# Writing to /usr/local/bin typically requires root; run under sudo if needed.
# -f makes curl exit non-zero on an HTTP error instead of saving the error page
# as azcopy.tar.gz; -S still prints the error while -s hides the progress bar.
curl -fsSL https://aka.ms/downloadazcopy-v10-linux -o azcopy.tar.gz \
&& tar -xf azcopy.tar.gz \
&& mkdir -p /usr/local/bin \
&& install -m 0755 azcopy_linux_amd64_*/azcopy /usr/local/bin/azcopy \
&& rm -rf azcopy.tar.gz azcopy_linux_amd64_* \
&& azcopy --version
```

Примеры команд для загрузки датасетов (SAS-URL для каждого датасета получите на сайте AIMI; ссылка персональная и действует ограниченное время):

**MRA-MIDAS**

```bash
# Replace <YOUR_SAS_TOKEN> with the query string of the SAS URL issued to you on
# the AIMI site. A signed SAS URL is a time-limited credential: do not commit it.
SAS_URL='https://aimistanforddatasets01.blob.core.windows.net/midasmultimodalimagedatasetforaibasedskincancer?<YOUR_SAS_TOKEN>'
DEST='/workspace/clip_xai/dermaCAP'

mkdir -p "$DEST"
# The preparation code below expects the data under "$DEST/<container name>".
azcopy copy "$SAS_URL" "$DEST" --recursive=true
```

**DDI**

```bash
# Replace <YOUR_SAS_TOKEN> with the query string of the SAS URL issued to you on
# the AIMI site. A signed SAS URL is a time-limited credential: do not commit it.
SAS_URL='https://aimistanforddatasets01.blob.core.windows.net/ddidiversedermatologyimages?<YOUR_SAS_TOKEN>'
DEST='/workspace/clip_xai/dermaCAP'

mkdir -p "$DEST"
# The preparation code below expects the data under "$DEST/<container name>".
azcopy copy "$SAS_URL" "$DEST" --recursive=true
```

**DDI2**

```bash
# Replace <YOUR_SAS_TOKEN> with the query string of the SAS URL issued to you on
# the AIMI site. A signed SAS URL is a time-limited credential: do not commit it.
SAS_URL='https://aimistanforddatasets01.blob.core.windows.net/ddi2diversedermatologyimages2?<YOUR_SAS_TOKEN>'
DEST='/workspace/clip_xai/dermaCAP'

mkdir -p "$DEST"
# The preparation code below expects the data under "$DEST/<container name>".
azcopy copy "$SAS_URL" "$DEST" --recursive=true
```

## Preparation

In [2]:
from pathlib import Path
import pandas as pd

In [13]:
# Root folder holding every downloaded AIMI container, plus the merged output CSV.
BASE_DIR = Path("/workspace/clip_xai/dermaCAP")
OUT_CSV = BASE_DIR.joinpath("aimi_prepared.csv")

# --- DDI2: metadata spreadsheet and image folder ---
DDI2_DIR = BASE_DIR.joinpath("ddi2diversedermatologyimages2")
DDI2_XLSX = DDI2_DIR.joinpath("final_DDI2_Asian_spreadsheet.xlsx")
DDI2_IMG_DIR = DDI2_DIR.joinpath("Final DDI2 Asian Photos (no metadata)")
# Technical identifier columns that must not appear in captions.
DDI2_TECH_COLS = {
    "deidentified_patient_id",
    "photo_id",
}
DDI2_SOURCE = "ddi2"

# --- DDI: metadata CSV ---
DDI_DIR = BASE_DIR.joinpath("ddidiversedermatologyimages")
DDI_META_CSV = DDI_DIR.joinpath("ddi_metadata.csv")
# Technical identifier columns that must not appear in captions.
DDI_TECH_COLS = {
    "DDI_ID",
    "DDI_file",
    "Unnamed: 0",
}
DDI_SOURCE = "ddi"

# --- MRA-MIDAS: metadata spreadsheet ---
MIDAS_DIR = BASE_DIR.joinpath("midasmultimodalimagedatasetforaibasedskincancer")
MIDAS_XLSX = MIDAS_DIR.joinpath("release_midas.xlsx")
# Technical identifier columns that must not appear in captions.
MIDAS_TECH_COLS = {
    "midas_record_id",
    "midas_file_name",
    "Unnamed: 0",
}
MIDAS_SOURCE = "mramidas"

In [14]:
# Column name -> caption label, kept in one sub-table per source dataset and
# merged below in the order DDI2, DDI, MIDAS.
_DDI2_LABELS = {
    "fitzpatrick_skin_type": "Fitzpatrick skin type",
    "anatomical_site_detailed": "Anatomical site (detailed)",
    "anatomical_site_general": "Anatomical site (general)",
    "diagnosis_detailed": "Diagnosis (detailed)",
    "diagnosis_general": "Diagnosis (general)",
    "dermatological_presentation": "Dermatological presentation",
    "benign/malignant": "Benign/Malignant",
    "common/uncommon": "Common/Uncommon",
    "cropped?": "Cropped",
    "extra_markings": "Extra markings",
    "sex": "Sex",
    "race": "Race",
    "ethnicity": "Ethnicity",
    "self-described_ethnic_background": "Self-described ethnic background",
    "language": "Language",
    "interpreter_needed": "Interpreter needed",
    "more_than_1_photo_for_this_patient": "More than 1 photo for this patient",
}
_DDI_LABELS = {
    "skin_tone": "Skin tone",
    "malignant": "Malignant",
    "disease": "Disease",
}
_MIDAS_LABELS = {
    "midas_iscontrol": "Is control",
    "midas_distance": "Distance",
    "midas_location": "Location",
    "midas_path": "Path (coarse)",
    "midas_pathreport": "Pathology report",
    "midas_gender": "Gender",
    "midas_age": "Age",
    "midas_fitzpatrick": "Fitzpatrick",
    "midas_melanoma": "Melanoma",
    "midas_ethnicity": "Ethnicity",
    "midas_race": "Race",
    "clinical_impression_1": "Clinical impression 1",
    "clinical_impression_2": "Clinical impression 2",
    "clinical_impression_3": "Clinical impression 3",
    "length_(mm)": "Length (mm)",
    "width_(mm)": "Width (mm)",
}
LABELS = {**_DDI2_LABELS, **_DDI_LABELS, **_MIDAS_LABELS}

# Lower-cased cell text that is rendered as "Yes" / "No" in captions.
_YES_TOKENS = ("y", "yes", "true", "1")
_NO_TOKENS = ("n", "no", "false", "0")
YESNO_VALUES = {
    **dict.fromkeys(_YES_TOKENS, "Yes"),
    **dict.fromkeys(_NO_TOKENS, "No"),
}

# Fitzpatrick skin type number (1-6) -> Roman numeral.
ROMAN_FTZ = dict(enumerate(("I", "II", "III", "IV", "V", "VI"), start=1))

In [15]:
def _label(col: str) -> str:
    """Return the caption label for a metadata column.

    Columns present in ``LABELS`` use the label stored there; any other column
    name has its underscores replaced by spaces and is title-cased.
    """
    fallback = col.replace("_", " ").title()
    if col in LABELS:
        return LABELS[col]
    return fallback

def _fmt_value(col: str, v):
    """Render one metadata cell as caption text.

    Rules, applied in this order:

    1. Missing values (``pd.isna``) become ``"Unknown"``.
    2. In the ``fitzpatrick_skin_type`` column, numeric values are truncated to
       an integer and mapped through ``ROMAN_FTZ``; numbers outside that table
       are returned as the stripped original text.
    3. Text found (case-insensitively) in ``YESNO_VALUES`` becomes ``"Yes"`` or
       ``"No"``.
    4. Any other numeric text is printed without a trailing ``.0`` when it is a
       whole number, otherwise as ``str(float(...))``.
    5. Everything else is returned as the stripped string.

    Args:
        col: Name of the column the value comes from.
        v: Raw cell value.

    Returns:
        The formatted string.
    """
    if pd.isna(v):
        return "Unknown"
    s = str(v).strip()
    ls = s.lower()

    # The Fitzpatrick column is handled BEFORE the yes/no lookup: YESNO_VALUES
    # contains "0" and "1", so an integer skin type 1 would otherwise be
    # rendered as "Yes" instead of "I".
    if col == "fitzpatrick_skin_type":
        try:
            n = int(float(s))
        except (ValueError, OverflowError):
            # Not a finite number: keep the text, still honouring yes/no tokens.
            return YESNO_VALUES.get(ls, s)
        return ROMAN_FTZ.get(n, s)

    # NOTE(review): this also turns a literal 0/1 in any other column (e.g. an
    # age or a size in mm) into "No"/"Yes" - confirm that is intended.
    if ls in YESNO_VALUES:
        return YESNO_VALUES[ls]

    try:
        f = float(s)
    except ValueError:
        return s
    if f.is_integer():
        return str(int(f))
    return str(f)

def build_caption(df: pd.DataFrame, exclude: set) -> pd.Series:
    """Build one caption string per row of ``df``.

    Every column not listed in ``exclude`` contributes a ``"<label>: <value>"``
    fragment (via ``_label`` and ``_fmt_value``); the fragments of a row are
    joined with ``", "`` in column order.

    Args:
        df: Metadata table.
        exclude: Column names to leave out of the caption.

    Returns:
        The result of the row-wise ``DataFrame.apply``: one caption per row.
    """
    kept = [name for name in df.columns if name not in exclude]

    def _row_caption(row) -> str:
        fragments = [f"{_label(name)}: {_fmt_value(name, row[name])}" for name in kept]
        return ", ".join(fragments)

    # Row-wise apply is kept on purpose: values are read from the row object
    # that pandas builds for each record.
    return df.apply(_row_caption, axis=1)

In [16]:
# Load the DDI2 spreadsheet and derive, in this order, the image path, the
# caption and the source tag for every row.
ddi2 = pd.read_excel(DDI2_XLSX).assign(
    # Image path is "<photo folder>/<photo_id>.jpg".
    img_path=lambda d: DDI2_IMG_DIR.as_posix() + "/" + (d["photo_id"].astype(str) + ".jpg"),
    # img_path already exists at this point, so it is excluded with the ID columns.
    caption=lambda d: build_caption(d, exclude=DDI2_TECH_COLS.union({"img_path"})),
    source=DDI2_SOURCE,
)
ddi2_final = ddi2.loc[:, ["img_path", "caption", "source"]]

ddi2_final

Unnamed: 0,img_path,caption,source
0,/workspace/clip_xai/dermaCAP/ddi2diversedermat...,"More than 1 photo for this patient: No, Fitzpa...",ddi2
1,/workspace/clip_xai/dermaCAP/ddi2diversedermat...,"More than 1 photo for this patient: No, Fitzpa...",ddi2
2,/workspace/clip_xai/dermaCAP/ddi2diversedermat...,"More than 1 photo for this patient: No, Fitzpa...",ddi2
3,/workspace/clip_xai/dermaCAP/ddi2diversedermat...,"More than 1 photo for this patient: No, Fitzpa...",ddi2
4,/workspace/clip_xai/dermaCAP/ddi2diversedermat...,"More than 1 photo for this patient: No, Fitzpa...",ddi2
...,...,...,...
660,/workspace/clip_xai/dermaCAP/ddi2diversedermat...,"More than 1 photo for this patient: No, Fitzpa...",ddi2
661,/workspace/clip_xai/dermaCAP/ddi2diversedermat...,"More than 1 photo for this patient: Yes, Fitzp...",ddi2
662,/workspace/clip_xai/dermaCAP/ddi2diversedermat...,"More than 1 photo for this patient: Yes, Fitzp...",ddi2
663,/workspace/clip_xai/dermaCAP/ddi2diversedermat...,"More than 1 photo for this patient: No, Fitzpa...",ddi2


In [17]:
# Load the DDI metadata CSV and derive, in this order, the image path, the
# caption and the source tag for every row.
ddi = pd.read_csv(DDI_META_CSV).assign(
    # Image path is "<DDI folder>/<DDI_file>".
    img_path=lambda d: DDI_DIR.as_posix() + "/" + d["DDI_file"].astype(str),
    # img_path already exists at this point, so it is excluded with the ID columns.
    caption=lambda d: build_caption(d, exclude=DDI_TECH_COLS.union({"img_path"})),
    source=DDI_SOURCE,
)
ddi_final = ddi.loc[:, ["img_path", "caption", "source"]]

ddi_final

Unnamed: 0,img_path,caption,source
0,/workspace/clip_xai/dermaCAP/ddidiversedermato...,"Skin tone: 56, Malignant: Yes, Disease: melano...",ddi
1,/workspace/clip_xai/dermaCAP/ddidiversedermato...,"Skin tone: 56, Malignant: Yes, Disease: melano...",ddi
2,/workspace/clip_xai/dermaCAP/ddidiversedermato...,"Skin tone: 56, Malignant: Yes, Disease: mycosi...",ddi
3,/workspace/clip_xai/dermaCAP/ddidiversedermato...,"Skin tone: 56, Malignant: Yes, Disease: squamo...",ddi
4,/workspace/clip_xai/dermaCAP/ddidiversedermato...,"Skin tone: 12, Malignant: Yes, Disease: basal-...",ddi
...,...,...,...
651,/workspace/clip_xai/dermaCAP/ddidiversedermato...,"Skin tone: 34, Malignant: No, Disease: pyogeni...",ddi
652,/workspace/clip_xai/dermaCAP/ddidiversedermato...,"Skin tone: 34, Malignant: No, Disease: melanoc...",ddi
653,/workspace/clip_xai/dermaCAP/ddidiversedermato...,"Skin tone: 34, Malignant: No, Disease: acral-m...",ddi
654,/workspace/clip_xai/dermaCAP/ddidiversedermato...,"Skin tone: 34, Malignant: Yes, Disease: squamo...",ddi


In [18]:
# Load the MRA-MIDAS spreadsheet and derive, in this order, the image path, the
# caption and the source tag for every row.
midas = pd.read_excel(MIDAS_XLSX).assign(
    # Image path is "<MIDAS folder>/<midas_file_name>".
    img_path=lambda d: MIDAS_DIR.as_posix() + "/" + d["midas_file_name"].astype(str),
    # img_path already exists at this point, so it is excluded with the ID columns.
    caption=lambda d: build_caption(d, exclude=MIDAS_TECH_COLS.union({"img_path"})),
    source=MIDAS_SOURCE,
)
midas_final = midas.loc[:, ["img_path", "caption", "source"]]

midas_final

Unnamed: 0,img_path,caption,source
0,/workspace/clip_xai/dermaCAP/midasmultimodalim...,"Is control: No, Distance: 1ft, Location: chest...",mramidas
1,/workspace/clip_xai/dermaCAP/midasmultimodalim...,"Is control: No, Distance: 6in, Location: chest...",mramidas
2,/workspace/clip_xai/dermaCAP/midasmultimodalim...,"Is control: No, Distance: dscope, Location: ch...",mramidas
3,/workspace/clip_xai/dermaCAP/midasmultimodalim...,"Is control: No, Distance: 1ft, Location: l low...",mramidas
4,/workspace/clip_xai/dermaCAP/midasmultimodalim...,"Is control: No, Distance: dscope, Location: l ...",mramidas
...,...,...,...
3411,/workspace/clip_xai/dermaCAP/midasmultimodalim...,"Is control: Yes, Distance: n/a - virtual, Loca...",mramidas
3412,/workspace/clip_xai/dermaCAP/midasmultimodalim...,"Is control: Yes, Distance: n/a - virtual, Loca...",mramidas
3413,/workspace/clip_xai/dermaCAP/midasmultimodalim...,"Is control: No, Distance: dscope, Location: lo...",mramidas
3414,/workspace/clip_xai/dermaCAP/midasmultimodalim...,"Is control: No, Distance: 6in, Location: lower...",mramidas


In [19]:
# Stack the three per-dataset tables into one frame with a fresh 0..N-1 index.
frames = [ddi2_final, ddi_final, midas_final]
aimi = pd.concat(frames, ignore_index=True)

# Create the destination folder if needed, then write the merged table.
output_dir = OUT_CSV.parent
output_dir.mkdir(parents=True, exist_ok=True)
aimi.to_csv(OUT_CSV, index=False, encoding="utf-8")
print(f"OK → {OUT_CSV} | rows: {len(aimi)}")

OK → /workspace/clip_xai/dermaCAP/aimi_prepared.csv | rows: 4737
