In [1]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset

In [None]:
def link_all_mimic_data(report_folder, chexpert_csv, metadata_csv, views=("PA",), load_text=True):
    """Link MIMIC-CXR image metadata, CheXpert labels and report files into one table.

    The metadata and label CSVs are inner-joined (``pd.merge`` default), the rows
    are filtered by view position, and every remaining row is matched to a report
    file located at ``<report_folder>/p<XX>/p<subject_id>/s<study_id>.txt`` where
    ``XX`` is the first two characters of the subject id. Rows whose report file
    does not exist on disk are dropped.

    Parameters
    ----------
    report_folder : str
        Root directory that contains the ``pXX`` report sub-folders.
    chexpert_csv : str
        Path to the label CSV. Must contain ``study_id``; ``subject_id`` is used
        as an additional join key when present.
    metadata_csv : str
        Path to the metadata CSV. Must contain ``subject_id``, ``study_id``,
        ``dicom_id``, ``ViewPosition`` and ``StudyDate``.
    views : str or iterable of str, default ("PA",)
        View positions to keep. A single string is treated as a one-element
        list. If ``"*"`` is among the values, no view filtering is applied.
        Missing view positions are labelled ``"UNKNOWN"``.
    load_text : bool, default True
        When true, the content of each report file is read into a
        ``report_text`` column (empty string if the file cannot be read or
        decoded as UTF-8).

    Returns
    -------
    pandas.DataFrame
        Columns ``subject_id``, ``study_id``, ``dicom_id``, ``view``,
        ``StudyDate``, ``report_path``, optionally ``report_text``, followed by
        whichever pathology label columns are present in the merged data.

    Raises
    ------
    KeyError
        If a required column is missing from the input CSVs.
    """
    # 1. Define primary observation labels
    # NOTE(review): "No Finding" is not in this list, so that column is never
    # part of the returned frame — confirm this is intended.
    pathologies = [
        "Enlarged Cardiomediastinum", "Cardiomegaly", "Lung Lesion",
        "Lung Opacity", "Edema", "Consolidation", "Pneumonia", "Atelectasis",
        "Pneumothorax", "Pleural Effusion", "Pleural Other", "Fracture", "Support Devices"
    ]

    # Normalise `views`: a bare string would otherwise be rejected by
    # Series.isin, and copying into a list keeps the caller's object untouched.
    if isinstance(views, str):
        views = [views]
    else:
        views = list(views)

    # 2. Load input CSV files
    df_chexpert = pd.read_csv(chexpert_csv)
    df_meta = pd.read_csv(metadata_csv)

    # 3. Merge metadata and chexpert labels
    # When the label file has no subject_id, joining on study_id alone leaves
    # the metadata's subject_id column un-suffixed, so no renaming is needed.
    if "subject_id" in df_chexpert.columns:
        merge_keys = ["subject_id", "study_id"]
    else:
        merge_keys = ["study_id"]
    df = pd.merge(df_meta, df_chexpert, on=merge_keys)

    # 4. Filter by view position (e.g., PA only)
    df["view"] = df["ViewPosition"].fillna("UNKNOWN")
    if "*" not in views:
        # .copy() so the column assignments below operate on an independent frame.
        df = df[df["view"].isin(views)].copy()

    # 5. Construct report path for each study
    def id_to_str(value):
        # An id column parsed as float (e.g. because of missing values) would
        # otherwise render as "10000032.0" and never match a folder name.
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    def build_report_path(subject_id, study_id):
        sid = id_to_str(subject_id)
        stid = id_to_str(study_id)
        full_path = os.path.join(report_folder, "p" + sid[:2], "p" + sid, "s" + stid + ".txt")
        return full_path if os.path.exists(full_path) else None

    # Iterate the two id columns directly instead of df.apply(axis=1): this
    # keeps each column's own dtype and also works when the frame is empty.
    report_paths = [
        build_report_path(subject_id, study_id)
        for subject_id, study_id in zip(df["subject_id"], df["study_id"])
    ]
    df["report_path"] = pd.Series(report_paths, index=df.index, dtype=object)
    df = df[df["report_path"].notna()].reset_index(drop=True)

    # 6. (Optional) Load report text
    if load_text:
        def read_text(path):
            # Best effort: an unreadable or non-UTF-8 report yields an empty
            # string, but unrelated exceptions (e.g. KeyboardInterrupt) propagate.
            try:
                with open(path, "r", encoding="utf-8") as f:
                    return f.read()
            except (OSError, UnicodeDecodeError):
                return ""
        df["report_text"] = pd.Series(
            [read_text(path) for path in df["report_path"]], index=df.index, dtype=object
        )

    # 7. Keep core columns
    base_cols = ["subject_id", "study_id", "dicom_id", "view", "StudyDate", "report_path"]
    if load_text:
        base_cols.append("report_text")
    # Label columns keep the order in which they appear in the merged frame.
    label_cols = [col for col in df.columns if col in pathologies]

    df_final = df[base_cols + label_cols].copy()
    return df_final

In [None]:
# Input locations: report tree plus the CheXpert label and metadata tables.
report_root = "mimic-cxr-reports/files"
labels_csv_path = "mimic-cxr-2.0.0-chexpert.csv"
meta_csv_path = "mimic-cxr-2.0.0-metadata.csv"

# Destination of the linked table.
linked_csv_path = "mimic_task2_linked_data_Full.csv"

# Build the linked table restricted to PA views, with report text loaded.
df_linked = link_all_mimic_data(
    report_folder=report_root,
    chexpert_csv=labels_csv_path,
    metadata_csv=meta_csv_path,
    views=["PA"],
    load_text=True,
)

# Persist without the positional index column.
df_linked.to_csv(linked_csv_path, index=False)