In [None]:
import os
import random
from typing import Dict, List

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
import tensorflow as tf
from tqdm import tqdm

In [None]:
def load_embedding(embedding_path):
    """Read the image embedding stored in the first record of a TFRecord file.

    Args:
        embedding_path: Path to a TFRecord file whose first record is a
            serialized ``tf.train.Example`` carrying a float-list feature
            named ``"embedding"``.

    Returns:
        A 1-D ``torch.Tensor`` with the values of the ``"embedding"`` feature.

    Raises:
        ValueError: If the file contains no records.
    """
    raw_dataset = tf.data.TFRecordDataset([embedding_path])

    # Only the first record is read; any further records are ignored.
    raw_record = next(iter(raw_dataset.take(1)), None)
    if raw_record is None:
        # Without this guard an empty file surfaced as an UnboundLocalError.
        raise ValueError(f"No records found in TFRecord file: {embedding_path}")

    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    embedding_values = example.features.feature["embedding"].float_list.value
    return torch.tensor(embedding_values)

In [None]:
class MIMIC_Embed_Dataset(Dataset):
    """MIMIC-CXR samples served as precomputed image embeddings plus report text.

    The CheXpert label CSV is joined with the metadata CSV on
    ``(subject_id, study_id)``, filtered by view position, optionally reduced
    to one row per patient, and finally cut into a positional
    train/valid/test split. Each item is a dict with the keys ``idx``,
    ``lab``, ``embedding`` and ``report_text``.
    """

    # Candidate label columns; columns missing from the CSV are skipped when
    # building ``self.labels``, so the label matrix can be narrower than this list.
    pathologies = [
        "Enlarged Cardiomediastinum",
        "Cardiomegaly",
        "Lung Opacity",
        "Lung Lesion",
        "Edema",
        "Consolidation",
        "Pneumonia",
        "Atelectasis",
        "Pneumothorax",
        "Pleural Effusion",
        "Pleural Other",
        "Fracture",
        "Support Devices",
    ]

    # Fractions of the rows used for train and valid; test receives the remainder.
    split_ratio = [0.8, 0.1, 0.1]

    def __init__(
        self,
        embedpath,
        csvpath,
        metacsvpath,
        report_folder=None,
        views=("PA",),
        data_aug=None,
        seed=0,
        unique_patients=True,
        mode="train",
    ):
        """Load, join, filter and split the label/metadata tables.

        Args:
            embedpath: Root folder of the ``.tfrecord`` embedding files.
            csvpath: Path to the CheXpert label CSV.
            metacsvpath: Path to the metadata CSV (provides ``dicom_id``,
                ``ViewPosition`` and ``StudyDate``).
            report_folder: Root folder of the report ``.txt`` files, or a
                falsy value to return empty report text.
            views: A view position or a list/tuple/set of them; ``"*"``
                keeps every view.
            data_aug: Stored on the instance; not used by this class.
            seed: Seed passed to ``np.random.seed``.
            unique_patients: Keep only the first row of each ``subject_id``.
            mode: One of ``"train"``, ``"valid"``, ``"test"``, ``"all"`` or ``None``.

        Raises:
            ValueError: If ``mode`` is not one of the accepted values.
        """
        super().__init__()

        # Fail fast, before any file is read.
        if mode not in ("train", "valid", "test", "all", None):
            raise ValueError(f"attr:mode must be one of ['train', 'valid', 'test', 'all', None], but got {mode}")

        # Seeds NumPy's global RNG; nothing in this class draws random numbers itself.
        np.random.seed(seed)

        self.pathologies = sorted(self.pathologies)
        self.mode = mode
        self.embedpath = embedpath
        self.report_folder = report_folder
        self.data_aug = data_aug
        self.csv = pd.read_csv(csvpath)
        self.metacsv = pd.read_csv(metacsvpath)

        self.csv = self.csv.set_index(["subject_id", "study_id"])
        self.metacsv = self.metacsv.set_index(["subject_id", "study_id"])
        self.csv = self.csv.join(self.metacsv).reset_index()

        self.csv["view"] = self.csv["ViewPosition"]
        self.limit_to_selected_views(views)

        if unique_patients:
            # groupby(...).first() returns the first *non-null* value of every
            # column, which can stitch labels from different studies into one
            # row. Keep the genuine first row of each patient instead, ordered
            # by subject_id as before.
            self.csv = (
                self.csv.sort_values("subject_id", kind="stable")
                .drop_duplicates("subject_id", keep="first")
                .reset_index(drop=True)
            )

        n_row = self.csv.shape[0]
        train_end = int(n_row * self.split_ratio[0])
        valid_end = int(n_row * (self.split_ratio[0] + self.split_ratio[1]))
        if self.mode == "train":
            self.csv = self.csv.iloc[:train_end]
        elif self.mode == "valid":
            self.csv = self.csv.iloc[train_end:valid_end]
        elif self.mode == "test":
            # Everything after the validation block. A negative-count slice
            # would select the whole frame whenever the count rounds to zero.
            self.csv = self.csv.iloc[valid_end:]

        # Work on an independent frame so the assignments below never write
        # into a slice of a larger table.
        self.csv = self.csv.copy()

        healthy = self.csv["No Finding"] == 1
        label_columns = []
        for pathology in self.pathologies:
            if pathology in self.csv.columns:
                # "No Finding" implies every pathology is negative.
                self.csv.loc[healthy, pathology] = 0
                label_columns.append(self.csv[pathology].to_numpy())
        if label_columns:
            self.labels = np.stack(label_columns, axis=1).astype(np.float32)
        else:
            self.labels = np.empty((len(self.csv), 0), dtype=np.float32)
        # -1 entries are treated as missing.
        self.labels[self.labels == -1] = np.nan

        self.pathologies = list(np.char.replace(self.pathologies, "Pleural Effusion", "Effusion"))
        self.csv["offset_day_int"] = self.csv["StudyDate"]
        self.csv["patientid"] = self.csv["subject_id"].astype(str)

    def __len__(self):
        """Return the number of samples (rows of the label matrix)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at position ``idx``.

        Returns:
            A dict with ``idx``, ``lab`` (label row), ``embedding`` (result of
            ``load_embedding``) and ``report_text`` (empty string when no
            report folder is set or the report file does not exist).
        """
        sample = {"idx": idx, "lab": self.labels[idx]}

        # Read each id from its own column: a whole-row lookup can upcast
        # integer ids to float when every column is numeric.
        subjectid = str(self.csv["subject_id"].iloc[idx])
        studyid = str(self.csv["study_id"].iloc[idx])
        dicom_id = str(self.csv["dicom_id"].iloc[idx])

        # Load image embedding
        embed_file = os.path.join(
            self.embedpath,
            "p" + subjectid[:2],
            "p" + subjectid,
            "s" + studyid,
            dicom_id + ".tfrecord",
        )
        sample["embedding"] = load_embedding(embed_file)

        # Load report text
        sample["report_text"] = ""
        if self.report_folder:
            report_file = os.path.join(
                self.report_folder,
                "p" + subjectid[:2],
                "p" + subjectid,
                "s" + studyid + ".txt",
            )
            try:
                with open(report_file, "r", encoding="utf-8") as f:
                    sample["report_text"] = f.read()
            except FileNotFoundError:
                # A missing report is tolerated and reported as empty text.
                pass

        return sample

    def limit_to_selected_views(self, views):
        """Keep only rows whose ``view`` is in ``views``.

        ``views`` may be a single value or a list/tuple/set of values. If it
        contains ``"*"`` no filtering is done. Missing view values are
        replaced by ``"UNKNOWN"`` before filtering.
        """
        if isinstance(views, (list, tuple, set, frozenset)):
            # Copy so the caller's (or the default) container is never shared.
            views = list(views)
        else:
            views = [views]
        if "*" in views:
            views = ["*"]
        self.views = views
        self.csv["view"] = self.csv["view"].fillna("UNKNOWN")
        if "*" not in views:
            self.csv = self.csv[self.csv["view"].isin(self.views)]

    def string(self):
        """Return a one-line summary of the dataset configuration."""
        return f"{self.__class__.__name__} mode={self.mode} num_samples={len(self)} views={self.views}"

In [None]:
# Input locations, relative to the working directory: embedding files,
# CheXpert label CSV, metadata CSV and the report text folder.
embedpath = "generalized-image-embeddings-for-the-mimic-chest-x-ray-dataset-1.0/files"
csvpath = "mimic-cxr-2.0.0-chexpert.csv"
metacsvpath = "mimic-cxr-2.0.0-metadata.csv"
report_folder = "mimic-cxr-reports/files"

# mode="all" skips the train/valid/test split; every other option keeps its default.
dataset = MIMIC_Embed_Dataset(
    embedpath,
    csvpath,
    metacsvpath,
    report_folder=report_folder,
    mode="all",
)

In [None]:
# Sanity check: sample count and label matrix shape (num_samples, num_labels).
num_samples = len(dataset)
label_shape = dataset.labels.shape
print("Number of samples:", num_samples)
print("Label shape:", label_shape)

Number of samples: 45628
Label shape: (45628, 13)


In [None]:
# Fetch the first sample and look at what it contains.
sample = dataset[0]
first_embedding, first_report = sample["embedding"], sample["report_text"]
print("Keys in sample:", sample.keys())
print("Embedding type:", type(first_embedding))
print("Report text length:", len(first_report))

Keys in sample: dict_keys(['idx', 'lab', 'embedding', 'report_text'])
Embedding type: <class 'torch.Tensor'>
Report text length: 644


In [None]:
# Spot-check the first three samples: embedding shape and report length.
i = 0
while i < 3:
    sample = dataset[i]
    print(f"Sample {i}: embedding shape = {sample['embedding'].shape}, report length = {len(sample['report_text'])}")
    i += 1
i = 2  # leave the loop variable at the value a `for i in range(3)` loop would

Sample 0: embedding shape = torch.Size([1376]), report length = 644
Sample 1: embedding shape = torch.Size([1376]), report length = 489
Sample 2: embedding shape = torch.Size([1376]), report length = 675


In [None]:
# Collect every sample in memory so the whole dataset can be written to one archive.
embeddings, labels, texts = [], [], []

for i in tqdm(range(len(dataset)), desc="Processing dataset for saving"):
    sample = dataset[i]

    embeddings.append(sample["embedding"].numpy())
    labels.append(sample["lab"])
    texts.append(sample["report_text"])

# Convert to arrays
embeddings = np.array(embeddings)
labels = np.array(labels)
texts = np.array(texts, dtype=object)  # reports differ in length, so keep them as Python objects

# The id columns are already in dataset order, so take them straight from the
# frame instead of building a row Series for every sample inside the loop.
subject_ids = dataset.csv["subject_id"].to_numpy(copy=True)
study_ids = dataset.csv["study_id"].to_numpy(copy=True)

# Save as a compressed .npz file
np.savez_compressed(
    "mimic_multimodal_dataset.npz",
    embeddings=embeddings,
    labels=labels,
    texts=texts,
    subject_ids=subject_ids,
    study_ids=study_ids
)
print("Multimodal data saved to mimic_multimodal_dataset.npz")

Processing dataset for saving: 100%|██████████| 45628/45628 [20:30<00:00, 37.07it/s]


Multimodal data saved to mimic_multimodal_dataset.npz


In [None]:
# Reload the archive. allow_pickle=True is needed because `texts` was stored as
# an object array; unpickling can execute arbitrary code, so only load
# archives you created yourself.
# The context manager closes the underlying file once the arrays are read.
with np.load("mimic_multimodal_dataset.npz", allow_pickle=True) as data:
    embeddings = data["embeddings"]
    labels = data["labels"]
    texts = data["texts"]
    subject_ids = data["subject_ids"]
    study_ids = data["study_ids"]

In [None]:
# Build the report table from the `subject_ids`, `study_ids` and `texts`
# arrays that are already in memory. Iterating over `dataset` again would
# re-read every embedding file only to throw the embeddings away.
df_text = pd.DataFrame(
    {
        "subject_id": subject_ids,
        "study_id": study_ids,
        "report_text": texts,
    }
)

# Save as CSV file
df_text.to_csv("mimic_report_text_only.csv", index=False)

Saving report text: 100%|██████████| 45628/45628 [25:38<00:00, 29.65it/s]  


In [None]:
# 1. Read the report text table and the CheXpert label table
df_text = pd.read_csv("mimic_report_text_only.csv")
df_chexpert = pd.read_csv("mimic-cxr-2.0.0-chexpert.csv")

# 2. Reduce the CheXpert table to the join keys plus the label columns
key_columns = ["subject_id", "study_id"]
label_columns = [
    "Enlarged Cardiomediastinum",
    "Cardiomegaly",
    "Lung Opacity",
    "Lung Lesion",
    "Edema",
    "Consolidation",
    "Pneumonia",
    "Atelectasis",
    "Pneumothorax",
    "Pleural Effusion",
    "Pleural Other",
    "Fracture",
    "Support Devices",
]
df_labels = df_chexpert[key_columns + label_columns]

# 3. Inner join on subject_id + study_id: only studies present in both tables survive
df_merged = df_text.merge(df_labels, on=key_columns, how="inner")

# 4. Write the linked table to a new CSV file and report its size
df_merged.to_csv("mimic_task2_linked_data_half.csv", index=False)
print(" mimic_task2_linked_data_half.csv generated with {} records".format(len(df_merged)))

 mimic_task2_linked_data_half.csv generated with 45628 records
