In [1]:
import os
import os.path as osp
from glob import glob
import re
from pprint import pprint
from tqdm import tqdm
import pydicom as dicom
import random

# Top-level patient-group directories (p10 ... p19) of the JPG image tree.
PATHS = [f'/kuacc/users/oince22/hpc_run/physionet.org/files/mimic-cxr-jpg/2.0.0/files/p{s}' for s in range(10, 20)]
# Root directory under which the report .txt files are looked up.
TEXT_PATH_ROOT = '/datasets/mimic/cxr/physionet.org/files/mimic-cxr/2.0.0/files/'


def get_report_path(img_path):
    """Return the report ``.txt`` path that corresponds to an image path.

    The image's parent directory is joined onto ``TEXT_PATH_ROOT`` and given
    a ``.txt`` suffix, e.g. ``p10/p1/s5/img.jpg`` -> ``<root>/p10/p1/s5.txt``.

    Args:
        img_path: Image path (``str`` or ``pathlib.Path``), expected to be
            relative to the dataset ``files/`` directory.

    Returns:
        pathlib.Path: Location of the report text file.
    """
    # Imported locally because the notebook's import cell does not import pathlib.
    from pathlib import Path

    return (Path(TEXT_PATH_ROOT) / Path(img_path).parent).with_suffix(".txt")

def check_jpg_extension(filename):
    """Tell whether ``filename`` ends with the lowercase ``.jpg`` extension.

    Args:
        filename: File name or path string to inspect.

    Returns:
        bool: True when the name ends in ``.jpg`` (case-sensitive), else False.
    """
    return re.search(r"\.jpg$", filename) is not None

In [2]:
import csv

# Split name -> set of "p<id>" strings read from the official split CSV.
splits = {split_name: set() for split_name in ("train", "validate", "test")}
with open("/kuacc/users/oince22/hpc_run/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-split.csv", "r") as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row
    for row in reader:
        # presumably the last two columns are the patient id and the split name —
        # verify against the CSV header
        patient_id, split_name = row[-2], row[-1]
        splits[split_name].add(f"p{patient_id}")

In [3]:
# Number of distinct entries collected for each split, in insertion order.
[len(members) for members in splits.values()]

[64586, 500, 293]

In [9]:
# Per-split accumulators: image paths and the report paths aligned with them
# index-by-index. Every split gets its own fresh list.
ALL_IMG_PATHS = {split_name: [] for split_name in ("train", "validate", "test")}
ALL_TXT_PATHS = {split_name: [] for split_name in ("train", "validate", "test")}

In [10]:
def get_txt_path(s):
    """Build the report ``.txt`` path for an absolute image path string.

    Path components 10 up to (not including) the file name are kept, anything
    from the first "." onward is cut, and the result is appended to
    ``TEXT_PATH_ROOT`` with a ``.txt`` suffix. The index 10 is tied to the
    directory depth of the image root this notebook lists.
    """
    study_dir = "/".join(s.split("/")[10:-1])
    return TEXT_PATH_ROOT + study_dir.split(".")[0] + ".txt"


def get_new_img_path(s):
    """Return the image path made of component 10 onward of ``s`` ("/"-separated)."""
    components = s.split("/")
    return "/".join(components[10:])

In [11]:
# Walk PATH/<dir_name>/<img_dir>/*.jpg and file every image (and its report
# path) under the split whose set contains dir_name.
for PATH in PATHS:
    # Entries containing a "." are treated as files and skipped.
    top_dirs = [entry for entry in os.listdir(PATH) if "." not in entry]
    for dir_name in tqdm(top_dirs):
        paths = os.listdir(osp.join(PATH, dir_name))
        img_dirs = [osp.join(PATH, dir_name, entry) for entry in paths if "." not in entry]
        for img_dir in img_dirs:
            candidates = (osp.join(img_dir, img_name) for img_name in os.listdir(img_dir))
            img_paths = [candidate for candidate in candidates if check_jpg_extension(candidate)]
            for sname, split_set in splits.items():
                if dir_name not in split_set:
                    continue
                ALL_IMG_PATHS[sname].extend(get_new_img_path(s) for s in img_paths)
                ALL_TXT_PATHS[sname].extend(get_txt_path(s) for s in img_paths)
                break
            else:
                # Reached only when no split set contains dir_name.
                assert False, "No split is defined"
                

100%|██████████| 6396/6396 [00:21<00:00, 296.95it/s]
100%|██████████| 6571/6571 [00:23<00:00, 274.80it/s]
100%|██████████| 6526/6526 [00:24<00:00, 270.57it/s]
100%|██████████| 6548/6548 [00:24<00:00, 262.61it/s]
100%|██████████| 6506/6506 [00:24<00:00, 262.16it/s]
100%|██████████| 6592/6592 [00:25<00:00, 260.06it/s]
100%|██████████| 6476/6476 [00:23<00:00, 279.22it/s]
100%|██████████| 6642/6642 [00:24<00:00, 275.70it/s]
100%|██████████| 6543/6543 [00:23<00:00, 273.60it/s]
100%|██████████| 6579/6579 [00:23<00:00, 280.71it/s]


In [12]:
def clean_section(text):
    """Normalise one report section into a single line of plain text.

    The substitutions run in order:
      1. drop every whitespace-free token that ends in ":" (e.g. headers),
      2. collapse runs of underscores to one underscore,
      3. collapse runs of two or more whitespace characters to one space,
      4. delete every character outside ``a-zA-Z0-9 :.,-``,
      5. collapse runs of spaces.

    Args:
        text: Raw section text.

    Returns:
        str: The cleaned text with leading/trailing whitespace stripped.
    """
    substitutions = (
        (r'[\S]+:', ''),
        (r"_+", "_"),
        (r"\s\s+", " "),
        ("[^a-zA-Z0-9 :.,-]", ""),
        (r" +", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

In [13]:
import string
def preprocess_report(text):
    """Strip boilerplate from a whole report and flatten it to one line.

    Removed first: the rest of the line after each of the EXAMINATION, WET READ,
    STUDY, COMPARISON and TECHNIQUE headers ("." does not match newlines, so
    only that line goes) and the literal "FINAL REPORT". Then whitespace is
    collapsed and every character outside ``a-zA-Z0-9 :.,-`` is deleted.

    Args:
        text: Full report text.

    Returns:
        str: The cleaned, stripped report text.
    """
    # Header / boilerplate removal followed by whitespace and character clean-up,
    # applied strictly in this order.
    substitutions = (
        (r"EXAMINATION:.*", ""),
        (r"WET READ:.*", ""),
        (r"FINAL REPORT", ""),
        (r"STUDY:.*", ""),
        (r"COMPARISON:.*", ""),
        (r"TECHNIQUE:.*", ""),
        (r"_+", "_"),
        (r"\s\s+", " "),
        ("[^a-zA-Z0-9 :.,-]", ""),
        (r" +", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

In [14]:
def extract_sections(report_text):
    """Return the cleaned FINDINGS and/or IMPRESSION text of a report.

    Each section is matched lazily from its header up to the other section's
    header or the end of the text, so that
      * a FINDINGS section is found even when no other header follows it, and
      * a colon inside the IMPRESSION text no longer drags impression content
        into the findings match (which duplicated it in the result).

    Args:
        report_text: Full report text.

    Returns:
        str: Cleaned findings followed by cleaned impression (whichever are
        present, joined by a space). When neither header is present, the whole
        report is returned after ``preprocess_report``.
    """
    # Header matching stays case-insensitive, as before.
    findings_pattern = r'FINDINGS:[\s\S]*?(?=IMPRESSION:|\Z)'
    impression_pattern = r'IMPRESSION:[\s\S]*?(?=FINDINGS:|\Z)'

    findings_match = re.search(findings_pattern, report_text, re.IGNORECASE)
    impression_match = re.search(impression_pattern, report_text, re.IGNORECASE)

    if findings_match is None and impression_match is None:
        return preprocess_report(report_text)

    # Findings always come first in the output, regardless of order in the report.
    cleaned = [
        clean_section(match.group().strip())
        for match in (findings_match, impression_match)
        if match is not None
    ]
    # Skip sections that are empty after cleaning to avoid stray spaces.
    return " ".join(part for part in cleaned if part)

In [16]:
# Print one randomly chosen training report. The index is drawn over the
# "train" list itself; len(ALL_TXT_PATHS) would only count the dict's keys.
idx = random.randint(0, len(ALL_TXT_PATHS["train"]) - 1)
with open(ALL_TXT_PATHS["train"][idx], "r") as f:
    text = f.read()
print(text)

                                 FINAL REPORT
 INDICATION:   ___F with abdominal pain, ttp RUQ, below nipple  // rib fx?
 
 TECHNIQUE:  AP and lateral views of the chest.
 
 COMPARISON:  ___.
 
 FINDINGS: 
 
 The lungs are clear.  The cardiomediastinal silhouette is within normal
 limits.  No acute osseous abnormalities.  No displaced fractures on this
 nondedicated exam.
 
 IMPRESSION: 
 
 No acute cardiopulmonary process.  No visualized fracture.  If high clinical
 concern, consider dedicated rib series.



In [17]:
# Show the extracted text for the report that was read into `text`.
extract_sections(text)

'The lungs are clear. The cardiomediastinal silhouette is within normal limits. No acute osseous abnormalities. No displaced fractures on this nondedicated exam. No acute cardiopulmonary process. No visualized fracture. If high clinical concern, consider dedicated rib series.'

In [18]:
import csv

def create_tsv():
    """Write one ``MIMIC_JPG_<split>.tsv`` file per split.

    The file name uses the first five characters of the split name. Each file
    has a ``caption``/``image`` header followed by one row per image: the text
    returned by ``extract_sections`` for the image's report, and the image
    path. Images whose report file does not exist are skipped after printing
    the error.
    """
    for sname in splits.keys():
        txt_paths = ALL_TXT_PATHS[sname]
        img_paths = ALL_IMG_PATHS[sname]
        # newline='' is what the csv module asks for on files it writes to.
        with open(f"MIMIC_JPG_{sname[:5]}.tsv", 'w', newline='') as tsv_file:
            writer = csv.writer(tsv_file, delimiter='\t')
            writer.writerow(["caption", "image"])
            for txt_path, img_path in tqdm(zip(txt_paths, img_paths), total=len(txt_paths)):
                try:
                    # A separate name and a context manager: the report handle
                    # is always closed and no longer shadows the TSV handle.
                    with open(txt_path, "r") as report_file:
                        report = report_file.read()
                except FileNotFoundError as e:
                    print(str(e))
                    continue
                writer.writerow([extract_sections(report.strip()), img_path])

In [19]:
# Write the TSV files for every split (see the tqdm output below for the run time).
create_tsv()

100%|██████████| 365096/365096 [40:29<00:00, 150.25it/s]
100%|██████████| 2971/2971 [00:20<00:00, 144.45it/s]
100%|██████████| 5086/5086 [00:24<00:00, 204.39it/s]


In [None]:
# NOTE(review): stale call — create_tsv() as defined in this notebook takes no
# arguments, and val_img_paths / val_train_txts are not defined in any visible cell.
create_tsv(val_img_paths, val_train_txts, split="val")

In [None]:
# NOTE(review): train_img_paths is not defined in any visible cell — confirm or remove.
len(train_img_paths)

In [None]:
from scipy import ndimage
import numpy as np
import matplotlib.pyplot as plt

def show_image(xray_image):
    """Display ``xray_image`` with matplotlib using the "gray" colormap."""
    plt.imshow(xray_image, cmap="gray")

In [None]:
# Read a single DICOM file with pydicom (imported as `dicom`) for inspection.
ds = dicom.dcmread("/datasets/mimic/physionet.org/files/mimic-cxr/2.0.0/files/p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.dcm")

In [None]:
# Render the pixel data of the loaded DICOM dataset.
show_image(ds.pixel_array)

In [71]:
# Inspect the ViewPosition attribute of the loaded DICOM dataset.
ds.ViewPosition

'LATERAL'

In [58]:
# NOTE(review): determine_view_position is not defined in any visible cell — confirm where it comes from.
determine_view_position("/datasets/mimic/physionet.org/files/mimic-cxr/2.0.0/files/p10/p10000032/s50414267/174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962.dcm")

'Lateral'

In [59]:
# NOTE(review): determine_view_position is not defined in any visible cell — confirm where it comes from.
determine_view_position("/datasets/mimic/physionet.org/files/mimic-cxr/2.0.0/files/p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.dcm")

'Frontal'