In [1]:
import os
import os.path as osp
import random
import re
from glob import glob
from pathlib import Path
from pprint import pprint

import pydicom as dicom
from tqdm import tqdm

# Image roots: one entry per top-level folder p10 ... p16 of the JPG tree.
PATHS = [f'/kuacc/users/oince22/hpc_run/physionet.org/files/mimic-cxr-jpg/2.0.0/files/p{s}' for s in range(10, 17)]
# Root of the report (.txt) tree. The trailing slash matters: get_txt_path
# builds report paths by plain string concatenation onto this value.
TEXT_PATH_ROOT = '/datasets/mimic/cxr/physionet.org/files/mimic-cxr/2.0.0/files/'


def get_report_path(img_path):
    """Return the report path for an image path relative to the files root.

    The report is named after the directory that contains the image, e.g.
    ``p10/p10000032/s50414267/x.jpg`` -> ``<TEXT_PATH_ROOT>/p10/p10000032/s50414267.txt``.

    Args:
        img_path: ``str`` or ``pathlib.Path``. It should be relative; an
            absolute path would replace ``TEXT_PATH_ROOT`` when joined.

    Returns:
        A ``pathlib.Path`` pointing at the ``.txt`` report.
    """
    # The previous lambda referenced the undefined names Path and TEXT_ROOT and
    # raised NameError when called; TEXT_PATH_ROOT is the root defined above.
    parent_dir = Path(img_path).parents[0]
    return Path(TEXT_PATH_ROOT).joinpath(parent_dir).with_suffix(".txt")

def check_jpg_extension(filename):
    """Report whether ``filename`` matches the regex ``\\.jpg$``.

    The match is case-sensitive, so ``.JPG`` and ``.jpeg`` are rejected.

    Returns:
        ``True`` if the pattern is found, ``False`` otherwise.
    """
    return re.search(r"\.jpg$", filename) is not None

In [2]:
# Image paths with the machine-specific prefix stripped (see get_new_img_path);
# filled by the directory walk further down.
ALL_IMG_PATHS = []
# Report (.txt) paths; extended together with ALL_IMG_PATHS in the same loop,
# so entry i here is the report for image i there.
ALL_TXT_PATHS = []

In [3]:
def get_txt_path(s):
    """Build the report path for the absolute image path ``s``.

    Takes the '/'-separated components of ``s`` from index 10 up to (but not
    including) the file name, keeps what precedes the first '.', and wraps it
    as ``TEXT_PATH_ROOT + ... + ".txt"``.
    """
    components = s.split("/")
    study_dir = "/".join(components[10:-1])
    stem = study_dir.partition(".")[0]
    return TEXT_PATH_ROOT + stem + ".txt"


def get_new_img_path(s):
    """Return ``s`` without its first ten '/'-separated components.

    The index 10 is tied to the depth of the entries in PATHS: for those roots
    the result starts at the ``p1x`` folder.
    """
    components = s.split("/")
    return "/".join(components[10:])

In [4]:
# Start from empty lists so that re-running this cell does not append a second
# copy of every path to the accumulators.
ALL_IMG_PATHS = []
ALL_TXT_PATHS = []

for PATH in PATHS:
    # Keep only entries without a "." in the name (skips files such as index pages).
    top_dirs = [entry for entry in os.listdir(PATH) if "." not in entry]
    for dir_name in tqdm(top_dirs):
        sub_root = osp.join(PATH, dir_name)
        img_dirs = [osp.join(sub_root, entry) for entry in os.listdir(sub_root) if "." not in entry]
        for img_dir in img_dirs:
            # Only .jpg files count as images.
            img_paths = [
                osp.join(img_dir, img_name)
                for img_name in os.listdir(img_dir)
                if check_jpg_extension(img_name)
            ]
            # Both lists are extended from the same img_paths, keeping them index-aligned.
            ALL_IMG_PATHS.extend(get_new_img_path(s) for s in img_paths)
            ALL_TXT_PATHS.extend(get_txt_path(s) for s in img_paths)

100%|██████████| 6396/6396 [00:15<00:00, 408.75it/s]
100%|██████████| 6571/6571 [00:23<00:00, 276.84it/s]
100%|██████████| 6526/6526 [00:23<00:00, 275.33it/s]
100%|██████████| 6548/6548 [00:24<00:00, 266.33it/s]
100%|██████████| 6506/6506 [00:24<00:00, 263.94it/s]
100%|██████████| 6592/6592 [00:24<00:00, 265.08it/s]
100%|██████████| 6476/6476 [00:20<00:00, 314.60it/s]


In [5]:
def clean_section(text):
    """Normalise one report section into a single line of plain text.

    Steps, in order:
      1. Drop every whitespace-free token that ends in a colon (section
         labels such as ``FINDINGS:``).
      2. Collapse every run of whitespace, including single newlines and tabs,
         into one space.
      3. Delete every character outside ``a-zA-Z0-9 :.,-``; this also removes
         underscore runs such as ``___``.
      4. Collapse the double spaces left behind by step 3 and strip the ends.

    Args:
        text: Raw section text.

    Returns:
        The cleaned text as a ``str``.
    """
    text = re.sub(r'[\S]+:', '', text)
    # Use \s+ rather than \s\s+: a lone newline between two words must become a
    # space, otherwise the character filter below deletes it and fuses the words.
    text = re.sub(r"\s+", " ", text)
    text = re.sub("[^a-zA-Z0-9 :.,-]", "", text)
    text = re.sub(r" +", " ", text)
    return text.strip()

In [6]:
import string
def preprocess_report(text):
    """Clean a whole report that has no recognisable section to extract.

    Removes the rest of the line after the ``EXAMINATION:``, ``WET READ:``,
    ``STUDY:``, ``COMPARISON:`` and ``TECHNIQUE:`` labels ('.' does not cross
    newlines, so only the label's own line is affected), removes the literal
    ``FINAL REPORT``, then flattens the remainder to one line containing only
    ``a-zA-Z0-9 :.,``. Unlike clean_section, hyphens are stripped here.

    Args:
        text: Full raw report text.

    Returns:
        The cleaned report as a single-line ``str``.
    """
    # Line-based removals must run before whitespace is flattened, because
    # ".*" relies on the original line breaks to know where to stop.
    text = re.sub(r"EXAMINATION:.*", "", text)
    text = re.sub(r"WET READ:.*", "", text)
    text = re.sub(r"FINAL REPORT", "", text)
    text = re.sub(r"STUDY:.*", "", text)
    text = re.sub(r"COMPARISON:.*", "", text)
    text = re.sub(r"TECHNIQUE:.*", "", text)

    # Use \s+ rather than \s\s+: a lone newline between two words must become a
    # space, otherwise the character filter below deletes it and fuses the words.
    text = re.sub(r"\s+", " ", text)
    # Underscore runs (e.g. "___") are removed here together with all other
    # characters outside the allowed set.
    text = re.sub("[^a-zA-Z0-9 :.,]", "", text)
    text = re.sub(r" +", " ", text)
    return text.strip()

In [7]:
def extract_sections(report_text):
    """Return the cleaned FINDINGS and/or IMPRESSION text of a report.

    Matching is case-insensitive. The FINDINGS pattern is greedy: it spans
    from ``FINDINGS:`` to the last colon in the report, and it does not match
    at all when no colon follows the label. The IMPRESSION pattern spans from
    ``IMPRESSION:`` to the end of the report.

    Args:
        report_text: Full raw report text.

    Returns:
        A ``str``. If either section matched, each match is passed through
        clean_section and the results are joined with one space (findings
        first). If neither matched, the result is preprocess_report applied to
        the whole report.
    """
    findings_match = re.search(r'FINDINGS:[\s\S]*:', report_text, re.IGNORECASE)
    impression_match = re.search(r'IMPRESSION:[\s\S]*', report_text, re.IGNORECASE)

    # Keep only the sections that were found, preserving findings-then-impression order.
    sections = [
        match.group().strip()
        for match in (findings_match, impression_match)
        if match
    ]
    if not sections:
        return preprocess_report(report_text)
    return " ".join(clean_section(section) for section in sections)

In [8]:
# Spot check: print one report picked at random from the collected paths.
idx = random.randint(0, len(ALL_TXT_PATHS)-1)
# The context manager closes the file even if reading fails.
with open(ALL_TXT_PATHS[idx], "r") as f:
    text = f.read()
print(text)

                                 FINAL REPORT
 EXAMINATION:  CHEST (PORTABLE AP)
 
 INDICATION:  ___ year old man with chf, hiv possible viremia with dyspnia and
 increased resp rate  // Pneumonia? Edema?     Pneumonia? Edema?
 
 COMPARISON:  Chest radiographs since ___ most recently ___.
 
 IMPRESSION: 
 
 Lungs are mildly hyperinflated, but clear.  Heart is normal size. Pulmonary
 vasculature is more distended today than on ___ probably in indication
 of borderline left ventricular left heart dysfunction, but there is no
 pulmonary edema, consolidation, or pleural effusion.  No pneumothorax.



In [9]:
# Sanity check: run the extractor on the report loaded into `text` by the previous cell.
extract_sections(text)

'Lungs are mildly hyperinflated, but clear. Heart is normal size. Pulmonary vasculature is more distended today than on probably in indication of borderline left ventricular left heart dysfunction, but there is no pulmonary edema, consolidation, or pleural effusion. No pneumothorax.'

In [10]:
import csv

def create_tsv():
    """Write ``MIMIC_JPG.tsv`` with one ``caption``/``image`` row per image.

    Iterates over ALL_TXT_PATHS and ALL_IMG_PATHS in parallel. For each pair
    the report is read, passed through extract_sections, and written next to
    the image path. A report that does not exist is skipped: the error message
    is printed and no row is written for that image.
    """
    # newline='' is what the csv module requires for files it writes to, so the
    # writer alone controls line endings.
    with open("MIMIC_JPG.tsv", 'w', newline='') as tsv_file:
        writer = csv.writer(tsv_file, delimiter='\t')
        writer.writerow(["caption", "image"])
        for txt_path, img_path in tqdm(zip(ALL_TXT_PATHS, ALL_IMG_PATHS), total=len(ALL_TXT_PATHS)):
            try:
                # A separate name and context manager: the report handle no
                # longer shadows the TSV handle and is closed even if
                # extraction raises.
                with open(txt_path, "r") as report_file:
                    report = report_file.read()
            except FileNotFoundError as e:
                print(str(e))
                continue
            caption = extract_sections(report.strip())
            writer.writerow([caption, img_path])

In [11]:
create_tsv()

100%|██████████| 261774/261774 [41:11<00:00, 105.90it/s]


In [None]:
# NOTE(review): this cell looks stale. create_tsv as defined in this notebook
# takes no arguments, and val_img_paths / val_train_txts are not defined in any
# visible cell, so this call fails on a fresh kernel — confirm and remove or update.
create_tsv(val_img_paths, val_train_txts, split="val")

In [None]:
# NOTE(review): train_img_paths is not defined in any visible cell; this raises
# NameError on a fresh kernel unless it is created elsewhere — confirm.
len(train_img_paths)

In [None]:
!pip install matplotlib

In [None]:
from scipy import ndimage
import numpy as np
import matplotlib.pyplot as plt

def show_image(xray_image):
    """Draw ``xray_image`` with ``plt.imshow`` using the ``gray`` colormap."""
    plt.imshow(xray_image, cmap="gray")

In [None]:
# Read a single DICOM file with pydicom (imported as `dicom` in the first cell).
ds = dicom.dcmread("/datasets/mimic/physionet.org/files/mimic-cxr/2.0.0/files/p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.dcm")

In [None]:
# Display the pixel data of the DICOM dataset loaded into `ds`.
show_image(ds.pixel_array)

In [71]:
# ViewPosition element of the dataset in `ds`.
# NOTE(review): this cell's execution count is out of order relative to the
# cells around it, so the recorded output may come from a different `ds` — confirm.
ds.ViewPosition

'LATERAL'

In [58]:
# NOTE(review): determine_view_position is not defined in any visible cell and
# this cell's execution count is out of order; on a fresh kernel this raises
# NameError unless the function is defined elsewhere — confirm.
determine_view_position("/datasets/mimic/physionet.org/files/mimic-cxr/2.0.0/files/p10/p10000032/s50414267/174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962.dcm")

'Lateral'

In [59]:
# NOTE(review): determine_view_position is not defined in any visible cell and
# this cell's execution count is out of order; on a fresh kernel this raises
# NameError unless the function is defined elsewhere — confirm.
determine_view_position("/datasets/mimic/physionet.org/files/mimic-cxr/2.0.0/files/p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.dcm")

'Frontal'