In [1]:
# Directory of TIFF images that later cells glob for "*.tif".
# The trailing slash matters: cells build patterns by plain string concatenation.
updated_path = '/Mounts/rbg-storage1/datasets/PLCO_XRAY/allxrays/'

In [2]:
# Earlier TIFF directory (not referenced by any other cell visible in this notebook section).
old_path = "/Mounts/rbg-storage1/datasets/PLCO_XRAY/tifs/"
# Path of the earlier metadata JSON; a later cell parses it with json.load.
old_json = "/Mounts/rbg-storage1/datasets/PLCO_XRAY/metadata_2022_04_05_rotated.json"

In [3]:
import json
import os
from glob import glob
from pathlib import Path

import numpy as np
import pandas as pd
import skimage

In [4]:
# Bare file names (no directory part) of every TIFF directly under `updated_path`.
# basename() is used instead of str.replace() with a repeated directory literal:
# the literal had to be kept in sync with `updated_path` by hand, and replace()
# would strip the text wherever it occurred in the path, not only at the front.
im_names = [os.path.basename(pth) for pth in glob(updated_path + "*.tif")]

In [5]:
im_names[0]

'AH04082092107142739_v2.tif'

In [6]:
# Parse the earlier metadata file. The name `old_json` is rebound from the file
# path to the parsed content because the following cells index it as data.
# The isinstance guard keeps this cell re-runnable: once `old_json` already holds
# the parsed data, running the cell again is a no-op instead of passing that data to open().
if isinstance(old_json, str):
    # The context manager closes the file handle as soon as parsing finishes.
    with open(old_json, 'rb') as metadata_file:
        old_json = json.load(metadata_file)

In [7]:
# Flatten patient -> accessions -> image_series into one flat list holding the
# 'filename' of every image series entry, in traversal order.
old_im_names = [
    series['filename']
    for record in old_json
    for acc in record['accessions']
    for series in acc['image_series']
]

In [8]:
old_json[0]

{'accessions': [{'exam': '1_T0',
   'visit_num': 1,
   'study_yr': 0,
   'image_series': [{'path': '/Mounts/rbg-storage1/datasets/PLCO_XRAY/rotated_better_fix/batch_2e/AD02485080607140559_v2.tif',
     'filename': 'AD02485080607140559_v2.tif',
     'Image ImageWidth': 2540,
     'Image ImageLength': 2093,
     'Image BitsPerSample': 8,
     'Image Compression': 5,
     'Image PhotometricInterpretation': 1,
     'Image StripOffsets': [8,
      170,
      355,
      622,
      1003,
      1598,
      2302,
      3007,
      3739,
      4453,
      5055,
      5632,
      6209,
      6796,
      7336,
      7866,
      8395,
      8959,
      9488,
      10048,
      10587,
      11124,
      11638,
      12135,
      12651,
      13157,
      13697,
      14229,
      14745,
      15276,
      15781,
      16281,
      16766,
      17273,
      17747,
      18202,
      18649,
      19130,
      19596,
      20058,
      20524,
      20965,
      21396,
      21828,
      22253,
      22

In [9]:
# De-duplicated set of the earlier file names; the comparison cell below tests
# membership against it once per current image.
set_old_im_names = set(old_im_names)

In [10]:
len(set_old_im_names)

89716

In [11]:
# Count how many current file names also appear in the earlier metadata (`trues`)
# and how many do not (`falses`).
trues = sum(1 for name in im_names if name in set_old_im_names)
falses = len(im_names) - trues

In [12]:
trues, falses

(89716, 108824)

In [13]:
# CSV whose 'image_file_name' and 'plco_id' columns are read by the next cell.
patient2image = '/Mounts/rbg-storage1/datasets/PLCO_XRAY/package-plcoi-919/Lung/Standard 25K Linkage (2021)/link_2021_25k_selection.csv'

In [14]:
# Load the linkage table and keep only the file-name and participant-id columns.
df = pd.read_csv(patient2image).loc[:, ['image_file_name', 'plco_id']]

In [15]:
len(set(df['plco_id'].values))

25000

In [16]:
# Full paths of every TIFF directly under `updated_path`.
# NOTE(review): the list is used as returned by glob, with no explicit sort; later
# cells index and slice it by position, so confirm whether a stable order is needed.
im_paths = glob(updated_path + "*.tif")


In [17]:
# Pick one image path at random for a spot check.
# The upper bound comes from the list itself rather than a hard-coded file count,
# so the draw stays in range (and can reach every file) if the directory changes.
i = np.random.randint(0, len(im_paths))
path = im_paths[i]

In [18]:
path

'/Mounts/rbg-storage1/datasets/PLCO_XRAY/allxrays/AE04042120506081110_v2.tif'

In [21]:
from tqdm import tqdm
import albumentations as A

In [24]:
# Target size passed to the resize transform; the statistics cell below measures
# pixel mean/std on images after this resize.
height, width = (512, 512)
# Albumentations transform, called later as resize(image=...)['image'].
resize = A.Resize(height, width)

In [25]:
# Collect the per-image mean and standard deviation of pixel values, measured on
# the resized image, for the first 1/100th of the globbed paths.
means = []
stds = []
sample_paths = im_paths[:len(im_paths)//100]
for tif_path in tqdm(sample_paths):
    # Read as float64 so the statistics are computed in floating point.
    pixels = skimage.io.imread(tif_path, plugin='tifffile').astype(np.float64)
    resized = resize(image=pixels)['image']
    means.append(np.mean(resized))
    stds.append(np.std(resized))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 1985/1985 [11:21<00:00,  2.91it/s]


In [26]:
np.mean(means), np.mean(stds)

(2863.0726471573075, 887.999514817787)

In [None]:
(2863.070033296751, 887.966694489157)

In [None]:
class Args:
    """Stand-in for the arguments read by the dataset-building cell.

    Attributes:
        tif_dirs: list of pathlib.Path directories searched for '*.tif' files.
        link_csv: path of the CSV loaded with pd.read_csv and indexed by
            'image_file_name'. Left empty here -- TODO fill in before running.
        person_csv: path of the CSV loaded with pd.read_csv and indexed by
            'plco_id'. Left empty here -- TODO fill in before running.
        output_json_path: file the assembled dataset is written to with
            json.dump. Left empty here -- TODO fill in before running.
        error_json_path: None by default.
            NOTE(review): no visible cell reads this attribute -- confirm intent.
        split_probs: probabilities handed to np.random.choice for the
            "train", "dev" and "test" labels, in that order.
    """

    def __init__(self):
        # A list of Path objects, not a bare string: the consumer iterates over
        # `tif_dirs` and calls `.glob` on each item, and iterating a string would
        # yield single characters that have no `.glob` method.
        self.tif_dirs = [Path(updated_path)]
        self.link_csv = ''
        self.person_csv = ''
        self.output_json_path = ''
        self.error_json_path = None
        self.split_probs = [0.7, 0.15, 0.15]

# Configuration instance read by the dataset-building cell below.
args = Args()

In [None]:
# Build the dataset JSON: one entry per patient, each holding its exams
# ("accessions"), each exam holding the images ("image_series") found on disk.
#
# Output structure written to args.output_json_path:
#   [{"accessions": [{"exam", "visit_num", "study_yr", "image_series": [...]}, ...],
#     "pid", "split", "pt_metadata"}, ...]
#
# NOTE(review): args.error_json_path is not consulted anywhere in this cell.
# NOTE(review): the train/dev/test draw below is unseeded, so splits differ between runs.

import exifread  # TIFF/EXIF tag reader used below; no earlier cell imports it

# `args.tif_dirs` may be a single directory (str or Path) or an iterable of
# directories; normalise to Path objects so `.glob` is always available.
if isinstance(args.tif_dirs, (str, Path)):
    tif_dirs = [Path(args.tif_dirs)]
else:
    tif_dirs = [Path(tif_dir) for tif_dir in args.tif_dirs]
tifs = [path for tif_dir in tif_dirs for path in tif_dir.glob('**/*.tif')]

print("Loading", args.link_csv)
filename_to_id_map_df = pd.read_csv(args.link_csv)
# image file name -> {column: value} for that row of the link CSV
filename_to_id_map = filename_to_id_map_df.set_index('image_file_name').to_dict('index')

print("Loading", args.person_csv)
# Missing values are replaced by -1 before the rows become metadata dicts.
lung_persons = pd.read_csv(args.person_csv).fillna(-1)
# plco_id -> {column: value} for that row of the person CSV
id_to_metadata_map = lung_persons.set_index('plco_id').to_dict('index')

# Tags never copied into the image record.
# 'Image DateTime' is the date of image digitization, not of the scan.
skipped_tags = ('JPEGThumbnail', 'TIFFThumbnail', 'Filename',
                'EXIF MakerNote', 'Image DateTime')

json_dataset = []
pid2idx = {}   # plco_id -> index of that patient in json_dataset
peid2idx = {}  # (plco_id, exam) -> index of that exam in the patient's "accessions"
loadingbar = tqdm(tifs)
for path in loadingbar:
    loadingbar.set_description(f"Processing {path}")
    filename = path.name

    # NOTE(review): a TIFF whose name is absent from the link CSV raises KeyError
    # here and stops the run -- confirm whether such files should be skipped instead.
    link_row = filename_to_id_map[filename]
    pid = link_row['plco_id']
    years_from_baseline = link_row['assoc_visit_syr']
    visit_num = link_row['assoc_visit_visnum']
    # couldn't find: date, series_id, sop_id

    exam = '{}_T{}'.format(visit_num, years_from_baseline)
    # Tuple key instead of a concatenated "pid + exam" string, so two different
    # (pid, exam) pairs can never collapse into the same key.
    exam_key = (pid, exam)

    img_dict = {
        "path": str(path.absolute()),
        "filename": filename,
    }

    # tif image metadata
    with open(str(path), 'rb') as f:
        tags = exifread.process_file(f)
    for tag, tag_data in tags.items():
        # Check the name before touching `.values`, as the original loop did.
        if tag in skipped_tags:
            continue
        value = tag_data.values
        # Skip empty values (indexing them would raise IndexError) and values whose
        # first element is not exactly a str, int, float or bool.
        if len(value) == 0 or type(value[0]) not in (str, int, float, bool):
            continue
        if len(value) == 1:
            value = value[0]
        img_dict[tag] = value

    if pid not in pid2idx:
        # First image seen for this patient: create the patient record.
        patient_metadata = id_to_metadata_map[pid]
        pid2idx[pid] = len(json_dataset)
        json_dataset.append({
            "accessions": [],
            "pid": pid,
            # str() turns the numpy string returned by np.random.choice into a plain str.
            "split": str(np.random.choice(["train", "dev", "test"], p=args.split_probs)),
            "pt_metadata": patient_metadata,
        })
    patient = json_dataset[pid2idx[pid]]

    if exam_key not in peid2idx:
        # First image seen for this exam: create the exam record.
        peid2idx[exam_key] = len(patient["accessions"])
        patient["accessions"].append({
            "exam": exam,
            "visit_num": visit_num,
            "study_yr": years_from_baseline,
            "image_series": [],
        })
    patient["accessions"][peid2idx[exam_key]]["image_series"].append(img_dict)

# The context manager flushes and closes the output file once the dump completes.
with open(args.output_json_path, "w") as out_file:
    json.dump(json_dataset, out_file)
