In [99]:
import pickle as pkl
import pandas as pd
import pydicom
import os
import xml.etree.ElementTree as ET
import numpy as np

In [29]:
def _list_subdirs(parent):
    """Return the sub-directories of ``parent`` as full paths, sorted by name."""
    # os.listdir returns entries in arbitrary order; sorting makes every
    # "first directory" choice below reproducible across runs and machines.
    return sorted(
        os.path.join(parent, entry) for entry in os.listdir(parent)
        if os.path.isdir(os.path.join(parent, entry))
    )


def find_ct_path(raw_path, patient_id):
    """
    Given a patient i.d., returns the path to the directory with their
    CT images

    The patient directory is expected to contain one or more
    sub-directories, each of which contains the image folders. Only the
    first two sub-directories (sorted by name) are examined:

    * with a single sub-directory, the first image folder holding more
      than one entry is returned without reading any DICOM file;
    * otherwise the modality tag (0008,0060) of the first ``.dcm`` file in
      that folder is read; if it is ``'CT'`` the folder is returned,
      and in every other case the first folder of the second
      sub-directory is returned (its modality is not checked).

    :param raw_path: path to raw data (with or without a trailing separator)
    :param patient_id: string of patient ID of form LIDC-IDRI-XXXX
    :raises IndexError: if no suitable directory can be found (IndexError is
        kept because that is what the lookup raised before)
    return: path to the folder containing CT images for a given patient
    """
    # os.path.join works whether or not raw_path ends with a separator.
    patient_dir = os.path.join(raw_path, patient_id)
    study_dirs = _list_subdirs(patient_dir)
    if not study_dirs:
        raise IndexError(
            'no sub-directories found in {}'.format(patient_dir)
        )

    # Image folders of the first sub-directory that hold more than one entry.
    first_candidates = [
        folder for folder in _list_subdirs(study_dirs[0])
        if len(os.listdir(folder)) > 1
    ]

    if len(study_dirs) == 1:
        if not first_candidates:
            raise IndexError(
                'no image folder with more than one entry in {}'.format(
                    study_dirs[0]
                )
            )
        return first_candidates[0]

    if first_candidates:
        candidate = first_candidates[0]
        dicom_files = sorted(
            name for name in os.listdir(candidate) if name.endswith('dcm')
        )
        if dicom_files:
            # Only the header is needed to read the modality tag, so skip
            # loading the pixel data.
            header = pydicom.dcmread(
                os.path.join(candidate, dicom_files[0]),
                stop_before_pixels=True,
            )
            if header[('0008', '0060')].value == 'CT':
                return candidate

    # The first sub-directory had no usable CT folder: fall back to the
    # second one. It is only listed now, so an empty second sub-directory
    # no longer prevents returning a valid first folder above.
    second_candidates = _list_subdirs(study_dirs[1])
    if not second_candidates:
        raise IndexError(
            'no image folder found in {}'.format(study_dirs[1])
        )
    return second_candidates[0]


In [102]:
def get_malignancy(dirname):
    """
    Given the path to the directory with the CT files for a patient,
    returns the malignancy value recorded for every unblinded read nodule
    in the XML annotation file found in that directory.

    The first ``.xml`` file (sorted by name) in ``dirname`` is parsed.
    Nodules without a ``characteristics`` element, or whose
    characteristics carry no (non-blank) ``malignancy`` value, are skipped.

    :param dirname: absolute path to the folder containing CT images
    for a given patient
    :raises IndexError: if ``dirname`` contains no ``.xml`` file
    :raises ValueError: if a malignancy value is not an integer
    return: integer numpy array with one malignancy value per annotated
    nodule, in document order (empty if none are found)
    """
    # Namespace used by the element names in the annotation file.
    ns = {'nih': 'http://www.nih.gov'}

    # Sorted so the selected file is reproducible if several are present.
    xml_files = sorted(f for f in os.listdir(dirname) if f.endswith(".xml"))
    if not xml_files:
        raise IndexError('no .xml file found in {}'.format(dirname))
    tree = ET.parse(os.path.join(dirname, xml_files[0]))

    malignancies = []
    for nodule in tree.findall('nih:readingSession/nih:unblindedReadNodule', ns):
        # findtext returns None when the element is missing, which avoids
        # both the deprecated truth-test on an Element and an
        # AttributeError on a missing <malignancy> child.
        rating = nodule.findtext(
            'nih:characteristics/nih:malignancy', namespaces=ns
        )
        if rating is not None and rating.strip():
            malignancies.append(int(rating))

    return np.array(malignancies, dtype='int')

In [126]:
# Locate the CT image folder for one patient and summarise the malignancy
# values found in its XML annotation file.
path = find_ct_path(
    raw_path='../raw_data/LIDC-IDRI/',
    patient_id='LIDC-IDRI-0061',
)
malignancies = get_malignancy(path)
# Show the individual values followed by their mean, one per line.
print(malignancies, malignancies.mean(), sep='\n')

[4 3 2 5 5 3 3 3 3 3 3 5 5 2 2 2 2 2 2]
3.1052631578947367


In [39]:
# Display the pid_df frame using the notebook's rich output.
# NOTE(review): pid_df is not defined in any cell visible in this section —
# confirm an earlier cell creates it so the notebook survives Restart & Run All.
pid_df

Unnamed: 0_level_0,ROIs,position,path
UID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.3.6.1.4.1.14519.5.2.1.6279.6001.110383487652933113465768208719,"[[(312, 355), (311, 356), (310, 357), (309, 35...","[-166.000000, -171.699997, -125.000000]",../raw_data/LIDC-IDRI/LIDC-IDRI-0001/01-01-200...
1.3.6.1.4.1.14519.5.2.1.6279.6001.499837844441581448374672853475,"[[(314, 346), (313, 347), (313, 348), (313, 34...","[-166.000000, -171.699997, -122.500000]",../raw_data/LIDC-IDRI/LIDC-IDRI-0001/01-01-200...
1.3.6.1.4.1.14519.5.2.1.6279.6001.299410838455281419536742634793,"[[(312, 346), (311, 347), (311, 348), (311, 34...","[-166.000000, -171.699997, -120.000000]",../raw_data/LIDC-IDRI/LIDC-IDRI-0001/01-01-200...
1.3.6.1.4.1.14519.5.2.1.6279.6001.824843590991776411530080688091,"[[(308, 340), (307, 341), (308, 342), (308, 34...","[-166.000000, -171.699997, -117.500000]",../raw_data/LIDC-IDRI/LIDC-IDRI-0001/01-01-200...
1.3.6.1.4.1.14519.5.2.1.6279.6001.297813206491522913194774892711,"[[(314, 347), (313, 346), (312, 347), (311, 34...","[-166.000000, -171.699997, -115.000000]",../raw_data/LIDC-IDRI/LIDC-IDRI-0001/01-01-200...
1.3.6.1.4.1.14519.5.2.1.6279.6001.261151233960269013402330853013,"[[(315, 349), (314, 350), (313, 351), (312, 35...","[-166.000000, -171.699997, -112.500000]",../raw_data/LIDC-IDRI/LIDC-IDRI-0001/01-01-200...
1.3.6.1.4.1.14519.5.2.1.6279.6001.202709423777326615340853838834,"[[(310, 351), (309, 352), (309, 353), (309, 35...","[-166.000000, -171.699997, -110.000000]",../raw_data/LIDC-IDRI/LIDC-IDRI-0001/01-01-200...
1.3.6.1.4.1.14519.5.2.1.6279.6001.281416679065036634264586513142,"[[(306, 351), (305, 352), (305, 353), (304, 35...","[-166.000000, -171.699997, -107.500000]",../raw_data/LIDC-IDRI/LIDC-IDRI-0001/01-01-200...
