In [None]:
import glob
import math

import numpy
import pandas
import SimpleITK
from bs4 import BeautifulSoup #conda install beautifulsoup4、conda install lxml

In [None]:
# Run the extraction over every annotation XML file found by extract_xml_files.
# NOTE(review): extract_xml_files is defined in a later cell of this notebook,
# so that cell has to be executed before this one.
extract_xml_files()

In [None]:
def extract_xml_files(annotations_root="G:/LungCancerPredict/resources/lidc_annotations"):
    """Run extract_xml_file on every XML file one directory level below a root.

    Args:
        annotations_root: Directory whose immediate sub-directories contain the
            annotation ``*.xml`` files. Defaults to the location this notebook
            was originally written for. A trailing slash is tolerated.

    Returns:
        None. Each XML path is printed and then handed to extract_xml_file.
    """
    # Normalise the root so that both "dir" and "dir/" produce the same pattern.
    root = annotations_root.rstrip("/\\")

    for sub_path in glob.glob(root + "/*"):
        for xml_path in glob.glob(sub_path + "/*.xml"):
            print(xml_path)
            extract_xml_file(xml_path=xml_path)
            
def _nodule_extent(rois):
    """Return (x_min, x_max, y_min, y_max, z_min, z_max) over all ROIs of one nodule."""
    x_min = y_min = z_min = 999999
    x_max = y_max = z_max = -999999

    for roi in rois:
        # imageZposition is parsed as a float; it is later shifted by the image
        # origin and divided by the spacing, so it need not be an integer.
        z_pos = float(roi.imageZposition.text)
        z_min, z_max = min(z_min, z_pos), max(z_max, z_pos)

        for edge_map in roi.find_all("edgeMap"):
            x_pos = int(edge_map.xCoord.text)
            y_pos = int(edge_map.yCoord.text)
            x_min, x_max = min(x_min, x_pos), max(x_max, x_pos)
            y_min, y_max = min(y_min, y_pos), max(y_max, y_pos)

    # NOTE(review): degenerate extents (x_max == x_min or y_max == y_min) are
    # not filtered out here; confirm whether such nodules should be skipped.
    return x_min, x_max, y_min, y_max, z_min, z_max


def _filter_by_agreement(rows, agreement_threshold):
    """Keep rows that overlap at least agreement_threshold rows with a different id.

    Two rows overlap when the distance between their centres (columns 1-3) is
    smaller than the diameter (column 4) of either row.
    """
    kept = []

    for row in rows:
        id1, x1, y1, z1, d1 = row[:5]
        overlaps = 0

        for other in rows:
            if other[0] == id1:
                continue

            x2, y2, z2, d2 = other[1:5]
            dist = math.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2 + (z1 - z2) ** 2)

            if dist < d1 or dist < d2:
                overlaps += 1

        if overlaps >= agreement_threshold:
            kept.append(row)

    return kept


def extract_xml_file(xml_path, agreement_threshold = 1):
    """Extract nodule positions and characteristics from one annotation XML file.

    The series UID read from the XML header is used to locate the matching
    ``.mhd`` image via find_mhd_file. For every ``unblindedReadNodule`` with at
    least two ROIs and a malignancy rating, the bounding box of its edge maps
    is turned into a centre and a diameter expressed as fractions of the image
    dimensions, and written together with the nodule characteristics to
    ``<series uid>_nodule_attribute.csv``.

    Args:
        xml_path: Path of the annotation XML file.
        agreement_threshold: When greater than 1, only nodules that overlap at
            least this many nodules with a different nodule id are kept.

    Returns:
        None. Returns early without writing anything when the XML has no
        ``LidcReadMessage`` element or no matching ``.mhd`` file is found.
    """
    res = []

    # Read inside a context manager so the file handle is always closed.
    with open(xml_path, 'r') as xml_file:
        xml = BeautifulSoup(xml_file.read(), features="xml")
    if xml.LidcReadMessage is None:
        return

    patient_id = xml.LidcReadMessage.ResponseHeader.SeriesInstanceUid.text

    src_file = find_mhd_file(patient_id)
    if src_file is None:
        return

    itk_img = SimpleITK.ReadImage(src_file)
    # SimpleITK arrays are indexed (z, y, x), hence shape[2] is the x size.
    img_array = SimpleITK.GetArrayFromImage(itk_img)
    origin = numpy.array(itk_img.GetOrigin())
    spacing = numpy.array(itk_img.GetSpacing())

    for reading_session in xml.LidcReadMessage.find_all("readingSession"):
        for nodule in reading_session.find_all("unblindedReadNodule"):

            rois = nodule.find_all("roi")
            if len(rois) < 2:
                continue

            # Nodules without a malignancy rating are not exported.
            characteristics = nodule.characteristics
            if characteristics is None:
                continue
            if characteristics.malignancy is None:
                continue

            nodule_id = nodule.noduleID.text

            x_min, x_max, y_min, y_max, z_min, z_max = _nodule_extent(rois)

            x_diameter = x_max - x_min
            y_diameter = y_max - y_min
            z_diameter = z_max - z_min

            x_center = x_min + x_diameter / 2
            y_center = y_min + y_diameter / 2
            z_center = z_min + z_diameter / 2

            # Shift by the image origin and divide by the spacing to turn the
            # z position into an index along the image's z axis.
            z_center = (z_center - origin[2]) / spacing[2]

            x_center_perc = round(x_center / img_array.shape[2], 4)
            y_center_perc = round(y_center / img_array.shape[1], 4)
            z_center_perc = round(z_center / img_array.shape[0], 4)

            diameter_perc = round(max(x_diameter / img_array.shape[2], y_diameter / img_array.shape[1]), 4)

            res.append([
                nodule_id,
                x_center_perc,
                y_center_perc,
                z_center_perc,
                diameter_perc,
                characteristics.malignancy.text,
                characteristics.sphericity.text,
                characteristics.margin.text,
                characteristics.spiculation.text,
                characteristics.texture.text,
                characteristics.calcification.text,
                characteristics.internalStructure.text,
                characteristics.lobulation.text,
                characteristics.subtlety.text,
            ])

    if agreement_threshold > 1:
        res = _filter_by_agreement(res, agreement_threshold)

    # Column names (including the "sphericiy" spelling) are kept unchanged
    # because files already written with them may be read elsewhere.
    df = pandas.DataFrame(res, columns=["anno_index", "coord_x", "coord_y", "coord_z", "diameter", "malscore" ,  "sphericiy", "margin", "spiculation", "texture", "calcification", "internal_structure", "lobulation", "subtlety"])
    df.to_csv("G:/LungCancerPredict/extracted/lidc_extracted_label/" + patient_id + "_nodule_attribute.csv", index=False)

def find_mhd_file(patient_id, luna_root="G:/LungCancerPredict/original/luna_raw/", subset_count=10):
    """Find the ``.mhd`` file whose path contains the given id.

    The directories ``subset0`` .. ``subset<subset_count - 1>`` below
    ``luna_root`` are searched in order.

    Args:
        patient_id: Text that must occur in the path of the wanted file.
        luna_root: Directory containing the ``subsetN`` folders. Defaults to
            the location this notebook was originally written for. A trailing
            slash is tolerated.
        subset_count: Number of ``subsetN`` folders to search, starting at 0.

    Returns:
        The path of the first matching ``.mhd`` file, or None when no file
        matches.
    """
    # Normalise the root so that both "dir" and "dir/" produce the same paths.
    root = luna_root.rstrip("/\\")

    for subset_no in range(subset_count):
        src_path = root + "/subset" + str(subset_no) + "/"

        for src_file in glob.glob(src_path + "*.mhd"):
            # Substring match against the whole path, as callers pass a series UID.
            if patient_id in src_file:
                return src_file

    return None