In [1]:
import numpy as np
import pandas as pd
import os
import pydicom as dicom
from collections import Counter
import time

In [2]:
# Wall-clock reference taken at the top of the notebook; used to report the total run time
start = time.time()

In [3]:
# Folder holding the SR DICOM files with the measurements
path = 'H:/My Desktop/sr_files'

In [4]:
# Folder where one txt file per participant is written; the txt files are later used to create the df
save_path = "H:/My Desktop/CACS_manual_new"

# Create the folder for the txt files. makedirs also creates missing parent folders, and
# exist_ok=True makes the call a no-op when the folder already exists (no separate exists() check).
os.makedirs(save_path, exist_ok=True)

In [5]:
# Extract file names from the list of files in the path and count their occurrence
files=[file.split('_')[1].split('.')[0] for file in os.listdir(path)]
Counter(files).most_common() #Check if same participant exists more than once - should not be, otherwise error

[('135984', 1),
 ('136449', 1),
 ('136567', 1),
 ('141168', 1),
 ('145677', 1),
 ('158883', 1),
 ('166198', 1),
 ('171539', 1),
 ('177488', 1),
 ('182367', 1),
 ('183377', 1),
 ('188150', 1),
 ('195806', 1),
 ('198107', 1),
 ('203629', 1),
 ('211569', 1),
 ('212333', 1),
 ('220228', 1),
 ('221349', 1),
 ('224428', 1),
 ('228777', 1),
 ('229541', 1),
 ('230268', 1),
 ('233118', 1),
 ('240819', 1),
 ('247747', 1),
 ('248597', 1),
 ('251710', 1),
 ('255514', 1),
 ('255649', 1),
 ('262895', 1),
 ('266477', 1),
 ('275872', 1),
 ('282702', 1),
 ('285527', 1),
 ('293158', 1),
 ('293967', 1),
 ('294224', 1),
 ('295772', 1),
 ('296768', 1),
 ('297674', 1),
 ('301013', 1),
 ('306167', 1),
 ('316249', 1),
 ('316867', 1),
 ('327664', 1),
 ('330189', 1),
 ('332758', 1),
 ('338683', 1),
 ('341948', 1),
 ('346061', 1),
 ('346603', 1),
 ('347137', 1),
 ('349299', 1),
 ('368158', 1),
 ('369762', 1),
 ('377008', 1),
 ('378327', 1),
 ('379271', 1),
 ('384563', 1),
 ('394676', 1),
 ('396349', 1),
 ('39758

In [6]:
# Define function to recursively loop over attributes in DICOM file to get numbers for each CAC score
# DICOM tags and text values used to pick the CAC measurements out of an SR file
_CONTENT_SEQUENCE_TAG = 0x0040A730  # 'Content Sequence' - holds nested items to descend into
_CODE_MEANING_TAG = 0x00080104      # Attribute that conveys the meaning of the text
_NUMERIC_VALUE_TAG = 0x0040A30A     # Attribute that has the numeric value of a measurement
_MEASUREMENT_NAMES = ('Calcium Volume', 'Calcium Mass', 'Calcium Score',
                      'Number of Lesions', 'Agatston Score Threshold')  # What we want to extract
_SECTION_LABELS = ('LM', 'LAD', 'CX', 'RCA', 'Ca', 'U1', 'U2')  # Attributes for which we need measurements


def _append_text(open_path, text):
    """Append `text` to the txt file at `open_path` (the `with` block closes the file)."""
    with open(open_path, 'a') as file:
        file.write(text)


def recurse(ds, open_path):
    """Recursively loop over the attributes of a DICOM dataset and append CAC information to a txt file.

    Parameters
    ----------
    ds : iterable of elements exposing `.tag` and `.value`
        In this notebook a pydicom dataset (the SR file, or an item of a 'Content Sequence').
    open_path : str
        Path of the txt file to append to.

    What gets appended, in the order the attributes are visited:
    * the measurement name (one of `_MEASUREMENT_NAMES`) WITHOUT a trailing newline,
    * every numeric value followed by a newline - so a name and its value end up on
      one line with no space in between (eg. 'Calcium Volume12.9'),
    * every section label (one of `_SECTION_LABELS`) followed by a newline.

    Returns None. Errors raised while writing to `open_path` propagate to the caller.
    """
    for elem in ds:  # Loop over elements in the DICOM file

        if elem.tag == _CONTENT_SEQUENCE_TAG:  # Keep looping inside it - call function again
            for item in elem.value:
                recurse(item, open_path)
            continue

        if not isinstance(elem.value, str):  # Not a string - will be numeric value or CAC attribute
            chunks = []  # Text found inside this element, written in one go after the scan
            try:
                # Double loop: the value may hold items which in turn hold the attributes of interest
                for elem2 in elem.value:
                    for elem3 in elem2:
                        if elem3.tag == _CODE_MEANING_TAG and elem3.value in _MEASUREMENT_NAMES:
                            chunks.append(str(elem3.value))
                        if elem3.tag == _NUMERIC_VALUE_TAG:
                            chunks.append(str(elem3.value) + '\n')
            except Exception:
                # The value may not be iterable or may hold other kinds of attributes - ignore
                # these cases, keeping whatever was found before the error. Only `Exception` is
                # caught so that KeyboardInterrupt/SystemExit still stop the run.
                pass

            # Writing happens outside the try block so that a failed write is not silently ignored
            if chunks:
                _append_text(open_path, ''.join(chunks))

        if elem.value in _SECTION_LABELS:  # Section label - to be used to save to df below
            _append_text(open_path, str(elem.value) + '\n')

In [7]:
# Start a fresh txt file for each SR file in path, holding only a header line with the
# participant id - Assume only SR files with relevant information from Syngo.via
for file in os.listdir(path):
    SR = dicom.dcmread('/'.join((path, file)))  # Load DICOM file

    open_path = '/'.join((save_path, str(SR.PatientID) + '.txt'))  # Path to save txt file

    # Mode 'w' creates the file (or empties an existing one); the with block closes it
    with open(open_path, 'w') as f:
        f.write('Information below about participant ' + str(SR.PatientID) + '\n')

### We need to repeat the loop above in a separate cell, since otherwise we get permission errors with the txt files
### We may even have to manually run the code below more than once

In [9]:
# Second pass over the SR files: append the extracted information to each participant's txt file
for file in os.listdir(path):
    SR = dicom.dcmread('/'.join((path, file)))
    open_path = '/'.join((save_path, str(SR.PatientID) + '.txt'))
    recurse(SR, open_path)

### Add from txt files to Dataframe

In [10]:
#Attributes to be added in REDCap
column_names=['participant_id','total_lesions','total_artery_volume','total_artery_mass','total_artery_agatston', 
             'lm_artery_lesions','lm_artery_volume','lm_artery_mass','lm_artery_agatston',
             'lad_artery_lesions','lad_artery_volume','lad_artery_mass','lad_artery_agatston',
             'cx_artery_lesions','cx_artery_volume','cx_artery_mass','cx_artery_agatston',
             'rca_artery_lesions','rca_artery_volume','rca_artery_mass','rca_artery_agatston']

In [25]:
# Empty df that only has the column names above; it is filled with the extracted information below
df = pd.DataFrame(columns=column_names)
df

Unnamed: 0,participant_id,total_lesions,total_artery_volume,total_artery_mass,total_artery_agatston,lm_artery_lesions,lm_artery_volume,lm_artery_mass,lm_artery_agatston,lad_artery_lesions,...,lad_artery_mass,lad_artery_agatston,cx_artery_lesions,cx_artery_volume,cx_artery_mass,cx_artery_agatston,rca_artery_lesions,rca_artery_volume,rca_artery_mass,rca_artery_agatston


#### It is assumed that each txt file always contains the total Agatston, LM, LAD, CX and RCA sections, each followed by its number of lesions, volume, mass and score. Each value was stored directly after its attribute name, without a space (e.g. 'Calcium Volume12.9').

In [26]:
# Keyword that opens a section in the txt file -> df columns that receive that section's
# number of lesions, volume, mass and score (in this order)
section_columns = {
    'Agatston': ('total_lesions', 'total_artery_volume', 'total_artery_mass', 'total_artery_agatston'),
    'LM': ('lm_artery_lesions', 'lm_artery_volume', 'lm_artery_mass', 'lm_artery_agatston'),
    'LAD': ('lad_artery_lesions', 'lad_artery_volume', 'lad_artery_mass', 'lad_artery_agatston'),
    'CX': ('cx_artery_lesions', 'cx_artery_volume', 'cx_artery_mass', 'cx_artery_agatston'),
    'RCA': ('rca_artery_lesions', 'rca_artery_volume', 'rca_artery_mass', 'rca_artery_agatston'),
}

# (text that identifies the measurement line, word right before its value) - same order as
# the columns above. 'Calcium Score' is the last measurement of a section and closes it.
measurements = (('Number of Lesions', 'Lesions'), ('Calcium Volume', 'Volume'),
                ('Calcium Mass', 'Mass'), ('Calcium Score', 'Score'))

records = []  # One dict per participant; the df is built from them in one go after the loop

# Loop over all txt files saved - sorted so that rows follow the file names (participant ids)
for file in sorted(os.listdir(save_path)):

    record = {}     # Information of this participant; columns never found stay missing (NaN)
    active = set()  # Section keywords seen and not yet closed by their 'Calcium Score' line

    with open(os.path.join(save_path, file), 'r') as f:  # 'read-only'; closed by the with block

        for line in f:  # Loop over each line of txt file

            if 'participant' in line:  # If we have participant id in that line
                # strip() removes the newline and also the space that follows 'participant'
                record['participant_id'] = line.split('participant')[1].strip()

            # Open every section whose keyword appears in this line
            for keyword in section_columns:
                if keyword in line:
                    active.add(keyword)

            # For each open section save num of lesions, volume, mass and score. The section is
            # closed after its score so that later lines of the txt file do not overwrite it.
            for keyword in tuple(active):
                for (phrase, token), column in zip(measurements, section_columns[keyword]):
                    if phrase in line:
                        # Value is what follows the last occurrence of the word, without newline
                        record[column] = line.split(token)[-1].rstrip('\n')
                if 'Calcium Score' in line:
                    active.discard(keyword)

    records.append(record)

# Build the df from all participants at once (DataFrame.append no longer exists in pandas 2.x);
# building it fresh also means that re-running this cell does not duplicate rows
df = pd.DataFrame(records, columns=column_names)

In [None]:
# Renumber the rows from 0 without keeping the old index as an 'index' column
df = df.reset_index(drop=True)
df  # Should be in increasing participant id order

In [None]:
# Participants with at least one missing value - To be reviewed and extracted again
has_missing = df.isna().any(axis=1)
df[has_missing]

In [29]:
# Keep only the participants without any missing value and save them to csv
# (an xlsx export via df_save.to_excel('CACS_REDCap.xlsx',index=False) was used as an alternative)
df_save = df.dropna(axis=0, how='any')
df_save.to_csv('CACS_REDCap.csv', index=False)

In [16]:
# Report the seconds elapsed since `start` was taken at the top of the notebook
end = time.time()
elapsed = end - start
print('Total time to run was', elapsed, 'secs')

Total time to run was 574.5030493736267 secs
