In [1]:
#Import dependencies
import os
import numpy as np
import traceback
import pandas as pd
import time
import pickle

import warnings
warnings.simplefilter(action='ignore',category=FutureWarning) #Avoid printing warning messages

In [2]:
#Wall-clock timestamp (seconds since the epoch) taken when the run begins
start = time.time()

In [3]:
#Path of folder with AI txt files
#Built with os.path.join so the separator matches the operating system (identical result on Windows)
AI_path=os.path.join(os.getcwd(),"ai_logs")

In [4]:
#Path of all AI txt files with processing time and tasks
#os.path.join avoids the invalid '\A' escape sequence and picks the right separator for the operating system
path=os.path.join(os.getcwd(),'AI_timestamp')

#Store this variable - Will be accessed inside the other notebook 
%store path

Stored 'path' (str)


In [5]:
#Run notebook 'AI_timestamp.ipynb' to find which txt filename corresponds to which participant.
#Continue execution on next cell if it gives error

try: #To ignore error and continue in next cell we need try-except and 'no raise error' flag
    %run ./AI_timestamp.ipynb --no-raise-error
except Exception as run_error: #Still continue with the next cell, but report the failure instead of hiding it
    #Catching Exception (not a bare except) lets a manual interrupt (Ctrl+C) stop the run
    #The pickle files loaded further below may be missing or outdated when this message appears
    print('AI_timestamp.ipynb did not finish cleanly: {!r}'.format(run_error))

100%|██████████| 10029/10029 [00:02<00:00, 4408.89it/s]


Total number of files without any of the above measurements (nodules, aorta, CACS, vertebra, cardiac fat) is: 0
Total number of empty files is: 492
Total number of files with measurements is: 9528
Total number of files with empty patient_names: 9
Unique participants sent for nodules:  375
All participants sent for nodules:  547


Unique participants sent for aorta:  1580
All participants sent for aorta:  1685


Unique participants sent for CACS:  4965
All participants sent for CACS:  6951


Unique participants sent for vertebra:  1880
All participants sent for vertebra:  2081


Unique participants sent for cardiac fat:  1647
All participants sent for cardiac fat:  1648


Unique participants sent for emphysema:  1883
All participants sent for emphysema:  2116
Total number of files with aorta measurements 1580
Total number of files with CACS measurements 4965
Total number of files with nodule measurements 375
Total number of files with vertebra measurements 1880
Total number of files wit

In [6]:
#Names of the pickle files created by 'AI_timestamp.ipynb' - each one stores a dictionary
#with the participants and the corresponding txt file to keep for that category
(nodule_path,
 aorta_path,
 CACS_path,
 vertebra_path,
 fat_path,
 emph_path)=('patient_and_files_{}.pkl'.format(category)
             for category in ('nodules','aorta','CACS','vertebra','fat','emph'))

### REDCap attributes to be extracted

In [7]:
#Participant ID
participant_id=['participant_id']

#Aorta measurements - one diameter column per measurement location
diam_names=[location+'_diam_ai' for location in ('sin_vals','sino_junc','mid_asc_aorta','prox_arch','mid_arch',
                                                 'prox_desc','mid_desc','diaphragm','abd_aorta')]
#Orthogonal diameters use the same locations, with 'ortho' inserted in the column name
orth_names=[name.replace('_diam_ai','_ortho_diam_ai') for name in diam_names]

#Emphysema - lung volume first, then a density and a perc15 column for the whole lung (wl) and for each lobe,
#and finally the REDCap form status column
emphysema=(['lung_vol_ai']
           +[prefix+region
             for region in ('wl','lul','lll','rul','ml','rll')
             for prefix in ('lung_dens_ai_','lung_perc15_ai_')]
           +['ai_emphysema_quantification_complete']) #set to 1=> unverified (to check automated imports manually)

#Cardiac Fat - heart volume, fat volume, and mean/SD of the fat HU values
heart_fat='heart_vol_ai cardiac_fat_vol_ai cardiac_fat_hu cardiac_fat_hu_sd'.split()

#Vertebra measurements - five columns (volume, three heights, mean HU) for each of T1-T12, L1 and C7
(vert_t1,vert_t2,vert_t3,vert_t4,vert_t5,vert_t6,vert_t7,
 vert_t8,vert_t9,vert_t10,vert_t11,vert_t12,vert_l1,vert_c7)=(
    ['vert_{}_{}'.format(measure,level)
     for measure in ('volume','post_height','ant_height','mid_height','mean_hu')]
    for level in ['t{}'.format(number) for number in range(1,13)]+['l1','c7'])

#Calcium score measurements - a flag column plus four quantities for the total and for each coronary branch
CACS_perf=['ai_cacs_perf']
total_ag,total_lm,total_lad,total_cx,total_rca=(
    ['ai_{}_{}'.format(branch,quantity)
     for quantity in ('number_lesions','artery_agatston','artery_volume','artery_mass')]
    for branch in ('total','lm','lad','cx','rca'))

#Lung Nodules - ten numbered columns for each attribute (id, volume, 2D diameter, 3D diameter)
nod_ids=['ai_nod_id{}'.format(number) for number in range(1,11)]
nod_volumes=['ai_nod_vol{}'.format(number) for number in range(1,11)]
nod_diam_2d=['ai_nod_dia2d_n{}'.format(number) for number in range(1,11)]
nod_diam_3d=['ai_nod_dia3d_n{}'.format(number) for number in range(1,11)]

nod_pos=['pos{}'.format(number) for number in range(1,11)] #Used to export file for automation

#Can be used in combination with automation algorithm - Defines if a finding was TP
# nod_true_or_not=['ai_det_true_nod1','ai_det_true_nod2','ai_det_true_nod3','ai_det_true_nod4','ai_det_true_nod5',
#                 'ai_det_true_nod6','ai_det_true_nod7','ai_det_true_nod8','ai_det_true_nod9','ai_det_true_nod10']

### Empty dataframes creation - They have one row with NaNs and the above columns

In [8]:
#Single all-NaN row; columns are the participant id followed by the attributes of every vertebra
df_vertebra=pd.DataFrame(
    index=np.arange(1),
    columns=[*participant_id,
             *vert_t1,*vert_t2,*vert_t3,*vert_t4,*vert_t5,*vert_t6,
             *vert_t7,*vert_t8,*vert_t9,*vert_t10,*vert_t11,*vert_t12,
             *vert_l1,*vert_c7],
)
df_vertebra

Unnamed: 0,participant_id,vert_volume_t1,vert_post_height_t1,vert_ant_height_t1,vert_mid_height_t1,vert_mean_hu_t1,vert_volume_t2,vert_post_height_t2,vert_ant_height_t2,vert_mid_height_t2,...,vert_volume_l1,vert_post_height_l1,vert_ant_height_l1,vert_mid_height_l1,vert_mean_hu_l1,vert_volume_c7,vert_post_height_c7,vert_ant_height_c7,vert_mid_height_c7,vert_mean_hu_c7
0,,,,,,,,,,,...,,,,,,,,,,


In [9]:
#Single all-NaN row; columns are the participant id, the diameters and the orthogonal diameters
df_aorta=pd.DataFrame(
    index=np.arange(1),
    columns=[*participant_id,*diam_names,*orth_names],
)
df_aorta

Unnamed: 0,participant_id,sin_vals_diam_ai,sino_junc_diam_ai,mid_asc_aorta_diam_ai,prox_arch_diam_ai,mid_arch_diam_ai,prox_desc_diam_ai,mid_desc_diam_ai,diaphragm_diam_ai,abd_aorta_diam_ai,sin_vals_ortho_diam_ai,sino_junc_ortho_diam_ai,mid_asc_aorta_ortho_diam_ai,prox_arch_ortho_diam_ai,mid_arch_ortho_diam_ai,prox_desc_ortho_diam_ai,mid_desc_ortho_diam_ai,diaphragm_ortho_diam_ai,abd_aorta_ortho_diam_ai
0,,,,,,,,,,,,,,,,,,,


In [10]:
#Single all-NaN row; columns are the participant id, the CACS flag and the per-branch calcium attributes
df_calcium=pd.DataFrame(
    index=np.arange(1),
    columns=[*participant_id,*CACS_perf,
             *total_ag,*total_lm,*total_lad,*total_cx,*total_rca],
)
df_calcium

Unnamed: 0,participant_id,ai_cacs_perf,ai_total_number_lesions,ai_total_artery_agatston,ai_total_artery_volume,ai_total_artery_mass,ai_lm_number_lesions,ai_lm_artery_agatston,ai_lm_artery_volume,ai_lm_artery_mass,...,ai_lad_artery_volume,ai_lad_artery_mass,ai_cx_number_lesions,ai_cx_artery_agatston,ai_cx_artery_volume,ai_cx_artery_mass,ai_rca_number_lesions,ai_rca_artery_agatston,ai_rca_artery_volume,ai_rca_artery_mass
0,,,,,,,,,,,...,,,,,,,,,,


In [11]:
#Single all-NaN row; columns are the participant id followed by the heart/cardiac fat attributes
df_cardiac_fat=pd.DataFrame(
    index=np.arange(1),
    columns=[*participant_id,*heart_fat],
)
df_cardiac_fat

Unnamed: 0,participant_id,heart_vol_ai,cardiac_fat_vol_ai
0,,,


In [12]:
#Single all-NaN row; columns are the participant id followed by the emphysema attributes
df_emph=pd.DataFrame(
    index=np.arange(1),
    columns=[*participant_id,*emphysema],
)
df_emph

Unnamed: 0,participant_id,lung_vol_ai,lung_dens_ai_wl,lung_perc15_ai_wl,lung_dens_ai_lul,lung_perc15_ai_lul,lung_dens_ai_lll,lung_perc15_ai_lll,lung_dens_ai_rul,lung_perc15_ai_rul,lung_dens_ai_ml,lung_perc15_ai_ml,lung_dens_ai_rll,lung_perc15_ai_rll,ai_emphysema_quantification_complete
0,,,,,,,,,,,,,,,


In [13]:
#Single all-NaN row; columns are the participant id followed by the nodule ids, volumes, diameters and positions
df_nodules=pd.DataFrame(
    index=np.arange(1),
    columns=[*participant_id,*nod_ids,*nod_volumes,*nod_diam_2d,*nod_diam_3d,*nod_pos],
)
# Also add 'nod_true_or_not' in the future, after running automation code to extract TP ids
df_nodules

Unnamed: 0,participant_id,ai_nod_id1,ai_nod_id2,ai_nod_id3,ai_nod_id4,ai_nod_id5,ai_nod_id6,ai_nod_id7,ai_nod_id8,ai_nod_id9,...,pos1,pos2,pos3,pos4,pos5,pos6,pos7,pos8,pos9,pos10
0,,,,,,,,,,,...,,,,,,,,,,


### Extract all information for CACS, aorta, cardiac fat, vertebra, emphysema and nodules

In [14]:
#Load dictionaries with participants and their AI files based on processing time AI outputs - Created with 'AI_timestamp.ipynb'

#NOTE: pickle.load can execute arbitrary code - only load pickle files produced by a trusted run of 'AI_timestamp.ipynb'
with open(nodule_path,'rb') as f:
    patient_and_nodules=pickle.load(f)

with open(aorta_path,'rb') as f1:
    patient_and_aorta=pickle.load(f1)

with open(CACS_path,'rb') as f2:
    patient_and_CACS=pickle.load(f2)

with open(vertebra_path,'rb') as f3:
    patient_and_vertebra=pickle.load(f3)

with open(fat_path,'rb') as f4:
    patient_and_fat=pickle.load(f4)

with open(emph_path,'rb') as f5:
    patient_and_emph=pickle.load(f5)

#The values of each dictionary are the files kept per participant - collect them in one list per category
nodule_files_list=list(patient_and_nodules.values())
aorta_files_list=list(patient_and_aorta.values())
CACS_files_list=list(patient_and_CACS.values())
vertebra_files_list=list(patient_and_vertebra.values())
fat_files_list=list(patient_and_fat.values())
emph_files_list=list(patient_and_emph.values())

#To skip extraction for any of the above categories, comment out its line above and uncomment the corresponding empty list below
# nodule_files_list=[]
# aorta_files_list=[]
# CACS_files_list=[]
# vertebra_files_list=[]
# fat_files_list=[]
# emph_files_list=[]

In [15]:
#Report how many files were selected for each measurement category
print('Total number of aorta files is {}'.format(len(aorta_files_list)))
print('Total number of CACS files is {}'.format(len(CACS_files_list)))
print('Total number of vertebra files is {}'.format(len(vertebra_files_list)))
print('Total number of nodule files is {}'.format(len(nodule_files_list)))
print('Total number of cardiac fat files is {}'.format(len(fat_files_list)))
print('Total number of emphysema files is {}'.format(len(emph_files_list)))

Total number of aorta files is 1368
Total number of CACS files is 3651
Total number of vertebra files is 1654
Total number of nodule files is 334
Total number of cardiac fat files is 1647
Total number of emphysema files is 1654


In [16]:
def get_measurements(AI_path=AI_path, #Path with AI txt files with information to extract
                    
                    #Paths of files having information for each of the nodules, aorta etc. 
                    nodule_files_list=nodule_files_list,
                    aorta_files_list=aorta_files_list,
                    CACS_files_list=CACS_files_list,
                    vertebra_files_list=vertebra_files_list,
                    fat_files_list=fat_files_list,
                    emph_files_list=emph_files_list,

                    #Empty dictionaries having as column names the attributes to be extracted for each of the above files
                    df_nodules=df_nodules,
                    df_aorta=df_aorta,
                    df_calcium=df_calcium,
                    df_vertebra=df_vertebra,
                    df_cardiac_fat=df_cardiac_fat,
                    df_emph=df_emph
                    ):
    
    'Gets the path of the AI txt files, along with the paths of measurements that we want to extract (nodules, aorta,'
    'CACS,vertebra, and cardiac_fat) along with the empty dataframes with the columns that we want to fill and import in REDCap.'
    'Returns these dataframes filled with the information of the AI txt files'
    'Dataframes are returned in the following order: df_nodules, df_aorta, df_calcium, df_vertebra, df_cardiac_fat, df_emph'

    # #High Emphysema - Just used to get a list of participants with high emphysema, will not be uploaded
    # high_emphysema_pats=[] #Keep track of participants with high emphysema (>20 specified below)
   

    for textfile in os.listdir(AI_path): #Loop over folder with AI txt files with information
        
        if (textfile in nodule_files_list or textfile in aorta_files_list or textfile in CACS_files_list
        or textfile in vertebra_files_list or textfile in fat_files_list or textfile in emph_files_list): 
            #If the txt file exists in any of the files for which we want to extract information


            with open (AI_path+'/'+textfile) as f:
                lines=f.readlines() #Read all lines in txt file

                if len(lines[2])==17 or 'imalife_' in lines[2].lower(): #10 for 'PatientID:', 6 for the actual numbers, and 1 extra character for newline

# #                     Just for debugging
#                     if "PatientID:435703" in lines[2]:
#                         print("File for participant 435703 is",textfile)
    
    
##########For vertebra measurements below

                    if textfile in vertebra_files_list: #If txt files in list of files with vertebra measurements
        
                        try: #Since file might not be processed correctly by AI
                            #List of columns of the dataframe we want to import in REDCap
                            columns_vertebra=[participant_id+vert_t1+vert_t2+vert_t3+vert_t4+vert_t5+vert_t6+vert_t7+vert_t8+vert_t9+vert_t10+vert_t11+vert_t12+vert_l1+vert_c7]

                            #Initialize empty dataframe - similar to series, just for one participant to be appended below
                            series_vertebra=pd.DataFrame(index=np.arange(1),columns=participant_id+vert_t1+vert_t2+
                                                         vert_t3+vert_t4+vert_t5+vert_t6+vert_t7+vert_t8+vert_t9+
                                                         vert_t10+vert_t11+vert_t12+vert_l1+vert_c7)

                            series_vertebra['participant_id']=int(lines[2][-7:]) #Add participant id

                            vert_measurements=[s for s in lines if '{"label":' in s] #Vertebra measurements

                            vert_stats=[s for s in lines if 'HU:' in s] #HU value measurements
                            
                            vert_errors=eval([s for s in lines if '"errorInfo":' in s][0]) #Errors by AI

                            #Check error outputs by AI in vertebra measurement and print file and participant for those cases
                            for error in vert_errors:
                                if error['errorInfo']!='': #When there is an error
                                    print('There are errors for patient {} in file {}'.format(int(lines[2][-7:]),textfile))

                            if len(vert_measurements)!=[]: #When there are vertebra measurements
                                
                                vert_inside_measurements=eval([s for s in lines if '{"label":' in s][0]) #Get dictionary of them

                                for vert_measurement in vert_inside_measurements: #Loop over keys of dictionary with measurements

                                    label=vert_measurement['label']
                                    label_lower=label.lower() #Get name of vertebrae and set it to lowercase to be used in attributes below
                                    
                                    #Extract values from the txt file for each vertebra
                                    volume=vert_measurement['volumeInCm3']
                                    posterior_height=vert_measurement['posteriorHeightDiameter']['lengthInMm']
                                    anterior_height=vert_measurement['anteriorHeightDiameter']['lengthInMm']
                                    mid_height=vert_measurement['midHeightDiameter']['lengthInMm']

                                    #Round the above values to two digits and add them to a dataframe to be appended to the full one below                       
                                    series_vertebra['vert_volume_'+label_lower]=round(volume,2)
                                    series_vertebra['vert_post_height_'+label_lower]=round(posterior_height,2)
                                    series_vertebra['vert_ant_height_'+label_lower]=round(anterior_height,2)
                                    series_vertebra['vert_mid_height_'+label_lower]=round(mid_height,2)


                            if vert_stats!=[]: #When there are HU value measurements
                                
                                for vertebrae in vert_stats: #For each of those HU value measurements in the dictionary
                                    
                                    label_and_hu=vertebrae.split('Label: ')[1].split(',HU: ') #Get label and HU value
                                    label=label_and_hu[0] #Label is the first element
                                    label_lower=label.lower() #Lowercase to be used in attribute name below
                                    hu=label_and_hu[1][:-1] #Get HU value for the above label

                                    series_vertebra['vert_mean_hu_'+label_lower]=round(float(hu),2) #Round it and add it to dataframe

                                    
                            #Append above participant to the final dataframe        
                            df_vertebra=df_vertebra.append(series_vertebra,ignore_index=True)
                            
                            #If more attributes are added (eg. l2 etc.) remove those at the end since not needed
                            df_vertebra=df_vertebra.iloc[:,:len(columns_vertebra[0])] 
                            
                            
                        except: #If there are errors print the participant that wasn't processed correctly and its AI file
                            try:
                                print('Vertebra measurements not processed for participant {} in file {}'.format(int(lines[2][-7:]),textfile))
                                # print(traceback.format_exc())
                            except: #This means problem with participant id - Probably not a 6 digit number
                                print("Problem with participant_id in vertebra measurements of file {}".format(textfile))    
                            # print(traceback.format_exc())
                            print('\n')               


##########For cardiac fat and heart volume measurements below

                    if textfile in fat_files_list: #If txt files in list of files with cardiac fat measurements
        
                        try: #In case there are errors
                            
                            #Initialize dataframe similar as above to be filled with just one row and appended to the final one
                            series_fat=pd.DataFrame(index=np.arange(1),columns=participant_id+heart_fat)

                            series_fat['participant_id']=int(lines[2][-7:]) #Add participant id

                            #Extract heart volume, round it to 2 digits, and append it to the above dataframe
                            heart_measurements=[s for s in lines if 'Heart Volume (mm3):' in s]
                            series_fat['heart_vol_ai']=round(float(heart_measurements[0].split('Heart Volume (mm3):')[-1][1:-1]),2)
                        
                            #Extract cardiac fat measurement, round it to 2 digits, and append it to the above dataframe
                            fat_measurements=[s for s in lines if 'Absolute Volume (ml):' in s]
                            series_fat['cardiac_fat_vol_ai']=round(float(fat_measurements[0].split('Absolute Volume (ml):')[-1][1:-1]),2)

                            #Extract mean for cardiac fat HU
                            mean_HU=[s for s in lines if 'Mean :' in s]
                            series_fat['cardiac_fat_hu']=round(float(mean_HU[0].split('Mean :')[-1][1:-1]),2)

                            #Extract std for cardiac fat HU
                            std_HU=[s for s in lines if 'Standard Deviation :' in s]
                            series_fat['cardiac_fat_hu_sd']=round(float(std_HU[0].split('Standard Deviation :')[-1][1:-1]),2)

                            #At the end, append the above participant and its measurements to the final dataframe    
                            df_cardiac_fat=df_cardiac_fat.append(series_fat,ignore_index=True)
        
                        except: #If errors, print participant for whom they occurred and its AI file
                            try:
                                print('Cardiac fat and/or heart measurements not processed for participant {} in file {}'.format(int(lines[2][-7:]),textfile))
                                # print(traceback.format_exc())
                            except:
                                print("Problem with participant_id in cardiac_fat measurements of file {}".format(textfile))
                            # print(traceback.format_exc())
                            print('\n')
        

##########For emphysema measurements below

                    if textfile in emph_files_list: #If txt files in list of files with emphysema measurements
        
                        try: #In case there are errors
                            
                            #Initialize dataframe similar as above to be filled with just one row and appended to the final one
                            series_emph=pd.DataFrame(index=np.arange(1),columns=participant_id+emphysema)

                            series_emph['participant_id']=int(lines[2][-7:]) #Add participant id

                            #Extract emphysema and append it to the above dataframe
                            emph_measurements=[s for s in lines if ':950 - Lobe:' in s]                          
                            
                            for lobe in emph_measurements: #Loop over all measurements at 950 HU threshold
                                #Depending on the lobe add measurements to the corresponding column of dataframe
                                
                                if 'Lobe:WHOLE' in lobe: #Whole lung
                                    if float(lobe.split('Perc15:')[-1][:-1])!=0:
                                        series_emph['lung_vol_ai']=float(lobe.split('outVolume:')[1].split(' - ')[0])
                                        series_emph['lung_dens_ai_wl']=float(lobe.split('outLAAPercent:')[1].split(' - ')[0])
                                        series_emph['lung_perc15_ai_wl']=float(lobe.split('Perc15:')[-1][:-1])

                                if 'Lobe:LU' in lobe: #Left Upper lobe
                                    if float(lobe.split('Perc15:')[-1][:-1])!=0:
                                        series_emph['lung_dens_ai_lul']=float(lobe.split('outLAAPercent:')[1].split(' - ')[0])
                                        series_emph['lung_perc15_ai_lul']=float(lobe.split('Perc15:')[-1][:-1])

                                if 'Lobe:LL' in lobe: #Left Lower lobe
                                    if float(lobe.split('Perc15:')[-1][:-1])!=0:
                                        series_emph['lung_dens_ai_lll']=float(lobe.split('outLAAPercent:')[1].split(' - ')[0])
                                        series_emph['lung_perc15_ai_lll']=float(lobe.split('Perc15:')[-1][:-1])

                                if 'Lobe:RU' in lobe: #Right Upper lobe
                                    if float(lobe.split('Perc15:')[-1][:-1])!=0:
                                        series_emph['lung_dens_ai_rul']=float(lobe.split('outLAAPercent:')[1].split(' - ')[0])
                                        series_emph['lung_perc15_ai_rul']=float(lobe.split('Perc15:')[-1][:-1])

                                if 'Lobe:RM' in lobe: #Right Middle lobe
                                    if float(lobe.split('Perc15:')[-1][:-1])!=0:
                                        series_emph['lung_dens_ai_ml']=float(lobe.split('outLAAPercent:')[1].split(' - ')[0])
                                        series_emph['lung_perc15_ai_ml']=float(lobe.split('Perc15:')[-1][:-1])

                                if 'Lobe:RL' in lobe: #Right Lower lobe
                                    if float(lobe.split('Perc15:')[-1][:-1])!=0:
                                        series_emph['lung_dens_ai_rll']=float(lobe.split('outLAAPercent:')[1].split(' - ')[0])
                                        series_emph['lung_perc15_ai_rll']=float(lobe.split('Perc15:')[-1][:-1])
                            
                            
                            series_emph['ai_emphysema_quantification_complete']=1 #Set this to 'unverified' to check manually after uploading

                            #At the end, append the above participant and its measurements to the final dataframe    
                            df_emph=df_emph.append(series_emph,ignore_index=True)
                            
                            # #Check for participants with high percentage of emphysema (>20% in any lobe) - Used for checks, will not be uploaded
                            # if (series_emph['lung_dens_ai_wl'].values[0]>=20 or 
                            #     series_emph['lung_dens_ai_lul'].values[0]>=20 or 
                            #     series_emph['lung_dens_ai_lll'].values[0]>=20 or 
                            #     series_emph['lung_dens_ai_rul'].values[0]>=20 or 
                            #     series_emph['lung_dens_ai_ml'].values[0]>=20 or 
                            #     series_emph['lung_dens_ai_rll'].values[0]>=20):
                                
                            #     high_emphysema_pats.append(int(lines[2][-7:])) #Add them to list
                                
       
                        except: #If errors here print participant for whom they occurred and its AI file
                            try:
                                print('Emphysema measurements not procesed for participant {} in file {}'.format(int(lines[2][-7:]),textfile))
                                # print(traceback.format_exc())
                            except:
                                print("Problem with participant_id in emphysema measurements of file {}".format(textfile))
                            # print(traceback.format_exc())
                            print('\n')


##########For aortic measurements below

                    if textfile in aorta_files_list: #If txt files in list of files with aorta measurements

                        pat=[] #To temporarily save a patient ID and add it to dataframe
                        diam=[] #To store all 9 diameter values
                        orth_diam=[] #To store all 9 orthogonal diameter values

                        try: #In case of errors
            
                            measurements=[s for s in lines if '{"id":' in s] #All aortic diameter measurements

                            if len(measurements)!=0: #If we have measurements

                                #Split to 9 elements of list - one for each measurement
                                inside_measurements=eval([s for s in lines if '{"id":' in s][0]) 

                                if len(inside_measurements)!=9: #We should not get in here
                                    print("We don't have 9 aorta measurements for file {}. We only have {}".format(textfile,len(inside_measurements)))

                                else: #If we have 9 aortic measurements
                                    pat.append(int(lines[2][-7:])) #Add that patient to a temporary list

                                    for i in range(9): #Loop over these measurements to extract information
                                        further_inside_measur=inside_measurements[i] #Get line with specific measurement
                                        diam.append(round(further_inside_measur['diameter']['lengthInMm'],2)) #Add diameter to list
                                        orth_diam.append(round(further_inside_measur['orthogonal_diameter']['lengthInMm'],2)) #Add orthogonal diameter to list

                                    series=[pat,diam,orth_diam] #Create list of lists with attributes to be filled in dataframe
                                    series_flat=[x_in for x in series for x_in in x] #Convert list of lists to list
                                    series_flat[0]=int(series_flat[0]) #Convert patient id to integer           

                                    series_df=pd.Series(series_flat,index=df_aorta.columns) #Create a series object from the above

                                    #At the end, append the above participant and its measurements to the final dataframe    
                                    df_aorta=df_aorta.append(series_df,ignore_index=True)

                            #Not needed to have an else statement here since it will also give all files not sent for aorta (eg. sent only for nodules)                                        

                        except: 
                            #Empty lists again since some of them may have been filled before error occured
                            pat=[]
                            diam=[]
                            orth_diam=[]
                            try:
                                print('Error in aorta measurements for participant {} in file {}'.format(int(lines[2][-7:]),textfile))
                            except:
                                print("Problem with participant_id in aorta measurements of file {}".format(textfile))
                            # print(traceback.format_exc())
                            print('\n')


##########For calcium score measurements below    

                    if textfile in CACS_files_list: #If txt files in list of files with CACS measurements

                        try: #Since we may have errors in some files
                            all_measures=[s for s in lines if 'Coronaary branch measurements:' in s] 

                            if len(all_measures)!=0: #If we have branch measurements

                                #create empty series with required column names to be added to REDCap
                                series_calcium=pd.DataFrame(index=np.arange(1),columns=participant_id+CACS_perf+total_ag+total_lm+total_lad+total_cx+total_rca) 
                                series_calcium['participant_id']=int(lines[2][-7:]) #Add participant id

                                #Extract the total agatston score, num of lesions, volume and mass for each branch
                                tot_aga=[s.split('\t') for s in lines if 'TOT' in s]
                                tot_lm=[s.split('\t') for s in lines if 'LM' in s]
                                tot_lad=[s.split('\t') for s in lines if 'LAD' in s]
                                tot_cx=[s.split('\t') for s in lines if 'CX' in s]
                                tot_rca=[s.split('\t') for s in lines if 'RCA' in s]

                                series_calcium['ai_cacs_perf']='1' #Set this variable to 1 to denote that we will add measurements - REDCap needs that

                                series_calcium['ai_total_artery_mass']=tot_aga[0][0].split(' ')[0]
                                series_calcium['ai_total_artery_agatston']=tot_aga[0][3].split(' ')[0]
                                series_calcium['ai_total_number_lesions']=tot_aga[0][4].split(' ')[0]
                                series_calcium['ai_total_artery_volume']=tot_aga[0][6].split(' ')[0]

                                series_calcium['ai_lm_artery_mass']=tot_lm[0][0].split(' ')[0]
                                series_calcium['ai_lm_artery_agatston']=tot_lm[0][3].split(' ')[0]
                                series_calcium['ai_lm_number_lesions']=tot_lm[0][4].split(' ')[0]
                                series_calcium['ai_lm_artery_volume']=tot_lm[0][6].split(' ')[0]

                                series_calcium['ai_lad_artery_mass']=tot_lad[0][0].split(' ')[0]
                                series_calcium['ai_lad_artery_agatston']=tot_lad[0][3].split(' ')[0]
                                series_calcium['ai_lad_number_lesions']=tot_lad[0][4].split(' ')[0]
                                series_calcium['ai_lad_artery_volume']=tot_lad[0][6].split(' ')[0]

                                series_calcium['ai_cx_artery_mass']=tot_cx[0][0].split(' ')[0]                            
                                series_calcium['ai_cx_artery_agatston']=tot_cx[0][3].split(' ')[0]
                                series_calcium['ai_cx_number_lesions']=tot_cx[0][4].split(' ')[0]
                                series_calcium['ai_cx_artery_volume']=tot_cx[0][6].split(' ')[0]

                                series_calcium['ai_rca_artery_mass']=tot_rca[0][0].split(' ')[0]
                                series_calcium['ai_rca_artery_agatston']=tot_rca[0][3].split(' ')[0]
                                series_calcium['ai_rca_number_lesions']=tot_rca[0][4].split(' ')[0]
                                series_calcium['ai_rca_artery_volume']=tot_rca[0][6].split(' ')[0]                            

                                #Append that information to the final df with CACS measurements
                                df_calcium=df_calcium.append(series_calcium,ignore_index=True)

                        except:
                            try:
                                print('Error in CACS measurements for participant {} in file {}'.format(int(lines[2][-7:]),textfile))
                            except:
                                print("Problem with participant_id in CACS measurements of file {}".format(textfile))
                            # print(traceback.format_exc())
                            print('\n')


##########For lung nodules measurements below

                    if textfile in nodule_files_list: #If txt files in list of files with nodule measurements

                        pat_nod=[] #To temporarily save a patient ID and add it to dataframe
                        nod_ids_new=[] #To be filled with nodule ids
                        vols=[] #To be filled with volume of nodules
                        diam_2d=[] #To be filled with the 2D diameter of nodules
                        diam_3d=[] #To be filled with the 3D diameter of nodules

                        #The following needed only for automation algorithm - Cannot be imported directly to RedCap
                        slice_pos=[] #To be filled with the slice positions in which the nodules could be found
                        # nod_type=[] #To be filled with the nodule type (solid, subsolid etc.)

                        try:
                            lesions_lung=[s for s in lines if 'Lesion :' in s] #Get information about nodules

                            if len(lesions_lung)!=0: #If we have lung nodule measurements

                                pat_nod.append(int(lines[2][-7:])) #Add participant_id to list

                                #Loop over each line and add information for nodule_id, volume, slice, diameters (and nodule type) to lists
                                for ind in range(len(lesions_lung)): 

                                    nod_ids_new.append(ind+1)
                                    vols.append(float(lesions_lung[ind].split(':')[3].split(',')[0])) 
                                    slice_pos.append(float(lesions_lung[ind].split(':')[2].split(',')[2][:-1])) 
                                    diam_2d.append(float(lesions_lung[ind].split(':')[4].split(',')[0]))
                                    diam_3d.append(float(lesions_lung[ind].split(':')[5].split(',')[0]))
                                    # nod_type.append(lesions_lung[ind].split(':')[6].strip())

                                #For the remaining nodules (until 10) fill empty strings to list instead of 'np.nan'
                                for i in range(10-len(nod_ids_new)): 
                                    nod_ids_new.append('')
                                    vols.append('')
                                    slice_pos.append('')
                                    diam_2d.append('')
                                    diam_3d.append('')
                                    # nod_type.append('')

                                series_nod=[pat_nod,nod_ids_new,vols,diam_2d,diam_3d,slice_pos] #Create list of lists with attributes to be filled in dataframe
                                series_flat_nod=[x_in for x in series_nod for x_in in x] #Convert list of lists to list                                
                                series_flat_nod[0]=int(series_flat_nod[0]) #Convert patient id to integer
                                
                                series_df_nod=pd.Series(series_flat_nod,index=df_nodules.columns) #Create a series object from the above

                                df_nodules=df_nodules.append(series_df_nod,ignore_index=True) #Add it to df

                            #Ensure that all patients (send or not for nodules) will exist in df, even with nan values
                            elif len(lesions_lung)==0: #If empty fill empty values for that patient            
                            #This happens when no nodules detected or when scan sent for aorta or other measurements and not for nodules

                                    if lines[2][-7:] not in (np.unique(df_nodules['participant_id'])): #If patient not in df

                                        series_df_fill=pd.Series(index=df_nodules.columns,dtype='object') #Create a series object from the above

                                        series_df_fill['participant_id']=int(lines[2][-7:]) #Add patient to above series

                                        #Add patient with software version to df
                                        df_nodules=df_nodules.append(series_df_fill,ignore_index=True) 


                        except:
                            try:
                                print('Error in nodule measurements for participant {} in file {}'.format(int(lines[2][-7:]),textfile))
                            except:
                                print("Problem with participant_id in nodule measurements of file {}".format(textfile))
                            # print(traceback.format_exc())
                            print('\n')

                            #Empty lists since there may not be emptied if error occurs
                            pat_nod=[]
                            temp_soft_vers=[]
                            nod_ids_new=[]
                            vols=[]
                            diam_2d=[]
                            diam_3d=[]
                            slice_pos=[]
                            # nod_type=[]

                            

                #If we have an invalid participant id - Might only get in here if participant was sent in a very early version of Siemens AI Rad Companion                          
                else:
                    print("Invalid patient {} found in file {}".format(lines[2],textfile))
                    print('\n')
      
    #Save list of participants with high emphysema (>20% in any lobe) in excel
#     high_emphysema_df=pd.DataFrame(high_emphysema_pats,columns=['participant_id'])
#     high_emphysema_df.to_excel('high_emphysema_participants.xlsx',index=False)

    return df_nodules, df_aorta, df_calcium, df_vertebra, df_cardiac_fat, df_emph

In [17]:
# %%capture cap --no-stderr
#Uncomment the cell magic above to capture the printed output of this cell (here the error messages) instead of showing it
#When it is used, also activate the next cell, which writes the captured text to a txt file

#Run the extraction - one dataframe per measurement type, unpacked in the order in which the function returns them
(df_nodules,
 df_aorta,
 df_calcium,
 df_vertebra,
 df_cardiac_fat,
 df_emph)=get_measurements()

There are errors for patient 325373 in file 1.2.276.0.28.3.345049594267.42.10032.20220925082922036.txt
Vertebra measurements not processed for participant 565033 in file 1.2.276.0.28.3.345049594267.42.10156.20220930193802000.txt


There are errors for patient 198041 in file 1.2.276.0.28.3.345049594267.42.1668.20220923112410000.txt
There are errors for patient 126676 in file 1.2.276.0.28.3.345049594267.42.1988.20220914133717000.txt
There are errors for patient 339537 in file 1.2.276.0.28.3.345049594267.42.2204.20220925125353036.txt
There are errors for patient 339537 in file 1.2.276.0.28.3.345049594267.42.2204.20220925125353036.txt
There are errors for patient 339537 in file 1.2.276.0.28.3.345049594267.42.2204.20220925125353036.txt
Vertebra measurements not processed for participant 126961 in file 1.2.276.0.28.3.345049594267.42.2804.20220914144515036.txt


There are errors for patient 134516 in file 1.2.276.0.28.3.345049594267.42.3052.20220921133240036.txt
Vertebra measurements not proc

In [18]:
# with open('errors_all.txt','w') as f: #Save output of above cell to txt file
#     f.write(cap.stdout)

### CACS information

In [None]:
#Inspect the CACS dataframe exactly as returned by 'get_measurements()', before the cleaning of the next cell
df_calcium

In [None]:
#Clean the CACS dataframe in one chain:
# - keep only complete rows (this also deletes the all-nan first row created when the empty series was added above)
# - renumber the rows from 0 without keeping the old index as a column
# - store the participant IDs as integers
df_calcium=(
    df_calcium
    .dropna()
    .reset_index(drop=True)
    .astype({'participant_id':int})
)
df_calcium

In [21]:
assert df_calcium.participant_id.is_unique #Each participant must appear only once in df_calcium

In [22]:
#Export the cleaned CACS measurements in both formats, without writing the row index
df_calcium.to_csv(path_or_buf='CACS_AI_21-4.csv',index=False) #csv version
df_calcium.to_excel(excel_writer='CACS_AI_21-4.xlsx',index=False) #excel version

### Aorta information

In [None]:
#Inspect the aorta dataframe exactly as returned by 'get_measurements()', before the cleaning of the next cell
df_aorta

In [None]:
#Clean the aorta dataframe in one chain:
# - keep only complete rows (this also deletes the all-nan first row created when the empty series was added above)
# - renumber the rows from 0 without keeping the old index as a column
# - store the participant IDs as integers
df_aorta=(
    df_aorta
    .dropna()
    .reset_index(drop=True)
    .astype({'participant_id':int})
)
df_aorta

In [25]:
assert df_aorta.participant_id.is_unique #Each participant must appear only once in df_aorta

In [26]:
#Export the cleaned aorta measurements in both formats, without writing the row index
df_aorta.to_csv(path_or_buf='aorta_RedCap_21-4.csv',index=False) #csv version
df_aorta.to_excel(excel_writer='aorta_RedCap_21-4.xlsx',index=False) #excel version

### Vertebra information

In [None]:
#Inspect the vertebra dataframe exactly as returned by 'get_measurements()', before the cleaning of the next cell
df_vertebra

In [None]:
#Remove the placeholder row without participant_id (the nan row created above).
#The row is selected by its missing participant_id instead of by position ('iloc[1:]'), so that
#re-running this cell does not silently drop one more (real) participant each time.
#There are many more rows with nans here since many vertebrae are not found by AI, so a plain 'dropna()'
#over all columns is not used: some attributes/columns might be missing in most measurements but we want to keep the rest
df_vertebra=df_vertebra.dropna(subset=['participant_id']).reset_index(drop=True)

#Convert the column with participant IDs to integers
df_vertebra=df_vertebra.astype({'participant_id':int})

assert df_vertebra.participant_id.is_unique #Confirm that we only have unique patients in df

df_vertebra

In [29]:
#Export the cleaned vertebra measurements in both formats, without writing the row index
df_vertebra.to_csv(path_or_buf='vertebra_RedCap_21-4.csv',index=False) #csv version
df_vertebra.to_excel(excel_writer='vertebra_RedCap_21-4.xlsx',index=False) #excel version

### Cardiac Fat information

In [None]:
#Inspect the cardiac fat dataframe exactly as returned by 'get_measurements()', before the cleaning of the next cell
df_cardiac_fat

In [None]:
#Clean the cardiac fat dataframe in one chain:
# - keep only complete rows (this also deletes the all-nan first row created when the empty series was added above)
# - renumber the rows from 0
# - store the participant IDs as integers
df_cardiac_fat=(
    df_cardiac_fat
    .dropna()
    .reset_index(drop=True)
    .astype({'participant_id':int})
)

assert df_cardiac_fat.participant_id.is_unique #Each participant must appear only once in the df

df_cardiac_fat

In [32]:
#Export the cleaned cardiac fat measurements in both formats, without writing the row index
df_cardiac_fat.to_csv(path_or_buf='cardiac_fat_RedCap_21-4.csv',index=False) #csv version
df_cardiac_fat.to_excel(excel_writer='cardiac_fat_RedCap_21-4.xlsx',index=False) #excel version

### Emphysema information

In [None]:
#Inspect the emphysema dataframe exactly as returned by 'get_measurements()', before the cleaning of the next cell
df_emph

In [None]:
#Clean the emphysema dataframe in one chain:
# - keep only complete rows (this also deletes the all-nan first row created when the empty series was added above)
# - renumber the rows from 0 without keeping the old index as a column
# - store the participant IDs as integers
df_emph=(
    df_emph
    .dropna()
    .reset_index(drop=True)
    .astype({'participant_id':int})
)
df_emph

In [35]:
assert df_emph.participant_id.is_unique #Each participant must appear only once in df_emph

In [36]:
#Export the cleaned emphysema measurements in both formats, without writing the row index
df_emph.to_csv(path_or_buf='emph_RedCap_21-4.csv',index=False) #csv version
df_emph.to_excel(excel_writer='emph_RedCap_21-4.xlsx',index=False) #excel version

### Nodules information

In [37]:
#just for debugging
# df_nodules[df_nodules.participant_id==101191] 

In [None]:
#Inspect the nodule dataframe exactly as returned by 'get_measurements()', before the cleaning of the next cell
df_nodules

In [None]:
#Remove the placeholder row without participant_id (the nan row created above).
#The row is selected by its missing participant_id instead of by position ('iloc[1:]'), so that
#re-running this cell does not silently drop one more (real) participant each time.
#There are many more rows with nans here since we may not have any nodules detected by AI, so a plain 'dropna()'
#over all columns is not used: those participants (no nodule values) carry information that we want to keep
df_nodules=df_nodules.dropna(subset=['participant_id']).reset_index(drop=True) #also resets index and drops the old index column
df_nodules=df_nodules.astype({'participant_id':int}) #Convert the column with participant IDs to integers
df_nodules

In [40]:
assert df_nodules.participant_id.is_unique #Each participant must appear only once in df_nodules

In [41]:
#Export to REDCap - every column except the last 10, which hold the slice information

df_nodules.iloc[:,:-10].to_csv(path_or_buf='nodules_to_REDCap_21-4.csv',index=False) #csv version
df_nodules.iloc[:,:-10].to_excel(excel_writer='nodules_to_REDCap_21-4.xlsx',index=False) #xlsx version

### Nodules for BMI experiment

In [42]:
# BMI_pats=[] #Initialize empty list to be filled with BMI participant_ids

# #Loop over possible directories with BMI participants and add them to the above list
# for BMI_pat in os.listdir(os.getcwd()+"\BMI_exp\BMI_high_scans_new"):
#     BMI_pats.append(int(BMI_pat))
# for BMI_pat in os.listdir(os.getcwd()+"\BMI_exp\BMI_low_scans_new"):
#     BMI_pats.append(int(BMI_pat))
# for BMI_pat in os.listdir(os.getcwd()+"\BMI_exp\BMI_high_scans"):
#     BMI_pats.append(int(BMI_pat))
# for BMI_pat in os.listdir(os.getcwd()+"\BMI_exp\BMI_low_scans"):
#     BMI_pats.append(int(BMI_pat))

In [43]:
# df_BMI=df_nodules.loc[df_nodules['participant_id'].isin(BMI_pats)] #Select from all participants with nodules only those of BMI experiment
# df_BMI

In [44]:
# #Check if there is any participant in our experiment that was not selected - If so, might need to resend to AI
# for participant_BMI in BMI_pats:
#     if participant_BMI not in df_BMI['participant_id'].values:
#         print("Participant",participant_BMI,"not found")

In [45]:
# df_BMI.iloc[:,:-10].to_excel('BMI_exp_AI_13-1.xlsx',index=False) #Save file to xlsx - Ignore last 10 columns with slice information
#!!WHY IGNORE THE LAST 10? POSITION NEEDED TO CONVERT TO SLICE NUMBER!!! 
#The actual dataframe needed by the automated algorithm to work is extracted below

### Nodules for Emphysema Experiment

In [46]:
#Participant IDs of the emphysema experiment, split in four groups: advanced, moderate, confluent and no emphysema
adv=[163557,197239,512145,670208,998310]

mod=[
    136550,136581,200637,215387,240819,255903,283229,294019,331182,332758,438820,503788,507704,609065,
    633549,640431,660928,757591,810826,811041,860079,873698,971099,985215,991277,101191,944714,
]

#Participant 592863 is left out of this group - has only findings <30 and >300 mm3
conf=[866164,282528,370941,617769,754238,845594,552612]

noemph=[
    136154,184429,295789,335382,341417,353491,369762,370347,382098,383275,384136,395464,406668,410655,
    427498,429789,435703,440453,451989,452500,493907,537519,570103,591162,789586,808262,146007,248597,
    388787,428859,449790,475503,485925,585377,632817,673634,817358,135915,136470,225858,225969,
    278319,320656,425409,490144,499832,518709,582854,663854,706029,870199,910698,986374,988394,
    662368,199391,427158,429703,458362,545508,720754,845334,891238,951248,100785,113137,135984,136012,
    136109,136116,136185,136307,136321,162158,136418,136432,136456,136487,136494,136425,138310,144629,
]

#Report the size of each group
print(f"Number of individuals with advanced emphysema is {len(adv)}") #5 in total
print(f"Number of individuals with confluent emphysema is {len(conf)}") #7 in total
print(f"Number of individuals with moderate emphysema is {len(mod)}") #27 in total
print(f"Number of individuals with no-emphysema is {len(noemph)}") #82 in total

Number of individuals with advanced emphysema is 5
Number of individuals with confluent emphysema is 7
Number of individuals with moderate emphysema is 27
Number of individuals with no-emphysema is 82


In [None]:
#Restrict the nodule dataframe to the participants of the emphysema experiment (all four groups together)
df_only_nodules=df_nodules[df_nodules.participant_id.isin(adv+mod+conf+noemph)]
df_only_nodules

In [48]:
#Participants of the emphysema experiment that have no row at all in df_nodules (no AI result for them)
#AI didn't detect any nodules for them, but they may still have nodules in REDCap
#This list is expected to be empty - any participant listed here is missing from the AI exports
nonods_individuals=[participant for participant in adv+mod+conf+noemph
                    if not (df_nodules.participant_id==participant).any()]

#Empty dataframe with the REDCap nodule columns in which only the participant_id column gets filled
df_nonods=pd.DataFrame(columns=participant_id+nod_ids+nod_volumes+nod_diam_2d+nod_diam_3d)
df_nonods['participant_id']=nonods_individuals
df_nonods #Only first column filled - list of participants

Unnamed: 0,participant_id,ai_nod_id1,ai_nod_id2,ai_nod_id3,ai_nod_id4,ai_nod_id5,ai_nod_id6,ai_nod_id7,ai_nod_id8,ai_nod_id9,...,ai_nod_dia3d_n1,ai_nod_dia3d_n2,ai_nod_dia3d_n3,ai_nod_dia3d_n4,ai_nod_dia3d_n5,ai_nod_dia3d_n6,ai_nod_dia3d_n7,ai_nod_dia3d_n8,ai_nod_dia3d_n9,ai_nod_dia3d_n10
0,609065,,,,,,,,,,...,,,,,,,,,,


In [None]:
#Stack the experiment participants that have an AI nodule row on top of those without any AI row
#Note that df_only_nodules itself may contain participants without nodules - those sent to AI only for aorta and not nodules
combined=[df_only_nodules,df_nonods]
df_nodules_final=pd.concat(objs=combined,axis=0)
df_nodules_final #All participants, with and without nodules
#In the cells below we define if the file for the automated algorithm is created for those participants

#### Below is the actual dataframe needed from the AI exports for the automated algorithm to work

In [None]:
#Specify below which dataframe to export, 'df_BMI' (BMI experiment) or 'df_nodules_final' (emphysema experiment)

#Columns needed are those with the volume of nodules and their location
#'sort_values' returns a new dataframe, so the edits below never write into a slice of the source dataframe
df_want=df_nodules_final[['participant_id','ai_nod_vol1','ai_nod_vol2','ai_nod_vol3','ai_nod_vol4',
                  'ai_nod_vol5','ai_nod_vol6','ai_nod_vol7','ai_nod_vol8','ai_nod_vol9','ai_nod_vol10',
                 'pos1','pos2','pos3','pos4','pos5','pos6','pos7','pos8','pos9','pos10']].sort_values(by='participant_id')

#Rename column names (V = volume, L = location) - plain assignment instead of the deprecated 'set_axis(...,inplace=True)'
df_want.columns=['participant_id','V 01','V 02','V 03','V 04','V 05','V 06','V 07','V 08','V 09','V 10',
                 'L 01','L 02','L 03','L 04','L 05','L 06','L 07','L 08','L 09','L 10']

#participant_id stays a regular column: the previous 'set_index' call discarded its result (so it had no effect)
#and the export below uses 'index=False', which would drop participant_id from the file if it were the index

#Mark every missing value with '-': nan cells as well as empty-string cells
df_want=df_want.fillna('-').replace('','-')

#Extra column with the number of nodules that AI detected
#Each missing nodule contributes two '-' cells (its volume and its location), so nodules = 10 - (number of '-')/2
#Computed for all rows at once - avoids the chained assignment "df_want['num_nodules'].iloc[i]=..." which may write to a copy
num_missing_cells=df_want.drop(columns='participant_id').eq('-').sum(axis=1)
df_want['num_nodules']=(10-num_missing_cells/2).astype(int)

df_want

In [51]:
#Write the final file that the automation algorithm reads (row index not written)
df_want.to_excel(excel_writer='emph_exp_AI_21-4.xlsx',index=False)

In [52]:
end=time.time() #Moment at which the notebook finished
print(f"It took {end-start} secs to run") #~3min

It took 171.33628106117249 secs to run
