In [169]:
import os
import pandas as pd
import numpy as np
import traceback
import copy
import pickle

#Statistics
from scipy.special import ndtri
from statsmodels.stats.contingency_tables import mcnemar
# from sklearn.metrics import cohen_kappa_score
import scipy.stats as stats #For Mann-Whitney U test

import warnings #To stop pandas warnings 
warnings.simplefilter(action='ignore', category=Warning)

In [170]:
#Paths with folders containing subfolders named by each participant id. 
#These subfolders contain txt files and images of discrepancies reviewed by radiologists (information provided in txt files)
path_high=os.getcwd()+'/details_final/high'
path_low=os.getcwd()+'/details_final/low'

In [171]:
#Read excel files with data
high=pd.read_excel(os.getcwd()+'/BMI_exp_files'+"\\high_scans.xlsx") 
low=pd.read_excel(os.getcwd()+'/BMI_exp_files'+"\\low_scans.xlsx") 

Function to check results of radiologists' review

Main differences compared to emphysema experiment: Added '[:-1]' to some parameters, 'file[:3]' to some others, 
'litis' and 'ectasis' added, "'ai' in file.lower() and 'fp' not in file.lower()" deleted since not exist in Excel of review. Also some changes in 'other' prints for better viewing.

In 'Convert slices..' section: Volumes >=30mm3 kept, replaced '!!!' in manual annotations, slices_range replaced with +-10 to get the full range, "astype(str).str.contains('L')"

In excel files above, volumes changed along with the number of FPs. We also excluded subsolid nodules in those files too.

Peribronchial tissue is excluded now. We also added a section below to check for peribronchial tissue and to add the rest to peribronchial lymph nodules. 

Also atypical PFNs now in nodule category. In these changed, instead of file.lower().split('fp')[0][:-1] we removed the '[:-1]'.

In [172]:
def show_information_of_review(path):
    
    'Gets the path of a folder with subfolders containing images and txt files of results of nodule review.'
    'These results should be of the following format: "nodule"/"no nodule" and then description and a confidence score'
    'It prints the participant_id, the txt file (with the slice number and if it is a FP or FN), the confidence score,'
    'and a description of the finding given by the radiologists.'
    'Returns dictionaries with participant id and nodule ids of findings belonging to each of the nodule/non-nodule'
    'categories (two dictionaries for each category, one with FPs and one with FNs). Moreover, it returns 4 dictionaries,'
    '2 containing the participant with the correct nodule ids for each of the FPs and FNs and 2 with the wrong ones.'
    'Moreover, we get 4 more dictionaries, 2 containing only lymph nodes and 2 containing only nodule ids (FP and FN again).'
    'At last, we get 4 dictionaries with non-nodule categories, 2 with FPs and 2 with FNs. Each of them has lung and non-lung findings.'


    uncertain=0 #Unsure of what the finding is
    nodule_all=0 #Count all nodules, FPs, and FNs
    total_files=0 #Total files
    excluded=[] #Files not taken into account
    tp_mistakes=0 #For TP accidentaly considered as discrepancies during review - happened probably only once
    
    
    #All possible non-nodule categories - based on new definition   
    fibr_scar_pleural=0 
    other=0
    
    #FPs and FNs for non-nodule categories
    fibr_FP=0
    fibr_FN=0
    other_FP=0
    other_FN=0
    
    #Possible TPs (errors) for non-nodule categories
    fibr_TP=0
    other_TP=0
    
    
    #Nodule categories
    cal_nod=0
    pleu_nod=0
    other_nod=0
    subgrou_nod=0
    canc_nod=0
    
    atypical_triang=0 #This and the next are typically benign so less important if AI would miss them
    peri_fissur=0
    bronchperi=0 
    
    #TP (errors) for nodules
    other_nod_TP=0
    cal_TP=0
    pleu_TP=0
    subgrou_TP=0
    canc_TP=0
    
    atypical_TP=0
    peri_TP=0
    bronchperi_TP=0
    
    #FPs and FNs for nodule categories
    other_nod_FP=0
    cal_FP=0
    pleu_FP=0
    subgrou_FP=0
    canc_FP=0
    other_nod_FN=0
    cal_FN=0
    pleu_FN=0
    subgrou_FN=0
    canc_FN=0
    
    atypical_FP=0
    atypical_FN=0
    peri_FP=0
    peri_FN=0
    bronchperi_FP=0
    bronchperi_FN=0
    
    #Dictionaries to be filled participant_ids and nodule_ids that belong to a given category
    atyp_FN={}
    per_FN={}
    bronchioperi_FN={}
    pleural_FN={}
    calcif_FN={}
    sub_ground_FN={}
    cancer_FN={}
    other_nodules_FN={}
    
    other_nonodules_FN={}
    fibrosis_FN={}
    other_nonodules_FN_lung={}
    other_nonodules_FN_nolung={}
    
    atyp_FP={}
    per_FP={}
    bronchioperi_FP={}
    pleural_FP={}
    calcif_FP={}
    sub_ground_FP={}
    cancer_FP={}
    other_nodules_FP={}
    
    #Non-nodule categories
    other_nonodules_FP={}
    fibrosis_FP={}
    other_nonodules_FP_lung={}
    other_nonodules_FP_nolung={}

    peri=0
    
    #Initialize empty dictionaries to keep track FP and FN slices
    
    #These are for both nodules (+lymph nodes) and non-nodules
    dict_FP_correct={}
    dict_FN_correct={}
    dict_FP_wrong={}
    dict_FN_wrong={}

    #Only for lymph nodes
    lymph_FN_correct={}
    lymph_FP_wrong={}

    #Only for nodules
    nod_FN_correct={}
    nod_FP_wrong={}

    #List with confidence scores
    conf_scores=[]
    
    
    for dirpath, dirnames, filenames in os.walk(path): #Loop over folders and subfolders
        for folder in dirnames: #For each folder (has participant name) in the above directory
            for file in os.listdir(dirpath+'/'+folder): #For each file in the above folder

                if file.endswith('.txt'): #If it's a txt print it (contains the review) along with the folder name (ID)
                    print(dirpath,':',folder,':',file)

                    with open(dirpath+'/'+folder+'/'+file) as f: #Read txt file
                        lines = f.readlines()

                    folder_pat=folder[:6] #keep only first 6 letters that correspond to participant id
                        
                    #Get confidence score - the only number in the text
                    confidence=[num for line in lines for num in line if num.isdigit()] 
            
                    if len(confidence)==1: #If there are more numbers it should be checked for errors
                        print('Confidence is',int(confidence[0]))
                        conf_scores.append(int(confidence[0]))
                    else:
                        print("ERROR in confidence level of file",file)

                        
                    no_nodules=[line for line in lines if 'no ' in line.lower()] #if this string in txt then no nodule

                    if len(no_nodules)!=0: #Confirm that above non-empty list
                        
                        total_files=total_files+1 #Increase total number of files taken into account
                        print('Finding is NOT a nodule (or it is a lymph node)')
                        
                        information=[info.split('nodule',1) for info in no_nodules][0] #split only on first occurence 
                        details=[elem for elem in information if len(elem)>5] #Since we may also have an element with 'no'
                        
                        if len(details)>0: #If we have a description of finding

                            #Perform some replacements to delete '\n','-', empty spaces and confidence score
                            detailed_info=details[0].replace('-', '').replace('\n','').replace(confidence[0],'').strip()

                            print(detailed_info.replace(':',''), 'was written in the txt file')
                            
#                             if int(confidence[0])>=4: #Only take into account confident predictions
                                
                            #Below categories for non-nodules

                            #For atypical and perifissural we noted them as non-nodules while actually want to be detected
                            #We will consider them as nodules - that's why we changed to 'fn_correct', 'fp_wrong' for them
                            if ('atypical' in detailed_info.lower() or 'triangular' in detailed_info.lower() and 'fissural' not in detailed_info.lower() \
                                and 'amidst' not in detailed_info.lower()): 

                                print('atypical/triangular lymph node')
                                atypical_triang=atypical_triang+1 #Count them

                                if 'tp' in file.lower():
                                    print('This will not be considered')
                                    atypical_TP=atypical_TP+1
                                    tp_mistakes=tp_mistakes+1
                                    nodule_all=nodule_all+1

                                elif 'fp' in file.lower() or 'ai' in file.lower() and 'fn' not in file.lower():
                                    atypical_FP=atypical_FP+1 #Count them
                                    nodule_all=nodule_all+1
                                                                        
                                    if folder_pat not in dict_FP_wrong: #Add participant id to dictionary - list only with slice numbers
                                        dict_FP_wrong[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))] 
                                    else:
                                        dict_FP_wrong[folder_pat]=dict_FP_wrong[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    
                                    # if folder_pat not in lymph_FP_wrong: #Add it to dictionary with lymph nodes
                                    #     lymph_FP_wrong[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    # else:
                                    #     lymph_FP_wrong[folder_pat]=lymph_FP_wrong[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    if folder_pat not in nod_FP_wrong:
                                        nod_FP_wrong[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    else:
                                        nod_FP_wrong[folder_pat]=nod_FP_wrong[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]

                                    if int(folder_pat) in atyp_FP: #Add it to corresponding category dictionary
                                        atyp_FP[int(folder_pat)].append(file.lower().split('fp')[0])
                                        #All these were file.lower().split('fp')[0][:-1] but [:-1] removed! same for fn

                                    else:
                                        atyp_FP[int(folder_pat)]=[file.lower().split('fp')[0]]

                                elif 'fn' in file.lower(): #Similarly as above for FNs
                                    atypical_FN=atypical_FN+1
                                    nodule_all=nodule_all+1
                                    
                                    if folder_pat not in dict_FN_correct:
                                        dict_FN_correct[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    else:
                                        dict_FN_correct[folder_pat]=dict_FN_correct[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    
                                    # if folder_pat not in lymph_FN_correct:
                                    #     lymph_FN_correct[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    # else:
                                    #     lymph_FN_correct[folder_pat]=lymph_FN_correct[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]    
                                    if folder_pat not in nod_FN_correct:
                                        nod_FN_correct[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    else:
                                        nod_FN_correct[folder_pat]=nod_FN_correct[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]  
                                    
                                    if int(folder_pat) in atyp_FN:
                                        atyp_FN[int(folder_pat)].append(file.lower().split('fn')[0])
                                    else:
                                        atyp_FN[int(folder_pat)]=[file.lower().split('fn')[0]]

                                else:
                                    print('ERROR in atypical')


                            elif ('periphysural' in detailed_info.lower() or 'perifissural' in detailed_info.lower() # or 'fissural' in detailed_info.lower() 
                                  or 'fiscu' in detailed_info.lower() or 'pfn' in detailed_info.lower() or 'fissural' in detailed_info.lower() \
                                    and 'thickening' not in detailed_info.lower()): 

                                print('perifissural/fissural/PFN')
                                peri_fissur=peri_fissur+1  

                                if 'tp' in file.lower():
                                    print('This will not be considered')
                                    peri_TP=peri_TP+1
                                    tp_mistakes=tp_mistakes+1
                                    nodule_all=nodule_all+1

                                elif 'fp' in file.lower() or 'ai' in file.lower() and 'fn' not in file.lower():
                                    peri_FP=peri_FP+1
                                    nodule_all=nodule_all+1
                                    
                                    if folder_pat not in dict_FP_wrong:
                                        dict_FP_wrong[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    else:
                                        dict_FP_wrong[folder_pat]=dict_FP_wrong[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                                                                                 
                                    if folder_pat not in lymph_FP_wrong:
                                        lymph_FP_wrong[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    else:
                                        lymph_FP_wrong[folder_pat]=lymph_FP_wrong[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]

                                    if int(folder_pat) in per_FP:
                                        per_FP[int(folder_pat)].append(file.lower().split('fp')[0])
                                    else:
                                        per_FP[int(folder_pat)]=[file.lower().split('fp')[0]]

                                elif 'fn' in file.lower():
                                    peri_FN=peri_FN+1
                                    nodule_all=nodule_all+1
                                    
                                    if folder_pat not in dict_FN_correct:
                                        dict_FN_correct[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    else:
                                        dict_FN_correct[folder_pat]=dict_FN_correct[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    
                                    if folder_pat not in lymph_FN_correct:
                                        lymph_FN_correct[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    else:
                                        lymph_FN_correct[folder_pat]=lymph_FN_correct[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]


                                    if int(folder_pat) in per_FN:
                                        per_FN[int(folder_pat)].append(file.lower().split('fn')[0])
                                    else:
                                        per_FN[int(folder_pat)]=[file.lower().split('fn')[0]]

                                else:
                                    print('ERROR in periphysural')


                            elif ('fibrosis' in detailed_info.lower() or 'scar' in detailed_info.lower() 
                                  or 'thick' in detailed_info.lower() or 'strand' in detailed_info.lower()):

                                print('fibrosis/scar/pleural thickening')
                                fibr_scar_pleural=fibr_scar_pleural+1 

                                if 'tp' in file.lower():
                                    print('This will not be considered')
                                    fibr_TP=fibr_TP+1
                                    tp_mistakes=tp_mistakes+1
                                    nodule_all=nodule_all+1

                                elif 'fp' in file.lower() or 'ai' in file.lower() and 'fn' not in file.lower():
                                    fibr_FP=fibr_FP+1
                                    
                                    if folder_pat not in dict_FP_correct:
                                        dict_FP_correct[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    else:
                                        dict_FP_correct[folder_pat]=dict_FP_correct[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                        
                                    if 'ai' in file.lower() and 'fp' not in file.lower():
                                        dict_FP_correct[folder_pat]=dict_FP_correct[folder_pat][:-1]+[int(''.join([x for x in file[-7:-4] if x.isdigit()]))]
                                    
                                    if int(folder_pat) in fibrosis_FP:
                                        fibrosis_FP[int(folder_pat)].append(file.lower().split('fp')[0])
                                    else:
                                        fibrosis_FP[int(folder_pat)]=[file.lower().split('fp')[0]]

                                elif 'fn' in file.lower():
                                    fibr_FN=fibr_FN+1
                                    
                                    if folder_pat not in dict_FN_wrong:
                                        dict_FN_wrong[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    else:
                                        dict_FN_wrong[folder_pat]=dict_FN_wrong[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    
                                    if int(folder_pat) in fibrosis_FN:
                                        fibrosis_FN[int(folder_pat)].append(file.lower().split('fn')[0])
                                    else:
                                        fibrosis_FN[int(folder_pat)]=[file.lower().split('fn')[0]]

                                else:
                                    print('ERROR IN fibrosis')


                            elif ('bronch' in detailed_info.lower() or 'peribronchial' in detailed_info.lower() or 'pbv' in detailed_info.lower() \
                                or 'hilar' in detailed_info.lower()) \
                                and ('litis' not in detailed_info.lower() and 'ectasis' not in detailed_info.lower() and 'bundle' not in detailed_info.lower() \
                                and 'bronchovascular tissue' not in detailed_info.lower() and 'supportive tissue' not in detailed_info.lower() \
                                and 'some tissue' not in detailed_info.lower()):

                                print('peribronchial/bronchiovascular lymph node 1')
                                bronchperi=bronchperi+1

                                if 'tp' in file.lower():
                                    print('This will not be considered')
                                    bronchperi_TP=bronchperi_TP+1
                                    tp_mistakes=tp_mistakes+1
                                    nodule_all=nodule_all+1

                                elif 'peribronchial' in detailed_info.lower() or 'pbv' in detailed_info.lower():
                                    print('peribronchial that will not be considered - Need to confirm manually that vol<100mm3.')
                                    nodule_all=nodule_all+1
                                    peri=peri+1

                                elif 'fp' in file.lower() or 'ai' in file.lower() and 'fn' not in file.lower():
                                    bronchperi_FP=bronchperi_FP+1
                                    nodule_all=nodule_all+1
                                    
                                    if folder_pat not in dict_FP_wrong:
                                        dict_FP_wrong[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    else:
                                        dict_FP_wrong[folder_pat]=dict_FP_wrong[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]

                                    if folder_pat not in lymph_FP_wrong:
                                        lymph_FP_wrong[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    else:
                                        lymph_FP_wrong[folder_pat]=lymph_FP_wrong[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    
                                    if int(folder_pat) in bronchioperi_FP:
                                        bronchioperi_FP[int(folder_pat)].append(file.lower().split('fp')[0])
                                    else:
                                        bronchioperi_FP[int(folder_pat)]=[file.lower().split('fp')[0]]

                                elif 'fn' in file.lower():
                                    bronchperi_FN=bronchperi_FN+1
                                    nodule_all=nodule_all+1
                                    
                                    if folder_pat not in dict_FN_correct:
                                        dict_FN_correct[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    else:
                                        dict_FN_correct[folder_pat]=dict_FN_correct[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                        
                                    if folder_pat not in lymph_FN_correct:
                                        lymph_FN_correct[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    else:
                                        lymph_FN_correct[folder_pat]=lymph_FN_correct[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                        
                                    if int(folder_pat) in bronchioperi_FN:
                                        bronchioperi_FN[int(folder_pat)].append(file.lower().split('fn')[0])
                                    else:
                                        bronchioperi_FN[int(folder_pat)]=[file.lower().split('fn')[0]]
                                else:
                                    print('ERROR IN peribronchial')


                            elif 'lymph' in detailed_info.lower():  
                                #Seperate from above since sometimes it may start with 'fissural lymph node' - 'intrapulmonary lymph node'
                                #and therefore being a different category - this checked first in the 'if' above

                                if 'bronch' in detailed_info.lower():

                                    print('peribronchial/bronchiovascular lymph node')
                                    bronchperi=bronchperi+1

                                    if 'tp' in file.lower():
                                        print('This will not be considered')
                                        bronchperi_TP=bronchperi_TP+1
                                        tp_mistakes=tp_mistakes+1
                                        nodule_all=nodule_all+1

                                    elif 'peribronchial' in detailed_info.lower() or 'pbv' in detailed_info.lower():
                                        print('peribronchial that will not be considered - Need to confirm manually that vol<100mm3.')
                                        nodule_all=nodule_all+1
                                        peri=peri+1

                                    elif 'fp' in file.lower() or 'ai' in file.lower() and 'fn' not in file.lower():
                                        bronchperi_FP=bronchperi_FP+1
                                        nodule_all=nodule_all+1
                                        
                                        if folder_pat not in dict_FP_wrong:
                                            dict_FP_wrong[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                        else:
                                            dict_FP_wrong[folder_pat]=dict_FP_wrong[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]

                                        if folder_pat not in lymph_FP_wrong:
                                            lymph_FP_wrong[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                        else:
                                            lymph_FP_wrong[folder_pat]=lymph_FP_wrong[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                        
                                        if int(folder_pat) in bronchioperi_FP:
                                            bronchioperi_FP[int(folder_pat)].append(file.lower().split('fp')[0])
                                        else:
                                            bronchioperi_FP[int(folder_pat)]=[file.lower().split('fp')[0]]

                                    elif 'fn' in file.lower():
                                        bronchperi_FN=bronchperi_FN+1
                                        nodule_all=nodule_all+1
                                        
                                        if folder_pat not in dict_FN_correct:
                                            dict_FN_correct[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                        else:
                                            dict_FN_correct[folder_pat]=dict_FN_correct[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                            
                                        if folder_pat not in lymph_FN_correct:
                                            lymph_FN_correct[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                        else:
                                            lymph_FN_correct[folder_pat]=lymph_FN_correct[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                            
                                        if int(folder_pat) in bronchioperi_FN:
                                            bronchioperi_FN[int(folder_pat)].append(file.lower().split('fn')[0])
                                        else:
                                            bronchioperi_FN[int(folder_pat)]=[file.lower().split('fn')[0]]
                                    else:
                                        print('ERROR IN peribronchial')


                                else:   
                                    #We added the above section to check for peribronchial tissue and to add the rest to peribronchial lymph nodules
                                    #The ones below same as in emphysema
                                    print('atypical/triangular lymph node')
                                    atypical_triang=atypical_triang+1

                                    if 'tp' in file.lower():
                                        print('This will not be considered')
                                        atypical_TP=atypical_TP+1
                                        tp_mistakes=tp_mistakes+1
                                        nodule_all=nodule_all+1

                                    elif 'fp' in file.lower() or 'ai' in file.lower() and 'fn' not in file.lower():
                                        atypical_FP=atypical_FP+1
                                        nodule_all=nodule_all+1
                                        
                                        if folder_pat not in dict_FP_wrong:
                                            dict_FP_wrong[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                        else:
                                            dict_FP_wrong[folder_pat]=dict_FP_wrong[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]

                                        # if folder_pat not in lymph_FP_wrong:
                                        #     lymph_FP_wrong[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                        # else:
                                        #     lymph_FP_wrong[folder_pat]=lymph_FP_wrong[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                        if folder_pat not in nod_FP_wrong:
                                            nod_FP_wrong[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                        else:
                                            nod_FP_wrong[folder_pat]=nod_FP_wrong[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                        
                                        if int(folder_pat) in atyp_FP:
                                            atyp_FP[int(folder_pat)].append(file.lower().split('fp')[0])
                                        else:
                                            atyp_FP[int(folder_pat)]=[file.lower().split('fp')[0]]

                                    elif 'fn' in file.lower():
                                        atypical_FN=atypical_FN+1
                                        nodule_all=nodule_all+1
                                        
                                        if folder_pat not in dict_FN_correct:
                                            dict_FN_correct[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                        else:
                                            dict_FN_correct[folder_pat]=dict_FN_correct[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    
                                        # if folder_pat not in lymph_FN_correct:
                                        #     lymph_FN_correct[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                        # else:
                                        #     lymph_FN_correct[folder_pat]=lymph_FN_correct[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                        if folder_pat not in nod_FN_correct:
                                            nod_FN_correct[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                        else:
                                            nod_FN_correct[folder_pat]=nod_FN_correct[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]    
                                            
                                        if int(folder_pat) in atyp_FN:
                                            atyp_FN[int(folder_pat)].append(file.lower().split('fn')[0])
                                        else:
                                            atyp_FN[int(folder_pat)]=[file.lower().split('fn')[0]]

                                    else:
                                        print('ERROR IN atypical')


                            else: #Here when we have description but it's other non-nods (eg. atelectasis)
                                other=other+1

                                if 'tp' in file.lower():
                                    print('This will not be considered')
                                    other_TP=other_TP+1
                                    tp_mistakes=tp_mistakes+1
                                    nodule_all=nodule_all+1

                                elif 'fp' in file.lower() or 'ai' in file.lower() and 'fn' not in file.lower():
                                    other_FP=other_FP+1
                                    
                                    if folder_pat not in dict_FP_correct:
                                        dict_FP_correct[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    else:
                                        dict_FP_correct[folder_pat]=dict_FP_correct[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    
                                    if int(folder_pat) in other_nonodules_FP:
                                        other_nonodules_FP[int(folder_pat)].append(file.lower().split('fp')[0])

                                        res=detailed_info.lower() #Get information about type of non-nodule

                                        if 'atele' in res or 'infe' in res or 'conso' in res or 'mucu' in res or 'vess' in res \
                                            or 'vascular' in res or 'pleural' in res or 'adhesion' in res or 'bronchi' in res \
                                            or 'abnormality' in res or 'infiltrate' in res: #Without parenthesis always get in!
                                            print("Classified as lung finding")
                                            try: #Lung findings
                                                other_nonodules_FP_lung[int(folder_pat)].append(file.lower().split('fp')[0])
                                            except:
                                                other_nonodules_FP_lung[int(folder_pat)]=[file.lower().split('fp')[0]]

                                        elif 'bone' in res or 'osis' in res or 'fat' in res or 'tiss' in res or 'colon' in res \
                                            or 'cartilage' in res or 'diaphragm' in res or 'hernia' in res: #Non-lung
                                            print("Classified as non-lung finding")
                                            try:
                                                other_nonodules_FP_nolung[int(folder_pat)].append(file.lower().split('fp')[0])
                                            except:
                                                other_nonodules_FP_nolung[int(folder_pat)]=[file.lower().split('fp')[0]]
                                        else:
                                            print("Cannot classify it as lung/non-lung based on description")


                                    else:
                                        other_nonodules_FP[int(folder_pat)]=[file.lower().split('fp')[0]]

                                        res=detailed_info.lower()

                                        if 'atele' in res or 'infe' in res or 'conso' in res or 'mucu' in res or 'vess' in res \
                                            or 'vascular' in res or 'pleural' in res or 'adhesion' in res or 'bronchi' in res \
                                            or 'abnormality' in res or 'infiltrate' in res: #Without parenthesis always get in!
                                            print("Classified as lung finding")

                                            try:
                                                other_nonodules_FP_lung[int(folder_pat)].append(file.lower().split('fp')[0])
                                            except:
                                                other_nonodules_FP_lung[int(folder_pat)]=[file.lower().split('fp')[0]]

                                        elif 'bone' in res or 'osis' in res or 'fat' in res or 'tiss' in res or 'colon' in res \
                                            or 'cartilage' in res or 'diaphragm' in res or 'hernia' in res: #Non-lung
                                            print("Classified as non-lung finding")
                                            try:
                                                other_nonodules_FP_nolung[int(folder_pat)].append(file.lower().split('fp')[0])
                                            except:
                                                other_nonodules_FP_nolung[int(folder_pat)]=[file.lower().split('fp')[0]]
                                        else:
                                            print("Cannot classify it as lung/non-lung based on description")
                                        
                                elif 'fn' in file.lower():
                                    other_FN=other_FN+1
                                    
                                    if folder_pat not in dict_FN_wrong:
                                        dict_FN_wrong[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    else:  
                                        dict_FN_wrong[folder_pat]=dict_FN_wrong[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    
                                    if int(folder_pat) in other_nonodules_FN:
                                        other_nonodules_FN[int(folder_pat)].append(file.lower().split('fn')[0][:-1])

                                        res=detailed_info.lower()

                                        if 'atele' in res or 'infe' in res or 'conso' in res or 'mucu' in res or 'vess' in res \
                                            or 'vascular' in res or 'pleural' in res or 'adhesion' in res or 'bronchi' in res \
                                            or 'abnormality' in res or 'infiltrate' in res: #Without parenthesis always get in!
                                            print("Classified as lung finding")
                                            try:
                                                other_nonodules_FN_lung[int(folder_pat)].append(file.lower().split('fn')[0])
                                            except:
                                                other_nonodules_FN_lung[int(folder_pat)]=[file.lower().split('fn')[0]]

                                        elif 'bone' in res or 'osis' in res or 'fat' in res or 'tiss' in res or 'colon' in res \
                                            or 'cartilage' in res or 'diaphragm' in res or 'hernia' in res: #Non-lung
                                            print("Classified as non-lung finding")
                                            try:
                                                other_nonodules_FN_nolung[int(folder_pat)].append(file.lower().split('fn')[0])
                                            except:
                                                other_nonodules_FN_nolung[int(folder_pat)]=[file.lower().split('fn')[0]]
                                        else:
                                            print("Cannot classify it as lung/non-lung based on description")

                                    else:
                                        other_nonodules_FN[int(folder_pat)]=[file.lower().split('fn')[0][:-1]]

                                        res=detailed_info.lower()

                                        if 'atele' in res or 'infe' in res or 'conso' in res or 'mucu' in res or 'vess' in res \
                                            or 'vascular' in res or 'pleural' in res or 'adhesion' in res or 'bronchi' in res \
                                            or 'abnormality' in res or 'infiltrate' in res: #Without parenthesis always get in!
                                            print("Classified as lung finding")
                                            try:
                                                other_nonodules_FN_lung[int(folder_pat)].append(file.lower().split('fn')[0])
                                            except:
                                                other_nonodules_FN_lung[int(folder_pat)]=[file.lower().split('fn')[0]]

                                        elif 'bone' in res or 'osis' in res or 'fat' in res or 'tiss' in res or 'colon' in res \
                                            or 'cartilage' in res or 'diaphragm' in res or 'hernia' in res: #Non-lung
                                            print("Classified as non-lung finding")
                                            try:
                                                other_nonodules_FN_nolung[int(folder_pat)].append(file.lower().split('fn')[0])
                                            except:
                                                other_nonodules_FN_nolung[int(folder_pat)]=[file.lower().split('fn')[0]]
                                        else:
                                            print("Cannot classify it as lung/non-lung based on description")

                                else:
                                    print('ERROR IN other')                                    
                                        
#                             else:
#                                 print('Low confidence <=3 - excluded from analysis')
#                                 excluded.append(folder+':'+file)
                                
                                
                        else: #If we don't have a description of the finding - we add those 'non-nodule' in 'other'
                            
#                             if int(confidence[0])>=4:

                                print('No information for non-nodule file',dirpath,':',folder,':',file)
                                other=other+1

                                if 'tp' in file.lower():
                                    print('This will not be considered')                    
                                    other_TP=other_TP+1
                                    tp_mistakes=tp_mistakes+1

                                elif 'fp' in file.lower() or 'ai' in file.lower():
                                    other_FP=other_FP+1
                                    
                                    if folder_pat not in dict_FP_correct:
                                        dict_FP_correct[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    else:
                                        dict_FP_correct[folder_pat]=dict_FP_correct[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    
                                    if int(folder_pat) in other_nonodules_FP:
                                        other_nonodules_FP[int(folder_pat)].append(file.lower().split('fp')[0])
                                    else:
                                        other_nonodules_FP[int(folder_pat)]=[file.lower().split('fp')[0]]
                                        
                                elif 'fn' in file.lower():
                                    other_FN=other_FN+1
                                    
                                    if folder_pat not in dict_FN_wrong:
                                        dict_FN_wrong[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    else:
                                        dict_FN_wrong[folder_pat]=dict_FN_wrong[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    
                                    if int(folder_pat) in other_nonodules_FN:
                                        other_nonodules_FN[int(folder_pat)].append(file.lower().split('fn')[0])
                                    else:
                                        other_nonodules_FN[int(folder_pat)]=[file.lower().split('fn')[0]]
                                        
                                else:
                                    print('ERROR IN other')

#                             else:
#                                 print('Low confidence <=3 - excluded from analysis')
#                                 excluded.append(folder+':'+file)
                                    
                        print('\n')


                    else: #If it's not a non-nodule, it will be either 'unsure' or 'nodule'
                        
                        total_files=total_files+1 #Increase total number of files taken into account
                        unsure=[line for line in lines if 'unsure' in line.lower()] #If line contains 'unsure'
                        
                        if len(unsure)!=0: #If it's not empty, then unsure about finding
                            print('Unsure about what this finding is')
                            uncertain=uncertain+1
#                             print('Low confidence <=3 - excluded from analysis')
#                             excluded.append(folder+':'+file)
                            
                        else: #Otherwise it's a nodule
                            print('Finding is a nodule')
                            
                            nodules=[line for line in lines if 'nodule' in line.lower()] #Confirm 'nodule' in line
                            information=[info.split('nodule',1) for info in nodules][0] #split only on first occurence
                            details=[elem for elem in information if len(elem)>5] #similar as above
                            
                            if len(details)>0: #If we have a description of finding 
                                    
                                    #Clean as above plus ':'
                                    nod_desc=details[0].lower().replace('nodule','').replace('-', '').replace('\n','').replace(confidence[0],'').replace(':','').strip()
                                    print(nod_desc)
            
#                                     if int(confidence[0])>=4:
                    
                                    #Below categories for nodules

                                    if 'calc' in nod_desc:
                                        cal_nod=cal_nod+1
                                        print('Calcified nodule added')

                                        if 'tp' in file.lower():
                                            print('This will not be considered')
                                            cal_TP=cal_TP+1
                                            tp_mistakes=tp_mistakes+1

                                        elif 'fp' in file.lower() or 'ai' in file.lower():
                                            cal_FP=cal_FP+1
                                            
                                            if int(folder_pat) in calcif_FP:
                                                calcif_FP[int(folder_pat)].append(file.lower().split('fp')[0])
                                            else:
                                                calcif_FP[int(folder_pat)]=[file.lower().split('fp')[0]]

                                        elif 'fn' in file.lower():
                                            cal_FN=cal_FN+1
                                            
                                            if int(folder_pat) in calcif_FN:
                                                calcif_FN[int(folder_pat)].append(file.lower().split('fn')[0])
                                            else:
                                                calcif_FN[int(folder_pat)]=[file.lower().split('fn')[0]]

                                        else:
                                            print('ERROR IN calcified')


                                    elif 'pleu' in nod_desc:
                                        pleu_nod=pleu_nod+1
                                        print('pleural nodule added')

                                        if 'tp' in file.lower():
                                            print('This will not be considered')
                                            pleu_TP=pleu_TP+1
                                            tp_mistakes=tp_mistakes+1

                                        elif 'fp' in file.lower() or 'ai' in file.lower():
                                            pleu_FP=pleu_FP+1
                                            
                                            if int(folder_pat) in pleural_FP:
                                                pleural_FP[int(folder_pat)].append(file.lower().split('fp')[0])
                                            else:
                                                pleural_FP[int(folder_pat)]=[file.lower().split('fp')[0]]

                                        elif 'fn' in file.lower():
                                            pleu_FN=pleu_FN+1
                                            
                                            if int(folder_pat) in pleural_FN: 
                                                pleural_FN[int(folder_pat)].append(file.lower().split('fn')[0])
                                            else:
                                                pleural_FN[int(folder_pat)]=[file.lower().split('fn')[0]]

                                        else:
                                            print('ERROR IN pleural nodules')


                                    elif 'sub' in nod_desc or 'grou' in nod_desc:
                                        subgrou_nod=subgrou_nod+1
                                        print('subsolid/ground glass nodule added')

                                        if 'tp' in file.lower():
                                            print('This will not be considered')
                                            subgrou_TP=subgrou_TP+1
                                            tp_mistakes=tp_mistakes+1

                                        elif 'fp' in file.lower() or 'ai' in file.lower():
                                            subgrou_FP=subgrou_FP+1
                                            
                                            if int(folder_pat) in sub_ground_FP:
                                                sub_ground_FP[int(folder_pat)].append(file.lower().split('fp')[0])
                                            else:
                                                sub_ground_FP[int(folder_pat)]=[file.lower().split('fp')[0]]

                                        elif 'fn' in file.lower():
                                            subgrou_FN=subgrou_FN+1
                                            
                                            if int(folder_pat) in sub_ground_FN:
                                                sub_ground_FN[int(folder_pat)].append(file.lower().split('fn')[0])
                                            else:
                                                sub_ground_FN[int(folder_pat)]=[file.lower().split('fn')[0]]

                                        else:
                                            print('ERROR IN subsolid/ground class nodules')


                                    elif 'canc' in nod_desc:
                                        canc_nod=canc_nod+1
                                        print('cancer added')

                                        if 'tp' in file.lower():
                                            print('This will not be considered')
                                            canc_TP=canc_TP+1
                                            tp_mistakes=tp_mistakes+1

                                        elif 'fp' in file.lower() or 'ai' in file.lower():
                                            canc_FP=canc_FP+1
                                            
                                            if int(folder_pat) in cancer_FP:
                                                cancer_FP[int(folder_pat)].append(file.lower().split('fp')[0])
                                            else:
                                                cancer_FP[int(folder_pat)]=[file.lower().split('fp')[0]]                                           

                                        elif 'fn' in file.lower():
                                            canc_FN=canc_FN+1
                                            
                                            if int(folder_pat) in cancer_FN:
                                                cancer_FN[int(folder_pat)].append(file.lower().split('fn')[0])
                                            else:
                                                cancer_FN[int(folder_pat)]=[file.lower().split('fn')[0]]

                                        else:
                                            print('ERROR IN cancer')
    
#                                     else:
#                                         print('Low confidence <=3 - excluded from analysis')
#                                         excluded.append(folder+':'+file)
                        
                                    else:
                                        
        #                                 if int(confidence[0])>=4:
                                            
                                            other_nod=other_nod+1
                                            print('No information for file with nodule:',dirpath,':',folder,':',file)


                                            if 'tp' in file.lower():
                                                print('This will not be considered')
                                                other_nod_TP=other_nod_TP+1
                                                tp_mistakes=tp_mistakes+1

                                            elif 'fp' in file.lower() or 'ai' in file.lower():
                                                other_nod_FP=other_nod_FP+1
                                                
                                                if int(folder_pat) in other_nodules_FP:
                                                    other_nodules_FP[int(folder_pat)].append(file.lower().split('fp')[0])
                                                else:     
                                                    other_nodules_FP[int(folder_pat)]=[file.lower().split('fp')[0]]

                                            elif 'fn' in file.lower():
                                                other_nod_FN=other_nod_FN+1
                                                
                                                if int(folder_pat) in other_nodules_FN:
                                                    other_nodules_FN[int(folder_pat)].append(file.lower().split('fn')[0])
                                                else:
                                                    other_nodules_FN[int(folder_pat)]=[file.lower().split('fn')[0]]

                                            else:
                                                print('ERROR IN other')               
                                                
        #                                 else:
        #                                     print('Low confidence <=3 - excluded from analysis')
        #                                     excluded.append(folder+':'+file)
                                      
                            
#                             if int(confidence[0])>=4:
                            
                            nodule_all=nodule_all+1                                      

                            if 'fn' in file.lower() and 'fp' not in file.lower(): 
                            #Ensure that it was FN - second condition to confirm it
                        
                                if folder_pat not in dict_FN_correct:
                                    dict_FN_correct[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                else:
                                    #First letters pick up the slice number
                                    dict_FN_correct[folder_pat]=dict_FN_correct[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                    
                                    
                                if folder_pat not in nod_FN_correct:
                                    nod_FN_correct[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                else:
                                    nod_FN_correct[folder_pat]=nod_FN_correct[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]    
                                    

                            elif ('fp' in file.lower() or 'ai' in file.lower()) and 'fn' not in file.lower():
                               
                                if folder_pat not in dict_FP_wrong:
                                    dict_FP_wrong[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                else:
                                    dict_FP_wrong[folder_pat]=dict_FP_wrong[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]
                    
                                if folder_pat not in nod_FP_wrong:
                                    nod_FP_wrong[folder_pat]=[int(''.join([x for x in file[:3] if x.isdigit()]))]
                                else:
                                    nod_FP_wrong[folder_pat]=nod_FP_wrong[folder_pat]+[int(''.join([x for x in file[:3] if x.isdigit()]))]
                        

                                if 'ai' in file.lower() and 'fp' not in file.lower():
                                    dict_FP_wrong[folder_pat]=dict_FP_wrong[folder_pat][:-1]+[int(''.join([x for x in file[-7:-4] if x.isdigit()]))]
                                    nod_FP_wrong[folder_pat]=nod_FP_wrong[folder_pat][:-1]+[int(''.join([x for x in file[-7:-4] if x.isdigit()]))]

                        print('\n')

                                                                                                 
    print('Num of uncertainties',uncertain)
    print('\n')
        
    print("From nodules, there were {} FP, {} FN, and {} TP calcified nodules".format(cal_FP,cal_FN,cal_TP))    
    print("From nodules, there were {} FP, {} FN, and {} TP pleural nodules".format(pleu_FP,pleu_FN,pleu_TP))
    print("From nodules, there were {} FP, {} FN, and {} TP 'other' nodules".format(other_nod_FP,other_nod_FN,other_nod_TP))
    print("From nodules, there were {} FP, {} FN, and {} TP subsolid/ground class nodules".format(subgrou_FP,subgrou_FN,subgrou_TP))
    print("From nodules, there were {} FP, {} FN, and {} TP cancer cases".format(canc_FP,canc_FN,canc_TP))
    print("There are {} FP, {} FN, and {} TP atypical PFN and/or triangular lymph nodes".format(atypical_FP,atypical_FN,atypical_TP))
    print("There are {} FP, {} FN, and {} TP perifissural/fissural/PFN".format(peri_FP,peri_FN,peri_TP))
    print("There are {} FP, {} FN, and {} TP bronchiovascular lymph nodes".format(bronchperi_FP,bronchperi_FN,bronchperi_TP))
    print("There are {} peribronchial (excluded) lymph nodes - both FP and FN".format(peri))
    print('\n')

    print("There are {} FP, {} FN, and {} TP fibrosis/scar/pleural thickening".format(fibr_FP,fibr_FN,fibr_TP))
    print("There are {} FP, {} FN, and {} TP other findings (bone, tissue, mucus, arthosis, vessel, consolidation, infection, fat, atelectasis, etc. )".format(other_FP,other_FN,other_TP))
    print('\n')

    print("Total number of files is ",total_files)
    print("From those, there were {} files excluded due to low confidence <=3 and their names are: {}".format(len(excluded),excluded))

    #Confirm that non-nodules found properly
    assert fibr_scar_pleural==fibr_FP+fibr_FN+fibr_TP

    #Confirm that nodules found properly
    assert cal_nod==cal_FP+cal_FN+cal_TP
    assert pleu_nod==pleu_FP+pleu_FN+pleu_TP
    assert subgrou_nod==subgrou_FP+subgrou_FN+subgrou_TP
    assert canc_nod==canc_FP+canc_FN+canc_TP
    
    assert atypical_triang==atypical_FP+atypical_FN+atypical_TP
    assert peri_fissur==peri_FP+peri_FN+peri_TP
    assert bronchperi==bronchperi_TP+bronchperi_FP+bronchperi_FN+peri

    assert nodule_all==cal_nod+pleu_nod+other_nod+subgrou_nod+canc_nod+atypical_triang+peri_fissur+bronchperi

    print("Mean confidence score is ",np.mean(conf_scores),'and median is', np.median(conf_scores), \
          'for a total of ',len(conf_scores),'files')
    
    return (atyp_FN,per_FN,pleural_FN,calcif_FN,sub_ground_FN,cancer_FN,other_nodules_FN,other_nonodules_FN,fibrosis_FN,
            bronchioperi_FN,atyp_FP,per_FP,pleural_FP,calcif_FP,sub_ground_FP,cancer_FP,other_nodules_FP,
            other_nonodules_FP,fibrosis_FP,bronchioperi_FP,dict_FP_correct,dict_FP_wrong,dict_FN_correct,dict_FN_wrong,
            lymph_FP_wrong,lymph_FN_correct, nod_FP_wrong, nod_FN_correct,
            other_nonodules_FN_lung,other_nonodules_FN_nolung, other_nonodules_FP_lung,other_nonodules_FP_nolung)

In [1]:
# %%capture cap --no-stderr

(atyp_FN_high,per_FN_high,pleural_FN_high,calcif_FN_high,sub_ground_FN_high,cancer_FN_high,
 other_nodules_FN_high,other_nonodules_FN_high,fibrosis_FN_high,bronchioperi_FN_high,atyp_FP_high,per_FP_high,
 pleural_FP_high,calcif_FP_high,sub_ground_FP_high,cancer_FP_high,other_nodules_FP_high,other_nonodules_FP_high,
 fibrosis_FP_high,bronchioperi_FP_high,dict_FP_correct_high,dict_FP_wrong_high,dict_FN_correct_high,
 dict_FN_wrong_high,lymph_FP_wrong_high,lymph_FN_correct_high, nod_FP_wrong_high, nod_FN_correct_high,
 other_nonodules_FN_lung_high,other_nonodules_FN_nolung_high,
 other_nonodules_FP_lung_high,other_nonodules_FP_nolung_high)=show_information_of_review(path_high)

#If we want to save the cell output to txt use below and activate above
# with open('no_emph_review.txt', 'w') as f:
#     f.write(cap.stdout)

In [2]:
# %%capture cap --no-stderr

(atyp_FN_low,per_FN_low,pleural_FN_low,calcif_FN_low,sub_ground_FN_low,cancer_FN_low,other_nodules_FN_low,
 other_nonodules_FN_low,fibrosis_FN_low,bronchioperi_FN_low,atyp_FP_low,per_FP_low,pleural_FP_low,calcif_FP_low,
 sub_ground_FP_low,cancer_FP_low,other_nodules_FP_low,other_nonodules_FP_low,fibrosis_FP_low,bronchioperi_FP_low,
 dict_FP_correct_low,dict_FP_wrong_low,dict_FN_correct_low,dict_FN_wrong_low,
 lymph_FP_wrong_low,lymph_FN_correct_low, nod_FP_wrong_low, nod_FN_correct_low,
 other_nonodules_FN_lung_low,other_nonodules_FN_nolung_low,
 other_nonodules_FP_lung_low,other_nonodules_FP_nolung_low)=show_information_of_review(path_low)

# with open('emph_review.txt', 'w') as f:
#     f.write(cap.stdout)

### Convert slices to IDs - Use manually checked annotations

In [3]:
# high

In [176]:
#Added to replace issues with automation algorithm output - Cases with errors
high['AI_nod1']=high['AI_nod1'].replace("!!!",'',regex=True) #Replace double exclamation marks 
low['AI_nod1']=low['AI_nod1'].replace("!!!",'',regex=True) #Replace double exclamation marks 

high['AI_nod1']=high['AI_nod1'].replace("xxx",'',regex=True) #Replace double exclamation marks 
low['AI_nod1']=low['AI_nod1'].replace("xxx",'',regex=True) #Replace double exclamation marks 

In [4]:
# high

In [178]:
vol_cols=[col for col in high.columns if 'V' in col] #Get name of columns containing volumes of AI nodules

BMI_deg=['high_fp','low_fp']

for deg in BMI_deg: #Loop over BMI degrees
    for col in vol_cols: #Loop over columns with volumes
        #If the volume is less than 30mm3 we should ignore them - set it along with the corresponding AI nod to '-' 
        #This can be done since we get TP and FN from other file - This only considers FPs
        for ind,val in eval(deg[:-3]+"[("+deg[:-3]+"['"+col+"']<30)]['"+col+"'].items()"): #Changed to keep all volumes >=30mm3
            exec(deg[:-3]+"['"+col+"'].iloc[ind]=np.nan") #was '-' instead of nan
            exec(deg[:-3]+"['AI_nod"+str(col[1:])+"'].iloc[ind]=np.nan")

In [179]:
#Added since we will keep all volumes, replace 0 with nans to be used below
high['0-100fp'].replace(0, np.nan, inplace=True)
high['100-300fp'].replace(0, np.nan, inplace=True)
high['300+ fp'].replace(0, np.nan, inplace=True)

low['0-100fp'].replace(0, np.nan, inplace=True)
low['100-300fp'].replace(0, np.nan, inplace=True)
low['300+ fp'].replace(0, np.nan, inplace=True)

In [180]:
#Select rows where we have at least one FP in any of the 0-100 or 100-300 or 300+ volume subgroup
high_fp=high[(high['100-300fp'].notnull() | high['0-100fp'].notnull() | high['300+ fp'].notnull()) & high['participant_id'].notnull()]
low_fp=low[(low['100-300fp'].notnull() | low['0-100fp'].notnull() | low['300+ fp'].notnull()) & low['participant_id'].notnull()]

In [181]:
#Initialize empty dicts in the form {'pat_id1':[],'pat_id2':[],...}
high_dict=dict.fromkeys([str(numeric_string) for numeric_string in high_fp['participant_id'].values], [])
high_dict=[[key[:6],[]] for (key, value) in high_dict.items()]
high_dict = {item[0]: item[1] for item in high_dict}
high_fp['participant_id']=list(high_dict.keys())

low_dict=dict.fromkeys([str(numeric_string) for numeric_string in low_fp['participant_id'].values], [])
low_dict=[[key[:6],[]] for (key, value) in low_dict.items()]
low_dict = {item[0]: item[1] for item in low_dict}
low_fp['participant_id']=list(low_dict.keys())

In [182]:
#Volume dictionaries - Normal copies don't work properly. This is why deepcopy is used
high_dict_vol=copy.deepcopy(high_dict)
low_dict_vol=copy.deepcopy(low_dict)

In [183]:
AI_cols=[col for col in high_fp.columns if 'AI_nod' in col] #Get name of columns containing AI nodules

In [5]:
# high_fp

In [185]:
BMI_deg=['high_fp','low_fp'] #list with strings of dfs to loop

for deg in BMI_deg: #Loop over emphysema degrees
    print(deg)
    
    for ind_col,col in enumerate(AI_cols): #Loop over AI nodule columns
        
        #Following line to change nan with '-' since otherwise cannot check for string with 'L' below
        exec(deg[:-3]+"_fp['"+col+"']="+deg[:-3]+"_fp['"+col+"'].fillna('-')")
        exec(deg[:-3]+"_fp['"+str(col)+"'] = "+deg[:-3]+"_fp['"+str(col)+"'].astype(str)") #Convert to string type to use below

        #Create variables storing only those rows of df that a specific AI_nod col contains 'L' (denotes a TP)-or not those
        exec('temp='+deg[:-3]+'_fp[~'+deg[:-3]+"_fp['"+str(col)+"'].astype(str).str.contains('L')]") #FPs
        exec('temp_tp='+deg[:-3]+'_fp['+deg[:-3]+"_fp['"+str(col)+"'].astype(str).str.contains('L')]") #TPs

        if not temp.empty: #If we have FP for that participant

            for ind,pat in enumerate(temp['participant_id']): #Loop over all participants with FP in a specific AI col

                try: #To ensure that there are no errors
                    nod_id=temp.iloc[ind,ind_col+1][temp.iloc[ind,ind_col+1].find('L')+1:] #Get id
                    nod_id=nod_id.split(' ')[0] #To get actual id
                    vol=temp.iloc[ind,ind_col+11] #To get the value of the volume
                    
                    exec(deg[:-3]+'_dict'+"['"+str(pat)+"'].append('"+nod_id+"')") #Add that to the dictionary
                    
                    if pd.isnull(vol): #When there is no volume - is nan
                        exec(deg[:-3]+'_dict_vol'+"['"+str(pat)+"'].append('-')") #Same for volume dictionary
                    else:
                        exec(deg[:-3]+'_dict_vol'+"['"+str(pat)+"'].append('"+str(vol)+"')") #Same for volume dictionary
                except:
                    print(traceback.print_exc()) #print error
                    
                    
        if not temp_tp.empty: #If we have TP for that participant

            for ind,pat in enumerate(temp_tp['participant_id']): #Loop over all participants with TP in a specific AI col

                try: #To ensure that there are no errors
                    exec(deg[:-3]+'_dict'+"['"+str(pat)+"'].append('"+"-"+"')") #Add that to the dictionary
                    exec(deg[:-3]+'_dict_vol'+"['"+str(pat)+"'].append('"+"-"+"')") #Same for volume dictionary
                except:
                    print(traceback.print_exc())

high_fp


low_fp


#### Check below IDs again - They have nodules not reviewed by radiologists

In [186]:
#Confirm all participants' nodules counted - These are nodules not reviewed - Should be checked again

print("All participants below have nodules not reviewed yet. We should not have any participants printed below at the end. \n\
For now we do, since we manually deleted those in which AI detected a finding >30mm3 but manually measured <30mm3.")
print("\n")

for deg in BMI_deg: #Loop over emphysema degrees
    
    for pat in eval(deg[:-3]+"_dict"): #For each participant in a given degree
        
        temp=[x for x in eval(deg[:-3]+"_dict['"+pat+"']") if x!='-'] #Get how many FP we have - ignore '-' values
        counted=0 #Initialize an index to 0

        temp_deg=deg
            
        #If the participant is in any of the FP dictionaries increase the count by the values of that dictionary list
        if pat in eval("dict_FP_wrong_"+temp_deg[:-3]): 
            counted=counted+len(eval("dict_FP_wrong_"+temp_deg[:-3]+"['"+pat+"']"))

        if pat in eval("dict_FP_correct_"+temp_deg[:-3]):
            counted=counted+len(eval("dict_FP_correct_"+temp_deg[:-3]+"['"+pat+"']"))
            
        #Check if the counts match - If not then we missed some slices
        try:
            assert len(temp)==counted
        except:
            print('Missing FP slice(s) for participant',pat,'with',temp_deg[:-3]+' BMI')  

All participants below have nodules not reviewed yet. We should not have any participants printed below at the end. 
For now we do, since we manually deleted those in which AI detected a finding >30mm3 but manually measured <30mm3.




If we ignore peribronchial lymph nodes (for now that's not the case), we would expect errors here like in 103302 slice 100.

In [187]:
#Create copies of dictionaries with FP to be filled with the corresponding ids
dict_FP_wrong_high_ids=copy.deepcopy(dict_FP_wrong_high)
dict_FP_correct_high_ids=copy.deepcopy(dict_FP_correct_high)
dict_FP_wrong_low_ids=copy.deepcopy(dict_FP_wrong_low)
dict_FP_correct_low_ids=copy.deepcopy(dict_FP_correct_low)

#Same for dictionaries with lymph nodes only and nodule only
lymph_FP_wrong_high_ids=copy.deepcopy(lymph_FP_wrong_high)
lymph_FP_wrong_low_ids=copy.deepcopy(lymph_FP_wrong_low)
nod_FP_wrong_high_ids=copy.deepcopy(nod_FP_wrong_high)
nod_FP_wrong_low_ids=copy.deepcopy(nod_FP_wrong_low)

In [6]:
#Create new dictionaries 'correct' and 'wrong' with FP indices

pat_manual_check=[] #Initialize a list to be filled with participants in whom nodules should be filled in manually

for deg in BMI_deg: #Loop over BMI degrees
    
    deg=deg[:-3] #Keep name of BMI degree only, without '_fp'
    
    for pat in eval(deg+"['participant_id']"): #Loop over participants in each BMI degree
        
        if isinstance(pat,str): #required conversions to only keep first 6 digits of participant_id
            try:
                pat=int(pat[:6])
            except:
                pass
        else: #If participant_id consists only of numbers, then we assume that this is the 6 digit participant_id
            try:
                pat=int(pat)
            except:
                pass
        
        
    #Initialize a list to keep track of slices and ensure that there is no overlap between them - Unique mapping to ids
        all_slices=[] 

        #There are a lot of key errors since a participant might not exist in the FP dictionaries - only TP and/or FNs
        try: 
            for elem in eval(deg+"_dict['"+str(pat)+"']"): #Loop over all participants findings
                if elem!='-': #If there is a value in that finding

                    #We replaced with +-10 compared to emphysema experiment since before we had the full range of values
                    slices_range=[elemnew for elemnew in eval(deg+"_dict['"+str(pat)+"']") if elemnew!=elem]
                    limit_range=[y for x in slices_range if x!='-' for y in range(int(x)-10,int(x)+10)]

                    if int(elem) in limit_range: #If the slice is in the range of +-10 of any other slice

                        if pat in pat_manual_check: #If that participant has already been added in list to check 
                            pass
                        else: #If not in the list to check, add it
                            pat_manual_check.append(pat)
        except:
            pass

        temp_deg=deg
        
        #For those that there is a unique mapping make the conversion:
        try:
            for AI_ind,elem in enumerate(eval(deg+"_dict['"+str(pat)+"']")): #Loop over participants of given degree
                if elem!='-' and (pat not in pat_manual_check): #For participants with a 1-to-1 mapping

                    for i in range(int(elem)-10,int(elem)+10): #Loop over the range of slices
                        
                        #Then loop over the unique slices in the FP dictionaries
                        try: #Again avoid key errors
                            for ind,slice_FP_wrong in enumerate(eval("dict_FP_wrong_"+str(temp_deg)+"_ids['"+str(pat)+"']")): 
                                    if slice_FP_wrong==i: #If we found that slice replace it with id
                                        exec("dict_FP_wrong_"+str(temp_deg)+"_ids['"+str(pat)+"']"+"["+str(ind)+"]="+str(AI_ind+1))
                        except:
                            pass
                        
                        try:
                            for ind,slice_FP_wrong in enumerate(eval("dict_FP_correct_"+str(temp_deg)+"_ids['"+str(pat)+"']")): 
                                    if slice_FP_wrong==i: #If we found that slice replace it with id
                                        exec("dict_FP_correct_"+str(temp_deg)+"_ids['"+str(pat)+"']"+"["+str(ind)+"]="+str(AI_ind+1))   
                        except:
                            pass
                        
                                            
                        #Again avoid key errors for lymph nodes and nodules
                        try: 
                            for ind,slice_FP_wrong in enumerate(eval("lymph_FP_wrong_"+str(temp_deg)+"_ids['"+str(pat)+"']")): 
                                    if slice_FP_wrong==i: #If we found that slice replace it with id
                                        exec("lymph_FP_wrong_"+str(temp_deg)+"_ids['"+str(pat)+"']"+"["+str(ind)+"]="+str(AI_ind+1))
                        except:
                            pass
                        
                        try:
                            for ind,slice_FP_wrong in enumerate(eval("nod_FP_wrong_"+str(temp_deg)+"_ids['"+str(pat)+"']")): 
                                    if slice_FP_wrong==i: #If we found that slice replace it with id
                                        exec("nod_FP_wrong_"+str(temp_deg)+"_ids['"+str(pat)+"']"+"["+str(ind)+"]="+str(AI_ind+1))   
                        except:
                            pass
                        
        except:
            pass

print("The following participants have to be checked manually to map slices to ids:",pat_manual_check)

Correct the above errors - The rest are ok

High BMI

In [189]:
# dict_FP_wrong_high

In [190]:
# dict_FP_wrong_high_ids

In [191]:
# dict_FP_correct_high

In [192]:
# dict_FP_correct_high_ids

In [193]:
# nod_FP_wrong_high

In [194]:
# nod_FP_wrong_high_ids

In [195]:
# lymph_FP_wrong_high

In [196]:
# lymph_FP_wrong_high_ids

Low BMI

In [197]:
# dict_FP_wrong_low

In [198]:
# dict_FP_wrong_low_ids

In [199]:
# dict_FP_correct_low

In [200]:
# dict_FP_correct_low_ids

In [201]:
# nod_FP_wrong_low

In [202]:
# nod_FP_wrong_low_ids

In [203]:
# lymph_FP_wrong_low

In [204]:
# lymph_FP_wrong_low_ids

Manually add participants for whom a unique mapping wasn't possible - Atypical lymph nodes considered as nodules below

We consider atypical PFNs as lymph nodes here! - In the main analysis below (tables) there is a version in which we exclude them.

In [205]:
#Low
# dict_FP_wrong_low_ids['....']=[4]
# dict_FP_correct_low_ids['....']=[5,1,3,2]
# nod_FP_wrong_low_ids['...']=[4]

# ...

Check if the FP in the dictionaries are as expected until now

In [206]:
for pat in dict_FP_correct_low_ids: #Loop over participants in low BMI FP_correct
    for id in dict_FP_correct_low_ids[pat]:
        try:
            if id in dict_FP_wrong_low_ids[pat]: #If this ID is also in FP_wrong
                print("Low BMI and FP_correct",pat,id," should be checked manually") #Should fix that manually
        except: #might not exist in above dict
            pass

for pat in dict_FP_wrong_low_ids: #Loop over participants in low BMI FP_wrong
    for id in dict_FP_wrong_low_ids[pat]:
        try:
            if id in dict_FP_correct_low_ids[pat]: #If this ID is also in FP_correct
                print("Low BMI and FP_wrong",pat,id,"should be checked manually") #Should fix that manually
        except: #might not exist in above dict
            pass

print("\n")

for pat in dict_FP_correct_high_ids: #Loop over participants in high BMI FP_correct
    for id in dict_FP_correct_high_ids[pat]:
        try:
            if id in dict_FP_wrong_high_ids[pat]: #If this ID is also in FP_wrong
                print("High BMI FP_correct",pat,id,"should be checked manually") #Should fix that manually
        except: #might not exist in above dict
            pass

for pat in dict_FP_wrong_high_ids: #Loop over participants in high BMI FP_wrong
    for id in dict_FP_wrong_high_ids[pat]: 
        try:
            if id in dict_FP_correct_high_ids[pat]: #If this ID is also in FP_correct
                print("High BMI FP_wrong",pat,id,"should be checked manually") #Should fix that manually
        except: #might not exist in above dict
            pass





Manually correct above errors

In [207]:
# #Correct above error
# dict_FP_wrong_low_ids['137966']=[1]
# lymph_FP_wrong_low_ids['137966']=[1]

In [208]:
#Delete participants with on cases that mapping is not possible
for deg in BMI_deg: #Loop over BMI degree
    print("Below are for ",deg[:-3],"BMI")
    
    deg=deg[:-3] #Keep name of BMI degree only, without '_fp'

    temp_deg=deg
    
    for key,values in eval("dict_FP_wrong_"+str(temp_deg)+"_ids.items()"): #Loop over participants and their id
        for ind,val in enumerate(values):

            try: #Since we will loop many times in the 'emph' and so, we won't be able to evaluate '-' value as integer
                if int(val)>10: #For cases with errors
                    exec("dict_FP_wrong_"+str(temp_deg)+"_ids['"+key+"']["+str(ind)+"]="+"'-'")
            except:
                print('dict_FP_wrong_'+str(temp_deg)+"_ids['"+str(key)+"']"+' not deleted')
                pass
                
    for key,values in eval("dict_FP_correct_"+str(temp_deg)+"_ids.items()"): 
        for ind,val in enumerate(values):
            try:
                if int(val)>10:
                    exec("dict_FP_correct_"+str(temp_deg)+"_ids['"+key+"']["+str(ind)+"]="+"'-'")
            except:
                print('dict_FP_correct_'+str(temp_deg)+"_ids['"+str(key)+"']"+' not deleted')
                pass       


    #Delete participants with only '-' values
    del_keys=[]
    for key,values in eval("dict_FP_wrong_"+str(temp_deg)+"_ids.items()"): 
        if np.unique(values)[0]=='-': #If a participant has '-' which denotes an error
            del_keys.append(key)
            print('For dict_FP_wrong we have',len(values),'"-" values for',key)
            
    for key in del_keys:
        try:
            exec("del dict_FP_wrong_"+str(temp_deg)+"_ids['"+key+"']") #Delete those participants and its values from dict
            print('dict_FP_wrong_'+str(temp_deg)+"_ids['"+str(key)+"']"+'deleted')
        except:
            print('dict_FP_wrong_'+str(temp_deg)+"_ids['"+str(key)+"']"+' not deleted')
            pass
        
    #Similar for dict_FP_correct    
    del_keys=[]
    for key,values in eval("dict_FP_correct_"+str(temp_deg)+"_ids.items()"): 
        if np.unique(values)[0]=='-':
            del_keys.append(key)
            print('For dict_FP_correct we have',len(values),'"-" values for',key)
       
    for key in del_keys:
        try:
            exec("del dict_FP_correct_"+str(temp_deg)+"_ids['"+key+"']") 
            print('dict_FP_correct_'+str(temp_deg)+"_ids['"+str(key)+"']"+'deleted')
        except:
            print('dict_FP_correct_'+str(temp_deg)+"_ids['"+str(key)+"']"+' not deleted')
            pass
        
        
        
    #Similar for lymph nodes only and nod only dictionaries
    for key,values in eval("lymph_FP_wrong_"+str(temp_deg)+"_ids.items()"): 
        for ind,val in enumerate(values):

            try: #Since we will loop many times in the 'emph' and so, we won't be able to evaluate '-' value as integer
                if int(val)>10:
                    exec("lymph_FP_wrong_"+str(temp_deg)+"_ids['"+key+"']["+str(ind)+"]="+"'-'")
            except:
                print("lymph_FP_wrong_"+str(temp_deg)+"_ids["+key+"] not deleted")
                pass
                
    for key,values in eval("nod_FP_wrong_"+str(temp_deg)+"_ids.items()"): 
        for ind,val in enumerate(values):
            try:
                if int(val)>10:
                    exec("nod_FP_wrong_"+str(temp_deg)+"_ids['"+key+"']["+str(ind)+"]="+"'-'")
            except:
                print("nod_FP_wrong_"+str(temp_deg)+"_ids["+key+"] not deleted")
                pass
            
            
    #Delete participants with only '-' values
    del_keys=[]
    for key,values in eval("lymph_FP_wrong_"+str(temp_deg)+"_ids.items()"): 
        if np.unique(values)[0]=='-':
            del_keys.append(key)            
       
    for key in del_keys:
        try:
            exec("del lymph_FP_wrong_"+str(temp_deg)+"_ids['"+key+"']") 
            print('lymph_FP_wrong_'+str(temp_deg)+"_ids['"+str(key)+"']"+'deleted' )
        except:
            print('lymph_FP_wrong_'+str(temp_deg)+"_ids['"+str(key)+"']"+'not deleted')
            pass
        
    del_keys=[]
    for key,values in eval("nod_FP_wrong_"+str(temp_deg)+"_ids.items()"): 
        if np.unique(values)[0]=='-':
            del_keys.append(key)
       
    for key in del_keys:
        try:
            exec("del nod_FP_wrong_"+str(temp_deg)+"_ids['"+key+"']") 
            print('nod_FP_wrong_'+str(temp_deg)+"_ids['"+str(key)+"']"+'deleted')
        except:
            print('nod_FP_wrong_'+str(temp_deg)+"_ids['"+str(key)+"']"+' not deleted')
            pass

    print("\n")

Below are for  high BMI
For dict_FP_correct we have 1 "-" values for 686142
dict_FP_correct_high_ids['686142']deleted


Below are for  low BMI




There might be errors above that should be checked and deleted from nodule list (initially there was 119215)

In [209]:
dict_FP_correct_high['686142']=[290]
dict_FP_correct_high_ids['686142']=[6]

In [1]:
# pat_manual_check #These are the participants that should be checked manually

In [211]:
#Checks again - Copy pasted from above
for pat in dict_FP_correct_low_ids: #Loop over participants in low BMI FP_correct
    for id in dict_FP_correct_low_ids[pat]:
        try:
            if id in dict_FP_wrong_low_ids[pat]: #If this ID is also in FP_wrong
                print("Low BMI and FP_correct",pat,'with ID',id,"should be checked manually") #Should fix that manually
        except: #might not exist in above dict
            pass

for pat in dict_FP_wrong_low_ids: #Loop over participants in low BMI FP_wrong
    for id in dict_FP_wrong_low_ids[pat]:
        try:
            if id in dict_FP_correct_low_ids[pat]: #If this ID is also in FP_correct
                print("Low BMI and FP_wrong",pat,'with ID',id,"should be checked manually") #Should fix that manually
        except: #might not exist in above dict
            pass

print("\n")

for pat in dict_FP_correct_high_ids: #Loop over participants in high BMI FP_correct
    for id in dict_FP_correct_high_ids[pat]:
        try:
            if id in dict_FP_wrong_high_ids[pat]: #If this ID is also in FP_wrong
                print("High BMI FP_correct",pat,'with ID',id,"should be checked manually") #Should fix that manually
        except: #might not exist in above dict
            pass

for pat in dict_FP_wrong_high_ids: #Loop over participants in high BMI FP_wrong
    for id in dict_FP_wrong_high_ids[pat]: 
        try:
            if id in dict_FP_correct_high_ids[pat]: #If this ID is also in FP_correct
                print("High BMI FP_wrong",pat,'with ID',id,"should be checked manually") #Should fix that manually
        except: #might not exist in above dict
            pass





Further checks based on number of nodules, lymph nodes, and non-nodules (originally FPs based on AI initial reading)

In [212]:
print("Low BMI")
all=0
for pat in dict_FP_correct_low:
    all=all+len(dict_FP_correct_low[pat])
print('Non-nodules:',all)

FP_lymph_wrong=0
for pat in lymph_FP_wrong_low:
    FP_lymph_wrong=FP_lymph_wrong+len(lymph_FP_wrong_low[pat])
print("Lymph nodes:",FP_lymph_wrong) 

FP_nod_wrong=0
for pat in nod_FP_wrong_low:
    FP_nod_wrong=FP_nod_wrong+len(nod_FP_wrong_low[pat])
print("Nodules",FP_nod_wrong)

for pat in dict_FP_wrong_low:
    all=all+len(dict_FP_wrong_low[pat])
print("All findings:",all) 

Low BMI
Non-nodules: 96
Lymph nodes: 5
Nodules 17
All findings: 118


In [213]:
print("High BMI")
all=0
for pat in dict_FP_correct_high:
    all=all+len(dict_FP_correct_high[pat])
print('Non-nodules:',all)

FP_lymph_wrong=0
for pat in lymph_FP_wrong_high:
    FP_lymph_wrong=FP_lymph_wrong+len(lymph_FP_wrong_high[pat])
print("Lymph nodes:",FP_lymph_wrong) 

FP_nod_wrong=0
for pat in nod_FP_wrong_high:
    FP_nod_wrong=FP_nod_wrong+len(nod_FP_wrong_high[pat])
print("Nodules",FP_nod_wrong)

for pat in dict_FP_wrong_high:
    all=all+len(dict_FP_wrong_high[pat])
print("All findings:",all) 

High BMI
Non-nodules: 53
Lymph nodes: 9
Nodules 22
All findings: 84


In [214]:
FP_wrong_low=np.sum([len(x) for x in dict_FP_wrong_low.values()])
FP_wrong_high=np.sum([len(x) for x in dict_FP_wrong_high.values()])
print("Total FP wrong for low BMI are:",FP_wrong_low)
print("Total FP wrong for high BMI are:",FP_wrong_high)

FP_correct_low=np.sum([len(x) for x in dict_FP_correct_low.values()])
FP_correct_high=np.sum([len(x) for x in dict_FP_correct_high.values()])
print("Total FP correct for low BMI are:",FP_correct_low)
print("Total FP correct for high BMI are:",FP_correct_high)

Total FP wrong for low BMI are: 22
Total FP wrong for high BMI are: 31
Total FP correct for low BMI are: 96
Total FP correct for high BMI are: 53


Manually add participants for whom a unique mapping wasn't possible

In [215]:
# dict_FP_correct_high

In [216]:
# dict_FP_wrong_high

In [217]:
# dict_FP_correct_low

In [218]:
# dict_FP_wrong_low

In [220]:
#Checks again - Copy pasted from above and added lymph nodes and nodules - Should not print anything now
for pat in dict_FP_correct_low_ids: #Loop over participants in low BMI FP_correct
    for id in dict_FP_correct_low_ids[pat]:
        try:
            if id in dict_FP_wrong_low_ids[pat]: #If this ID is also in FP_wrong
                print("Low BMI and FP_correct",pat,'with ID',id,"should be checked manually") #Should fix that manually
        except: #might not exist in above dict
            pass

for pat in dict_FP_wrong_low_ids: #Loop over participants in low BMI FP_wrong
    for id in dict_FP_wrong_low_ids[pat]:
        try:
            if id in dict_FP_correct_low_ids[pat]: #If this ID is also in FP_correct
                print("Low BMI and FP_wrong",pat,'with ID',id,"should be checked manually") #Should fix that manually
        except: #might not exist in above dict
            pass

print("\n")

for pat in dict_FP_correct_high_ids: #Loop over participants in high BMI FP_correct
    for id in dict_FP_correct_high_ids[pat]:
        try:
            if id in dict_FP_wrong_high_ids[pat]: #If this ID is also in FP_wrong
                print("High BMI FP_correct",pat,'with ID',id,"should be checked manually") #Should fix that manually
        except: #might not exist in above dict
            pass

for pat in dict_FP_wrong_high_ids: #Loop over participants in high BMI FP_wrong
    for id in dict_FP_wrong_high_ids[pat]: 
        try:
            if id in dict_FP_correct_high_ids[pat]: #If this ID is also in FP_correct
                print("High BMI FP_wrong",pat,'with ID',id,"should be checked manually") #Should fix that manually
        except: #might not exist in above dict
            pass


#For lymph nodes and nodules
for pat in lymph_FP_wrong_low_ids: #Loop over participants in low BMI FP_wrong
    for id in lymph_FP_wrong_low_ids[pat]:
        try:
            if id in nod_FP_wrong_low_ids[pat]: #If this ID is also in nod_FP_wrong
                print("Low BMI lymph_FP_wrong",pat,'with ID',id,"should be checked manually")
        except: #might not exist in above dict
            pass

for pat in nod_FP_wrong_low_ids: #Loop over participants in low BMI nod_FP_wrong
    for id in nod_FP_wrong_low_ids[pat]:
        try:
            if id in lymph_FP_wrong_low_ids[pat]: #If this ID is also in lymph_FP_wrong
                print("Low BMI nod_FP_wrong",pat,'with ID',id,"should be checked manually")
        except: #might not exist in above dict
            pass





In [221]:
all=0
for pat in dict_FP_correct_low:
    all=all+len(dict_FP_correct_low[pat])
print('Non-nodules:',all)

FP_lymph_wrong=0
for pat in lymph_FP_wrong_low:
    FP_lymph_wrong=FP_lymph_wrong+len(lymph_FP_wrong_low[pat])
print("Lymph nodes:",FP_lymph_wrong) 

FP_nod_wrong=0
for pat in nod_FP_wrong_low:
    FP_nod_wrong=FP_nod_wrong+len(nod_FP_wrong_low[pat])
print("Nodules",FP_nod_wrong)

for pat in dict_FP_wrong_low:
    all=all+len(dict_FP_wrong_low[pat])
print("All findings:",all) 

Non-nodules: 96
Lymph nodes: 5
Nodules 17
All findings: 118


In [222]:
FP_wrong_low=np.sum([len(x) for x in dict_FP_wrong_low.values()])
FP_wrong_high=np.sum([len(x) for x in dict_FP_wrong_high.values()])
print("Total FP wrong for low BMI are:",FP_wrong_low)
print("Total FP wrong for high BMI are:",FP_wrong_high)

FP_correct_low=np.sum([len(x) for x in dict_FP_correct_low.values()])
FP_correct_high=np.sum([len(x) for x in dict_FP_correct_high.values()])
print("Total FP correct for low BMI are:",FP_correct_low)
print("Total FP correct for high BMI are:",FP_correct_high)

Total FP wrong for low BMI are: 22
Total FP wrong for high BMI are: 31
Total FP correct for low BMI are: 96
Total FP correct for high BMI are: 53


In [2]:
#Low BMI - FPs based on initial AI reading
print("Atypical PFNs from AI",np.sum([len(atyp_FP_low[x]) for x in atyp_FP_low]),atyp_FP_low) #9
print("PFNs from AI", np.sum([len(per_FP_low[x]) for x in per_FP_low]), per_FP_low) #2
print("Calcified nodules from AI",np.sum([len(calcif_FP_low[x]) for x in calcif_FP_low]),calcif_FP_low) #1
print("Subsolid and ground-glass nodules from AI",np.sum([len(sub_ground_FP_low[x]) for x in sub_ground_FP_low]),sub_ground_FP_low) #4
print("Cancer from AI",np.sum([len(cancer_FP_low[x]) for x in cancer_FP_low]),cancer_FP_low) #1
print("Pleural nodules from AI",np.sum([len(pleural_FP_low[x]) for x in pleural_FP_low]),pleural_FP_low) #0
print("Other nodules from AI",np.sum([len(other_nodules_FP_low[x]) for x in other_nodules_FP_low]),other_nodules_FP_low) #2
print("Other non-nodules (FPs) from AI",np.sum([len(other_nonodules_FP_low[x]) for x in other_nonodules_FP_low]),other_nonodules_FP_low) #56
print("Fibrosis/scars (FPs) from AI",np.sum([len(fibrosis_FP_low[x]) for x in fibrosis_FP_low]),fibrosis_FP_low) #40
print("Bronchovascular and Peribronchial non-nodules (FPs) from AI",
      np.sum([len(bronchioperi_FP_low[x]) for x in bronchioperi_FP_low]),bronchioperi_FP_low) #3


In [3]:
#High BMI - FPs based on initial AI reading
print("Atypical PFNs from AI",np.sum([len(atyp_FP_high[x]) for x in atyp_FP_high]),atyp_FP_high) #17
print("PFNs from AI", np.sum([len(per_FP_high[x]) for x in per_FP_high]), per_FP_high) #8
print("Calcified nodules from AI",np.sum([len(calcif_FP_high[x]) for x in calcif_FP_high]),calcif_FP_high) #0
print("Subsolid and ground-glass nodules from AI",np.sum([len(sub_ground_FP_high[x]) for x in sub_ground_FP_high]),sub_ground_FP_high) #2
print("Cancer from AI",np.sum([len(cancer_FP_high[x]) for x in cancer_FP_high]),cancer_FP_high) #0
print("Pleural nodules from AI",np.sum([len(pleural_FP_high[x]) for x in pleural_FP_high]),pleural_FP_high) #0
print("Other nodules from AI",np.sum([len(other_nodules_FP_high[x]) for x in other_nodules_FP_high]),other_nodules_FP_high) #3
print("Other non-nodules (FPs) from AI",np.sum([len(other_nonodules_FP_high[x]) for x in other_nonodules_FP_high]),other_nonodules_FP_high) #41
print("Fibrosis/scars (FPs) from AI",np.sum([len(fibrosis_FP_high[x]) for x in fibrosis_FP_high]),fibrosis_FP_high) #12
print("Bronchovascular and Peribronchial non-nodules (FPs) from AI", 
        np.sum([len(bronchioperi_FP_high[x]) for x in bronchioperi_FP_high]),bronchioperi_FP_high) #1


In [None]:
#Lists of dictionaries with participant and nodule ids that belong to a given category
all_categories=[atyp_FN_high,per_FN_high,pleural_FN_high,calcif_FN_high,sub_ground_FN_high,
                cancer_FN_high,other_nodules_FN_high,other_nonodules_FN_high,fibrosis_FN_high,
                bronchioperi_FN_high,atyp_FP_high,per_FP_high,pleural_FP_high,calcif_FP_high,
                sub_ground_FP_high,cancer_FP_high,other_nodules_FP_high,other_nonodules_FP_high,
                fibrosis_FP_high,bronchioperi_FP_high,  #Until here high BMI
                atyp_FN_low,per_FN_low,pleural_FN_low,calcif_FN_low,sub_ground_FN_low,cancer_FN_low,
                other_nodules_FN_low,other_nonodules_FN_low,fibrosis_FN_low,bronchioperi_FN_low,atyp_FP_low,
                per_FP_low,pleural_FP_low,calcif_FP_low,sub_ground_FP_low,cancer_FP_low,
                other_nodules_FP_low,other_nonodules_FP_low,fibrosis_FP_low,bronchioperi_FP_low] 

In [None]:
#Same as above with the names as strings
name_cats=['atyp_FN_high','per_FN_high','pleural_FN_high','calcif_FN_high','sub_ground_FN_high',
            'cancer_FN_high','other_nodules_FN_high','other_nonodules_FN_high','fibrosis_FN_high',
            'bronchioperi_FN_high','atyp_FP_high','per_FP_high','pleural_FP_high','calcif_FP_high',
            'sub_ground_FP_high','cancer_FP_high','other_nodules_FP_high','other_nonodules_FP_high',
            'fibrosis_FP_high','bronchioperi_FP_high',  #Until here high BMI
            'atyp_FN_low','per_FN_low','pleural_FN_low','calcif_FN_low','sub_ground_FN_low','cancer_FN_low',
            'other_nodules_FN_low','other_nonodules_FN_low','fibrosis_FN_low','bronchioperi_FN_low','atyp_FP_low',
            'per_FP_low','pleural_FP_low','calcif_FP_low','sub_ground_FP_low','cancer_FP_low',
            'other_nodules_FP_low','other_nonodules_FP_low','fibrosis_FP_low','bronchioperi_FP_low']

In [None]:
#Save dictionaries to be used from 'patient_selection_emphysema_experiment.ipynb' file to match slices with ids

with open('dict_FN_wrong_low.pickle','wb') as f:
    pickle.dump(dict_FN_wrong_low,f)

with open('dict_FN_correct_low.pickle','wb') as f:
    pickle.dump(dict_FN_correct_low,f)    

with open('dict_FN_wrong_high.pickle','wb') as f:
    pickle.dump(dict_FN_wrong_high,f)

with open('dict_FN_correct_high.pickle','wb') as f:
    pickle.dump(dict_FN_correct_high,f) 
    
#Same for lymph nodes only and nodules only dictionaries
with open('lymph_FN_correct_low.pickle','wb') as f:
    pickle.dump(lymph_FN_correct_low,f)    

with open('lymph_FN_correct_high.pickle','wb') as f:
    pickle.dump(lymph_FN_correct_high,f) 
    
with open('nod_FN_correct_low.pickle','wb') as f:
    pickle.dump(nod_FN_correct_low,f)    

with open('nod_FN_correct_high.pickle','wb') as f:
    pickle.dump(nod_FN_correct_high,f) 

### Get volume subgroups for nodules/non-nodules for each of high/low BMI

##### AI found, reader missed

In [None]:
#Get numbers for nodules (+lymph nodes) vs no nodules
#Nodules can also be found be just adding nodules only + lymph nodes only from below

ai_nonods_high_30_100=0
ai_nonods_high_100_300=0
ai_nonods_high_300=0

ai_nonods_low_30_100=0
ai_nonods_low_100_300=0
ai_nonods_low_300=0

#Similarly get numbers for nodules only and for lymph nodes only
ai_only_nods_high_30_100=0
ai_only_nods_high_100_300=0
ai_only_nods_high_300=0

ai_lymph_high_30_100=0
ai_lymph_high_100_300=0
ai_lymph_high_300=0

ai_only_nods_low_30_100=0
ai_only_nods_low_100_300=0
ai_only_nods_low_300=0

ai_lymph_low_30_100=0
ai_lymph_low_100_300=0
ai_lymph_low_300=0

In [None]:
#Similarly for volume of nodules (if comparison between groups with Mann-Whitney U test is used below)

#Get detailed list of volume of nodules (+lymph nodes) vs no nodules
#Nodules can also be found be just adding nodules only + lymph nodes only from below

ai_nonods_high_30_100_vols=[]
ai_nonods_high_100_300_vols=[]
ai_nonods_high_300_vols=[]

ai_nonods_low_30_100_vols=[]
ai_nonods_low_100_300_vols=[]
ai_nonods_low_300_vols=[]

#Similarly get numbers for nodules only and for lymph nodes only
ai_only_nods_high_30_100_vols=[]
ai_only_nods_high_100_300_vols=[]
ai_only_nods_high_300_vols=[]

ai_lymph_high_30_100_vols=[]
ai_lymph_high_100_300_vols=[]
ai_lymph_high_300_vols=[]

ai_only_nods_low_30_100_vols=[]
ai_only_nods_low_100_300_vols=[]
ai_only_nods_low_300_vols=[]

ai_lymph_low_30_100_vols=[]
ai_lymph_low_100_300_vols=[]
ai_lymph_low_300_vols=[]

Get numbers of AI detected nodules (FPs based on initial reading) for lymph nodes only, nodules only, and non-nodule categories in low/high BMI

In [None]:
#For lymph node subgroup in high/low BMI group from discrepancies

for deg in ['high','low']:
    total=0 #count total number

    for pat in eval('lymph_FP_wrong_'+deg+'_ids'): #loop over participants

        for nod_id in eval('lymph_FP_wrong_'+deg+'_ids[pat]'): #Loop over nodule ids

            if deg=='high':
                vol=float(eval(high_dict_vol[pat][nod_id-1])) #Get volume of that nodule id
            else: #For BMI groups volume will be taken from the corresponding degree of that participant
                vol=float(eval(low_dict_vol[pat][nod_id-1])) #Get volume of that nodule id

            #Increase the number of findings of a specific volume subgroup depending on volume of finding - Add volume to the corresponding variable
            if vol>=30 and vol<=100:
                exec('ai_lymph_'+deg+'_30_100=ai_lymph_'+deg+'_30_100+1')
                exec('ai_lymph_'+deg+'_30_100_vols.append(vol)')
                total=total+1
            elif vol>100 and vol<=300:
                exec('ai_lymph_'+deg+'_100_300=ai_lymph_'+deg+'_100_300+1')
                exec('ai_lymph_'+deg+'_100_300_vols.append(vol)')
                total=total+1
            elif vol>300:
                exec('ai_lymph_'+deg+'_300=ai_lymph_'+deg+'_300+1') 
                exec('ai_lymph_'+deg+'_300_vols.append(vol)')
                total=total+1
            else:
                print('For participant {} volume is smaller than 30mm3',pat)

    print('Total lymph nodes in {} BMI group is {}'.format(deg,total))

Total lymph nodes in high BMI group is 9
Total lymph nodes in low BMI group is 5


In [None]:
#Similarly for nodule only group high/low BMI from discrepancies

for deg in ['high','low']:

    total=0

    for pat in eval('nod_FP_wrong_'+deg+'_ids'):

        for nod_id in eval('nod_FP_wrong_'+deg+'_ids[pat]'):

            if deg=='high':
                vol=float(high_dict_vol[pat][nod_id-1])
            else:
                vol=float(low_dict_vol[pat][nod_id-1])

            if vol>=30 and vol<=100:
                exec('ai_only_nods_'+deg+'_30_100=ai_only_nods_'+deg+'_30_100+1')
                exec('ai_only_nods_'+deg+'_30_100_vols.append(vol)')
                total=total+1
            elif vol>100 and vol<=300:
                exec('ai_only_nods_'+deg+'_100_300=ai_only_nods_'+deg+'_100_300+1')
                exec('ai_only_nods_'+deg+'_100_300_vols.append(vol)')
                total=total+1
            elif vol>300:
                exec('ai_only_nods_'+deg+'_300=ai_only_nods_'+deg+'_300+1') 
                exec('ai_only_nods_'+deg+'_300_vols.append(vol)')
                total=total+1
            else:
                print('For participant {} volume is smaller than 30mm3',pat)
                
    print('Total nodules in {} BMI group is {}'.format(deg,total))

Total nodules in high BMI group is 22
Total nodules in low BMI group is 17


In [None]:
#Similarly for non-nodule low/high BMI groups from discrepancies

for deg in ['low','high']:
    total=0

    for pat in eval('dict_FP_correct_'+deg+'_ids'):

        for nod_id in eval('dict_FP_correct_'+deg+'_ids[pat]'):

            if deg=='high':
                vol=float(high_dict_vol[pat][nod_id-1])
            else:
                vol=float(low_dict_vol[pat][nod_id-1])

            if vol>=30 and vol<=100:
                exec('ai_nonods_'+deg+'_30_100=ai_nonods_'+deg+'_30_100+1')
                exec('ai_nonods_'+deg+'_30_100_vols.append(vol)')
                total=total+1
            elif vol>100 and vol<=300:
                exec('ai_nonods_'+deg+'_100_300=ai_nonods_'+deg+'_100_300+1')
                exec('ai_nonods_'+deg+'_100_300_vols.append(vol)')
                total=total+1
            elif vol>300:
                exec('ai_nonods_'+deg+'_300=ai_nonods_'+deg+'_300+1') 
                exec('ai_nonods_'+deg+'_300_vols.append(vol)')
                total=total+1   
            else:
                print('For participant {} volume is smaller than 30mm3',pat)

    print('Total non-nodules in {} BMI group is {}'.format(deg,total))

Total non-nodules in low BMI group is 96
Total non-nodules in high BMI group is 53


In [None]:
FP_wrong_low=np.sum([len(x) for x in dict_FP_wrong_low.values()])
FP_wrong_high=np.sum([len(x) for x in dict_FP_wrong_high.values()])
print("Total FP wrong for low BMI are:",FP_wrong_low)
print("Total FP wrong for high BMI are:",FP_wrong_high)

FP_correct_low=np.sum([len(x) for x in dict_FP_correct_low.values()])
FP_correct_high=np.sum([len(x) for x in dict_FP_correct_high.values()])
print("Total FP correct for low BMI are:",FP_correct_low)
print("Total FP correct for high BMI are:",FP_correct_high)

Total FP wrong for low BMI are: 22
Total FP wrong for high BMI are: 31
Total FP correct for low BMI are: 96
Total FP correct for high BMI are: 53


Statistics for FPs

In [None]:
low=low.dropna(axis=1, how='all') 
high=high.dropna(axis=1, how='all')

In [None]:
#Select rows with participant ids and create new cols with the total number of FPs and FNs for each participant
low_all=low[~low['participant_id'].isnull()]
low_all['fp_all']=0
low_all['fn_all']=0
low_all['fp_30_100']=0
low_all['fp_100_300']=0
low_all['fp_300']=0
low_all['fn_30_100']=0
low_all['fn_100_300']=0
low_all['fn_300']=0

high_all=high[~high['participant_id'].isnull()]
high_all['fp_all']=0
high_all['fn_all']=0
high_all['fp_30_100']=0
high_all['fp_100_300']=0
high_all['fp_300']=0
high_all['fn_30_100']=0
high_all['fn_100_300']=0
high_all['fn_300']=0

In [None]:
#Keep only valid IDs
low_all['participant_id']=[int(str(pat)[:6]) for pat in low_all['participant_id']]
high_all['participant_id']=[int(str(pat)[:6]) for pat in high_all['participant_id']]

Reader's FP (based on consensus)

In [None]:
#Loop through all participants and add the number of FPs for AI for each participant (based on consensus review)
for ind,pat in enumerate(high['participant_id']):
    try: #avoid nan
        if int(str(pat)[:6]) in (list(other_nonodules_FP_high.keys())):            
            high_all.loc[ind,'fp_all']=high_all.loc[ind,'fp_all']+len(other_nonodules_FP_high[int(str(pat)[:6])])
        if int(str(pat)[:6]) in list(fibrosis_FP_high.keys()):
            high_all.loc[ind,'fp_all']=high_all.loc[ind,'fp_all']+len(fibrosis_FP_high[int(str(pat)[:6])])
    except:
        try:
            if int(str(pat)[:6]) in list(fibrosis_FP_high.keys()):
                high_all.loc[ind,'fp_all']=high_all.loc[ind,'fp_all']+len(fibrosis_FP_high[int(str(pat)[:6])])
        except:
            pass

for ind,pat in enumerate(low['participant_id']):
    try: #avoid nan
        if int(str(pat)[:6]) in (list(other_nonodules_FP_low.keys())):
            low_all.loc[ind,'fp_all']=low_all.loc[ind,'fp_all']+len(other_nonodules_FP_low[int(str(pat)[:6])])
        if int(str(pat)[:6]) in list(fibrosis_FP_low.keys()):
            low_all.loc[ind,'fp_all']=low_all.loc[ind,'fp_all']+len(fibrosis_FP_low[int(str(pat)[:6])])
    except:
        try:
            if int(str(pat)[:6]) in list(fibrosis_FP_low.keys()):
                low_all.loc[ind,'fp_all']=low_all.loc[ind,'fp_all']+len(fibrosis_FP_low[int(str(pat)[:6])])
        except:
            pass

In [None]:
#Loop through all participants and add the number of FPs for the reader for each participant (based on consensus review)
for ind,pat in enumerate(high['participant_id']):
    try: #avoid nan
        if int(str(pat)[:6]) in (list(other_nonodules_FN_high.keys())):            
            high_all.loc[ind,'fn_all']=high_all.loc[ind,'fn_all']+len(other_nonodules_FN_high[int(str(pat)[:6])])
        if int(str(pat)[:6]) in list(fibrosis_FN_high.keys()):
            high_all.loc[ind,'fn_all']=high_all.loc[ind,'fn_all']+len(fibrosis_FN_high[int(str(pat)[:6])])
    except:
        pass

for ind,pat in enumerate(low['participant_id']):
    try: #avoid nan
        if int(str(pat)[:6]) in (list(other_nonodules_FN_low.keys())):            
            low_all.loc[ind,'fn_all']=low_all.loc[ind,'fn_all']+len(other_nonodules_FN_low[int(str(pat)[:6])])
        if int(str(pat)[:6]) in list(fibrosis_FN_low.keys()):
            low_all.loc[ind,'fn_all']=low_all.loc[ind,'fn_all']+len(fibrosis_FN_low[int(str(pat)[:6])])
    except:
        pass

In [None]:
print("Num of FPs for AI in high BMI cases:",np.sum(high_all['fp_all']))
print("Num of FPs for AI in low BMI cases:",np.sum(low_all['fp_all']))

print("Num of FPs for reader in high BMI cases:",np.sum(high_all['fn_all']))
print("Num of FPs for reader in low BMI cases:",np.sum(low_all['fn_all']))

Num of FPs for AI in high BMI cases: 53
Num of FPs for AI in low BMI cases: 96
Num of FPs for reader in high BMI cases: 9
Num of FPs for reader in low BMI cases: 28


In [None]:
# #Not used for now
# print('Paired T-test')
# print("High BMI Reader vs AI:",stats.ttest_rel(high_all['fn_all'], high_all['fp_all']).pvalue)
# print("Low BMI Reader vs AI:",stats.ttest_rel(low_all['fn_all'], low_all['fp_all']).pvalue) 
# print('\n')

# print('T-test of independent samples')
# print("High BMI Reader vs AI:",stats.ttest_ind(high_all['fn_all'], high_all['fp_all']).pvalue)
# print("Low BMI Reader vs AI:",stats.ttest_ind(low_all['fn_all'], low_all['fp_all']).pvalue) 
# print('\n')

# print("Below only possible is independent samples t-test. Paired t-test does not make sense here.")
# print("Low BMI vs High BMI for reader",stats.ttest_ind(high_all['fn_all'], low_all['fn_all']).pvalue)
# print("Low BMI vs High BMI for AI",stats.ttest_ind(high_all['fp_all'], low_all['fp_all']).pvalue)

In [None]:
# # conduct the Wilcoxon-Signed Rank Test
# print("High BMI Reader vs AI:",stats.wilcoxon(high_all['fn_all'], high_all['fp_all']).pvalue)
# print("Low BMI Reader vs AI:",stats.wilcoxon(low_all['fn_all'], low_all['fp_all']).pvalue) 
# print('\n')

#For unequal sample size Mann-Whitney U test is used
print("High BMI vs Low BMI for reader",stats.mannwhitneyu(high_all['fn_all'], low_all['fn_all']).pvalue)
print("High BMI vs Low BMI for AI",stats.mannwhitneyu(high_all['fp_all'], low_all['fp_all']).pvalue)

print("Reader vs AI for low BMI",stats.mannwhitneyu(low_all['fn_all'], low_all['fp_all']).pvalue)
print("Reader vs AI for high BMI",stats.mannwhitneyu(high_all['fn_all'], high_all['fp_all']).pvalue)

High BMI vs Low BMI for reader 0.08938425827413506
High BMI vs Low BMI for AI 0.004844752004185089
Reader vs AI for low BMI 1.2148044683373157e-09
Reader vs AI for high BMI 1.4409608525112892e-07


Statistics for FP for AI volume subgroups

In [None]:
for bmi in ['low','high']:
    print(bmi)

    for pat in eval('dict_FP_correct_'+bmi+'_ids'):

            for nod_id in eval('dict_FP_correct_'+bmi+'_ids[pat]'):

                if bmi=='low':
                    vol=float(low_dict_vol[pat][nod_id-1])
                else:
                    vol=float(high_dict_vol[pat][nod_id-1])

                if vol>=30 and vol<=100:
                    exec("index="+bmi+"_all["+bmi+"_all['participant_id']==int(pat)].index[0]")
                    exec(bmi+"_all.iloc[index,"+bmi+"_all.columns.get_loc('fp_30_100')]="+bmi+"_all.iloc[index,"+bmi+"_all.columns.get_loc('fp_30_100')]+1")

                elif vol>100 and vol<=300:
                    exec("index="+bmi+"_all["+bmi+"_all['participant_id']==int(pat)].index[0]")
                    exec(bmi+"_all.iloc[index,"+bmi+"_all.columns.get_loc('fp_100_300')]="+bmi+"_all.iloc[index,"+bmi+"_all.columns.get_loc('fp_100_300')]+1")
                elif vol>300:
                    exec("index="+bmi+"_all["+bmi+"_all['participant_id']==int(pat)].index[0]")
                    exec(bmi+"_all.iloc[index,"+bmi+"_all.columns.get_loc('fp_300')]="+bmi+"_all.iloc[index,"+bmi+"_all.columns.get_loc('fp_300')]+1")

low
high


##### AI missed, reader found

Before running part below we should execute the other file ('patient_selection_emphysema_experiment.ipynb') to get dictionaries containing information about the ids of FNs. We need REDCap information to extract those 

Up until here there are 8 files generated that will be used by the other notebook

In [4]:
#Run other notebook and continue execution on next cell if it gives error

try: #To ignore error and continue in next cell we need try-except and 'no raise error' flag
    %run ./patient_selection_BMI_experiment.ipynb --no-raise-error
except:
    pass

FP for reader's volume subgroups

In [5]:
# dict_FN_wrong_low

In [6]:
# dict_FN_wrong_low_vols

In [None]:
for bmi in ['low','high']: 
    print(bmi)

    for pat in eval('dict_FN_wrong_'+bmi+'_ids'):

            for nod_id,_ in enumerate(eval('dict_FN_wrong_'+bmi+'_ids[pat]')):

                vol=float(eval('dict_FN_wrong_'+bmi+'_vols[pat][nod_id]'))

                if vol>=30 and vol<=100:

                    exec("index="+bmi+"_all["+bmi+"_all['participant_id']==int(pat)].index[0]")
                    exec(bmi+"_all.iloc[index,"+bmi+"_all.columns.get_loc('fn_30_100')]="+bmi+"_all.iloc[index,"+bmi+"_all.columns.get_loc('fn_30_100')]+1")

                elif vol>100 and vol<=300:

                    exec("index="+bmi+"_all["+bmi+"_all['participant_id']==int(pat)].index[0]")
                    exec(bmi+"_all.iloc[index,"+bmi+"_all.columns.get_loc('fn_100_300')]="+bmi+"_all.iloc[index,"+bmi+"_all.columns.get_loc('fn_100_300')]+1")

                elif vol>300:

                    exec("index="+bmi+"_all["+bmi+"_all['participant_id']==int(pat)].index[0]")
                    exec(bmi+"_all.iloc[index,"+bmi+"_all.columns.get_loc('fn_300')]="+bmi+"_all.iloc[index,"+bmi+"_all.columns.get_loc('fn_300')]+1")

low
high


In [None]:
assert list(low_all['fp_all'])==list(low_all['fp_30_100']+low_all['fp_100_300']+low_all['fp_300'])
assert list(low_all['fn_all'])==list(low_all['fn_30_100']+low_all['fn_100_300']+low_all['fn_300'])
assert list(high_all['fp_all'])==list(high_all['fp_30_100']+high_all['fp_100_300']+high_all['fp_300'])
assert list(high_all['fn_all'])==list(high_all['fn_30_100']+high_all['fn_100_300']+high_all['fn_300'])

assert np.sum(low_all['fp_all'])==np.sum(low_all['fp_30_100'])+np.sum(low_all['fp_100_300'])+np.sum(low_all['fp_300'])
assert np.sum(low_all['fn_all'])==np.sum(low_all['fn_30_100'])+np.sum(low_all['fn_100_300'])+np.sum(low_all['fn_300'])
assert np.sum(high_all['fp_all'])==np.sum(high_all['fp_30_100'])+np.sum(high_all['fp_100_300'])+np.sum(high_all['fp_300'])
assert np.sum(high_all['fn_all'])==np.sum(high_all['fn_30_100'])+np.sum(high_all['fn_100_300'])+np.sum(high_all['fn_300'])

print("FP AI low",np.sum(low_all['fp_all']))
print("FP AI high",np.sum(high_all['fp_all']))
print("FN read low",np.sum(low_all['fn_all']))
print("FN read high",np.sum(high_all['fn_all']))

FP AI low 96
FP AI high 53
FN read low 28
FN read high 9


In [None]:
print("Volume subgroup 30-100mm3")
# print('Paired T-test')
# print("High BMI Reader vs AI:",stats.ttest_rel(high_all['fn_30_100'], high_all['fp_30_100']).pvalue)
# print("Low BMI Reader vs AI:",stats.ttest_rel(low_all['fn_30_100'], low_all['fp_30_100']).pvalue) 
# print('\n')

# print('T-test of independent samples')
# print("High BMI Reader vs AI:",stats.ttest_ind(high_all['fn_30_100'], high_all['fp_30_100']).pvalue)
# print("Low BMI Reader vs AI:",stats.ttest_ind(low_all['fn_30_100'], low_all['fp_30_100']).pvalue) 
# print('\n')

# print("Below only possible is independent samples t-test. Paired t-test does not make sense here.")
# print("Low vs High BMI for reader",stats.ttest_ind(high_all['fn_30_100'], low_all['fn_30_100']).pvalue)
# print("Low vs High BMI for AI",stats.ttest_ind(high_all['fp_30_100'], low_all['fp_30_100']).pvalue)
# print('\n')

# print("Below Wilcoxon-Signed Rank Test is used")
# print("High BMI Reader vs AI:",stats.wilcoxon(high_all['fn_30_100'], high_all['fp_30_100']).pvalue)
# print("Low BMI Reader vs AI:",stats.wilcoxon(low_all['fn_30_100'], low_all['fp_30_100']).pvalue)
# print('\n')

print("For unequal sample size Mann-Whitney U test is used")
print("Low vs High BMI for reader",stats.mannwhitneyu(high_all['fn_30_100'], low_all['fn_30_100']).pvalue)
print("Low vs High BMI for AI",stats.mannwhitneyu(high_all['fp_30_100'], low_all['fp_30_100']).pvalue)

print("Low BMI Reader vs AI:",stats.mannwhitneyu(low_all['fn_30_100'], low_all['fp_30_100']).pvalue)
print("High BMI Reader vs AI:",stats.mannwhitneyu(high_all['fn_30_100'], high_all['fp_30_100']).pvalue)

Volume subgroup 30-100mm3
For unequal sample size Mann-Whitney U test is used
Low vs High BMI for reader 0.23676620801434844
Low vs High BMI for AI 0.017800661925651263
Low BMI Reader vs AI: 0.10057216771517977
High BMI Reader vs AI: 0.6221348606202346


In [None]:
print("Volume subgroup 100-300mm3")
# print('Paired T-test')
# print("High BMI Reader vs AI:",stats.ttest_rel(high_all['fn_100_300'], high_all['fp_100_300']).pvalue)
# print("Low BMI Reader vs AI:",stats.ttest_rel(low_all['fn_100_300'], low_all['fp_100_300']).pvalue) 
# print('\n')

# print('T-test of independent samples')
# print("High BMI Reader vs AI:",stats.ttest_ind(high_all['fn_100_300'], high_all['fp_100_300']).pvalue)
# print("Low BMI Reader vs AI:",stats.ttest_ind(low_all['fn_100_300'], low_all['fp_100_300']).pvalue) 
# print('\n')

# print("Below only possible is independent samples t-test. Paired t-test does not make sense here.")
# print("Low BMI vs High BMI for reader",stats.ttest_ind(high_all['fn_100_300'], low_all['fn_100_300']).pvalue)
# print("Low BMI vs High BMI for AI",stats.ttest_ind(high_all['fp_100_300'], low_all['fp_100_300']).pvalue)
# print('\n')

# print("Below Wilcoxon-Signed Rank Test is used")
# print("High BMI Reader vs AI:",stats.wilcoxon(high_all['fn_100_300'], high_all['fp_100_300']).pvalue)
# print("Low BMI Reader vs AI:",stats.wilcoxon(low_all['fn_100_300'], low_all['fp_100_300']).pvalue)
# print('\n')

print("For unequal sample size Mann-Whitney U test is used")
print("Low vs High BMI for reader",stats.mannwhitneyu(high_all['fn_100_300'], low_all['fn_100_300']).pvalue)
print("Low vs High BMI for AI",stats.mannwhitneyu(high_all['fp_100_300'], low_all['fp_100_300']).pvalue)

print("Low BMI Reader vs AI:",stats.mannwhitneyu(low_all['fn_100_300'], low_all['fp_100_300']).pvalue)
print("High BMI Reader vs AI:",stats.mannwhitneyu(high_all['fn_100_300'], high_all['fp_100_300']).pvalue)

Volume subgroup 100-300mm3
For unequal sample size Mann-Whitney U test is used
Low vs High BMI for reader 0.1772312071394676
Low vs High BMI for AI 0.06440207310333836
Low BMI Reader vs AI: 4.8693462975873924e-08
High BMI Reader vs AI: 1.8716855511876229e-06


In [None]:
print("Volume subgroup 300+mm3")
# print('Paired T-test')
# print("High BMI Reader vs AI:",stats.ttest_rel(high_all['fn_300'], high_all['fp_300']).pvalue)
# print("Low BMI Reader vs AI:",stats.ttest_rel(low_all['fn_300'], low_all['fp_300']).pvalue) 
# print('\n')

# print('T-test of independent samples')
# print("High BMI Reader vs AI:",stats.ttest_ind(high_all['fn_300'], high_all['fp_300']).pvalue)
# print("Low BMI Reader vs AI:",stats.ttest_ind(low_all['fn_300'], low_all['fp_300']).pvalue) 
# print('\n')

# print("Below only possible is independent samples t-test. Paired t-test does not make sense here.")
# print("Low BMI vs High BMI for reader",stats.ttest_ind(high_all['fn_300'], low_all['fn_300']).pvalue)
# print("Low BMI vs High BMI for AI",stats.ttest_ind(high_all['fp_300'], low_all['fp_300']).pvalue)
# print('\n')

# print("Below Wilcoxon-Signed Rank Test is used")
# print("High BMI Reader vs AI:",stats.wilcoxon(high_all['fn_300'], high_all['fp_300']).pvalue)
# print("Low BMI Reader vs AI:",stats.wilcoxon(low_all['fn_300'], low_all['fp_300']).pvalue)
# print('\n')

print("For unequal sample size Mann-Whitney U test is used")
print("Low vs High BMI for reader",stats.mannwhitneyu(high_all['fn_300'], low_all['fn_300']).pvalue)
print("Low vs High BMI for AI",stats.mannwhitneyu(high_all['fp_300'], low_all['fp_300']).pvalue)

print("Low BMI Reader vs AI:",stats.mannwhitneyu(low_all['fn_300'], low_all['fp_300']).pvalue)
print("High BMI Reader vs AI:",stats.mannwhitneyu(high_all['fn_300'], high_all['fp_300']).pvalue)

Volume subgroup 300+mm3
For unequal sample size Mann-Whitney U test is used
Low vs High BMI for reader 0.3200679866998154
Low vs High BMI for AI 0.08575824039743388
Low BMI Reader vs AI: 1.8788003562302944e-06
High BMI Reader vs AI: 0.00013830624677904377


Load dictionaries

In [None]:
#Load ids of FNs

with open('dict_FN_wrong_low_ids.pickle', 'rb') as f:
    dict_FN_wrong_low_ids = pickle.load(f)

with open('dict_FN_correct_low_ids.pickle', 'rb') as f:
    dict_FN_correct_low_ids = pickle.load(f)

with open('dict_FN_wrong_high_ids.pickle', 'rb') as f:
    dict_FN_wrong_high_ids = pickle.load(f)
    
with open('dict_FN_correct_high_ids.pickle', 'rb') as f:
    dict_FN_correct_high_ids = pickle.load(f)
    
    
#Same for their vols

with open('dict_FN_wrong_low_vols.pickle', 'rb') as f:
    dict_FN_wrong_low_vols = pickle.load(f)

with open('dict_FN_correct_low_vols.pickle', 'rb') as f:
    dict_FN_correct_low_vols = pickle.load(f)

with open('dict_FN_wrong_high_vols.pickle', 'rb') as f:
    dict_FN_wrong_high_vols = pickle.load(f)
    
with open('dict_FN_correct_high_vols.pickle', 'rb') as f:
    dict_FN_correct_high_vols = pickle.load(f)

In [None]:
#Similarly for lymph nodes and nodules only and of their ids and volumes

with open('lymph_FN_correct_low.pickle','rb') as f:
    lymph_FN_correct_low=pickle.load(f)    

with open('lymph_FN_correct_high.pickle','rb') as f:
    lymph_FN_correct_high=pickle.load(f) 
    
with open('nod_FN_correct_low.pickle','rb') as f:
    nod_FN_correct_low=pickle.load(f)    

with open('nod_FN_correct_high.pickle','rb') as f:
    nod_FN_correct_high=pickle.load(f) 
    
    
with open('lymph_FN_correct_low_ids.pickle','rb') as f:
    lymph_FN_correct_low_ids=pickle.load(f)    

with open('lymph_FN_correct_high_ids.pickle','rb') as f:
    lymph_FN_correct_high_ids=pickle.load(f) 
    
with open('nod_FN_correct_low_ids.pickle','rb') as f:
    nod_FN_correct_low_ids=pickle.load(f)    

with open('nod_FN_correct_high_ids.pickle','rb') as f:
    nod_FN_correct_high_ids=pickle.load(f) 
    
    
with open('lymph_FN_correct_low_vols.pickle','rb') as f:
    lymph_FN_correct_low_vols=pickle.load(f)    

with open('lymph_FN_correct_high_vols.pickle','rb') as f:
    lymph_FN_correct_high_vols=pickle.load(f) 
    
with open('nod_FN_correct_low_vols.pickle','rb') as f:
    nod_FN_correct_low_vols=pickle.load(f)    

with open('nod_FN_correct_high_vols.pickle','rb') as f:
    nod_FN_correct_high_vols=pickle.load(f)     

In [None]:
#Initialize zero values for non-nodules, nodules only, and lymph nodes for each volume subgroup and for each low/high BMI groups

reader_nonods_high_30_100=0
reader_nonods_high_100_300=0
reader_nonods_high_300=0

reader_nonods_low_30_100=0
reader_nonods_low_100_300=0
reader_nonods_low_300=0


reader_only_nods_high_30_100=0
reader_only_nods_high_100_300=0
reader_only_nods_high_300=0

reader_only_nods_low_30_100=0
reader_only_nods_low_100_300=0
reader_only_nods_low_300=0


reader_lymph_high_30_100=0
reader_lymph_high_100_300=0
reader_lymph_high_300=0

reader_lymph_low_30_100=0
reader_lymph_low_100_300=0
reader_lymph_low_300=0

In [None]:
#Similarly keep track of volumes for each of those groups (if Mann-Whitney U test is used below)
reader_nonods_high_30_100_vols=[]
reader_nonods_high_100_300_vols=[]
reader_nonods_high_300_vols=[]
reader_nonods_low_30_100_vols=[]
reader_nonods_low_100_300_vols=[]
reader_nonods_low_300_vols=[]

reader_only_nods_high_30_100_vols=[]
reader_only_nods_high_100_300_vols=[]
reader_only_nods_high_300_vols=[]
reader_only_nods_low_30_100_vols=[]
reader_only_nods_low_100_300_vols=[]
reader_only_nods_low_300_vols=[]

reader_lymph_high_30_100_vols=[]
reader_lymph_high_100_300_vols=[]
reader_lymph_high_300_vols=[]
reader_lymph_low_30_100_vols=[]
reader_lymph_low_100_300_vols=[]
reader_lymph_low_300_vols=[]

Get numbers of reader nodules for lymph nodes only, nodules only, and non-nodule categories in low/high BMI

In [None]:
#Similarly for non-nodule low/high BMI groups for FNs

for deg in ['high','low']:
    total=0
    for pat in eval('dict_FN_wrong_'+deg+'_ids'):
        for ind,nod_id in enumerate(eval('dict_FN_wrong_'+deg+'_ids[pat]')):

            if deg=='high':
                vol=float(dict_FN_wrong_high_vols[pat][ind])
            else:
                vol=float(dict_FN_wrong_low_vols[pat][ind])


            if vol>=30 and vol<=100:
                exec('reader_nonods_'+deg+'_30_100=reader_nonods_'+deg+'_30_100+1')
                exec('reader_nonods_'+deg+'_30_100_vols.append(vol)')
                total+=1

            elif vol>100 and vol<=300:
                exec('reader_nonods_'+deg+'_100_300=reader_nonods_'+deg+'_100_300+1')
                exec('reader_nonods_'+deg+'_100_300_vols.append(vol)')
                total+=1
            elif vol>300:
                exec('reader_nonods_'+deg+'_300=reader_nonods_'+deg+'_300+1') 
                exec('reader_nonods_'+deg+'_300_vols.append(vol)') 
                total+=1
            else:
                print('For participant {} volume is smaller than 30mm3',pat)

    print('Total non-nodules in {} BMI group is {}'.format(deg,total))

Total non-nodules in high BMI group is 9
Total non-nodules in low BMI group is 28


In [None]:
#Similarly for lymph nodes low/high BMI groups for FNs

for deg in ['high','low']:
    total=0
    for pat in eval('lymph_FN_correct_'+deg+'_ids'):
        for ind,nod_id in enumerate(eval('lymph_FN_correct_'+deg+'_ids[pat]')):
            
            if deg=='high':
                vol=float(lymph_FN_correct_high_vols[pat][ind])
            else:
                vol=float(lymph_FN_correct_low_vols[pat][ind])

            if vol>=30 and vol<=100:
                exec('reader_lymph_'+deg+'_30_100=reader_lymph_'+deg+'_30_100+1')
                exec('reader_lymph_'+deg+'_30_100_vols.append(vol)')
                total+=1
            elif vol>100 and vol<=300:
                exec('reader_lymph_'+deg+'_100_300=reader_lymph_'+deg+'_100_300+1')
                exec('reader_lymph_'+deg+'_100_300_vols.append(vol)')
                total+=1
            elif vol>300:
                exec('reader_lymph_'+deg+'_300=reader_lymph_'+deg+'_300+1') 
                exec('reader_lymph_'+deg+'_300_vols.append(vol)')
                total+=1
            else:
                print('For participant {} volume is smaller than 30mm3',pat)

    print('Total lymph nodes in {} BMI group is {}'.format(deg,total))

Total lymph nodes in high BMI group is 20
Total lymph nodes in low BMI group is 23


In [None]:
#Similarly for nodules only low/high BMI groups for FNs
for deg in ['high','low']:
    total=0
    for pat in eval('nod_FN_correct_'+deg+'_ids'):
        for ind,nod_id in enumerate(eval('nod_FN_correct_'+deg+'_ids[pat]')):

            if deg=='high':
                vol=float(nod_FN_correct_high_vols[pat][ind])
            else:
                vol=float(nod_FN_correct_low_vols[pat][ind])

            if vol>=30 and vol<=100:
                exec('reader_only_nods_'+deg+'_30_100=reader_only_nods_'+deg+'_30_100+1')
                exec('reader_only_nods_'+deg+'_30_100_vols.append(vol)')
                total+=1
            elif vol>100 and vol<=300:
                exec('reader_only_nods_'+deg+'_100_300=reader_only_nods_'+deg+'_100_300+1')
                exec('reader_only_nods_'+deg+'_100_300_vols.append(vol)')
                total+=1
            elif vol>300:
                exec('reader_only_nods_'+deg+'_300=reader_only_nods_'+deg+'_300+1') 
                exec('reader_only_nods_'+deg+'_300_vols.append(vol)')
                total+=1
            else:
                print('For participant {} volume is smaller than 30mm3',pat)

    print('Total nodules in {} BMI group is {}'.format(deg,total))

Total nodules in high BMI group is 13
Total nodules in low BMI group is 4


Caution here! 'reader_nods' volume subgroups have both 'only_nods' (nodules+atypical lymph nodes) and 'lymph nodes' (PFNs and bronchovascular)

In [None]:
#Total number of nodules in each of the high/low groups is the sum of the nodules and lymph nodes in those

reader_nods_high_30_100=reader_lymph_high_30_100+reader_only_nods_high_30_100
reader_nods_high_100_300=reader_lymph_high_100_300+reader_only_nods_high_100_300
reader_nods_high_300=reader_lymph_high_300+reader_only_nods_high_300

reader_nods_low_30_100=reader_lymph_low_30_100+reader_only_nods_low_30_100
reader_nods_low_100_300=reader_lymph_low_100_300+reader_only_nods_low_100_300
reader_nods_low_300=reader_lymph_low_300+reader_only_nods_low_300

In [None]:
reader_only_nods_high_30_100_vols

[85.0, 54.0, 38.0, 49.0, 49.0, 46.0, 91.0, 30.0, 37.0, 42.0, 35.0]

In [None]:
reader_only_nods_high_30_100

11

## Create Tables & Statistics

##### Based on the current definition the following equations hold true:
1. AI found nodules, reader missed = FN reader
2. AI found non-nodules, reader missed = FP AI
3. AI missed nodules, reader found = FN AI
4. AI missed non-nodules, reader found = FP reader

#### Low BMI non-nodules

In [None]:
#Below are the non-nodule categories. With FP is denoted a finding that was missed by AI, whereas with FN a finding missed by the reader
#Transform above dictionaries to numbers to be used below
fibrosis_FP_low=sum([len(x) for x in fibrosis_FP_low.values()])
other_nonodules_FP_low=sum([len(x) for x in other_nonodules_FP_low.values()])
fibrosis_FN_low=sum([len(x) for x in fibrosis_FN_low.values()])
other_nonodules_FN_low=sum([len(x) for x in other_nonodules_FN_low.values()])
other_nonodules_FN_lung_low=sum([len(x) for x in other_nonodules_FN_lung_low.values()])
other_nonodules_FN_nolung_low=sum([len(x) for x in other_nonodules_FN_nolung_low.values()])
other_nonodules_FP_lung_low=sum([len(x) for x in other_nonodules_FP_lung_low.values()])
other_nonodules_FP_nolung_low=sum([len(x) for x in other_nonodules_FP_nolung_low.values()])

#Print the above
print('Fibrosis/scar low FP: '+str(fibrosis_FP_low))
print('Other non-nodules low FP: '+str(other_nonodules_FP_low))
print('Other non-nodules low FP (lung): '+str(other_nonodules_FP_lung_low))
print('Other non-nodules low FP (non-lung): '+str(other_nonodules_FP_nolung_low))
print('Fibrosis/scar low FN: '+str(fibrosis_FN_low))
print('Other non-nodules low FN: '+str(other_nonodules_FN_low))
print('Other non-nodules low FN (lung): '+str(other_nonodules_FN_lung_low))
print('Other non-nodules low FN (non-lung): '+str(other_nonodules_FN_nolung_low))

Fibrosis/scar low FP: 40
Other non-nodules low FP: 56
Other non-nodules low FP (lung): 44
Other non-nodules low FP (non-lung): 12
Fibrosis/scar low FN: 18
Other non-nodules low FN: 10
Other non-nodules low FN (lung): 10
Other non-nodules low FN (non-lung): 0


Some of the above findings cannot be classified as lung/non-lung findings and might be better to manually checked or add extra conditions above.

In [None]:
#Detailed comparison of FP categories for low and high BMI groups (no volume subgroups)

df_categories=pd.DataFrame(columns=['Incorrectly detected by AI','Incorrectly detected by reader'], #below index with the correct order as above
                          index=['fibrosis/scar low','other non-nodules low'])

df_categories.index.name = 'GT by radiologists for discrepancies'

df_categories['Incorrectly detected by AI']=[fibrosis_FP_low,other_nonodules_FP_low]

df_categories['Incorrectly detected by reader']=[fibrosis_FN_low,other_nonodules_FN_low]

df_categories['All findings']=df_categories['Incorrectly detected by AI']+df_categories['Incorrectly detected by reader'] #Sum of findings for each of emph/non-emph categories

df_categories.loc['Total']= df_categories.sum() #Total FP findings for AI/reader

all_findings=df_categories.iloc[:-1,:-1].sum().sum() #All findings

#Add percentages next to the number of each category
percentage_fp=np.round((df_categories['Incorrectly detected by AI']/all_findings)*100,1)  
df_categories['Incorrectly detected by AI']=[str(value[1])+' ('+str(percentage_fp[index])+'%)' 
                                             for index,value in enumerate(df_categories['Incorrectly detected by AI'].items())]

percentage_fn=np.round((df_categories['Incorrectly detected by reader']/all_findings)*100,1) 
df_categories['Incorrectly detected by reader']=[str(value[1])+' ('+str(percentage_fn[index])+'%)' 
                                                 for index,value in enumerate(df_categories['Incorrectly detected by reader'].items())]

df_categories['All findings']=[str(val)+' ('+str(np.round(100*val/all_findings,1))+'%)' for val in df_categories['All findings'].values]

# #Rename columns
# df_categories.rename(columns={'FP': 'Incorrectly detected by AI', 'FN': 'Incorrectly detected by reader'}, inplace=True)

df_categories

Unnamed: 0_level_0,Incorrectly detected by AI,Incorrectly detected by reader,All findings
GT by radiologists for discrepancies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
fibrosis/scar low,40 (32.3%),18 (14.5%),58 (46.8%)
other non-nodules low,56 (45.2%),10 (8.1%),66 (53.2%)
Total,96 (77.4%),28 (22.6%),124 (100.0%)


In [None]:
# df_categories.style.to_latex() #Just as a starting point - Need to be modified manually

In [None]:
df_categories.to_excel('non_nodules_low.xlsx')

In [None]:
# Detailed comparison of FP categories for low and high BMI groups (no volume subgroups)

df_categories=pd.DataFrame(columns=['Incorrectly detected by AI','Incorrectly detected by reader'], #below index with the correct order as above
                          index=['fibrosis/scar low','other non-nodules lung low','other non-nodules nolung low','other non-nodules (no description)'])

df_categories.index.name = 'GT by radiologists for discrepancies'

rest_no_desc_fp=other_nonodules_FP_low-other_nonodules_FP_lung_low-other_nonodules_FP_nolung_low
df_categories['Incorrectly detected by AI']=[fibrosis_FP_low,other_nonodules_FP_lung_low, other_nonodules_FP_nolung_low, rest_no_desc_fp]

rest_no_desc_fn=other_nonodules_FN_low-other_nonodules_FN_lung_low-other_nonodules_FN_nolung_low
df_categories['Incorrectly detected by reader']=[fibrosis_FN_low,other_nonodules_FN_lung_low, other_nonodules_FN_nolung_low, rest_no_desc_fn]

df_categories['All findings']=df_categories['Incorrectly detected by AI']+df_categories['Incorrectly detected by reader'] #Sum of findings for each of emph/non-emph categories

df_categories.loc['Total']= df_categories.sum() #Total FP findings for AI/reader

all_findings=df_categories.iloc[:-1,:-1].sum().sum() #All findings

#Add percentages next to the number of each category
percentage_fp=np.round((df_categories['Incorrectly detected by AI']/all_findings)*100,1)  
df_categories['Incorrectly detected by AI']=[str(value[1])+' ('+str(percentage_fp[index])+'%)' 
                                             for index,value in enumerate(df_categories['Incorrectly detected by AI'].items())]

percentage_fn=np.round((df_categories['Incorrectly detected by reader']/all_findings)*100,1) 
df_categories['Incorrectly detected by reader']=[str(value[1])+' ('+str(percentage_fn[index])+'%)' 
                                                 for index,value in enumerate(df_categories['Incorrectly detected by reader'].items())]

df_categories['All findings']=[str(val)+' ('+str(np.round(100*val/all_findings,1))+'%)' for val in df_categories['All findings'].values]

# #Rename columns
# df_categories.rename(columns={'FP': 'Incorrectly detected by AI', 'FN': 'Incorrectly detected by reader'}, inplace=True)

df_categories

Unnamed: 0_level_0,Incorrectly detected by AI,Incorrectly detected by reader,All findings
GT by radiologists for discrepancies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
fibrosis/scar low,40 (32.3%),18 (14.5%),58 (46.8%)
other non-nodules lung low,44 (35.5%),10 (8.1%),54 (43.5%)
other non-nodules nolung low,12 (9.7%),0 (0.0%),12 (9.7%)
other non-nodules (no description),0 (0.0%),0 (0.0%),0 (0.0%)
Total,96 (77.4%),28 (22.6%),124 (100.0%)


In [None]:
# df_categories.to_excel('non_nodules_types_low.xlsx')

#### High BMI non-nodules

In [None]:
#Same as above for high BMI group
fibrosis_FP_high=sum([len(x) for x in fibrosis_FP_high.values()])
other_nonodules_FP_high=sum([len(x) for x in other_nonodules_FP_high.values()])
other_nonodules_FP_lung_high=sum([len(x) for x in other_nonodules_FP_lung_high.values()])
other_nonodules_FP_nolung_high=sum([len(x) for x in other_nonodules_FP_nolung_high.values()])
fibrosis_FN_high=sum([len(x) for x in fibrosis_FN_high.values()])
other_nonodules_FN_high=sum([len(x) for x in other_nonodules_FN_high.values()])
other_nonodules_FN_lung_high=sum([len(x) for x in other_nonodules_FN_lung_high.values()])
other_nonodules_FN_nolung_high=sum([len(x) for x in other_nonodules_FN_nolung_high.values()])

#Print the above
print('Fibrosis/scar high FP: '+str(fibrosis_FP_high))
print('Other non-nodules high FP: '+str(other_nonodules_FP_high))
print('Other non-nodules high FP (lung): '+str(other_nonodules_FP_lung_high))
print('Other non-nodules high FP (non-lung): '+str(other_nonodules_FP_nolung_high))
print('Fibrosis/scar high FN: '+str(fibrosis_FN_high))
print('Other non-nodules high FN: '+str(other_nonodules_FN_high))
print('Other non-nodules high FN (lung): '+str(other_nonodules_FN_lung_high))
print('Other non-nodules high FN (non-lung): '+str(other_nonodules_FN_nolung_high))

Fibrosis/scar high FP: 12
Other non-nodules high FP: 41
Other non-nodules high FP (lung): 15
Other non-nodules high FP (non-lung): 26
Fibrosis/scar high FN: 1
Other non-nodules high FN: 8
Other non-nodules high FN (lung): 8
Other non-nodules high FN (non-lung): 0


In [None]:
#Same as above for high

#Detailed comparison of FP categories for low and high BMI groups (no volume subgroups)

df_categories=pd.DataFrame(columns=['Incorrectly detected by AI','Incorrectly detected by reader'], #below index with the correct order as above
                          index=['fibrosis/scar high',
                                 'other non-nodules high'
                                ])

df_categories.index.name = 'GT by radiologists for discrepancies'

df_categories['Incorrectly detected by AI']=[fibrosis_FP_high,other_nonodules_FP_high]

df_categories['Incorrectly detected by reader']=[fibrosis_FN_high,other_nonodules_FN_high]

df_categories['All findings']=df_categories['Incorrectly detected by AI']+df_categories['Incorrectly detected by reader']

df_categories.loc['Total']= df_categories.sum()

all_findings=df_categories.iloc[:-1,:-1].sum().sum()

percentage_fp=np.round((df_categories['Incorrectly detected by AI']/all_findings)*100,1)  
df_categories['Incorrectly detected by AI']=[str(value[1])+' ('+str(percentage_fp[index])+'%)' 
                                             for index,value in enumerate(df_categories['Incorrectly detected by AI'].items())]

percentage_fn=np.round((df_categories['Incorrectly detected by reader']/all_findings)*100,1) 
df_categories['Incorrectly detected by reader']=[str(value[1])+' ('+str(percentage_fn[index])+'%)' 
                                                 for index,value in enumerate(df_categories['Incorrectly detected by reader'].items())]

df_categories['All findings']=[str(val)+' ('+str(np.round(100*val/all_findings,1))+'%)' for val in df_categories['All findings'].values]

df_categories

Unnamed: 0_level_0,Incorrectly detected by AI,Incorrectly detected by reader,All findings
GT by radiologists for discrepancies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
fibrosis/scar high,12 (19.4%),1 (1.6%),13 (21.0%)
other non-nodules high,41 (66.1%),8 (12.9%),49 (79.0%)
Total,53 (85.5%),9 (14.5%),62 (100.0%)


In [None]:
df_categories.to_excel('non_nodules_high.xlsx')

In [None]:
#Same as above for high

#Detailed comparison of FP categories for low and high BMI groups (no volume subgroups)

df_categories=pd.DataFrame(columns=['Incorrectly detected by AI','Incorrectly detected by reader'], #below index with the correct order as above
                          index=['fibrosis/scar high',
                                 'other non-nodules lung high','other non-nodules nolung high','other non-nodules (no description)'
                                ])

df_categories.index.name = 'GT by radiologists for discrepancies'

rest_no_desc_fp=other_nonodules_FP_high-other_nonodules_FP_lung_high-other_nonodules_FP_nolung_high
df_categories['Incorrectly detected by AI']=[fibrosis_FP_high,other_nonodules_FP_lung_high,other_nonodules_FP_nolung_high,rest_no_desc_fp]

rest_no_desc_fn=other_nonodules_FN_high-other_nonodules_FN_lung_high-other_nonodules_FN_nolung_high
df_categories['Incorrectly detected by reader']=[fibrosis_FN_high,other_nonodules_FN_lung_high,other_nonodules_FN_nolung_high,rest_no_desc_fn]

df_categories['All findings']=df_categories['Incorrectly detected by AI']+df_categories['Incorrectly detected by reader']

df_categories.loc['Total']= df_categories.sum()

all_findings=df_categories.iloc[:-1,:-1].sum().sum()

percentage_fp=np.round((df_categories['Incorrectly detected by AI']/all_findings)*100,1)  
df_categories['Incorrectly detected by AI']=[str(value[1])+' ('+str(percentage_fp[index])+'%)' 
                                             for index,value in enumerate(df_categories['Incorrectly detected by AI'].items())]

percentage_fn=np.round((df_categories['Incorrectly detected by reader']/all_findings)*100,1) 
df_categories['Incorrectly detected by reader']=[str(value[1])+' ('+str(percentage_fn[index])+'%)' 
                                                 for index,value in enumerate(df_categories['Incorrectly detected by reader'].items())]

df_categories['All findings']=[str(val)+' ('+str(np.round(100*val/all_findings,1))+'%)' for val in df_categories['All findings'].values]

df_categories

Unnamed: 0_level_0,Incorrectly detected by AI,Incorrectly detected by reader,All findings
GT by radiologists for discrepancies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
fibrosis/scar high,12 (19.4%),1 (1.6%),13 (21.0%)
other non-nodules lung high,15 (24.2%),8 (12.9%),23 (37.1%)
other non-nodules nolung high,26 (41.9%),0 (0.0%),26 (41.9%)
other non-nodules (no description),0 (0.0%),0 (0.0%),0 (0.0%)
Total,53 (85.5%),9 (14.5%),62 (100.0%)


In [None]:
# df_categories.to_excel('non_nodules_types_high.xlsx')

### Metrics Calculation

In [None]:
#Load number of nodules and lymph nodes for each of the high/low BMI groups - These are the TPs

#Define nodule group names
nod_groups_only=['sub_ground','pleural', 'calcified','other_all','atypical_triangular'] 
lymph_groups=['per_fisu','peri_bronch'] 

#Initialize number of TP for each of the reader and AI to 0

#These can also be the sum of the volume subgroups below - Kept as is for now
TP_nod_low=0
TP_nod_high=0
TP_lymph_low=0
TP_lymph_high=0

TP_nod_low_30_100=0
TP_nod_high_30_100=0
TP_nod_low_100_300=0
TP_nod_high_100_300=0
TP_nod_low_300=0
TP_nod_high_300=0

TP_lymph_low_30_100=0
TP_lymph_high_30_100=0
TP_lymph_low_100_300=0
TP_lymph_high_100_300=0
TP_lymph_low_300=0
TP_lymph_high_300=0


for deg in ['_high','_low']: #Loop over high/low BMI groups
    
    for nod_group in nod_groups_only: #Loop over nodule groups
        
        #Load variables with TP created in 'patient_selection_BMI_experiment.ipynb' notebook
        with open(nod_group+deg+'_nod_only'+'.pickle','rb') as f:
            exec(nod_group+deg+'_nod_only= pickle.load(f)')
            
        #Same for each volume subgroup    
        with open(nod_group+deg+'_nod_only_30_100'+'.pickle','rb') as f:
            exec(nod_group+deg+'_nod_only_30_100= pickle.load(f)')   
        with open(nod_group+deg+'_nod_only_100_300'+'.pickle','rb') as f:
            exec(nod_group+deg+'_nod_only_100_300= pickle.load(f)')             
        with open(nod_group+deg+'_nod_only_300'+'.pickle','rb') as f:
            exec(nod_group+deg+'_nod_only_300= pickle.load(f)')    

        if deg=='_low': #Set variables depending on if we have low of high BMI + for each volume subgroup
            TP_nod_low=TP_nod_low+eval(nod_group+deg+'_nod_only')
            
            TP_nod_low_30_100=TP_nod_low_30_100+eval(nod_group+deg+'_nod_only_30_100')
            TP_nod_low_100_300=TP_nod_low_100_300+eval(nod_group+deg+'_nod_only_100_300')
            TP_nod_low_300=TP_nod_low_300+eval(nod_group+deg+'_nod_only_300')
            
        else:
            TP_nod_high=TP_nod_high+eval(nod_group+deg+'_nod_only')
            
            TP_nod_high_30_100=TP_nod_high_30_100+eval(nod_group+deg+'_nod_only_30_100')   
            TP_nod_high_100_300=TP_nod_high_100_300+eval(nod_group+deg+'_nod_only_100_300')
            TP_nod_high_300=TP_nod_high_300+eval(nod_group+deg+'_nod_only_300')

        
    for lymph_group in lymph_groups: #Similar as above for lymph node groups
        
        with open(lymph_group+deg+'_lymph'+'.pickle','rb') as f:
            exec(lymph_group+deg+'_lymph= pickle.load(f)')
            
        with open(lymph_group+deg+'_lymph_30_100'+'.pickle','rb') as f:
            exec(lymph_group+deg+'_lymph_30_100= pickle.load(f)')   
        with open(lymph_group+deg+'_lymph_100_300'+'.pickle','rb') as f:
            exec(lymph_group+deg+'_lymph_100_300= pickle.load(f)')             
        with open(lymph_group+deg+'_lymph_300'+'.pickle','rb') as f:
            exec(lymph_group+deg+'_lymph_300= pickle.load(f)')    
            
        if deg=='_low':
            TP_lymph_low=TP_lymph_low+eval(lymph_group+deg+'_lymph')
            
            TP_lymph_low_30_100=TP_lymph_low_30_100+eval(lymph_group+deg+'_lymph_30_100')
            TP_lymph_low_100_300=TP_lymph_low_100_300+eval(lymph_group+deg+'_lymph_100_300')
            TP_lymph_low_300=TP_lymph_low_300+eval(lymph_group+deg+'_lymph_300')
            
        else:
            TP_lymph_high=TP_lymph_high+eval(lymph_group+deg+'_lymph')
            TP_lymph_high_30_100=TP_lymph_high_30_100+eval(lymph_group+deg+'_lymph_30_100')   
            TP_lymph_high_100_300=TP_lymph_high_100_300+eval(lymph_group+deg+'_lymph_100_300')
            TP_lymph_high_300=TP_lymph_high_300+eval(lymph_group+deg+'_lymph_300')

#### Below definition of TP depends on reader/AI

In [None]:
#Get total number of nodules (nodules+lymph nodes) for the whole low/high BMI groups and for volume subgroups
#300+ volumes kept here since all these values should be 0 - If not, then delete them
TP_low=TP_nod_low+TP_lymph_low
TP_high=TP_nod_high+TP_lymph_high

TP_low_30_100=TP_nod_low_30_100+TP_lymph_low_30_100
TP_low_100_300=TP_nod_low_100_300+TP_lymph_low_100_300
TP_low_300=TP_nod_low_300+TP_lymph_low_300
TP_high_30_100=TP_nod_high_30_100+TP_lymph_high_30_100
TP_high_100_300=TP_nod_high_100_300+TP_lymph_high_100_300
TP_high_300=TP_nod_high_300+TP_lymph_high_300

In [None]:
assert TP_low==TP_low_30_100+TP_low_100_300+TP_low_300
assert TP_high==TP_high_30_100+TP_high_100_300+TP_high_300

#### Confidence Interval Calculations

In [None]:
#Code below taken from https://gist.github.com/maidens/29939b3383a5e57935491303cf0d8e0b
#For F1 score there was a suggestion on https://github.com/sousanunes/confidence_intervals/blob/master/propagation_confidence_interval.py
#This will not used since it assumes normal distribution

def _proportion_confidence_interval(r, n, z): 
    """Compute confidence interval for a proportion.
    https://real-statistics.com/binomial-and-related-distributions/proportion-distribution/proportion-parameter-confidence-interval/
    Follows notation described on pages 46--47 of [1]. 
    
    References
    ----------
    [1] R. G. Newcombe and D. G. Altman, Proportions and their differences, in Statisics
    with Confidence: Confidence intervals and statisctical guidelines, 2nd Ed., D. G. Altman, 
    D. Machin, T. N. Bryant and M. J. Gardner (Eds.), pp. 45-57, BMJ Books, 2000. 

    Based on the book, r is the observed number of subjects with some feature in a sample of size n. z is a percentile from the norm distribution.
    The formula in the link of the code is the same as in https://real-statistics.com/binomial-and-related-distributions/proportion-distribution/proportion-parameter-confidence-interval/
    There is no continuity correction here. This is used in http://stats.org.uk/statistical-inference/Newcombe1998.pdf
    The actual implementation used continuity correction. This is recommended for small sample sizes:  
    https://towardsdatascience.com/five-confidence-intervals-for-proportions-that-you-should-know-about-7ff5484c024f
    """
    
    A = 2*r + z**2
    # B = z*np.sqrt(z**2 + 4*r*(1 - r/n))
    B_low=1+z*np.sqrt(z**2 + 4*r*(1 - r/n) + (((4*r)-(2*n)-1)/n))
    # if B_low<0:
    #     B_low=0
    
    B_high=1+z*np.sqrt(z**2 + 4*r*(1 - r/n) - (((4*r)-(2*n)+1)/n))
    # if B_high>1:
    #     B_high=1

    C = 2*(n + z**2)
    return ((A-B_low)/C, (A+B_high)/C)


def sensitivity_and_specificity_with_confidence_intervals(TP, FP, FN, TN, alpha=0.95):
    """Compute confidence intervals for sensitivity and specificity using Wilson's method. 
    Based on https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval this calculation is without continuity correction.
    For more information about that check on https://www.statskingdom.com/doc_confidence_interval.html
    Based on the first link, of the possible approximations, Wilson score interval methods (with or without continuity correction) 
    have been shown to be the most accurate and the most robust, though some prefer the Agresti–Coull approach for larger sample sizes
    Another link for that is https://statisticaloddsandends.wordpress.com/2019/06/09/wilson-score-and-agresti-coull-intervals-for-binomial-proportions/
    
    This method does not rely on a normal approximation and results in accurate confidence intervals even for small sample sizes.
    
    Parameters
    ----------
    TP : int
        Number of true positives
    FP : int 
        Number of false positives
    FN : int
        Number of false negatives
    TN : int
        Number of true negatives
    alpha : float, optional
        Desired confidence. Defaults to 0.95, which yields a 95% confidence interval. 
    
    Returns
    -------
    sensitivity_confidence_interval : Tuple (float, float)
        Lower and upper bounds on the alpha confidence interval for sensitivity
    PPV_confidence_interval : Tuple (float, float)
        Lower and upper bounds on the alpha confidence interval for PPV
    F1_confidence_interval : Tuple (float, float)
        Lower and upper bounds on the alpha confidence interval for F1 score
        
    References
    ----------
    [1] R. G. Newcombe and D. G. Altman, Proportions and their differences, in Statisics
    with Confidence: Confidence intervals and statisctical guidelines, 2nd Ed., D. G. Altman, 
    D. Machin, T. N. Bryant and M. J. Gardner (Eds.), pp. 45-57, BMJ Books, 2000. 
    [2] E. B. Wilson, Probable inference, the law of succession, and statistical inference,
    J Am Stat Assoc 22:209-12, 1927. 
    """
    
    z = -ndtri((1.0-alpha)/2)
    
    # Compute sensitivity using method described in [1] 
    sensitivity_confidence_interval = _proportion_confidence_interval(TP, TP + FN, z)

     # Compute PPV
    PPV_confidence_interval = _proportion_confidence_interval(TP, TP + FP, z)
    
    #Compute F1score
    F1_confidence_interval = _proportion_confidence_interval(2*TP, 2*TP + (FP+FN), z) #if n=TP+FP+FN used we get nan errors - sample size of proportion should be with 2*TP
    # Check also based on https://stats.stackexchange.com/questions/363382/confidence-interval-of-precision-recall-and-f1-score
    #It is not a binomial outcome (eg. like accuracy which is num of correct over num of predicted) and so, we probably
    #can't apply any number of binomial conf intervals as stated in https://stats.stackexchange.com/questions/563582/calculate-confidence-intervals-on-accuracy-metrics
    #Also if data not normally distributed we cannot use simple formulas like those in https://aegis4048.github.io/comprehensive_confidence_intervals_for_python_developers

    return sensitivity_confidence_interval, PPV_confidence_interval, F1_confidence_interval

In [None]:
def sensitivity(TP,FN): #same as recall
    return TP/(TP+FN)

def PPV(TP,FP): #Same as precision
    return TP/(TP+FP)

def F1score(TP,FP,FN):
    return (2*TP)/(2*TP+(FP+FN))

#Metrics with TN in their definition can't be used

##### Example of CI calculation

In [None]:
for a in [0.5]: #Can also set other values of a to check the CI

    sensitivity_confidence_interval, PPV_confidence_interval, F1_confidence_interval\
    = sensitivity_and_specificity_with_confidence_intervals(37, 20, 2, 0, alpha=a) #Here TP, FP, FN, TN were set based on an example below - just for demonstration

    print("Sensitivity: %f, PPV: %f, F1 score: %f" %(sensitivity(37,2), PPV(37,20),F1score(37,20,2)))
    print("alpha = %f CI for sensitivity:"%a, sensitivity_confidence_interval)
    print("alpha = %f CI for PPV:"%a, PPV_confidence_interval)
    print("alpha = %f CI for F1 score:"%a, F1_confidence_interval)    
    #Confidence intervals of proportions were calculated using the Wilson method (with continuity correction). 

Sensitivity: 0.948718, PPV: 0.649123, F1 score: 0.770833
alpha = 0.500000 CI for sensitivity: (0.9040941124792334, 0.9775414611018507)
alpha = 0.500000 CI for PPV: (0.5965219235920692, 0.6988783015915245)
alpha = 0.500000 CI for F1 score: (0.7352529801795313, 0.8034004651312406)


From the intervals above we can conclude that we won't get the same results if we use normal approximations (z=1.96 and mean between lower and upper bound of CI)

In [None]:
#Get total number of nodules/non-nodules that were detected only by AI/human reader for each of the low/high BMI groups

#nodules+Lymph nodes included in the right part of the equations below - if only nodules comment 'ai_lymph_..' and activate comments in reader below
FP_nods_low=ai_only_nods_low_30_100+ai_only_nods_low_100_300+ai_only_nods_low_300 +(ai_lymph_low_30_100+ai_lymph_low_100_300+ai_lymph_low_300)
FP_nods_high=ai_only_nods_high_30_100+ai_only_nods_high_100_300+ai_only_nods_high_300 +(ai_lymph_high_30_100+ai_lymph_high_100_300+ai_lymph_high_300)

FP_nonods_low=ai_nonods_low_30_100+ai_nonods_low_100_300+ai_nonods_low_300
FP_nonods_high=ai_nonods_high_30_100+ai_nonods_high_100_300+ai_nonods_high_300

FN_nods_low=reader_nods_low_30_100+reader_nods_low_100_300+reader_nods_low_300 #-(reader_lymph_low_30_100+reader_lymph_low_100_300+reader_lymph_low_300)
FN_nods_high=reader_nods_high_30_100+reader_nods_high_100_300+reader_nods_high_300 #-(reader_lymph_high_30_100+reader_lymph_high_100_300+reader_lymph_high_300)

FN_nonods_low=reader_nonods_low_30_100+reader_nonods_low_100_300+reader_nonods_low_300
FN_nonods_high=reader_nonods_high_30_100+reader_nonods_high_100_300+reader_nonods_high_300

#Similar only for lymph nodes
lymph_reader_low=reader_lymph_low_30_100+reader_lymph_low_100_300+reader_lymph_low_300
lymph_reader_high=reader_lymph_high_30_100+reader_lymph_high_100_300+reader_lymph_high_300
lymph_AI_low=ai_lymph_low_30_100+ai_lymph_low_100_300+ai_lymph_low_300
lymph_AI_high=ai_lymph_high_30_100+ai_lymph_high_100_300+ai_lymph_high_300

Explanation below assumes that GT is whatever is detected only! For TN, this might be incorrect! We assumed that TP (in REDCap) will always be nodules, even though sometimes this might not be correct.

To calculate metrics for AI we consider the following (demonstrated for emphysema - same for non-emphysema cases):

1. TP_AI=TP_both+FP_nods_emph (nodules found as nodules) 
2. FP_AI=FP_nonods_emph (non-nodules found as nodules)
3. FN_AI=FN_nods_emph (nodules missed by AI)

Similarly, for reader metrics:

1. TP_read=TP_both+FN_nods_emph
2. FP_read=FN_nonods_emph
3. FN_read=FP_nods_emph

'AI found and reader found' can be seen from TP in REDCap

'AI missed and reader missed' does not exist - assumes that consensus found extra nodules while they just reviewed discrepancies

In [None]:
AI_found_lymph=lymph_AI_low+lymph_AI_high
read_found_lymph=lymph_reader_low+lymph_reader_high

reader_found_only=FN_nods_low+FN_nods_high - read_found_lymph
AI_found_only=FP_nods_low+FP_nods_high-AI_found_lymph
print("Actual number of nodules among discrepancies is",reader_found_only+AI_found_only)
print("From those {} were detected by the AI only and {} from reader only".format(AI_found_only,reader_found_only))
print("\n")
print("Actual number of lymph nodes among discrepancies is",read_found_lymph+AI_found_lymph)
print("From those {} were detected by the AI only and {} from reader only".format(AI_found_lymph,read_found_lymph))

Actual number of nodules among discrepancies is 56
From those 39 were detected by the AI only and 17 from reader only


Actual number of lymph nodes among discrepancies is 57
From those 14 were detected by the AI only and 43 from reader only


## Low BMI - Main analysis

#### Table only for nodules (+atypical PFNs) - lymph nodes (typical PFNs and bronchovascular) not included in calculations (considered as non-existent). Statistical tests based on it

In [None]:
TP_AI=TP_low+FP_nods_low-(TP_lymph_low) -lymph_AI_low #'FP_nods' include lymph and that's why we subtract 'lymph_AI_emph'
FP_AI=FP_nonods_low
FN_AI=TP_read_only=FN_nods_low-lymph_reader_low #nodules detected only by the reader, excluding lymph nodes

TP_read=TP_low+FN_nods_low-(TP_lymph_low) - lymph_reader_low
FP_read=FN_nonods_low
FN_read=TP_AI_only=FP_nods_low-lymph_AI_low #nodules detected only by AI, excluding lymph nodes

TP_both=TP_low-(TP_lymph_low) #Common nodules detected by both AI and reader

#Print the above
print("Low BMI numbers")
print("TP_AI",TP_AI)
print("FP_AI",FP_AI)
print("FN_AI",FN_AI)
print("TP_read",TP_read)
print("FP_read",FP_read)
print("FN_read",FN_read)
print("TP_both",TP_both)

Low BMI numbers
TP_AI 81
FP_AI 96
FN_AI 4
TP_read 68
FP_read 28
FN_read 17
TP_both 64


In [None]:
#Two tables: one for low and one for high BMI (below), having also percentages.
#Assessing detection performance for low/high groups - For nodules only, we treat lymph nodes as non-existent

df_all_new=pd.DataFrame(columns=['sensitivity (95% CI)','PPV (95% CI)','F1 score (95% CI)','nodules detected','non-nodules detected','nodules missed'],
                        index=['AI, low', 'reader, low'] )

#AI nodules only
df_all_new.iloc[0,0]=np.round(sensitivity(TP_AI,FN_AI),2)
df_all_new.iloc[0,1]=np.round(PPV(TP_AI,FP_AI),2)
df_all_new.iloc[0,2]=np.round(F1score(TP_AI,FP_AI,FN_AI),2)
df_all_new.iloc[0,3]=TP_AI
df_all_new.iloc[0,4]=FP_AI
df_all_new.iloc[0,5]=FN_AI

#Calculate CIs for sensitivity, PPV, and F1score
sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_AI, FP_AI, FN_AI, 0, alpha=0.95)

#Round CIs to 2 digits
ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[0]=str(df_all_new['sensitivity (95% CI)'].iloc[0])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[0]=str(df_all_new['PPV (95% CI)'].iloc[0])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[0]=str(df_all_new['F1 score (95% CI)'].iloc[0])+' '+str(tuple(ci_f1_ai))


#Reader nodules only
df_all_new.iloc[1,0]=np.round(sensitivity(TP_read,FN_read),2)
df_all_new.iloc[1,1]=np.round(PPV(TP_read,FP_read),2)
df_all_new.iloc[1,2]=np.round(F1score(TP_read,FP_read,FN_read),2)
df_all_new.iloc[1,3]=TP_read
df_all_new.iloc[1,4]=FP_read
df_all_new.iloc[1,5]=FN_read

sensitivity_confidence_interval_read, PPV_confidence_interval_read, F1_confidence_interval_read\
    = sensitivity_and_specificity_with_confidence_intervals(TP_read, FP_read, FN_read, 0, alpha=0.95)

ci_sens_read=[np.round(x,2) for x in sensitivity_confidence_interval_read]
ci_ppv_read=[np.round(x,2) for x in PPV_confidence_interval_read]
ci_f1_read=[np.round(x,2) for x in F1_confidence_interval_read]

df_all_new['sensitivity (95% CI)'].iloc[1]=str(df_all_new['sensitivity (95% CI)'].iloc[1])+' '+str(tuple(ci_sens_read))
df_all_new['PPV (95% CI)'].iloc[1]=str(df_all_new['PPV (95% CI)'].iloc[1])+' '+str(tuple(ci_ppv_read))
df_all_new['F1 score (95% CI)'].iloc[1]=str(df_all_new['F1 score (95% CI)'].iloc[1])+' '+str(tuple(ci_f1_read))


df_all_new['All findings']=df_all_new['nodules detected']+df_all_new['non-nodules detected']+df_all_new['nodules missed']
df_all_new.loc['Total']= df_all_new.sum()
df_all_new.loc['Total'].iloc[0:3]=''

all_findings=df_all_new.iloc[:-1,3:-1].sum().sum()

for i in range(2): #Add percentages to df
    row_all=np.sum(df_all_new.iloc[i][3:6].values) #get all values for a given row

    percentage_tp=np.round((df_all_new.iloc[i][3]/row_all)*100,1) #% of TP
    df_all_new['nodules detected'].iloc[i]=str(df_all_new.iloc[i][3])+' ('+str(percentage_tp)+'%)'
    percentage_fp=np.round((df_all_new.iloc[i][4]/row_all)*100,1) #% of FP
    df_all_new['non-nodules detected'].iloc[i]=str(df_all_new.iloc[i][4])+' ('+str(percentage_fp)+'%)'
    percentage_fn=np.round((df_all_new.iloc[i][5]/row_all)*100,1) #% of FN
    df_all_new['nodules missed'].iloc[i]=str(df_all_new.iloc[i][5])+' ('+str(percentage_fn)+'%)'

    df_all_new['All findings'].iloc[i]=str(df_all_new.iloc[i][6])+' (100%)'

df_all_new

Unnamed: 0,sensitivity (95% CI),PPV (95% CI),F1 score (95% CI),nodules detected,non-nodules detected,nodules missed,All findings
"AI, low","0.95 (0.88, 0.98)","0.46 (0.38, 0.53)","0.62 (0.56, 0.68)",81 (44.8%),96 (53.0%),4 (2.2%),181 (100%)
"reader, low","0.8 (0.7, 0.88)","0.71 (0.61, 0.79)","0.75 (0.68, 0.81)",68 (60.2%),28 (24.8%),17 (15.0%),113 (100%)
Total,,,,149,124,21,294


In [None]:
df_all_new.to_excel('nodules_atypical_only_low.xlsx')

#### McNemar' test

- If we want it with continuity correction we should use 'exact=False, correction=False'. We can compare when it's not applied to see if these values are on different sides of the traditional 0.05 cutoff. If they are, we would have to check the 'exact=True' method to decide which to keep (no corrections at all).  Taken from https://cran.r-project.org/web/packages/exact2x2/vignettes/exactMcNemar.pdf
- Continuity corrections no longer used based on https://stats.stackexchange.com/questions/6448/continuity-correction-for-pearson-and-mcnemars-chi-square-test but statistician suggested it due to small sample size
- McNemar's test is used when we want to know whether there is a statistically significant difference in the proportion of nodules detected by AI and reader (paraphrased from https://www.geeksforgeeks.org/how-to-perform-mcnemars-test-in-python/).
- Other useful sources: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8182550/ (paper used it for similar topic), https://stats.stackexchange.com/questions/358101/statistical-significance-p-value-for-comparing-two-classifiers-with-respect-to

In [None]:
#McNemar's test to compare Reader vs AI (using consensus panel)
#Below format is: [[Both AI found and reader found, reader missed and AI found], [Reader found and AI missed, 0]]

#For nodules
data=[[TP_both, FN_read],
        [FN_AI,0]]
# print(data)

# McNemar's Test without continuity correction
print("McNemar's test (nodules only), AI_vs_Reader with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue) 


#For FPs
data=[[0, FP_AI], 
        [FP_read, 0]]
# print(data)

# McNemar's Test without continuity correction
print("For FP findings, with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue)

McNemar's test (nodules only), AI_vs_Reader with continuity correction (not exact) p value is 0.00882876095281863
For FP findings, with continuity correction (not exact) p value is 1.7792323833716621e-09


#### Cohen's Kappa

- According to https://en.wikipedia.org/wiki/Fleiss%27_kappa, we must use Fleiss kappa when assessing the agreement between three or more raters or the intra-rater reliability (for one appraiser versus themself). To calculate this, the fleiss_kappa() function from the statsmodels library can be used. Cohen's kappa can be used for two readers and this is what we use below (https://www.statology.org/cohens-kappa-python/).We should only have 0 or 1 labels since otherwise it is considered as a multiclass problem
- Other useful sources https://www.statology.org/cohens-kappa-statistic/, https://vitalflux.com/cohen-kappa-score-python-example-machine-learning/
- Based on the last one, in the contigency table we have reader 1 (actual results) horizontally and reader 2 (predictions) vertically. For this to be true, reasonable to assume reader 1 is GT by radiologists and reader 2 either reader or AI

In [None]:
# #AI vs GT, TP_both included
# # Table looks like below:
# #              GT
# #             Yes                       No
# # AI   Yes    TP_both+TP_AI_only      FP_AI
# #      No     FN_AI                   0

# list_of_lists=[[1 for x in range(TP_both)],[1 for x in range(TP_AI_only)],[0 for x in range(FP_AI) ],[1 for x in range(FN_AI) ] ] 
# rater_GT=[item for sublist in list_of_lists for item in sublist]

# list_of_lists=[[1 for x in range(TP_both)], [1 for x in range(TP_AI_only)],[1 for x in range(FP_AI) ],[0 for x in range(FN_AI) ]]
# rater_AI=[item for sublist in list_of_lists for item in sublist]

# print("AI vs consensus (for nodules only), kappa is ",cohen_kappa_score(rater_GT, rater_AI))

# #Reader vs GT, TP_both included
# # Table looks like below:
# #                    GT
# #                   Yes                       No
# # Reader   Yes    TP_both+TP_read_only      FP_read
# #           No     FN_read                   0

# list_of_lists=[[1 for x in range(TP_both)],[1 for x in range(TP_read_only)],[0 for x in range(FP_read) ],[1 for x in range(FN_read) ] ] 
# rater_GT=[item for sublist in list_of_lists for item in sublist]

# list_of_lists=[[1 for x in range(TP_both)],[1 for x in range(TP_read_only)],[1 for x in range(FP_read) ],[0 for x in range(FN_read) ]]
# rater_read=[item for sublist in list_of_lists for item in sublist]

# print("Reader vs consensus (for non-nodules only), kappa is ",cohen_kappa_score(rater_GT, rater_read))

## High BMI - Main analysis

#### Table only for nodules (+atypical PFNs) - lymph nodes (typical PFNs and bronchovascular) not included in calculations (considered as non-existent). Statistical tests based on it

In [None]:
TP_AI= TP_high+FP_nods_high -(TP_lymph_high) -lymph_AI_high #'FP_nods' include lymph and that's why we subtract 'lymph_AI_high'
FP_AI=FP_nonods_high
FN_AI=TP_read_only=FN_nods_high-lymph_reader_high #nodules detected only by the reader, excluding lymph nodes

TP_read=TP_high+FN_nods_high -(TP_lymph_high) - lymph_reader_high
FP_read=FN_nonods_high
FN_read=TP_AI_only=FP_nods_high-lymph_AI_high #nodules detected only by AI, excluding lymph nodes

TP_both=TP_high-(TP_lymph_high) #Common nodules detected by both AI and reader

#Print the above
print("High BMI numbers")
print("TP_AI",TP_AI)
print("FP_AI",FP_AI)
print("FN_AI",FN_AI)
print("TP_read",TP_read)
print("FP_read",FP_read)
print("FN_read",FN_read)
print("TP_both",TP_both)

High BMI numbers
TP_AI 77
FP_AI 53
FN_AI 13
TP_read 68
FP_read 9
FN_read 22
TP_both 55


In [None]:
# Second part of table split for high BMI
    
#Assessing detection performance - For nodules only, we treat lymph nodes as non-existent
df_all_new=pd.DataFrame(columns=['sensitivity (95% CI)','PPV (95% CI)','F1 score (95% CI)','nodules detected','non-nodules detected','nodules missed'],
                        index=['AI, high', 'reader, high'])

#AI nodules only
df_all_new.iloc[0,0]=np.round(sensitivity(TP_AI,FN_AI),2)
df_all_new.iloc[0,1]=np.round(PPV(TP_AI,FP_AI),2)
df_all_new.iloc[0,2]=np.round(F1score(TP_AI,FP_AI,FN_AI),2)
df_all_new.iloc[0,3]=TP_AI
df_all_new.iloc[0,4]=FP_AI
df_all_new.iloc[0,5]=FN_AI

#Calculate CIs for sensitivity, PPV, and F1score
sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_AI, FP_AI, FN_AI, 0, alpha=0.95)

#Round CIs to 2 digits
ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[0]=str(df_all_new['sensitivity (95% CI)'].iloc[0])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[0]=str(df_all_new['PPV (95% CI)'].iloc[0])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[0]=str(df_all_new['F1 score (95% CI)'].iloc[0])+' '+str(tuple(ci_f1_ai))

#Reader nodules only
df_all_new.iloc[1,0]=np.round(sensitivity(TP_read,FN_read),2)
df_all_new.iloc[1,1]=np.round(PPV(TP_read,FP_read),2)
df_all_new.iloc[1,2]=np.round(F1score(TP_read,FP_read,FN_read),2)
df_all_new.iloc[1,3]=TP_read
df_all_new.iloc[1,4]=FP_read
df_all_new.iloc[1,5]=FN_read

sensitivity_confidence_interval_read, PPV_confidence_interval_read, F1_confidence_interval_read\
    = sensitivity_and_specificity_with_confidence_intervals(TP_read, FP_read, FN_read, 0, alpha=0.95)

ci_sens_read=[np.round(x,2) for x in sensitivity_confidence_interval_read]
ci_ppv_read=[np.round(x,2) for x in PPV_confidence_interval_read]
ci_f1_read=[np.round(x,2) for x in F1_confidence_interval_read]

df_all_new['sensitivity (95% CI)'].iloc[1]=str(df_all_new['sensitivity (95% CI)'].iloc[1])+' '+str(tuple(ci_sens_read))
df_all_new['PPV (95% CI)'].iloc[1]=str(df_all_new['PPV (95% CI)'].iloc[1])+' '+str(tuple(ci_ppv_read))
df_all_new['F1 score (95% CI)'].iloc[1]=str(df_all_new['F1 score (95% CI)'].iloc[1])+' '+str(tuple(ci_f1_read))


df_all_new['All findings']=df_all_new['nodules detected']+df_all_new['non-nodules detected']+df_all_new['nodules missed']
df_all_new.loc['Total']= df_all_new.sum()
df_all_new.loc['Total'].iloc[0:3]=''

all_findings=df_all_new.iloc[:-1,3:-1].sum().sum()

for i in range(2): #Add percentages to df
    row_all=np.sum(df_all_new.iloc[i][3:6].values)

    percentage_fp=np.round((df_all_new.iloc[i][4]/row_all)*100,1) 
    df_all_new['non-nodules detected'].iloc[i]=str(df_all_new.iloc[i][4])+' ('+str(percentage_fp)+'%)'
    percentage_tp=np.round((df_all_new.iloc[i][3]/row_all)*100,1) 
    df_all_new['nodules detected'].iloc[i]=str(df_all_new.iloc[i][3])+' ('+str(percentage_tp)+'%)'
    percentage_fn=np.round((df_all_new.iloc[i][5]/row_all)*100,1) 
    df_all_new['nodules missed'].iloc[i]=str(df_all_new.iloc[i][5])+' ('+str(percentage_fn)+'%)'

    df_all_new['All findings'].iloc[i]=str(df_all_new.iloc[i][6])+' (100%)'

df_all_new

Unnamed: 0,sensitivity (95% CI),PPV (95% CI),F1 score (95% CI),nodules detected,non-nodules detected,nodules missed,All findings
"AI, high","0.86 (0.76, 0.92)","0.59 (0.5, 0.68)","0.7 (0.63, 0.76)",77 (53.8%),53 (37.1%),13 (9.1%),143 (100%)
"reader, high","0.76 (0.65, 0.84)","0.88 (0.78, 0.94)","0.81 (0.75, 0.87)",68 (68.7%),9 (9.1%),22 (22.2%),99 (100%)
Total,,,,145,62,35,242


In [None]:
df_all_new.to_excel('nodules_atypical_only_high.xlsx')

##### McNemar's test

In [None]:
#For nodules
data=[[TP_both, FN_read],
        [FN_AI,0]]
# print(data)

# McNemar's Test without continuity correction
print("For nodules only (AI vs reader) with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue)


#For FPs
data=[[0, FP_AI], 
        [FP_read, 0]]
# print(data)

# McNemar's Test without continuity correction
print("For FP findings, with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue)

For nodules only (AI vs reader) with continuity correction (not exact) p value is 0.17629637444050728
For FP findings, with continuity correction (not exact) p value is 4.7344548308441886e-08


##### Cohen's Kappa

In [None]:
# #AI vs GT, TP_both included - 
# list_of_lists=[[1 for x in range(TP_both)],[1 for x in range(TP_AI_only)],[0 for x in range(FP_AI) ],[1 for x in range(FN_AI) ] ] 
# rater_GT=[item for sublist in list_of_lists for item in sublist]

# list_of_lists=[[1 for x in range(TP_both)], [1 for x in range(TP_AI_only)],[1 for x in range(FP_AI) ],[0 for x in range(FN_AI) ]]
# rater_AI=[item for sublist in list_of_lists for item in sublist]

# print("AI vs consensus (for nodules only), kappa is ",cohen_kappa_score(rater_GT, rater_AI))

# #Reader vs GT, TP_both included
# list_of_lists=[[1 for x in range(TP_both)],[1 for x in range(TP_read_only)],[0 for x in range(FP_read) ],[1 for x in range(FN_read) ] ] 
# rater_GT=[item for sublist in list_of_lists for item in sublist]

# list_of_lists=[[1 for x in range(TP_both)],[1 for x in range(TP_read_only)],[1 for x in range(FP_read) ],[0 for x in range(FN_read) ]]
# rater_read=[item for sublist in list_of_lists for item in sublist]

# print("Reader vs consensus (for non-nodules only), kappa is ",cohen_kappa_score(rater_GT, rater_read))

#### Comparison of volume subgroups for low BMI - Statistical tests based on it

In [None]:
TP_AI_100=TP_low_30_100+ai_only_nods_low_30_100-(TP_lymph_low_30_100)# -ai_lymph_low_30_100  
FP_AI_100=ai_nonods_low_30_100
FN_AI_100=reader_nods_low_30_100-reader_lymph_low_30_100 #nodules of reader excluding lymph nodes

TP_read_100=TP_low_30_100+reader_nods_low_30_100-(TP_lymph_low_30_100) - reader_lymph_low_30_100
FP_read_100=reader_nonods_low_30_100
FN_read_100=ai_only_nods_low_30_100#-ai_lymph_low_30_100 #nodules of AI excluding lymph nodes

TP_AI_100_300=TP_low_100_300+ai_only_nods_low_100_300-(TP_lymph_low_100_300)# -(ai_lymph_low_100_300)
FP_AI_100_300=ai_nonods_low_100_300
FN_AI_100_300=reader_nods_low_100_300-(reader_lymph_low_100_300) #nodules of reader excluding lymph nodes

TP_read_100_300=TP_low_100_300+reader_nods_low_100_300-(TP_lymph_low_100_300) - (reader_lymph_low_100_300)
FP_read_100_300=reader_nonods_low_100_300
FN_read_100_300=ai_only_nods_low_100_300#-(ai_lymph_low_100_300) #nodules of AI excluding lymph nodes

TP_AI_300=TP_low_300+ai_only_nods_low_300-(TP_lymph_low_300)# -(ai_lymph_low_300)
FP_AI_300=ai_nonods_low_300
FN_AI_300=reader_nods_low_300-(reader_lymph_low_300) #nodules of reader excluding lymph nodes

TP_read_300=TP_low_300+reader_nods_low_300-(TP_lymph_low_300) - (reader_lymph_low_300)
FP_read_300=reader_nonods_low_300
FN_read_300=ai_only_nods_low_300#-(ai_lymph_low_300) #nodules of AI excluding lymph nodes

TP_both_100=TP_low_30_100-(TP_lymph_low_30_100) 
TP_both_100_300=TP_low_100_300-(TP_lymph_low_100_300)
TP_both_300=TP_low_300-(TP_lymph_low_300)

#Print the above
print("Low BMI numbers")
print("TP_AI_100",TP_AI_100)
print("FP_AI_100",FP_AI_100)
print("FN_AI_100",FN_AI_100)
print("TP_read_100",TP_read_100)
print("FP_read_100",FP_read_100)
print("FN_read_100",FN_read_100)
print("TP_AI_100_300",TP_AI_100_300)
print("FP_AI_100_300",FP_AI_100_300)
print("FN_AI_100_300",FN_AI_100_300)
print("TP_read_100_300",TP_read_100_300)
print("FP_read_100_300",FP_read_100_300)
print("FN_read_100_300",FN_read_100_300)
print("TP_AI_300",TP_AI_300)
print("FP_AI_300",FP_AI_300)
print("FN_AI_300",FN_AI_300)
print("TP_read_300",TP_read_300)
print("FP_read_300",FP_read_300)
print("FN_read_300",FN_read_300)
print("TP_both_100",TP_both_100)
print("TP_both_100_300",TP_both_100_300)
print("TP_both_300",TP_both_300)

Low BMI numbers
TP_AI_100 62
FP_AI_100 26
FN_AI_100 4
TP_read_100 53
FP_read_100 22
FN_read_100 13
TP_AI_100_300 14
FP_AI_100_300 43
FN_AI_100_300 0
TP_read_100_300 11
FP_read_100_300 5
FN_read_100_300 3
TP_AI_300 5
FP_AI_300 27
FN_AI_300 0
TP_read_300 4
FP_read_300 1
FN_read_300 1
TP_both_100 49
TP_both_100_300 11
TP_both_300 4


In [None]:
#For low BMI comparison between reader and AI for volume subgroups
df_all_new=pd.DataFrame(columns=['sensitivity (95% CI)','PPV (95% CI)','F1 score (95% CI)','nodules correctly detected','non-nodules incorrectly detected','nodules missed'], 
                        index=['AI, low 30-100mm3', 'AI, low 100-300mm3','AI, low 300+mm3','',
                               'reader, low 30-100mm3', 'reader, low 100-300mm3','reader, low 300+mm3',''
                              ])

df_all_new.index.name = 'GT by radiologists for discrepancies' 

df_all_new.iloc[0,0]=np.round(sensitivity(TP_AI_100,FN_AI_100),2)
df_all_new.iloc[0,1]=np.round(PPV(TP_AI_100,FP_AI_100),2)
df_all_new.iloc[0,2]=np.round(F1score(TP_AI_100,FP_AI_100,FN_AI_100),2)
df_all_new.iloc[0,3]=TP_AI_100
df_all_new.iloc[0,4]=FP_AI_100
df_all_new.iloc[0,5]=FN_AI_100

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_AI_100, FP_AI_100, FN_AI_100, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[0]=str(df_all_new['sensitivity (95% CI)'].iloc[0])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[0]=str(df_all_new['PPV (95% CI)'].iloc[0])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[0]=str(df_all_new['F1 score (95% CI)'].iloc[0])+' '+str(tuple(ci_f1_ai))


df_all_new.iloc[4,0]=np.round(sensitivity(TP_read_100,FN_read_100),2)
df_all_new.iloc[4,1]=np.round(PPV(TP_read_100,FP_read_100),2)
df_all_new.iloc[4,2]=np.round(F1score(TP_read_100,FP_read_100,FN_read_100),2)
df_all_new.iloc[4,3]=TP_read_100
df_all_new.iloc[4,4]=FP_read_100
df_all_new.iloc[4,5]=FN_read_100

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_read_100, FP_read_100, FN_read_100, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[4]=str(df_all_new['sensitivity (95% CI)'].iloc[4])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[4]=str(df_all_new['PPV (95% CI)'].iloc[4])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[4]=str(df_all_new['F1 score (95% CI)'].iloc[4])+' '+str(tuple(ci_f1_ai))



df_all_new.iloc[1,0]=np.round(sensitivity(TP_AI_100_300,FN_AI_100_300),2)
df_all_new.iloc[1,1]=np.round(PPV(TP_AI_100_300,FP_AI_100_300),2)
df_all_new.iloc[1,2]=np.round(F1score(TP_AI_100_300,FP_AI_100_300,FN_AI_100_300),2)
df_all_new.iloc[1,3]=TP_AI_100_300
df_all_new.iloc[1,4]=FP_AI_100_300
df_all_new.iloc[1,5]=FN_AI_100_300

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_AI_100_300, FP_AI_100_300, FN_AI_100_300, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[1]=str(df_all_new['sensitivity (95% CI)'].iloc[1])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[1]=str(df_all_new['PPV (95% CI)'].iloc[1])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[1]=str(df_all_new['F1 score (95% CI)'].iloc[1])+' '+str(tuple(ci_f1_ai))



df_all_new.iloc[5,0]=np.round(sensitivity(TP_read_100_300,FN_read_100_300),2)
df_all_new.iloc[5,1]=np.round(PPV(TP_read_100_300,FP_read_100_300),2)
df_all_new.iloc[5,2]=np.round(F1score(TP_read_100_300,FP_read_100_300,FN_read_100_300),2)
df_all_new.iloc[5,3]=TP_read_100_300
df_all_new.iloc[5,4]=FP_read_100_300
df_all_new.iloc[5,5]=FN_read_100_300

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_read_100_300, FP_read_100_300, FN_read_100_300, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[5]=str(df_all_new['sensitivity (95% CI)'].iloc[5])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[5]=str(df_all_new['PPV (95% CI)'].iloc[5])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[5]=str(df_all_new['F1 score (95% CI)'].iloc[5])+' '+str(tuple(ci_f1_ai))


df_all_new.iloc[2,0]=np.round(sensitivity(TP_AI_300,FN_AI_300),2)
df_all_new.iloc[2,1]=np.round(PPV(TP_AI_300,FP_AI_300),2)
df_all_new.iloc[2,2]=np.round(F1score(TP_AI_300,FP_AI_300,FN_AI_300),2)
df_all_new.iloc[2,3]=TP_AI_300
df_all_new.iloc[2,4]=FP_AI_300
df_all_new.iloc[2,5]=FN_AI_300

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_AI_300, FP_AI_300, FN_AI_300, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[2]=str(df_all_new['sensitivity (95% CI)'].iloc[2])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[2]=str(df_all_new['PPV (95% CI)'].iloc[2])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[2]=str(df_all_new['F1 score (95% CI)'].iloc[2])+' '+str(tuple(ci_f1_ai))


df_all_new.iloc[6,0]=np.round(sensitivity(TP_read_300,FN_read_300),2)
df_all_new.iloc[6,1]=np.round(PPV(TP_read_300,FP_read_300),2)
df_all_new.iloc[6,2]=np.round(F1score(TP_read_300,FP_read_300,FN_read_300),2)
df_all_new.iloc[6,3]=TP_read_300
df_all_new.iloc[6,4]=FP_read_300
df_all_new.iloc[6,5]=FN_read_300

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_read_300, FP_read_300, FN_read_300, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[6]=str(df_all_new['sensitivity (95% CI)'].iloc[6])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[6]=str(df_all_new['PPV (95% CI)'].iloc[6])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[6]=str(df_all_new['F1 score (95% CI)'].iloc[6])+' '+str(tuple(ci_f1_ai))


df_all_new.iloc[3,0]=0
df_all_new.iloc[3,1]=0
df_all_new.iloc[3,2]=0
df_all_new.iloc[3,3]=0
df_all_new.iloc[3,4]=0
df_all_new.iloc[3,5]=0


AI_all=np.sum(df_all_new.iloc[0:3,3:].values)
reader_all=np.sum(df_all_new.iloc[4:7,3:].values)

df_all_new['All findings']=df_all_new['nodules correctly detected']+df_all_new['non-nodules incorrectly detected']+df_all_new['nodules missed']

df_all_new.iloc[7,0]=''
df_all_new.iloc[7,1]=''
df_all_new.iloc[7,2]=''
df_all_new.iloc[7,3]=''
df_all_new.iloc[7,4]=''
df_all_new.iloc[7,5]=''
df_all_new.iloc[7,6]=np.sum(df_all_new['All findings'].iloc[4:6])

df_all_new.iloc[3,0]=''
df_all_new.iloc[3,1]=''
df_all_new.iloc[3,2]=''
df_all_new.iloc[3,3]=''
df_all_new.iloc[3,4]=''
df_all_new.iloc[3,5]=''
df_all_new.iloc[3,6]=np.sum(df_all_new['All findings'].iloc[0:3])

for i in range(7):
    if i!=3:
        if i<3:
            sum_all=AI_all
        elif i>3:
            sum_all=reader_all
            
        percentage_tp=np.round((df_all_new.iloc[i][3]/sum_all)*100,1) 
        df_all_new['nodules correctly detected'].iloc[i]=str(df_all_new.iloc[i][3])+' ('+str(percentage_tp)+'%)'

        percentage_fp=np.round((df_all_new.iloc[i][4]/sum_all)*100,1) 
        df_all_new['non-nodules incorrectly detected'].iloc[i]=str(df_all_new.iloc[i][4])+' ('+str(percentage_fp)+'%)'

        percentage_fn=np.round((df_all_new.iloc[i][5]/sum_all)*100,1) 
        df_all_new['nodules missed'].iloc[i]=str(df_all_new.iloc[i][5])+' ('+str(percentage_fn)+'%)'

        df_all_new['All findings'].iloc[i]=str(df_all_new.iloc[i][6])+' ('+str(np.round(100*df_all_new.iloc[i][6]/sum_all,1))+'%)'

    
df_all_new['All findings'].iloc[3]=str(df_all_new.iloc[3][6])+' (100%)'
df_all_new['All findings'].iloc[7]=str(df_all_new.iloc[7][6])+' (100%)'

df_all_new #Detection performance comparison for nodules only

Unnamed: 0_level_0,sensitivity (95% CI),PPV (95% CI),F1 score (95% CI),nodules correctly detected,non-nodules incorrectly detected,nodules missed,All findings
GT by radiologists for discrepancies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"AI, low 30-100mm3","0.94 (0.84, 0.98)","0.7 (0.6, 0.79)","0.81 (0.73, 0.86)",62 (34.3%),26 (14.4%),4 (2.2%),92 (50.8%)
"AI, low 100-300mm3","1.0 (0.73, 0.99)","0.25 (0.15, 0.38)","0.39 (0.28, 0.52)",14 (7.7%),43 (23.8%),0 (0.0%),57 (31.5%)
"AI, low 300+mm3","1.0 (0.46, 0.98)","0.16 (0.06, 0.34)","0.27 (0.14, 0.44)",5 (2.8%),27 (14.9%),0 (0.0%),32 (17.7%)
,,,,,,,181 (100%)
"reader, low 30-100mm3","0.8 (0.68, 0.89)","0.71 (0.59, 0.8)","0.75 (0.67, 0.82)",53 (46.9%),22 (19.5%),13 (11.5%),88 (77.9%)
"reader, low 100-300mm3","0.79 (0.49, 0.94)","0.69 (0.41, 0.88)","0.73 (0.54, 0.87)",11 (9.7%),5 (4.4%),3 (2.7%),19 (16.8%)
"reader, low 300+mm3","0.8 (0.3, 0.99)","0.8 (0.3, 0.99)","0.8 (0.44, 0.96)",4 (3.5%),1 (0.9%),1 (0.9%),6 (5.3%)
,,,,,,,107 (100%)


In [None]:
df_all_new.to_excel('nodules_only_volumes_low_all.xlsx')

In [None]:
data=[[TP_both_100, FN_read_100],
        [FN_AI_100,0]]
# print(data)

# McNemar's Test without continuity correction
print("For nodules only (AI vs reader) of 30-100mm3 with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue)

#For FPs
data=[[0, FP_AI_100], 
        [FP_read_100, 0]]
# print(data)

# McNemar's Test without continuity correction
print("For FP findings of 30-100mm3, with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue)
print("\n")


data=[[TP_both_100_300,FN_read_100_300], 
        [FN_AI_100_300, 0]]
# print(data)

# McNemar's Test without continuity correction
print("For nodules only (AI vs reader) of 100-300mm3 with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue)

#For FPs
data=[[0, FP_AI_100_300], 
        [FP_read_100_300, 0]]
# print(data)

# McNemar's Test without continuity correction
print("For FP findings of 100-300mm3, with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue)
print("\n")


data=[[TP_both_300,FN_read_300], 
        [FN_AI_300, 0]]
# print(data)

# McNemar's Test without continuity correction
print("For nodules only (AI vs reader) of 300+mm3 with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue)

#For FPs
data=[[0, FP_AI_300], 
        [FP_read_300, 0]]
# print(data)

# McNemar's Test without continuity correction
print("For FP findings of 300+mm3, with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue)

For nodules only (AI vs reader) of 30-100mm3 with continuity correction (not exact) p value is 0.052345063273163295
For FP findings of 30-100mm3, with continuity correction (not exact) p value is 0.6650055421020291


For nodules only (AI vs reader) of 100-300mm3 with continuity correction (not exact) p value is 0.24821307898992026
For FP findings of 100-300mm3, with continuity correction (not exact) p value is 9.269569613019149e-08


For nodules only (AI vs reader) of 300+mm3 with continuity correction (not exact) p value is 1.0
For FP findings of 300+mm3, with continuity correction (not exact) p value is 2.306187599291998e-06


#### Comparison of volume subgroups for high BMI - Statistical tests based on it

In [None]:
TP_AI_100=TP_high_30_100+ai_only_nods_high_30_100-(TP_lymph_high_30_100) #-ai_lymph_high_30_100
FP_AI_100=ai_nonods_high_30_100
FN_AI_100=reader_nods_high_30_100-reader_lymph_high_30_100 #nodules of reader excluding lymph nodes

TP_read_100=TP_high_30_100+reader_nods_high_30_100-(TP_lymph_high_30_100) - reader_lymph_high_30_100
FP_read_100=reader_nonods_high_30_100
FN_read_100=ai_only_nods_high_30_100#-ai_lymph_high_30_100 #nodules of AI excluding lymph nodes

TP_AI_100_300=TP_high_100_300+ai_only_nods_high_100_300-(TP_lymph_high_100_300)# -(ai_lymph_high_100_300)
FP_AI_100_300=ai_nonods_high_100_300
FN_AI_100_300=reader_nods_high_100_300-(reader_lymph_high_100_300) #nodules of reader excluding lymph nodes

TP_read_100_300=TP_high_100_300+reader_nods_high_100_300-(TP_lymph_high_100_300) - (reader_lymph_high_100_300)
FP_read_100_300=reader_nonods_high_100_300
FN_read_100_300=ai_only_nods_high_100_300#-(ai_lymph_high_100_300) #nodules of AI excluding lymph nodes

TP_AI_300=TP_high_300+ai_only_nods_high_300-(TP_lymph_high_300) #-(ai_lymph_high_300)
FP_AI_300=ai_nonods_high_300
FN_AI_300=reader_nods_high_300-(reader_lymph_high_300) #nodules of reader excluding lymph nodes

TP_read_300=TP_high_300+reader_nods_high_300-(TP_lymph_high_300) - (reader_lymph_high_300)
FP_read_300=reader_nonods_high_300
FN_read_300=ai_only_nods_high_300#-(ai_lymph_high_300) #nodules of AI excluding lymph nodes


TP_both_100=TP_high_30_100-(TP_lymph_high_30_100)
TP_both_100_300=TP_high_100_300-(TP_lymph_high_100_300)
TP_both_300=TP_high_300-(TP_lymph_high_300)

#Print the above
print("High BMI numbers")
print("TP_AI_100",TP_AI_100)
print("FP_AI_100",FP_AI_100)
print("FN_AI_100",FN_AI_100)
print("TP_read_100",TP_read_100)
print("FP_read_100",FP_read_100)
print("FN_read_100",FN_read_100)
print("TP_AI_100_300",TP_AI_100_300)
print("FP_AI_100_300",FP_AI_100_300)
print("FN_AI_100_300",FN_AI_100_300)
print("TP_read_100_300",TP_read_100_300)
print("FP_read_100_300",FP_read_100_300)
print("FN_read_100_300",FN_read_100_300)
print("TP_AI_300",TP_AI_300)
print("FP_AI_300",FP_AI_300)
print("FN_AI_300",FN_AI_300)
print("TP_read_300",TP_read_300)
print("FP_read_300",FP_read_300)
print("FN_read_300",FN_read_300)
print("TP_both_100",TP_both_100)
print("TP_both_100_300",TP_both_100_300)
print("TP_both_300",TP_both_300)

High BMI numbers
TP_AI_100 58
FP_AI_100 11
FN_AI_100 11
TP_read_100 51
FP_read_100 8
FN_read_100 18
TP_AI_100_300 15
FP_AI_100_300 27
FN_AI_100_300 2
TP_read_100_300 13
FP_read_100_300 1
FN_read_100_300 4
TP_AI_300 4
FP_AI_300 15
FN_AI_300 0
TP_read_300 4
FP_read_300 0
FN_read_300 0
TP_both_100 40
TP_both_100_300 11
TP_both_300 4


In [None]:
#For non-emphysema only comparison between reader and AI for volume subgroups
df_all_new=pd.DataFrame(columns=['sensitivity (95% CI)','PPV (95% CI)','F1 score (95% CI)','nodules correctly detected','non-nodules incorrectly detected','nodules missed'], 
                        index=['AI, high 30-100mm3', 'AI, high 100-300mm3','AI, high 300+mm3','',
                               'reader, high 30-100mm3','reader, high 100-300mm3', 'reader, high 300+mm3',''
                              ])

df_all_new.index.name = 'GT by radiologists for discrepancies' 


df_all_new.iloc[0,0]=np.round(sensitivity(TP_AI_100,FN_AI_100),2)
df_all_new.iloc[0,1]=np.round(PPV(TP_AI_100,FP_AI_100),2)
df_all_new.iloc[0,2]=np.round(F1score(TP_AI_100,FP_AI_100,FN_AI_100),2)
df_all_new.iloc[0,3]=TP_AI_100
df_all_new.iloc[0,4]=FP_AI_100
df_all_new.iloc[0,5]=FN_AI_100

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_AI_100, FP_AI_100, FN_AI_100, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[0]=str(df_all_new['sensitivity (95% CI)'].iloc[0])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[0]=str(df_all_new['PPV (95% CI)'].iloc[0])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[0]=str(df_all_new['F1 score (95% CI)'].iloc[0])+' '+str(tuple(ci_f1_ai))


df_all_new.iloc[4,0]=np.round(sensitivity(TP_read_100,FN_read_100),2)
df_all_new.iloc[4,1]=np.round(PPV(TP_read_100,FP_read_100),2)
df_all_new.iloc[4,2]=np.round(F1score(TP_read_100,FP_read_100,FN_read_100),2)
df_all_new.iloc[4,3]=TP_read_100
df_all_new.iloc[4,4]=FP_read_100
df_all_new.iloc[4,5]=FN_read_100

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_read_100, FP_read_100, FN_read_100, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[4]=str(df_all_new['sensitivity (95% CI)'].iloc[4])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[4]=str(df_all_new['PPV (95% CI)'].iloc[4])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[4]=str(df_all_new['F1 score (95% CI)'].iloc[4])+' '+str(tuple(ci_f1_ai))



df_all_new.iloc[1,0]=np.round(sensitivity(TP_AI_100_300,FN_AI_100_300),2)
df_all_new.iloc[1,1]=np.round(PPV(TP_AI_100_300,FP_AI_100_300),2)
df_all_new.iloc[1,2]=np.round(F1score(TP_AI_100_300,FP_AI_100_300,FN_AI_100_300),2)
df_all_new.iloc[1,3]=TP_AI_100_300
df_all_new.iloc[1,4]=FP_AI_100_300
df_all_new.iloc[1,5]=FN_AI_100_300

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_AI_100_300, FP_AI_100_300, FN_AI_100_300, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[1]=str(df_all_new['sensitivity (95% CI)'].iloc[1])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[1]=str(df_all_new['PPV (95% CI)'].iloc[1])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[1]=str(df_all_new['F1 score (95% CI)'].iloc[1])+' '+str(tuple(ci_f1_ai))



df_all_new.iloc[5,0]=np.round(sensitivity(TP_read_100_300,FN_read_100_300),2)
df_all_new.iloc[5,1]=np.round(PPV(TP_read_100_300,FP_read_100_300),2)
df_all_new.iloc[5,2]=np.round(F1score(TP_read_100_300,FP_read_100_300,FN_read_100_300),2)
df_all_new.iloc[5,3]=TP_read_100_300
df_all_new.iloc[5,4]=FP_read_100_300
df_all_new.iloc[5,5]=FN_read_100_300

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_read_100_300, FP_read_100_300, FN_read_100_300, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[5]=str(df_all_new['sensitivity (95% CI)'].iloc[5])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[5]=str(df_all_new['PPV (95% CI)'].iloc[5])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[5]=str(df_all_new['F1 score (95% CI)'].iloc[5])+' '+str(tuple(ci_f1_ai))


df_all_new.iloc[2,0]=np.round(sensitivity(TP_AI_300,FN_AI_300),2)
df_all_new.iloc[2,1]=np.round(PPV(TP_AI_300,FP_AI_300),2)
df_all_new.iloc[2,2]=np.round(F1score(TP_AI_300,FP_AI_300,FN_AI_300),2)
df_all_new.iloc[2,3]=TP_AI_300
df_all_new.iloc[2,4]=FP_AI_300
df_all_new.iloc[2,5]=FN_AI_300

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_AI_300, FP_AI_300, FN_AI_300, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[2]=str(df_all_new['sensitivity (95% CI)'].iloc[2])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[2]=str(df_all_new['PPV (95% CI)'].iloc[2])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[2]=str(df_all_new['F1 score (95% CI)'].iloc[2])+' '+str(tuple(ci_f1_ai))


df_all_new.iloc[6,0]=np.round(sensitivity(TP_read_300,FN_read_300),2)
df_all_new.iloc[6,1]=np.round(PPV(TP_read_300,FP_read_300),2)
df_all_new.iloc[6,2]=np.round(F1score(TP_read_300,FP_read_300,FN_read_300),2)
df_all_new.iloc[6,3]=TP_read_300
df_all_new.iloc[6,4]=FP_read_300
df_all_new.iloc[6,5]=FN_read_300

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_read_300, FP_read_300, FN_read_300, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[6]=str(df_all_new['sensitivity (95% CI)'].iloc[6])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[6]=str(df_all_new['PPV (95% CI)'].iloc[6])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[6]=str(df_all_new['F1 score (95% CI)'].iloc[6])+' '+str(tuple(ci_f1_ai))


df_all_new.iloc[3,0]=0
df_all_new.iloc[3,1]=0
df_all_new.iloc[3,2]=0
df_all_new.iloc[3,3]=0
df_all_new.iloc[3,4]=0
df_all_new.iloc[3,5]=0


AI_all=np.sum(df_all_new.iloc[0:3,3:].values)
reader_all=np.sum(df_all_new.iloc[4:7,3:].values)

df_all_new['All findings']=df_all_new['nodules correctly detected']+df_all_new['non-nodules incorrectly detected']+df_all_new['nodules missed']

df_all_new.iloc[7,0]=''
df_all_new.iloc[7,1]=''
df_all_new.iloc[7,2]=''
df_all_new.iloc[7,3]=''
df_all_new.iloc[7,4]=''
df_all_new.iloc[7,5]=''
df_all_new.iloc[7,6]=np.sum(df_all_new['All findings'].iloc[4:6])

df_all_new.iloc[3,0]=''
df_all_new.iloc[3,1]=''
df_all_new.iloc[3,2]=''
df_all_new.iloc[3,3]=''
df_all_new.iloc[3,4]=''
df_all_new.iloc[3,5]=''
df_all_new.iloc[3,6]=np.sum(df_all_new['All findings'].iloc[0:3])

for i in range(7):
    if i!=3:
        if i<3:
            sum_all=AI_all
        elif i>3:
            sum_all=reader_all
            
        percentage_tp=np.round((df_all_new.iloc[i][3]/sum_all)*100,1) 
        df_all_new['nodules correctly detected'].iloc[i]=str(df_all_new.iloc[i][3])+' ('+str(percentage_tp)+'%)'

        percentage_fp=np.round((df_all_new.iloc[i][4]/sum_all)*100,1) 
        df_all_new['non-nodules incorrectly detected'].iloc[i]=str(df_all_new.iloc[i][4])+' ('+str(percentage_fp)+'%)'

        percentage_fn=np.round((df_all_new.iloc[i][5]/sum_all)*100,1) 
        df_all_new['nodules missed'].iloc[i]=str(df_all_new.iloc[i][5])+' ('+str(percentage_fn)+'%)'

        df_all_new['All findings'].iloc[i]=str(df_all_new.iloc[i][6])+' ('+str(np.round(100*df_all_new.iloc[i][6]/sum_all,1))+'%)'

    
df_all_new['All findings'].iloc[3]=str(df_all_new.iloc[3][6])+' (100%)'
df_all_new['All findings'].iloc[7]=str(df_all_new.iloc[7][6])+' (100%)'

df_all_new #Detection performance comparison for nodules only

Unnamed: 0_level_0,sensitivity (95% CI),PPV (95% CI),F1 score (95% CI),nodules correctly detected,non-nodules incorrectly detected,nodules missed,All findings
GT by radiologists for discrepancies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"AI, high 30-100mm3","0.84 (0.73, 0.91)","0.84 (0.73, 0.91)","0.84 (0.77, 0.9)",58 (40.6%),11 (7.7%),11 (7.7%),80 (55.9%)
"AI, high 100-300mm3","0.88 (0.62, 0.98)","0.36 (0.22, 0.52)","0.51 (0.38, 0.64)",15 (10.5%),27 (18.9%),2 (1.4%),44 (30.8%)
"AI, high 300+mm3","1.0 (0.4, 0.98)","0.21 (0.07, 0.46)","0.35 (0.17, 0.57)",4 (2.8%),15 (10.5%),0 (0.0%),19 (13.3%)
,,,,,,,143 (100%)
"reader, high 30-100mm3","0.74 (0.62, 0.83)","0.86 (0.74, 0.94)","0.8 (0.71, 0.86)",51 (51.5%),8 (8.1%),18 (18.2%),77 (77.8%)
"reader, high 100-300mm3","0.76 (0.5, 0.92)","0.93 (0.64, 1.0)","0.84 (0.66, 0.94)",13 (13.1%),1 (1.0%),4 (4.0%),18 (18.2%)
"reader, high 300+mm3","1.0 (0.4, 0.98)","1.0 (0.4, 0.98)","1.0 (0.6, 0.99)",4 (4.0%),0 (0.0%),0 (0.0%),4 (4.0%)
,,,,,,,95 (100%)


In [None]:
df_all_new.to_excel('nodules_only_volumes_high_all.xlsx')

In [None]:
data=[[TP_both_100, FN_read_100],
        [FN_AI_100,0]]
# print(data)

# McNemar's Test without continuity correction
print("For nodules only (AI vs reader) of 30-100mm3 with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue)

#For FPs
data=[[0, FP_AI_100], 
        [FP_read_100, 0]]
# print(data)

# McNemar's Test without continuity correction
print("For FP findings of 30-100mm3, with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue)
print("\n")


data=[[TP_both_100_300,FN_read_100_300 ], 
        [FN_AI_100_300, 0]]
# print(data)

# McNemar's Test without continuity correction
print("For nodules only (AI vs reader) of 100-300mm3 with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue)

#For FPs
data=[[0, FP_AI_100_300], 
        [FP_read_100_300, 0]]
# print(data)

# McNemar's Test without continuity correction
print("For FP findings of 100-300mm3, with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue)
print("\n")


data=[[TP_both_300,FN_read_300], 
        [FN_AI_300, 0]]
# print(data)

# McNemar's Test without continuity correction
print("For nodules only (AI vs reader) of 300+mm3 with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue)

#For FPs
data=[[0, FP_AI_300], 
        [FP_read_300, 0]]
# print(data)

# McNemar's Test without continuity correction
print("For FP findings of 300+mm3, with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue)

For nodules only (AI vs reader) of 30-100mm3 with continuity correction (not exact) p value is 0.2652053925915044
For FP findings of 30-100mm3, with continuity correction (not exact) p value is 0.6463551955394902


For nodules only (AI vs reader) of 100-300mm3 with continuity correction (not exact) p value is 0.6830913983096086
For FP findings of 100-300mm3, with continuity correction (not exact) p value is 2.306187599291998e-06


For nodules only (AI vs reader) of 300+mm3 with continuity correction (not exact) p value is 0.0
For FP findings of 300+mm3, with continuity correction (not exact) p value is 0.0003005976074404506


Analysis based on volume for subcategories not possible since we only have volume subgroups for TPs

Nodule types - Here e.g. 'atypical_triangular_low_nod_only' instead of 'atypical_triangular_low_lymph'

In [7]:
#Both High and Low BMI
#Further analysis for nodule/lymph node subcategories - Not kept for now
#Detailed analysis of what detected or not from both AI and reader for each category in nodules & lymph nodes 

df_categories=pd.DataFrame(columns=['TP','FP','FN'], #below index with the correct order as above
                          index=['pleural nodules',
                                 'calcified nodules',
                                 'subsolid & ground glass nodules',
                                 'other nodules',
                                 'cancer',
                                 'atypical PFNs'
                                ])

# df_categories.index.name = 'GT by radiologists for discrepancies'

df_categories['FP']=[sum([len(x) for x in pleural_FP_low.values()])+sum([len(x) for x in pleural_FP_high.values()]),
                     sum([len(x) for x in calcif_FP_low.values()])+sum([len(x) for x in calcif_FP_high.values()]),
                     sum([len(x) for x in sub_ground_FP_low.values()])+sum([len(x) for x in sub_ground_FP_high.values()]),
                     sum([len(x) for x in other_nodules_FP_low.values()])+sum([len(x) for x in other_nodules_FP_high.values()]),
                     sum([len(x) for x in cancer_FP_low.values()])+sum([len(x) for x in cancer_FP_high.values()]),
                     sum([len(x) for x in atyp_FP_low.values()])+sum([len(x) for x in atyp_FP_high.values()])]

df_categories['FN']=[sum([len(x) for x in pleural_FN_low.values()])+sum([len(x) for x in pleural_FN_high.values()]),
                     sum([len(x) for x in calcif_FN_low.values()])+sum([len(x) for x in calcif_FN_high.values()]),
                     sum([len(x) for x in sub_ground_FN_low.values()])+sum([len(x) for x in sub_ground_FN_high.values()]),
                     sum([len(x) for x in other_nodules_FN_low.values()])+sum([len(x) for x in other_nodules_FN_high.values()]),
                     sum([len(x) for x in cancer_FN_low.values()])+sum([len(x) for x in cancer_FN_high.values()]),
                     sum([len(x) for x in atyp_FN_low.values()])+sum([len(x) for x in atyp_FN_high.values()])]

df_categories['TP']=[pleural_low_nod_only+pleural_high_nod_only,
                     calcified_low_nod_only+calcified_high_nod_only,
                     sub_ground_low_nod_only+sub_ground_high_nod_only,
                     other_all_low_nod_only+other_all_high_nod_only,
                     0,
                     atypical_triangular_low_nod_only+atypical_triangular_high_nod_only]

df_categories['All findings']=df_categories['FP']+df_categories['FN']+df_categories['TP']

df_categories.loc['Total']= df_categories.sum()

total_num_discrepancies_with_tp=df_categories.iloc[:-1,:-1].sum().sum() #To be used in next cells for percentages

all_findings=df_categories.iloc[:-1,:-1].sum().sum()

percentage_fp=np.round((df_categories['FP']/total_num_discrepancies_with_tp)*100,1) 
df_categories['FP']=[str(value[1])+' ('+str(percentage_fp[index])+'%)' for index,value in enumerate(df_categories['FP'].items())]

percentage_fn=np.round((df_categories['FN']/total_num_discrepancies_with_tp)*100,1) 
df_categories['FN']=[str(value[1])+' ('+str(percentage_fn[index])+'%)' for index,value in enumerate(df_categories['FN'].items())]

percentage_fn=np.round((df_categories['TP']/total_num_discrepancies_with_tp)*100,1) 
df_categories['TP']=[str(value[1])+' ('+str(percentage_fn[index])+'%)' for index,value in enumerate(df_categories['TP'].items())]

df_categories['All findings']=[str(val)+' ('+str(np.round(100*val/total_num_discrepancies_with_tp,1))+'%)' for val in df_categories['All findings'].values]

#Rename columns
df_categories.rename(columns={'FP': 'AI found, reader missed', 'FN': 'AI missed, reader found', 'TP':'Both found'}, inplace=True)

df_categories

In [None]:
df_categories.to_excel('nodule_types_all.xlsx')

In [8]:
#Low BMI only
#Further analysis for nodule/lymph node subcategories - Not kept for now
#Detailed analysis of what detected or not from both AI and reader for each category in nodules & lymph nodes 

df_categories=pd.DataFrame(columns=['TP','FP','FN'], #below index with the correct order as above
                          index=['pleural nodules',
                                 'calcified nodules',
                                 'subsolid & ground glass nodules',
                                 'other nodules',
                                 'cancer',
                                 'atypical PFNs'
                                ])

# df_categories.index.name = 'GT by radiologists for discrepancies'

df_categories['FP']=[sum([len(x) for x in pleural_FP_low.values()]),
                     sum([len(x) for x in calcif_FP_low.values()]),
                     sum([len(x) for x in sub_ground_FP_low.values()]),
                     sum([len(x) for x in other_nodules_FP_low.values()]),
                     sum([len(x) for x in cancer_FP_low.values()]),
                     sum([len(x) for x in atyp_FP_low.values()])]

df_categories['FN']=[sum([len(x) for x in pleural_FN_low.values()]),
                     sum([len(x) for x in calcif_FN_low.values()]),
                     sum([len(x) for x in sub_ground_FN_low.values()]),
                     sum([len(x) for x in other_nodules_FN_low.values()]),
                     sum([len(x) for x in cancer_FN_low.values()]),
                     sum([len(x) for x in atyp_FN_low.values()])]

df_categories['TP']=[pleural_low_nod_only,
                     calcified_low_nod_only,
                     sub_ground_low_nod_only,
                     other_all_low_nod_only,
                     0,
                     atypical_triangular_low_nod_only]

df_categories['All findings']=df_categories['FP']+df_categories['FN']+df_categories['TP']

df_categories.loc['Total']= df_categories.sum()

total_num_discrepancies_with_tp=df_categories.iloc[:-1,:-1].sum().sum() #To be used in next cells for percentages

all_findings=df_categories.iloc[:-1,:-1].sum().sum()

percentage_fp=np.round((df_categories['FP']/total_num_discrepancies_with_tp)*100,1) 
df_categories['FP']=[str(value[1])+' ('+str(percentage_fp[index])+'%)' for index,value in enumerate(df_categories['FP'].items())]

percentage_fn=np.round((df_categories['FN']/total_num_discrepancies_with_tp)*100,1) 
df_categories['FN']=[str(value[1])+' ('+str(percentage_fn[index])+'%)' for index,value in enumerate(df_categories['FN'].items())]

percentage_fn=np.round((df_categories['TP']/total_num_discrepancies_with_tp)*100,1) 
df_categories['TP']=[str(value[1])+' ('+str(percentage_fn[index])+'%)' for index,value in enumerate(df_categories['TP'].items())]

df_categories['All findings']=[str(val)+' ('+str(np.round(100*val/total_num_discrepancies_with_tp,1))+'%)' for val in df_categories['All findings'].values]

#Rename columns
df_categories.rename(columns={'FP': 'AI found, reader missed', 'FN': 'AI missed, reader found', 'TP':'Both found'}, inplace=True)

df_categories

In [None]:
# df_categories.to_excel('nodule_types_low.xlsx')

In [9]:
#High BMI
#Further analysis for nodule/lymph node subcategories - Not kept for now
#Detailed analysis of what detected or not from both AI and reader for each category in nodules & lymph nodes 

df_categories=pd.DataFrame(columns=['TP','FP','FN'], #below index with the correct order as above
                          index=['pleural nodules',
                                 'calcified nodules',
                                 'subsolid & ground glass nodules',
                                 'other nodules',
                                 'cancer',
                                 'atypical PFNs'
                                ])

# df_categories.index.name = 'GT by radiologists for discrepancies'

df_categories['FP']=[sum([len(x) for x in pleural_FP_high.values()]),
                     sum([len(x) for x in calcif_FP_high.values()]),
                     sum([len(x) for x in sub_ground_FP_high.values()]),
                     sum([len(x) for x in other_nodules_FP_high.values()]),
                     sum([len(x) for x in cancer_FP_high.values()]),
                     sum([len(x) for x in atyp_FP_high.values()])]

df_categories['FN']=[sum([len(x) for x in pleural_FN_high.values()]),
                     sum([len(x) for x in calcif_FN_high.values()]),
                     sum([len(x) for x in sub_ground_FN_high.values()]),
                     sum([len(x) for x in other_nodules_FN_high.values()]),
                     sum([len(x) for x in cancer_FN_high.values()]),
                     sum([len(x) for x in atyp_FN_high.values()])]

df_categories['TP']=[pleural_high_nod_only,
                     calcified_high_nod_only,
                     sub_ground_high_nod_only,
                     other_all_high_nod_only,
                     0,
                     atypical_triangular_high_nod_only]

df_categories['All findings']=df_categories['FP']+df_categories['FN']+df_categories['TP']

df_categories.loc['Total']= df_categories.sum()

total_num_discrepancies_with_tp=df_categories.iloc[:-1,:-1].sum().sum() #To be used in next cells for percentages

all_findings=df_categories.iloc[:-1,:-1].sum().sum()

percentage_fp=np.round((df_categories['FP']/total_num_discrepancies_with_tp)*100,1) 
df_categories['FP']=[str(value[1])+' ('+str(percentage_fp[index])+'%)' for index,value in enumerate(df_categories['FP'].items())]

percentage_fn=np.round((df_categories['FN']/total_num_discrepancies_with_tp)*100,1) 
df_categories['FN']=[str(value[1])+' ('+str(percentage_fn[index])+'%)' for index,value in enumerate(df_categories['FN'].items())]

percentage_fn=np.round((df_categories['TP']/total_num_discrepancies_with_tp)*100,1) 
df_categories['TP']=[str(value[1])+' ('+str(percentage_fn[index])+'%)' for index,value in enumerate(df_categories['TP'].items())]

df_categories['All findings']=[str(val)+' ('+str(np.round(100*val/total_num_discrepancies_with_tp,1))+'%)' for val in df_categories['All findings'].values]

#Rename columns
df_categories.rename(columns={'FP': 'AI found, reader missed', 'FN': 'AI missed, reader found', 'TP':'Both found'}, inplace=True)

df_categories

In [None]:
# df_categories.to_excel('nodule_types_high.xlsx')

Benign lymph node types

In [10]:
#Further analysis for nodule/lymph node subcategories - Not kept for now
#Detailed analysis of what detected or not from both AI and reader for each category in nodules & lymph nodes 

df_categories=pd.DataFrame(columns=['TP','FP','FN'], #below index with the correct order as above
                          index=['typical PFNs & periphysural lymph nodes',
                                 'bronchiovascular lymph nodes'
                                ])

# df_categories.index.name = 'GT by radiologists for discrepancies'

df_categories['FP']=[#sum([len(x) for x in atyp_FP_low.values()])+sum([len(x) for x in atyp_FP_high.values()]),
                     sum([len(x) for x in per_FP_low.values()])+sum([len(x) for x in per_FP_high.values()]),
                     sum([len(x) for x in bronchioperi_FP_low.values()])+sum([len(x) for x in bronchioperi_FP_high.values()])]

df_categories['FN']=[#sum([len(x) for x in atyp_FN_low.values()])+sum([len(x) for x in atyp_FN_high.values()]),
                     sum([len(x) for x in per_FN_low.values()])+sum([len(x) for x in per_FN_high.values()]),
                     sum([len(x) for x in bronchioperi_FN_low.values()])+sum([len(x) for x in bronchioperi_FN_high.values()])]

df_categories['TP']=[#atypical_triangular_low_nod_only+atypical_triangular_high_nod_only,
                     per_fisu_low_lymph+per_fisu_high_lymph,
                     peri_bronch_low_lymph+peri_bronch_high_lymph]

df_categories['All findings']=df_categories['FP']+df_categories['FN']+df_categories['TP']

df_categories.loc['Total']= df_categories.sum()

total_num_discrepancies_with_tp=df_categories.iloc[:-1,:-1].sum().sum() #To be used in next cells for percentages

all_findings=df_categories.iloc[:-1,:-1].sum().sum()

percentage_fp=np.round((df_categories['FP']/total_num_discrepancies_with_tp)*100,1) 
df_categories['FP']=[str(value[1])+' ('+str(percentage_fp[index])+'%)' for index,value in enumerate(df_categories['FP'].items())]

percentage_fn=np.round((df_categories['FN']/total_num_discrepancies_with_tp)*100,1) 
df_categories['FN']=[str(value[1])+' ('+str(percentage_fn[index])+'%)' for index,value in enumerate(df_categories['FN'].items())]

percentage_fn=np.round((df_categories['TP']/total_num_discrepancies_with_tp)*100,1) 
df_categories['TP']=[str(value[1])+' ('+str(percentage_fn[index])+'%)' for index,value in enumerate(df_categories['TP'].items())]

df_categories['All findings']=[str(val)+' ('+str(np.round(100*val/total_num_discrepancies_with_tp,1))+'%)' for val in df_categories['All findings'].values]

#Rename columns
df_categories.rename(columns={'FP': 'AI found, reader missed', 'FN': 'AI missed, reader found', 'TP':'Both found'}, inplace=True)

df_categories

In [None]:
df_categories.to_excel('lymph_types_all.xlsx')

In [11]:
#Low BMI
#Further analysis for nodule/lymph node subcategories - Not kept for now
#Detailed analysis of what detected or not from both AI and reader for each category in nodules & lymph nodes 

df_categories=pd.DataFrame(columns=['TP','FP','FN'], #below index with the correct order as above
                          index=[#'atypical PFNs',
                                 'typical PFNs & periphysural lymph nodes',
                                 'bronchiovascular lymph nodes'
                                ])

# df_categories.index.name = 'GT by radiologists for discrepancies'

df_categories['FP']=[#sum([len(x) for x in atyp_FP_low.values()]),
                     sum([len(x) for x in per_FP_low.values()]),
                     sum([len(x) for x in bronchioperi_FP_low.values()])]

df_categories['FN']=[#sum([len(x) for x in atyp_FN_low.values()]),
                     sum([len(x) for x in per_FN_low.values()]),
                     sum([len(x) for x in bronchioperi_FN_low.values()])]

df_categories['TP']=[#atypical_triangular_low_nod_only,
                     per_fisu_low_lymph,
                     peri_bronch_low_lymph]

df_categories['All findings']=df_categories['FP']+df_categories['FN']+df_categories['TP']

df_categories.loc['Total']= df_categories.sum()

total_num_discrepancies_with_tp=df_categories.iloc[:-1,:-1].sum().sum() #To be used in next cells for percentages

all_findings=df_categories.iloc[:-1,:-1].sum().sum()

percentage_fp=np.round((df_categories['FP']/total_num_discrepancies_with_tp)*100,1) 
df_categories['FP']=[str(value[1])+' ('+str(percentage_fp[index])+'%)' for index,value in enumerate(df_categories['FP'].items())]

percentage_fn=np.round((df_categories['FN']/total_num_discrepancies_with_tp)*100,1) 
df_categories['FN']=[str(value[1])+' ('+str(percentage_fn[index])+'%)' for index,value in enumerate(df_categories['FN'].items())]

percentage_fn=np.round((df_categories['TP']/total_num_discrepancies_with_tp)*100,1) 
df_categories['TP']=[str(value[1])+' ('+str(percentage_fn[index])+'%)' for index,value in enumerate(df_categories['TP'].items())]

df_categories['All findings']=[str(val)+' ('+str(np.round(100*val/total_num_discrepancies_with_tp,1))+'%)' for val in df_categories['All findings'].values]

#Rename columns
df_categories.rename(columns={'FP': 'AI found, reader missed', 'FN': 'AI missed, reader found', 'TP':'Both found'}, inplace=True)

df_categories

In [None]:
# df_categories.to_excel('lymph_types_low.xlsx')

In [12]:
#High BMI
#Further analysis for nodule/lymph node subcategories - Not kept for now
#Detailed analysis of what detected or not from both AI and reader for each category in nodules & lymph nodes 

df_categories=pd.DataFrame(columns=['TP','FP','FN'], #below index with the correct order as above
                          index=[#'atypical PFNs',
                                 'typical PFNs & periphysural lymph nodes',
                                 'bronchiovascular lymph nodes'
                                ])

# df_categories.index.name = 'GT by radiologists for discrepancies'

df_categories['FP']=[#sum([len(x) for x in atyp_FP_high.values()]),
                     sum([len(x) for x in per_FP_high.values()]),
                     sum([len(x) for x in bronchioperi_FP_high.values()])]

df_categories['FN']=[#sum([len(x) for x in atyp_FN_high.values()]),
                     sum([len(x) for x in per_FN_high.values()]),
                     sum([len(x) for x in bronchioperi_FN_high.values()])]

df_categories['TP']=[#atypical_triangular_high_nod_only,
                    per_fisu_high_lymph,
                     peri_bronch_high_lymph]

df_categories['All findings']=df_categories['FP']+df_categories['FN']+df_categories['TP']

df_categories.loc['Total']= df_categories.sum()

total_num_discrepancies_with_tp=df_categories.iloc[:-1,:-1].sum().sum() #To be used in next cells for percentages

all_findings=df_categories.iloc[:-1,:-1].sum().sum()

percentage_fp=np.round((df_categories['FP']/total_num_discrepancies_with_tp)*100,1) 
df_categories['FP']=[str(value[1])+' ('+str(percentage_fp[index])+'%)' for index,value in enumerate(df_categories['FP'].items())]

percentage_fn=np.round((df_categories['FN']/total_num_discrepancies_with_tp)*100,1) 
df_categories['FN']=[str(value[1])+' ('+str(percentage_fn[index])+'%)' for index,value in enumerate(df_categories['FN'].items())]

percentage_fn=np.round((df_categories['TP']/total_num_discrepancies_with_tp)*100,1) 
df_categories['TP']=[str(value[1])+' ('+str(percentage_fn[index])+'%)' for index,value in enumerate(df_categories['TP'].items())]

df_categories['All findings']=[str(val)+' ('+str(np.round(100*val/total_num_discrepancies_with_tp,1))+'%)' for val in df_categories['All findings'].values]

#Rename columns
df_categories.rename(columns={'FP': 'AI found, reader missed', 'FN': 'AI missed, reader found', 'TP':'Both found'}, inplace=True)

df_categories

In [None]:
# df_categories.to_excel('lymph_types_high.xlsx')

### Similar analysis as the one performed above (low vs high BMI, volume subgroups) but including lymph nodes this time

Low BMI (nodules and lymph nodes)

In [None]:
TP_AI= TP_AI_low=TP_low+FP_nods_low
FP_AI=FP_AI_low=FP_nonods_low
FN_AI=FN_AI_low=TP_read_only=FN_nods_low #nodules and lymph nodes detected only by the reader

TP_read=TP_read_low=TP_low+FN_nods_low
FP_read=FP_read_low=FN_nonods_low
FN_read=FN_read_low=TP_AI_only=FP_nods_low #nodules and lymph nodes detected only by AI

TP_both=TP_low #Common nodules and lymph nodules detected by both AI and reader

#Print the above
print("Low BMI numbers")
print('TP_AI: ',TP_AI)
print('FP_AI: ',FP_AI)
print('FN_AI: ',FN_AI)
print('TP_read: ',TP_read)
print('FP_read: ',FP_read)
print('FN_read: ',FN_read)
print('TP_both: ',TP_both)

Low BMI numbers
TP_AI:  109
FP_AI:  96
FN_AI:  27
TP_read:  114
FP_read:  28
FN_read:  22
TP_both:  87


In [None]:
#Table split in two tables: one for low and one for high BMI having also percentages.

#Assessing detection performance - For nodules + lymph nodes 
df_all_new=pd.DataFrame(columns=['sensitivity (95% CI)','PPV (95% CI)','F1 score (95% CI)','nodules detected','non-nodules detected','nodules missed'],
                        index=['AI, low', 'reader, low'])

#AI nodules only emph
df_all_new.iloc[0,0]=np.round(sensitivity(TP_AI,FN_AI),2)
df_all_new.iloc[0,1]=np.round(PPV(TP_AI,FP_AI),2)
df_all_new.iloc[0,2]=np.round(F1score(TP_AI,FP_AI,FN_AI),2)
df_all_new.iloc[0,3]=TP_AI
df_all_new.iloc[0,4]=FP_AI
df_all_new.iloc[0,5]=FN_AI

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_AI, FP_AI, FN_AI, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[0]=str(df_all_new['sensitivity (95% CI)'].iloc[0])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[0]=str(df_all_new['PPV (95% CI)'].iloc[0])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[0]=str(df_all_new['F1 score (95% CI)'].iloc[0])+' '+str(tuple(ci_f1_ai))

#reader nodules
df_all_new.iloc[1,0]=np.round(sensitivity(TP_read,FN_read),2)
df_all_new.iloc[1,1]=np.round(PPV(TP_read,FP_read),2)
df_all_new.iloc[1,2]=np.round(F1score(TP_read,FP_read,FN_read),2)
df_all_new.iloc[1,3]=TP_read
df_all_new.iloc[1,4]=FP_read
df_all_new.iloc[1,5]=FN_read

sensitivity_confidence_interval_read, PPV_confidence_interval_read, F1_confidence_interval_read\
    = sensitivity_and_specificity_with_confidence_intervals(TP_read, FP_read, FN_read, 0, alpha=0.95)

ci_sens_read=[np.round(x,2) for x in sensitivity_confidence_interval_read]
ci_ppv_read=[np.round(x,2) for x in PPV_confidence_interval_read]
ci_f1_read=[np.round(x,2) for x in F1_confidence_interval_read]

df_all_new['sensitivity (95% CI)'].iloc[1]=str(df_all_new['sensitivity (95% CI)'].iloc[1])+' '+str(tuple(ci_sens_read))
df_all_new['PPV (95% CI)'].iloc[1]=str(df_all_new['PPV (95% CI)'].iloc[1])+' '+str(tuple(ci_ppv_read))
df_all_new['F1 score (95% CI)'].iloc[1]=str(df_all_new['F1 score (95% CI)'].iloc[1])+' '+str(tuple(ci_f1_read))


df_all_new['All findings']=df_all_new['nodules detected']+df_all_new['non-nodules detected']+df_all_new['nodules missed']
df_all_new.loc['Total']= df_all_new.sum()
df_all_new.loc['Total'].iloc[0:3]=''


all_findings=df_all_new.iloc[:-1,3:-1].sum().sum()

for i in range(2):
    row_all=np.sum(df_all_new.iloc[i][3:6].values)

    percentage_tp=np.round((df_all_new.iloc[i][3]/row_all)*100,1) 
    df_all_new['nodules detected'].iloc[i]=str(df_all_new.iloc[i][3])+' ('+str(percentage_tp)+'%)'
    percentage_fp=np.round((df_all_new.iloc[i][4]/row_all)*100,1) 
    df_all_new['non-nodules detected'].iloc[i]=str(df_all_new.iloc[i][4])+' ('+str(percentage_fp)+'%)'
    percentage_fn=np.round((df_all_new.iloc[i][5]/row_all)*100,1) 
    df_all_new['nodules missed'].iloc[i]=str(df_all_new.iloc[i][5])+' ('+str(percentage_fn)+'%)'

    df_all_new['All findings'].iloc[i]=str(df_all_new.iloc[i][6])+' (100%)'

df_all_new

Unnamed: 0,sensitivity (95% CI),PPV (95% CI),F1 score (95% CI),nodules detected,non-nodules detected,nodules missed,All findings
"AI, low","0.8 (0.72, 0.86)","0.53 (0.46, 0.6)","0.64 (0.59, 0.69)",109 (47.0%),96 (41.4%),27 (11.6%),232 (100%)
"reader, low","0.84 (0.76, 0.89)","0.8 (0.73, 0.86)","0.82 (0.77, 0.86)",114 (69.5%),28 (17.1%),22 (13.4%),164 (100%)
Total,,,,223,124,49,396


In [None]:
df_all_new.to_excel('nodules_lymph_low.xlsx')

In [None]:
#McNemar's test to compare Reader vs AI (using consensus panel)
#Below format is: [[Both AI found and reader found, reader missed and AI found], [Reader found and AI missed, 0]]

#For nodules
data=[[TP_both, FN_read],
        [FN_AI,0]]
# print(data)

# McNemar's Test without continuity correction
print("McNemar's test (nodules and lymphs), AI_vs_Reader with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue) 


# #For FPs
# data=[[0, FP_AI], 
#         [FP_read, 0]]
# # print(data)

# # McNemar's Test without continuity correction
# print("For FP findings, with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue)

McNemar's test (nodules and lymphs), AI_vs_Reader with continuity correction (not exact) p value is 0.5677091661973526


In [None]:
# #AI vs GT, TP_both included
# # Table looks like below:
# #              GT
# #             Yes                       No
# # AI   Yes    TP_both+TP_AI_only      FP_AI
# #      No     FN_AI                   0

# list_of_lists=[[1 for x in range(TP_both)],[1 for x in range(TP_AI_only)],[0 for x in range(FP_AI) ],[1 for x in range(FN_AI) ] ] 
# rater_GT=[item for sublist in list_of_lists for item in sublist]

# list_of_lists=[[1 for x in range(TP_both)], [1 for x in range(TP_AI_only)],[1 for x in range(FP_AI) ],[0 for x in range(FN_AI) ]]
# rater_AI=[item for sublist in list_of_lists for item in sublist]

# print("AI vs consensus (for nodules and lymphs), kappa is ",cohen_kappa_score(rater_GT, rater_AI))

# #Reader vs GT, TP_both included
# # Table looks like below:
# #                    GT
# #                   Yes                       No
# # Reader   Yes    TP_both+TP_read_only      FP_read
# #           No     FN_read                   0

# list_of_lists=[[1 for x in range(TP_both)],[1 for x in range(TP_read_only)],[0 for x in range(FP_read) ],[1 for x in range(FN_read) ] ] 
# rater_GT=[item for sublist in list_of_lists for item in sublist]

# list_of_lists=[[1 for x in range(TP_both)],[1 for x in range(TP_read_only)],[1 for x in range(FP_read) ],[0 for x in range(FN_read) ]]
# rater_read=[item for sublist in list_of_lists for item in sublist]

# print("Reader vs consensus (for non-nodules only), kappa is ",cohen_kappa_score(rater_GT, rater_read))



# #Due to small numbers of TP, FP and FN, we cannot calculate the kappa for consensus vs AI (or reader). Better for AI vs reader.

# #Reader vs AI, TP_both included

# # Table looks like below (for nodule and lymph nodes (not FP)):
# #                    AI
# #                   Yes                       No
# # Reader   Yes    TP_both                  FN_AI
# #           No     FN_read                   0

# list_of_lists=[[1 for x in range(TP_both)],[1 for x in range(FN_read) ],[0 for x in range(FN_AI)]] 
# rater_AI=[item for sublist in list_of_lists for item in sublist]

# list_of_lists=[[1 for x in range(TP_both)],[0 for x in range(FN_read) ],[1 for x in range(FN_AI)]]
# rater_read=[item for sublist in list_of_lists for item in sublist]

# print("Reader vs AI (for nodules/lymph nodes only), kappa is ",cohen_kappa_score(rater_AI, rater_read))


# # Table looks like below (for FP):
# #                    AI
# #                   Yes                       No
# # Reader   Yes        0                     FP_read
# #           No     FP_AI                       0

# list_of_lists=[[0 for x in range(FP_read) ],[1 for x in range(FP_AI)]] 
# rater_AI=[item for sublist in list_of_lists for item in sublist]

# list_of_lists=[[1 for x in range(FP_read) ],[0 for x in range(FP_AI)]]
# rater_read=[item for sublist in list_of_lists for item in sublist]

# print("Reader vs AI (for non-nodules only), kappa is ",cohen_kappa_score(rater_AI, rater_read))

# #For both nodules/lymphs and non-nodules:
# list_of_lists=[[0 for x in range(FP_read) ],[1 for x in range(FP_AI)],[1 for x in range(TP_both)],[1 for x in range(FN_read) ],[0 for x in range(FN_AI)]] 
# rater_AI=[item for sublist in list_of_lists for item in sublist]

# list_of_lists=[[1 for x in range(FP_read) ],[0 for x in range(FP_AI)],[1 for x in range(TP_both)],[0 for x in range(FN_read) ],[1 for x in range(FN_AI)]]
# rater_read=[item for sublist in list_of_lists for item in sublist]

# print("Reader vs AI (for both nods/lymphs and non-nodules), kappa is ",cohen_kappa_score(rater_AI, rater_read))

High BMI (nodules and lymph nodes)

In [None]:
TP_AI=TP_AI_high=TP_high+FP_nods_high
FP_AI=FP_AI_high=FP_nonods_high
FN_AI=FN_AI_high=TP_read_only=FN_nods_high #nodules and lymph nodes detected only by the reader

TP_read=TP_read_high=TP_high+FN_nods_high
FP_read=FP_read_high=FN_nonods_high
FN_read=FN_read_high=TP_AI_only=FP_nods_high #nodules and lymph nodes detected only by AI

TP_both=TP_high #Common nodules and lymph nodules detected by both AI and reader

#Print the above
print("High BMI numbers")
print('TP_AI: ',TP_AI)
print('FP_AI: ',FP_AI)
print('FN_AI: ',FN_AI)
print('TP_read: ',TP_read)
print('FP_read: ',FP_read)
print('FN_read: ',FN_read)
print('TP_both: ',TP_both)

High BMI numbers
TP_AI:  98
FP_AI:  53
FN_AI:  33
TP_read:  100
FP_read:  9
FN_read:  31
TP_both:  67


In [None]:
#Assessing detection performance - For nodules and lymph nodes 
df_all_new=pd.DataFrame(columns=['sensitivity (95% CI)','PPV (95% CI)','F1 score (95% CI)','nodules detected','non-nodules detected','nodules missed'],
                        index=['AI, high', 'reader, high' ])


#AI nodules + lymph nodes
df_all_new.iloc[0,0]=np.round(sensitivity(TP_AI,FN_AI),2)
df_all_new.iloc[0,1]=np.round(PPV(TP_AI,FP_AI),2)
df_all_new.iloc[0,2]=np.round(F1score(TP_AI,FP_AI,FN_AI),2)
df_all_new.iloc[0,3]=TP_AI
df_all_new.iloc[0,4]=FP_AI
df_all_new.iloc[0,5]=FN_AI

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_AI, FP_AI, FN_AI, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[0]=str(df_all_new['sensitivity (95% CI)'].iloc[0])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[0]=str(df_all_new['PPV (95% CI)'].iloc[0])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[0]=str(df_all_new['F1 score (95% CI)'].iloc[0])+' '+str(tuple(ci_f1_ai))

#reader nodules + lymph nodes
df_all_new.iloc[1,0]=np.round(sensitivity(TP_read,FN_read),2)
df_all_new.iloc[1,1]=np.round(PPV(TP_read,FP_read),2)
df_all_new.iloc[1,2]=np.round(F1score(TP_read,FP_read,FN_read),2)
df_all_new.iloc[1,3]=TP_read
df_all_new.iloc[1,4]=FP_read
df_all_new.iloc[1,5]=FN_read

sensitivity_confidence_interval_read, PPV_confidence_interval_read, F1_confidence_interval_read\
    = sensitivity_and_specificity_with_confidence_intervals(TP_read, FP_read, FN_read, 0, alpha=0.95)

ci_sens_read=[np.round(x,2) for x in sensitivity_confidence_interval_read]
ci_ppv_read=[np.round(x,2) for x in PPV_confidence_interval_read]
ci_f1_read=[np.round(x,2) for x in F1_confidence_interval_read]

df_all_new['sensitivity (95% CI)'].iloc[1]=str(df_all_new['sensitivity (95% CI)'].iloc[1])+' '+str(tuple(ci_sens_read))
df_all_new['PPV (95% CI)'].iloc[1]=str(df_all_new['PPV (95% CI)'].iloc[1])+' '+str(tuple(ci_ppv_read))
df_all_new['F1 score (95% CI)'].iloc[1]=str(df_all_new['F1 score (95% CI)'].iloc[1])+' '+str(tuple(ci_f1_read))

df_all_new['All findings']=df_all_new['nodules detected']+df_all_new['non-nodules detected']+df_all_new['nodules missed']
df_all_new.loc['Total']= df_all_new.sum()
df_all_new.loc['Total'].iloc[0:3]=''

all_findings=df_all_new.iloc[:-1,3:-1].sum().sum()

for i in range(2):
    row_all=np.sum(df_all_new.iloc[i][3:6].values)
    percentage_fp=np.round((df_all_new.iloc[i][4]/row_all)*100,1) 
    df_all_new['non-nodules detected'].iloc[i]=str(df_all_new.iloc[i][4])+' ('+str(percentage_fp)+'%)'
    percentage_tp=np.round((df_all_new.iloc[i][3]/row_all)*100,1) 
    df_all_new['nodules detected'].iloc[i]=str(df_all_new.iloc[i][3])+' ('+str(percentage_tp)+'%)'
    percentage_fn=np.round((df_all_new.iloc[i][5]/row_all)*100,1) 
    df_all_new['nodules missed'].iloc[i]=str(df_all_new.iloc[i][5])+' ('+str(percentage_fn)+'%)'

    df_all_new['All findings'].iloc[i]=str(df_all_new.iloc[i][6])+' (100%)'

df_all_new

Unnamed: 0,sensitivity (95% CI),PPV (95% CI),F1 score (95% CI),nodules detected,non-nodules detected,nodules missed,All findings
"AI, high","0.75 (0.66, 0.82)","0.65 (0.57, 0.72)","0.7 (0.64, 0.75)",98 (53.3%),53 (28.8%),33 (17.9%),184 (100%)
"reader, high","0.76 (0.68, 0.83)","0.92 (0.84, 0.96)","0.83 (0.78, 0.88)",100 (71.4%),9 (6.4%),31 (22.1%),140 (100%)
Total,,,,198,62,64,324


In [None]:
df_all_new.to_excel('nodules_lymph_high.xlsx')

In [None]:
#McNemar's test to compare Reader vs AI (using consensus panel)
#Below format is: [[Both AI found and reader found, reader missed and AI found], [Reader found and AI missed, 0]]

#For nodules
data=[[TP_both, FN_read],
        [FN_AI,0]]
# print(data)

# McNemar's Test without continuity correction
print("McNemar's test (nodules and lymphs), AI_vs_Reader with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue) 


# #For FPs
# data=[[0, FP_AI], 
#         [FP_read, 0]]
# # print(data)

# # McNemar's Test without continuity correction
# print("For FP findings, with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue)

McNemar's test (nodules and lymphs), AI_vs_Reader with continuity correction (not exact) p value is 0.9005235503397742


In [None]:
# #AI vs GT, TP_both included
# # Table looks like below:
# #              GT
# #             Yes                       No
# # AI   Yes    TP_both+TP_AI_only      FP_AI
# #      No     FN_AI                   0

# list_of_lists=[[1 for x in range(TP_both)],[1 for x in range(TP_AI_only)],[0 for x in range(FP_AI) ],[1 for x in range(FN_AI) ] ] 
# rater_GT=[item for sublist in list_of_lists for item in sublist]

# list_of_lists=[[1 for x in range(TP_both)], [1 for x in range(TP_AI_only)],[1 for x in range(FP_AI) ],[0 for x in range(FN_AI) ]]
# rater_AI=[item for sublist in list_of_lists for item in sublist]

# print("AI vs consensus (for nodules and lymphs), kappa is ",cohen_kappa_score(rater_GT, rater_AI))

# #Reader vs GT, TP_both included
# # Table looks like below:
# #                    GT
# #                   Yes                       No
# # Reader   Yes    TP_both+TP_read_only      FP_read
# #           No     FN_read                   0

# list_of_lists=[[1 for x in range(TP_both)],[1 for x in range(TP_read_only)],[0 for x in range(FP_read) ],[1 for x in range(FN_read) ] ] 
# rater_GT=[item for sublist in list_of_lists for item in sublist]

# list_of_lists=[[1 for x in range(TP_both)],[1 for x in range(TP_read_only)],[1 for x in range(FP_read) ],[0 for x in range(FN_read) ]]
# rater_read=[item for sublist in list_of_lists for item in sublist]

# print("Reader vs consensus (for non-nodules only), kappa is ",cohen_kappa_score(rater_GT, rater_read))


# #Due to small numbers of TP, FP and FN, we cannot calculate the kappa for consensus vs AI (or reader). Better for AI vs reader.

# #Reader vs AI, TP_both included

# # Table looks like below (for nodule and lymph nodes (not FP)):
# #                    AI
# #                   Yes                       No
# # Reader   Yes    TP_both                  FN_AI
# #           No     FN_read                   0

# list_of_lists=[[1 for x in range(TP_both)],[1 for x in range(FN_read) ],[0 for x in range(FN_AI)]] 
# rater_AI=[item for sublist in list_of_lists for item in sublist]

# list_of_lists=[[1 for x in range(TP_both)],[0 for x in range(FN_read) ],[1 for x in range(FN_AI)]]
# rater_read=[item for sublist in list_of_lists for item in sublist]

# print("Reader vs AI (for nodules/lymph nodes only), kappa is ",cohen_kappa_score(rater_AI, rater_read))


# # Table looks like below (for FP):
# #                    AI
# #                   Yes                       No
# # Reader   Yes        0                     FP_read
# #           No     FP_AI                       0

# list_of_lists=[[0 for x in range(FP_read) ],[1 for x in range(FP_AI)]] 
# rater_AI=[item for sublist in list_of_lists for item in sublist]

# list_of_lists=[[1 for x in range(FP_read) ],[0 for x in range(FP_AI)]]
# rater_read=[item for sublist in list_of_lists for item in sublist]

# print("Reader vs AI (for non-nodules only), kappa is ",cohen_kappa_score(rater_AI, rater_read))


# #For both nodules/lymphs and non-nodules:
# list_of_lists=[[0 for x in range(FP_read) ],[1 for x in range(FP_AI)],[1 for x in range(TP_both)],[1 for x in range(FN_read) ],[0 for x in range(FN_AI)]] 
# rater_AI=[item for sublist in list_of_lists for item in sublist]

# list_of_lists=[[1 for x in range(FP_read) ],[0 for x in range(FP_AI)],[1 for x in range(TP_both)],[0 for x in range(FN_read) ],[1 for x in range(FN_AI)]]
# rater_read=[item for sublist in list_of_lists for item in sublist]

# print("Reader vs AI (for both nods/lymphs and non-nodules), kappa is ",cohen_kappa_score(rater_AI, rater_read))

Note: We should not perform comparisons between emphysema/non-emphysema groups using McNemar's test - It should only be used in the same group of participants - better to use Mann-Whitney!

#### Same analysis for volume subgroups (nodules and lymph nodes)

Low BMI volume subgroups

In [None]:
TP_AI_100=TP_low_30_100+ai_only_nods_low_30_100+ai_lymph_low_30_100 
FP_AI_100=ai_nonods_low_30_100
FN_AI_100=reader_nods_low_30_100 #nodules of reader including lymph nodes

TP_read_100=TP_low_30_100+reader_nods_low_30_100
FP_read_100=reader_nonods_low_30_100
FN_read_100=ai_only_nods_low_30_100+ai_lymph_low_30_100 #nodules of AI including lymph nodes

TP_AI_100_300=TP_low_100_300+ai_only_nods_low_100_300+ai_lymph_low_100_300
FP_AI_100_300=ai_nonods_low_100_300
FN_AI_100_300=reader_nods_low_100_300 #nodules of reader excluding lymph nodes

TP_read_100_300=TP_low_100_300+reader_nods_low_100_300
FP_read_100_300=reader_nonods_low_100_300
FN_read_100_300=ai_only_nods_low_100_300+ai_lymph_low_100_300 #nodules of AI excluding lymph nodes

TP_AI_300=TP_low_300+ai_only_nods_low_300+ai_lymph_low_300
FP_AI_300=ai_nonods_low_300
FN_AI_300=reader_nods_low_300 #nodules of reader excluding lymph nodes

TP_read_300=TP_low_300+reader_nods_low_300
FP_read_300=reader_nonods_low_300
FN_read_300=ai_only_nods_low_300+ai_lymph_low_300 #nodules of AI excluding lymph nodes

TP_both_100=TP_low_30_100
TP_both_100_300=TP_low_100_300
TP_both_300=TP_low_300

#Print the above
print("Low BMI numbers")
print('TP_AI_100: ',TP_AI_100)
print('FP_AI_100: ',FP_AI_100)
print('FN_AI_100: ',FN_AI_100)
print('TP_read_100: ',TP_read_100)
print('FP_read_100: ',FP_read_100)
print('FN_read_100: ',FN_read_100)
print('TP_AI_100_300: ',TP_AI_100_300)
print('FP_AI_100_300: ',FP_AI_100_300)
print('FN_AI_100_300: ',FN_AI_100_300)
print('TP_read_100_300: ',TP_read_100_300)
print('FP_read_100_300: ',FP_read_100_300)
print('FN_read_100_300: ',FN_read_100_300)
print('TP_AI_300: ',TP_AI_300)
print('FP_AI_300: ',FP_AI_300)
print('FN_AI_300: ',FN_AI_300)
print('TP_read_300: ',TP_read_300)
print('FP_read_300: ',FP_read_300)
print('FN_read_300: ',FN_read_300)
print('TP_both_100: ',TP_both_100)
print('TP_both_100_300: ',TP_both_100_300)
print('TP_both_300: ',TP_both_300)

Low BMI numbers
TP_AI_100:  81
FP_AI_100:  26
FN_AI_100:  26
TP_read_100:  92
FP_read_100:  22
FN_read_100:  15
TP_AI_100_300:  21
FP_AI_100_300:  43
FN_AI_100_300:  1
TP_read_100_300:  17
FP_read_100_300:  5
FN_read_100_300:  5
TP_AI_300:  7
FP_AI_300:  27
FN_AI_300:  0
TP_read_300:  5
FP_read_300:  1
FN_read_300:  2
TP_both_100:  66
TP_both_100_300:  16
TP_both_300:  5


In [13]:
#Table with Reader, AI and consensus findings for nodules and lymph nodes for low BMI
df_all_new=pd.DataFrame(columns=['Reader','AI','Consensus'],
                        index=['Non-nodules','TP 30-100mm3','TP 100-300mm3', 'TP >300mm3'])

df_all_new.index.name = 'Low BMI cases'

df_all_new['Reader']=[FP_read_100+FP_read_100_300+FP_read_300,TP_both_100+FN_AI_100,TP_both_100_300+FN_AI_100_300, TP_both_300+FN_AI_300]
df_all_new['AI']=[FP_AI_100+FP_AI_100_300+FP_AI_300,TP_both_100+FN_read_100,TP_both_100_300+FN_read_100_300,TP_both_300+FN_read_300]
df_all_new['Consensus']=[FP_AI_100+FP_AI_100_300+FP_AI_300+FP_read_100+FP_read_100_300+FP_read_300,FN_AI_100+FN_read_100, FN_AI_100_300+FN_read_100_300, FN_AI_300+FN_read_300]

df_all_new

In [None]:
df_all_new.to_excel('non_nodules_and_TP_low.xlsx')

In [None]:
#Comparison between reader and AI for volume subgroups
df_all_new=pd.DataFrame(columns=['sensitivity (95% CI)','PPV (95% CI)','F1 score (95% CI)','nodules correctly detected','non-nodules incorrectly detected','nodules missed'], 
                        index=['AI, low 30-100mm3', 'AI, low 100-300mm3','AI, low 300+mm3','',
                               'reader, low 30-100mm3','reader, low 100-300mm3', 'reader, low 300+mm3',''
                              ])

df_all_new.index.name = 'GT by radiologists for discrepancies' 

df_all_new.iloc[0,0]=np.round(sensitivity(TP_AI_100,FN_AI_100),2)
df_all_new.iloc[0,1]=np.round(PPV(TP_AI_100,FP_AI_100),2)
df_all_new.iloc[0,2]=np.round(F1score(TP_AI_100,FP_AI_100,FN_AI_100),2)
df_all_new.iloc[0,3]=TP_AI_100
df_all_new.iloc[0,4]=FP_AI_100
df_all_new.iloc[0,5]=FN_AI_100

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_AI_100, FP_AI_100, FN_AI_100, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[0]=str(df_all_new['sensitivity (95% CI)'].iloc[0])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[0]=str(df_all_new['PPV (95% CI)'].iloc[0])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[0]=str(df_all_new['F1 score (95% CI)'].iloc[0])+' '+str(tuple(ci_f1_ai))


df_all_new.iloc[4,0]=np.round(sensitivity(TP_read_100,FN_read_100),2)
df_all_new.iloc[4,1]=np.round(PPV(TP_read_100,FP_read_100),2)
df_all_new.iloc[4,2]=np.round(F1score(TP_read_100,FP_read_100,FN_read_100),2)
df_all_new.iloc[4,3]=TP_read_100
df_all_new.iloc[4,4]=FP_read_100
df_all_new.iloc[4,5]=FN_read_100

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_read_100, FP_read_100, FN_read_100, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[4]=str(df_all_new['sensitivity (95% CI)'].iloc[4])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[4]=str(df_all_new['PPV (95% CI)'].iloc[4])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[4]=str(df_all_new['F1 score (95% CI)'].iloc[4])+' '+str(tuple(ci_f1_ai))


df_all_new.iloc[1,0]=np.round(sensitivity(TP_AI_100_300,FN_AI_100_300),2)
df_all_new.iloc[1,1]=np.round(PPV(TP_AI_100_300,FP_AI_100_300),2)
df_all_new.iloc[1,2]=np.round(F1score(TP_AI_100_300,FP_AI_100_300,FN_AI_100_300),2)
df_all_new.iloc[1,3]=TP_AI_100_300
df_all_new.iloc[1,4]=FP_AI_100_300
df_all_new.iloc[1,5]=FN_AI_100_300

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_AI_100_300, FP_AI_100_300, FN_AI_100_300, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[1]=str(df_all_new['sensitivity (95% CI)'].iloc[1])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[1]=str(df_all_new['PPV (95% CI)'].iloc[1])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[1]=str(df_all_new['F1 score (95% CI)'].iloc[1])+' '+str(tuple(ci_f1_ai))


df_all_new.iloc[5,0]=np.round(sensitivity(TP_read_100_300,FN_read_100_300),2)
df_all_new.iloc[5,1]=np.round(PPV(TP_read_100_300,FP_read_100_300),2)
df_all_new.iloc[5,2]=np.round(F1score(TP_read_100_300,FP_read_100_300,FN_read_100_300),2)
df_all_new.iloc[5,3]=TP_read_100_300
df_all_new.iloc[5,4]=FP_read_100_300
df_all_new.iloc[5,5]=FN_read_100_300

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_read_100_300, FP_read_100_300, FN_read_100_300, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[5]=str(df_all_new['sensitivity (95% CI)'].iloc[5])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[5]=str(df_all_new['PPV (95% CI)'].iloc[5])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[5]=str(df_all_new['F1 score (95% CI)'].iloc[5])+' '+str(tuple(ci_f1_ai))


df_all_new.iloc[2,0]=np.round(sensitivity(TP_AI_300,FN_AI_300),2)
df_all_new.iloc[2,1]=np.round(PPV(TP_AI_300,FP_AI_300),2)
df_all_new.iloc[2,2]=np.round(F1score(TP_AI_300,FP_AI_300,FN_AI_300),2)
df_all_new.iloc[2,3]=TP_AI_300
df_all_new.iloc[2,4]=FP_AI_300
df_all_new.iloc[2,5]=FN_AI_300

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_AI_300, FP_AI_300, FN_AI_300, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[2]=str(df_all_new['sensitivity (95% CI)'].iloc[2])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[2]=str(df_all_new['PPV (95% CI)'].iloc[2])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[2]=str(df_all_new['F1 score (95% CI)'].iloc[2])+' '+str(tuple(ci_f1_ai))


df_all_new.iloc[6,0]=np.round(sensitivity(TP_read_300,FN_read_300),2)
df_all_new.iloc[6,1]=np.round(PPV(TP_read_300,FP_read_300),2)
df_all_new.iloc[6,2]=np.round(F1score(TP_read_300,FP_read_300,FN_read_300),2)
df_all_new.iloc[6,3]=TP_read_300
df_all_new.iloc[6,4]=FP_read_300
df_all_new.iloc[6,5]=FN_read_300

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_read_300, FP_read_300, FN_read_300, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[6]=str(df_all_new['sensitivity (95% CI)'].iloc[6])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[6]=str(df_all_new['PPV (95% CI)'].iloc[6])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[6]=str(df_all_new['F1 score (95% CI)'].iloc[6])+' '+str(tuple(ci_f1_ai))


df_all_new.iloc[3,0]=0
df_all_new.iloc[3,1]=0
df_all_new.iloc[3,2]=0
df_all_new.iloc[3,3]=0
df_all_new.iloc[3,4]=0
df_all_new.iloc[3,5]=0


AI_all=np.sum(df_all_new.iloc[0:3,3:].values)
reader_all=np.sum(df_all_new.iloc[4:7,3:].values)

df_all_new['All findings']=df_all_new['nodules correctly detected']+df_all_new['non-nodules incorrectly detected']+df_all_new['nodules missed']

df_all_new.iloc[7,0]=''
df_all_new.iloc[7,1]=''
df_all_new.iloc[7,2]=''
df_all_new.iloc[7,3]=''
df_all_new.iloc[7,4]=''
df_all_new.iloc[7,5]=''
df_all_new.iloc[7,6]=np.sum(df_all_new['All findings'].iloc[4:7])

df_all_new.iloc[3,0]=''
df_all_new.iloc[3,1]=''
df_all_new.iloc[3,2]=''
df_all_new.iloc[3,3]=''
df_all_new.iloc[3,4]=''
df_all_new.iloc[3,5]=''
df_all_new.iloc[3,6]=np.sum(df_all_new['All findings'].iloc[0:3])

for i in range(7):
    if i!=3:
        if i<3:
            sum_all=AI_all
        elif i>3:
            sum_all=reader_all
            
        percentage_tp=np.round((df_all_new.iloc[i][3]/sum_all)*100,1) 
        df_all_new['nodules correctly detected'].iloc[i]=str(df_all_new.iloc[i][3])+' ('+str(percentage_tp)+'%)'

        percentage_fp=np.round((df_all_new.iloc[i][4]/sum_all)*100,1) 
        df_all_new['non-nodules incorrectly detected'].iloc[i]=str(df_all_new.iloc[i][4])+' ('+str(percentage_fp)+'%)'

        percentage_fn=np.round((df_all_new.iloc[i][5]/sum_all)*100,1) 
        df_all_new['nodules missed'].iloc[i]=str(df_all_new.iloc[i][5])+' ('+str(percentage_fn)+'%)'

        df_all_new['All findings'].iloc[i]=str(df_all_new.iloc[i][6])+' ('+str(np.round(100*df_all_new.iloc[i][6]/sum_all,1))+'%)'

    
df_all_new['All findings'].iloc[3]=str(df_all_new.iloc[3][6])+' (100%)'
df_all_new['All findings'].iloc[7]=str(df_all_new.iloc[7][6])+' (100%)'

df_all_new #Detection performance comparison for nodules and lymph nodes

Unnamed: 0_level_0,sensitivity (95% CI),PPV (95% CI),F1 score (95% CI),nodules correctly detected,non-nodules incorrectly detected,nodules missed,All findings
GT by radiologists for discrepancies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"AI, low 30-100mm3","0.76 (0.66, 0.83)","0.76 (0.66, 0.83)","0.76 (0.69, 0.81)",81 (34.9%),26 (11.2%),26 (11.2%),133 (57.3%)
"AI, low 100-300mm3","0.95 (0.75, 1.0)","0.33 (0.22, 0.46)","0.49 (0.38, 0.6)",21 (9.1%),43 (18.5%),1 (0.4%),65 (28.0%)
"AI, low 300+mm3","1.0 (0.56, 0.99)","0.21 (0.09, 0.38)","0.34 (0.21, 0.51)",7 (3.0%),27 (11.6%),0 (0.0%),34 (14.7%)
,,,,,,,232 (100%)
"reader, low 30-100mm3","0.86 (0.78, 0.92)","0.81 (0.72, 0.87)","0.83 (0.78, 0.88)",92 (56.1%),22 (13.4%),15 (9.1%),129 (78.7%)
"reader, low 100-300mm3","0.77 (0.54, 0.91)","0.77 (0.54, 0.91)","0.77 (0.62, 0.88)",17 (10.4%),5 (3.0%),5 (3.0%),27 (16.5%)
"reader, low 300+mm3","0.71 (0.3, 0.95)","0.83 (0.36, 0.99)","0.77 (0.46, 0.94)",5 (3.0%),1 (0.6%),2 (1.2%),8 (4.9%)
,,,,,,,164 (100%)


In [None]:
df_all_new.to_excel('nodules_lymph_volumes_low.xlsx')

In [None]:
data=[[TP_both_100, FN_read_100],
        [FN_AI_100,0]]
# print(data)

# McNemar's Test without continuity correction
print("For nodules and lymph nodes (AI vs reader) of 30-100mm3 with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue)
# print("\n")

# #For FPs
# data=[[0, FP_AI_100], 
#         [FP_read_100, 0]]
# # print(data)

# # McNemar's Test without continuity correction
# print("For FP findings of 30-100mm3, with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue)
# print("\n")



data=[[TP_both_100_300,FN_read_100_300 ], 
        [FN_AI_100_300, 0]]
# print(data)

# McNemar's Test without continuity correction
print("For nodules and lymph nodes (AI vs reader) of 100-300mm3 with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue)
# print("\n")

# #For FPs
# data=[[0, FP_AI_100_300], 
#         [FP_read_100_300, 0]]
# # print(data)

# # McNemar's Test without continuity correction
# print("For FP findings of 100-300mm3, with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue)

For nodules and lymph nodes (AI vs reader) of 30-100mm3 with continuity correction (not exact) p value is 0.11834981273562842
For nodules and lymph nodes (AI vs reader) of 100-300mm3 with continuity correction (not exact) p value is 0.22067136191984324


High BMI volume subgroups

In [None]:
TP_AI_100=TP_high_30_100+ai_only_nods_high_30_100+ai_lymph_high_30_100
FP_AI_100=ai_nonods_high_30_100
FN_AI_100=reader_nods_high_30_100 #nodules of reader excluding lymph nodes

TP_read_100=TP_high_30_100+reader_nods_high_30_100
FP_read_100=reader_nonods_high_30_100
FN_read_100=ai_only_nods_high_30_100+ai_lymph_high_30_100 #nodules of AI excluding lymph nodes

TP_AI_100_300=TP_high_100_300+ai_only_nods_high_100_300+ai_lymph_high_100_300
FP_AI_100_300=ai_nonods_high_100_300
FN_AI_100_300=reader_nods_high_100_300 #nodules of reader excluding lymph nodes

TP_read_100_300=TP_high_100_300+reader_nods_high_100_300
FP_read_100_300=reader_nonods_high_100_300
FN_read_100_300=ai_only_nods_high_100_300+ai_lymph_high_100_300 #nodules of AI excluding lymph nodes

TP_AI_300=TP_high_300+ai_only_nods_high_300+ai_lymph_high_300
FP_AI_300=ai_nonods_high_300
FN_AI_300=reader_nods_high_300 #nodules of reader excluding lymph nodes

TP_read_300=TP_high_300+reader_nods_high_300
FP_read_300=reader_nonods_high_300
FN_read_300=ai_only_nods_high_300+ai_lymph_high_300 #nodules of AI excluding lymph nodes

TP_both_100=TP_high_30_100
TP_both_100_300=TP_high_100_300
TP_both_300=TP_high_300

#Print the above
print("High BMI numbers")
print('TP_AI_100: ',TP_AI_100)
print('FP_AI_100: ',FP_AI_100)
print('FN_AI_100: ',FN_AI_100)
print('TP_read_100: ',TP_read_100)
print('FP_read_100: ',FP_read_100)
print('FN_read_100: ',FN_read_100)
print('TP_AI_100_300: ',TP_AI_100_300)
print('FP_AI_100_300: ',FP_AI_100_300)
print('FN_AI_100_300: ',FN_AI_100_300)
print('TP_read_100_300: ',TP_read_100_300)
print('FP_read_100_300: ',FP_read_100_300)
print('FN_read_100_300: ',FN_read_100_300)
print('TP_AI_300: ',TP_AI_300)
print('FP_AI_300: ',FP_AI_300)
print('FN_AI_300: ',FN_AI_300)
print('TP_read_300: ',TP_read_300)
print('FP_read_300: ',FP_read_300)
print('FN_read_300: ',FN_read_300)
print('TP_both_100: ',TP_both_100)
print('TP_both_100_300: ',TP_both_100_300)
print('TP_both_300: ',TP_both_300)

High BMI numbers
TP_AI_100:  76
FP_AI_100:  11
FN_AI_100:  29
TP_read_100:  79
FP_read_100:  8
FN_read_100:  26
TP_AI_100_300:  18
FP_AI_100_300:  27
FN_AI_100_300:  4
TP_read_100_300:  17
FP_read_100_300:  1
FN_read_100_300:  5
TP_AI_300:  4
FP_AI_300:  15
FN_AI_300:  0
TP_read_300:  4
FP_read_300:  0
FN_read_300:  0
TP_both_100:  50
TP_both_100_300:  13
TP_both_300:  4


In [14]:
#Table with Reader, AI and consensus findings for nodules and lymph nodes for high BMI
df_all_new=pd.DataFrame(columns=['Reader','AI','Consensus'],
                        index=['Non-nodules','TP 30-100mm3','TP 100-300mm3', 'TP >300mm3'])

df_all_new.index.name = 'High BMI cases'

df_all_new['Reader']=[FP_read_100+FP_read_100_300+FP_read_300,TP_both_100+FN_AI_100,TP_both_100_300+FN_AI_100_300, TP_both_300+FN_AI_300]
df_all_new['AI']=[FP_AI_100+FP_AI_100_300+FP_AI_300,TP_both_100+FN_read_100,TP_both_100_300+FN_read_100_300,TP_both_300+FN_read_300]
df_all_new['Consensus']=[FP_AI_100+FP_AI_100_300+FP_AI_300+FP_read_100+FP_read_100_300+FP_read_300,FN_AI_100+FN_read_100, FN_AI_100_300+FN_read_100_300, FN_AI_300+FN_read_300]

df_all_new

In [None]:
df_all_new.to_excel('non_nodules_and_TP_high.xlsx')

In [None]:
#Comparison between reader and AI for volume subgroups
df_all_new=pd.DataFrame(columns=['sensitivity (95% CI)','PPV (95% CI)','F1 score (95% CI)','nodules correctly detected','non-nodules incorrectly detected','nodules missed'], 
                        index=['AI, high 30-100mm3', 'AI, high 100-300mm3','AI, high 300+mm3','',
                               'reader, high 30-100mm3','reader, high 100-300mm3', 'reader, high 300+mm3',''
                              ])

df_all_new.index.name = 'GT by radiologists for discrepancies' 


df_all_new.iloc[0,0]=np.round(sensitivity(TP_AI_100,FN_AI_100),2)
df_all_new.iloc[0,1]=np.round(PPV(TP_AI_100,FP_AI_100),2)
df_all_new.iloc[0,2]=np.round(F1score(TP_AI_100,FP_AI_100,FN_AI_100),2)
df_all_new.iloc[0,3]=TP_AI_100
df_all_new.iloc[0,4]=FP_AI_100
df_all_new.iloc[0,5]=FN_AI_100

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_AI_100, FP_AI_100, FN_AI_100, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[0]=str(df_all_new['sensitivity (95% CI)'].iloc[0])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[0]=str(df_all_new['PPV (95% CI)'].iloc[0])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[0]=str(df_all_new['F1 score (95% CI)'].iloc[0])+' '+str(tuple(ci_f1_ai))


df_all_new.iloc[4,0]=np.round(sensitivity(TP_read_100,FN_read_100),2)
df_all_new.iloc[4,1]=np.round(PPV(TP_read_100,FP_read_100),2)
df_all_new.iloc[4,2]=np.round(F1score(TP_read_100,FP_read_100,FN_read_100),2)
df_all_new.iloc[4,3]=TP_read_100
df_all_new.iloc[4,4]=FP_read_100
df_all_new.iloc[4,5]=FN_read_100

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_read_100, FP_read_100, FN_read_100, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[4]=str(df_all_new['sensitivity (95% CI)'].iloc[4])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[4]=str(df_all_new['PPV (95% CI)'].iloc[4])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[4]=str(df_all_new['F1 score (95% CI)'].iloc[4])+' '+str(tuple(ci_f1_ai))


df_all_new.iloc[1,0]=np.round(sensitivity(TP_AI_100_300,FN_AI_100_300),2)
df_all_new.iloc[1,1]=np.round(PPV(TP_AI_100_300,FP_AI_100_300),2)
df_all_new.iloc[1,2]=np.round(F1score(TP_AI_100_300,FP_AI_100_300,FN_AI_100_300),2)
df_all_new.iloc[1,3]=TP_AI_100_300
df_all_new.iloc[1,4]=FP_AI_100_300
df_all_new.iloc[1,5]=FN_AI_100_300

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_AI_100_300, FP_AI_100_300, FN_AI_100_300, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[1]=str(df_all_new['sensitivity (95% CI)'].iloc[1])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[1]=str(df_all_new['PPV (95% CI)'].iloc[1])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[1]=str(df_all_new['F1 score (95% CI)'].iloc[1])+' '+str(tuple(ci_f1_ai))


df_all_new.iloc[5,0]=np.round(sensitivity(TP_read_100_300,FN_read_100_300),2)
df_all_new.iloc[5,1]=np.round(PPV(TP_read_100_300,FP_read_100_300),2)
df_all_new.iloc[5,2]=np.round(F1score(TP_read_100_300,FP_read_100_300,FN_read_100_300),2)
df_all_new.iloc[5,3]=TP_read_100_300
df_all_new.iloc[5,4]=FP_read_100_300
df_all_new.iloc[5,5]=FN_read_100_300

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_read_100_300, FP_read_100_300, FN_read_100_300, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[5]=str(df_all_new['sensitivity (95% CI)'].iloc[5])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[5]=str(df_all_new['PPV (95% CI)'].iloc[5])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[5]=str(df_all_new['F1 score (95% CI)'].iloc[5])+' '+str(tuple(ci_f1_ai))


df_all_new.iloc[2,0]=np.round(sensitivity(TP_AI_300,FN_AI_300),2)
df_all_new.iloc[2,1]=np.round(PPV(TP_AI_300,FP_AI_300),2)
df_all_new.iloc[2,2]=np.round(F1score(TP_AI_300,FP_AI_300,FN_AI_300),2)
df_all_new.iloc[2,3]=TP_AI_300
df_all_new.iloc[2,4]=FP_AI_300
df_all_new.iloc[2,5]=FN_AI_300

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_AI_300, FP_AI_300, FN_AI_300, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[2]=str(df_all_new['sensitivity (95% CI)'].iloc[2])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[2]=str(df_all_new['PPV (95% CI)'].iloc[2])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[2]=str(df_all_new['F1 score (95% CI)'].iloc[2])+' '+str(tuple(ci_f1_ai))


df_all_new.iloc[6,0]=np.round(sensitivity(TP_read_300,FN_read_300),2)
df_all_new.iloc[6,1]=np.round(PPV(TP_read_300,FP_read_300),2)
df_all_new.iloc[6,2]=np.round(F1score(TP_read_300,FP_read_300,FN_read_300),2)
df_all_new.iloc[6,3]=TP_read_300
df_all_new.iloc[6,4]=FP_read_300
df_all_new.iloc[6,5]=FN_read_300

sensitivity_confidence_interval_AI, PPV_confidence_interval_AI, F1_confidence_interval_AI \
= sensitivity_and_specificity_with_confidence_intervals(TP_read_300, FP_read_300, FN_read_300, 0, alpha=0.95)

ci_sens_ai=[np.round(x,2) for x in sensitivity_confidence_interval_AI]
ci_ppv_ai=[np.round(x,2) for x in PPV_confidence_interval_AI]
ci_f1_ai=[np.round(x,2) for x in F1_confidence_interval_AI]

df_all_new['sensitivity (95% CI)'].iloc[6]=str(df_all_new['sensitivity (95% CI)'].iloc[6])+' '+str(tuple(ci_sens_ai))
df_all_new['PPV (95% CI)'].iloc[6]=str(df_all_new['PPV (95% CI)'].iloc[6])+' '+str(tuple(ci_ppv_ai))
df_all_new['F1 score (95% CI)'].iloc[6]=str(df_all_new['F1 score (95% CI)'].iloc[6])+' '+str(tuple(ci_f1_ai))


df_all_new.iloc[3,0]=0
df_all_new.iloc[3,1]=0
df_all_new.iloc[3,2]=0
df_all_new.iloc[3,3]=0
df_all_new.iloc[3,4]=0
df_all_new.iloc[3,5]=0


AI_all=np.sum(df_all_new.iloc[0:3,3:].values)
reader_all=np.sum(df_all_new.iloc[4:7,3:].values)

df_all_new['All findings']=df_all_new['nodules correctly detected']+df_all_new['non-nodules incorrectly detected']+df_all_new['nodules missed']

df_all_new.iloc[7,0]=''
df_all_new.iloc[7,1]=''
df_all_new.iloc[7,2]=''
df_all_new.iloc[7,3]=''
df_all_new.iloc[7,4]=''
df_all_new.iloc[7,5]=''
df_all_new.iloc[7,6]=np.sum(df_all_new['All findings'].iloc[4:7])

df_all_new.iloc[3,0]=''
df_all_new.iloc[3,1]=''
df_all_new.iloc[3,2]=''
df_all_new.iloc[3,3]=''
df_all_new.iloc[3,4]=''
df_all_new.iloc[3,5]=''
df_all_new.iloc[3,6]=np.sum(df_all_new['All findings'].iloc[0:3])

for i in range(7):
    if i!=3:
        if i<3:
            sum_all=AI_all
        elif i>3:
            sum_all=reader_all
            
        percentage_tp=np.round((df_all_new.iloc[i][3]/sum_all)*100,1) 
        df_all_new['nodules correctly detected'].iloc[i]=str(df_all_new.iloc[i][3])+' ('+str(percentage_tp)+'%)'

        percentage_fp=np.round((df_all_new.iloc[i][4]/sum_all)*100,1) 
        df_all_new['non-nodules incorrectly detected'].iloc[i]=str(df_all_new.iloc[i][4])+' ('+str(percentage_fp)+'%)'

        percentage_fn=np.round((df_all_new.iloc[i][5]/sum_all)*100,1) 
        df_all_new['nodules missed'].iloc[i]=str(df_all_new.iloc[i][5])+' ('+str(percentage_fn)+'%)'

        df_all_new['All findings'].iloc[i]=str(df_all_new.iloc[i][6])+' ('+str(np.round(100*df_all_new.iloc[i][6]/sum_all,1))+'%)'

    
df_all_new['All findings'].iloc[3]=str(df_all_new.iloc[3][6])+' (100%)'
df_all_new['All findings'].iloc[7]=str(df_all_new.iloc[7][6])+' (100%)'

df_all_new #Detection performance comparison for nodules and lymph nodes

Unnamed: 0_level_0,sensitivity (95% CI),PPV (95% CI),F1 score (95% CI),nodules correctly detected,non-nodules incorrectly detected,nodules missed,All findings
GT by radiologists for discrepancies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"AI, high 30-100mm3","0.72 (0.63, 0.8)","0.87 (0.78, 0.93)","0.79 (0.73, 0.85)",76 (41.3%),11 (6.0%),29 (15.8%),116 (63.0%)
"AI, high 100-300mm3","0.82 (0.59, 0.94)","0.4 (0.26, 0.56)","0.54 (0.41, 0.66)",18 (9.8%),27 (14.7%),4 (2.2%),49 (26.6%)
"AI, high 300+mm3","1.0 (0.4, 0.98)","0.21 (0.07, 0.46)","0.35 (0.17, 0.57)",4 (2.2%),15 (8.2%),0 (0.0%),19 (10.3%)
,,,,,,,184 (100%)
"reader, high 30-100mm3","0.75 (0.66, 0.83)","0.91 (0.82, 0.96)","0.82 (0.76, 0.87)",79 (56.4%),8 (5.7%),26 (18.6%),113 (80.7%)
"reader, high 100-300mm3","0.77 (0.54, 0.91)","0.94 (0.71, 1.0)","0.85 (0.69, 0.94)",17 (12.1%),1 (0.7%),5 (3.6%),23 (16.4%)
"reader, high 300+mm3","1.0 (0.4, 0.98)","1.0 (0.4, 0.98)","1.0 (0.6, 0.99)",4 (2.9%),0 (0.0%),0 (0.0%),4 (2.9%)
,,,,,,,140 (100%)


In [None]:
df_all_new.to_excel('nodules_lymph_volumes_high.xlsx')

In [None]:
data=[[TP_both_100, FN_read_100],
        [FN_AI_100,0]]
# print(data)

# McNemar's Test without continuity correction
print("For nodules and lymph nodes (AI vs reader) of 30-100mm3 with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue)

# #For FPs
# data=[[0, FP_AI_100], 
#         [FP_read_100, 0]]
# # print(data)

# # McNemar's Test without continuity correction
# print("For FP findings of 30-100mm3, with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue)
# print("\n")


data=[[TP_both_100_300,FN_read_100_300 ], 
        [FN_AI_100_300, 0]]
# print(data)

# McNemar's Test without continuity correction
print("For nodules and lymph nodes (AI vs reader) of 100-300mm3 with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue)

# #For FPs
# data=[[0, FP_AI_100_300], 
#         [FP_read_100_300, 0]]
# # print(data)

# # McNemar's Test without continuity correction
# print("For FP findings of 100-300mm3, with continuity correction (not exact) p value is",mcnemar(data, exact=False,correction=True).pvalue)

For nodules and lymph nodes (AI vs reader) of 30-100mm3 with continuity correction (not exact) p value is 0.7874064906662693
For nodules and lymph nodes (AI vs reader) of 100-300mm3 with continuity correction (not exact) p value is 1.0


#### Mann-Whitney U test to check for differences in volumes between low/high BMI within each volume subgroup 

This is an unpaired test meaning that we consider each nodule as separate from others. It can be used with unequal sample sizes as well.
Bland-Altman is not a good choice since it can be performed on nodules detected by both AI and reader to assess for agreement in the volume measurement

In [None]:
# #Perform the Mann-Whitney U test for nodules only
# # stats.mannwhitneyu(group1, group2, alternative='two-sided')

# #Compare all high BMI reader vs AI volumes
# print('High BMI reader vs AI volumes p value is', stats.mannwhitneyu(reader_only_nods_high_30_100_vols+reader_only_nods_high_100_300_vols+reader_only_nods_high_300_vols,
#                    ai_only_nods_high_30_100_vols+ai_only_nods_high_100_300_vols+ai_only_nods_high_300_vols).pvalue)

# #Compare all low BMI reader vs AI volumes
# print('Low BMI reader vs AI volumes p value is',stats.mannwhitneyu(reader_only_nods_low_30_100_vols+reader_only_nods_low_100_300_vols+reader_only_nods_low_300_vols,
#                    ai_only_nods_low_30_100_vols+ai_only_nods_low_100_300_vols+ai_only_nods_low_300_vols).pvalue)

# #Compare all low vs high BMI volumes for nodules, for reader only
# print('Low vs high BMI for reader p value is',stats.mannwhitneyu(reader_only_nods_low_30_100_vols+reader_only_nods_low_100_300_vols+reader_only_nods_low_300_vols,
#                    reader_only_nods_high_30_100_vols+reader_only_nods_high_100_300_vols+reader_only_nods_high_300_vols).pvalue)

# #Compare all low vs high BMI volumes for nodules, for AI only
# print('Low vs high BMI for AI p value is',stats.mannwhitneyu(ai_only_nods_high_30_100_vols+ai_only_nods_high_100_300_vols+ai_only_nods_high_300_vols,
#                    ai_only_nods_low_30_100_vols+ai_only_nods_low_100_300_vols+ai_only_nods_low_300_vols).pvalue)

In [None]:
# #Similarly as above for non-nodule findings

# #Compare all high BMI reader vs AI volumes for non-nodule findings
# print('High BMI reader vs AI volumes for non-nodule findings p value is',stats.mannwhitneyu(reader_nonods_high_30_100_vols+reader_nonods_high_100_300_vols+reader_nonods_high_300_vols,
#                    ai_nonods_high_30_100_vols+ai_nonods_high_100_300_vols+ai_nonods_high_300_vols).pvalue)

# #Compare all low BMI reader vs AI volumes for non-nodule findings
# print('Low BMI reader vs AI volumes for non-nodule findings p value is',stats.mannwhitneyu(reader_nonods_low_30_100_vols+reader_nonods_low_100_300_vols+reader_nonods_low_300_vols,
#                    ai_nonods_low_30_100_vols+ai_nonods_low_100_300_vols+ai_nonods_low_300_vols).pvalue)

# #Compare all low vs high BMI volumes for reader only, for non-nodule findings
# print('Low vs high BMI volumes for reader only, for non-nodule findings p value is',stats.mannwhitneyu(reader_nonods_low_30_100_vols+reader_nonods_low_100_300_vols+reader_nonods_low_300_vols,
#                    reader_nonods_high_30_100_vols+reader_nonods_high_100_300_vols+reader_nonods_high_300_vols).pvalue)

# #Compare all low vs high BMI volumes for AI only, for non-nodule findings
# print('Low vs high BMI volumes for AI only, for non-nodule findings p value is',stats.mannwhitneyu(ai_nonods_high_30_100_vols+ai_nonods_high_100_300_vols+ai_nonods_high_300_vols,
#                    ai_nonods_low_30_100_vols+ai_nonods_low_100_300_vols+ai_nonods_low_300_vols).pvalue)

In [None]:
# #Similarly as above for nodules and lymph nodes

# #Compare all high BMI reader vs AI volumes for nodules and lymph nodes
# print('High BMI reader vs AI volumes for nodules and lymph nodes p value is',stats.mannwhitneyu(reader_only_nods_high_30_100_vols+reader_only_nods_high_100_300_vols+reader_only_nods_high_300_vols+
#                    reader_lymph_high_30_100_vols+ reader_lymph_high_100_300_vols+ reader_lymph_high_300_vols,
#                    ai_only_nods_high_30_100_vols+ai_only_nods_high_100_300_vols+ai_only_nods_high_300_vols+
#                   ai_lymph_high_30_100_vols+ ai_lymph_high_100_300_vols+ ai_lymph_high_300_vols).pvalue)

# #Compare all low BMI reader vs AI volumes for nodules and lymph nodes
# print('Low BMI reader vs AI volumes for nodules and lymph nodes p value is',stats.mannwhitneyu(reader_only_nods_low_30_100_vols+reader_only_nods_low_100_300_vols+reader_only_nods_low_300_vols+
#                    reader_lymph_low_30_100_vols+ reader_lymph_low_100_300_vols+ reader_lymph_low_300_vols,
#                    ai_only_nods_low_30_100_vols+ai_only_nods_low_100_300_vols+ai_only_nods_low_300_vols+
#                   ai_lymph_low_30_100_vols+ ai_lymph_low_100_300_vols+ ai_lymph_low_300_vols).pvalue)

# #Compare all low vs high BMI volumes for reader only for nodules and lymph nodes
# print('Low vs high BMI volumes for reader only for nodules and lymph nodes p value is',stats.mannwhitneyu(reader_only_nods_low_30_100_vols+reader_only_nods_low_100_300_vols+reader_only_nods_low_300_vols+
#                    reader_lymph_low_30_100_vols+ reader_lymph_low_100_300_vols+ reader_lymph_low_300_vols,
#                    reader_only_nods_high_30_100_vols+reader_only_nods_high_100_300_vols+reader_only_nods_high_300_vols+
#                   reader_lymph_high_30_100_vols+ reader_lymph_high_100_300_vols+ reader_lymph_high_300_vols).pvalue)

# #Compare all low vs high BMI volumes for AI only for nodules and lymph nodes
# print('Low vs high BMI volumes for AI only for nodules and lymph nodes p value is',stats.mannwhitneyu(ai_only_nods_high_30_100_vols+ai_only_nods_high_100_300_vols+ai_only_nods_high_300_vols+
#                    ai_lymph_high_30_100_vols+ ai_lymph_high_100_300_vols+ ai_lymph_high_300_vols,
#                    ai_only_nods_low_30_100_vols+ai_only_nods_low_100_300_vols+ai_only_nods_low_300_vols+
#                   ai_lymph_low_30_100_vols+ ai_lymph_low_100_300_vols+ ai_lymph_low_300_vols).pvalue)