In [1]:
#Import dependencies
import os
import numpy as np
import pickle
from tqdm import tqdm
import time
import traceback

In [2]:
start=time.time() #Start of the run; the last cell subtracts this from the end time to report the total run time

In [3]:
#Restore the variable `path`, which was stored by another notebook/script. It is the folder with all AI txt files (processing time and tasks)
%store -r path

In [4]:
def get_patient_id(path,file):
    """Extract the participant id from an AI txt file and return it with the file content.

    Every line containing 'patientID :' is inspected. The candidate id is the text between the
    first ':' and the next ',' on that line. It is accepted when it is exactly 6 characters long
    (only the length is checked here, not that the characters are digits) or when it contains
    'imalife_' (case-insensitive). If several lines qualify, the last one wins.

    Parameters
    ----------
    path : str
        Folder that contains the txt file.
    file : str
        Name of the txt file inside `path`.

    Returns
    -------
    tuple
        (patient_id, lines), where `lines` is the list of all lines of the file.
        When no acceptable id is found, ('', '') is returned.
    """

    with open(os.path.join(path,file),'r') as f: #Read txt file
        lines = f.readlines()

    patient_id='' #Explicit "not found" value instead of probing an unbound name with try/except

    for line in lines: #Loop over lines of txt file
        if 'patientID :' in line: #If there is a patient_id in the txt file
            candidate=line.split(':')[1].split(',')[0]

            #Keep ids of length 6, and names with the 'imalife_' prefix
            if len(candidate)==6 or 'imalife_' in candidate.lower():
                patient_id=candidate

    if not patient_id: #No valid id - callers expect empty strings for both values
        return '',''

    return patient_id,lines #return patient_id and txt file information for that patient as a list

### Participants that were sent for nodules, aorta, CACS, emphysema, cardiac fat, or vertebra measurements

In [5]:
def sort_and_list(pat_sent_for_parameter,parameter_files):
    """Sort participants together with their corresponding files and return both as lists.

    The (participant, file) pairs are sorted by participant id and then by file name, so the
    two returned lists stay aligned index by index.

    Parameters
    ----------
    pat_sent_for_parameter : list
        Participant ids, one per file.
    parameter_files : list
        File names, aligned with `pat_sent_for_parameter`.

    Returns
    -------
    tuple of two lists
        (sorted participants, files in the matching order). Empty input gives ([], []).
    """

    pairs=sorted(zip(pat_sent_for_parameter, parameter_files)) #Sort them as pairs

    #Build the lists from the pairs; unlike unpacking zip(*pairs) this also works when there are no pairs
    pats=[pat for pat,_ in pairs]
    parameter_files=[file for _,file in pairs]

    return pats, parameter_files

In [6]:
def check_parameter(path):
    """Group the AI txt files in `path` by the measurement(s) they were sent for.

    Every file in `path` is read and searched for these marker strings:

    - 'LungCAD'                         -> nodules
    - 'CalciumScore'                    -> CACS, only when the last 'Slices : ' value seen in the
                                           same file is below 200
    - 'Aorta Overview generation'       -> aorta (a file with both aorta and CACS counts as aorta only)
    - 'Vertebra Measurement algorithms' -> vertebra
    - 'Cardiac Fat'                     -> cardiac fat
    - 'LungMeasurement finished'        -> emphysema

    A file is only recorded when `get_patient_id` returns an id that is numeric or contains
    'imalife_' (case-insensitive). Files that cannot be read or parsed (e.g. non-txt files) are
    printed and skipped. Summary counts are printed at the end.

    Parameters
    ----------
    path : str
        Folder with the AI txt files.

    Returns
    -------
    tuple of 12 lists
        (pat_sent_for_nodules, nodule_files, pat_sent_for_aorta, aorta_files,
         pat_sent_for_CACS, CACS_files, pat_sent_for_vertebra, vertebra_files,
         pat_sent_for_fat, fat_files, pat_sent_for_emph, emph_files).
        In each pair, the participant id and the file name at the same index belong together.
    """

    #Marker string showing that a file was sent for a measurement (CACS needs the slice count too, see below)
    markers={'nodules':'LungCAD',
             'aorta':'Aorta Overview generation',
             'vertebra':'Vertebra Measurement algorithms',
             'fat':'Cardiac Fat',
             'emph':'LungMeasurement finished'}

    groups=('nodules','aorta','CACS','vertebra','fat','emph')
    pats={group:[] for group in groups} #Participants that were sent to AI, per measurement
    files={group:[] for group in groups} #Files that correspond to the above participants

    no_measurements=0 #Files with a valid participant but none of the above measurements
    empty_files=0 #Files for which no patient id was found
    measurements=0 #Files with a valid participant and at least one measurement
    pat_ignored=0 #Files with a non-empty but invalid patient name

    all_files=os.listdir(path) #List the folder once; also used for the consistency check at the end

    for file in tqdm(all_files): #loop over all txt files

        try: #If we also have other files than txt (eg. zip) reading/parsing fails

            with open(os.path.join(path,file),'r') as f: #Read each txt file
                lines = f.readlines()

            sent_for=set() #Measurements this file was sent for
            any_marker=False #True if any marker string exists in the file (CACS marker counts regardless of slices)
            slice_num=None #Reset for every file so a slice count never leaks in from the previous file

            for line in lines: #Loop over all lines of txt file

                if "Slices :" in line: #If 'slices' in one of the lines of the txt file
                    slice_num=int(line.split('Slices : ')[-1].split(',')[0]) #Get number of slices sent to AI

                for group,marker in markers.items():
                    if marker in line:
                        sent_for.add(group)
                        any_marker=True

                #we only care for 'CalciumScoreVB40' but it's the same as the one below
                if 'CalciumScore' in line:
                    any_marker=True
                    #Only count as CACS when the slice count is known and below 200
                    if slice_num is not None and slice_num<200:
                        sent_for.add('CACS')

            if 'aorta' in sent_for: #When both aorta and CACS are present, only consider aorta measurements
                sent_for.discard('CACS')

            pat_name,_=get_patient_id(path,file) #Get patient_id and txt information

        except (OSError,ValueError): #Unreadable file, undecodable content (UnicodeDecodeError) or unparsable slice count
            print(file)
            continue

        if pat_name.isnumeric() or 'imalife_' in pat_name.lower(): #If patient_id is a valid number or 'imalife_' in its name

            for group in sent_for:
                pats[group].append(pat_name) #Save its id
                files[group].append(file) #Save file name

            if any_marker:
                measurements=measurements+1
            else: #The file does not have any of the above measurements
                no_measurements=no_measurements+1

        elif pat_name!='': #Other non-proper names. Some examples: ima_40, ima_41, ima_17, ima_59, ima_43, ima_18
            #These are the results of earlier versions
            pat_ignored=pat_ignored+1
        else: #if it's empty, count it
            empty_files=empty_files+1

    print("Total number of files without any of the above measurements (nodules, aorta, CACS, vertebra, cardiac fat, emphysema) is:",no_measurements)
    print("Total number of empty files is:",empty_files)
    print("Total number of files with measurements is:",measurements)
    print("Total number of files with invalid (ignored) patient_names:",pat_ignored)

    #Confirm that no file was missed - a difference should only come from files skipped above
    counted=no_measurements+empty_files+measurements+pat_ignored
    if counted!=len(all_files):
        print("Warning: {} of {} files were skipped because they could not be read or parsed (e.g. non-txt files)".
              format(len(all_files)-counted,len(all_files)))

    return (pats['nodules'], files['nodules'],
            pats['aorta'], files['aorta'],
            pats['CACS'], files['CACS'],
            pats['vertebra'], files['vertebra'],
            pats['fat'], files['fat'],
            pats['emph'], files['emph'])

In [7]:
#Scan all AI txt files once and collect, per measurement, the participant ids and the matching file names
(pat_sent_for_nodules, nodule_files,
 pat_sent_for_aorta, aorta_files,
 pat_sent_for_CACS, CACS_files,
 pat_sent_for_vertebra, vertebra_files,
 pat_sent_for_fat, fat_files,
 pat_sent_for_emph, emph_files)=check_parameter(path=path)

100%|██████████| 10029/10029 [00:02<00:00, 3621.43it/s]

Total number of files without any of the above measurements (nodules, aorta, CACS, vertebra, cardiac fat) is: 0
Total number of empty files is: 492
Total number of files with measurements is: 9528
Total number of files with empty patient_names: 9





In [8]:
#For every measurement report the unique patient_ids and all patient_ids.
#Some participants exist more than once and we will keep only one file for them below.
participants_per_measurement=[('nodules',pat_sent_for_nodules),
                              ('aorta',pat_sent_for_aorta),
                              ('CACS',pat_sent_for_CACS),
                              ('vertebra',pat_sent_for_vertebra),
                              ('cardiac fat',pat_sent_for_fat),
                              ('emphysema',pat_sent_for_emph)]

for position,(measurement,participants) in enumerate(participants_per_measurement):
    if position>0:
        print('\n') #Separator between two measurements, none after the last one

    print('Unique participants sent for {}: '.format(measurement),len(np.unique(participants)))
    print('All participants sent for {}: '.format(measurement),len(participants))

Unique participants sent for nodules:  375
All participants sent for nodules:  547


Unique participants sent for aorta:  1580
All participants sent for aorta:  1685


Unique participants sent for CACS:  4965
All participants sent for CACS:  6951


Unique participants sent for vertebra:  1880
All participants sent for vertebra:  2081


Unique participants sent for cardiac fat:  1647
All participants sent for cardiac fat:  1648


Unique participants sent for emphysema:  1883
All participants sent for emphysema:  2116


### Sort participants and their respective lists

In [9]:
#Nothing to sort when no participant was sent for nodules; skip explicitly instead of hiding every error behind a bare except
if pat_sent_for_nodules:
    pat_sent_for_nodules, nodule_files=sort_and_list(pat_sent_for_nodules, nodule_files)

In [10]:
#Nothing to sort when no participant was sent for aorta; skip explicitly instead of hiding every error behind a bare except
if pat_sent_for_aorta:
    pat_sent_for_aorta, aorta_files=sort_and_list(pat_sent_for_aorta, aorta_files)

In [11]:
#Nothing to sort when no participant was sent for CACS; skip explicitly instead of hiding every error behind a bare except
if pat_sent_for_CACS:
    pat_sent_for_CACS, CACS_files=sort_and_list(pat_sent_for_CACS, CACS_files)

In [12]:
#Nothing to sort when no participant was sent for vertebra; skip explicitly instead of hiding every error behind a bare except
if pat_sent_for_vertebra:
    pat_sent_for_vertebra, vertebra_files=sort_and_list(pat_sent_for_vertebra, vertebra_files)

In [13]:
#Nothing to sort when no participant was sent for cardiac fat; skip explicitly instead of hiding every error behind a bare except
if pat_sent_for_fat:
    pat_sent_for_fat, fat_files=sort_and_list(pat_sent_for_fat, fat_files)

In [14]:
#Nothing to sort when no participant was sent for emphysema; skip explicitly instead of hiding every error behind a bare except
if pat_sent_for_emph:
    pat_sent_for_emph, emph_files=sort_and_list(pat_sent_for_emph, emph_files)

### Create dictionaries of participant ids and their txt files

In [15]:
def create_dictionary_parameter(pat_sent_for_parameter,parameter_files):
    """Combine all files of each participant into a dictionary of {participant: [files]}.

    Parameters
    ----------
    pat_sent_for_parameter : list
        Participant ids, one per file (the same id may appear several times).
    parameter_files : list
        File names, aligned index by index with `pat_sent_for_parameter`.

    Returns
    -------
    dict
        Keys are the unique participant ids in sorted order. Each value is the list of that
        participant's files in the order in which they appear in `parameter_files`.
    """

    grouped={} #patient_id -> filenames corresponding to that patient

    #Single pass over the aligned lists instead of searching the whole list once per participant
    for pat,file in zip(pat_sent_for_parameter,parameter_files):
        grouped.setdefault(pat,[]).append(file)

    #Sorted keys, as before; keys stay plain Python objects so the pickled dictionaries do not depend on numpy
    pat_file={pat:grouped[pat] for pat in sorted(grouped)}

    return pat_file

In [16]:
#Map each participant sent for nodules to the list of all their txt files
nodule_dict=create_dictionary_parameter(pat_sent_for_nodules, nodule_files)

In [17]:
#Map each participant sent for aorta to the list of all their txt files
aorta_dict=create_dictionary_parameter(pat_sent_for_aorta, aorta_files)

In [18]:
#Map each participant sent for CACS to the list of all their txt files
CACS_dict=create_dictionary_parameter(pat_sent_for_CACS, CACS_files)

In [19]:
#Map each participant sent for vertebra to the list of all their txt files
vertebra_dict=create_dictionary_parameter(pat_sent_for_vertebra, vertebra_files)

In [20]:
#Map each participant sent for cardiac fat to the list of all their txt files
fat_dict=create_dictionary_parameter(pat_sent_for_fat, fat_files)

In [21]:
#Map each participant sent for emphysema to the list of all their txt files
emph_dict=create_dictionary_parameter(pat_sent_for_emph, emph_files)

### Keep only the latest file or the file with the most slices sent to AI

In [22]:
def keep_one_file(path,parameter_dict):
    """Keep a single txt file for each participant.

    A participant with one file keeps that file. For a participant with several files, every
    file is read through `get_patient_id` to get its number of slices ('Slices : ' line) and its
    series date ('SeriesDate :' line). The files are then compared in list order:

    - a file becomes the current choice when it has more slices than the current choice and its
      series date string is <= the stored series date (plain string comparison);
    - a file with the same number of slices (and series date <= the stored one) replaces the
      current choice only when the 12-character timestamp in its file name (second-to-last
      dot-separated field) is later.

    Parameters
    ----------
    path : str
        Folder with all AI txt files.
    parameter_dict : dict
        {participant id: list of file names} as built by `create_dictionary_parameter`.

    Returns
    -------
    dict
        {participant id: the single file name kept for that participant}.
    """

    pat_file_final={} #Keep only one file for each patient, the latest one (or the one with the most slices)

    for pat,files in parameter_dict.items(): #Loop over each patient_id and its corresponding txt files

        if len(files)>1: #If more than one file

            slice_num_final=0 #Initialize the number of slices for this patient to 0
            series_final='20150101' #Set a random very early date to compare and select the latest one below
            #Start from the first file of THIS participant. Without this reset the index could be
            #undefined, or silently carried over from the previous participant, when no file qualifies.
            index_final=0

            for index,file in enumerate(files): #Loop over all files of this patient
                _,info=get_patient_id(path,file) #Get information of txt file for it

                #NOTE(review): slice_num and series_date are not reset per file, so a file without
                #these lines reuses the values of the previously read file - confirm every file has both.
                for line in info: #Loop over all lines of this file
                    if "Slices :" in line: #If 'slices' in one of the lines of the txt file
                        slice_num=int(line.split('Slices : ')[-1].split(',')[0]) #Get the number of slices in that patient

                    if "SeriesDate :" in line: #If series information in line
                        #NOTE(review): any whitespace after 'SeriesDate :' is kept and takes part in the
                        #string comparison with series_final below - confirm this is intended.
                        series_date=str(line.split('SeriesDate :')[-1].split(',')[0]) #Get the series date of that patient

                #if the number of slices is bigger than those met already, and the same series (avoid comparing with repeat scans)
                if slice_num>slice_num_final and series_date<=series_final:
                    slice_num_final=slice_num #Keep track of it
                    index_final=index #and of the index in which it can be found
                    series_final=series_date #Keep track of the series date

                elif slice_num==slice_num_final and series_date<=series_final: #If it's equal to one already met:
                    before_date=files[index_final].split('.')[-2][:12] #Compare the date of the previous time it was met
                    current_date=files[index].split('.')[-2][:12] #with the current date of the file

                    if current_date>before_date: #if the current date is the latest one
                        index_final=index #Keep this index

            pat_file_final[pat]=files[index_final] #Add the final patient_id and file name information to dictionary

        else: #if only 1 file then keep it as is - Assumed that it's not a repeat scan
            pat_file_final[pat]=files[0]

    return pat_file_final

In [23]:
#Reduce the nodule files to a single file per participant
nodule_dict_final=keep_one_file(path,nodule_dict)

In [24]:
#Reduce the aorta files to a single file per participant
aorta_dict_final=keep_one_file(path,aorta_dict)

In [25]:
#Reduce the CACS files to a single file per participant
CACS_dict_final=keep_one_file(path,CACS_dict)

In [26]:
#Reduce the vertebra files to a single file per participant
vertebra_dict_final=keep_one_file(path,vertebra_dict)

In [27]:
#Reduce the cardiac fat files to a single file per participant
fat_dict_final=keep_one_file(path,fat_dict)

In [28]:
#Reduce the emphysema files to a single file per participant
emph_dict_final=keep_one_file(path,emph_dict)

In [29]:
#One file was kept per participant, so this is also the number of participants with aorta measurements
print('Total number of files with aorta measurements',len(aorta_dict_final))

Total number of files with aorta measurements 1580


In [30]:
#One file was kept per participant, so this is also the number of participants with CACS measurements
print('Total number of files with CACS measurements',len(CACS_dict_final))

Total number of files with CACS measurements 4965


In [31]:
#One file was kept per participant, so this is also the number of participants with nodule measurements
print('Total number of files with nodule measurements',len(nodule_dict_final))

Total number of files with nodule measurements 375


In [32]:
#One file was kept per participant, so this is also the number of participants with vertebra measurements
print('Total number of files with vertebra measurements',len(vertebra_dict_final))

Total number of files with vertebra measurements 1880


In [33]:
#One file was kept per participant, so this is also the number of participants with cardiac fat measurements
print('Total number of files with cardiac fat measurements',len(fat_dict_final))

Total number of files with cardiac fat measurements 1647


In [34]:
#One file was kept per participant, so this is also the number of participants with emphysema measurements
print('Total number of files with emphysema measurements',len(emph_dict_final))

Total number of files with emphysema measurements 1883


### Select files only after a specific date (avoid errors with earlier versions)

In [35]:
def keep_based_on_date(parameter_dict,date):
    """Keep only the participants whose file is dated on or after `date`.

    The date of a file is read from its name: the first 6 characters of the second-to-last
    dot-separated field, compared as a string with `date`.

    Parameters
    ----------
    parameter_dict : dict
        {participant id: file name}.
    date : str
        Earliest date to keep, in the format "YYYYMM".

    Returns
    -------
    dict
        The entries of `parameter_dict` that pass the date check, in their original order.
    """

    def year_month(file_name):
        'First 6 characters of the field just before the file extension'
        fields=file_name.split('.')
        return fields[-2][:6]

    #Only files that are not earlier than the specified date survive
    return {pat:file_name for pat,file_name in parameter_dict.items() if year_month(file_name)>=date}

In [36]:
#Nodules are filtered by date: before February 2022 the same file is found for nodules and for CACS,
#which is wrong since they require different kernels. There are also files sent for nodules in
#January (21 earliest), but if they are kept we get errors like for 100761 (does not exist in Syngo.via).

nodule_dict_final_date=keep_based_on_date(nodule_dict_final,'202202') #1st February 2022 onwards for nodules

In [37]:
#Keep aorta files whose file-name date is '202209' or later
aorta_dict_final_date=keep_based_on_date(aorta_dict_final,'202209') #1st September 2022 onwards for aorta

In [38]:
#Keep CACS files whose file-name date is '202205' or later
CACS_dict_final_date=keep_based_on_date(CACS_dict_final,'202205') #1st May 2022 onwards for CACS

In [39]:
#Keep vertebra files whose file-name date is '202209' or later
vertebra_dict_final_date=keep_based_on_date(vertebra_dict_final,'202209') #1st September 2022 onwards for vertebra

In [40]:
#Keep cardiac fat files whose file-name date is '202209' or later
cardiac_fat_dict_final_date=keep_based_on_date(fat_dict_final,'202209') #1st September 2022 onwards for cardiac fat

In [41]:
#Keep emphysema files whose file-name date is '202209' or later
emph_dict_final_date=keep_based_on_date(emph_dict_final,'202209') #1st September 2022 onwards for emphysema

In [42]:
#Participants whose kept CACS file has a file-name date of May 2022 or later
print('Total number of files with CACS measurements after 1st May 2022',len(CACS_dict_final_date))

Total number of files with CACS measurements after 1st May 2022 3651


In [43]:
#Participants whose kept aorta file has a file-name date of September 2022 or later
print('Total number of files with aorta measurements after 1st September 2022',len(aorta_dict_final_date))

Total number of files with aorta measurements after 1st September 2022 1368


In [44]:
#Participants whose kept nodule file has a file-name date of February 2022 or later
print('Total number of files with nodule measurements after 1st February 2022',len(nodule_dict_final_date))

Total number of files with nodule measurements after 1st February 2022 334


In [45]:
#Participants whose kept vertebra file has a file-name date of September 2022 or later
print('Total number of files with vertebra measurements after 1st September 2022',len(vertebra_dict_final_date))

Total number of files with vertebra measurements after 1st September 2022 1654


In [46]:
#Participants whose kept cardiac fat file has a file-name date of September 2022 or later
#(the message previously said "CACS" although the cardiac fat dictionary is counted)
print('Total number of files with cardiac fat measurements after 1st September 2022',len(cardiac_fat_dict_final_date))

Total number of files with CACS measurements after 1st September 2022 1647


In [47]:
#Participants whose kept emphysema file has a file-name date of September 2022 or later
print('Total number of files with emphysema measurements after 1st September 2022',len(emph_dict_final_date))

Total number of files with emphysema measurements after 1st September 2022 1654


In [48]:
# Confirm that every selected nodule file still exists in the folder with the AI txt files
existing_files=set(os.listdir(path)) #List the folder once instead of once per selected file

for file in nodule_dict_final_date.values():
    if file not in existing_files:
        print(file) #Should not have any prints

In [49]:
#NOTE(review): this displays every participant id with its file name - consider clearing this output before sharing the notebook
nodule_dict_final_date

{'100785': '1.2.276.0.28.3.345049594267.42.2984.20220902083623014.txt',
 '101191': '1.2.276.0.28.3.345049594267.42.8396.20220523072606014.txt',
 '101493': '1.2.276.0.28.3.345049594267.42.9296.20221027185912014.txt',
 '102236': '1.2.276.0.28.3.345049594267.42.4032.20221027191123009.txt',
 '102427': '1.2.276.0.28.3.345049594267.42.3308.20221027192131000.txt',
 '102847': '1.2.276.0.28.3.345049594267.42.6596.20221027193526014.txt',
 '105179': '1.2.276.0.28.3.345049594267.42.9904.20221027194536000.txt',
 '106103': '1.2.276.0.28.3.345049594267.42.5716.20221018064249014.txt',
 '109640': '1.2.276.0.28.3.345049594267.42.2868.20221027195542000.txt',
 '111877': '1.2.276.0.28.3.345049594267.42.476.20221027200706014.txt',
 '113137': '1.2.276.0.28.3.345049594267.42.7164.20220822163648000.txt',
 '114616': '1.2.276.0.28.3.345049594267.42.4676.20221027201807000.txt',
 '116518': '1.2.276.0.28.3.345049594267.42.5032.20221027202803000.txt',
 '117028': '1.2.276.0.28.3.345049594267.42.3212.20221027203827014

### Save dictionaries as pickle to be used to find actual files with information as provided by AI

In [50]:
#Save the nodule {participant id: file name} dictionary to be used by the main file to add to REDCap.
#The pickle is written to the current working directory.
with open('patient_and_files_nodules.pkl', 'wb') as f:
    pickle.dump(nodule_dict_final_date, f)

In [51]:
#Save the date-filtered aorta {participant id: file name} dictionary in the current working directory
with open('patient_and_files_aorta.pkl', 'wb') as f:
    pickle.dump(aorta_dict_final_date, f)

In [52]:
#Save the date-filtered CACS {participant id: file name} dictionary in the current working directory
with open('patient_and_files_CACS.pkl', 'wb') as f:
    pickle.dump(CACS_dict_final_date, f)

In [53]:
#Save the date-filtered vertebra {participant id: file name} dictionary in the current working directory
with open('patient_and_files_vertebra.pkl', 'wb') as f:
    pickle.dump(vertebra_dict_final_date, f)

In [54]:
#Save the date-filtered cardiac fat {participant id: file name} dictionary in the current working directory
with open('patient_and_files_fat.pkl', 'wb') as f:
    pickle.dump(cardiac_fat_dict_final_date, f)

In [55]:
#Save the date-filtered emphysema {participant id: file name} dictionary in the current working directory
with open('patient_and_files_emph.pkl', 'wb') as f:
    pickle.dump(emph_dict_final_date, f)

### Check that the files for nodules, aorta, and CACS measurements are unique (for the other measurements we assume there are no such issues)

### Confirm that the same file does not appear in more than one group

In [56]:
#Check if a participant has the same selected file in both the CACS and the aorta list.
#The comparison is symmetric, so one pass over the CACS list reports every shared file exactly once
#(looping over the aorta list as well would only print the same participants a second time).
for CACS_pat, CACS_file in CACS_dict_final.items():
    if aorta_dict_final.get(CACS_pat)==CACS_file: #.get gives None for participants missing from the aorta list
        print("Participant {} exists in both aorta and CACS list with the same file name which is {}".
              format(CACS_pat, CACS_file))

#For the above two lists, we don't expect any common files - otherwise error!


#Initialize two counters to keep track number of common files between nodules list and aorta and CACS lists
aorta_and_nodules=0
CACS_and_nodules=0

#Each date-filtered nodule file is compared with the file selected for the same participant in the other lists
for nodule_pat, nodule_file in nodule_dict_final_date.items():

    same_as_aorta=aorta_dict_final.get(nodule_pat)==nodule_file #False when participant is not in aorta list
    same_as_CACS=CACS_dict_final.get(nodule_pat)==nodule_file #False when participant is not in CACS list

    #Check if participants in nodule list exist in aorta list
    if same_as_aorta:
        print("Participant {} exists in both aorta and nodules list with the same file name which is {}".
              format(nodule_pat, nodule_file))
        aorta_and_nodules=aorta_and_nodules+1

    #Check if participants in nodule list exist in CACS list
    if same_as_CACS:
        print("Participant {} exists in both CACS and nodules list with the same file name which is {}".
              format(nodule_pat, nodule_file))
        CACS_and_nodules=CACS_and_nodules+1

    #Check if participants in nodule list exist in both aorta and CACS lists
    if same_as_aorta and same_as_CACS:
        print("Participant {} exists in ALL nodule, aorta, and CACS lists!".format(nodule_pat))

print('\n')
print("Total number of common files between aorta and nodules is: ",aorta_and_nodules)
print("Total number of common files between CACS and nodules is: ",CACS_and_nodules)
#We should not have any common files since different kernels sent for each of them



Total number of common files between aorta and nodules is:  0
Total number of common files between CACS and nodules is:  0


### Compare files between two different folders - Not needed anymore

In [57]:
# #Takes ~33min to run for 9616 files in a i7-10750H 2.6GHz CPU

# all_files=os.listdir(path) #All txt files
# files_left=all_files.copy() #A copy of them to gradually remove files already looped - check below

# patient_and_files={} #Empty dictionary to be filled with the patient_id and all txt files that correspond to it

# for file in tqdm(all_files):
#     pat_id,_=get_patient_id(path,file) #Get patient_id in a txt file
    
#     if pat_id not in list(patient_and_files.keys()) and pat_id!='': #If it's a valid patient and not already added in dict
        
#         patient_and_files[pat_id]=[file] #Add patient and txt file to dictionary

#         to_be_deleted=[file] #Add that file in the list of files to be deleted
#         for file_check in files_left: #Check remaining files to see if this participant exists more than once
#             pat_id_check,_=get_patient_id(path,file_check) #Get patient_id for each of the remaining txt files

#             if pat_id==pat_id_check and file!=file_check: #If we have the same patient and not the same file name
#                 patient_and_files[pat_id].append(file_check) #Add txt file to dictionary list
#                 to_be_deleted.append(file_check) #Add that file in the list of files to be deleted

#         if file in files_left: #If the file has not be deleted from the list of remaining files
#             for file_del in to_be_deleted:
#                 files_left.remove(file_del) #Delete it, along with all the rest in which the same patient appears
            
# patient_and_files

In [58]:
##Save those files in dictionary
# with open('patient_and_files.pkl', 'wb') as f:
#     pickle.dump(patient_and_files, f)

### Load dictionary with participant and all the txt files that correspond to it

In [59]:
# #Retrieve files as dictionary
# with open('patient_and_files.pkl', 'rb') as f:
#     patient_and_files = pickle.load(f)

In [60]:
# print(dict(sorted(patient_and_files.items())))

##### Problems with 714969 and 109077 fixed now

In [61]:
end=time.time() #End of the run; `start` was taken in the second cell of the notebook
print('Total time to run was {}secs'.format(end-start)) #~20secs to run in a i7-10750H 2.6GHz CPU for 10k files

Total time to run was 20.0094952583313secs
