### About
* This notebook is used to determine **missing files** in time-lapse lightsheet data.

### How to use
* Insert the path to the folder to check for missing files
* Insert the number of timepoints in the folder: the highest number between parentheses in the file names, e.g. 178 for "_G1(178).czi"
* Insert the extension of the files you want to check
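* Insert the regex pattern that finds the view identifier in a file name. Example for view 4: the pattern must match both "_G4.czi" and "_G4(1).czi". The capturing group `()` around the view identifier is required.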

### Output
* A list of missing file names in the notebook

### Advanced Imaging Facility @ IGC 2024

### Import modules

In [1]:
import os
import glob
import re

def natural_sort(l):
    """Return the strings in ``l`` sorted in natural (human) order.

    Runs of ASCII digits are compared by numeric value and everything else
    is compared case-insensitively, so ``"f(2)"`` sorts before ``"f(10)"``.

    Parameters
    ----------
    l : iterable of str
        Strings to sort (e.g. file paths).

    Returns
    -------
    list of str
        A new sorted list; the input is not modified.
    """
    # adapted from: https://stackoverflow.com/questions/4836710/is-there-a-built-in-function-for-string-natural-sort

    def alphanum_key(key):
        # re.split with a capturing group always yields text, digits, text, ...
        # so odd positions are exactly the [0-9]+ runs. Deciding by position
        # (instead of str.isdigit) keeps every key in the same str/int layout
        # and never feeds int() characters such as "²" that it cannot parse.
        tokens = re.split('([0-9]+)', key)
        return [int(tok) if idx % 2 else tok.lower() for idx, tok in enumerate(tokens)]

    return sorted(l, key=alphanum_key)


### Insert Variables

In [2]:
inputdir = "/Users/marianaferreira/Git/AIf-coding/test"  # folder to check for missing files

# Insert the maximum number found between the parentheses in the file names,
# e.g. for "_G1(178).czi" insert 178.
# If not all views in the folder have the same number of timepoints, insert
# the maximum number among them.
# WARNING: this will report extra missing files for views with fewer timepoints.
number_of_timepoints = 5

extension = ".czi"

# Regex to find the views, e.g. view 4 in "_G4.czi" or "_G4(1).czi".
# The capturing group () is important: it is the part used as the view identifier.
# Raw string so that "\d" reaches the regex engine instead of being treated
# as an (invalid) string escape sequence.
pattern = r"(_G\d+).*czi"


### Check for missing files

In [5]:

# Find and sort all files with the requested extension.
# glob.escape keeps glob special characters ("[", "]", "*", "?") in the folder
# name or extension literal instead of treating them as wildcards.
files = natural_sort(
    glob.glob(os.path.join(glob.escape(inputdir), "*" + glob.escape(extension)))
)

# Find all identifiers for the different views.
view_identifiers = []  # will be a list of strings like: ['_G3', '_G4']
unmatched_files = []   # files with the right extension that do not match `pattern`

for fn in files:
    # Search the file name only, so a "_G<n>" in a parent folder name is not
    # mistaken for the view identifier.
    found = re.search(pattern, os.path.basename(fn))
    # Use the first capturing group; fall back to the whole match if the
    # pattern was written without a group.
    view = None
    if found is not None:
        view = found.group(1) if found.re.groups else found.group(0)
    if not view:
        # Skip instead of crashing on files that do not follow the naming scheme.
        unmatched_files.append(fn)
        continue
    if view not in view_identifiers:
        view_identifiers.append(view)

# View names without the leading underscore, e.g. "G1", "G10".
pretty_text = [txt[1:] for txt in view_identifiers]

# Only files that belong to a view are counted.
files_count = len(files) - len(unmatched_files)
# "+1": presumably each view has one unnumbered first file ("_G1.czi") in
# addition to the numbered files "(1)" .. "(number_of_timepoints)".
files_count_expected = len(view_identifiers) * (number_of_timepoints + 1)
files_count_difference = files_count_expected - files_count

if unmatched_files:
    print("WARNING: {} files do not match the view pattern and were ignored.".format(len(unmatched_files)))

if files_count_difference > 0:
    print("WARNING: {} files missing!".format(files_count_difference))
    print("Run next cell to list missing files.")
elif files_count_difference < 0:
    print("WARNING: {} extra files!".format(-files_count_difference))
    print("Check if the number of timepoints variable inserted is correct!")
else:
    print("No missing files")
    print("Expected number of files: {}\nFound number of files: {}".format(files_count_expected, files_count))

 

Run next cell to list missing files.


### List missing files

In [6]:
# variables for missing files portion
missing_files = []
expected_numbers = range(1, number_of_timepoints + 1)

# check if all files exist for each view
for view_id in view_identifiers:
    # Accept exactly "<base><view>.ext" (unnumbered first file) or
    # "<base><view>(<n>).ext". Anything else the glob returns is ignored,
    # e.g. "_G10(3)" files picked up while looking for "_G1".
    name_re = re.compile(
        r"^(?P<base>.*)" + re.escape(view_id)
        + r"(?:\((?P<tp>\d+)\))?" + re.escape(extension) + r"$"
    )
    view_glob = os.path.join(
        glob.escape(inputdir),
        "*" + glob.escape(view_id) + "*" + glob.escape(extension),
    )

    identifier_numbers = set()
    has_first_file = False
    basefilename = ""  # reset per view so a name from another view is never reused

    for fn in natural_sort(glob.glob(view_glob)):
        # Parse the file name only; the folder path may itself contain
        # parentheses or path separators of another platform.
        parsed = name_re.match(os.path.basename(fn))
        if parsed is None:
            continue
        basefilename = parsed.group("base")
        if parsed.group("tp") is None:
            has_first_file = True
        else:
            identifier_numbers.add(int(parsed.group("tp")))

    # The expected file count of the previous cell includes one unnumbered
    # file per view, so report it here as well when it is absent.
    if not has_first_file:
        missing_files.append(basefilename + view_id + extension)

    # sorted(): set difference has no guaranteed order
    for tp in sorted(set(expected_numbers) - identifier_numbers):
        missing_files.append(basefilename + view_id + "(" + str(tp) + ")" + extension)

print("Found {} missing files in the folder.\n\nMissing file names:".format(len(missing_files)))
for mfn in missing_files:
    print(mfn)
    
  

Found 3 missing files in the folder.

Missing file names:
testfiles_G1(3).rtf
testfiles_G2(4).rtf
testfiles_G2(5).rtf
