   Copyright 2015-2023, University of Bern, Laboratory for High Energy Physics and Theodor Kocher Institute, M. Vladymyrov

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


# Libs & Fns

In [5]:
import sys
import os
import time
import shutil

In [6]:
def create_ds_dir(idx, ds_name, datasets_path):
    """Create ``<datasets_path>/<idx as 3 digits>/<ds_name>`` for a new dataset.

    Returns:
        False if the numbered directory already exists (nothing is created
        in that case), True after the nested directory has been created.
    """
    numbered_dir = os.path.join(datasets_path, '%03d' % idx)

    # An existing index directory is never reused or extended.
    if os.path.exists(numbered_dir):
        return False

    os.makedirs(os.path.join(numbered_dir, ds_name), exist_ok=False)
    return True

In [7]:
def make_record_info(idx, ds_name, datasets_path, tile = None):
    """Append one line describing a dataset to ``<datasets_path>/info``.

    The line has the form ``NNN - <ds_name>`` with ``, tile <tile>`` appended
    when ``tile`` is not None. The file is opened in append mode, so it is
    created on first use and earlier records are kept.
    """
    path = os.path.join(datasets_path, 'info')
    tile_part = ', tile %d' % tile if tile is not None else ''
    # The context manager closes the file; no explicit close() is needed.
    with open(path, 'at') as f:
        f.write('%03d - %s' % (idx, ds_name) + tile_part + '\n')

In [8]:
def ds_info(ds_path):
    """Derive the layout of an image sequence from the file names in a folder.

    Files are expected to be named ``<name>_[s<P>]t<T>[c<C>][m<M>].tif`` where
    the optional parts are the position (``s``), channel (``c``) and tile
    (``m``) indices. The counts are read from the suffix of the
    lexicographically last matching file.
    NOTE(review): this presumes indices are 1-based and zero-padded so that
    the last sorted name carries the maximum of every index - confirm.

    Args:
        ds_path: directory containing the ``.tif`` files.

    Returns:
        dict with the counts ``n_ps``, ``n_t``, ``n_ch``, ``n_tl``, the flags
        ``has_ps``, ``has_ch``, ``has_tl``, the printf-style index templates
        ``tmpl_ps``, ``tmpl_t``, ``tmpl_ch``, ``tmpl_tl`` and ``name_tmpl``,
        the part of the first matching file name before its last underscore.

    Raises:
        FileNotFoundError: no file with a ``<name>_<...t...>.tif`` name exists.
        ValueError: the suffix of the last file does not follow the pattern.
    """
    def index_template(letter, count):
        # Zero-padded to the number of digits of the largest index.
        return letter + '%0' + '%d' % len('%d' % count) + 'd'

    list_files = sorted(n for n in os.listdir(ds_path) if '.tif' in n)

    # Only formatted, timelapse files: the 't' marker has to be looked up in
    # the suffix after the last underscore - the full file name always
    # contains a 't' because of the '.tif' extension.
    file_names = []
    for fn in list_files:
        if '_' not in fn:
            continue
        stem = fn.replace('.tif', '')
        if 't' in stem.split('_')[-1]:
            file_names.append(stem)

    if not file_names:
        raise FileNotFoundError('files with name "<xxx>_<xx>t%d<xx>.tif" not found')

    name_tmpl = '_'.join(file_names[0].split('_')[:-1])
    last_sfx = file_names[-1].split('_')[-1]

    has_ch = 'c' in last_sfx
    has_tl = 'm' in last_sfx
    has_ps = 's' in last_sfx  # positions

    # Strip the leading markers so that res_sfx is '<n_t>[c<n_ch>][m<n_tl>]'.
    if has_ps:
        n_ps_s, res_sfx = last_sfx.replace('s', '').split('t')
        n_ps = int(n_ps_s)
    else:
        n_ps = 1
        res_sfx = last_sfx.replace('t', '')

    if has_ch:
        n_t_s, ch_tl_s = res_sfx.split('c')
        n_t = int(n_t_s)
        if has_tl:
            n_ch, n_tl = (int(s) for s in ch_tl_s.split('m'))
        else:
            n_ch, n_tl = int(ch_tl_s), 1
    else:
        n_ch = 1
        if has_tl:
            n_t, n_tl = (int(s) for s in res_sfx.split('m'))
        else:
            n_t, n_tl = int(res_sfx), 1

    return {'n_ps': n_ps,
            'n_t': n_t,
            'n_ch': n_ch,
            'n_tl': n_tl,
            'has_ps': has_ps,
            'has_ch': has_ch,
            'has_tl': has_tl,
            # The position template width follows the number of positions
            # (it was previously derived from the tile count).
            'tmpl_ps': index_template('s', n_ps),
            'tmpl_t': index_template('t', n_t),
            'tmpl_ch': index_template('c', n_ch),
            'tmpl_tl': index_template('m', n_tl),
            'name_tmpl': name_tmpl,
           }

def get_file_name(path, inf_dict, ps, t, ch, tile):
    """Build the path of one frame file from zero-based indices.

    The name is ``<name_tmpl>_[s..]t..[c..][m..].tif``; the position, channel
    and tile parts are emitted only when the matching ``has_*`` flag of
    ``inf_dict`` is set. Every index is written one-based (``index + 1``)
    using the corresponding ``tmpl_*`` template.
    """
    parts = []

    if inf_dict['has_ps']:
        parts.append(inf_dict['tmpl_ps'] % (ps + 1))

    # The time index is always part of the name.
    parts.append(inf_dict['tmpl_t'] % (t + 1))

    if inf_dict['has_ch']:
        parts.append(inf_dict['tmpl_ch'] % (ch + 1))
    if inf_dict['has_tl']:
        parts.append(inf_dict['tmpl_tl'] % (tile + 1))

    file_name = inf_dict['name_tmpl'] + '_' + ''.join(parts) + '.tif'
    return os.path.join(path, file_name)

In [16]:
def create_segmentation_datasets(datasets_path, datasets_names, start_ds_idx, time_subsample_min_t = 210):
    """Convert raw image sequences into numbered, single-tile datasets.

    Every entry of ``datasets_names`` yields one output dataset per
    (position, tile) combination of its source sequences. The frames of all
    sequences in the entry are concatenated along time and written to
    ``<datasets_path>/<idx as 3 digits>/<output name>/``.

    Args:
        datasets_path: root directory for the generated datasets.
        datasets_names: iterable of entries. An entry is either a path string
            (a single sequence without padding) or a list of
            ``(n_copy_before, ds_path, n_copy_after)`` triples that are joined
            in the given order.
        start_ds_idx: index given to the first dataset that is created.
        time_subsample_min_t: if the longest sequence of an entry has more
            frames than this, only every second frame of that entry is taken.

    Returns:
        start_ds_idx increased by the number of datasets that were created.

    Raises:
        AssertionError: the sequences of one entry differ in position,
            channel or tile layout.

    Side effects:
        * Source frames are MOVED out of ``ds_path``; only the padding frames
          are copies. Frames skipped by the subsampling stay in the source.
        * One line per dataset is appended to ``<datasets_path>/info``.
        * ``info.txt`` and ``block_info.txt`` are written into each numbered
          dataset directory.
        * Processing stops at the first entry for which a numbered directory
          already exists; the remaining entries are not processed.
    """
    for ds_names in datasets_names:
        if isinstance(ds_names, str):
            # A bare path is shorthand for one sequence without padding.
            ds_names = [(0, ds_names, 0)]

        # 1. get info for all
        copy_struct = []  # 1 element per ds: (ds_path, info, n_before, n_after)

        all_nt = []
        for item in ds_names:
            n_copy_before, ds_path, n_copy_after = item
            inf = ds_info(ds_path)
            copy_struct.append((ds_path, inf, n_copy_before, n_copy_after))
            all_nt.append(inf['n_t'])

        # The factor is chosen from the longest sequence and then applied to
        # every sequence of this entry.
        subsample_fact = 2 if max(all_nt)>time_subsample_min_t else 1

        # check validity: all must have same format
        # NOTE(review): assert statements are removed when Python runs with
        # -O, which would disable this check.
        for key in ['n_ps', 'n_ch', 'n_tl', 'has_ps', 'has_ch', 'has_tl']:
            el0 = copy_struct[0][1][key]
            for struct in copy_struct[1:]:
                assert(struct[1][key] == el0)

        n_t_out = 0  # num output timeframes: padding before/after + kept frames
        for struct in copy_struct:
            n_t_out += struct[2] + struct[3] + (struct[1]['n_t'] // subsample_fact)

        inf0 = copy_struct[0][1]
        n_ch = inf0['n_ch']

        n_ps_i = inf0['n_ps']
        n_tl_i = inf0['n_tl']

        # Positions and tiles of the input are flattened into one output
        # "tile" axis; each combination becomes its own dataset.
        n_tl = n_ps_i * n_tl_i

        has_ch = inf0['has_ch']
        has_tl = inf0['has_ps'] or inf0['has_tl']

        out_ds_name_general = '_'.join([struct[1]['name_tmpl'] for struct in copy_struct])

        tmpl_t = 't%0' + '%d' % len('%d' % n_t_out) + 'd'
        tmpl_ch = 'c%0' + '%d' % len('%d' % n_ch) + 'd'
        tmpl_tl = 'm%0' + '%d' % len('%d' % 1) + 'd'

        # Layout of the output files: no position or tile part in the name.
        # There is no 'tmpl_ps' key; get_file_name only reads it when
        # 'has_ps' is set.
        oinf = {'n_ps' : 1, 
                'n_t' : n_t_out, 
                'n_ch' :n_ch, 
                'n_tl' : 1, 
                'has_ps' :False,
                'has_ch' :has_ch, 
                'has_tl' :False,
                'tmpl_t' :tmpl_t,
                'tmpl_ch' :tmpl_ch,
                'tmpl_tl' :tmpl_tl,
                'name_tmpl':out_ds_name_general
               }

        # create dirs and fill info file
        tile_to_idx = {}   # flattened tile number -> dataset index
        tile_ds_name = {}  # flattened tile number -> dataset name


        creation_ok = True
        for tl in range(n_tl):
            idx = start_ds_idx + tl
            tile_to_idx[tl] = idx

            out_ds_name = out_ds_name_general + ('_tile%d' % (tl+1) if has_tl else '')
            tile_ds_name[tl] = out_ds_name

            if not create_ds_dir(idx, out_ds_name, datasets_path):
                print('dataset with idx', idx, 'already exists. please check manually. Aborting.')
                creation_ok = False
                break
        if not creation_ok:
            # NOTE(review): directories already created for earlier tiles of
            # this entry are left in place.
            break

        start_ds_idx += n_tl


        for tl in range(n_tl):
            idx = tile_to_idx[tl]
            out_ds_name = tile_ds_name[tl]
            make_record_info(idx, out_ds_name, datasets_path, (tl+1) if has_tl else None)

            ods_path = os.path.join(datasets_path, '%03d' % idx)
            with open(os.path.join(ods_path, 'info.txt'), 'wt') as f:
                f.write(out_ds_name)


        for ps_i in range(n_ps_i):
            for tl_i in range(n_tl_i):
                # Flattened tile number, position-major.
                tl = ps_i * n_tl_i + tl_i

                idx = tile_to_idx[tl]
                out_ds_name = tile_ds_name[tl]
                oinf['name_tmpl'] = out_ds_name

                ods_path = os.path.join(datasets_path, '%03d' % idx, out_ds_name)

                # [begin, end) output frame range of every source sequence,
                # padding frames included.
                block_boundaries = []
                for ch in range(n_ch):
                    t_o = 0  # running output frame index, restarted per channel
                    for struct in copy_struct:
                        in_path, inf, copy_before, copy_after = struct
                        n_t_i = inf['n_t']
                        # Leading padding: copies of the first source frame.
                        # This has to run before that frame is moved below.
                        for i in range(copy_before):
                            t_i = 0
                            i_file = get_file_name(in_path,   inf, ps_i, t_i, ch, tl_i)
                            o_file = get_file_name(ods_path, oinf,   -1, t_o, ch, 0)
                            shutil.copy(i_file, o_file)
                            t_o += 1

                        # The frames themselves are moved, not copied.
                        for t_i in range(n_t_i//subsample_fact):
                            i_file = get_file_name(in_path,   inf, ps_i, t_i*subsample_fact, ch, tl_i)
                            o_file = get_file_name(ods_path, oinf,   -1, t_o, ch, 0)
                            shutil.move(i_file, o_file)
                            t_o += 1

                        # Trailing padding: copies of the output frame that
                        # was written last (the position and tile arguments
                        # are ignored for the output layout).
                        for i in range(copy_after):
                            i_file = get_file_name(ods_path, oinf, ps_i, t_o-1, ch, tl_i)
                            o_file = get_file_name(ods_path, oinf,   -1, t_o,   ch, 0)
                            shutil.copy(i_file, o_file)
                            t_o += 1

                        # The boundaries are the same for every channel, so
                        # they are recorded during the first one only.
                        if ch==0:
                            begin = 0 if len(block_boundaries)==0 else block_boundaries[-1][1]
                            end = t_o
                            block_boundaries.append([begin, end])

                # Written as 'begin end|begin end|...'.
                block_info_path = os.path.join(datasets_path, '%03d' % idx, 'block_info.txt')
                with open(block_info_path, 'wt') as f:
                    txt = '|'.join([' '.join([str(bi) for bi in b]) for b in block_boundaries])
                    f.write(txt)

        # NOTE(review): this loop currently has no effect - the removal of
        # the source directories is commented out.
        for item in ds_names:
            n_copy_before, ds_path, n_copy_after = item
            #shutil.rmtree(ds_path)

    return start_ds_idx

# Datasets info

In [10]:
# Index given to the first generated dataset; later ones count up from here.
start_ds_idx = 0

In [11]:
# Folder that holds one sub-directory per raw dataset.
path = '../raw_data/yyyy.mm.dd'  # placeholder, replaced by the line below
path = r'g:\IVFCA\Dixy\run1\raw\bk'
path = os.path.abspath(path)

# One entry per sub-directory, each consisting of a single sequence
# [n_copy_before, dataset_dir, n_copy_after] without padding frames.
datasets_names = []
for p2 in sorted(os.listdir(path)):
    path3 = os.path.join(path, p2)
    datasets_names += [[[0, path3, 0]]]
        
# Multiple sequences can be joined, e.g. if different experimental parts are saved in different czi files. Padding frames can be added:
# datasets_names = [
#     [[0, 'path_part_1', 1],
#      [1, 'path_part_2', 0]]
# ]

In [12]:
# Echo the collected entries for a visual check before running.
datasets_names

[[[0, 'g:\\IVFCA\\Dixy\\run1\\raw\\bk\\FC2_Dotarem', 0]],
 [[0, 'g:\\IVFCA\\Dixy\\run1\\raw\\bk\\FC3_Untreated', 0]],
 [[0, 'g:\\IVFCA\\Dixy\\run1\\raw\\bk\\FC4_Dotarem', 0]],
 [[0, 'g:\\IVFCA\\Dixy\\run1\\raw\\bk\\FC8_Untreated', 0]]]

# Run

In [13]:
# Root directory that receives the generated datasets.
datasets_path = '../datasets_seg/'

In [17]:
# Run the conversion. The returned index is stored so that a later call
# continues numbering after the datasets created here.
start_ds_idx = create_segmentation_datasets(datasets_path, 
                                            datasets_names,
                                            start_ds_idx)