# Transform data from WormND .h5 and Stardist segmentation to file format after preprocessing by Atanas et al. (2024)
- Goal: create ROI, image, and label files organised as registration problems, i.e. pairs of frames (one moving, one fixed).
- Next step: run the centroid code from BrainAlignNet? Otherwise compute our own centroids; the difficulty is making sure the output format matches what BrainAlignNet expects.


In [None]:
import h5py
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import json
import os


dataset = 'EY'

# Create one output directory per dataset split; existing directories are
# left untouched.
for split_name in ('train', 'valid', 'test'):
    os.makedirs(f'data/{split_name}', exist_ok=True)


In [72]:
def pad_labels(labels, label_length):
    """Truncate or pad a coordinate array to exactly ``label_length`` rows.

    Args:
        labels: Array of shape (n, d) with one coordinate row per label.
            An empty array (e.g. the result of ``np.array([])`` when no
            labels were found) is accepted and treated as zero rows of
            width 3.
        label_length: Number of rows the returned array must have.

    Returns:
        An array with ``label_length`` rows. Surplus rows are dropped from
        the end; missing rows are filled with -1.
    """
    labels = np.asarray(labels)
    if labels.size == 0:
        # An empty 1-D array cannot be stacked with (L, 3) filler rows, so
        # give it an explicit zero-row, three-column shape first.
        labels = labels.reshape(0, 3)

    n_rows = labels.shape[0]
    if n_rows > label_length:
        return labels[:label_length]
    if n_rows < label_length:
        # Match the filler width to the data; fall back to 3 columns for
        # 1-D input, which np.vstack promotes to a single row.
        width = labels.shape[1] if labels.ndim == 2 else 3
        filler = np.full((label_length - n_rows, width), -1)
        return np.vstack([labels, filler])
    return labels


def make_centroids(segmentation):
    """Compute the centroid of every non-zero label in a label array.

    Args:
        segmentation: Integer label array; the value 0 is treated as
            background and skipped.

    Returns:
        Float array of shape (n_labels, segmentation.ndim). Row i holds the
        mean index along each axis of the voxels carrying the i-th smallest
        non-zero label. When no non-zero label is present the result has
        shape (0, segmentation.ndim).
    """
    segmentation = np.asarray(segmentation)

    # Index arrays (one per axis) of every non-background voxel.
    voxel_coords = np.nonzero(segmentation)
    if voxel_coords[0].size == 0:
        # Keep the result 2-D so callers can stack or pad it.
        return np.empty((0, segmentation.ndim))

    # Map each voxel to the rank of its label among the sorted unique labels,
    # so all labels are handled in one pass instead of one scan per label.
    _, label_index = np.unique(segmentation[voxel_coords], return_inverse=True)
    label_index = label_index.ravel()
    voxel_counts = np.bincount(label_index)

    # Per-axis coordinate sums divided by voxel counts give the mean position.
    centroids = np.column_stack([
        np.bincount(label_index, weights=axis_coords) / voxel_counts
        for axis_coords in voxel_coords
    ])
    return centroids

In [64]:
# Subject IDs assigned to each dataset split, stored under split['sub'].
split = {
    'sub': {
        'train': [
            'sub-20190924-01',
            'sub-20190924-02',
            'sub-20190924-03',
            'sub-20190925-04',
            'sub-20190928-05',
            'sub-20190928-08',
            'sub-20190928-13',
            'sub-20190929-02',
            'sub-20190929-03',
            'sub-20190929-05',
            'sub-20191030-03',
            'sub-20191104-08',
            'sub-20191104-10',
        ],
        'valid': [
            'sub-20190925-01',
            'sub-20190928-01',
            'sub-20190928-07',
            'sub-20190929-07',
        ],
        'test': [
            'sub-20190928-03',
            'sub-20190928-11',
            'sub-20190929-06',
            'sub-20191030-07',
        ],
    },
}

# Each subject ID has three '-'-separated parts; the middle part is used as
# the session string and stored at the top level of `split`, keyed by the
# full subject ID (so split['sub-20190924-01'] == '20190924').
for key, subjects in split['sub'].items():
    print(key)
    for subject in subjects:
        _, sess, _ = subject.split("-")
        split[subject] = sess

print(split)


train
valid
test
{'sub': {'train': ['sub-20190924-01', 'sub-20190924-02', 'sub-20190924-03', 'sub-20190925-04', 'sub-20190928-05', 'sub-20190928-08', 'sub-20190928-13', 'sub-20190929-02', 'sub-20190929-03', 'sub-20190929-05', 'sub-20191030-03', 'sub-20191104-08', 'sub-20191104-10'], 'valid': ['sub-20190925-01', 'sub-20190928-01', 'sub-20190928-07', 'sub-20190929-07'], 'test': ['sub-20190928-03', 'sub-20190928-11', 'sub-20190929-06', 'sub-20191030-07']}, 'sub-20190924-01': '20190924', 'sub-20190924-02': '20190924', 'sub-20190924-03': '20190924', 'sub-20190925-04': '20190925', 'sub-20190928-05': '20190928', 'sub-20190928-08': '20190928', 'sub-20190928-13': '20190928', 'sub-20190929-02': '20190929', 'sub-20190929-03': '20190929', 'sub-20190929-05': '20190929', 'sub-20191030-03': '20191030', 'sub-20191104-08': '20191104', 'sub-20191104-10': '20191104', 'sub-20190925-01': '20190925', 'sub-20190928-01': '20190928', 'sub-20190928-07': '20190928', 'sub-20190929-07': '20190929', 'sub-20190928-0

## Make registration_problems.json

In [66]:
# Build the registration problems: for every worm, each timestep is paired
# with a different, randomly chosen timestep of the same recording. Problems
# are stored as "<first>to<second>" strings, grouped per split and per worm.
new_reg = {'train': {}, 'valid': {}, 'test': {}}
pre_path = ''  # optional prefix prepended to data_dir, e.g. '.../'
data_dir = 'EY_Out'

for mode in new_reg:
    print(f'mode: {mode}')
    for worm in split['sub'][mode]:
        print(worm)
        whole_string = f'{data_dir}/{worm}_ses-{split[worm]}'
        print(whole_string)

        # Only the number of timesteps is needed here, so read the dataset's
        # shape instead of loading the whole recording, and close the file
        # as soon as it has been read.
        filename = f"{pre_path}{whole_string}_ophys.h5"
        with h5py.File(filename, 'r') as h5:
            T = h5["calcium_image"].shape[0]

        if T == 1:
            # A single timestep cannot be paired with a different one; the
            # rejection loop below would never terminate.
            raise ValueError(
                f"{filename} has a single timestep; cannot build registration pairs"
            )

        all_timesteps = np.arange(T)

        # Draw random permutations until no timestep is matched with itself.
        match_timesteps = np.random.choice(all_timesteps, T, replace=False)
        while np.any(all_timesteps == match_timesteps):
            match_timesteps = np.random.choice(all_timesteps, size=T, replace=False)

        new_reg[mode][whole_string] = [
            f"{first}to{second}"
            for first, second in zip(all_timesteps, match_timesteps)
        ]
    print(new_reg)


with open('data/registration_problems.json', 'w') as f:
    json.dump(new_reg, f)

# Read the file back so later cells work from exactly what was written.
with open('data/registration_problems.json', 'r') as file:
    reg_ours = json.load(file)
     

mode: train
sub-20190924-01
EY_Out/sub-20190924-01_ses-20190924
sub-20190924-02
EY_Out/sub-20190924-02_ses-20190924
sub-20190924-03
EY_Out/sub-20190924-03_ses-20190924
sub-20190925-04
EY_Out/sub-20190925-04_ses-20190925
sub-20190928-05
EY_Out/sub-20190928-05_ses-20190928
sub-20190928-08
EY_Out/sub-20190928-08_ses-20190928
sub-20190928-13
EY_Out/sub-20190928-13_ses-20190928
sub-20190929-02
EY_Out/sub-20190929-02_ses-20190929
sub-20190929-03
EY_Out/sub-20190929-03_ses-20190929
sub-20190929-05
EY_Out/sub-20190929-05_ses-20190929
sub-20191030-03
EY_Out/sub-20191030-03_ses-20191030
sub-20191104-08
EY_Out/sub-20191104-08_ses-20191104
sub-20191104-10
EY_Out/sub-20191104-10_ses-20191104
{'train': {'EY_Out/sub-20190924-01_ses-20190924': ['0to760', '1to12', '2to948', '3to292', '4to921', '5to41', '6to312', '7to958', '8to851', '9to398', '10to607', '11to531', '12to127', '13to842', '14to439', '15to702', '16to249', '17to201', '18to295', '19to566', '20to296', '21to381', '22to572', '23to490', '24to180'

# Make image, ROI, and label (centroid) files

In [81]:
import os
from contextlib import ExitStack

# For every worm, write six HDF5 files (fixed/moving x images/rois/labels)
# into data/<mode>/<worm>/. Each file holds one dataset per registration
# problem, named by the problem string "<moving>to<fixed>".
label_length = 400  # every label array is truncated/padded to this many rows
for mode in reg_ours:
    print(f'mode: {mode}')
    for worm in split['sub'][mode]:
        print(worm)
        out_dir = f'data/{mode}/{worm}'
        os.makedirs(out_dir, exist_ok=True)
        whole_string = f'{data_dir}/{worm}_ses-{split[worm]}'

        filename = f"{pre_path}{whole_string}_ophys.h5"
        filename2 = f"{pre_path}{whole_string}_ophys_stardist.npy"

        # Load the inputs first and close the source file right away, so a
        # missing or unreadable input does not leave truncated output files.
        with h5py.File(filename, 'r') as h5:
            # NOTE(review): target_labels is not used anywhere in this cell;
            # confirm whether it is still needed.
            target_labels = np.array(h5["calcium_segmentation"])
            image_data = np.array(h5["calcium_image"])
        seg = np.load(filename2)

        print(reg_ours)
        print(worm)
        print(reg_ours[mode][whole_string])

        # ExitStack closes every output file even if a write fails part-way.
        with ExitStack() as stack:
            # images
            f_fixed = stack.enter_context(h5py.File(f'{out_dir}/fixed_images.h5', 'w'))
            f_moving = stack.enter_context(h5py.File(f'{out_dir}/moving_images.h5', 'w'))
            # rois
            f_fixed_roi = stack.enter_context(h5py.File(f'{out_dir}/fixed_rois.h5', 'w'))
            f_moving_roi = stack.enter_context(h5py.File(f'{out_dir}/moving_rois.h5', 'w'))
            # labels
            f_fixed_l = stack.enter_context(h5py.File(f'{out_dir}/fixed_labels.h5', 'w'))
            f_moving_l = stack.enter_context(h5py.File(f'{out_dir}/moving_labels.h5', 'w'))

            for reg_problem in reg_ours[mode][whole_string]:
                print(reg_problem)

                # "<moving>to<fixed>" -> two integer timestep indices
                mov, fix = (int(part) for part in reg_problem.split("to"))

                # Images: one frame per timestep.
                moving_img = image_data[mov, :, :, :]
                fixed_img = image_data[fix, :, :, :]
                f_fixed.create_dataset(reg_problem, data=fixed_img)
                f_moving.create_dataset(reg_problem, data=moving_img)

                # ROIs: .T reverses the axis order of the segmentation frame.
                # Per the original note this turns (21, 128, 256) into
                # (256, 128, 21), presumably the layout of the image frames
                # - TODO confirm.
                moving_roi = seg[mov, :, :, :].T
                fixed_roi = seg[fix, :, :, :].T
                f_fixed_roi.create_dataset(reg_problem, data=fixed_roi)
                f_moving_roi.create_dataset(reg_problem, data=moving_roi)

                # Labels: ROI centroids, truncated/padded to label_length rows.
                moving_l = pad_labels(make_centroids(moving_roi), label_length)
                fixed_l = pad_labels(make_centroids(fixed_roi), label_length)
                f_fixed_l.create_dataset(reg_problem, data=fixed_l)
                f_moving_l.create_dataset(reg_problem, data=moving_l)

mode: train
sub-20190924-01
{'train': {'EY_Out/sub-20190924-01_ses-20190924': ['0to760', '1to12', '2to948', '3to292', '4to921', '5to41', '6to312', '7to958', '8to851', '9to398', '10to607', '11to531', '12to127', '13to842', '14to439', '15to702', '16to249', '17to201', '18to295', '19to566', '20to296', '21to381', '22to572', '23to490', '24to180', '25to254', '26to676', '27to355', '28to712', '29to347', '30to935', '31to577', '32to458', '33to378', '34to92', '35to515', '36to193', '37to950', '38to480', '39to350', '40to252', '41to592', '42to388', '43to14', '44to873', '45to495', '46to218', '47to626', '48to104', '49to823', '50to891', '51to405', '52to898', '53to435', '54to742', '55to275', '56to289', '57to479', '58to230', '59to791', '60to393', '61to718', '62to691', '63to78', '64to949', '65to543', '66to461', '67to542', '68to508', '69to766', '70to880', '71to449', '72to534', '73to385', '74to367', '75to279', '76to54', '77to595', '78to902', '79to454', '80to560', '81to228', '82to827', '83to357', '84to259', '8