In [1]:
#import cudf
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# Directory layout: everything lives under ./data/FLY relative to the working directory.
data_folder = Path.cwd().joinpath("data", "FLY")
meta_folder = data_folder.joinpath("meta")
plot_folder = data_folder.joinpath("visualization")
# The visualization folder is an output location, so create it on demand.
plot_folder.mkdir(parents=True, exist_ok=True)

# Raw synapse table, the curated/annotated copy derived from it, and soma metadata.
synapses_file = data_folder.joinpath("synapses_flat_v783.csv")
synapses_file_curated = data_folder.joinpath("synapses_flat_v783_ext.csv")
soma_file = meta_folder.joinpath("soma.csv")

# Value handed to Gridder.computeGrid; 50000 was the other setting tried here.
grid_size = 25000
# Tag embedded in the names of all grid-dependent output files.
grid_size_descriptor = f"grid-{grid_size}"

In [3]:
# Cell-type annotations; root ids are converted to strings so they can be
# compared with the string fields read from the raw synapse CSV.
df_celltypes = pd.read_csv(meta_folder / "cell_types.csv").assign(
    root_id=lambda frame: frame["root_id"].astype(str)
)

# Lookup tables: root id -> type id, type id -> type name, type name -> type id.
celltypes_rootid_typeid = dict(zip(df_celltypes["root_id"], df_celltypes["primary_type_id"]))
celltypes_typeid_name = dict(zip(df_celltypes["primary_type_id"], df_celltypes["primary_type"]))
celltypes_name_typeid = dict(zip(df_celltypes["primary_type"], df_celltypes["primary_type_id"]))

In [4]:
# Restrict the analysis to the neurons assigned to the chosen optical column(s).
df_optical_columns = pd.read_csv(meta_folder / "column_assignment.csv")
column_ids = [628]
in_selected_column = df_optical_columns["column_id"].isin(column_ids)
df_optical_selected = df_optical_columns[in_selected_column]

# Cell-type names and (stringified) root ids present in the selected column(s).
enabled_celltypes = set(df_optical_selected["type"].tolist())
enabled_ids = {str(root_id) for root_id in df_optical_selected["root_id"].tolist()}

In [5]:
# enabled_ids

In [6]:
# Give every selected neuron a dense index (its rank among the sorted root ids)
# and attach its primary cell type; neurons without an annotation get -1 / "-1".
sel_id = sorted(enabled_ids)
sel_mapped_id = list(range(len(sel_id)))

sel_celltype = []
sel_celltype_name = []
for root_id in sel_id:
    is_annotated = root_id in celltypes_rootid_typeid
    type_id = celltypes_rootid_typeid[root_id] if is_annotated else -1
    sel_celltype.append(type_id)
    sel_celltype_name.append(celltypes_typeid_name[type_id] if is_annotated else "-1")

df_selected = pd.DataFrame(
    {
        'root_id': sel_id,
        'mapped_id': sel_mapped_id,
        'celltype_id': sel_celltype,
        'celltype_name': sel_celltype_name,
    }
)
df_selected.to_csv(meta_folder / "selected_neurons.csv", index=False)

In [7]:
# Show the selected-neuron table: root id, dense mapped id, cell type id and name.
df_selected

Unnamed: 0,root_id,mapped_id,celltype_id,celltype_name
0,720575940608668867,0,7,T5a
1,720575940617634772,1,11,Tm9
2,720575940620584319,2,9,T4a
3,720575940621153306,3,37,Tm3
4,720575940621736205,4,20,T5d
5,720575940622690954,5,18,Mi9
6,720575940623683854,6,107,Tm4
7,720575940623862676,7,26,T2a
8,720575940624347978,8,38,T3
9,720575940625204226,9,6,T5b


In [8]:
# Root id (string) -> dense mapped id of each selected neuron.
id_mapping = dict(zip(df_selected["root_id"], df_selected["mapped_id"]))
#id_mapping

In [9]:
def curate_annotate_synapses(filename_in, filename_out, enabled_ct_set, id_mapping,
                             rootid_typeid=None, typeid_name=None):
    """Rewrite the raw synapse CSV with mapped neuron ids and cell-type ids.

    The input is streamed line by line. Its first line is treated as a header
    and discarded; every following line is split on "," and read as
    ``pre_root_id, post_root_id, x, y, z`` (any further fields are ignored).
    The output always starts with the header
    ``x,y,z,pre_id_mapped,post_id_mapped,pre_celltype,post_celltype``.

    Parameters
    ----------
    filename_in : path-like
        Raw synapse CSV to read.
    filename_out : path-like
        Destination CSV; overwritten if it exists.
    enabled_ct_set : set
        Cell-type *names* to keep. A neuron's cell-type id is written only if
        its type name is in this set; otherwise "-1" is written.
    id_mapping : dict
        Root id (string, as it appears in the input) -> mapped id. Root ids
        that are empty or not in the mapping are written as "-1".
    rootid_typeid : dict, optional
        Root id (string) -> cell-type id. Defaults to the notebook-level
        ``celltypes_rootid_typeid``.
    typeid_name : dict, optional
        Cell-type id -> cell-type name. Defaults to the notebook-level
        ``celltypes_typeid_name``.

    Raises
    ------
    IndexError
        If a non-blank data line has fewer than five comma-separated fields.

    Notes
    -----
    The cell-type columns depend only on the neuron's type, not on whether the
    neuron itself is in ``id_mapping``.
    """
    if rootid_typeid is None:
        rootid_typeid = celltypes_rootid_typeid
    if typeid_name is None:
        typeid_name = celltypes_typeid_name

    # Build the per-neuron answers once, so each row needs only one dict
    # lookup per field instead of repeated lookups and str() conversions.
    mapped_id_by_root = {
        root_id: str(mapped_id)
        for root_id, mapped_id in id_mapping.items()
        if root_id != ""  # an empty id field is always reported as unmapped
    }
    celltype_by_root = {
        root_id: str(type_id)
        for root_id, type_id in rootid_typeid.items()
        if typeid_name.get(type_id) in enabled_ct_set
    }

    with open(filename_in, "r") as f_in, open(filename_out, "w") as f_out:
        # Write the header unconditionally so that an empty input still yields
        # a loadable (header-only) CSV.
        f_out.write("x,y,z,pre_id_mapped,post_id_mapped,pre_celltype,post_celltype\n")
        next(f_in, None)  # discard the input header, if there is one

        for line_number, line in enumerate(f_in, start=2):
            line = line.rstrip()
            if not line:
                # Blank lines (e.g. a trailing one at end of file) carry no synapse.
                continue

            parts = line.split(",")
            if len(parts) < 5:
                raise IndexError(
                    f"{filename_in}, line {line_number}: expected at least 5 fields, got {len(parts)}"
                )
            pre_root, post_root, x, y, z = parts[:5]

            parts_new = [
                x, y, z,
                mapped_id_by_root.get(pre_root, "-1"),
                mapped_id_by_root.get(post_root, "-1"),
                celltype_by_root.get(pre_root, "-1"),
                celltype_by_root.get(post_root, "-1"),
            ]
            f_out.write(",".join(parts_new) + "\n")

In [10]:
# Stream the raw synapse table into its curated, annotated counterpart.
curate_annotate_synapses(
    filename_in=synapses_file,
    filename_out=synapses_file_curated,
    enabled_ct_set=enabled_celltypes,
    id_mapping=id_mapping,
)

In [11]:
# Load the curated synapse table with compact dtypes. The dtype keys must match
# the header written by curate_annotate_synapses: the id columns are named
# "pre_id_mapped"/"post_id_mapped". The previous keys "pre_id"/"post_id" match
# no column in that file, so the intended int32 was never applied to them.
synapses_df = pd.read_csv(synapses_file_curated, dtype={
    "x": "uint32",
    "y": "uint32",
    "z": "uint32",
    "pre_id_mapped": "int32",
    "post_id_mapped": "int32",
    "pre_celltype": "int32",
    "post_celltype": "int32",
})
synapses_df

Unnamed: 0,x,y,z,pre_id_mapped,post_id_mapped,pre_celltype,post_celltype
0,705988,275676,221800,-1,-1,0,-1
1,706908,275084,222160,-1,-1,-1,-1
2,707848,275456,222100,-1,-1,-1,-1
3,708050,275176,222200,-1,-1,-1,-1
4,711344,267664,223360,-1,-1,-1,-1
...,...,...,...,...,...,...,...
34156315,436160,163856,81040,-1,-1,-1,-1
34156316,436370,165082,78360,-1,-1,-1,-1
34156317,438644,167636,77860,-1,-1,-1,-1
34156318,442126,167062,78700,-1,-1,-1,-1


In [12]:
# Disabled: the mapped-id columns used to be pre-created here; the curated CSV
# now provides them directly.
# synapses_df["pre_id_mapped"] = pd.Series(-1 * np.ones(len(synapses_df)), dtype='int16')  # replaced cudf.Series
# synapses_df["post_id_mapped"] = pd.Series(-1 * np.ones(len(synapses_df)), dtype='int16') # replaced cudf.Series

# Zero-filled placeholder column (pandas Series; this used to be a cudf.Series).
n_synapses = len(synapses_df)
synapses_df["overlap_volume"] = pd.Series(np.zeros(n_synapses, dtype='uint64'))

In [13]:
from lib.gridder import Gridder

# Hand all synapse positions to the project's Gridder and store the index it
# returns for each synapse in the "overlap_volume" column.
# NOTE(review): presumably the index identifies the grid cell containing the
# synapse - confirm against lib.gridder.
xyz = synapses_df.loc[:, ["x", "y", "z"]].to_numpy()

gridder = Gridder()
gridder.setPositions(xyz)
indices, df_grid_meta = gridder.computeGrid(grid_size)

synapses_df["overlap_volume"] = indices.astype(np.uint64)

In [14]:
# Save the grid metadata returned by the Gridder next to the synapse data.
grid_meta_file = data_folder / f"{grid_size_descriptor}_meta.csv"
df_grid_meta.to_csv(grid_meta_file, index=False)

In [15]:
# Export a 5% random subsample for plotting. The fixed random_state makes the
# exported file identical across re-runs instead of changing on every execution.
# Note: a later cell writes a sample of df_synapses_range to this same path.
synapses_df.sample(frac=0.05, random_state=0).to_csv(
    plot_folder / f"synapses_{grid_size_descriptor}_5pct.csv", index=False
)

In [16]:
#for neuron_id, mapped_id in mapping_dict.items():
#    synapses_df.loc[synapses_df.pre_id == neuron_id, "pre_id_mapped"] = mapped_id
#    synapses_df.loc[synapses_df.post_id == neuron_id, "post_id_mapped"] = mapped_id

In [17]:
# Synapses in which at least one partner (pre or post) is a selected neuron,
# i.e. has a mapped id other than -1.
pre_is_mapped = synapses_df["pre_id_mapped"].gt(-1)
post_is_mapped = synapses_df["post_id_mapped"].gt(-1)
synapses_mapped_neurons = synapses_df[pre_is_mapped | post_is_mapped]

mapped_neurons_file = plot_folder / f"synapses_{grid_size_descriptor}_mapped-neurons.csv"
synapses_mapped_neurons.to_csv(mapped_neurons_file, index=False)
synapses_mapped_neurons

Unnamed: 0,x,y,z,pre_id_mapped,post_id_mapped,pre_celltype,post_celltype,overlap_volume
361865,796132,289728,147420,-1,26,-1,69,28933
593767,851798,298866,115980,-1,28,-1,12,31012
772237,717138,270396,181400,-1,7,-1,26,24839
1521040,737020,276876,203120,0,-1,7,-1,25863
1552520,762546,294372,172380,-1,2,-1,9,26918
...,...,...,...,...,...,...,...,...
33944981,810508,286950,142000,-1,6,-1,107,28933
33944992,801692,286770,149960,-1,11,-1,68,28933
33944999,800776,285756,148800,-1,12,-1,102,28933
34142181,809168,288128,141580,-1,14,-1,248,28933


In [18]:
# Synapses in which both partners are selected neurons (both mapped ids > -1).
both_partners_mapped = synapses_df["pre_id_mapped"].gt(-1) & synapses_df["post_id_mapped"].gt(-1)
synapses_btw_mapped_neurons = synapses_df[both_partners_mapped]

between_mapped_file = plot_folder / f"synapses_{grid_size_descriptor}_between-mapped-neurons.csv"
synapses_btw_mapped_neurons.to_csv(between_mapped_file, index=False)
synapses_btw_mapped_neurons

Unnamed: 0,x,y,z,pre_id_mapped,post_id_mapped,pre_celltype,post_celltype,overlap_volume
7820955,725972,275420,188560,1,0,11,7,25863
18805633,724398,274400,187180,12,0,102,7,25863


In [19]:
# Distinct overlap_volume values that contain a synapse of a selected neuron.
selected_overlap_volumes = synapses_mapped_neurons["overlap_volume"].unique()
len(selected_overlap_volumes)

17

In [20]:
# Toggle: when enabled, keep only synapses whose overlap_volume also contains a
# synapse of a selected neuron; when disabled, keep every synapse.
range_filter_enabled = False

if(range_filter_enabled):

    # Padded x/y bounds around the selected neurons' synapses. They are only
    # used by the coordinate conditions that are commented out in the filter.
    offset = 5000

    x_min = synapses_mapped_neurons.x.min() - offset
    y_min = synapses_mapped_neurons.y.min() - offset
    #z_min = synapses_mapped_neurons.z.min() - offset

    x_max = synapses_mapped_neurons.x.max() + offset
    y_max = synapses_mapped_neurons.y.max() + offset
    #z_max = synapses_mapped_neurons.z.max() + offset

    # reset_index returns a new frame rather than modifying in place, so it is
    # chained here; calling it as a bare statement discarded the result.
    df_synapses_range = synapses_df[
        #(synapses_df.x >= x_min) & (synapses_df.x <= x_max) &
        #(synapses_df.y >= y_min) & (synapses_df.y <= y_max) &
        synapses_df.overlap_volume.isin(selected_overlap_volumes)
        #(synapses_df.z >= z_min) & (synapses_df.z <= z_max) 
    ].reset_index(drop=True)

else:

    # Independent copy, so later edits to one frame do not affect the other.
    df_synapses_range = synapses_df.copy()

In [21]:
# Persist the (optionally range-filtered) synapse table in full.
# (Plain pandas export; this used to go through cudf's .to_pandas().)
df_synapses_range.to_csv(data_folder / f"synapses_{grid_size_descriptor}.csv", index=False)

# Also export a 5% random subsample for plotting. The fixed random_state keeps
# the exported file identical across re-runs instead of changing every time.
df_synapses_range.sample(frac=0.05, random_state=0).to_csv(
    plot_folder / f"synapses_{grid_size_descriptor}_5pct.csv", index=False
)

In [22]:
# Show the (optionally range-filtered) synapse table, including overlap_volume.
df_synapses_range

Unnamed: 0,x,y,z,pre_id_mapped,post_id_mapped,pre_celltype,post_celltype,overlap_volume
0,705988,275676,221800,-1,-1,0,-1,24840
1,706908,275084,222160,-1,-1,-1,-1,24840
2,707848,275456,222100,-1,-1,-1,-1,24840
3,708050,275176,222200,-1,-1,-1,-1,24840
4,711344,267664,223360,-1,-1,-1,-1,24840
...,...,...,...,...,...,...,...,...
34156315,436160,163856,81040,-1,-1,-1,-1,13411
34156316,436370,165082,78360,-1,-1,-1,-1,13411
34156317,438644,167636,77860,-1,-1,-1,-1,13442
34156318,442126,167062,78700,-1,-1,-1,-1,13443


In [23]:
# Count synapses per (overlap volume, pre/post mapped id, pre/post cell type).
aggregation_keys = [
    "overlap_volume",
    "pre_id_mapped",
    "post_id_mapped",
    "pre_celltype",
    "post_celltype",
]
synapse_counts = df_synapses_range.groupby(aggregation_keys).size()
df_synapses_aggregated = synapse_counts.reset_index(name="synapse_count")

# Plain pandas export (formerly via cudf's .to_pandas()).
aggregated_file = data_folder / f"synapses_{grid_size_descriptor}_aggregated.csv"
df_synapses_aggregated.to_csv(aggregated_file, index=False)

# Total of all counts, printed for comparison with the input row count.
print(df_synapses_aggregated["synapse_count"].sum())
df_synapses_aggregated

34156320


Unnamed: 0,overlap_volume,pre_id_mapped,post_id_mapped,pre_celltype,post_celltype,synapse_count
0,294,-1,-1,-1,-1,10
1,294,-1,-1,-1,12,2
2,294,-1,-1,-1,60,2
3,295,-1,-1,-1,-1,49
4,295,-1,-1,-1,12,16
...,...,...,...,...,...,...
25233,32101,-1,-1,-1,199,1
25234,32101,-1,-1,-1,248,12
25235,32101,-1,-1,-1,356,6
25236,32102,-1,-1,-1,-1,83
