In [1]:
#import cudf  (use if you are on linux with a recent Nvidia GPU)
import pandas as pd
import numpy as np
from pathlib import Path

### Define local data folder paths 

Here we use the Flywire reconstruction (v783) that can be downloaded from [https://codex.flywire.ai/api/download](https://codex.flywire.ai/api/download)

In [2]:
data_folder = Path.cwd() / "data" / "FLY"
meta_folder = data_folder / "meta"
plot_folder = data_folder / "visualization"
plot_folder.mkdir(parents=True, exist_ok=True)

synapses_file = data_folder / "synapses_flat_v783.csv"


# Select overlap volume size
# grid_size =  25000 
selection_descriptor = f"optical-lobes"

synapses_file_curated = data_folder / f"synapses_flat_{selection_descriptor}.csv"

### Select neurons and cell types of interest  

Load cell types meta information into dictionaries

In [3]:
df_celltypes = pd.read_csv(meta_folder/"cell_types.csv")
df_celltypes['root_id'] = df_celltypes['root_id'].astype(str)
celltypes_rootid_typeid = df_celltypes.set_index('root_id')['primary_type_id'].to_dict()
celltypes_typeid_name = df_celltypes.set_index('primary_type_id')['primary_type'].to_dict()
celltypes_name_typeid = df_celltypes.set_index('primary_type')['primary_type_id'].to_dict()

Here we select some neurons and the respective cell types from an optical lobe column

In [4]:
df_optical_columns = pd.read_csv(meta_folder/"column_assignment.csv")

map_rootid_columnid = df_optical_columns.set_index('root_id')['column_id'].to_dict()
enabled_ids = set([str(x) for x in sorted(df_optical_columns.root_id.values.tolist())])

Write the selected cells, their zero-index-based neuron IDs, and their cell types to a separate dataframe

In [5]:
sel_id = []
sel_mapped_id = []
sel_celltype = []
sel_celltype_name = []
sel_columns = []
for mapped_id, orirignal_id in enumerate(sorted(enabled_ids)):
    sel_id.append(orirignal_id)
    sel_mapped_id.append(mapped_id)
    if(orirignal_id in celltypes_rootid_typeid):
        celltype_id = celltypes_rootid_typeid[orirignal_id]
        sel_celltype.append(celltype_id)
        sel_celltype_name.append(celltypes_typeid_name[celltype_id])
        sel_columns.append(map_rootid_columnid[int(orirignal_id)])
    else:
        sel_celltype.append(-1)
        sel_celltype_name.append("-1")
        sel_columns.append(-1)

df_selected = pd.DataFrame({'root_id': sel_id, 'mapped_id': sel_mapped_id, 'column_id': sel_columns, 'celltype_id': sel_celltype, 'celltype_name': sel_celltype_name})
df_selected.to_csv(meta_folder/f"selected_neurons_{selection_descriptor}.csv", index=False)

In [8]:
df_selected.head()

Unnamed: 0,root_id,mapped_id,column_id,celltype_id,celltype_name
0,720575940596125868,0,93,0,T5c
1,720575940599333574,1,355,4,Tm1
2,720575940599457990,2,247,5,T4b
3,720575940599459782,3,513,6,T5b
4,720575940599704006,4,331,7,T5a


In [9]:
id_mapping = df_selected.set_index('root_id')["mapped_id"].to_dict()
id_column = df_selected.set_index('root_id')["column_id"].to_dict()

Read the raw synapses file, attach meta information (e.g. type labels), set zero-index-based neuron IDs for selected neurons, and assign missing data to -1.  

In [10]:
from collections import defaultdict

def parse_synapse_table(unzipped_file_path):
    pre_to_rows, post_to_rows = defaultdict(list), defaultdict(list)
    with open(unzipped_file_path, 'r') as file:
        next(file)  # Skip header row
        pre_id, post_id = None, None
        for line in file:
            parts = line.strip().split(',')
            pre_id = int(parts[0]) if parts[0] else pre_id
            post_id = int(parts[1]) if parts[1] else post_id
            x, y, z = map(int, parts[2:5])
            row = [pre_id, post_id, x, y, z]
            pre_to_rows[pre_id].append(row)
            post_to_rows[post_id].append(row)
    return pre_to_rows, post_to_rows

In [11]:
pre_to_rows, post_to_rows = parse_synapse_table(synapses_file)

In [13]:
pre_to_rows[720575940607861443]

[[720575940607861443, 720575940605899692, 333656, 228150, 226300],
 [720575940607861443, 720575940605899692, 335348, 228290, 227340],
 [720575940607861443, 720575940605899692, 343204, 221438, 232680],
 [720575940607861443, 720575940605899692, 343608, 228886, 231940],
 [720575940607861443, 720575940605899692, 346264, 215008, 238920],
 [720575940607861443, 720575940605899692, 353564, 220146, 239260],
 [720575940607861443, 720575940606354633, 380338, 209320, 151200],
 [720575940607861443, 720575940606354633, 391264, 214998, 157760],
 [720575940607861443, 720575940606354633, 396440, 214996, 160840],
 [720575940607861443, 720575940606354633, 405056, 206798, 166940],
 [720575940607861443, 720575940606354633, 406072, 210956, 156080],
 [720575940607861443, 720575940606354633, 406716, 211798, 158280],
 [720575940607861443, 720575940606354633, 406950, 205266, 163820],
 [720575940607861443, 720575940606354633, 412502, 206948, 162740],
 [720575940607861443, 720575940606354633, 416028, 210472, 1563

In [33]:
def curate_annotate_synapses(filename_out):

    def get_celltype(root_id):
        if(root_id in celltypes_rootid_typeid):
            celltype_id = celltypes_rootid_typeid[root_id]
            return str(celltype_id)
        else:
            return "-1"
        
    def get_column_id(pre_root_id, post_root_id):
        if(post_root_id in map_rootid_columnid):
            return str(map_rootid_columnid[post_root_id])
        elif(pre_root_id in map_rootid_columnid):
            return str(map_rootid_columnid[pre_root_id])
        else:
            return "-1"	
        
    def get_mapped_id(root_id):
        if(root_id in id_mapping):
            return str(id_mapping[root_id])
        else:
            return "-1"

    def get_row_str(props):
        pre_id = props[0]
        post_id = props[1]
        pre_id_str = str(pre_id)
        post_id_str = str(post_id)
        
        pre_id_mapped = get_mapped_id(pre_id_str)
        post_id_mapped = get_mapped_id(post_id_str)

        x = str(props[2])
        y = str(props[3])
        z = str(props[4])

        pre_celltype = get_celltype(str(pre_id))
        post_celltype = get_celltype(str(post_id))

        column_id = get_column_id(pre_id, post_id)
        
        return ",".join([x, y, z, pre_id_mapped, post_id_mapped, pre_celltype, post_celltype, column_id]) + "\n"
        

    rows = []

    for pre_id, props in pre_to_rows.items():
        pre_id_str = str(pre_id)
        if pre_id_str in id_mapping:
            for prop in props:
                rows.append(get_row_str(prop))

    for post_id, props in post_to_rows.items():
        post_id_str = str(post_id)
        if post_id_str in id_mapping:
            for prop in props:
                rows.append(get_row_str(prop))

    
        
    with open(filename_out, "w") as f_out:
        f_out.write("x,y,z,pre_id_mapped,post_id_mapped,pre_celltype,post_celltype,column_id\n")

        for row in rows:       
            f_out.write(row)
            

In [34]:
curate_annotate_synapses(synapses_file_curated)

### Apply local overlap volumes 

In [35]:
synapses_df = pd.read_csv(synapses_file_curated, dtype={
    "column_id": "uint32",    
    "pre_id_mapped": "int32", 
    "post_id_mapped": "int32",
    "pre_celltype": "int32", 
    "post_celltype": "int32",
})
synapses_df

Unnamed: 0,x,y,z,pre_id_mapped,post_id_mapped,pre_celltype,post_celltype,column_id
0,705988,275676,221800,0,-1,0,500,93
1,706908,275084,222160,0,-1,0,500,93
2,707848,275456,222100,0,-1,0,500,93
3,708050,275176,222200,0,-1,0,500,93
4,711344,267664,223360,0,-1,0,500,93
...,...,...,...,...,...,...,...,...
6854234,828462,280700,122820,22683,10471,739,636,694
6854235,828706,280740,122420,22683,10471,739,636,694
6854236,828794,280706,122300,22683,10471,739,636,694
6854237,842292,283330,145920,22683,10471,739,636,694


In [36]:
synapses_df.rename(columns={'column_id': 'overlap_volume'}, inplace=True)

In [None]:
df_synapses_aggregated = synapses_df.groupby(["overlap_volume", 
                                                "pre_id_mapped", 
                                                "post_id_mapped",
                                                "pre_celltype", 
                                                "post_celltype"]).size().reset_index(name="synapse_count")

df_synapses_aggregated.to_csv(data_folder/f"synapses_{selection_descriptor}_aggregated.csv", index=False) # replaced df_synapses_aggregated.to_pandas().to_csv
print(df_synapses_aggregated.synapse_count.sum())
df_synapses_aggregated

6854239


Unnamed: 0,overlap_volume,pre_id_mapped,post_id_mapped,pre_celltype,post_celltype,synapse_count
0,1,-1,36,131,77,11
1,1,-1,36,444,77,11
2,1,-1,36,747,77,34
3,1,-1,36,864,77,22
4,1,-1,36,7332,77,11
...,...,...,...,...,...,...
362152,796,19612,-1,9,8384,5
362153,796,20501,13071,4,20,18
362154,796,21343,5880,12,248,22
362155,796,21343,8891,12,102,10


In [46]:
df_synapses_aggregated["overlap_volume"].unique()

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
       157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
       170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 18