In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import os
from tqdm.notebook import tqdm
import networkx as nx

# data
data_folder = '../../data/FlyWire/'
files = os.listdir(data_folder)
# Load every gzip-compressed CSV in data_folder into a notebook-level variable
# named after the file (the text before the first '.', e.g. 'neurons.csv.gz'
# -> neurons). The binding goes through globals() rather than exec() on a
# generated source string, so backslashes or quotes in the path cannot corrupt
# the statement and no text derived from file names is ever executed.
for f in files:
    if f.endswith('.csv.gz'):
        file_name = f.split('.')[0]
        if not file_name.isidentifier():
            # Such a name could never be referenced as a variable afterwards.
            raise ValueError('cannot use ' + repr(file_name) + ' as a variable name (from file ' + repr(f) + ')')
        file_path = os.path.join(data_folder, f)
        globals()[file_name] = pd.read_csv(file_path)
        # Echo the equivalent load statement, as the cell always has.
        print(file_name + "= pd.read_csv('" + file_path + "')")

connectivity_tags= pd.read_csv('../../data/FlyWire/connectivity_tags.csv.gz')
consolidated_cell_types= pd.read_csv('../../data/FlyWire/consolidated_cell_types.csv.gz')
coordinates= pd.read_csv('../../data/FlyWire/coordinates.csv.gz')
names= pd.read_csv('../../data/FlyWire/names.csv.gz')
neurons= pd.read_csv('../../data/FlyWire/neurons.csv.gz')
column_assignment= pd.read_csv('../../data/FlyWire/column_assignment.csv.gz')
cell_stats= pd.read_csv('../../data/FlyWire/cell_stats.csv.gz')
classification= pd.read_csv('../../data/FlyWire/classification.csv.gz')
synapse_coordinates= pd.read_csv('../../data/FlyWire/synapse_coordinates.csv.gz')
connections_no_threshold= pd.read_csv('../../data/FlyWire/connections_no_threshold.csv.gz')
processed_labels= pd.read_csv('../../data/FlyWire/processed_labels.csv.gz')
synapse_attachment_rates= pd.read_csv('../../data/FlyWire/synapse_attachment_rates.csv.gz')
connections_princeton_no_threshold= pd.read_csv('../../data/FlyWire/connections_princeton_no_thres

In [5]:
# find rows with nan values
def find_nan(df):
    """Return the rows of ``df`` that have a missing value in at least one column."""
    # Row-wise flag: True where any cell in that row is NaN/None.
    has_missing = df.isna().any(axis=1)
    return df.loc[has_missing]

# define utility functions
def get_number_input_synapse(root_id, connections):
    """Sum ``syn_count`` over the rows of ``connections`` whose ``post_root_id`` equals ``root_id``.

    Returns 0 when no row matches.
    """
    targets_root = connections['post_root_id'] == root_id
    incoming_counts = connections.loc[targets_root, 'syn_count']
    return incoming_counts.sum()

In [3]:
# Show the neurons rows that still contain at least one missing value.
print(find_nan(neurons))
# Rows without an nt_type get the placeholder 'UNK'; other columns are untouched.
neurons = neurons.fillna({'nt_type': 'UNK'})
# Write the cleaned table back over the source file in data_folder.
neurons.to_csv(f'{data_folder}neurons.csv.gz', compression='gzip', index=False)

Empty DataFrame
Columns: [root_id, group, nt_type, nt_type_score, da_avg, ser_avg, gaba_avg, glut_avg, ach_avg, oct_avg]
Index: []


In [4]:
# Show the classification rows that contain at least one missing value.
print(find_nan(classification))
# fill nan values with ''
# NOTE(review): pandas' read_csv treats empty fields as NaN by default, so these
# blanks presumably come back as NaN when the saved file is reloaded - confirm
# that this is acceptable for downstream readers.
classification = classification.fillna('')
# update the format of the hemibrain_type: '+'-separated entries become
# ','-separated. regex=False pins a literal replacement; '+' is a regex
# metacharacter and the default of `regex` differs between pandas versions.
classification['hemibrain_type'] = classification['hemibrain_type'].str.replace('+', ',', regex=False)
# save the data
classification.to_csv(data_folder+'classification.csv.gz', index=False, compression='gzip')

                   root_id       flow super_class        class sub_class  \
0       720575940640144768  intrinsic       optic          NaN      L1-5   
1       720575940630759755  intrinsic     central          NaN       NaN   
2       720575940637932009  intrinsic     central  Kenyon_Cell       NaN   
3       720575940606131116  intrinsic       optic          NaN       NaN   
4       720575940633723091  intrinsic       optic          NaN       NaN   
...                    ...        ...         ...          ...       ...   
139250  720575940632239661   afferent     sensory    olfactory       NaN   
139251  720575940624423312  intrinsic       optic          NaN       NaN   
139252  720575940625090916  intrinsic       optic          NaN       NaN   
139253  720575940631596103  intrinsic       optic          NaN       NaN   
139254  720575940628407368  intrinsic       optic          NaN       NaN   

       cell_type hemibrain_type hemilineage   side nerve  
0             L1            

In [5]:
# Forward-fill synapse_coordinates: each missing cell takes the closest
# non-missing value above it in the same column.
synapse_coordinates = synapse_coordinates.ffill()
# Write the filled table back over the source file in data_folder.
synapse_coordinates.to_csv(f'{data_folder}synapse_coordinates.csv.gz', compression='gzip', index=False)

In [8]:
# Show the rows that contain at least one missing value.
print(find_nan(connections_princeton_no_threshold))
# rename columns so they match the pre_root_id / post_root_id naming used below
connections_princeton_no_threshold = connections_princeton_no_threshold.rename(columns={'pre_pt_root_id':'pre_root_id', 'post_pt_root_id':'post_root_id'})
# fill nt_type with 'UNK'
connections_princeton_no_threshold['nt_type'] = connections_princeton_no_threshold['nt_type'].fillna('UNK')
# fill neuropil with 'UNK'
connections_princeton_no_threshold['neuropil'] = connections_princeton_no_threshold['neuropil'].fillna('UNK')
# add input_synapse_count and syn_strength
# Total incoming synapses per post-synaptic neuron, computed with one groupby
# pass instead of re-filtering the whole frame once per root id (which made
# the cell quadratic in the table size). The mapping post_root_id -> total is
# kept under its original name.
input_synapse_count_map = (
    connections_princeton_no_threshold
    .groupby('post_root_id')['syn_count']
    .sum()
    .to_dict()
)
connections_princeton_no_threshold['input_synapse_count'] = connections_princeton_no_threshold['post_root_id'].map(input_synapse_count_map)
# syn_strength: this connection's share of all synapses received by the post-synaptic neuron
connections_princeton_no_threshold['syn_strength'] = connections_princeton_no_threshold['syn_count']/connections_princeton_no_threshold['input_synapse_count']
# save the data
connections_princeton_no_threshold.to_csv(data_folder+'connections_princeton_no_threshold.csv.gz', index=False, compression='gzip')

Empty DataFrame
Columns: [pre_root_id, post_root_id, neuropil, syn_count, nt_type, input_synapse_count, syn_strength]
Index: []


  0%|          | 0/138998 [00:00<?, ?it/s]

In [10]:
# Show the labels rows that contain at least one missing value.
print(find_nan(labels))
# Missing annotator details (name and affiliation) are replaced by 'Unknown';
# other columns are untouched.
labels = labels.fillna({'user_name': 'Unknown', 'user_affiliation': 'Unknown'})
# Write the cleaned table back over the source file in data_folder.
labels.to_csv(f'{data_folder}labels.csv.gz', compression='gzip', index=False)

Empty DataFrame
Columns: [root_id, label, user_id, position, supervoxel_id, label_id, date_created, user_name, user_affiliation]
Index: []


In [11]:
# Show the connections rows that contain at least one missing value.
print(find_nan(connections))
# fill nt_type with 'UNK'
connections['nt_type'] = connections['nt_type'].fillna('UNK')
# fill neuropil with 'UNK'
connections['neuropil'] = connections['neuropil'].fillna('UNK')
# add input_synapse_count and syn_strength
# Total incoming synapses per post-synaptic neuron, computed with one groupby
# pass. The previous loop called get_number_input_synapse(root_id) without its
# required `connections` argument (a TypeError) and re-filtered the whole
# frame once per root id.
input_synapse_count_map = (
    connections
    .groupby('post_root_id')['syn_count']
    .sum()
    .to_dict()
)
connections['input_synapse_count'] = connections['post_root_id'].map(input_synapse_count_map)
# syn_strength: this connection's share of all synapses received by the post-synaptic neuron
connections['syn_strength'] = connections['syn_count']/connections['input_synapse_count']
# save the data
connections.to_csv(data_folder+'connections.csv.gz', index=False, compression='gzip')

Empty DataFrame
Columns: [pre_root_id, post_root_id, neuropil, syn_count, nt_type]
Index: []


  0%|          | 0/125192 [00:00<?, ?it/s]

In [9]:
# for every dataset, get the dataframe columns summary
# Each '<name>.csv.gz' entry in `files` is expected to already be bound to a
# notebook-level DataFrame called <name>; it is looked up directly instead of
# through exec() on a generated string.
for f in files:
    if f.endswith('.csv.gz'):
        db_name = f.split('.')[0]
        # Table name as a heading, underlined to the same width.
        print(db_name + '\n' + '-'*len(db_name))
        # DataFrame.info() writes the summary itself and returns None, so it
        # is not wrapped in print() (that emitted a stray "None" line).
        globals()[db_name].info()
        print('\n')

connectivity_tags
-----------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122362 entries, 0 to 122361
Data columns (total 2 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   root_id           122362 non-null  int64 
 1   connectivity_tag  122362 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.9+ MB
None


consolidated_cell_types
-----------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137687 entries, 0 to 137686
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   root_id             137687 non-null  int64 
 1   primary_type        137687 non-null  object
 2   additional_type(s)  28787 non-null   object
dtypes: int64(1), object(2)
memory usage: 3.2+ MB
None


coordinates
-----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 238909 entries, 0 to 238908
Data columns (total 3 columns):
 #   Column    