In [2]:
import collections
import dask
import gcsfs
import h5py
import io
import numba
import numpy as np
import os
import os.path
import pickle
import requests

from dask.distributed import Client
from dask_kubernetes import KubeCluster
from numba.typed import Dict

In [3]:
# Create the dask-kubernetes cluster from the worker spec in worker-spec.yml.
cluster = KubeCluster.from_yaml('worker-spec.yml')
# Bare expression so the notebook renders the cluster widget as the cell output.
cluster

distributed.scheduler - INFO - Clear task state
distributed.scheduler - INFO -   Scheduler at:   tcp://10.36.0.109:34455
distributed.scheduler - INFO -   dashboard at:                     :8787


VBox(children=(HTML(value='<h2>KubeCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .…

In [4]:
# Connect a distributed client to the Kubernetes-backed cluster created above.
client = Client(cluster)

distributed.scheduler - INFO - Receive client connection: Client-90270908-43c4-11ea-804c-16c367d7eede
distributed.core - INFO - Starting established connection


Sanity check: run a trivial delayed computation to confirm that the client works.

In [32]:
@dask.delayed
def the_sum(a, b):
    """Return ``a + b``; decorated with ``dask.delayed`` so a call builds a lazy task."""
    return a + b
# Nest two delayed calls and evaluate them; (1 + 2) + 3 should give 6.
the_sum(the_sum(1, 2), 3).compute()

6

In [None]:
# Start the browser-based authentication flow only when no token file is found.
# NOTE(review): this checks '../../.gcs_tokens', but the next cell reads
# '../.gcs_tokens' -- one of the two relative paths is probably wrong; confirm
# which directory the notebook is run from.
if not os.path.exists('../../.gcs_tokens'):
    # Get a token
    gcsfs.GCSFileSystem(project='neuron-jungle', token='browser')

In [5]:
# Read the pickled token cache and keep the first credential stored in it.
# NOTE: unpickling runs arbitrary code from the file, so only load a token
# cache that was created on this machine.
with open('../.gcs_tokens', 'rb') as f:
    credentials = pickle.load(f)
first_key = list(credentials)[0]
credentials = credentials[first_key]

# Open the project bucket with that credential and preview the segmentation cubes.
fs = gcsfs.GCSFileSystem(project='neuron-jungle', token=credentials)
fs.ls('l4dense/segmentation-volume')[:10]

['l4dense/segmentation-volume/x1y4z1.hdf5',
 'l4dense/segmentation-volume/x0y2z0.hdf5',
 'l4dense/segmentation-volume/x3y0z0.hdf5',
 'l4dense/segmentation-volume/x4y7z3.hdf5',
 'l4dense/segmentation-volume/x4y2z2.hdf5',
 'l4dense/segmentation-volume/x5y7z1.hdf5',
 'l4dense/segmentation-volume/x0y5z3.hdf5',
 'l4dense/segmentation-volume/x2y1z1.hdf5',
 'l4dense/segmentation-volume/x5y8z2.hdf5',
 'l4dense/segmentation-volume/x2y0z0.hdf5']

# Create a map from segment id to neuron id

In [7]:
def download(filename, verify=False, timeout=None):
    """Fetch ``filename`` from the l4dense2019 WebDAV server and return its bytes.

    Args:
        filename: Path of the file relative to the WebDAV root.
        verify: Forwarded to ``requests.get``. Defaults to ``False`` to keep
            the original behaviour, which skips TLS certificate verification.
            Pass ``True`` (or a CA bundle path) to verify the server.
        timeout: Forwarded to ``requests.get``. ``None`` (the default) means
            no timeout, which is what the original call did.

    Returns:
        The response body as ``bytes``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = f"https://l4dense2019.brain.mpg.de/webdav/{filename}"
    response = requests.get(url, verify=verify, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of mirroring an error page.
    response.raise_for_status()
    return response.content

def upload(filename, data, credentials):
    """Write ``data`` to ``l4dense/<filename>`` on GCS.

    Args:
        filename: Object name below the ``l4dense`` prefix.
        data: Bytes to store.
        credentials: Token handed to ``gcsfs.GCSFileSystem``.

    Returns:
        Whatever the remote file's ``write`` call returns for ``data``.
    """
    bucket_fs = gcsfs.GCSFileSystem(project='neuron-jungle', token=credentials)
    with bucket_fs.open(f'l4dense/{filename}', 'wb') as remote:
        return remote.write(data)

def mirror(filename):
    """Download ``filename`` and re-upload it under the ``l4dense`` prefix on GCS.

    Uses the notebook-level ``credentials`` variable for the upload.

    Returns:
        The value returned by ``upload`` for the transferred data.
    """
    print(f"Fetching {filename}")
    payload = download(filename)
    return upload(filename, payload, credentials)


def locally_cache(filename, credentials, cache_dir='../cache'):
    """Copy ``l4dense/<filename>`` from GCS into a local cache directory.

    Args:
        filename: Object name below the ``l4dense`` prefix; also used as the
            file name inside ``cache_dir``.
        credentials: Token handed to ``gcsfs.GCSFileSystem``.
        cache_dir: Local directory to write into. Defaults to ``'../cache'``,
            the location the original code hard-coded.

    Returns:
        The number of bytes read from GCS and written locally.
    """
    fs = gcsfs.GCSFileSystem(project='neuron-jungle', token=credentials)
    with fs.open(f'l4dense/{filename}', 'rb') as remote:
        data = remote.read()

    local_path = f'{cache_dir}/{filename}'
    # Create the target directory on first use; without this a fresh checkout
    # fails with FileNotFoundError.
    os.makedirs(os.path.dirname(local_path) or '.', exist_ok=True)
    with open(local_path, 'wb') as local:
        local.write(data)
    return len(data)

# Copy axons.hdf5 from the source server into the bucket, then pull both HDF5
# files from the bucket into ../cache/ for local processing.
# NOTE(review): only axons.hdf5 is mirrored here; dendrites.hdf5 is presumably
# already in the bucket -- confirm before running on a fresh project.
mirror('axons.hdf5')
locally_cache('dendrites.hdf5', credentials)
locally_cache('axons.hdf5', credentials)

318936822

In [71]:
dendrites = h5py.File('../cache/dendrites.hdf5', 'r')
axons = h5py.File('../cache/axons.hdf5', 'r')

dendrite_aggs = dendrites['dendrites']['agglomerate']
axon_aggs = axons['axons']['agglomerate']

# Build a map from agglomerate ID to neuron id
# NOTE(review): the agglomerate member names are paired positionally with the
# entries of 'neuronId'. This assumes both iterate in the same order; h5py
# presumably lists members by name (so "10" sorts before "2") -- TODO confirm
# against the file layout.
agg_to_neuron_id = dict(zip(list(dendrite_aggs), list(dendrites['dendrites']['neuronId'])))


def _collect_segments(agglomerates, segments_by_neuron):
    """Append each agglomerate's segment ids to the list of its neuron.

    Agglomerates without an entry in ``agg_to_neuron_id`` or with a neuron id
    that is not greater than zero are skipped.

    Returns:
        The number of agglomerate keys that were present in ``agg_to_neuron_id``.
    """
    matched = 0
    for agg in list(agglomerates.keys()):
        neuron_id = agg_to_neuron_id.get(agg)
        if neuron_id is None:
            continue
        matched += 1
        if neuron_id > 0:
            segments_by_neuron[neuron_id].extend(agglomerates[agg])
    return matched


# neuron id -> all segment ids that belong to it
segments_by_neuron = collections.defaultdict(list)
_collect_segments(dendrite_aggs, segments_by_neuron)

# Also add the axons for these neurons.
# NOTE(review): axon agglomerate keys are looked up in the map built from the
# dendrite file -- confirm that both files share one agglomerate id space.
in_map = _collect_segments(axon_aggs, segments_by_neuron)

# Invert to segment id -> neuron id; if a segment appears under several
# neurons, the last one encountered wins.
neuron_map = {
    segment_id: neuron_id
    for neuron_id, segment_ids in segments_by_neuron.items()
    for segment_id in segment_ids
}

Save the segment-to-neuron map to GCS.

In [77]:
# Serialise the segment -> neuron map, then store it in the project bucket.
payload = pickle.dumps(neuron_map)
fs = gcsfs.GCSFileSystem(project='neuron-jungle', token=credentials)
with fs.open('l4dense/neuron-map-with-axons.pkl', 'wb') as f:
    f.write(payload)

In [6]:
@numba.jit(nopython=True)
def remap(data, the_map):
    """Translate every element of ``data`` through ``the_map``.

    Args:
        data: Array indexed along its first axis (the caller passes a
            flattened array).
        the_map: Mapping from values in ``data`` to replacement values.

    Returns:
        A pair ``(remapped, seen)``: ``remapped`` has the shape and dtype of
        ``data`` with mapped values filled in and 0 where the element has no
        entry in ``the_map``; ``seen`` is a dict whose keys are the mapped
        values that were written at least once (every value is 1).
    """
    remapped = np.zeros_like(data)
    seen = {}
    for idx in range(len(data)):
        key = data[idx]
        if key in the_map:
            target = the_map[key]
            remapped[idx] = target
            seen[target] = 1
    return remapped, seen

In [12]:
# To repaint: map dendrite ids to neuron id (default to 0)
from scipy.ndimage import morphology

@dask.delayed
def repaint(filename, credentials, map_path='l4dense/neuron-map.pkl', slice_size=32):
    """Relabel one segmentation cube from segment ids to neuron ids and upload it.

    The cube ``l4dense/segmentation-volume/<filename>`` is read in slabs of
    ``slice_size`` planes along its first axis, each plane is passed through
    ``remap`` and the result is written, gzip-compressed, to
    ``l4dense/neuron-volume-with-axons/<filename>`` together with the neuron
    ids that occur in the cube. Voxels whose segment id is not in the map
    are left at 0.

    Because of ``@dask.delayed`` a call only builds a task; the return value
    described below is what the task produces when computed.

    Args:
        filename: Name of the cube file, e.g. ``'x0y0z0.hdf5'``.
        credentials: Token handed to ``gcsfs.GCSFileSystem``.
        map_path: GCS path of the pickled segment -> neuron dict.
            NOTE(review): the default is the path the original code used, but
            the notebook saves its map as 'l4dense/neuron-map-with-axons.pkl'
            and this function writes to 'neuron-volume-with-axons' -- confirm
            which map is intended.
        slice_size: Number of planes read from the cube per request.

    Returns:
        Size in bytes of the uploaded HDF5 file.
    """
    fs = gcsfs.GCSFileSystem(project='neuron-jungle', token=credentials)

    # NOTE: unpickling executes code embedded in the file; only load maps
    # that this project wrote to the bucket.
    with fs.open(map_path, 'rb') as f:
        segment_to_neuron = pickle.loads(f.read())

    # Create a typed map for segment_to_neuron (required by the nopython remap).
    # NOTE(review): uint8 values and int32 keys limit the neuron and segment
    # ids that can be represented -- confirm the data fits.
    the_map_typed = Dict.empty(key_type=numba.int32, value_type=numba.uint8)
    for k, v in segment_to_neuron.items():
        the_map_typed[k] = v

    neuron_ids = set()
    with fs.open(f'l4dense/segmentation-volume/{filename}', 'rb') as f:
        with h5py.File(f, 'r') as cube:
            segments = cube['data']
            # Take the extent from the file instead of assuming 1024^3.
            depth, height, width = segments.shape
            a = np.zeros((depth, height, width), dtype=np.uint8)

            for start in range(0, depth, slice_size):
                slab = np.array(segments[start:start + slice_size, :, :])
                # Iterate over the planes actually read. The original looped
                # over the number of slabs, which only matched the slab
                # height because 1024 / 32 == 32.
                for offset in range(slab.shape[0]):
                    r, found = remap(slab[offset, :, :].ravel(), the_map_typed)
                    a[start + offset, :, :] = r.astype(np.uint8).reshape((height, width))
                    neuron_ids.update(found.keys())

    # Sorted so the stored id list is deterministic.
    neuron_ids = np.array(sorted(neuron_ids))

    # Morphological clean-up is currently disabled; the raw remapped volume
    # ``a`` is what gets written below.
    #a_processed = np.zeros((1024, 1024, 1024), dtype=np.uint8)
    #for neuron_id in neuron_ids:
    #    B = (a == neuron_id)
    #    B = morphology.binary_erosion(morphology.binary_fill_holes(morphology.binary_dilation(B, iterations=3)), iterations=2)
    #    a_processed[B] = neuron_id

    # Build the output file in memory so it can be uploaded in one write.
    bio = io.BytesIO()
    with h5py.File(bio, 'w') as out:
        out.create_dataset('data', data=a, compression="gzip")
        out.create_dataset('neuron_ids', data=neuron_ids)

    data = bio.getvalue()
    with fs.open(f'l4dense/neuron-volume-with-axons/{filename}', 'wb') as f:
        f.write(data)
    return len(data)

In [None]:
# x5y8z3 are the largest ids
# Queue one delayed repaint per cube (x in 0-5, y in 0-8, z in 0-3) and add up
# the byte counts. The printed indices mark graph construction only; nothing
# runs until compute() is called.
cube_indices = [(i, j, k) for i in range(6) for j in range(9) for k in range(4)]

bytes_total = 0
for i, j, k in cube_indices:
    print(i, j, k)
    bytes_total += repaint(f"x{i}y{j}z{k}.hdf5", credentials)

bytes_total.compute()

0 0 0
0 0 1
0 0 2
0 0 3
0 1 0
0 1 1
0 1 2
0 1 3
0 2 0
0 2 1
0 2 2
0 2 3
0 3 0
0 3 1
0 3 2
0 3 3
0 4 0
0 4 1
0 4 2
0 4 3
0 5 0
0 5 1
0 5 2
0 5 3
0 6 0
0 6 1
0 6 2
0 6 3
0 7 0
0 7 1
0 7 2
0 7 3
0 8 0
0 8 1
0 8 2
0 8 3
1 0 0
1 0 1
1 0 2
1 0 3
1 1 0
1 1 1
1 1 2
1 1 3
1 2 0
1 2 1
1 2 2
1 2 3
1 3 0
1 3 1
1 3 2
1 3 3
1 4 0
1 4 1
1 4 2
1 4 3
1 5 0
1 5 1
1 5 2
1 5 3
1 6 0
1 6 1
1 6 2
1 6 3
1 7 0
1 7 1
1 7 2
1 7 3
1 8 0
1 8 1
1 8 2
1 8 3
2 0 0
2 0 1
2 0 2
2 0 3
2 1 0
2 1 1
2 1 2
2 1 3
2 2 0
2 2 1
2 2 2
2 2 3
2 3 0
2 3 1
2 3 2
2 3 3
2 4 0
2 4 1
2 4 2
2 4 3
2 5 0
2 5 1
2 5 2
2 5 3
2 6 0
2 6 1
2 6 2
2 6 3
2 7 0
2 7 1
2 7 2
2 7 3
2 8 0
2 8 1
2 8 2
2 8 3
3 0 0
3 0 1
3 0 2
3 0 3
3 1 0
3 1 1
3 1 2
3 1 3
3 2 0
3 2 1
3 2 2
3 2 3
3 3 0
3 3 1
3 3 2
3 3 3
3 4 0
3 4 1
3 4 2
3 4 3
3 5 0
3 5 1
3 5 2
3 5 3
3 6 0
3 6 1
3 6 2
3 6 3
3 7 0
3 7 1
3 7 2
3 7 3
3 8 0
3 8 1
3 8 2
3 8 3
4 0 0
4 0 1
4 0 2
4 0 3
4 1 0
4 1 1
4 1 2
4 1 3
4 2 0
4 2 1
4 2 2
4 2 3
4 3 0
4 3 1
4 3 2
4 3 3
4 4 0
4 4 1
4 4 2
4 4 3
4 5 0
4 5 1
4 5 

We're done!