In [1]:
import collections
import dask
import gcsfs
import h5py
import io
import numba
import numpy as np
import os
import os.path
import pickle
import requests

from dask.distributed import Client
from dask_kubernetes import KubeCluster
from numba.typed import Dict

In [2]:
# Build a Dask cluster on Kubernetes from the worker specification in worker-spec.yml.
cluster = KubeCluster.from_yaml('worker-spec.yml')
# Bare expression so the notebook renders the cluster widget as the cell output.
cluster

distributed.scheduler - INFO - Clear task state
distributed.scheduler - INFO -   Scheduler at:    tcp://10.36.0.42:40839
distributed.scheduler - INFO -   dashboard at:                     :8787


VBox(children=(HTML(value='<h2>KubeCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .…

In [3]:
# Connect a distributed client to the cluster created above.
client = Client(cluster)

distributed.scheduler - INFO - Receive client connection: Client-29386508-394f-11ea-8077-3effd2c6d20c
distributed.core - INFO - Starting established connection


Sanity check: run a small delayed computation to confirm that the client can execute work.

In [32]:
@dask.delayed
def the_sum(a, b):
    """Return ``a + b``; the ``dask.delayed`` wrapper makes every call lazy."""
    total = a + b
    return total


# Chain two delayed additions and evaluate the resulting graph: 1 + 2 + 3 should give 6.
inner = the_sum(1, 2)
the_sum(inner, 3).compute()

6

In [None]:
# One-time interactive login: when no token cache file is found, build a
# filesystem with token='browser' to obtain a token.
# NOTE(review): this checks '../../.gcs_tokens' while the cell that loads the
# credentials reads '../.gcs_tokens' — confirm which path is intended.
token_cache_found = os.path.exists('../../.gcs_tokens')
if not token_cache_found:
    gcsfs.GCSFileSystem(project='neuron-jungle', token='browser')

In [5]:
# Load the cached token dictionary and keep the first entry as the credentials
# that are handed to every GCSFileSystem in this notebook.
# NOTE: pickle.load can execute arbitrary code; only use a token file you trust.
with open('../.gcs_tokens', 'rb') as token_file:
    token_cache = pickle.load(token_file)
first_key = list(token_cache.keys())[0]
credentials = token_cache[first_key]

# Smoke test: list the first few objects of the segmentation volume.
fs = gcsfs.GCSFileSystem(project='neuron-jungle', token=credentials)
fs.ls('l4dense/segmentation-volume')[:10]

['l4dense/segmentation-volume/x3y6z0.hdf5',
 'l4dense/segmentation-volume/x2y2z0.hdf5',
 'l4dense/segmentation-volume/x2y7z3.hdf5',
 'l4dense/segmentation-volume/x2y8z0.hdf5',
 'l4dense/segmentation-volume/x1y2z0.hdf5',
 'l4dense/segmentation-volume/x4y5z1.hdf5',
 'l4dense/segmentation-volume/x2y7z0.hdf5',
 'l4dense/segmentation-volume/x5y0z3.hdf5',
 'l4dense/segmentation-volume/x0y3z0.hdf5',
 'l4dense/segmentation-volume/x5y4z2.hdf5']

# Create a map from segment id to neuron id

In [14]:
def locally_cache(filename, credentials):
    """Download ``l4dense/<filename>`` from GCS into ``../cache/<filename>``.

    The object is read fully into memory before it is written to disk.

    Args:
        filename: Object path relative to ``l4dense``; the same relative path
            is used under ``../cache``.
        credentials: Token passed to ``gcsfs.GCSFileSystem``.

    Returns:
        The number of bytes downloaded.
    """
    fs = gcsfs.GCSFileSystem(project='neuron-jungle', token=credentials)
    with fs.open(f'l4dense/{filename}', 'rb') as remote:
        data = remote.read()

    local_path = f'../cache/{filename}'
    # Create the cache directory (and any sub-directory implied by `filename`)
    # so a first run does not fail with FileNotFoundError on the write below.
    os.makedirs(os.path.dirname(local_path), exist_ok=True)
    with open(local_path, 'wb') as local:
        local.write(data)
    return len(data)

locally_cache('dendrites.hdf5', credentials)

263193540

In [16]:
# Peek at the agglomerate group of the dendrites file.
# NOTE(review): `dendrites` is only assigned by the h5py.File(...) call in a later
# cell, so this cell raises NameError on a fresh top-to-bottom run — move it below
# that cell or delete it.
dendrites['dendrites']['agglomerate']

<HDF5 group "/dendrites/agglomerate" (11400 members)>

In [19]:
dendrites = h5py.File('../cache/dendrites.hdf5', 'r')
dendrite_group = dendrites['dendrites']

# Phase 1: for every agglomerate that has a positive neuron id, collect its
# segment ids under that neuron.
segments_by_neuron = collections.defaultdict(list)
neuron_of_agglomerate = np.array(dendrite_group['neuronId'])
for position, owner in enumerate(neuron_of_agglomerate):
    if not owner > 0:
        continue
    # Agglomerate datasets are looked up by the 1-based index as a string.
    members = np.array(dendrite_group['agglomerate'][str(position + 1)])
    segments_by_neuron[owner].extend(members.tolist())

# Phase 2: invert the grouping into a flat segment id -> neuron id lookup.
neuron_map = {
    segment_id: neuron_id
    for neuron_id, segment_ids in segments_by_neuron.items()
    for segment_id in segment_ids
}

In [10]:
@numba.jit(nopython=True)
def remap(data, the_map):
    """Translate each element of ``data`` through ``the_map``.

    Elements that are not keys of ``the_map`` stay 0 in the output.

    Returns:
        A pair ``(remapped, seen)``. ``remapped`` is created with
        ``np.zeros_like(data)``. ``seen`` is a dict whose keys are the mapped
        values that were written at least once; every value is 1, so it
        serves as a set.
    """
    remapped = np.zeros_like(data)
    seen = {}
    count = len(data)
    for idx in range(count):
        source = data[idx]
        if source in the_map:
            target = the_map[source]
            remapped[idx] = target
            seen[target] = 1
    return remapped, seen

distributed.scheduler - INFO - Remove worker tcp://10.36.9.2:39039
distributed.core - INFO - Removing comms to tcp://10.36.9.2:39039


In [13]:
# To repaint: map dendrite ids to neuron id (default to 0)
@dask.delayed
def repaint(filename, credentials):
    """Relabel one segmentation chunk from segment ids to neuron ids.

    Reads ``l4dense/segmentation-volume/<filename>``, maps every voxel through
    the pickled segment -> neuron map stored at ``l4dense/neuron-map.pkl``
    (voxels whose segment is not in the map become 0), and writes the result
    to ``l4dense/neuron-volume/<filename>`` as an HDF5 file with two datasets:
    ``data`` (the uint8 volume, gzip-compressed) and ``neuron_ids`` (the
    distinct neuron ids found in the chunk).

    Args:
        filename: Chunk file name, e.g. ``x0y0z0.hdf5``.
        credentials: Token passed to ``gcsfs.GCSFileSystem``.

    Returns:
        The size in bytes of the HDF5 file that was uploaded.
    """
    fs = gcsfs.GCSFileSystem(project='neuron-jungle', token=credentials)

    # NOTE: pickle can execute arbitrary code on load; this object is the map
    # that this notebook uploads to the bucket itself.
    with fs.open('l4dense/neuron-map.pkl', 'rb') as f:
        segment_to_neuron = pickle.loads(f.read())

    # remap() is compiled with nopython=True, so the lookup table is copied
    # into a numba typed Dict before it is passed in.
    # NOTE(review): value_type is uint8, so neuron ids above 255 would not fit — confirm.
    the_map_typed = Dict.empty(key_type=numba.int32, value_type=numba.uint8)
    for k, v in segment_to_neuron.items():
        the_map_typed[k] = v

    edge = 1024        # the chunk is processed as an edge x edge x edge volume
    slice_size = 32    # number of planes loaded from the file per read
    nslices = edge // slice_size

    a = np.zeros((edge, edge, edge), dtype=np.uint8)
    neuron_ids = set()
    with fs.open(f'l4dense/segmentation-volume/{filename}', 'rb') as f:
        # Context manager so the input HDF5 handle is closed once the chunk is read.
        with h5py.File(f, 'r') as cube:
            for j in range(nslices):
                subd = np.array(cube['data'][(slice_size*j):(slice_size*(j+1)), :, :])
                # Each slab holds `slice_size` planes, so iterate over slice_size here.
                # The previous range(nslices) bound was only correct while both were 32.
                for i in range(slice_size):
                    r, neuron_id = remap(subd[i, :, :].ravel(), the_map_typed)
                    a[i + j*slice_size, :, :] = r.astype(np.uint8).reshape((edge, edge))
                    neuron_ids.update(neuron_id.keys())

    neuron_id_array = np.array(list(neuron_ids))

    # Build the output HDF5 file in memory, then upload it in a single write.
    bio = io.BytesIO()
    with h5py.File(bio, 'w') as out:
        out.create_dataset('data', a.shape, compression="gzip", data=a)
        out.create_dataset('neuron_ids', neuron_id_array.shape, data=neuron_id_array)

    data = bio.getvalue()
    with fs.open(f'l4dense/neuron-volume/{filename}', 'wb') as f:
        f.write(data)
    return len(data)

In [54]:
# Upload the pickled segment -> neuron map; repaint() reads it back from this object.
fs = gcsfs.GCSFileSystem(project='neuron-jungle', token=credentials)
payload = pickle.dumps(neuron_map)
with fs.open('l4dense/neuron-map.pkl', 'wb') as remote:
    remote.write(payload)

In [None]:
# Repaint every chunk of the 6 x 9 x 4 grid (file names x0y0z0 ... x5y8z3).
# Original note: x5y8z3 are the largest ids
grid = [(i, j, k) for i in range(6) for j in range(9) for k in range(4)]

bytes_total = 0
for i, j, k in grid:
    print(i, j, k)
    # repaint() is delayed, so this only extends the task graph; nothing runs yet.
    bytes_total += repaint(f"x{i}y{j}z{k}.hdf5", credentials)

# Evaluate the whole graph; the result is the sum of the sizes returned by repaint().
bytes_total.compute()

0 0 0
0 0 1
0 0 2
0 0 3
0 1 0
0 1 1
0 1 2
0 1 3
0 2 0
0 2 1
0 2 2
0 2 3
0 3 0
0 3 1
0 3 2
0 3 3
0 4 0
0 4 1
0 4 2
0 4 3
0 5 0
0 5 1
0 5 2
0 5 3
0 6 0
0 6 1
0 6 2
0 6 3
0 7 0
0 7 1
0 7 2
0 7 3
0 8 0
0 8 1
0 8 2
0 8 3
1 0 0
1 0 1
1 0 2
1 0 3
1 1 0
1 1 1
1 1 2
1 1 3
1 2 0
1 2 1
1 2 2
1 2 3
1 3 0
1 3 1
1 3 2
1 3 3
1 4 0
1 4 1
1 4 2
1 4 3
1 5 0
1 5 1
1 5 2
1 5 3
1 6 0
1 6 1
1 6 2
1 6 3
1 7 0
1 7 1
1 7 2
1 7 3
1 8 0
1 8 1
1 8 2
1 8 3
2 0 0
2 0 1
2 0 2
2 0 3
2 1 0
2 1 1
2 1 2
2 1 3
2 2 0
2 2 1
2 2 2
2 2 3
2 3 0
2 3 1
2 3 2
2 3 3
2 4 0
2 4 1
2 4 2
2 4 3
2 5 0
2 5 1
2 5 2
2 5 3
2 6 0
2 6 1
2 6 2
2 6 3
2 7 0
2 7 1
2 7 2
2 7 3
2 8 0
2 8 1
2 8 2
2 8 3
3 0 0
3 0 1
3 0 2
3 0 3
3 1 0
3 1 1
3 1 2
3 1 3
3 2 0
3 2 1
3 2 2
3 2 3
3 3 0
3 3 1
3 3 2
3 3 3
3 4 0
3 4 1
3 4 2
3 4 3
3 5 0
3 5 1
3 5 2
3 5 3
3 6 0
3 6 1
3 6 2
3 6 3
3 7 0
3 7 1
3 7 2
3 7 3
3 8 0
3 8 1
3 8 2
3 8 3
4 0 0
4 0 1
4 0 2
4 0 3
4 1 0
4 1 1
4 1 2
4 1 3
4 2 0
4 2 1
4 2 2
4 2 3
4 3 0
4 3 1
4 3 2
4 3 3
4 4 0
4 4 1
4 4 2
4 4 3
4 5 0
4 5 1
4 5 

distributed.scheduler - INFO - Remove worker tcp://10.36.13.2:46263
distributed.core - INFO - Removing comms to tcp://10.36.13.2:46263
distributed.scheduler - INFO - Register tcp://10.36.13.2:36307
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.36.13.2:36307
distributed.core - INFO - Starting established connection


In [13]:
# Copy the local XDMF template into the bucket, where process_one_chunk() fetches it from.
fs = gcsfs.GCSFileSystem(project='neuron-jungle', token=credentials)

with open('chunk_template.xdmf', 'r') as local_template:
    data = local_template.read()

with fs.open('l4dense/chunk_template.xdmf', 'w') as remote_template:
    remote_template.write(data)

In [8]:
# List the distinct values present in `data`.
# NOTE(review): in file order `data` is the template text read by the previous cell
# (a str, which has no .ravel()); the recorded uint8 output presumably came from an
# array loaded in a cell that is no longer in the notebook — confirm.
np.unique(data.ravel())

array([ 0,  7, 19, 20, 32, 33, 37, 38, 49, 53, 58, 68, 70, 79, 88, 89],
      dtype=uint8)

In [10]:
# Rebind `data` to an empty list — presumably to drop the reference to a large
# object so it can be freed; confirm.
data = []

In [12]:
# Check whether the gsutil CLI is available in this environment (the recorded run reports "not found").
!gsutil

/bin/sh: 1: gsutil: not found


In [4]:
import tempfile
import vtk

def fetch_and_cache(filename, credentials, replacement=None):
    """Download ``l4dense/<filename>`` into a new temporary file.

    Args:
        filename: Object path relative to ``l4dense``.
        credentials: Token passed to ``gcsfs.GCSFileSystem``.
        replacement: When given, the object is read as text and
            ``str.format(replacement)`` is applied before it is written, so
            the first ``{}`` placeholder is filled in. When ``None`` the
            object is copied as raw bytes.

    Returns:
        Path of the temporary file. The caller is responsible for deleting it.
    """
    fs = gcsfs.GCSFileSystem(project='neuron-jungle', token=credentials)

    as_text = replacement is not None
    with fs.open(f'l4dense/{filename}', 'r' if as_text else 'rb') as remote:
        data = remote.read()

    if as_text:
        data = data.format(replacement)

    # mkstemp() returns an already-open OS-level descriptor along with the path.
    # Wrap that descriptor so it is closed after the write; opening the path a
    # second time and dropping the descriptor leaked one fd per call.
    fd, local_path = tempfile.mkstemp()
    with os.fdopen(fd, 'w' if as_text else 'wb') as local:
        local.write(data)

    return local_path

def _extract_label_surface(local_xdmf, index, output_path):
    """Contour label ``index`` from the volume described by ``local_xdmf`` and write the smoothed mesh."""
    # Prepare to read the file.
    # NOTE(review): the recorded run failed here with "module 'vtk' has no
    # attribute 'vtkXdmfReader'"; this needs a VTK build that provides that reader.
    readerVolume = vtk.vtkXdmfReader()
    readerVolume.SetFileName(local_xdmf)
    readerVolume.Update()

    # Extract the region of interest.
    voi = vtk.vtkExtractVOI()
    voi.SetInputConnection(readerVolume.GetOutputPort())
    voi.SetVOI(0, 1023, 0, 1023, 0, 1023)
    voi.SetSampleRate(1, 1, 1)
    voi.Update()  # Necessary for GetScalarRange().
    srange = voi.GetOutput().GetScalarRange()  # Needs Update() before!
    print("Range", srange)

    # Prepare surface generation.
    contour = vtk.vtkDiscreteMarchingCubes()  # For label images.
    contour.SetInputConnection(voi.GetOutputPort())
    # contour.ComputeNormalsOn()

    print("Doing label", index)

    contour.SetValue(0, index)
    contour.Update()  # Needed for GetNumberOfPolys()!!!

    print("Done contour")

    smoother = vtk.vtkWindowedSincPolyDataFilter()
    smoother.SetInputConnection(contour.GetOutputPort())
    smoother.SetNumberOfIterations(20)  # This has little effect on the error!
    smoother.BoundarySmoothingOff()
    smoother.FeatureEdgeSmoothingOff()
    smoother.SetPassBand(.001)        # This increases the error a lot!
    smoother.NonManifoldSmoothingOn()
    smoother.NormalizeCoordinatesOn()
    smoother.GenerateErrorScalarsOn()
    smoother.Update()

    writer = vtk.vtkXMLDataSetWriter()
    writer.SetFileName(output_path)
    writer.SetInputData(smoother.GetOutput())
    writer.Write()


def process_one_chunk(filename, credentials, index=7, output_path="out.vtp"):
    """Extract a smoothed surface mesh for one label of one volume chunk.

    Downloads the chunk and the XDMF template to temporary files (the template
    is formatted with the local chunk path), contours the voxels equal to
    ``index``, smooths the surface and writes it to ``output_path``. The
    temporary files are removed afterwards, whether or not the pipeline ran.

    Args:
        filename: Object path of the chunk relative to ``l4dense``.
        credentials: Token passed to ``gcsfs.GCSFileSystem``.
        index: Label value to contour. Defaults to 7, the value that was
            previously hard-coded.
        output_path: Destination of the mesh. Defaults to ``"out.vtp"``, the
            path that was previously hard-coded.
    """
    xdmf_file = "chunk_template.xdmf"
    temp_paths = []
    try:
        local_hdf_file = fetch_and_cache(filename, credentials)
        temp_paths.append(local_hdf_file)
        local_xdmf = fetch_and_cache(xdmf_file, credentials, local_hdf_file)
        temp_paths.append(local_xdmf)

        _extract_label_surface(local_xdmf, index, output_path)
    finally:
        # fetch_and_cache() creates files with mkstemp(), which are not deleted
        # automatically; remove them so each processed chunk does not leave a
        # copy behind in the temp directory.
        for path in temp_paths:
            if os.path.exists(path):
                os.remove(path)

process_one_chunk('neuron-volume/x0y0z0.hdf5', credentials)

AttributeError: module 'vtk' has no attribute 'vtkXdmfReader'

In [5]:
vtk.

Type:        module
String form: <module 'vtk' from '/opt/conda/lib/python3.7/site-packages/vtk/__init__.py'>
File:        /opt/conda/lib/python3.7/site-packages/vtk/__init__.py
Docstring:
This module loads the entire VTK library into its namespace.  It
also allows one to use specific packages inside the vtk directory..


We're done!