# High-level interface

Our data is in 216 blocks of 1024 x 1024 x 1024 voxels. It would be really nice to be able to manipulate the full volume through a high-level interface. We will lazily assemble a volume out of the 216 blocks and perform a simple operation: visualizing one neuron. We will find all voxels that match the target neuron id and count them along the z axis using a reduction. This gives us the outline of the cell as an orthographic projection.

In [2]:
import collections
import dask
import gcsfs
import h5py
import io
import numba
import numpy as np
import os
import os.path
import pickle
import requests

from dask.distributed import Client
from dask_kubernetes import KubeCluster
from numba.typed import Dict

In [54]:
# Create the Dask cluster from the worker pod spec, but only once per kernel:
# on a fresh kernel `cluster` does not exist yet (the bare `cluster` display
# would raise NameError), while re-running this cell must not spawn a second
# cluster next to the one already running.
if 'cluster' not in globals():
    cluster = KubeCluster.from_yaml('worker-spec.yml')
cluster

VBox(children=(HTML(value='<h2>KubeCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .…

In [4]:
# Connect a Dask distributed client to the `cluster` object shown in the previous cell.
client = Client(cluster)

distributed.scheduler - INFO - Receive client connection: Client-d6d756ac-3888-11ea-8052-26996f775df6
distributed.core - INFO - Starting established connection


In [7]:
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only point this at a token file you created yourself.
with open('../.gcs_tokens', 'rb') as token_file:
    credentials = pickle.load(token_file)

# The unpickled object is a mapping; keep only the value stored under its first key.
first_key = list(credentials.keys())[0]
credentials = credentials[first_key]

# Open the GCS project with that token and peek at the first ten objects.
fs = gcsfs.GCSFileSystem(project='neuron-jungle', token=credentials)
fs.ls('l4dense/neuron-volume')[:10]

['l4dense/neuron-volume/x1y5z2.hdf5',
 'l4dense/neuron-volume/x5y7z2.hdf5',
 'l4dense/neuron-volume/x1y7z2.hdf5',
 'l4dense/neuron-volume/x4y8z3.hdf5',
 'l4dense/neuron-volume/x5y2z1.hdf5',
 'l4dense/neuron-volume/x4y4z3.hdf5',
 'l4dense/neuron-volume/x1y0z2.hdf5',
 'l4dense/neuron-volume/x5y6z0.hdf5',
 'l4dense/neuron-volume/x4y4z1.hdf5',
 'l4dense/neuron-volume/x5y7z0.hdf5']

In [57]:
# Round-trip check: build an HDF5 file entirely in memory, dump its bytes to
# disk, then reopen it through a plain Python file object (the same access
# pattern needed for file-like objects that are not local paths).
filename = 'test.hdf5'

a = np.random.randn(100, 100)

# Write the HDF5 container into an in-memory buffer. The context manager
# guarantees the file is flushed and closed before the bytes are extracted.
bio = io.BytesIO()
with h5py.File(bio, 'w') as cube:
    cube.create_dataset('data', a.shape, compression="gzip", data=a)

data = bio.getvalue()
with open(f'/tmp/{filename}', 'wb') as f:
    f.write(data)

# Read the data back while BOTH handles are still open: an h5py.File that
# wraps a Python file object becomes unusable once that file object closes.
with open(f'/tmp/{filename}', 'rb') as f:
    with h5py.File(f, 'r') as cube:
        roundtrip = cube['data'][...]

# gzip is lossless, so the array must come back bit-for-bit identical.
assert np.array_equal(roundtrip, a)

distributed.scheduler - INFO - Remove worker tcp://10.36.1.2:37737
distributed.core - INFO - Removing comms to tcp://10.36.1.2:37737
distributed.scheduler - INFO - Remove worker tcp://10.36.2.2:36459
distributed.core - INFO - Removing comms to tcp://10.36.2.2:36459
distributed.scheduler - INFO - Remove worker tcp://10.36.3.2:37939
distributed.core - INFO - Removing comms to tcp://10.36.3.2:37939
distributed.scheduler - INFO - Remove worker tcp://10.36.4.2:40497
distributed.core - INFO - Removing comms to tcp://10.36.4.2:40497
distributed.scheduler - INFO - Lost all workers


# Assemble the volume

In [43]:
import dask.array as da
import xarray as xr

# Block files to assemble; the names differ only in their z index.
filenames = ['x0y0z0.hdf5', 'x0y0z1.hdf5', 'x0y0z2.hdf5', 'x0y0z3.hdf5']

fs = gcsfs.GCSFileSystem(project='neuron-jungle', token=credentials)
# Key/value view of the bucket, used as the zarr store in the cells below.
gcsmap = gcsfs.mapping.GCSMap('l4dense', gcs=fs, check=False)

dask_arrays = []
for fn in filenames[:1]:  # only the first block is processed for now
    fullname = f'l4dense/segmentation-volume/{fn}'
    # Open the remote block just long enough to confirm it is a readable HDF5
    # file with a '/data' dataset (KeyError otherwise), then release both
    # handles instead of leaking one pair per iteration.
    with fs.open(fullname, 'rb') as f, h5py.File(f, 'r') as g:
        g['/data']
    # Placeholder: the voxels are NOT read from the file; an all-zero block of
    # the same nominal size stands in for them.
    # TODO(review): swap in the real dataset once it can be loaded lazily.
    d = np.zeros((1024, 1024, 1024), dtype=np.uint8)
    array = da.from_array(d, chunks=(1024, 1024, 1024))
    dask_arrays.append(array)

# Stack the per-block arrays along axis 2, which is labelled 'z' below.
big_data = da.concatenate(dask_arrays, axis=2)

# Note: despite the variable name, this is an xr.Dataset with one variable.
my_dataarray = xr.Dataset({'neuron_labels': (['x', 'y', 'z'], big_data)})

In [44]:
# Display the assembled xr.Dataset (variable 'neuron_labels' over dims x, y, z).
my_dataarray

In [46]:
# Write the Dataset to the GCS-backed store as zarr.
# Requires the `zarr` package; the recorded run failed with ModuleNotFoundError.
my_dataarray.to_zarr(store=gcsmap)

ModuleNotFoundError: No module named 'zarr'

Unnamed: 0,Array,Chunk
Bytes,1.07 GB,1.07 GB
Shape,"(1024, 1024, 1024)","(1024, 1024, 1024)"
Count,1 Tasks,1 Chunks
Type,uint8,numpy.ndarray
"Array Chunk Bytes 1.07 GB 1.07 GB Shape (1024, 1024, 1024) (1024, 1024, 1024) Count 1 Tasks 1 Chunks Type uint8 numpy.ndarray",1024  1024  1024,

Unnamed: 0,Array,Chunk
Bytes,1.07 GB,1.07 GB
Shape,"(1024, 1024, 1024)","(1024, 1024, 1024)"
Count,1 Tasks,1 Chunks
Type,uint8,numpy.ndarray


In [11]:
import gcsfs

# Minimal zarr round trip against the bucket, using an empty Dataset.
# NOTE(review): `gcsmap` is not defined in this cell; it is created in the
# "Assemble the volume" cell, so that cell must have been run first.
ds = xr.Dataset()

# write to the bucket
ds.to_zarr(store=gcsmap)
# read it back
ds_gcs = xr.open_zarr(gcsmap)

<gcsfs.core.GCSFileSystem at 0x7f3c80b8c390>

In [29]:
# Sum all elements of `x` and force evaluation of the lazy result.
# NOTE(review): `x` is not defined anywhere in the visible notebook —
# presumably left over from a removed cell; confirm or define it here.
x.sum().compute()

0