# Direct chunk write with bitshuffle compression

In [1]:
# Report the native byte order of this machine ('little' or 'big').
# The chunk header built later in this notebook is packed with an explicit
# big-endian ('>') struct format, so it does not rely on this value.
import sys
sys.byteorder

'little'

In [2]:
import h5py
import numpy
import struct
import bitshuffle
import bitshuffle.h5

In [3]:
# Output file for this demo. Mode "w" creates the file and truncates any
# existing file with the same name, so re-running this cell starts fresh.
filename = 'test_direct_chunk_write_bitshuffle.hdf5'
filehandle = h5py.File(filename, "w")

In [4]:
# Bitshuffle block size; also handed to the filter as its first option.
block_size = 2048

# float32 dataset of 100 frames (100x100 each), growable along the first
# axis, stored as one chunk per frame and compressed with bitshuffle + LZ4.
dataset = filehandle.create_dataset(
    "data",
    (100, 100, 100),
    dtype='float32',
    chunks=(1, 100, 100),
    maxshape=(None, 100, 100),
    compression=bitshuffle.h5.H5FILTER,
    compression_opts=(block_size, bitshuffle.h5.H5_COMPRESS_LZ4),
)

In [5]:
# One 100x100 frame of uniform random values, stored as float32 so that it
# matches the dtype of the dataset it will be written into.
array = numpy.random.rand(100, 100).astype('float32')

In [6]:
# Display the source frame so it can be compared with what is read back later.
array

array([[  2.87754506e-01,   8.95269871e-01,   4.05164272e-01, ...,
          7.55991876e-01,   4.78493497e-02,   8.96251351e-02],
       [  2.65910923e-01,   1.51550084e-01,   3.39855462e-01, ...,
          2.76492964e-02,   9.72803414e-01,   5.45942962e-01],
       [  7.68216312e-01,   6.48744822e-01,   5.93377292e-01, ...,
          3.20077204e-04,   8.72908056e-01,   2.40250707e-01],
       ..., 
       [  5.97783029e-01,   3.39329571e-01,   3.03336948e-01, ...,
          7.56952286e-01,   6.26245618e-01,   7.05749273e-01],
       [  4.73752826e-01,   9.83646631e-01,   6.89552069e-01, ...,
          8.75296354e-01,   4.58672673e-01,   4.17434067e-01],
       [  3.51259738e-01,   8.57367933e-01,   7.73854613e-01, ...,
          1.37655526e-01,   4.24313545e-01,   4.63528596e-02]], dtype=float32)

In [7]:
# Generic function to create a bitshuffle compressed chunk (header + payload)
def compress_as_chunk(array, block_size=2048):
    """Compress ``array`` into the byte string written by a direct chunk write.

    The result is a 12-byte big-endian header followed by the output of
    ``bitshuffle.compress_lz4``:

    * 8 bytes (``>q``): size of the uncompressed data in bytes
    * 4 bytes (``>i``): ``block_size`` multiplied by the item size

    Parameters
    ----------
    array : numpy.ndarray
        Data for one chunk. Any number of dimensions is accepted.
    block_size : int, optional
        Block size passed unchanged to ``bitshuffle.compress_lz4``
        (presumably counted in elements, since the header value is scaled
        by the item size -- confirm against the bitshuffle documentation).

    Returns
    -------
    bytes
        Header followed by the compressed payload.
    """
    compressed_bytes = bitshuffle.compress_lz4(array, block_size)
    # nbytes accounts for every dimension; multiplying only shape[0] and
    # shape[1] failed on 1-D input and under-reported 3-D (or higher) input.
    bytes_number_of_elements = struct.pack('>q', array.nbytes)
    bytes_block_size = struct.pack('>i', block_size * array.dtype.itemsize)
    return bytes_number_of_elements + bytes_block_size + compressed_bytes.tobytes()

In [8]:
# Compress the frame using the same block size the dataset's filter was
# configured with when the dataset was created.
compressed = compress_as_chunk(array, block_size=block_size)

In [9]:
# First line: uncompressed size of the frame in bytes.
# Second line: size of the compressed chunk in bytes, header included.
print(array.nbytes, len(compressed), sep='\n')

40000
33215


In [10]:
# Direct chunk write: hand the already-compressed bytes to the low-level
# dataset id instead of assigning array data to the dataset.
# The offset tuple addresses the chunk that starts at frame `index`; with
# chunks of shape (1, 100, 100) each frame is exactly one chunk.
index = 0
dataset.id.write_direct_chunk((index, 0, 0), compressed)

In [11]:
# Read the frame back through the handle that is still open for writing.
# In the recorded output this shows all zeros, whereas the same frame read
# after closing and reopening the file shows the written values.
# NOTE(review): presumably the direct write is not visible to this read
# path until the file is closed -- confirm.
dataset[0]

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32)

In [12]:
# Close the write handle before the file is reopened read-only below.
filehandle.close()

In [13]:
# Reopen the same file read-only to check what was actually stored.
filehandle = h5py.File(filename, "r")

In [14]:
# Read frame 0 through the regular dataset interface; the recorded output
# matches the source array displayed earlier in the notebook.
filehandle['data'][0]

array([[  2.87754506e-01,   8.95269871e-01,   4.05164272e-01, ...,
          7.55991876e-01,   4.78493497e-02,   8.96251351e-02],
       [  2.65910923e-01,   1.51550084e-01,   3.39855462e-01, ...,
          2.76492964e-02,   9.72803414e-01,   5.45942962e-01],
       [  7.68216312e-01,   6.48744822e-01,   5.93377292e-01, ...,
          3.20077204e-04,   8.72908056e-01,   2.40250707e-01],
       ..., 
       [  5.97783029e-01,   3.39329571e-01,   3.03336948e-01, ...,
          7.56952286e-01,   6.26245618e-01,   7.05749273e-01],
       [  4.73752826e-01,   9.83646631e-01,   6.89552069e-01, ...,
          8.75296354e-01,   4.58672673e-01,   4.17434067e-01],
       [  3.51259738e-01,   8.57367933e-01,   7.73854613e-01, ...,
          1.37655526e-01,   4.24313545e-01,   4.63528596e-02]], dtype=float32)

In [15]:
# The frame read back should have the shape of one chunk's frame: (100, 100).
filehandle['data'][0].shape

(100, 100)

In [16]:
# Inspect the filter pipeline stored with the dataset.
# `_filters` is an underscore-prefixed (non-public) attribute, so it may
# change between h5py versions.
# NOTE(review): presumably the key shown in the output is the bitshuffle
# filter id and the tuple holds its options (including the block size and
# the LZ4 flag passed at creation) -- confirm.
filehandle['data']._filters

{'32008': (0, 2, 4, 2048, 2)}

In [17]:
# NOTE(review): presumably this reports HDF5's built-in shuffle filter,
# which is separate from the bitshuffle filter used here -- confirm.
filehandle['data'].shuffle

False

In [18]:
# Verify the round trip: the frame read back from the file must equal the
# source array. The assertion fails loudly on a mismatch; the previous
# `if ...: print` form printed nothing at all in that case, which is easy to
# mistake for a cell that simply was not run.
assert numpy.array_equal(filehandle['data'][0], array), \
    'frame read back from the file differs from the source array'
print('same')

same


In [19]:
# Release the read-only handle now that verification is finished.
filehandle.close()