# How to use HDF5 files in Python 
[source](https://pythonforthelab.com/blog/how-to-use-hdf5-files-in-python/)

Datasets in hdf5
- Their data is stored on the hard drive and is not loaded into RAM until it is accessed

Chunk size
- It is recommended to keep the total size of your chunks between 10 KiB and 1 MiB, larger for larger datasets
- when any element in a chunk is accessed, the entire chunk is read from disk
- auto chunking is set by default

In [14]:
import h5py
import numpy as np

# Two 1-D sample arrays of 1000 values: random numbers, and the same shape
# multiplied by zero.
arr = np.random.randn(1000)
arr2 = np.random.randn(1000) * 0

# --- Write to file ---
# 'w' opens the file with write permission.
# The `with` block closes the file on exit, so no explicit .close() is needed.
with h5py.File('random.hdf5', 'w') as f:
    # create each dataset under the given name, filled with the given data
    dset = f.create_dataset("My Data Set", data=arr)
    dset2 = f.create_dataset("My second Data Set", data=arr2)

# --- Read file ---
# 'r' opens the file read-only.
with h5py.File('random.hdf5', 'r') as f:
    # `data` and `data2` point at datasets inside the file; their contents
    # are not loaded into memory (you can see this in the Jupyter variables)
    data = f['My Data Set']
    data2 = f['My second Data Set']
    print(min(data))
    print(max(data2))
    print(data[:5])

    # slicing reads the selected values, so this variable keeps them
    # after the file is closed
    data_set = data2[:5]

    # the file behaves like a dictionary keyed by dataset name
    for key in f.keys():
        print(key)

# `data_set` holds values that were already read, so it is still usable here.
print(data_set[4])
# `data2[1]` would NOT work at this point: the dataset handle belongs to the
# file, which is now closed, whereas the variable we created is independent.

# --- Placing the data into RAM ---
# Indexing a dataset with [()] reads all of it into memory. The file is opened
# with `with` (like the sections above) so it is closed even if the read fails.
with h5py.File('random.hdf5', 'r') as f:
    data_in_RAM = f['My second Data Set'][()]
print(data_in_RAM[:5])

-3.6174143729211554
0.0
[-0.83943907 -1.0197887  -1.57003876  0.98782958 -0.71319319]
My Data Set
My second Data Set
0.0
[0. 0. 0. 0. 0.]


# Complex Data sets
Strategy: if a dataset is too large to fit in RAM, we can loop over it on disk and read one element at a time, rather than attempting to load it all into RAM at once

In [18]:
""" Loading via Loop"""

import h5py
import numpy as np

# Two independent random arrays of equal length; array_1 is used to decide
# which entries of array_2 to keep.
arr1 = np.random.randn(10000)
arr2 = np.random.randn(10000)

# Store both arrays as separate datasets in one file.
with h5py.File('complex_read.hdf5', 'w') as f:
    f.create_dataset('array_1', data=arr1)
    f.create_dataset('array_2', data=arr2)


# Element-by-element pass: only one value of each dataset is read per step.
with h5py.File('complex_read.hdf5', 'r') as f:
    d1 = f['array_1']
    d2 = f['array_2']

    data = []

    for i in range(d1.shape[0]):
        if not d1[i] > 0:
            # array_1 is not positive here, so skip the matching array_2 value
            continue
        data.append(d2[i])

print('The length of data with a for loop: {}'.format(len(data)))



""" Loading all at once """

with h5py.File('complex_read.hdf5', 'r') as f:
    d1 = f['array_1']
    d2 = f['array_2']

    # d1[()] reads all of array_1; the resulting boolean mask then selects
    # only the matching entries of array_2
    positive_mask = d1[()] > 0
    data = d2[positive_mask]

print(data[4])

The length of data with a for loop: 5089
0.3575088283136155


In [35]:
arr = np.random.randn(100)

with h5py.File('random.hdf5', 'w') as f:
    # Datasets created from a shape only (no data):
    # a 1-D dataset with 1000 entries and a 500 x 1024 matrix.
    dset = f.create_dataset("default", shape=(1000,))
    dset_matrix = f.create_dataset('matrix', shape=(500, 1024))

    # Writing needs an explicit target slice of the dataset;
    # rebinding the name (dset = arr) would NOT store anything in the file.
    dset[10:20] = arr[50:60]

    # Set a single element, then fill rows 200 onwards / columns 500 onwards.
    dset_matrix[1, 2] = 153
    dset_matrix[200:, 500:] = 123

    # Read the whole matrix into memory so it can be used after the file closes.
    this = dset_matrix[...]

print(this[1, 2])
# Start one row and one column before the filled region to show its edge.
print(this[199:, 499:])

153.0
[[  0.   0.   0. ...   0.   0.   0.]
 [  0. 123. 123. ... 123. 123. 123.]
 [  0. 123. 123. ... 123. 123. 123.]
 ...
 [  0. 123. 123. ... 123. 123. 123.]
 [  0. 123. 123. ... 123. 123. 123.]
 [  0. 123. 123. ... 123. 123. 123.]]


# Types
- `i1`: int 1 byte
- `i8`: int 8 bytes 
- `c16`: complex numbers of 16 bytes

In [64]:
with h5py.File('several_datasets.hdf5', 'w') as f:
    # 1-byte integers
    dset_int_1 = f.create_dataset('integers', shape=(10,), dtype='i1')
    dset_int_1[0] = 127

    # 8-byte integers; a float is assigned on purpose to show how it is stored
    dset_int_8 = f.create_dataset('integers8', shape=(10,), dtype='i8')
    dset_int_8[0] = 1200.1

    # 16-byte complex numbers
    dset_complex = f.create_dataset('complex', shape=(10,), dtype='c16')
    dset_complex[0] = 4 + 3j

    # Read the first element of each dataset back from the file.
    print(dset_int_1[0])
    print(dset_int_8[0])
    print(dset_complex[0])


127
1200
(4+3j)


Compression 
- default level is 4 
- 9 is the highest level (the "high" setting used in the example)
- 0 is no compression 

> Ints compress better than floats 

Float compression: 

$$
\begin{align*}
    \text{no compression} &= 1602144 \\
    \text{compression 4} &= 1469868 \\
    \text{compression 9} &= 1469580 \\
\end{align*}
$$

Compression reduces the float file size by only about 8%

In [71]:
import h5py
import numpy as np

arr = np.random.randn(100000)
level = 0

# The same random data is written once per (file name, dtype) pair, always as
# a gzip-compressed dataset called 'dataset' using the level set above.
for out_path, kind in (
    ('integer_1_compr.hdf5', 'i1'),
    ('integer_8_compr.hdf5', 'i8'),
    ('float_compr.hdf5', 'f16'),
):
    with h5py.File(out_path, 'w') as f:
        d = f.create_dataset(
            'dataset',
            (100000,),
            dtype=kind,
            compression="gzip",
            compression_opts=level,
        )
        d[:] = arr

# Use the terminal to see the file byte sizes by level 
