In [1]:
import labrotation.file_handling as fh
import h5py
import os
import datadoc_util as dd

In [2]:
fpath1 = fh.open_file("Open first assembled traces hdf5 file!")
fpath2 = fh.open_file("Open second assembled traces hdf5 file!")

In [3]:
assert os.path.exists(fpath1)
assert os.path.exists(fpath2)
assert fpath1 != fpath2

In [4]:
env_dict = dict()
if not os.path.exists("./.env"):
    print(".env does not exist")
else:
    with open("./.env", "r") as f:
        for line in f.readlines():
            l = line.rstrip().split("=")
            env_dict[l[0]] = l[1]
print(env_dict.keys())

dict_keys(['DATA_DOCU_FOLDER', 'DOWNLOADS_FOLDER', 'LOG_FOLDER', 'MATLAB_2P_FOLDER', 'FLUO_LV_MATCHED_FOLDER'])


In [5]:
data_doc = dd.DataDocumentation(env_dict["DATA_DOCU_FOLDER"])

In [6]:
data_doc.loadDataDoc()

In [7]:
# assume the assembled_files dataset looks like this:
#file
#  * <uuid> as group
#    - attributes
#    - datasets

In [8]:
uuids1 = []
uuids2 = []
with h5py.File(fpath1, "r") as hf:
    for uuid in hf:
        uuids1.append(uuid)
with h5py.File(fpath2, "r") as hf:
    for uuid in hf:
        uuids2.append(uuid)

In [23]:
def compare_two_iterables(iterable1, iterable2):
    # return true if the iterables contain same elements
    found_diff_in_f1 = False
    found_diff_in_f2 = False
    only_in_f1 = []
    only_in_f2 = []
    for it1 in iterable1:
        if it1 not in iterable2:
            if not found_diff_in_f1:
                print("First iterable contains extra:")
                found_diff_in_f1 = True
            only_in_f1.append(it1)
            print(it1)
    for it2 in iterable2:
        if it2 not in iterable1:
            if not found_diff_in_f2:
                print("Second file contains extra:")
                found_diff_in_f2 = True
            only_in_f2.append(it2)
            print(it2)
    return not (found_diff_in_f1 or found_diff_in_f2)

In [24]:
a = compare_two_iterables(uuids1, uuids2)

Second file contains extra:
04b8cfbfa1c347058bb139b4661edcf1
12b84915b25141a4bf3af4a61b0acd25
163212a865784e6aafc1909c6859b891
1ee3e7c0ddcc4efb92c66e3d71bbae56
271794cbff1e44b88ca7ac2fb819ac62
43529408970b479a9e4133ca799c3b92
4358ba2ed52f4707b249dd07be416a7d
4473f11d58d043f68a128617fe3e3289
4ed771c0d9ee44c9a741ad22f54de667
52bd5171bbda4f3eabd9428f5851c74a
6847d178827c4d0a8d5d7727718d7842
798df122ea59497f8fd99fe7c12b3444
7f49aab18e8048ff903d7201ddf72cb0
89d90b1809cc4597bd3a176795d5791c
8e42c12f0a2f4f0b9863f4b87e849e5e
96b61faf7cec4ce29a1593a2e3c0d62f
9b401346419f45068b4c641b18922e45
a0881ba1f53948cb9ac002fc6e92cc9b
aad74bb04b074c33af828dc72c5d774f
b48b7bfa08b7424390c067b2695ff712
bb5c07a26c604c6faa01845dd3d6c18b
bd796ef078e449f8b80685c5681133bc
bfcfb8283dd0422c919cf6671e7ae630
c6bb2ce9db3842788c3d6679021bf3f0
c954329c0bfc4fad9aa39dea2018cd54
cbe78f51e2b0415a844f5c136a349a30
dd1aa8d816da4d5c860861b63bd50b04
e6874377639546eebeb6372599818fad
e6dd1813721b4f2381dbd5e24553b424
f049ef71fdd04f3

In [12]:
common_uuids = list(set(uuids1).intersection(uuids2))

In [26]:
# compare all attributes
with h5py.File(fpath1, "r") as hf1:
    with h5py.File(fpath2, "r") as hf2:
        for uuid in common_uuids:
            gr1 = hf1[uuid]
            gr2 = hf2[uuid]
            # check attributes
            # todo: extract the function of comparing two iterables from above (checking uuids) and use it for attributes
            # as well as dataset names
            print(uuid)
            compare_two_iterables(gr1.attrs, gr2.attrs)



238cb698d9cf445ca324d9925cbe26fb
Second file contains extra:
stim_duration_s
b9f18da25af3478caaccb17d87c0a4f4
Second file contains extra:
stim_duration_s
73b3828d53b6437a8a4990c778ed2ef4
Second file contains extra:
stim_duration_s
09e0277c86234572ac586ab18be1cd58
Second file contains extra:
stim_duration_s
4ae789df9809469b8668ff01a8cc91ee
Second file contains extra:
stim_duration_s
d43a110d015345c58ed961ee510e70aa
Second file contains extra:
stim_duration_s
3ddbac0cc89a4c7ebb5a2e42060b54dd
Second file contains extra:
stim_duration_s
f3d89599307c45cf81b57fce62796221
Second file contains extra:
stim_duration_s
28878982fe9146449f28c76c13abd9a6
Second file contains extra:
stim_duration_s
5f956ee24bb74b159d71125175aeecb6
Second file contains extra:
stim_duration_s
8a6d1f27381b469c80e3cf72d4da9817
Second file contains extra:
stim_duration_s
d7cde4682e4b48d2936c7eaed4915089
Second file contains extra:
stim_duration_s
8f982e72620648ecbe79ec4a5f605f25
Second file contains extra:
stim_duration_s

In [27]:
# compare all dataset names. This should be the same, so assert.
with h5py.File(fpath1, "r") as hf1:
    with h5py.File(fpath2, "r") as hf2:
        for uuid in common_uuids:
            gr1 = hf1[uuid]
            gr2 = hf2[uuid]
            # check attributes
            # todo: extract the function of comparing two iterables from above (checking uuids) and use it for attributes
            # as well as dataset names
            print(uuid)
            assert compare_two_iterables(gr1.keys(), gr2.keys())

238cb698d9cf445ca324d9925cbe26fb
b9f18da25af3478caaccb17d87c0a4f4
73b3828d53b6437a8a4990c778ed2ef4
09e0277c86234572ac586ab18be1cd58
4ae789df9809469b8668ff01a8cc91ee
d43a110d015345c58ed961ee510e70aa
3ddbac0cc89a4c7ebb5a2e42060b54dd
f3d89599307c45cf81b57fce62796221
28878982fe9146449f28c76c13abd9a6
5f956ee24bb74b159d71125175aeecb6
8a6d1f27381b469c80e3cf72d4da9817
d7cde4682e4b48d2936c7eaed4915089
8f982e72620648ecbe79ec4a5f605f25
cf46116ed3f04cf7909c774bae9dc722
9c9550fdbd15460b8aed0e87d8f6031e
c803483e98664c5185770b31e769fcbb
4b688fabd0c146a791c2b822332920ef
c0b4ed2d34b34bdc8340a8a6332291f2
41b6c53dfd2d4b258e019fa18f233aa8
904cc7c85915482c9fea5a43242fca5f
2c4f735f82834b868f705a447613b31b
83783b95d77c417bbb8710aba76ea79c
e05f30498e6b4f9087ef949fa794a89c
cbb15c1d27d74f928b20a901457bea47
165df3ec480a4ef7adcc62735c850a1b
8dec51d8e6944f97b07da4aa35c87e55
77e5fc88100f4525bb827e1d0503460f
6a7decd2c7634a02b1a344688b5fdbb7
ad02dcce6dc449e29b8d483b5f8a431e
ea0966dfc987412c83b66c2535b9d622
6d7002226b

In [31]:
# compare datasets for groups (uuid) that exist in both files
with h5py.File(fpath1, "r") as hf1:
    with h5py.File(fpath2, "r") as hf2:
        for uuid in common_uuids:
            print(uuid)
            gr1 = hf1[uuid]
            gr2 = hf2[uuid]
            for dset_name in gr1:
                dset1 = gr1[dset_name][()]
                dset2 = gr2[dset_name][()]
                if not (dset1 == dset2).all():
                    print(f"\t{dset_name} mismatch")

238cb698d9cf445ca324d9925cbe26fb
b9f18da25af3478caaccb17d87c0a4f4
73b3828d53b6437a8a4990c778ed2ef4
09e0277c86234572ac586ab18be1cd58
4ae789df9809469b8668ff01a8cc91ee
d43a110d015345c58ed961ee510e70aa
3ddbac0cc89a4c7ebb5a2e42060b54dd
f3d89599307c45cf81b57fce62796221
28878982fe9146449f28c76c13abd9a6
5f956ee24bb74b159d71125175aeecb6
8a6d1f27381b469c80e3cf72d4da9817
d7cde4682e4b48d2936c7eaed4915089
8f982e72620648ecbe79ec4a5f605f25
cf46116ed3f04cf7909c774bae9dc722
9c9550fdbd15460b8aed0e87d8f6031e
c803483e98664c5185770b31e769fcbb
4b688fabd0c146a791c2b822332920ef
c0b4ed2d34b34bdc8340a8a6332291f2
41b6c53dfd2d4b258e019fa18f233aa8
904cc7c85915482c9fea5a43242fca5f
2c4f735f82834b868f705a447613b31b
83783b95d77c417bbb8710aba76ea79c
e05f30498e6b4f9087ef949fa794a89c
cbb15c1d27d74f928b20a901457bea47
165df3ec480a4ef7adcc62735c850a1b
8dec51d8e6944f97b07da4aa35c87e55
77e5fc88100f4525bb827e1d0503460f
6a7decd2c7634a02b1a344688b5fdbb7
ad02dcce6dc449e29b8d483b5f8a431e
ea0966dfc987412c83b66c2535b9d622
6d7002226b