In [None]:
""""
The h5 files stored a very long string in the dirty-diff field, which logged all the code changes since the last git commit at run time -- this led to a 100x bloat in file size.
This included all the deletions and additions to the notebook outside the code as well.
This code writes a slimmed copy of the h5 file with the dirty_diff field removed, then swaps it in for the original (the original is kept as a `.backup_before_slim` file).
That field is now omitted from the code logs.
"""

In [12]:
import h5py
import json
import os
import shutil
from tqdm import tqdm

FILE = "h5_results/hp_sweep_infinite_joint_gaussian.h5"
TMP = FILE + ".tmp_slim"

# Datasets whose path ends with this suffix hold the run configuration as JSON.
JSON_SUFFIX = "/attrs/json"


def strip_dirty_diff(json_text):
    """Return `json_text` with `params.code.dirty_diff` removed.

    Parameters
    ----------
    json_text : str
        JSON document as text.

    Returns
    -------
    str
        The input, unchanged, when there is no `dirty_diff` key under
        `params.code` (including when `params` or `code` is missing or is
        not a JSON object). Otherwise a re-serialised document
        (`sort_keys=True`) without that key.

    Raises
    ------
    json.JSONDecodeError
        If `json_text` is not valid JSON.
    """
    cfg = json.loads(json_text)
    params = cfg.get("params") if isinstance(cfg, dict) else None
    code_block = params.get("code") if isinstance(params, dict) else None
    if not isinstance(code_block, dict) or "dirty_diff" not in code_block:
        return json_text
    del code_block["dirty_diff"]
    return json.dumps(cfg, sort_keys=True)


def copy_attrs(src, dst):
    """Copy every HDF5 attribute of `src` onto `dst`."""
    for key, value in src.attrs.items():
        dst.attrs[key] = value


def copy_json_dataset(f_dst, name, obj):
    """Copy a JSON config dataset into `f_dst`, dropping `dirty_diff`."""
    raw = obj[()]
    if isinstance(raw, bytes):
        # h5py >= 3 returns bytes for string datasets
        text = raw.decode("utf-8")
    elif isinstance(raw, str):
        # older h5py returns str for the same read
        text = raw
    else:
        raise TypeError(
            f"Expected a scalar string dataset at {name!r}, got {type(raw).__name__}"
        )

    # Scalar variable-length UTF-8 dataset (no compression: scalars can't have filters)
    new_dataset = f_dst.create_dataset(
        name,
        data=strip_dirty_diff(text),
        dtype=h5py.special_dtype(vlen=str),
    )
    copy_attrs(obj, new_dataset)


def copy_plain_dataset(f_dst, name, obj):
    """Copy an ordinary dataset into `f_dst`, gzip-compressing it when possible."""
    options = {"dtype": obj.dtype}
    # `obj.shape` is () for scalar datasets and None for null-dataspace ones;
    # HDF5 does not allow chunking/filters on either, so copy those verbatim.
    if obj.shape:
        options.update(
            compression=obj.compression or "gzip",
            # keep the source filter settings; default level when we add gzip
            compression_opts=obj.compression_opts if obj.compression else None,
            chunks=obj.chunks,
            # keeps resizable datasets resizable, and lets a stored chunk shape
            # larger than the current extent be accepted
            maxshape=obj.maxshape,
        )
    new_dataset = f_dst.create_dataset(name, data=obj[()], **options)
    copy_attrs(obj, new_dataset)


def slim_copy(src_path, dst_path):
    """Write a copy of `src_path` to `dst_path` without the `dirty_diff` fields.

    Groups, datasets and their attributes are recreated one by one; datasets
    whose name ends with `JSON_SUFFIX` are rewritten through `strip_dirty_diff`.

    NOTE(review): only objects reported by `visititems` are copied, so soft or
    external links in the source (if any) would not be recreated -- confirm
    the files contain none.
    """
    # First, collect all node names so tqdm knows the total
    with h5py.File(src_path, "r") as f:
        all_items = []
        f.visititems(lambda name, obj: all_items.append(name))

    print(f"Total HDF5 nodes to process: {len(all_items)}")

    with h5py.File(src_path, "r") as f_src, h5py.File(dst_path, "w") as f_dst:
        # root-level attributes (schema_version, created_at, tool, etc.)
        copy_attrs(f_src, f_dst)

        for name in tqdm(all_items, desc="Slimming file", unit="node"):
            obj = f_src.get(name)

            if isinstance(obj, h5py.Group):
                copy_attrs(obj, f_dst.require_group(name))
            elif isinstance(obj, h5py.Dataset):
                if name.endswith(JSON_SUFFIX):
                    copy_json_dataset(f_dst, name, obj)
                else:
                    copy_plain_dataset(f_dst, name, obj)


# A notebook kernel runs cells with __name__ == "__main__", so this still
# executes when the cell is run; it only stays idle if the code is imported.
if __name__ == "__main__":
    backup = FILE + ".backup_before_slim"

    # Refuse to clobber an earlier backup: on a second run FILE is already the
    # slimmed version, and moving it over the backup would destroy the original.
    if os.path.exists(backup):
        raise FileExistsError(
            f"Backup already exists: {backup} -- move or delete it before re-running."
        )

    print("Creating temporary slim file:", TMP)
    slim_copy(FILE, TMP)
    print("Slim file complete:", TMP)

    # ------- SAFE REPLACEMENT -------
    print("Backing up original file to:", backup)
    shutil.move(FILE, backup)

    print("Replacing original file with slimmed file...")
    shutil.move(TMP, FILE)

    print("Done!")
    print("Backup saved as:", backup)
    print("Slimmed file is now:", FILE)


Creating temporary slim file: h5_results/hp_sweep_infinite_joint_gaussian.h5.tmp_slim
Total HDF5 nodes to process: 1921


Slimming file: 100%|██████████| 1921/1921 [00:03<00:00, 506.64node/s]


Slim file complete: h5_results/hp_sweep_infinite_joint_gaussian.h5.tmp_slim
Backing up original file to: h5_results/hp_sweep_infinite_joint_gaussian.h5.backup_before_slim
Replacing original file with slimmed file...
Done!
Backup saved as: h5_results/hp_sweep_infinite_joint_gaussian.h5.backup_before_slim
Slimmed file is now: h5_results/hp_sweep_infinite_joint_gaussian.h5
