In [5]:
import h5py
import numpy as np
from tqdm import tqdm
from pathlib import Path

In [6]:
# --- Filter settings -------------------------------------------------------
# Value compared against the "ship_id" field of every positions row.
TARGET_SHIP_ID: int = 20744
# Maximum number of rows pulled from a day dataset in one slice.
CHUNK_ROWS: int = 500_000

# --- Files -----------------------------------------------------------------
# Source HDF5 file, opened read-only by the filtering cell.
SRC_PATH: str = "/Volumes/SSD/mark/Documents/Works/MT_Dataset/mt_filtered_20250714.h5"
# Destination HDF5 file; it is opened in "w" mode, so an existing file is replaced.
# NOTE(review): the file name mentions 663221 while TARGET_SHIP_ID is 20744 —
# confirm both refer to the same vessel.
DST_PATH: str = "/Volumes/SSD/mark/Documents/Works/MT_Dataset/mt_ship_663221_test.h5"

In [7]:
def ensure_group(h5: h5py.File | h5py.Group, path: str) -> h5py.Group:
    """Return the group at ``path`` below ``h5``, creating it when it is missing.

    Args:
        h5: Open HDF5 file or group that serves as the parent.
        path: Name of the group relative to ``h5``.

    Returns:
        The already existing or freshly created group.

    Raises:
        TypeError: If an object named ``path`` exists but is not a group.
    """
    # require_group is h5py's built-in get-or-create. Unlike a plain
    # `h5[path]` lookup it refuses to hand back a dataset, so the declared
    # return type actually holds.
    return h5.require_group(path)


In [8]:
# Build DST_PATH as a copy of SRC_PATH in which the "positions" hierarchy
# (positions/<yyyy>/<mm>/<dd>) keeps only the rows whose "ship_id" equals
# TARGET_SHIP_ID. Everything else at the top level is copied verbatim.
with h5py.File(SRC_PATH, "r") as src, h5py.File(DST_PATH, "w") as dst:
    # Copy the file-level attributes.
    for k, v in src.attrs.items():
        dst.attrs[k] = v

    # Copy every top-level object except "positions".
    for key in src.keys():
        if key == "positions":
            continue
        src.copy(key, dst)

    # "positions" is rebuilt by hand so that it can be filtered.
    # NOTE(review): attributes attached to the positions group, its year/month
    # sub-groups or the day datasets are not carried over — confirm intended.
    src_pos = src["positions"]
    dst_pos = dst.create_group("positions")

    # First pass: collect every day dataset and the overall row count so the
    # progress bar can show a meaningful total.
    total_rows = 0
    days = []

    for yyyy in src_pos:
        for mm in src_pos[yyyy]:
            for dd in src_pos[yyyy][mm]:
                ds_day = src_pos[yyyy][mm][dd]
                days.append((yyyy, mm, dd, ds_day))
                total_rows += ds_day.shape[0]

    # The context manager closes the bar even when a read or write fails
    # midway, instead of relying on a trailing pbar.close().
    with tqdm(total=total_rows, desc="Filter positions by ship_id", unit="rows") as pbar:
        for yyyy, mm, dd, day_src in days:
            out_chunks = []

            # Second pass: read the day in slices of at most CHUNK_ROWS rows
            # and keep only the rows of the target ship.
            n = day_src.shape[0]
            for start in range(0, n, CHUNK_ROWS):
                end = min(n, start + CHUNK_ROWS)
                chunk = day_src[start:end]

                mask = chunk["ship_id"] == TARGET_SHIP_ID
                if np.any(mask):
                    out_chunks.append(chunk[mask])

                pbar.update(end - start)

            if not out_chunks:
                continue  # the target ship has no rows on this day

            out = np.concatenate(out_chunks)

            # Year/month groups are created lazily, so days without matches
            # leave no empty groups behind.
            g_year = ensure_group(dst_pos, yyyy)
            g_month = ensure_group(g_year, mm)

            g_month.create_dataset(
                dd,
                data=out,
                maxshape=(None,),  # resizable along the row axis
                chunks=True,
                compression="gzip",
                compression_opts=4,
            )


Filter positions by ship_id: 100%|██████████| 826329360/826329360 [04:23<00:00, 3141512.33rows/s]
