In [2]:
import h5py
import numpy as np
from tqdm.auto import tqdm
from datetime import datetime


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
dataset_path = "/Volumes/SSD/mark/Documents/Works/MT_Dataset/mt_master_20250714.h5"

if "ds" in vars():
    ds.close()   # type: ignore

ds = h5py.File(dataset_path, "r")
for attr in ds.attrs:
    print(f"{attr}: {ds.attrs[attr]}")

author: Mark Vodyanitskiy (mvodya@icloud.com)
created_at: 2025-07-13T14:26:08.378871
sources_count: 27555
sources_size: 439.3Gb
version: 1.0


In [None]:
# --- Chunk sizes ---
CHUNK_ROWS_POSITIONS = 2_000_000
CHUNK_ROWS_SHIPS = 5_000_000

# --- Speed rules ---
SPEED_MOVING_MIN = 10     # speed >= этого порога считаем "движением"
SPEED_SANITY_MAX = 800    # все, что выше, считаем мусором и удаляем

# --- Keep criteria ---
MIN_TOTAL_POINTS  = 50
MIN_MOVING_POINTS = 5
MIN_MAX_SPEED     = 20

In [9]:
def iter_day_datasets(ds: h5py.File):
    """
    Yield tuples: (('YYYY','MM','DD'), dataset)
    """
    if "positions" not in ds:
        return
    gpos = ds["positions"]
    for yyyy in sorted(gpos.keys()):
        gy = gpos[yyyy]
        if not isinstance(gy, h5py.Group):
            continue
        for mm in sorted(gy.keys()):
            gm = gy[mm]
            if not isinstance(gm, h5py.Group):
                continue
            for dd in sorted(gm.keys()):
                dsd = gm[dd]
                if isinstance(dsd, h5py.Dataset):
                    yield (yyyy, mm, dd), dsd


def ensure_group(h5: h5py.File, path: str) -> h5py.Group:
    """
    Create nested groups like mkdir -p. Returns final group.
    path like: "positions/2024/10"
    """
    g = h5
    for part in [p for p in path.split("/") if p]:
        if part not in g:
            g = g.create_group(part)
        else:
            g = g[part]
    return g


def append_rows(dst_ds: h5py.Dataset, rows: np.ndarray) -> None:
    """
    Append structured rows to resizable 1D dataset.
    """
    if rows.size == 0:
        return
    old = dst_ds.shape[0]
    new = old + rows.shape[0]
    dst_ds.resize((new,))
    dst_ds[old:new] = rows


In [10]:
days = []
total_positions = 0

for (yyyy, mm, dd), day_ds in iter_day_datasets(ds):
    days.append(((yyyy, mm, dd), day_ds))
    total_positions += int(day_ds.shape[0])

print("Days:", len(days))
print("Total positions:", total_positions)


Days: 247
Total positions: 1250874033


In [11]:
ships_src = ds["ships"]
n_ships = int(ships_src.shape[0])

max_ship_id = 0
p0 = tqdm(total=n_ships, desc="Pass0: scan ships for max_ship_id", unit="rows")

for start in range(0, n_ships, CHUNK_ROWS_SHIPS):
    end = min(n_ships, start + CHUNK_ROWS_SHIPS)
    chunk = ships_src[start:end]
    if chunk.size:
        m = int(chunk["ship_id"].max(initial=0))
        if m > max_ship_id:
            max_ship_id = m
    p0.update(end - start)

p0.close()
print("max_ship_id =", max_ship_id)


Pass0: scan ships for max_ship_id: 100%|██████████| 156832602/156832602 [01:28<00:00, 1764832.67rows/s]

max_ship_id = 156832602





In [None]:
total_points  = np.zeros(max_ship_id + 1, dtype=np.uint32)
moving_points = np.zeros(max_ship_id + 1, dtype=np.uint32)
max_speed     = np.zeros(max_ship_id + 1, dtype=np.uint16)

p1 = tqdm(total=total_positions, desc="Pass1: scan positions (stats)", unit="rows")

for (yyyy, mm, dd), day_ds in days:
    n = int(day_ds.shape[0])

    for start in range(0, n, CHUNK_ROWS_POSITIONS):
        end = min(n, start + CHUNK_ROWS_POSITIONS)
        chunk = day_ds[start:end]

        ship_ids = chunk["ship_id"].astype(np.int64, copy=False)

        sp = chunk["speed"].astype(np.int32, copy=False)
        sp_sane = np.clip(sp, 0, SPEED_SANITY_MAX).astype(np.int32, copy=False)
        moving_mask = sp_sane >= SPEED_MOVING_MIN

        # Total points per ship in this chunk
        u, c = np.unique(ship_ids, return_counts=True)
        total_points[u] += c.astype(np.uint32, copy=False)

        # Moving points per ship
        if moving_mask.any():
            mv_ids = ship_ids[moving_mask]
            u2, c2 = np.unique(mv_ids, return_counts=True)
            moving_points[u2] += c2.astype(np.uint32, copy=False)

        # Max speed per ship
        np.maximum.at(max_speed, ship_ids, sp_sane.astype(np.uint16, copy=False))

        p1.update(end - start)

p1.close()
print("Stats computed")


Pass1: scan positions (stats): 100%|██████████| 1250874033/1250874033 [09:49<00:00, 2120348.72rows/s]

Stats computed.





In [13]:
keep_mask = (
    (total_points >= MIN_TOTAL_POINTS) &
    (moving_points >= MIN_MOVING_POINTS) &
    (max_speed >= MIN_MAX_SPEED)
)

kept_ship_count = int(keep_mask.sum())
print(f"Kept ships: {kept_ship_count:,} / (mask size={keep_mask.size:,})")


Kept ships: 372,403 / (mask size=156,832,603)


In [None]:
out_path = "/Volumes/SSD/mark/Documents/Works/MT_Dataset/mt_filtered_20250714.h5"

dst = h5py.File(out_path, "w")

# Root attrs
for k, v in ds.attrs.items():
    dst.attrs[k] = v

dst.attrs["filtered_at"] = datetime.utcnow().isoformat()
dst.attrs["filter_rules"] = (
    f"MIN_TOTAL_POINTS={MIN_TOTAL_POINTS}, "
    f"MIN_MOVING_POINTS={MIN_MOVING_POINTS}, "
    f"MIN_MAX_SPEED={MIN_MAX_SPEED}, "
    f"SPEED_MOVING_MIN={SPEED_MOVING_MIN}, "
    f"SPEED_SANITY_MAX={SPEED_SANITY_MAX}"
)

# Copy static datasets
if "files" in ds:
    ds.copy("files", dst)
if "zones" in ds:
    ds.copy("zones", dst)

# tracks: пустой, dtype берем из исходника если есть
tracks_dtype = ds["tracks"].dtype if "tracks" in ds else np.dtype([("track_id", "i8")])
dst.create_dataset(
    "tracks",
    shape=(0,), maxshape=(None,),
    dtype=tracks_dtype,
    chunks=True, compression="gzip", compression_opts=4
)

# Ships output
ships_dtype = ships_src.dtype
ships_dst = dst.create_dataset(
    "ships",
    shape=(0,), maxshape=(None,),
    dtype=ships_dtype,
    chunks=True, compression="gzip", compression_opts=4
)

# Positions root group
ensure_group(dst, "positions")

print("Destination file created:", out_path)


Destination file created: /Volumes/SSD/mark/Documents/Works/MT_Dataset/mt_filtered_20250714.h5


  dst.attrs["filtered_at"] = datetime.utcnow().isoformat()


In [15]:
p2 = tqdm(total=n_ships, desc="Pass2: write ships", unit="rows")

for start in range(0, n_ships, CHUNK_ROWS_SHIPS):
    end = min(n_ships, start + CHUNK_ROWS_SHIPS)
    chunk = ships_src[start:end]
    ids = chunk["ship_id"].astype(np.int64, copy=False)
    m = keep_mask[ids]
    kept = chunk[m]
    append_rows(ships_dst, kept)
    p2.update(end - start)

p2.close()
print("ships written:", ships_dst.shape[0])


Pass2: write ships: 100%|██████████| 156832602/156832602 [03:19<00:00, 787320.78rows/s]

ships written: 372403





In [None]:
p3 = tqdm(total=total_positions, desc="Pass3: write positions", unit="rows")

for (yyyy, mm, dd), day_src in days:
    # positions/YYYY/MM
    g = ensure_group(dst, f"positions/{yyyy}/{mm}")

    # Create day dataset resizable
    day_dst = g.create_dataset(
        dd,
        shape=(0,), maxshape=(None,),
        dtype=day_src.dtype,
        chunks=True, compression="gzip", compression_opts=4
    )

    n = int(day_src.shape[0])
    for start in range(0, n, CHUNK_ROWS_POSITIONS):
        end = min(n, start + CHUNK_ROWS_POSITIONS)
        chunk = day_src[start:end]
        ids = chunk["ship_id"].astype(np.int64, copy=False)
        m = keep_mask[ids]
        kept = chunk[m]
        append_rows(day_dst, kept)
        p3.update(end - start)

p3.close()
print("Positions written")


Pass3: write positions: 100%|██████████| 1250874033/1250874033 [27:55<00:00, 746346.28rows/s]

positions written.





In [None]:
dst.close()
print(out_path)


Done: /Volumes/SSD/mark/Documents/Works/MT_Dataset/mt_filtered_20250714.h5
