In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Incrementally build a centroid table.

For every source file named  grid_<zone>.parquet  in <input_dir>
  • compute WGS-84 centroids,
  • append them as new row-group(s) to the single Parquet file <output_dataset>
    (fastparquet file_scheme="simple"),
  • remember progress in a JSON checkpoint (default: <output_dataset> with its
    last suffix replaced by ".done") so the job can restart.

Output:
  <output_dataset>                    ← one Parquet file; rows are appended per processed zone
  <output_dataset minus suffix>.done  ← JSON list of processed filenames
Readable by: fastparquet, pyarrow, pandas, dask.
Requires: fastparquet ≥ 0.7, geopandas, pandas, pyarrow, tqdm.
"""

import json, os, re, gc, sys
from pathlib import Path

import geopandas as gpd
import pandas as pd
from fastparquet import write
from tqdm import tqdm


# ---------------------------------------------------------------------------
# helpers
# ---------------------------------------------------------------------------
def extract_zone_order(tile_id: pd.Series) -> pd.Series:
    """
    Map tile ids to an integer key that sorts NP → UTM (1N, 1S, … 60S) → SP.

    The zone token is the run of letters/digits that follows the
    ``G<digits>m_`` prefix and ends at the next underscore (for an id shaped
    like ``G600m_32N_...`` the token is ``32N``).

    Parameters
    ----------
    tile_id : pd.Series
        Tile identifiers.

    Returns
    -------
    pd.Series
        int64 keys on the same index as ``tile_id``:
        ``-1`` for NP, ``2 * zone`` for a northern UTM zone,
        ``2 * zone + 1`` for a southern UTM zone, ``9998`` for SP and
        ``9999`` for anything that cannot be parsed (including non-strings).
    """
    # The token must stop at the FIRST underscore.  A greedy ``\w+`` also
    # matches underscores, so an id with several underscore-separated fields
    # would yield e.g. "NP_X1" and the polar zones would never be recognised.
    zone_token_re = re.compile(r"G\d+m_([^\W_]+)_")
    utm_re = re.compile(r"(\d+)([NS])")

    def _key(tid: object) -> int:
        # Missing / non-text ids sort last instead of raising inside re.search.
        if not isinstance(tid, str):
            return 9999
        found = zone_token_re.search(tid)
        if not found:
            return 9999
        zone = found.group(1)
        if zone == "NP":
            return -1
        if zone == "SP":
            return 9998
        utm = utm_re.match(zone)
        if not utm:
            return 9999
        znum, hemi = int(utm[1]), utm[2]
        # Interleave hemispheres: 1N=2, 1S=3, 2N=4, ...
        return znum * 2 + (0 if hemi == "N" else 1)

    # astype keeps the dtype integral even for an empty input Series.
    return tile_id.map(_key).astype("int64")


def append_df(df: pd.DataFrame, dataset_path: Path) -> None:
    """
    Append `df` to the Parquet file at <dataset_path>.

    The file is created on the first call; later calls add the rows of `df`
    as new row-group(s) to that same file.  With ``file_scheme="simple"``
    fastparquet keeps everything in ONE file (it is the "hive"/"drill"
    schemes that produce a directory of part files).

    Parameters
    ----------
    df : pd.DataFrame
        Rows to append.  When appending, the columns must match what is
        already stored in the file.
    dataset_path : Path
        Target Parquet file.  Missing parent directories are created.

    Notes
    -----
    NOTE(review): an append rewrites the existing file in place, so an
    interruption during the write may leave the file unreadable — confirm
    this is acceptable or keep a backup.
    """
    dataset_path = Path(dataset_path)
    # The writer does not create the parent folder of a single-file dataset;
    # without it every call would fail with "no such file or directory".
    dataset_path.parent.mkdir(parents=True, exist_ok=True)

    write(
        dataset_path,                   # single file, created on the 1st call
        df,
        compression="ZSTD",
        append=dataset_path.exists(),   # True once the file exists
        file_scheme="simple",           # one file; appends add row-groups to it
        object_encoding="utf8",         # object columns (e.g. WKT text) stored as UTF-8 strings
    )


# ---------------------------------------------------------------------------
# main driver
# ---------------------------------------------------------------------------
def _centroid_chunk(src: Path) -> pd.DataFrame:
    """Read one grid file and return its tile ids with WGS-84 centroid WKT."""
    # Only the two columns that are needed are loaded.
    gdf = gpd.read_parquet(src, columns=["tile_id", "geometry"])
    if gdf.crs is None:
        raise ValueError("missing CRS")

    # Centroids are taken in the projected EPSG:6933 (equal-area) frame and
    # only then transformed to EPSG:4326, instead of averaging raw lon/lat.
    centroids = (
        gdf.to_crs("EPSG:6933").geometry.centroid
        .to_crs("EPSG:4326")
        .apply(lambda geom: geom.wkt)
    )

    chunk = pd.DataFrame(
        {
            "tile_id": gdf["tile_id"].values,
            "geometry": centroids.values,
            "zone_order": extract_zone_order(gdf["tile_id"]),
        }
    )
    # Stable sort keeps the source order of tiles that share a key, and
    # ignore_index leaves a plain 0..n-1 index so the Parquet writer is never
    # handed a permuted index that it might store as an extra column.
    return (
        chunk.sort_values("zone_order", kind="stable", ignore_index=True)
        .drop(columns="zone_order")
    )


def _save_checkpoint(checkpoint: Path, done: set[str]) -> None:
    """Persist the processed-file list by writing a temp file and swapping it in."""
    tmp = checkpoint.with_name(checkpoint.name + ".tmp")
    tmp.write_text(json.dumps(sorted(done)))
    # os.replace swaps the file in one step, so an interrupted run never
    # leaves a half-written (unparseable) checkpoint behind.
    os.replace(tmp, checkpoint)


def combine_grid_parquets_centroid_incremental(
    input_dir: str | Path,
    output_dataset: str | Path,
    checkpoint_file: str | Path | None = None,
) -> None:
    """
    Build the centroid table incrementally, one source file at a time.

    Every regular file in `input_dir` whose name starts with ``grid_`` and
    ends in ``.parquet`` is processed in sorted order: its centroids are
    computed, appended to `output_dataset` via ``append_df`` and its filename
    is recorded in the checkpoint.  Files already listed in the checkpoint are
    skipped, so the job can be re-run after an interruption.

    Parameters
    ----------
    input_dir : str | Path
        Folder containing the ``grid_*.parquet`` source files.
    output_dataset : str | Path
        Target passed to ``append_df``.  Its parent folder is created if
        missing.
    checkpoint_file : str | Path | None
        JSON file holding the list of processed filenames.  Defaults to
        `output_dataset` with its last suffix replaced by ``.done``
        (``grid.parquet`` → ``grid.done``).

    Notes
    -----
    * A source file that raises is reported on stderr and left out of the
      checkpoint; the run continues with the next file.
    * If the process dies after the append but before the checkpoint is
      saved, that file is appended again on the next run.
    """
    input_dir      = Path(input_dir)
    output_dataset = Path(output_dataset)
    checkpoint     = Path(checkpoint_file) if checkpoint_file else output_dataset.with_suffix(".done")

    # Make sure both targets can actually be written before any work starts.
    output_dataset.parent.mkdir(parents=True, exist_ok=True)
    checkpoint.parent.mkdir(parents=True, exist_ok=True)

    # ----------------------------------------------------------------------
    # 1. Already processed zones
    done: set[str] = set()
    if checkpoint.exists():
        done.update(json.loads(checkpoint.read_text()))

    # ----------------------------------------------------------------------
    # 2. Iterate over sources (regular files only, deterministic order)
    sources = sorted(
        p for p in input_dir.iterdir()
        if p.is_file() and p.name.startswith("grid_") and p.suffix == ".parquet"
    )

    failed: list[str] = []
    for src in tqdm(sources, desc="UTM zones"):
        if src.name in done:
            continue          # finished in an earlier run

        try:
            chunk = _centroid_chunk(src)
            append_df(chunk, output_dataset)

            # -- mark success only after the data has been written
            done.add(src.name)
            _save_checkpoint(checkpoint, done)

        except Exception as exc:
            # Best-effort driver: one bad file must not abort the whole run.
            failed.append(src.name)
            print(f"[WARN] {src.name} skipped: {exc}", file=sys.stderr)

        finally:
            # -- memory hygiene, on the failure path too
            chunk = None
            gc.collect()

    if failed:
        print(
            f"[WARN] {len(failed)} source file(s) raised errors: {', '.join(failed)}",
            file=sys.stderr,
        )

    print(f"✅ Finished. Dataset at {output_dataset} now contains {len(done)} zones.")


# ---------------------------------------------------------------------------
# Example invocation (edit the paths below, then run)


In [None]:
# Build the 600 m centroid table.
# Both locations are absolute paths on drive D: — adjust them before running.
output_file = "D:/nesteo_hf/grids/grids_centers/grid_600m.parquet"
input_directory = "D:/nesteo_hf/grids/grid_600m"
combine_grid_parquets_centroid_incremental(
    input_dir=input_directory,
    output_dataset=output_file,
)