In [None]:
import geopandas as gpd
from shapely.geometry import box
import pandas as pd
import rasterio 
from rasterstats import zonal_stats
import numpy as np
import os
import glob
import time

### Configuration ###
# Root directory of the processed data. It defaults to the original
# workstation path; set the NO_HEAT_DATA_ROOT environment variable to run the
# notebook on another machine without editing this cell.
DATA_ROOT = os.environ.get(
    "NO_HEAT_DATA_ROOT",
    "/media/remap/NO_HEAT_RB/City_Atlanta/Processed",
)
# Directory holding the building-footprint input and every output of this notebook.
FOOTPRINT_DIR = f"{DATA_ROOT}/Building_Footprints"

# Raster sampled by the zonal statistics.
NDSM = f"{DATA_ROOT}/DSM/normalized_DSM.tif"
# Input building footprints.
BD_PATH = f"{FOOTPRINT_DIR}/atlanta_building.geojson"
# Directory receiving the per-chunk part files.
INT_DIR = f"{FOOTPRINT_DIR}/intermediate"
# Final merged output.
OUTPUT_PATH = f"{FOOTPRINT_DIR}/building_ft_height.geojson"
### ------------- ###

In [None]:

def _derive_heights(stats_df: pd.DataFrame) -> pd.DataFrame:
    """Turn raw zonal statistics into the height columns joined onto a chunk.

    Parameters
    ----------
    stats_df : pd.DataFrame
        One row per feature with at least the columns "mean", "std" and "max".

    Returns
    -------
    pd.DataFrame
        Columns "coef_var", "mean_h" and "max_h" on the same index as
        ``stats_df``.
    """
    # A chunk in which no feature yields statistics produces all-None columns,
    # which pandas stores as object dtype; coerce to float so fillna/round
    # behave the same regardless of the chunk's content or the pandas version.
    std = pd.to_numeric(stats_df["std"], errors="coerce").fillna(0).round(2)
    mean_h = pd.to_numeric(stats_df["mean"], errors="coerce").fillna(0).round(2)
    max_h = pd.to_numeric(stats_df["max"], errors="coerce").fillna(0).round(2)

    # Division by a zero mean does not raise: it yields NaN (0 / 0) or inf.
    coef_var = std / mean_h

    return pd.DataFrame({"coef_var": coef_var, "mean_h": mean_h, "max_h": max_h})


def compute_height(
    gdf_path: str,
    ndsm_path: str,
    output_dir: str,
    output_path: str,
    chunk_size: int = 500,
) -> None:
    """
    Compute building height per feature by chunking a GeoDataFrame,
    computing zonal stats on the NDSM raster, and saving both chunked
    outputs and a merged final GeoJSON.

    Each feature gains four columns: ``mean_h`` and ``max_h`` (zonal mean and
    max, missing values replaced by 0, rounded to 2 decimals), ``coef_var``
    (rounded zonal std divided by ``mean_h``) and ``bd_h``, which is ``mean_h``
    when ``coef_var >= 0.7`` and ``max_h`` otherwise (including when
    ``coef_var`` is NaN).

    Parameters
    ----------
    gdf_path : str
        Path to the input vector file (any format supported by GeoPandas).
    ndsm_path : str
        Path to the normalized DSM raster used for zonal stats.
    output_dir : str
        Directory where part files will be saved.
    output_path : str
        Path where final output will be saved.
    chunk_size : int, optional
        Number of features per chunk (default is 500).

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1 or the input file has no features.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    # ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # load the full vector layer (all columns are carried through to the output)
    gdf = gpd.read_file(gdf_path)
    if gdf.empty:
        # Fail early with a clear message instead of pandas'
        # "No objects to concatenate" at the merge step.
        raise ValueError(f"No features found in {gdf_path}")

    # Load raster CRS using rasterio
    with rasterio.open(ndsm_path) as ndsm_src:
        ndsm_crs = ndsm_src.crs

    # bring the vector layer into the raster's CRS before sampling it
    if gdf.crs != ndsm_crs:
        print(f"Reprojecting vector data from {gdf.crs} to {ndsm_crs}")
        gdf = gdf.to_crs(ndsm_crs)

    part_paths = []
    chunk_starts = range(0, len(gdf), chunk_size)
    for idx, start in enumerate(chunk_starts, start=1):
        chunk = gdf.iloc[start : start + chunk_size]

        # compute stats for this chunk only
        stats = zonal_stats(
            chunk, ndsm_path,
            stats=["mean", "std", "max"],
            geojson_out=True
        )

        # build a DataFrame of stats and derive the height columns
        stats_df = pd.DataFrame([feat["properties"] for feat in stats])
        heights = _derive_heights(stats_df)

        # reset the chunk index so it aligns with the 0..n-1 index of `heights`
        result = chunk.reset_index(drop=True).join(heights)
        result["bd_h"] = np.where(
            result["coef_var"] >= 0.7,
            result["mean_h"],
            result["max_h"]
        )

        # save this chunk
        part_path = os.path.join(
            output_dir,
            f"part_{idx:03d}.geojson"
        )
        result.to_file(part_path, driver="GeoJSON")
        part_paths.append(part_path)
        print(f"Saved chunk {idx} to {part_path}")

    # merge all parts; part_paths is already in processing order, and sorting
    # the names as strings would misplace parts once idx exceeds 999
    # ("part_1000" sorts before "part_101").
    merged = gpd.GeoDataFrame(pd.concat(
        [gpd.read_file(p) for p in part_paths],
        ignore_index=True
    ))
    merged.to_file(output_path, driver="GeoJSON")
    print(f"Merged output saved to {output_path}")

In [None]:
if __name__ == "__main__":
    # perf_counter is a monotonic clock, so the measured duration is not
    # affected by system clock adjustments during a long run.
    s_time = time.perf_counter()

    # Run the full pipeline with the paths from the configuration cell.
    compute_height(
        gdf_path=BD_PATH,
        ndsm_path=NDSM,
        output_dir=INT_DIR,
        output_path=OUTPUT_PATH,
        chunk_size=500,                      # tweak as needed
    )
    p_time = time.perf_counter() - s_time
    print(f"Processing time: {p_time:.2f} seconds")