This script aims to:
- load the newly calculated `high_work_productivity` raster outputs and compute zonal statistics (`mean`) by borough

*PS: this script is adapted from `invest_result_zonal_stats_temp.ipynb`.*

# Load data

## Function 

### function - short name

In [2]:
import re
from pathlib import Path

from pathlib import Path
import re

def make_short_name(path: Path, base_dir: Path) -> str:
    """
    Generate a compact name from a UCM scenario file.

    The result has the form ``"<prefix>_<deg>_<uhi>"``.

    Example:
    input:
        base_dir / "new_work_intensity" / "scenario0" / "high_work_productivity_22deg_2uhi_55hum.tif"
    output:
        "scenario0_22_2"

    Parameters
    ----------
    path : str or Path
        Path to the raster file. The file name must contain ``<number>deg_``
        followed later by ``<number>uhi`` (case-insensitive).
    base_dir : str or Path
        Directory the scenario folders live under.

    Returns
    -------
    str
        The prefix is the first folder between ``base_dir`` and the file whose
        name contains "scenario" (case-insensitive), else the first folder
        under ``base_dir``. If the file sits directly in ``base_dir`` or is
        not under ``base_dir`` at all, the name of the file's parent folder
        is used. Numbers are normalised so that ``22.0`` becomes ``22``.

    Raises
    ------
    ValueError
        If the deg/uhi values cannot be parsed from the file name.
    """
    path = Path(path)

    # 1) Scenario prefix: look only at the folders between base_dir and the file,
    #    never at the file name itself.
    try:
        rel_dirs = path.relative_to(base_dir).parts[:-1]
    except ValueError:
        # Path is not under base_dir; use the parent-folder fallback below.
        rel_dirs = ()

    if rel_dirs:
        prefix = next((p for p in rel_dirs if "scenario" in p.lower()), rel_dirs[0])
    else:
        # No intermediate folder (file directly in base_dir, or outside it).
        prefix = path.parent.name

    # 2) Extract the deg/uhi numbers from the file name.
    m = re.search(
        r"(?P<deg>\d+(?:\.\d+)?)deg_.*?(?P<uhi>\d+(?:\.\d+)?)uhi",
        path.name, flags=re.IGNORECASE
    )
    if not m:
        raise ValueError(f"Could not parse deg/uhi from {path.name}")

    def _norm_num(s: str) -> str:
        # "22.0" -> "22", "2.5" -> "2.5"
        f = float(s)
        return str(int(f)) if f.is_integer() else f"{f:g}"

    deg = _norm_num(m.group("deg"))
    uhi = _norm_num(m.group("uhi"))

    return f"{prefix}_{deg}_{uhi}"



# # Example usage
# from pathlib import Path
# wd_main     = Path(r"G:\Shared drives\Wellcome Trust Project Data") 
# dir_ucm_out = wd_main / "2_postprocess_intermediate" / "UCM_official_runs"
# f = dir_ucm_out / "new_work_intensity" / "scenario0" / "high_work_productivity_22deg_2uhi_55hum.tif"
# print(make_short_name(f, dir_ucm_out))


### function - zonal stats

In [3]:
import pandas as pd
import geopandas as gpd
from pathlib import Path
from rasterstats import zonal_stats

import math                              # EDIT
import numpy as np                       # EDIT
import rasterio as rio

def run_zonal_stats(
    aoi_path,
    raster_path,
    model_output_path,
    out_dir,
    stats=None,
    nodata=-9999.0,
    suffix=None,
    drop_cols=None
):
    """
    Compute zonal statistics for a single raster and save results to CSV.

    Parameters
    ----------
    aoi_path : str or Path
        Path to AOI shapefile/GeoJSON/GeoPackage.
    raster_path : str or Path
        Path to raster file.
    model_output_path : str or Path
        Base directory handed to ``make_short_name`` to derive the default
        output name from ``raster_path``.
    out_dir : str or Path
        Directory where output CSV will be saved (created if missing).
    stats : list[str] or None, default None
        Zonal statistics to compute. ``None`` means ``["mean"]``.
    nodata : float, default -9999.0
        Fallback NoData value. It is used only when the raster metadata
        declares no NoData value and the raster dtype is not float32; for
        float32 rasters without declared NoData the float32 minimum is used.
    suffix : str or None
        Name used for the output file ``<suffix>_zonal_stats.csv``.
        If None, it is derived with ``make_short_name``.
    drop_cols : list[str] or None
        Columns to drop from output DataFrame.

    Returns
    -------
    pd.DataFrame
        DataFrame with zonal statistics, plus a ``raster`` column (raster
        file name) and, when a matching folder is found, ``lc_scenario``.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if stats is None:
        stats = ["mean"]

    aoi = gpd.read_file(aoi_path)
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    raster_path = Path(raster_path)
    # Honour a caller-supplied suffix; only derive one when none was given.
    if suffix is None:
        suffix = make_short_name(raster_path, model_output_path)

    # First path component containing "lulc" or "scenario", if any.
    candidates = ["lulc", "scenario"]
    lc_scenario = next((p for p in raster_path.parts if any(c in p for c in candidates)), None)
    print(lc_scenario)  # → "scenario1"

    # -------- Detect NoData from raster metadata --------
    with rio.open(raster_path) as ds:
        meta_nodata = ds.nodata if ds.nodata is not None else ds.nodatavals[0]
        dtype = ds.dtypes[0].lower()

    # Prefer the raster's own NoData; otherwise fall back to the argument.
    nodata_to_use = meta_nodata if meta_nodata is not None else nodata

    # float32 rasters without declared NoData: use the float32 minimum as the
    # sentinel (this takes precedence over the `nodata` argument).
    if (meta_nodata is None) and ("float32" in dtype):
        nodata_to_use = np.finfo(np.float32).min

    # A NaN NoData value is not passed on explicitly to zonal_stats.
    pass_nodata = not (isinstance(nodata_to_use, float) and math.isnan(nodata_to_use))

    # -------------------- Run zonal stats --------------------
    zs_kwargs = dict(geojson_out=True, stats=stats)
    if pass_nodata:
        zs_kwargs["nodata"] = nodata_to_use

    zs_result = zonal_stats(
        aoi,
        str(raster_path),
        **zs_kwargs
    )

    zs_gdf = gpd.GeoDataFrame.from_features(zs_result)
    df = pd.DataFrame(zs_gdf)

    # Add scenario/lc column if found
    if lc_scenario:
        df["lc_scenario"] = lc_scenario

    # Add raster name column
    df["raster"] = raster_path.name

    # Drop unwanted columns (names missing from the table are ignored)
    if drop_cols:
        df = df.drop(columns=drop_cols, errors="ignore")

    # Save to CSV
    out_csv = out_dir / f"{suffix}_zonal_stats.csv"
    df.to_csv(out_csv, index=False)
    print(f"\t[OK] Zonal stats saved → {out_csv}")
    print(f"\t[info] dtype={dtype}, nodata_used={nodata_to_use!r}")

    return df


## Data dir

In [6]:
from pathlib import Path

# Set the working directory
# Root of the shared project drive
wd_main = Path(r"G:\Shared drives\Wellcome Trust Project Data")

# AOI shapefiles and the Urban Cooling Model run outputs
wd_shp = wd_main.joinpath("1_preprocess", "UrbanCoolingModel", "OfficialWorkingInputs", "AOIs")
dir_ucm_out = wd_main.joinpath("2_postprocess_intermediate", "UCM_official_runs")

# New work-intensity rasters and the folder for their zonal-stats tables
dir_ucm_prod_new = dir_ucm_out.joinpath("new_work_intensity")
dir_ucm_prod_new_stat = dir_ucm_prod_new.joinpath("post_processing_stats")

# Figure output folder
figures_dir = wd_main.joinpath("3_final", "UCM_figures")

## check nodata value

In [7]:
# Example raster path
import rasterio as rio
from rasterio.enums import MaskFlags

path = dir_ucm_prod_new / "scenario0" / "high_work_productivity_22deg_5uhi_55hum.tif"

with rio.open(path) as src:
    # NoData value reported for the first band
    band_nodata = src.nodatavals[0]
    print("nodata (band 1):", band_nodata)

    # How band 1 is masked (nodata value, per-dataset mask, alpha band, ...)
    band_flags = src.mask_flag_enums[0]
    print("mask flags:", band_flags)

    # Number of bands in the file
    print("count (bands):", src.count)

    # Count the pixels that are actually masked (mask value 0 = masked, 255 = valid)
    mask = src.read_masks(1)
    print("masked pixels:", (mask == 0).sum())


nodata (band 1): 1.7976931348623157e+308
mask flags: [<MaskFlags.nodata: 8>]
count (bands): 1
masked pixels: 3242580


## Batch temp raster 

In [8]:
# Zones for the statistics: the London borough AOI shapefile
aoi_adm = wd_shp.joinpath("London_Borough_aoi.shp")

# Columns removed from each zonal-stats table before it is saved
drop_cols = [
    "HECTARES",
    "NONLD_AREA",
    "ONS_INNER",
    "SUB_2009",
    "SUB_2006",
    "geometry",
]


## loop more rasters

In [10]:

# Collect every work-productivity GeoTIFF under the run folder, subfolders included
rasters = sorted(dir_ucm_prod_new.rglob("*work_productivity*.tif"))

# List what was found, one path per line
for raster_file in rasters:
    print(raster_file)

G:\Shared drives\Wellcome Trust Project Data\2_postprocess_intermediate\UCM_official_runs\new_work_intensity\scenario0\high_work_productivity_22.0deg_2.0uhi_55.0hum.tif
G:\Shared drives\Wellcome Trust Project Data\2_postprocess_intermediate\UCM_official_runs\new_work_intensity\scenario0\high_work_productivity_22deg_5uhi_55hum.tif
G:\Shared drives\Wellcome Trust Project Data\2_postprocess_intermediate\UCM_official_runs\new_work_intensity\scenario0\high_work_productivity_25deg_2uhi_45hum.tif
G:\Shared drives\Wellcome Trust Project Data\2_postprocess_intermediate\UCM_official_runs\new_work_intensity\scenario0\high_work_productivity_25deg_5uhi_45hum.tif
G:\Shared drives\Wellcome Trust Project Data\2_postprocess_intermediate\UCM_official_runs\new_work_intensity\scenario0\high_work_productivity_28deg_2uhi_45hum.tif
G:\Shared drives\Wellcome Trust Project Data\2_postprocess_intermediate\UCM_official_runs\new_work_intensity\scenario0\high_work_productivity_28deg_5uhi_45hum.tif
G:\Shared drives

In [11]:

# Arguments that are the same for every raster
shared_kwargs = dict(
    aoi_path=aoi_adm,
    model_output_path=dir_ucm_prod_new,
    out_dir=dir_ucm_prod_new_stat,
    stats=["mean"],
    drop_cols=drop_cols,
)

# One zonal-stats table per raster (each call also writes its own CSV)
all_results = []
for raster in rasters:
    all_results.append(run_zonal_stats(raster_path=raster, **shared_kwargs))

# Stack the per-raster tables row-wise into one long table
# (axis=1 would place them side by side instead)
df_combined = pd.concat(all_results, axis=0, ignore_index=True)


scenario0
	[OK] Zonal stats saved → G:\Shared drives\Wellcome Trust Project Data\2_postprocess_intermediate\UCM_official_runs\new_work_intensity\post_processing_stats\scenario0_22_2_zonal_stats.csv
	[info] dtype=float64, nodata_used=1.7976931348623157e+308
scenario0
	[OK] Zonal stats saved → G:\Shared drives\Wellcome Trust Project Data\2_postprocess_intermediate\UCM_official_runs\new_work_intensity\post_processing_stats\scenario0_22_5_zonal_stats.csv
	[info] dtype=float64, nodata_used=1.7976931348623157e+308
scenario0
	[OK] Zonal stats saved → G:\Shared drives\Wellcome Trust Project Data\2_postprocess_intermediate\UCM_official_runs\new_work_intensity\post_processing_stats\scenario0_25_2_zonal_stats.csv
	[info] dtype=float64, nodata_used=1.7976931348623157e+308
scenario0
	[OK] Zonal stats saved → G:\Shared drives\Wellcome Trust Project Data\2_postprocess_intermediate\UCM_official_runs\new_work_intensity\post_processing_stats\scenario0_25_5_zonal_stats.csv
	[info] dtype=float64, nodata_u

## save all summarized data

In [None]:
# ## 1. directly save if run all the rasters at once
# out_csv = dir_ucm_prod_new_stat / f"work_productivity_{len(all_results)}_zonal_stats_long.csv"
# df_combined.to_csv(out_csv, index=False)
# print(f"[OK] Zonal stats saved → {out_csv}")

[OK] Zonal stats saved → G:\Shared drives\Wellcome Trust Project Data\2_postprocess_intermediate\UCM_official_runs\new_work_intensity\post_processing_stats\work_productivity_24_zonal_stats_long.csv


In [12]:

## 2. save after running multiple rasters <- need to manually bind all results

# ==== CONFIG ====
ROOT = dir_ucm_prod_new_stat   # folder scanned for per-raster zonal-stats CSVs; change as needed
PATTERN = "scenario*_zonal_stats.csv" # file names starting with "scenario" and ending with "_zonal_stats.csv"
RECURSIVE = False                     # False: scan only ROOT itself; True: also scan its subfolders
# ================

def parse_scenario(filename: str) -> str:
    """
    Extract the 'scenario...' block from filenames like:
      scenario4_10prc_zonal_stats.csv
      scenario_foo_bar_zonal_stats.csv
      scenario3_zonal_stats.csv
      scenario0_22.5_2_zonal_stats.csv

    Tokens after "scenario<N>" may contain letters, digits and dots, so
    decimal labels such as "22.5" are kept.

    Returns the matched block (e.g. "scenario4_10prc"), or the plain string
    "scenario" when the file name does not match the expected layout.
    """
    m = re.search(r"(scenario(?:\d+)?(?:_[A-Za-z0-9.]+)*)_zonal_stats\.csv$", filename)
    return m.group(1) if m else "scenario"

# 1) Collect the per-raster CSVs, descending into subfolders only when RECURSIVE is set
finder = ROOT.rglob if RECURSIVE else ROOT.glob
files = sorted(finder(PATTERN))

if not files:
    raise FileNotFoundError(f"No files matching {PATTERN} under {ROOT}")

# 2) Load each file, tag its rows with the scenario label parsed from the
#    file name, then stack everything into one long table
dfs = [
    pd.read_csv(csv_path).assign(scenario=parse_scenario(csv_path.name))
    for csv_path in files
]
combined = pd.concat(dfs, ignore_index=True, sort=False)

# 3) Write the combined table next to the inputs
out_csv = ROOT / f"work_productivity_{len(files)}_zonal_stats_long.csv"
combined.to_csv(out_csv, index=False)

# Quick sanity check on what was loaded
print(f"Loaded {len(files)} files → {len(combined):,} rows")

# `combined` is the final DataFrame
combined.head()

print(f"[OK] Zonal stats saved → {out_csv}")

Loaded 42 files → 1,386 rows
[OK] Zonal stats saved → G:\Shared drives\Wellcome Trust Project Data\2_postprocess_intermediate\UCM_official_runs\new_work_intensity\post_processing_stats\work_productivity_42_zonal_stats_long.csv
