In [20]:
# Import live code changes in
%load_ext autoreload
%autoreload 

from pathlib import Path
import os
import pandas as pd
import rasterio
from rasterio.transform import from_origin
from rasterstats import zonal_stats
import geopandas as gpd
import numpy as np
from tqdm import tqdm
import random
import matplotlib.pyplot as plt

from sovereign.utils import df_to_raster

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Set filepaths and data info

In [21]:
root = Path.cwd().parent # find project root
future_flood_path = Path(os.path.join(root, 'outputs', 'flood', 'future', 'results.parquet.gzip'))
BASIN_SHP = Path(os.path.join(root, 'inputs', 'boundaries', 'basins', 'BA_THA_lev06.shp'))
# where to save rasters
RASTER_DIR = Path(os.path.join(root, 'outputs', 'flood', 'future', 'maps'))
# where to save the basin RP shifts
FUTURE_BASINS = Path(os.path.join(root, 'outputs', 'flood', 'future', 'basin_rp_shifts.csv'))
# USER CONFIG
RPS = [10, 20, 50, 75, 100, 200, 500] # what are the retrun periods we are using
EPOCHS = ['2030', '2040', '2050', '2060', '2070'] # what future epochs are we interested in?
SCENARIOS = ['ssp126', 'ssp370', 'ssp585'] # what climate scenarios are we intersted in?
# Relevant columns from the future flood dataframe
VALUE_COL = 'adjusted_return_period'
HYDRO_COL = 'hydro'
CLIM_COL = 'climate'
SCEN_COL = 'climate_scenario'
EPOCH_COL = 'period'
RP_COL = 'return_period'
LAT_COL = 'latitude'
LON_COL = 'longitude'
# Info for basin columns
BASIN_ID_COL = "HYBAS_ID"
geometry_col = "geometry"
NEW_BASIN_ID_COL = "HB_L6"

In [22]:
# Open and clean the data
future_df = pd.read_parquet(future_flood_path)
# Do some cleaning of the dataframe
future_df['noext'] = future_df['model'].str.replace('.nc', '', regex=False)
# split by '_'
parts = future_df['noext'].str.split('_', expand=True)
future_df[HYDRO_COL] = parts[2] # add a specific hydro model column
future_df[CLIM_COL] = parts[3] # add a specific climate model column
future_df = future_df.drop(columns=['noext']) # clean up
basins = gpd.read_file(BASIN_SHP)
basins = basins[[BASIN_ID_COL, geometry_col]]
basins = basins.rename(columns={BASIN_ID_COL: NEW_BASIN_ID_COL})
# Create raster directory if it doesn't already exoist
RASTER_DIR.mkdir(parents=True, exist_ok=True)

#### Create ISIMIP future RP change rasters (from dataframe)

In [23]:
# Find all unique hydro model and climate model pairs in the dataframe
model_pairs = (
    future_df[[HYDRO_COL, CLIM_COL]]
    .drop_duplicates()
    .itertuples(index=False, name=None)
)
model_pairs = list(model_pairs) # Make it a list

# Loop over all combinations and rasterize
for hydro, clim in model_pairs:
    for scen in SCENARIOS:
        for epoch in EPOCHS:
            for rp in RPS:               
                # Set output filepath
                out_name = f"{hydro}_{clim}_{scen}_{epoch}_rp{rp:03d}.tif"
                out_path = Path(os.path.join(RASTER_DIR, out_name))
                # Skip if file already exists
                if out_path.exists():
                    continue
                # Extract dataframe subset
                sub = future_df[(future_df[HYDRO_COL] == hydro) & (future_df[CLIM_COL] == clim) & (future_df[SCEN_COL] == scen) 
                    & (future_df[EPOCH_COL] == epoch) & (future_df[RP_COL] == rp)]
                if sub.empty:
                    continue
                # Convert to raster
                df_to_raster(sub, out_path)

#### Loop over the rasters and calculate basin return period change averages

In [24]:
records = [] # empty list to save results to
# Loop over all rasters
for tif_path in tqdm(sorted(RASTER_DIR.glob("*.tif")), desc="Running Zonal Stats"):
    stem = tif_path.stem  # e.g. "cwatm_gfdl-esm4_ssp126_2030_rp010"
    parts = stem.split("_")
    if len(parts) != 5:
        print(f"Skipping unexpected filename: {stem}")
        continue

    hydro, clim, scen, epoch, rp_str = parts
    rp = int(rp_str.replace("rp", ""))

    # Zonal stats: mean per basin
    zs = zonal_stats(
        basins,
        tif_path,
        stats=["mean"],
        nodata=-9999,
        all_touched=True   # calculate stats on any cells that touch region
    )
    means = [z["mean"] for z in zs]

    tmp = pd.DataFrame({
        NEW_BASIN_ID_COL: basins[NEW_BASIN_ID_COL].values,
        HYDRO_COL: hydro,
        CLIM_COL: clim,
        SCEN_COL: scen,
        EPOCH_COL: int(epoch),
        RP_COL: rp,
        "basin_mean_value": means,
    })

    records.append(tmp)

# Combine everything
future_rp_shifts = pd.concat(records, ignore_index=True)

Running Zonal Stats: 100%|███████████████████████████████████████████████████████| 3150/3150 [2:31:58<00:00,  2.89s/it]


#### Create Basin Return Period Change Dataframe and Save

In [25]:
# Calculate stats across climate models for each hydro model and scenario:epoch:rp combination
grouped = (future_rp_shifts.groupby([NEW_BASIN_ID_COL, HYDRO_COL, SCEN_COL, EPOCH_COL, RP_COL])
    .agg(q90=("basin_mean_value", lambda x: x.quantile(0.1)),
        q50=("basin_mean_value", lambda x: x.quantile(0.50)),
        q10=("basin_mean_value", lambda x: x.quantile(0.9)),
        mean=("basin_mean_value", np.mean)))
grouped.columns.name = "stat"
# Convert to stacked dataframe for saving
long_stats = (
    grouped
    .stack()
    .reset_index(name="new_rp_value")
)
# Save to CSV
long_stats.to_csv(os.path.join(root, 'outputs', 'flood', 'future', 'basin_rp_shifts.csv'))

  grouped = (future_rp_shifts.groupby([NEW_BASIN_ID_COL, HYDRO_COL, SCEN_COL, EPOCH_COL, RP_COL])
