# Near-surface max wind speed data wrangle
### Takes CSIRO CMIP6 Application-Ready Gridded Climate Data NetCDF daily time series, summarises by health district using the zonal maximum, and outputs this maximum as a daily time series for all health districts.

#### <i> https://data.csiro.au/collection/csiro:64206 </i>

In [1]:
import xarray as xr
import utils.agcd_agg_functions as agg
from importlib import reload
from shapely.geometry import mapping, box
import geopandas as gpd
import rioxarray
import numpy as np
from rasterio.enums import Resampling
from rasterio.features import geometry_mask
import dill
import pandas as pd
from pathlib import Path
from collections import defaultdict

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns',  None)

# Re-execute the local helper module so edits made to it since the first import take effect.
reload(agg)

<module 'utils.agcd_agg_functions' from 'c:\\Users\\jake.allen.ALLUVIUMQLD\\Documents\\Repos\\climate_health_reference_manual\\utils\\agcd_agg_functions.py'>

### Load datasets
We load everything into xarray dataset definitions for easy iteration later. Note that, thanks to dask, no data is held in memory yet; data is only read once a computation is triggered.

In [2]:
# build path dictionary
# Root folder of the climate rasters; the model and historical NetCDF search paths below are built from it.
clim_fold = 'E:/Jake_ClimateRasters'

def nested_dict():
    """Return a two-level auto-creating mapping whose leaves are empty lists.

    Accessing ``d[a]`` creates an inner ``defaultdict(list)`` on demand, and
    accessing ``d[a][b]`` creates an empty list on demand.
    """
    def _list_leaves():
        return defaultdict(list)

    return defaultdict(_list_leaves)

# Container for the discovered NetCDF paths. Model entries are stored as
# path_dict[model][ssp][epoch][variable]; historical entries are stored as
# path_dict['Historical'][variable].
path_dict = defaultdict(lambda: defaultdict(nested_dict))

# Climate models to process
models = [
    'ACCESS-ESM1-5',
    'CMCC-ESM2',
    'CNRM-ESM2-1',
    'EC-Earth3',
    'MPI-ESM1-2-HR',
    'UKESM1-0-LL',
]
#models = ['ACCESS-CM2']

# Scenarios, future epochs and variables to process
ssps = [
    'ssp245',
    'ssp370',
]
epochs = [
    'mid',
    'late',
]
variables = [
    'sfcWindmax',
]

# assign model paths
# Every model / scenario / epoch / variable folder is searched recursively
# for NetCDF files; a folder with no matches yields an empty list.
for model in models:
    for ssp in ssps:
        for epoch in epochs:
            # 'mid' maps to 2035-2064; any other epoch label maps to 2070-2099
            if epoch == 'mid':
                thisepoch = '2035-2064'
            else:
                thisepoch = '2070-2099'
            for thisvar in variables:
                thispath = Path(clim_fold) / model / ssp / thisvar / 'AUS-11' / thisepoch
                path_dict[model][ssp][epoch][thisvar] = list(thispath.rglob('*.nc'))

# assign historical observation paths
# One recursive NetCDF search per variable under the 'Historical' folder.
for thisvar in variables:
    thispath = Path(clim_fold) / 'Historical' / thisvar
    path_dict['Historical'][thisvar] = list(thispath.rglob('*.nc'))

In [3]:
# Read the merged health district polygons
districts = gpd.read_file('Inputs/health_district_merged.json')

# Optional: restrict the run to a few districts
#districts = districts[districts['health_district_name'].isin(['Northern NSW', 'Western Sydney', 'Eyre and Far North'])].copy().to_crs('EPSG:4326')

# Bounding box covering all districts, wrapped as a one-element list of
# geometry mappings (the form later passed to agg.load_var).
# NOTE(review): the bounds are in whatever CRS the input file uses, since no
# reprojection happens here - confirm that matches what load_var expects.
total_bounds = districts.total_bounds
minx, miny, maxx, maxy = total_bounds

bbox_geom = box(*total_bounds)
bbox = [mapping(bbox_geom)]

In [4]:
# Load the NetCDFs into a dictionary of datasets through the load_var helper,
# passing the district bounding box.
# (Original note: data is subset to the bounding box and chunked by
# time = 365 - this happens inside the helper; confirm in utils.agcd_agg_functions.)

# Same nesting as path_dict: data_dict[model][ssp][epoch] holds one Dataset
data_dict = defaultdict(lambda: defaultdict(nested_dict))

# load model data
for model in models:
    for ssp in ssps:
        for epoch in epochs:
            epoch_paths = path_dict[model][ssp][epoch]
            # one entry per variable, combined into a single Dataset
            loaded = {
                var_name: agg.load_var(epoch_paths[var_name], var_name, bbox)
                for var_name in variables
            }
            data_dict[model][ssp][epoch] = xr.Dataset(loaded)


# Load all historical variables into a dict
# historical data has misaligned time coordinates, causing issues when combining into a single dataset
hist_vars = {}

# Time coordinate of the first variable loaded; every later variable is forced
# onto it. It is kept outside hist_vars so it can never collide with a
# variable name and does not have to be removed again before building the
# dataset (the old pop('time') raised KeyError when no variable was loaded).
ref_time = None

for thisvar in variables:
    ds = agg.load_var(path_dict['Historical'][thisvar], thisvar, bbox, chunk=False)

    # Force matching time coordinates
    if ref_time is None:
        ref_time = ds['time']
    else:
        ds['time'] = ref_time

    hist_vars[thisvar] = ds

# Now build the dataset
data_dict['Historical'] = xr.Dataset(hist_vars)

Prepare the district polygons: reproject them to the CRS of the historical grid, extract that grid's affine transform, and create a combined state/district name column.

In [5]:
# Historical wind grid, used as the reference for both CRS and transform
hist_wind = data_dict['Historical']['sfcWindmax']

# Reproject the district polygons to the CRS of that grid
all_districts = districts.to_crs(hist_wind.rio.crs)

# Combined "<state>_<district>" label; its column name is passed to the zonal aggregation
state_names = all_districts['state']
district_names = all_districts['health_district_name']
all_districts['district_name_id'] = state_names + '_' + district_names

# Affine transform of the grid
affine = hist_wind.rio.transform()

### Daily maximum windspeed per health district

In [6]:
# DataFrame.to_csv does not create missing folders, so make sure the output
# folder exists before the aggregation starts.
out_dir = Path('Outputs/sfcWindmax')
out_dir.mkdir(parents=True, exist_ok=True)

for model in models:
    for ssp in ssps:
        for epoch in epochs:
            # period label for the filename ('mid' -> 2035-2064, otherwise 2070-2099);
            # it only depends on the epoch, so it is set once per epoch
            period = "FUTURE2035-2064" if epoch == 'mid' else "FUTURE2070-2099"

            for thisvar in variables:
                # retrieve this variable from the dataset stored for the model/ssp/epoch
                ds = data_dict[model][ssp][epoch][thisvar]
                # create time series dataset using function in agg functions file
                df = agg.zonal_maximum_time_series(ds, all_districts, affine, 'district_name_id')

                fname = f"{thisvar}_DailyTimeSeries_52HealthDistricts_{period}_{ssp}_{model}_maximum"

                df.to_csv(out_dir / f'{fname}.csv', index=False)

In [None]:
# Historical
# Same aggregation as for the future runs, applied to the historical grid.
ds = data_dict['Historical']['sfcWindmax']
df = agg.zonal_maximum_time_series(ds, all_districts, affine, 'district_name_id')
fname = "sfcWindmax_DailyTimeSeries_52HealthDistricts_CURRENT1985-2014_BARRA-R2_maximum.csv"

# Write next to the future outputs. The folder name uses the same casing as
# the future-run cell ('sfcWindmax'); the previous 'sfcWindMax' spelling only
# resolved to the same folder on case-insensitive filesystems.
out_dir = Path('Outputs/sfcWindmax')
# to_csv does not create missing folders
out_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(out_dir / fname, index=False)

In [11]:
# Report the largest value in the most recently computed time series (df),
# together with the district column and date it occurs on.

# Drop the date column so only the per-district values remain
df_no_date = df.drop(columns='date')

# Flat position of the maximum, ignoring NaN cells: a plain argmax would point
# at the first NaN if any cell were missing. The flat position is then split
# into row and column positions.
values = df_no_date.to_numpy()
row_idx, col_idx = divmod(int(np.nanargmax(values)), values.shape[1])

# Extract info. Both indices are positions, so positional accessors are used
# throughout; label-based .loc would only agree with them on a default RangeIndex.
max_value = df_no_date.iat[row_idx, col_idx]
max_date = df['date'].iloc[row_idx]
max_column = df_no_date.columns[col_idx]

print(f"Max value: {max_value} in column '{max_column}' on date {max_date}")

Max value: 46.84375 in column 'Western Australia_Pilbara' on date 1996-04-10 12:00:00
