### Calculate the leadtime-dependent climatological terciles, quartiles, deciles and percentiles (0.02, then 0.05 to 0.95 with 0.05 step) from the individual GCMs' hindcast dataset (period 1993 - 2016) for admin areas  

### This notebook is driven via papermill by `ICU_forecast_table/drive_admin_GCMs_evaluation.ipynb`

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline

### os and standard libraries 
import os
import sys
from collections import OrderedDict
from itertools import product

### datetimes
from datetime import datetime, timedelta

### scipy
import numpy as np
import pandas as pd
import xarray as xr
import geopandas as gpd
import cartopy.crs as ccrs
import dask
from dask.diagnostics import ProgressBar
from tqdm import tqdm

### plotting
from matplotlib import pyplot as plt
import matplotlib
import seaborn as sns


In [3]:
import pathlib

HOME = pathlib.Path.home()
CWD = pathlib.Path.cwd() 

In [4]:
sys.path.append('../../') 

In [145]:
from ICU_Water_Watch import geo, C3S, domains, plot, utils, verification

### dictionary holding the quantile names and quantile values; they are passed as **lists** to avoid any numerical issues 

In [6]:
# quantile name -> list of quantile boundaries (probabilities between 0 and 1),
# kept in insertion order
dict_quantiles = OrderedDict(
    [
        ('tercile', [0.3333, 0.6666]),
        ('quartile', [0.25, 0.50, 0.75]),
        ('decile', [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
        (
            'percentile',
            [0.02, 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45,
             0.5 , 0.55, 0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95],
        ),
    ]
)

### list of GCMs with complete hindcast period 

In [7]:
# names of the GCMs with a complete hindcast period
GCMs = [
    'ECMWF',
    'UKMO',
    'METEO_FRANCE',
    'DWD',
    'CMCC',
    'NCEP',
    'JMA',
    'ECCC_CanCM4i',
    'ECCC_GEM_NEMO',
]

### PARAMETERS FOR PAPERMILL 

In [8]:
# Default parameter values; the notebook is driven via papermill, which is
# expected to override them. Kept as plain one-per-line assignments.
provider = 'CDS'  # sub-directory of the data root in which the hindcasts are looked up
varname = 'tprate'  # GCM variable name, used in the hindcast file pattern and in the paths
period = 'seasonal'  # 'seasonal' triggers the 3-step rolling accumulations further down
GCM = 'ECMWF'  # GCM name, used as sub-directory of the hindcasts and in the output file names
quantiles = 'quartile'  # key into dict_quantiles
method = 'empirical' # whether to calculate the parametrized or empirical quantiles
use_verif = 'era'  # NOTE(review): not read by any of the cells visible here - confirm it is still needed

In [None]:
dset_obs, dset_anomalies = verification.get_era5()

In [None]:
# number of categories for each supported set of climatological quantiles,
# i.e. the number of quantile boundaries + 1
ncategories_per_quantiles = {
    'tercile': 3,
    'quartile': 4,
    'decile': 10,
    'percentile': 21,
}

# fail here, with a clear message, rather than with a NameError on
# `ncategories` much later in the notebook
if quantiles not in ncategories_per_quantiles:
    raise ValueError(
        f"quantiles must be one of {list(ncategories_per_quantiles)}, got {quantiles!r}"
    )

ncategories = ncategories_per_quantiles[quantiles]

### path definitions follow

### outputs 

In [9]:
outputs_path = CWD.parents[1].joinpath("outputs/admin/operational")

In [10]:
print(str(outputs_path))

/home/nicolasf/operational/ICU/development/hotspots/code/ICU_Water_Watch/outputs/admin/operational


#### where to find the GCM hindcast datasets 

In [11]:
gcm_path = pathlib.Path(f'/media/nicolasf/END19101/ICU/data/{provider}/operational/hindcasts')

In [12]:
dpath = gcm_path.joinpath(GCM).joinpath(varname.upper())

In [13]:
print(dpath)

/media/nicolasf/END19101/ICU/data/CDS/operational/hindcasts/ECMWF/TPRATE


#### where to save the climatologies 

In [14]:
clim_path = gcm_path.joinpath(f'CLIMATOLOGY/{GCM}')

In [15]:
print(clim_path)

/media/nicolasf/END19101/ICU/data/CDS/operational/hindcasts/CLIMATOLOGY/ECMWF


In [16]:
# create the climatology directory, and any missing parent, if needed;
# exist_ok avoids the separate check-then-create step
clim_path.mkdir(parents=True, exist_ok=True)

### get the list of files 

In [17]:
# sorted list of the hindcast files for this GCM and variable
lfiles = sorted(dpath.glob(f"ensemble_seas_forecasts_{varname}_from_*.netcdf"))

# stop here with an explicit message instead of failing when the (empty)
# list of files is opened further down
if not lfiles:
    raise FileNotFoundError(
        f"no file matching ensemble_seas_forecasts_{varname}_from_*.netcdf found in {dpath}"
    )

In [18]:
lfiles.sort()

In [20]:
len(lfiles)

288

In [21]:
dset = xr.open_mfdataset(lfiles, preprocess=C3S.preprocess_GCM, parallel=True, engine='netcdf4')

In [22]:
dset = dset.sortby('time')

In [23]:
dset.info

<bound method Dataset.info of <xarray.Dataset>
Dimensions:  (time: 288, lon: 121, lat: 81, member: 25, step: 5)
Coordinates:
  * time     (time) datetime64[ns] 1993-01-01 1993-02-01 ... 2016-12-01
  * lon      (lon) float32 120.0 121.0 122.0 123.0 ... 237.0 238.0 239.0 240.0
  * lat      (lat) float32 -50.0 -49.0 -48.0 -47.0 -46.0 ... 27.0 28.0 29.0 30.0
  * member   (member) int32 0 1 2 3 4 5 6 7 8 9 ... 16 17 18 19 20 21 22 23 24
  * step     (step) int64 1 2 3 4 5
Data variables:
    tprate   (time, step, member, lat, lon) float32 dask.array<chunksize=(1, 5, 25, 81, 121), meta=np.ndarray>
Attributes:
    Conventions:  CF-1.6
    history:      2022-01-17 01:17:57 GMT by grib_to_netcdf-2.23.0: /opt/ecmw...>

### print the number of members in the ensemble for each time step 

In [24]:
# for t in range(len(dset.time)): 
    
#     s = dset.isel(time=t)
    
#     print(f"{pd.to_datetime(dset.isel(time=t)['time'].data):%Y-%m}:", len(s.dropna('member')['member']))

### selects the hindcast period 

In [25]:
dset = dset.sel(time=slice('1993', '2016'))

In [26]:
dset

Unnamed: 0,Array,Chunk
Bytes,1.31 GiB,4.67 MiB
Shape,"(288, 5, 25, 81, 121)","(1, 5, 25, 81, 121)"
Count,2016 Tasks,288 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.31 GiB 4.67 MiB Shape (288, 5, 25, 81, 121) (1, 5, 25, 81, 121) Count 2016 Tasks 288 Chunks Type float32 numpy.ndarray",5  288  121  81  25,

Unnamed: 0,Array,Chunk
Bytes,1.31 GiB,4.67 MiB
Shape,"(288, 5, 25, 81, 121)","(1, 5, 25, 81, 121)"
Count,2016 Tasks,288 Chunks
Type,float32,numpy.ndarray


### convert to monthly rainfall accumulations (mm/month)

In [27]:
dset.tprate.attrs

{'units': 'm s**-1', 'long_name': 'Mean total precipitation rate'}

In [28]:
dset = C3S.convert_rainfall(dset, varin='tprate', varout='precip', leadvar='step', timevar='time', dropvar=True)


unit is m s**-1, converting to mm/day

now converting to mm/month, converted precipitation will be held in var = precip


In [29]:
dset.info

<bound method Dataset.info of <xarray.Dataset>
Dimensions:  (time: 288, lon: 121, lat: 81, member: 25, step: 5)
Coordinates:
  * time     (time) datetime64[ns] 1993-01-01 1993-02-01 ... 2016-12-01
  * lon      (lon) float32 120.0 121.0 122.0 123.0 ... 237.0 238.0 239.0 240.0
  * lat      (lat) float32 -50.0 -49.0 -48.0 -47.0 -46.0 ... 27.0 28.0 29.0 30.0
  * member   (member) int32 0 1 2 3 4 5 6 7 8 9 ... 16 17 18 19 20 21 22 23 24
  * step     (step) int64 1 2 3 4 5
Data variables:
    precip   (time, step, member, lat, lon) float64 dask.array<chunksize=(1, 5, 25, 81, 121), meta=np.ndarray>
Attributes:
    Conventions:  CF-1.6
    history:      2022-01-17 01:17:57 GMT by grib_to_netcdf-2.23.0: /opt/ecmw...>

In [30]:
dset.precip.attrs

{'units': 'mm/month'}

### make sure there are no negative values 

In [31]:
dset = dset.clip(min=0)

### if the period is set to `seasonal`, calculates the seasonal accumulations 

In [32]:
if period == 'seasonal':

    # sums over a trailing window of 3 consecutive steps; windows with fewer
    # than 3 values give NaN
    seasonal_windows = dset.rolling({'step': 3}, min_periods=3)

    # keep only the steps from 3 onwards
    dset = seasonal_windows.sum('step').sel(step=slice(3, None))

In [33]:
dset.info

<bound method Dataset.info of <xarray.Dataset>
Dimensions:  (time: 288, lon: 121, lat: 81, member: 25, step: 3)
Coordinates:
  * time     (time) datetime64[ns] 1993-01-01 1993-02-01 ... 2016-12-01
  * lon      (lon) float32 120.0 121.0 122.0 123.0 ... 237.0 238.0 239.0 240.0
  * lat      (lat) float32 -50.0 -49.0 -48.0 -47.0 -46.0 ... 27.0 28.0 29.0 30.0
  * member   (member) int32 0 1 2 3 4 5 6 7 8 9 ... 16 17 18 19 20 21 22 23 24
  * step     (step) int64 3 4 5
Data variables:
    precip   (time, step, member, lat, lon) float64 dask.array<chunksize=(1, 3, 25, 81, 121), meta=np.ndarray>
Attributes:
    Conventions:  CF-1.6
    history:      2022-01-17 01:17:57 GMT by grib_to_netcdf-2.23.0: /opt/ecmw...>

In [34]:
steps = dset.step.data

In [35]:
steps

array([3, 4, 5])

### read the csv listing the countries and their admin areas 

In [38]:
# table of the countries and their admin areas (columns `country` and `admin`);
# the path is built from HOME, like the path to the land-sea masks, instead of
# hard-coding one user's home directory
countries_admin = pd.read_csv(
    HOME.joinpath('operational/ICU/development/hotspots/code/ICU_Water_Watch/outputs/admin/list_countries_admin.csv'),
    index_col=0,
)

In [41]:
countries_admin.query("country == 'Fiji'")

Unnamed: 0,country,admin
30,Fiji,Rotuma
31,Fiji,Northern
32,Fiji,Eastern
33,Fiji,Central
34,Fiji,Western


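### for each country and admin area: read the land sea mask, then calculate and save the climatological quantiles and the hindcast category probabilities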

In [45]:
# directory holding one land-sea mask file per country and admin area
lsmasks_path = HOME / 'operational' / 'ICU' / 'development' / 'hotspots' / 'code' / 'ICU_Water_Watch' / 'outputs' / 'admin' / 'lsmasks'

In [143]:
def _categorize_by_month(dset_reg, clim, drop_missing_members=False):
    """
    Digitize a regional-mean dataset into quantile categories, month by month.

    Parameters
    ----------
    dset_reg : xarray.Dataset
        dataset with a `time` dimension and a `precip` variable
    clim : xarray.Dataset
        climatological quantiles with a `month` dimension
    drop_missing_members : bool
        whether to drop the members containing missing values before digitizing

    Returns
    -------
    xarray.Dataset
        the categories returned by `C3S.get_GCM_category_digitize` for each
        calendar month, concatenated along `time` and sorted by `time`
    """

    categories = []

    for month in np.unique(dset_reg.time.dt.month):

        x = dset_reg.sel(time=(dset_reg.time.dt.month == month))

        if drop_missing_members:

            x = x.dropna('member')

        categories.append(C3S.get_GCM_category_digitize(x, clim.sel(month=month), varname='precip', dim='quantile'))

    return xr.concat(categories, dim='time').sortby('time')


### everything below, down to the loop, does not depend on the country / admin area

# function calculating the climatological quantiles; fail early on an
# unsupported `method` instead of hitting an undefined (or stale) climatology
quantile_functions = {
    'parametrized': C3S.calc_parametrized_quantiles,
    'empirical': C3S.calc_empirical_quantiles,
}

if method not in quantile_functions:

    raise ValueError(f"method must be one of {list(quantile_functions)}, got {method!r}")

calc_quantiles = quantile_functions[method]

quantiles_kwargs = {'quantiles': dict_quantiles[quantiles], 'dims': ('time', 'member')}

# output directories, created once (with any missing parent)
climatologies_path = outputs_path.joinpath("climatologies")
categories_path = outputs_path.joinpath("hindcast_categories")
probs_and_obs_path = outputs_path.joinpath("hindcasts_probs_and_obs")

for output_dir in (climatologies_path, categories_path, probs_and_obs_path):

    output_dir.mkdir(parents=True, exist_ok=True)

# domain of the (complete) GCM dataset, used to subset the observations
gcm_domain = domains.get_domain(dset)

dset_obs_domain = dset_obs.sel(lon=slice(*gcm_domain[:2]), lat=slice(*gcm_domain[2:]))

if period == 'seasonal':

    # trailing 3 time-steps sums; the first 2 time-steps have incomplete windows and are dropped
    dset_obs_domain = dset_obs_domain.rolling({"time":3}, min_periods=3, center=False).sum('time')

    dset_obs_domain = dset_obs_domain.isel(time=slice(2, None))

for country in countries_admin['country'].unique():

    # boolean mask rather than query(), so that a quote in a country name cannot break the expression
    admins = countries_admin.loc[countries_admin['country'] == country, 'admin'].unique()

    for admin in admins:

        country_name = utils.sanitize_name(country)
        admin_name = utils.sanitize_name(admin)

        # read the mask in memory, so that the file is closed before moving on
        with xr.open_dataset(lsmasks_path.joinpath(f"landsea_mask_{country}_{admin}.nc")) as lsmask_dset:

            lsmask = lsmask_dset['mask'].load()

        domain = [lsmask.attrs[key] for key in ('lon_min', 'lon_max', 'lat_min', 'lat_max')]

        ### ------------------------------------------------------------------
        ### GCM hindcasts: regional mean, climatology, categories probabilities

        dset_sub = dset.sel(lon=slice(*domain[:2]), lat=slice(*domain[2:]))

        dset_sub = utils.interp(dset_sub, interp_factor=5)

        dset_sub['mask'] = lsmask

        dset_sub['precip'] = dset_sub['precip'] * dset_sub['mask']

        dset_sub_reg = dset_sub.mean(['lat','lon'])

        with ProgressBar():

            dset_sub_reg = dset_sub_reg.load()

        clim_p = dset_sub_reg.groupby(dset_sub_reg.time.dt.month).apply(calc_quantiles, **quantiles_kwargs)

        print(f"saving {quantiles} climatology for {varname} {period} {GCM} {country_name} {admin_name} in {str(climatologies_path)}")

        clim_p.to_netcdf(climatologies_path.joinpath(f"{method}_{quantiles}_{varname}_{period}_{GCM}_{country_name}_{admin_name}.nc"))

        # the members with missing values are dropped before digitizing
        dset_cat = _categorize_by_month(dset_sub_reg, clim_p, drop_missing_members=True)

        quantiles_category_percent = C3S.calculate_quantiles_probabilities(dset_cat, ncategories=ncategories)

        categories_stem = f"{method}_{quantiles}_categories_probabilities_hindcast_{GCM}_{period}_{country_name}_{admin_name}"

        quantiles_category_percent.to_netcdf(categories_path.joinpath(f"{categories_stem}.nc"))

        # one block of columns per step, with (step, category) as column labels
        df_quantile_probabilities = []

        for step in steps:

            df = quantiles_category_percent.sel(step=step)['precip'].to_pandas().T

            df.columns = pd.MultiIndex.from_product([[step], df.columns])

            df_quantile_probabilities.append(df)

        df_quantile_probabilities = pd.concat(df_quantile_probabilities, axis=1)

        df_quantile_probabilities.to_csv(categories_path.joinpath(f"{categories_stem}.csv"))

        ### ------------------------------------------------------------------
        ### observations: same grid and mask as the GCM, climatology, categories

        dset_obs_sub = dset_obs_domain.interp_like(dset_sub[['lon','lat']])

        dset_obs_sub['mask'] = dset_sub['mask']

        dset_obs_sub['precip'] = dset_obs_sub['precip'] * dset_obs_sub['mask']

        # dummy `member` dimension, so that the functions written for the GCM can be used as is
        dset_obs_sub = dset_obs_sub.expand_dims({'member':[1]})

        dset_obs_sub = dset_obs_sub[['precip']].mean(['lat','lon'])

        dset_obs_sub = dset_obs_sub.chunk({'member':-1, 'time':-1})

        with ProgressBar():

            dset_obs_sub = dset_obs_sub.compute()

        clim_obs_p = dset_obs_sub.groupby(dset_obs_sub.time.dt.month).apply(calc_quantiles, **quantiles_kwargs)

        dset_obs_cat = _categorize_by_month(dset_obs_sub, clim_obs_p)

        ### ------------------------------------------------------------------
        ### observed category alongside the hindcast probabilities, saved for
        ### each admin area (done inside the loop, otherwise only the last
        ### admin area would be written)

        obs_categories = dset_obs_cat.squeeze()['precip'].to_pandas().to_frame(name='obs')

        # (step, category) MultiIndex flattened to tuples, so that both frames
        # have single-level columns when joined
        probabilities_flat = df_quantile_probabilities.copy()

        probabilities_flat.columns = probabilities_flat.columns.to_flat_index()

        # NOTE(review): with center=False the observed accumulations are labelled
        # by the last time-step of the window, while the GCM `time` is presumably
        # the initialisation month - confirm that joining on `time` pairs the
        # intended target season for each step
        df_quantile_probabilities_obs = obs_categories.join(probabilities_flat, on='time')

        df_quantile_probabilities_obs.to_csv(probs_and_obs_path.joinpath(f"{country_name}_{admin_name}_{period}_{GCM}_{method}_{quantiles}_probs_and_obs.csv"))

In [131]:
dset_obs_cat

In [132]:
dset_obs_cat = dset_obs_cat.squeeze()['precip'].to_pandas()

In [133]:
dset_obs_cat

time
1993-03-01    2
1993-04-01    1
1993-05-01    2
1993-06-01    2
1993-07-01    2
             ..
2016-08-01    4
2016-09-01    4
2016-10-01    4
2016-11-01    3
2016-12-01    4
Length: 286, dtype: int64

In [134]:
df_quantile_probabilities

Unnamed: 0_level_0,3,3,3,3,4,4,4,4,5,5,5,5
quartile,1,2,3,4,1,2,3,4,1,2,3,4
time,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1993-01-01,52.0,16.0,24.0,8.0,48.0,16.0,20.0,16.0,48.0,20.0,24.0,8.0
1993-02-01,36.0,44.0,16.0,4.0,56.0,24.0,12.0,8.0,64.0,16.0,20.0,0.0
1993-03-01,60.0,12.0,28.0,0.0,52.0,36.0,12.0,0.0,44.0,28.0,24.0,4.0
1993-04-01,68.0,16.0,12.0,4.0,40.0,40.0,12.0,8.0,48.0,32.0,16.0,4.0
1993-05-01,44.0,36.0,16.0,4.0,36.0,40.0,20.0,4.0,44.0,32.0,16.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2016-08-01,36.0,40.0,24.0,0.0,8.0,56.0,36.0,0.0,16.0,28.0,44.0,12.0
2016-09-01,20.0,40.0,32.0,8.0,12.0,40.0,40.0,8.0,4.0,60.0,28.0,8.0
2016-10-01,8.0,20.0,52.0,20.0,0.0,40.0,56.0,4.0,8.0,32.0,40.0,20.0
2016-11-01,8.0,28.0,36.0,28.0,16.0,24.0,36.0,24.0,20.0,36.0,28.0,16.0


In [135]:
dset_obs_cat = dset_obs_cat.to_frame(name='obs')

In [136]:
# `dset_obs_cat` has single-level columns while the probabilities have
# (step, category) MultiIndex columns: joining frames with a different number
# of column levels triggers a warning and is rejected by recent versions of
# pandas, so the MultiIndex is first flattened to tuples (same resulting
# column labels, e.g. (3, 1))
probabilities_flat = df_quantile_probabilities.copy()

probabilities_flat.columns = probabilities_flat.columns.to_flat_index()

df_quantile_probabilities_obs = dset_obs_cat.join(probabilities_flat, on='time')

  df_quantile_probabilities_obs = dset_obs_cat.join(df_quantile_probabilities, on='time')


In [137]:
df_quantile_probabilities_obs.head()

Unnamed: 0_level_0,obs,"(3, 1)","(3, 2)","(3, 3)","(3, 4)","(4, 1)","(4, 2)","(4, 3)","(4, 4)","(5, 1)","(5, 2)","(5, 3)","(5, 4)"
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1993-03-01,2,60.0,12.0,28.0,0.0,52.0,36.0,12.0,0.0,44.0,28.0,24.0,4.0
1993-04-01,1,68.0,16.0,12.0,4.0,40.0,40.0,12.0,8.0,48.0,32.0,16.0,4.0
1993-05-01,2,44.0,36.0,16.0,4.0,36.0,40.0,20.0,4.0,44.0,32.0,16.0,8.0
1993-06-01,2,28.0,32.0,12.0,28.0,20.0,36.0,24.0,20.0,24.0,40.0,28.0,8.0
1993-07-01,2,32.0,44.0,16.0,8.0,48.0,28.0,24.0,0.0,40.0,32.0,24.0,4.0


In [138]:
df_quantile_probabilities_obs.dropna()

Unnamed: 0_level_0,obs,"(3, 1)","(3, 2)","(3, 3)","(3, 4)","(4, 1)","(4, 2)","(4, 3)","(4, 4)","(5, 1)","(5, 2)","(5, 3)","(5, 4)"
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1993-03-01,2,60.0,12.0,28.0,0.0,52.0,36.0,12.0,0.0,44.0,28.0,24.0,4.0
1993-04-01,1,68.0,16.0,12.0,4.0,40.0,40.0,12.0,8.0,48.0,32.0,16.0,4.0
1993-05-01,2,44.0,36.0,16.0,4.0,36.0,40.0,20.0,4.0,44.0,32.0,16.0,8.0
1993-06-01,2,28.0,32.0,12.0,28.0,20.0,36.0,24.0,20.0,24.0,40.0,28.0,8.0
1993-07-01,2,32.0,44.0,16.0,8.0,48.0,28.0,24.0,0.0,40.0,32.0,24.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-08-01,4,36.0,40.0,24.0,0.0,8.0,56.0,36.0,0.0,16.0,28.0,44.0,12.0
2016-09-01,4,20.0,40.0,32.0,8.0,12.0,40.0,40.0,8.0,4.0,60.0,28.0,8.0
2016-10-01,4,8.0,20.0,52.0,20.0,0.0,40.0,56.0,4.0,8.0,32.0,40.0,20.0
2016-11-01,3,8.0,28.0,36.0,28.0,16.0,24.0,36.0,24.0,20.0,36.0,28.0,16.0


In [139]:
# create the output directory, and any missing parent, if needed;
# exist_ok avoids the separate check-then-create step
outputs_path.joinpath("hindcasts_probs_and_obs").mkdir(parents=True, exist_ok=True)

In [140]:
# NOTE: `country` and `admin` are the values left behind by the last iteration
# of the loop over the countries and admin areas
probs_and_obs_csv = outputs_path.joinpath(f"hindcasts_probs_and_obs/{utils.sanitize_name(country)}_{utils.sanitize_name(admin)}_{period}_{GCM}_{method}_{quantiles}_probs_and_obs.csv")

df_quantile_probabilities_obs.to_csv(probs_and_obs_csv)

In [141]:
outputs_path

PosixPath('/home/nicolasf/operational/ICU/development/hotspots/code/ICU_Water_Watch/outputs/admin/operational')