# calculate the leadtime-dependent climatological terciles, deciles and percentiles (0.02, then 0.05 to 0.95 with 0.05 step) from the individual GCMs' hindcast dataset (period 1993 - 2016) 

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline

### os
import os
import sys
from collections import OrderedDict

### datetimes
from datetime import datetime, timedelta

### scipy
import numpy as np
import pandas as pd
import xarray as xr
import geopandas as gpd
import cartopy.crs as ccrs
import dask
from dask.diagnostics import ProgressBar
from tqdm import tqdm

### plotting
from matplotlib import pyplot as plt
import matplotlib
import seaborn as sns


In [3]:
import pathlib

HOME = pathlib.Path.home()
CWD = pathlib.Path.cwd() 

In [4]:
sys.path.append('../../') 

In [5]:
from ICU_Water_Watch import geo, C3S, domains, plot

### dictionary mapping each quantile name to its quantile values; the values are passed as **lists** to avoid any numerical issues 

In [6]:
# quantile name -> list of quantile levels (fractions between 0 and 1).
# The terciles are the 4-decimal truncations of 1/3 and 2/3; the percentiles
# start at 0.02, then go from 0.05 to 0.95 by steps of 0.05.
dict_quantiles = OrderedDict(
    [
        ('tercile', [0.3333, 0.6666]),
        ('decile', [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
        (
            'percentile',
            [
                0.02, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45,
                0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95,
            ],
        ),
    ]
)

### variables 

### provider 

In [7]:
provider = 'CDS'

### variable name in the hindcast dataset 

In [8]:
varname = 'tprate'

### period: `monthly` or `seasonal` 

In [9]:
period = 'seasonal'
# period = 'monthly'

### list of valid GCMs 

In [10]:
GCMs = ['ECMWF', 'UKMO', 'METEO_FRANCE', 'DWD', 'CMCC', 'NCEP', 'JMA', 'ECCC']

### demonstration for one GCM 

In [11]:
GCM = 'ECMWF'

### path definition 

### where to find the GCM hindcast datasets 

In [12]:
gcm_path = pathlib.Path(f'/media/nicolasf/END19101/ICU/data/{provider}')

In [13]:
dpath = gcm_path.joinpath(GCM).joinpath(varname.upper())

In [14]:
print(dpath)

/media/nicolasf/END19101/ICU/data/CDS/ECMWF/TPRATE


### where to save the climatologies 

In [15]:
clim_path = gcm_path.joinpath(f'CLIMATOLOGY/{GCM}')

In [16]:
clim_path

PosixPath('/media/nicolasf/END19101/ICU/data/CDS/CLIMATOLOGY/ECMWF')

### get the list of files 

In [17]:
lfiles = list(dpath.glob(f"ensemble_seas_forecasts_{varname}_from_*_{GCM}.netcdf"))

In [18]:
lfiles.sort()

In [19]:
lfiles[0]

PosixPath('/media/nicolasf/END19101/ICU/data/CDS/ECMWF/TPRATE/ensemble_seas_forecasts_tprate_from_1993_01_ECMWF.netcdf')

In [20]:
lfiles[-1]

PosixPath('/media/nicolasf/END19101/ICU/data/CDS/ECMWF/TPRATE/ensemble_seas_forecasts_tprate_from_2021_04_ECMWF.netcdf')

In [21]:
len(lfiles)

338

### open the multiple files dataset, concatenating over the time dimension, and preprocessing 

In [22]:
dset = xr.open_mfdataset(lfiles, preprocess=C3S.preprocess_GCM, parallel=True, engine='netcdf4')

In [23]:
dset.info

<bound method Dataset.info of <xarray.Dataset>
Dimensions:  (lat: 86, lon: 126, member: 51, step: 5, time: 338)
Coordinates:
  * member   (member) int64 0 1 2 3 4 5 6 7 8 9 ... 42 43 44 45 46 47 48 49 50
  * time     (time) datetime64[ns] 1993-01-01 1993-02-01 ... 2021-04-01
  * lon      (lon) float32 120.0 121.0 122.0 123.0 ... 242.0 243.0 244.0 245.0
  * lat      (lat) float32 -55.0 -54.0 -53.0 -52.0 -51.0 ... 27.0 28.0 29.0 30.0
  * step     (step) int64 1 2 3 4 5
Data variables:
    tprate   (time, step, member, lat, lon) float32 dask.array<chunksize=(1, 5, 51, 86, 126), meta=np.ndarray>
Attributes:
    Conventions:  CF-1.6
    history:      2021-02-10 19:56:10 GMT by grib_to_netcdf-2.16.0: /opt/ecmw...>

### selects the hindcast period 

In [24]:
dset = dset.sel(time=slice('1993', '2016'))

### convert to monthly rainfall accumulations (mm/month)

In [25]:
dset.tprate.attrs

{'units': 'm s**-1', 'long_name': 'Mean total precipitation rate'}

In [26]:
dset = C3S.convert_rainfall(dset, varin='tprate', varout='precip', leadvar='step', timevar='time', dropvar=True)


unit is m s**-1, converting to mm/day

now converting to mm/month, converted precipitation will be held in var = precip


In [27]:
dset.info

<bound method Dataset.info of <xarray.Dataset>
Dimensions:  (lat: 86, lon: 126, member: 51, step: 5, time: 288)
Coordinates:
  * member   (member) int64 0 1 2 3 4 5 6 7 8 9 ... 42 43 44 45 46 47 48 49 50
  * time     (time) datetime64[ns] 1993-01-01 1993-02-01 ... 2016-12-01
  * lon      (lon) float32 120.0 121.0 122.0 123.0 ... 242.0 243.0 244.0 245.0
  * lat      (lat) float32 -55.0 -54.0 -53.0 -52.0 -51.0 ... 27.0 28.0 29.0 30.0
  * step     (step) int64 1 2 3 4 5
Data variables:
    precip   (time, step, member, lat, lon) float64 dask.array<chunksize=(1, 5, 51, 86, 126), meta=np.ndarray>
Attributes:
    Conventions:  CF-1.6
    history:      2021-02-10 19:56:10 GMT by grib_to_netcdf-2.16.0: /opt/ecmw...>

In [28]:
dset.precip.attrs

{'units': 'mm/month'}

### make sure there are no negative values 

In [29]:
dset = dset.clip(min=0)

### if the period is set to seasonal, calculate the seasonal (3-month) accumulations 

In [30]:
if period == 'seasonal':

    # 3-step rolling sum along the lead time (`step`) dimension; with
    # `min_periods=3` any window holding fewer than 3 steps evaluates to NaN.
    # The window dimension is already set in `rolling`, so `sum` takes no
    # dimension argument: its first parameter is `keep_attrs`, passed here
    # explicitly so that the dataset attributes are preserved.
    dset = dset.rolling({'step': 3}, min_periods=3).sum(keep_attrs=True)

    # keep only the steps for which a complete 3-step window exists (step >= 3)
    dset = dset.sel({'step': slice(3, None)})

In [31]:
dset.info

<bound method Dataset.info of <xarray.Dataset>
Dimensions:  (lat: 86, lon: 126, member: 51, step: 3, time: 288)
Coordinates:
  * member   (member) int64 0 1 2 3 4 5 6 7 8 9 ... 42 43 44 45 46 47 48 49 50
  * time     (time) datetime64[ns] 1993-01-01 1993-02-01 ... 2016-12-01
  * lon      (lon) float32 120.0 121.0 122.0 123.0 ... 242.0 243.0 244.0 245.0
  * lat      (lat) float32 -55.0 -54.0 -53.0 -52.0 -51.0 ... 27.0 28.0 29.0 30.0
  * step     (step) int64 3 4 5
Data variables:
    precip   (time, step, member, lat, lon) float64 dask.array<chunksize=(1, 3, 51, 86, 126), meta=np.ndarray>
Attributes:
    Conventions:  CF-1.6
    history:      2021-02-10 19:56:10 GMT by grib_to_netcdf-2.16.0: /opt/ecmw...>

### rechunk 

In [32]:
dset.chunks

Frozen(SortedKeysDict({'time': (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), 'step': (3,), 'member': (51,), 'lat': (86,), 'lon': (126,)}))

In [33]:
# target dask chunking: -1 means a single chunk spanning the whole dimension
# (here `time` and `member`), while `step`, `lat` and `lon` are split into
# blocks of 1, 5 and 5 elements respectively
chunks = dict(time=-1, member=-1, step=1, lat=5, lon=5)

In [34]:
dset = dset.chunk(chunks)

### get the months present in the hindcast dataset (not all months are present for all GCMs, e.g. the ECCC GCM's hindcasts are updated every month, when the corresponding real-time forecast is made available)

In [35]:
months = np.unique(dset.time.dt.month) 

In [36]:
months

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [None]:
# make sure the output directory exists before attempting to write into it
clim_path.mkdir(parents=True, exist_ok=True)

for month in months:

    print(f"calculating {period} hindcast climatologies for {GCM}, month {month}\n")

    # select the hindcasts whose `time` coordinate falls in this calendar month
    clim = dset.sel(time=(dset.time.dt.month == month))

    # stack the time and member dimensions into a single `instance` dimension
    clim = clim.stack(instance=('time', 'member'))

    # drop the instances containing missing values (the number of members is
    # sometimes inconsistent)
    clim = clim.dropna(dim='instance')

    # rechunk so that the `instance` dimension, along which the quantiles are
    # calculated, is held in one single chunk
    clim = clim.chunk({'instance': -1})

    # number of instances used for the calculation of the quantile climatologies,
    # added to the saved dataset for record-keeping
    n_instances = len(clim['instance'])

    print(f"The hindcast climatologies for month {month} in {GCM} will be calculated using {n_instances} instances\n")

    # now loop over the quantiles dictionary: calculate the climatological
    # quantiles, do some data munging, and save to disk
    for k, quantiles in dict_quantiles.items():

        print(f"{k} climatology\n")

        clim_quantile = clim.quantile(quantiles, dim='instance')

        # add a `month` dimension (length 1, coordinate = month) for later concatenation
        clim_quantile = clim_quantile.expand_dims({'month': [month]})

        # add the number of instances to the dataset for record-keeping
        clim_quantile['n_instances'] = ('month', [n_instances])

        # trigger the actual (dask) computation, with a progress bar
        with ProgressBar():

            clim_quantile = clim_quantile.compute()

        # output file, built once and reused for the write, the check and the message
        opath = clim_path.joinpath(f"{GCM}_{period}_{k}_climatology_{str(month).zfill(2)}.netcdf")

        # save to disk
        clim_quantile.to_netcdf(opath)

        if opath.exists():

            print(f"Successfully saved {str(opath)}\n")

        # release the dataset whatever the outcome of the check above
        clim_quantile.close()

calculating seasonal hindcast climatologies for ECMWF, month 1

The hindcast climatologies for month 1 in ECMWF will be calculated using 600 instances

[########################################] | 100% Completed | 20.8s
Successfully saved /media/nicolasf/END19101/ICU/data/CDS/CLIMATOLOGY/ECMWF/ECMWF_seasonal_tercile_climatology_01.netcdf

[########################################] | 100% Completed | 20.8s
Successfully saved /media/nicolasf/END19101/ICU/data/CDS/CLIMATOLOGY/ECMWF/ECMWF_seasonal_decile_climatology_01.netcdf

[########################################] | 100% Completed | 20.9s
Successfully saved /media/nicolasf/END19101/ICU/data/CDS/CLIMATOLOGY/ECMWF/ECMWF_seasonal_percentile_climatology_01.netcdf

calculating seasonal hindcast climatologies for ECMWF, month 2

The hindcast climatologies for month 2 in ECMWF will be calculated using 600 instances

[########################################] | 100% Completed | 20.9s
Successfully saved /media/nicolasf/END19101/ICU/data/CDS/CL