# calculate the leadtime-dependent climatological terciles, deciles and percentiles (0.02, then 0.05 to 0.95 with 0.05 step) from the individual GCMs' hindcast dataset (period 1993 - 2016) 

#### **Note**: this is quite slow: It takes about 15 minutes to process a complete climatology (12 months) for one GCM, for one accumulation period (monthly or seasonal), but only needs to be calculated once ... 

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline

### os
import os
import sys
from collections import OrderedDict

### datetimes
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from calendar import month_name


### scipy
import numpy as np
import pandas as pd
import xarray as xr
import geopandas as gpd
import cartopy.crs as ccrs
import dask
from dask.diagnostics import ProgressBar
from tqdm import tqdm

### plotting
from matplotlib import pyplot as plt
import matplotlib
import seaborn as sns


In [3]:
import pathlib

# user's home directory and the directory the notebook was launched from
# (neither is referenced again in the visible part of this notebook)
HOME, CWD = pathlib.Path.home(), pathlib.Path.cwd()

In [4]:
# extend the module search path two directories up (relative to the working
# directory) so that the `ICU_Water_Watch` import in the next cell resolves
# NOTE(review): presumably '../../' is the directory holding the package — confirm
sys.path.append('../../') 

In [5]:
from ICU_Water_Watch import geo, C3S, domains, plot

### dictionary holding the quantile names and the corresponding quantile values; the values are passed as **lists** to avoid any numerical issues 

In [6]:
# quantile family name -> probabilities at which the climatological quantiles
# are evaluated; insertion order is the order in which the families are processed
dict_quantiles = OrderedDict(
    [
        ('tercile', [0.3333, 0.6666]),
        ('decile', [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
        (
            'percentile',
            [
                0.02, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45,
                0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95,
            ],
        ),
    ]
)

### variables 

### provider 

In [7]:
# data provider identifier; it is interpolated into the hindcast root path below
provider = "CDS"

### variable name in the hindcast dataset 

In [8]:
# variable name as found in the hindcast files; its upper-cased form is also
# used as a sub-directory name and the lower-cased form in the file name pattern
varname = "tprate"

### period: `monthly` or `seasonal` 

In [9]:
# accumulation period: 'seasonal' triggers the 3-step rolling sum further down
# and is part of the output file names; swap with the line below for monthly
# period = 'monthly'
period = "seasonal"

### lag in months (if need to calculate climatologies for older hindcasts)

In [10]:
# number of months subtracted from today's date before picking the
# initialisation month (0 = use the current month)
lag = 0

### get today's date 

In [11]:
# current UTC time as a *naive* datetime, exactly what `datetime.utcnow()`
# returned; `utcnow()` is deprecated since Python 3.12, so the aware "now" is
# taken and its tzinfo stripped to keep downstream behaviour unchanged
from datetime import timezone

date = datetime.now(timezone.utc).replace(tzinfo=None)

### apply lag 

In [12]:
# step back `lag` calendar months from the reference date
date -= relativedelta(months=lag)

In [13]:
# report the (lagged) initialisation month, formatted as the full month name (%B)
print(f"will process hindcasts for {date:%B}")

will process hindcasts for August


In [14]:
# calendar month number of the (lagged) date; used below to select the
# hindcast files and to label the saved climatologies
initial_month = date.month

### list of valid GCMs 

In [15]:
# names of the GCMs for which hindcasts are handled (reference list; the
# visible cells do not validate `GCM` against it)
GCMs = [
    "ECMWF",
    "UKMO",
    "METEO_FRANCE",
    "DWD",
    "CMCC",
    "NCEP",
    "JMA",
    "ECCC",
]

### demonstration for one GCM; note that this code will need to be re-run every month for e.g. ECCC, as its hindcast dataset is apparently updated once the corresponding forecasts are made available 

In [16]:
# GCM processed in this run; used for the input directory, the file name
# pattern and the output names
# NOTE(review): the recorded cell outputs further down show UKMO, i.e. they
# come from a run made with a different value than the one set here
GCM = "METEO_FRANCE"

### path definitions follow

#### where to find the GCM hindcast datasets 

In [17]:
# root directory of the operational hindcasts for the selected provider
# (machine-specific absolute path: adapt it to the local storage layout)
gcm_path = pathlib.Path(f'/media/nicolasf/END19101/ICU/data/{provider}/operational/hindcasts')

In [18]:
# gcm_path = pathlib.Path(f'/media/nicolasf/END19101/ICU/data/{provider}')

In [19]:
# input directory: <hindcast root>/<GCM>/<upper-cased variable name>
dpath = gcm_path / GCM / varname.upper()

In [20]:
# show the directory that will be searched for hindcast files
print(dpath)

/media/nicolasf/END19101/ICU/data/CDS/operational/hindcasts/UKMO/TPRATE


#### where to save the climatologies 

In [21]:
# output directory: one `CLIMATOLOGY/<GCM>` folder under the hindcast root
clim_path = gcm_path.joinpath(f'CLIMATOLOGY/{GCM}')

In [22]:
# display the output directory for a visual check
clim_path

PosixPath('/media/nicolasf/END19101/ICU/data/CDS/operational/hindcasts/CLIMATOLOGY/UKMO')

In [23]:
# create the output directory (and any missing parents); `exist_ok=True`
# makes this a no-op when it is already there and avoids the window between
# a separate existence check and the creation
clim_path.mkdir(parents=True, exist_ok=True)

### get the list of files 

In [24]:
# one file per hindcast year is expected for this initialisation month: the
# `*` wildcard stands for the year part of the file name
pattern = f"ensemble_seas_forecasts_{varname}_from_*_{initial_month:02d}_{GCM}.netcdf"
lfiles = list(dpath.glob(pattern))

In [25]:
# glob order is arbitrary: order the paths so the files are in a stable,
# name-sorted sequence
lfiles = sorted(lfiles)

In [26]:
# display the sorted list of matched files
lfiles

[PosixPath('/media/nicolasf/END19101/ICU/data/CDS/operational/hindcasts/UKMO/TPRATE/ensemble_seas_forecasts_tprate_from_1993_08_UKMO.netcdf'),
 PosixPath('/media/nicolasf/END19101/ICU/data/CDS/operational/hindcasts/UKMO/TPRATE/ensemble_seas_forecasts_tprate_from_1994_08_UKMO.netcdf'),
 PosixPath('/media/nicolasf/END19101/ICU/data/CDS/operational/hindcasts/UKMO/TPRATE/ensemble_seas_forecasts_tprate_from_1995_08_UKMO.netcdf'),
 PosixPath('/media/nicolasf/END19101/ICU/data/CDS/operational/hindcasts/UKMO/TPRATE/ensemble_seas_forecasts_tprate_from_1996_08_UKMO.netcdf'),
 PosixPath('/media/nicolasf/END19101/ICU/data/CDS/operational/hindcasts/UKMO/TPRATE/ensemble_seas_forecasts_tprate_from_1997_08_UKMO.netcdf'),
 PosixPath('/media/nicolasf/END19101/ICU/data/CDS/operational/hindcasts/UKMO/TPRATE/ensemble_seas_forecasts_tprate_from_1998_08_UKMO.netcdf'),
 PosixPath('/media/nicolasf/END19101/ICU/data/CDS/operational/hindcasts/UKMO/TPRATE/ensemble_seas_forecasts_tprate_from_1999_08_UKMO.netcdf'),

In [27]:
# first file of the sorted list
lfiles[0]

PosixPath('/media/nicolasf/END19101/ICU/data/CDS/operational/hindcasts/UKMO/TPRATE/ensemble_seas_forecasts_tprate_from_1993_08_UKMO.netcdf')

In [28]:
# last file of the sorted list
lfiles[-1]

PosixPath('/media/nicolasf/END19101/ICU/data/CDS/operational/hindcasts/UKMO/TPRATE/ensemble_seas_forecasts_tprate_from_2016_08_UKMO.netcdf')

In [29]:
# number of hindcast files found for this initialisation month
len(lfiles)

24

### open the multiple files dataset, concatenating over the time dimension, and preprocessing 

In [30]:
# open all the matched files as a single dataset; every file goes through
# `C3S.preprocess_GCM` before being combined, and the files are opened in
# parallel with the netcdf4 backend
dset = xr.open_mfdataset(
    lfiles,
    engine='netcdf4',
    parallel=True,
    preprocess=C3S.preprocess_GCM,
)

In [31]:
# display the combined dataset summary
dset

Unnamed: 0,Array,Chunk
Bytes,125.62 MiB,5.23 MiB
Shape,"(24, 5, 28, 81, 121)","(1, 5, 28, 81, 121)"
Count,168 Tasks,24 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 125.62 MiB 5.23 MiB Shape (24, 5, 28, 81, 121) (1, 5, 28, 81, 121) Count 168 Tasks 24 Chunks Type float32 numpy.ndarray",5  24  121  81  28,

Unnamed: 0,Array,Chunk
Bytes,125.62 MiB,5.23 MiB
Shape,"(24, 5, 28, 81, 121)","(1, 5, 28, 81, 121)"
Count,168 Tasks,24 Chunks
Type,float32,numpy.ndarray


### make sure we select the hindcast period 

In [32]:
# restrict the initialisation times to the 1993 - 2016 hindcast period
# (label-based slice, both ends included)
dset = dset.sel({'time': slice('1993', '2016')})

### convert to monthly rainfall accumulations (mm/month)

In [33]:
# inspect the attributes (e.g. units) of the raw variable before conversion
dset.tprate.attrs

{'units': 'm s**-1', 'long_name': 'Mean total precipitation rate'}

In [34]:
# convert the precipitation rate held in `tprate` into accumulations stored
# in `precip`; `step` and `time` are passed as the lead and time coordinates
# NOTE(review): `dropvar=True` presumably removes the input variable from the
# returned dataset — confirm in C3S.convert_rainfall
dset = C3S.convert_rainfall(
    dset,
    varin='tprate',
    varout='precip',
    timevar='time',
    leadvar='step',
    dropvar=True,
)


unit is m s**-1, converting to mm/day

now converting to mm/month, converted precipitation will be held in var = precip


In [35]:
# display the dataset after conversion
# NOTE(review): `info` is referenced, not called, so what is shown is the
# bound-method repr (which embeds the Dataset's plain-text summary);
# presumably intentional — use `dset.info()` if the method's own output is wanted
dset.info

<bound method Dataset.info of <xarray.Dataset>
Dimensions:  (time: 24, lon: 121, lat: 81, member: 28, step: 5)
Coordinates:
  * time     (time) datetime64[ns] 1993-08-01 1994-08-01 ... 2016-08-01
  * lon      (lon) float32 120.0 121.0 122.0 123.0 ... 237.0 238.0 239.0 240.0
  * lat      (lat) float32 -50.0 -49.0 -48.0 -47.0 -46.0 ... 27.0 28.0 29.0 30.0
  * member   (member) int32 0 1 2 3 4 5 6 7 8 9 ... 19 20 21 22 23 24 25 26 27
  * step     (step) int64 1 2 3 4 5
Data variables:
    precip   (time, step, member, lat, lon) float64 dask.array<chunksize=(1, 5, 28, 81, 121), meta=np.ndarray>
Attributes:
    Conventions:  CF-1.6
    history:      2021-08-16 01:01:19 GMT by grib_to_netcdf-2.20.0: /opt/ecmw...>

In [36]:
# check the attributes of the converted variable
dset.precip.attrs

{'units': 'mm/month'}

### make sure there are no negative values 

In [37]:
# floor every value at zero so that no negative precipitation remains
dset = dset.clip(min=0)

### if the period is set to `seasonal`, calculates the seasonal accumulations 

In [38]:
if period == 'seasonal':

    # 3-step running total along the lead-time (`step`) dimension; with
    # min_periods=3 a window holding fewer than 3 values yields NaN.
    # The rolling dimension is already fixed by `.rolling(...)`: the first
    # positional parameter of `.sum()` is `keep_attrs`, so the former
    # `.sum('step')` was in effect a truthy `keep_attrs`; spell it out.
    dset = dset.rolling({'step': 3}, min_periods=3).sum(keep_attrs=True)

    # keep only the lead times from step 3 onwards, i.e. those for which a
    # complete 3-step window is available
    dset = dset.sel({'step': slice(3, None)})

In [39]:
# display the dataset after the (optional) seasonal accumulation
# NOTE(review): `info` is referenced, not called, so what is shown is the
# bound-method repr (which embeds the Dataset's plain-text summary);
# presumably intentional — use `dset.info()` if the method's own output is wanted
dset.info

<bound method Dataset.info of <xarray.Dataset>
Dimensions:  (time: 24, lon: 121, lat: 81, member: 28, step: 3)
Coordinates:
  * time     (time) datetime64[ns] 1993-08-01 1994-08-01 ... 2016-08-01
  * lon      (lon) float32 120.0 121.0 122.0 123.0 ... 237.0 238.0 239.0 240.0
  * lat      (lat) float32 -50.0 -49.0 -48.0 -47.0 -46.0 ... 27.0 28.0 29.0 30.0
  * member   (member) int32 0 1 2 3 4 5 6 7 8 9 ... 19 20 21 22 23 24 25 26 27
  * step     (step) int64 3 4 5
Data variables:
    precip   (time, step, member, lat, lon) float64 dask.array<chunksize=(1, 3, 28, 81, 121), meta=np.ndarray>
Attributes:
    Conventions:  CF-1.6
    history:      2021-08-16 01:01:19 GMT by grib_to_netcdf-2.20.0: /opt/ecmw...>

### rechunk, just make sure each chunk can fit in memory

In [40]:
# inspect the current dask chunking before rechunking
dset.chunks

Frozen({'time': (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), 'step': (3,), 'member': (28,), 'lat': (81,), 'lon': (121,)})

In [41]:
# target chunk sizes per dimension: -1 keeps a dimension in a single chunk
# (here `time` and `member`, which are later stacked and reduced), while the
# lead time is split step by step and the grid into 10 x 10 tiles
chunks = dict(time=-1, member=-1, step=1, lat=10, lon=10)

In [42]:
# apply the chunk sizes defined in `chunks` to the dataset
dset = dset.chunk(chunks)

In [43]:
# display the rechunked dataset summary
dset

Unnamed: 0,Array,Chunk
Bytes,150.75 MiB,525.00 kiB
Shape,"(24, 3, 28, 81, 121)","(24, 1, 28, 10, 10)"
Count,2038 Tasks,351 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 150.75 MiB 525.00 kiB Shape (24, 3, 28, 81, 121) (24, 1, 28, 10, 10) Count 2038 Tasks 351 Chunks Type float64 numpy.ndarray",3  24  121  81  28,

Unnamed: 0,Array,Chunk
Bytes,150.75 MiB,525.00 kiB
Shape,"(24, 3, 28, 81, 121)","(24, 1, 28, 10, 10)"
Count,2038 Tasks,351 Chunks
Type,float64,numpy.ndarray


### uncomment the following for visual inspection of the hindcasts 

In [44]:
# for t in range(24): 
#     for s in range(5): 
#         fg = dset.isel(time=t, step=s)['precip'].plot(vmax=1000., col='member', col_wrap=5)
#         fg.fig.savefig(f'./tmp/JMA_hindcast_t_{t}_s_{s}.png', dpi=200, bbox_inches='tight', facecolor='w')
#         plt.close(fg.fig)

### stack the time and members 

In [45]:
# merge the `time` and `member` dimensions into a single `instance`
# dimension, over which the quantiles are later calculated
clim = dset.stack({'instance': ('time', 'member')})

In [46]:
# remove the instances holding missing values (the number of members is
# sometimes inconsistent), then gather the whole `instance` dimension into
# one chunk
clim = clim.dropna(dim='instance').chunk({'instance': -1})

# size of the sample used for the climatological quantiles; it is stored in
# the output files for record-keeping
n_instances = clim.sizes['instance']

print(f"The {period} hindcast climatologies from month {initial_month} in {GCM} will be calculated using {n_instances} instances\n")

# next: loop over the quantiles dictionary, calculate the climatological
# quantiles, do some data munging, and save to disk


The seasonal hindcast climatologies from month 8 in UKMO will be calculated using 672 instances



In [47]:
# For each quantile family (tercile, decile, percentile): calculate the
# climatological quantiles over the stacked `instance` dimension, tag them
# with the initialisation month and the sample size, and save them to disk.
for k, quantiles in dict_quantiles.items():

    print(f"{k} climatology\n")

    # the output file is defined once, so the path written to, checked and
    # reported is guaranteed to be the same
    outfile = clim_path.joinpath(f"{GCM}_{period}_{k}_climatology_{initial_month:02d}.netcdf")

    clim_quantile = clim.quantile(quantiles, dim='instance')

    # adds a `month` dimension with coordinate `month`, for later concatenation
    clim_quantile = clim_quantile.expand_dims({'month': [initial_month]})

    # add the number of instances in the dataset for record-keeping
    clim_quantile['n_instances'] = ('month', [n_instances])

    # now compute (everything above only assembled the dask graph)
    with ProgressBar():

        clim_quantile = clim_quantile.compute()

    # save to disk
    clim_quantile.to_netcdf(outfile)

    if outfile.exists():

        print(f"Successfully saved {str(outfile)}\n")

        clim_quantile.close()

tercile climatology

[########################################] | 100% Completed |  4.8s
Successfully saved /media/nicolasf/END19101/ICU/data/CDS/operational/hindcasts/CLIMATOLOGY/UKMO/UKMO_seasonal_tercile_climatology_08.netcdf

decile climatology

[########################################] | 100% Completed |  4.9s
Successfully saved /media/nicolasf/END19101/ICU/data/CDS/operational/hindcasts/CLIMATOLOGY/UKMO/UKMO_seasonal_decile_climatology_08.netcdf

percentile climatology

[########################################] | 100% Completed |  5.2s
Successfully saved /media/nicolasf/END19101/ICU/data/CDS/operational/hindcasts/CLIMATOLOGY/UKMO/UKMO_seasonal_percentile_climatology_08.netcdf

