## Calculate the probabilities for the tercile, quartile, decile and percentile (0.02, then 0.05 to 0.95) categories for a 'realtime' forecast, with respect to the lead-time dependent monthly and seasonal hindcast climatologies, for each of the 8 C3S GCMs (ECMWF, UKMO, METEO-FRANCE, DWD, CMCC, NCEP, JMA and ECCC)

This notebook:   
    
1) reads the latest forecasts from the C3S MME [ECMWF, UKMO, METEO-FRANCE, DWD, CMCC, NCEP, JMA and ECCC]  
2) preprocesses them and converts them to monthly / seasonal rainfall accumulations   
3) reads the lead-time dependent tercile, quartile, decile and percentile (0.02, then 0.05 to 0.95) climatologies corresponding to the initial month of the forecast  
4) calculates the probabilities for each quantile category as the proportion of the GCM's ensemble members falling in that category  
5) saves these probabilities to disk for later use and mapping   

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
### os 
import os 
import sys

### datetimes 
from datetime import datetime, timedelta, timezone
from dateutil.relativedelta import relativedelta
from calendar import month_name

### scipy 
import numpy as np 
import pandas as pd
import xarray as xr

In [3]:
import pathlib
# home directory of the user running the notebook
HOME = pathlib.Path.home()
# directory the notebook is executed from
CWD = pathlib.Path.cwd() 

### import local functions for the processing of the C3S forecasts 

In [4]:
# make the directory two levels up importable, so that the local
# `ICU_Water_Watch` package can be imported below
# NOTE(review): the relative entry presumably resolves against the current working directory — confirm the notebook is always run from its own folder
sys.path.append('../..')

In [5]:
from ICU_Water_Watch import C3S, domains

### parameters for papermill 

In [6]:
# papermill parameters: the values below are defaults for this cell
provider = 'CDS' # should not change
# name of the GCM to process, also used in the input folder and file names
GCM = 'ECMWF'
# list_GCMs = ['ECMWF','UKMO','METEO_FRANCE','CMCC','DWD', 'NCEP', 'JMA', 'ECCC_CanCM4i', 'ECCC_GEM5_NEMO']
varname = 'tprate' # can be in ['tprate', 't2m']
period = 'monthly' # valid values: 'seasonal' or 'monthly'
# if one wants to process older forecasts: number of months subtracted
# from the current date (0 = forecasts issued this month)
lag = 0
# path where to find the hindcasts and forecasts 
# NOTE(review): both paths below are absolute and machine-specific
gcm_path = f'/media/nicolasf/END19101/ICU/data/{provider}/operational'
outputs_path = '/home/nicolasf/operational/ICU/development/hotspots/code/ICU_Water_Watch/outputs/C3S'

In [8]:
# the parameters arrive as plain strings: wrap both into Path objects
gcm_path, outputs_path = (pathlib.Path(p) for p in (gcm_path, outputs_path))

### create the outputs path if it doesn't exist 

In [9]:
# Create the output directory (and any missing parent) in one call.
# `exist_ok=True` makes this a no-op when the directory is already there and
# avoids the race between a separate existence check and the creation.
outputs_path.mkdir(parents=True, exist_ok=True)

### domain for extraction 

In [10]:
# extraction domain stored under the 'C3S_download' key of the project's
# `domains` module; it is applied to both the climatologies and the forecasts
domain = domains.domains['C3S_download']

### get today's date 

In [11]:
# Current date and time in UTC, as a timezone-aware datetime.
# `datetime.utcnow()` is deprecated (Python 3.12+) and returns a naive object;
# only the year and month of `date` are used downstream, so results are unchanged.
date = datetime.now(timezone.utc)

### apply lag 

In [12]:
# shift the reference date back by `lag` months (0 leaves it unchanged)
# NOTE: `date` is overwritten in place, so re-running this cell on its own
# subtracts the lag a second time
date = date - relativedelta(months=lag)

In [13]:
# report the month and year of the forecasts that will be processed
print(f"will process forecasts issued in {date:%B %Y}")

will process forecasts issued in May 2022


### path to the GCMs **hindcast datasets** and **climatologies** 

In [14]:
# sub-folder of the GCM root used for the hindcasts and their climatologies
hindcasts_path = gcm_path / 'hindcasts'

### path to where the **realtime forecasts** have been downloaded 

In [15]:
# sub-folder of the GCM root used for the downloaded realtime forecasts
forecasts_path = gcm_path / 'forecasts'

### get year and month 

In [16]:
# initial year and month of the forecast, taken from the (lagged) date
year = date.year
month = date.month

### calculates and saves the quantiles probabilities 

#### read the climatologies 

In [17]:
# climatologies are stored per GCM and per (upper-cased) variable name
clim_path = hindcasts_path / f'CLIMATOLOGY/{GCM}/{varname.upper()}'

In [18]:
# Open the four quantile climatologies for the initial month of the forecast.
# The file names only differ by the quantile label, so a single templated
# expression replaces the four copy-pasted calls; the resulting names and
# paths are the same as before.
tercile_climatology, quartile_climatology, decile_climatology, percentile_climatology = (
    xr.open_dataset(
        clim_path.joinpath(f"{GCM}_{varname}_{period}_{quantile_name}_climatology_{str(month).zfill(2)}.netcdf"),
        engine='netcdf4',
    )
    for quantile_name in ('tercile', 'quartile', 'decile', 'percentile')
)

#### make sure we have the same domains

In [19]:
# restrict every climatology to the extraction domain, in the same order as
# they were opened
tercile_climatology, quartile_climatology, decile_climatology, percentile_climatology = (
    domains.extract_domain(climatology, domain)
    for climatology in (
        tercile_climatology,
        quartile_climatology,
        decile_climatology,
        percentile_climatology,
    )
)

#### Now read the forecasts 

In [20]:
# progress message: a 50-dash separator, then the variable, the initial
# year-month (zero-padded) and the GCM about to be read
print(f"{50*'-'}\nReading {varname} forecasts issued {year}-{str(month).zfill(2)} for GCM {GCM}")

--------------------------------------------------
Reading tprate forecasts issued 2022-05 for GCM ECCC_GEM_NEMO


In [21]:
# The forecast file name ends with the GCM name, except for the ECCC models,
# where only the part before the first underscore (i.e. 'ECCC') is used while
# the folder keeps the full GCM name. Only that suffix differs between the two
# cases, so it is computed once instead of duplicating the whole call.
gcm_file_tag = GCM.split('_')[0] if 'ECCC' in GCM else GCM

x = xr.open_dataset(forecasts_path.joinpath(f"{GCM}/{varname.upper()}/ensemble_seas_forecasts_{varname}_from_{year}_{str(month).zfill(2)}_{gcm_file_tag}.netcdf"), engine='netcdf4')


In [22]:
# display the raw forecast dataset for a quick visual check
x

#### preprocess (harmonize the variable names, sort the latitudes, etc )

In [23]:
# `x` is overwritten with the preprocessed dataset
# NOTE(review): the helper is defined in the project's C3S module and is not visible here — see its docstring for the exact harmonization applied
x = C3S.preprocess_GCM(x)

#### convert from mm/day to mm/month

In [24]:
# by default the variable keeps its name in the dataset
varout = varname

if varname == 'tprate': 

    # precipitation rate: convert to monthly accumulations, stored under 'precip'
    varin, varout = 'tprate', 'precip'

    x = C3S.convert_rainfall(x, varin=varin, varout=varout, leadvar='step', timevar='time', dropvar=True)

    # accumulations cannot be negative: floor the values at zero
    x = x.clip(min=0)



unit is m s**-1, converting to mm/day

now converting to mm/month, converted precipitation will be held in var = precip


#### just in case, remove potential missing fields (members)

In [25]:
# drop entries along the 'member' dimension that hold missing values
# NOTE(review): with xarray's default `how='any'`, a member is presumably removed as soon as it contains a single NaN — confirm this is intended
x = x.dropna(dim='member')

#### make sure we have the same domain for the climatologies and the latest forecasts

In [26]:
# same extraction domain as the one applied to the climatologies above
x = domains.extract_domain(x, domain)

#### calculates the seasonal values if period == seasonal

In [27]:
if period == 'seasonal': 

    print("Calculating the seasonal (3 months) accumulations or averages")

    # trailing 3-step window along the lead time; all 3 steps must be present
    rolling_3_steps = x.rolling({'step':3}, min_periods=3, center=False)

    # rainfall is accumulated over the window, any other variable is averaged
    x = rolling_3_steps.sum('step') if varname == 'tprate' else rolling_3_steps.mean('step')

    # keep the steps labelled 3 and above: the windows ending before that are
    # incomplete and therefore contain missing values

    x = x.sel(step=slice(3, None))

Calculating the seasonal (3 months) accumulations or averages


#### checks that the initial month corresponds indeed to what we defined earlier

In [28]:
# Sanity check: the initial date(s) stored in the forecast file must match the
# year and month derived from the (lagged) current date.
# The comparison is done on flattened numpy arrays so that it works whether
# `time` is a scalar coordinate or a dimension holding one or several dates
# (using `or` directly on DataArrays raises as soon as there is more than one).
init_years = np.ravel(x['time'].dt.year.values)
init_months = np.ravel(x['time'].dt.month.values)

if (init_years != year).any() or (init_months != month).any(): 

    # unique initial dates found in the file, formatted as YYYY-MM
    found_dates = sorted({f"{y}-{m:02d}" for y, m in zip(init_years.tolist(), init_months.tolist())})

    print(f"issue with the initial date in the latest forecast, expected {year}-{month:02d}, got {', '.join(found_dates)}")

##### Now calculates the tercile category for each member

In [29]:
# category of each forecast value with respect to the tercile climatology;
# `varout` is the variable to classify and 'quantile' names the dimension of
# the climatology holding the bounds
# NOTE(review): the climatology is squeezed first, presumably to drop length-1 dimensions — confirm against the helper
terciles_category = C3S.get_GCM_category_digitize(x, tercile_climatology.squeeze(), varname=varout, dim='quantile')

##### and calculate the proportion of members in each category

In [30]:
# turn the per-member categories into probabilities for the 3 tercile categories
terciles_category_percent = C3S.calculate_quantiles_probabilities(terciles_category, ncategories=3)

##### creates a dummy 'GCM' dimension

In [31]:
# add a leading, length-1 'GCM' dimension labelled with the GCM name
terciles_category_percent = terciles_category_percent.expand_dims(dim={'GCM':[GCM]}, axis=0) 

##### includes the quantile values (i.e. the 'bounds' for the quantile categories) in the dataset

In [32]:
# keep the values of the climatology's 'quantile' coordinate as a global
# attribute of the probabilities dataset
terciles_category_percent.attrs['pct_values'] = tercile_climatology['quantile'].data

##### do the same for quartiles, deciles, percentiles probabilities 

In [33]:
# quartiles: categorize each member, derive the probabilities of the 4
# categories, add the leading 'GCM' dimension, then attach the quantile values
quartiles_category = C3S.get_GCM_category_digitize(
    x,
    quartile_climatology.squeeze(),
    varname=varout,
    dim='quantile',
)
quartiles_category_percent = (
    C3S.calculate_quantiles_probabilities(quartiles_category, ncategories=4)
    .expand_dims(dim={'GCM':[GCM]}, axis=0)
)
quartiles_category_percent.attrs['pct_values'] = quartile_climatology['quantile'].data

In [34]:
# deciles: categorize each member, derive the probabilities of the 10
# categories, add the leading 'GCM' dimension, then attach the quantile values
deciles_category = C3S.get_GCM_category_digitize(
    x,
    decile_climatology.squeeze(),
    varname=varout,
    dim='quantile',
)
deciles_category_percent = (
    C3S.calculate_quantiles_probabilities(deciles_category, ncategories=10)
    .expand_dims(dim={'GCM':[GCM]}, axis=0)
)
deciles_category_percent.attrs['pct_values'] = decile_climatology['quantile'].data

In [35]:
# percentiles: categorize each member, derive the probabilities of the 21
# categories, add the leading 'GCM' dimension, then attach the quantile values
percentiles_category = C3S.get_GCM_category_digitize(
    x,
    percentile_climatology.squeeze(),
    varname=varout,
    dim='quantile',
)
percentiles_category_percent = (
    C3S.calculate_quantiles_probabilities(percentiles_category, ncategories=21)
    .expand_dims(dim={'GCM':[GCM]}, axis=0)
)
percentiles_category_percent.attrs['pct_values'] = percentile_climatology['quantile'].data

### saves to disk 

In [36]:
# tell the user which folder the probabilities files are written to
print(f"saving the quantile probabilities in the folder {str(outputs_path)}")

saving the quantile probabilities in the folder /home/nicolasf/operational/ICU/development/hotspots/code/ICU_Water_Watch/outputs/C3S


In [37]:
# metadata attached to the latitude / longitude coordinates before saving
dict_lat = {"units": "degrees_north", "long_name": "Latitude"}
dict_lon = {"units": "degrees_east", "long_name": "Longitude"}

In [38]:
# attach the units / long_name metadata to both horizontal coordinates
for coord_name, coord_attrs in (('lat', dict_lat), ('lon', dict_lon)):
    terciles_category_percent[coord_name].attrs.update(coord_attrs)

In [39]:
# attach the units / long_name metadata to both horizontal coordinates
for coord_name, coord_attrs in (('lat', dict_lat), ('lon', dict_lon)):
    quartiles_category_percent[coord_name].attrs.update(coord_attrs)

In [40]:
# attach the units / long_name metadata to both horizontal coordinates
for coord_name, coord_attrs in (('lat', dict_lat), ('lon', dict_lon)):
    deciles_category_percent[coord_name].attrs.update(coord_attrs)

In [41]:
# attach the units / long_name metadata to both horizontal coordinates
for coord_name, coord_attrs in (('lat', dict_lat), ('lon', dict_lon)):
    percentiles_category_percent[coord_name].attrs.update(coord_attrs)

In [42]:
# Write one netcdf file per quantile type. The file names only differ by the
# quantile label, so the four copy-pasted calls are replaced by a single loop
# producing the same files, in the same order.
for quantile_name, probabilities in (
    ('terciles', terciles_category_percent),
    ('quartiles', quartiles_category_percent),
    ('deciles', deciles_category_percent),
    ('percentiles', percentiles_category_percent),
):

    probabilities.to_netcdf(outputs_path.joinpath(f"{period}_{varname}_{quantile_name}_probabilities_from_{date:%Y-%m}_{GCM}.netcdf")) 

print(f"\n{GCM} {varname} {period} forecasts from {year}-{str(month).zfill(2)} processed and saved in {str(outputs_path)}...\n")



ECCC_GEM_NEMO tprate seasonal forecasts from 2022-05 processed and saved in /home/nicolasf/operational/ICU/development/hotspots/code/ICU_Water_Watch/outputs/C3S...

