In [1]:
import os
import numpy as np
import xarray as xr
import pandas as pd

from pyesgf.search import SearchConnection
os.environ["ESGF_PYCLIENT_NO_FACETS_STAR_WARNING"] = "on"
import xesmf as xe
from geopy.geocoders import Nominatim
from dask.diagnostics import ProgressBar
from xclim import ensembles
from xclim.ensembles import create_ensemble
import xclim.indices as xci

from utils import *

# Working directory for all notebook outputs (per-city ensemble folders are
# created beneath it). Override on another machine by setting the CCIA_WD
# environment variable; the default is the original author's local path.
wd = os.environ.get('CCIA_WD', '/Users/malavirdee/Documents/ccia/')



## 1. Data search/download via ESGF

In [5]:
### specify data search parameters
# These values are passed as facets to the ESGF search context in the next cell.

# project: 'CMIP5', 'CMIP6'
# models: see https://pcmdi.llnl.gov/CMIP6/ArchiveStatistics/esgf_data_holdings/
# variable_id: see https://pcmdi.llnl.gov/mips/cmip3/variableList.html
# table_id: 'day', 'Amon', ... (realm (A=Atmosphere, O=Ocean) + time frequency)
# experiment_id: historical, or future forcing scenarios - see ScenarioMIP experiments available per model https://pcmdi.llnl.gov/CMIP6/ArchiveStatistics/esgf_data_holdings/ScenarioMIP/index.html
# member_id: ensemble member e.g. 'r1i1p1f1' where:
## r = initial conditions run, i = initialization parameters setting, p = perturbed physics setting, f = forcing dataset applied

project='CMIP6'
# comma-separated list of model names (used as the source_id facet)
models = 'IPSL-CM6A-LR,GFDL-ESM4,CanESM5'#,INM-CM5-0,MIROC6'
# comma-separated list of variables: daily mean, maximum and minimum near-surface air temperature
variable_id = 'tas, tasmax, tasmin'
table_id = 'day'
experiment_id='historical'
member_id='r1i1p1f1'

In [3]:
# Search ESGF for datasets matching the parameters defined above.
# distrib=True searches across the federation; data_node restricts the hits
# to copies served by the CEDA node.
conn = SearchConnection('https://esgf-data.dkrz.de/esg-search', distrib=True)
query = conn.new_context(
    latest=True,
    project=project,  # use the configured project instead of a hard-coded literal
    source_id=models,
    experiment_id=experiment_id,
    variable_id=variable_id,
    table_id=table_id,
    member_id=member_id,
    data_node='esgf.ceda.ac.uk')

print("Number of search results:", query.hit_count)

# Collect one record per file of every matching dataset.
file_columns = ['model', 'dataset_id', 'filename', 'url']
records = []
for result in query.search():
    print("Retrieving search results: ", result.dataset_id)
    for f in result.file_context().search():
        records.append({
            # source_id is a list in the result JSON; read its last entry
            # without mutating the result (the original used .pop())
            'model': f.json['source_id'][-1],
            'dataset_id': result.dataset_id,
            'filename': f.filename,
            'url': f.opendap_url,
        })

# Explicit columns keep the frame well-formed (and the groupby below valid)
# even when the search returns no files.
files = pd.DataFrame(records, columns=file_columns)
# drop_duplicates returns a new frame; the result must be assigned to take effect
files = files.drop_duplicates('filename').reset_index(drop=True)

# group by model
# url and filename is now a list of urls and filenames for each model, for multiple files split by time and potentially for multiple variables
grouped_files = files.groupby('model', as_index=False).agg(list)
grouped_files

Number of search results: 9
Retrieving search results:  CMIP6.CMIP.CCCma.CanESM5.historical.r1i1p1f1.day.tas.gn.v20190429|esgf.ceda.ac.uk
Retrieving search results:  CMIP6.CMIP.CCCma.CanESM5.historical.r1i1p1f1.day.tasmax.gn.v20190429|esgf.ceda.ac.uk
Retrieving search results:  CMIP6.CMIP.CCCma.CanESM5.historical.r1i1p1f1.day.tasmin.gn.v20190429|esgf.ceda.ac.uk
Retrieving search results:  CMIP6.CMIP.IPSL.IPSL-CM6A-LR.historical.r1i1p1f1.day.tas.gr.v20190614|esgf.ceda.ac.uk
Retrieving search results:  CMIP6.CMIP.IPSL.IPSL-CM6A-LR.historical.r1i1p1f1.day.tasmax.gr.v20190614|esgf.ceda.ac.uk
Retrieving search results:  CMIP6.CMIP.IPSL.IPSL-CM6A-LR.historical.r1i1p1f1.day.tasmin.gr.v20180803|esgf.ceda.ac.uk
Retrieving search results:  CMIP6.CMIP.NOAA-GFDL.GFDL-ESM4.historical.r1i1p1f1.day.tas.gr1.v20190726|esgf.ceda.ac.uk
Retrieving search results:  CMIP6.CMIP.NOAA-GFDL.GFDL-ESM4.historical.r1i1p1f1.day.tasmax.gr1.v20190726|esgf.ceda.ac.uk
Retrieving search results:  CMIP6.CMIP.NOAA-GFDL.GF

Unnamed: 0,model,dataset_id,filename,url
0,CanESM5,[CMIP6.CMIP.CCCma.CanESM5.historical.r1i1p1f1....,[tas_day_CanESM5_historical_r1i1p1f1_gn_185001...,[https://esgf.ceda.ac.uk/thredds/dodsC/esg_cmi...
1,GFDL-ESM4,[CMIP6.CMIP.NOAA-GFDL.GFDL-ESM4.historical.r1i...,[tas_day_GFDL-ESM4_historical_r1i1p1f1_gr1_185...,[https://esgf.ceda.ac.uk/thredds/dodsC/esg_cmi...
2,IPSL-CM6A-LR,[CMIP6.CMIP.IPSL.IPSL-CM6A-LR.historical.r1i1p...,[tas_day_IPSL-CM6A-LR_historical_r1i1p1f1_gr_1...,[https://esgf.ceda.ac.uk/thredds/dodsC/esg_cmi...


In [4]:
# Open each model's list of file URLs as a single xarray multi-file dataset,
# chunked along time in blocks of 120 steps.
# note: takes a few minutes per model

data = {}
for model, urls in zip(grouped_files['model'], grouped_files['url']):
    print("Loading data into xarray multi-file dataset: ", model)
    data[model] = xr.open_mfdataset(urls, chunks={'time': 120})

Loading data into xarray multi-file dataset:  CanESM5
Loading data into xarray multi-file dataset:  GFDL-ESM4
Loading data into xarray multi-file dataset:  IPSL-CM6A-LR


## 2. Spatial regridding
#### Interpolate data onto a common spatial grid (necessary to align coordinates for combination into xclim ensemble later) - see xesmf docs for options

In [5]:
# Target 1x1 degree rectilinear grid.
# np.arange excludes the stop value, so lat runs -90..89 and lon runs 0..359.
target_lat = np.arange(-90, 90, 1.0)
target_lon = np.arange(0, 360, 1.0)
rg = xr.Dataset({
    "lat": (["lat"], target_lat),
    "lon": (["lon"], target_lon),
})

# Build one bilinear regridder per model (periodic=True: longitudes wrap around).
regridders = {}
for model in data:
    regridders[model] = xe.Regridder(data[model], rg, 'bilinear', periodic=True)
    print(model, regridders[model], '\n')

# Apply each model's regridder to its dataset, keeping the original attributes.
data_rg = {
    model: regridders[model](dataset, keep_attrs=True)
    for model, dataset in data.items()
}

CanESM5 xESMF Regridder 
Regridding algorithm:       bilinear 
Weight filename:            bilinear_64x128_180x360_peri.nc 
Reuse pre-computed weights? False 
Input grid shape:           (64, 128) 
Output grid shape:          (180, 360) 
Periodic in longitude?      True 

GFDL-ESM4 xESMF Regridder 
Regridding algorithm:       bilinear 
Weight filename:            bilinear_180x288_180x360_peri.nc 
Reuse pre-computed weights? False 
Input grid shape:           (180, 288) 
Output grid shape:          (180, 360) 
Periodic in longitude?      True 

IPSL-CM6A-LR xESMF Regridder 
Regridding algorithm:       bilinear 
Weight filename:            bilinear_143x144_180x360_peri.nc 
Reuse pre-computed weights? False 
Input grid shape:           (143, 144) 
Output grid shape:          (180, 360) 
Periodic in longitude?      True 



## 3. Subselect location
#### Select data for 1 city and save xarray files

In [3]:
# Look up the coordinates of the city of interest.

city = 'Lagos'
latitude, longitude = get_coords(city)

# Output folder is named after the city with spaces removed
# (table_id / member_id could be appended to the name if needed).
ensemble_name = city.replace(" ", "")
ensemble_path = os.path.join(wd, ensemble_name)

Location, (lat, lon):  Lagos, Lagos Island, Lagos, 100242, Nigeria (6.4550575, 3.3941795)


In [7]:
# select time series for city from each model and save multi-file datasets, splitting files by year
# note: saving takes a long time even for single location selection

os.chdir(wd)
print("Current directory: %s" % (os.getcwd()))
if not os.path.isdir(ensemble_path):
    print("Creating subdirectory", ensemble_path)
    os.makedirs(ensemble_path, exist_ok=True)
print("Saving files to", ensemble_path)

# The regridding target grid uses longitudes in [0, 360). Wrap the city
# longitude into that range so that negative (western) longitudes select the
# correct grid column; values already in [0, 360) are unchanged.
# NOTE(review): assumes get_coords returns numeric degrees — confirm in utils.
longitude_360 = longitude % 360

for model, dataset in data_rg.items():
    print("Saving ", model, "for city ", city)
    # nearest grid point to the city
    dataset_sel = dataset.sel(lat=latitude, lon=longitude_360, method='nearest')
    identifier = '_'.join([model, city])
    # one dataset (and one output file) per calendar year
    years, y_datasets = zip(*dataset_sel.groupby("time.year"))
    fns = [identifier + f'_{y}.nc' for y in years]
    paths = [os.path.join(ensemble_path, fn) for fn in fns]
    with ProgressBar():
        xr.save_mfdataset(y_datasets[-2:], paths[-2:], mode="w") # !!! subselecting 2 years of data for demo

Current directory: /Users/malavirdee/Documents/ccia
Saving files to /Users/malavirdee/Documents/ccia/Lagos
Saving  CanESM5 for city  Lagos
[########################################] | 100% Completed | 27.08 s
[########################################] | 100% Completed | 22.99 s
Saving  GFDL-ESM4 for city  Lagos
[########################################] | 100% Completed | 118.36 s
[########################################] | 100% Completed | 96.63 s
Saving  IPSL-CM6A-LR for city  Lagos
[########################################] | 100% Completed | 47.44 s
[########################################] | 100% Completed | 38.94 s


## 4. Load city dataset into xclim ensemble

In [6]:
# Work from the project directory, then read back the per-city files saved above.
os.chdir(wd)
# load_mf_dataset comes from the star-import of utils; `models` is the
# comma-separated model string defined in the search parameters cell.
# NOTE(review): presumably returns one dataset per model — confirm in utils.
city_data = load_mf_dataset(ensemble_path, models)

IPSL-CM6A-LR 2
GFDL-ESM4 2
CanESM5 2


In [7]:
# Combine the per-model city datasets into a single ensemble object.
# multimodel_ensemble comes from utils; later cells index the result by a
# `realization` dimension (one entry per model).
ens = multimodel_ensemble(city_data)
ens

## 5. Bias correction
#### Bias correction methods: mean shift, quantile mapping, ...
#### tba

## 6. Model evaluation
#### Model evaluation metrics: rmse, ...
#### tba

## 7. Aggregate models
#### Multi-model aggregation methods: unweighted average, skill-weighted average

In [8]:
## unweighted average
# xclim helper called without weights, so every ensemble member counts equally.
# NOTE(review): per its name, it returns mean/std/max/min across members — see xclim docs.

ens_stats = ensembles.ensemble_mean_std_max_min(ens)

In [9]:
## weighted average

# for now randomly generate some weights (these should come from model evaluation step)
# A fixed seed keeps the placeholder weights reproducible across kernel restarts.
rng = np.random.default_rng(42)
u = rng.random(len(ens.realization))
w = (u / u.sum()).tolist()  # normalise so the weights sum to 1
print(w)

# The weights must be labelled with the `realization` dimension. An unnamed
# dimension would broadcast against the ensemble rather than weight its members.
weights = xr.DataArray(
    w,
    dims=("realization",),
    coords={"realization": ens.realization.values},
)
weighted_ens_stats = ensembles.ensemble_mean_std_max_min(ens, weights=weights)

[0.09952794201975601, 0.4832734833866716, 0.41719857459357235]


## 8. Calculate extreme indices
#### See https://xclim.readthedocs.io/en/stable/indices.html for definitions of the indices used. Note: all index parameters (thresholds, windows, sampling frequency) are currently left at their xclim default values and may need to be updated.

In [10]:
# Indices for each ensemble member separately.
model_indices = extreme_temperature_indices(ens, aggregate=False)

# Indices for the unweighted ensemble statistics, labelled as an extra
# `realization` entry so they can be concatenated with the per-model results.
unweighted_average_indices = (
    extreme_temperature_indices(ens_stats, aggregate=True)
    .assign_coords(realization='unweighted_average')
    .expand_dims(realization=1)
)

In [11]:
# Average every index over time, then stack the per-model rows and the
# unweighted-average row along `realization`.
per_model_mean = model_indices.mean('time')
average_mean = unweighted_average_indices.mean('time')
indices = xr.concat([per_model_mean, average_mean], dim='realization')

# Drop the location/height coordinates so the table shows only the indices.
indices = indices.drop_vars(['height', 'lat', 'lon'])
indices.to_dataframe()

Unnamed: 0_level_0,dtr,dtrv,etr,hwf,hwi,hwtl,hsf,hwml
realization,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.475014,0.505568,9.819595,10.0,358.0,100.0,10.0,25.5
1,2.834759,0.49738,9.773773,5.5,363.0,76.5,5.5,45.0
2,4.531099,0.741739,14.980789,10.5,365.0,128.0,9.5,45.5
unweighted_average,3.613624,0.351379,9.685287,2.5,365.0,95.5,2.5,82.5
