# Upper ocean oxygen versus temperature

Compute global mean temperature and oxygen concentration for the upper 1000 m.

Plot O2 anomalies versus temperature anomalies.

In [1]:
%matplotlib inline

from itertools import product

import numpy as np

import pandas as pd
import xarray as xr
import intake

# util.py is in the local directory
# it contains code that is common across project notebooks
# or routines that are too extensive and might otherwise clutter
# the notebook design
import util 



## Spin-up a dask cluster
Syntax is different if on an NCAR machine versus the cloud.

In [2]:
if util.is_ncar_host():
    from ncar_jobqueue import NCARCluster
    cluster = NCARCluster(project='UCGD0006')
    cluster.adapt(minimum_jobs=1, maximum_jobs=10)
else:
    from dask_kubernetes import KubeCluster
    cluster = KubeCluster()
    cluster.adapt(minimum=1, maximum=10)
cluster

VBox(children=(HTML(value='<h2>NCARCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .…

In [3]:
from dask.distributed import Client
client = Client(cluster) # Connect this local process to remote workers
client

0,1
Client  Scheduler: tcp://128.117.181.209:44728  Dashboard: https://jupyterhub.ucar.edu/dav/user/mclong/proxy/8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


## Open `intake-esm` collection

In [4]:
if util.is_ncar_host():
    col = intake.open_esm_datastore("../catalogs/glade-cmip6.json")
else:
    col = intake.open_esm_datastore("../catalogs/pangeo-cmip6.json")
col

glade-cmip6-ESM Collection with 687919 entries:
	> 12 activity_id(s)

	> 24 institution_id(s)

	> 47 source_id(s)

	> 66 experiment_id(s)

	> 162 member_id(s)

	> 35 table_id(s)

	> 1027 variable_id(s)

	> 12 grid_label(s)

	> 59 dcpp_init_year(s)

	> 246 version(s)

	> 6667 time_range(s)

	> 687919 path(s)

`intake-esm` is build on top of [pandas](https://pandas.pydata.org/pandas-docs/stable). It is possible to view the `pandas.DataFrame` as follows.

In [5]:
col.df.head()

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,dcpp_init_year,version,time_range,path
0,AerChemMIP,BCC,BCC-ESM1,ssp370,r2i1p1f1,day,pr,gn,,v20190702,20150101-20551231,/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
1,AerChemMIP,BCC,BCC-ESM1,ssp370,r2i1p1f1,Amon,hfls,gn,,v20190624,201501-205512,/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
2,AerChemMIP,BCC,BCC-ESM1,ssp370,r2i1p1f1,Amon,prsn,gn,,v20190624,201501-205512,/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
3,AerChemMIP,BCC,BCC-ESM1,ssp370,r2i1p1f1,Amon,va,gn,,v20190624,201501-205512,/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
4,AerChemMIP,BCC,BCC-ESM1,ssp370,r2i1p1f1,Amon,tas,gn,,v20190624,201501-205512,/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...


### Finding unique entries
Query the data to see what models ("source_id"), experiments ("experiment_id") and temporal frequencies ("table_id") are available.

In [6]:
import pprint 
uni_dict = col.unique(['source_id', 'experiment_id', 'table_id'])
pprint.pprint(uni_dict, compact=True)

{'experiment_id': {'count': 66,
                   'values': ['ssp370', 'histSST-piNTCF', 'histSST',
                              'histSST-1950HC', 'hist-1950HC', 'hist-piNTCF',
                              'piClim-NTCF', 'ssp370SST-lowNTCF',
                              'ssp370-lowNTCF', 'ssp370SST', 'hist-bgc',
                              'esm-ssp585', 'amip-future4K', 'amip-m4K',
                              'a4SST', 'aqua-p4K', 'piSST', 'amip-4xCO2',
                              'a4SSTice', 'amip-p4K', 'aqua-control',
                              'aqua-4xCO2', 'abrupt-4xCO2', 'historical',
                              'piControl', 'amip', '1pctCO2', 'esm-piControl',
                              'esm-hist', 'ssp245', 'ssp585', 'ssp126',
                              'hist-GHG', 'hist-aer', 'dcppA-hindcast',
                              'dcppC-hindcast-noPinatubo',
                              'dcppC-hindcast-noElChichon', 'dcppA-assim',
                              'dcp

### Searching for specific datasets

Find the models with **both** dissolved oxygen and temperature data at monthly or annual frequency for both the `historical` and `ssp585` experiments.

In [7]:
experiments = ['historical', 'ssp585']


def get_models(table_id, variable_id):
    # all models
    models = set(uni_dict['source_id']['values'])

    for experiment_id in experiments:
        query = dict(experiment_id=experiment_id, variable_id=variable_id, 
                     table_id=table_id, grid_label='gn')  
        cat = col.search(**query)
        models = models.intersection({model for model in cat.df.source_id.unique().tolist()})

    # ensure the CESM2 models are not included (oxygen was erroneously submitted to the archive)
    return models - {'CESM2-WACCM', 'CESM2'}
    


models = {}    

# look for models with O2
# prioritize getting data from annual table, if present
models['Oyr.o2'] = get_models('Oyr', 'o2')
models['Omon.o2'] = get_models('Omon', 'o2') - models['Oyr.o2']
have_oxygen = models['Oyr.o2'].union(models['Omon.o2'])

# find models with temperature that also have O2
models['Omon.theta'] = get_models('Omon', 'thetao').intersection(have_oxygen)

models_all = list(models['Omon.theta'])
models

{'Oyr.o2': {'CanESM5', 'IPSL-CM6A-LR', 'MIROC-ES2L'},
 'Omon.o2': {'CNRM-ESM2-1', 'UKESM1-0-LL'},
 'Omon.theta': {'CNRM-ESM2-1', 'CanESM5', 'IPSL-CM6A-LR', 'UKESM1-0-LL'}}

In [8]:
df = pd.DataFrame()
for key, val in models.items():
    model_list = list(val)
    table_id = key.split('.')[0]
    variable_id = key.split('.')[1]
    
    cat = col.search(experiment_id=experiments, table_id=table_id, 
                     variable_id=variable_id, 
                     source_id=model_list, grid_label='gn')
    df = pd.concat((df, cat.df))
    
cat.df = df.copy()
cat.df

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,dcpp_init_year,version,time_range,path
294681,CMIP,CCCma,CanESM5,historical,r2i1p1f1,Oyr,o2,gn,,v20190429,1850-2014,/glade/collections/cmip/CMIP6/CMIP/CCCma/CanES...
295218,CMIP,CCCma,CanESM5,historical,r5i1p1f1,Oyr,o2,gn,,v20190429,1850-2014,/glade/collections/cmip/CMIP6/CMIP/CCCma/CanES...
295760,CMIP,CCCma,CanESM5,historical,r12i1p1f1,Oyr,o2,gn,,v20190429,1850-2014,/glade/collections/cmip/CMIP6/CMIP/CCCma/CanES...
296313,CMIP,CCCma,CanESM5,historical,r1i1p2f1,Oyr,o2,gn,,v20190429,1850-2014,/glade/collections/cmip/CMIP6/CMIP/CCCma/CanES...
297506,CMIP,CCCma,CanESM5,historical,r14i1p1f1,Oyr,o2,gn,,v20190429,1850-2014,/glade/collections/cmip/CMIP6/CMIP/CCCma/CanES...
...,...,...,...,...,...,...,...,...,...,...,...,...
629698,ScenarioMIP,MOHC,UKESM1-0-LL,ssp585,r1i1p1f2,Omon,o2,gn,,v20190726,205001-209912,/glade/collections/cmip/CMIP6/ScenarioMIP/MOHC...
629699,ScenarioMIP,MOHC,UKESM1-0-LL,ssp585,r1i1p1f2,Omon,o2,gn,,v20190726,210001-210012,/glade/collections/cmip/CMIP6/ScenarioMIP/MOHC...
629789,ScenarioMIP,MOHC,UKESM1-0-LL,ssp585,r2i1p1f2,Omon,o2,gn,,v20190726,201501-204912,/glade/collections/cmip/CMIP6/ScenarioMIP/MOHC...
629790,ScenarioMIP,MOHC,UKESM1-0-LL,ssp585,r2i1p1f2,Omon,o2,gn,,v20190726,210001-210012,/glade/collections/cmip/CMIP6/ScenarioMIP/MOHC...


### Loading data

The best part about `intake-esm` is that it enables loading data directly into an [xarray.Dataset](http://xarray.pydata.org/en/stable/api.html#dataset).

Note that data on the cloud are in 
[zarr](https://zarr.readthedocs.io/en/stable/) and data on 
[glade](https://www2.cisl.ucar.edu/resources/storage-and-file-systems/glade-file-spaces) are stored as 
[netCDF](https://www.unidata.ucar.edu/software/netcdf/) files. This is opaque to the user!

`intake-esm` has rules for aggegating datasets; these rules are defined in the collection-specification file.

In [9]:
dset_dict = cat.to_dataset_dict(zarr_kwargs={'consolidated': True, 'decode_times': False}, 
                                cdf_kwargs={'chunks': {'time': 48}, 'decode_times': False})

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'

--> There will be 10 group(s)


`dset_dict` is a dictionary of `xarray.Dataset`'s; its keys are constructed to refer to compatible groups.

In [10]:
dset_dict.keys()

dict_keys(['CMIP.CCCma.CanESM5.historical.Oyr.gn', 'CMIP.CNRM-CERFACS.CNRM-ESM2-1.historical.Omon.gn', 'CMIP.IPSL.IPSL-CM6A-LR.historical.Oyr.gn', 'CMIP.MIROC.MIROC-ES2L.historical.Oyr.gn', 'CMIP.MOHC.UKESM1-0-LL.historical.Omon.gn', 'ScenarioMIP.CCCma.CanESM5.ssp585.Oyr.gn', 'ScenarioMIP.CNRM-CERFACS.CNRM-ESM2-1.ssp585.Omon.gn', 'ScenarioMIP.IPSL.IPSL-CM6A-LR.ssp585.Oyr.gn', 'ScenarioMIP.MIROC.MIROC-ES2L.ssp585.Oyr.gn', 'ScenarioMIP.MOHC.UKESM1-0-LL.ssp585.Omon.gn'])

Get the grid information.

In [11]:
cat_fx = col.search(source_id=models_all, table_id='Ofx', grid_label='gn')
cat_fx.df

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,dcpp_init_year,version,time_range,path
22443,CMIP,CNRM-CERFACS,CNRM-ESM2-1,abrupt-4xCO2,r1i1p1f2,Ofx,areacello,gn,,v20181026,,/glade/collections/cmip/CMIP6/CMIP/CNRM-CERFAC...
23151,CMIP,CNRM-CERFACS,CNRM-ESM2-1,abrupt-4xCO2,r2i1p1f2,Ofx,areacello,gn,,v20181106,,/glade/collections/cmip/CMIP6/CMIP/CNRM-CERFAC...
23665,CMIP,CNRM-CERFACS,CNRM-ESM2-1,historical,r3i1p1f2,Ofx,areacello,gn,,v20190125,,/glade/collections/cmip/CMIP6/CMIP/CNRM-CERFAC...
23666,CMIP,CNRM-CERFACS,CNRM-ESM2-1,historical,r3i1p1f2,Ofx,deptho,gn,,v20190125,,/glade/collections/cmip/CMIP6/CMIP/CNRM-CERFAC...
24506,CMIP,CNRM-CERFACS,CNRM-ESM2-1,historical,r4i1p1f2,Ofx,masscello,gn,,v20190125,,/glade/collections/cmip/CMIP6/CMIP/CNRM-CERFAC...
...,...,...,...,...,...,...,...,...,...,...,...,...
687839,ScenarioMIP,IPSL,IPSL-CM6A-LR,ssp126,r1i1p1f1,Ofx,areacello,gn,,v20190121,,/glade/collections/cmip/CMIP6/ScenarioMIP/IPSL...
687840,ScenarioMIP,IPSL,IPSL-CM6A-LR,ssp126,r1i1p1f1,Ofx,hfgeou,gn,,v20190121,,/glade/collections/cmip/CMIP6/ScenarioMIP/IPSL...
687879,ScenarioMIP,IPSL,IPSL-CM6A-LR,ssp126,r3i1p1f1,Ofx,areacello,gn,,v20190410,,/glade/collections/cmip/CMIP6/ScenarioMIP/IPSL...
687880,ScenarioMIP,IPSL,IPSL-CM6A-LR,ssp126,r3i1p1f1,Ofx,hfgeou,gn,,v20190410,,/glade/collections/cmip/CMIP6/ScenarioMIP/IPSL...


We want to prune this query to remove the ensemble dimension.

In [12]:
# specify a list of queries to eliminate
corrupt_data = [dict(variable_id='areacello', source_id='IPSL-CM6A-LR',
                     experiment_id='historical', member_id='r2i1p1f1')
               ]

# copy the dataframe 
df = cat_fx.df.copy()

# eliminate data
for elim in corrupt_data:
    condition = np.ones(len(df), dtype=bool)
    for key, val in elim.items():
        condition = condition & (df[key] == val)
    df = df.loc[~condition]

df.drop_duplicates(subset=['source_id', 'variable_id'], inplace=True)
df['member_id'] = np.nan
cat_fx.df = df
df

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,dcpp_init_year,version,time_range,path
22443,CMIP,CNRM-CERFACS,CNRM-ESM2-1,abrupt-4xCO2,,Ofx,areacello,gn,,v20181026,,/glade/collections/cmip/CMIP6/CMIP/CNRM-CERFAC...
23666,CMIP,CNRM-CERFACS,CNRM-ESM2-1,historical,,Ofx,deptho,gn,,v20190125,,/glade/collections/cmip/CMIP6/CMIP/CNRM-CERFAC...
24506,CMIP,CNRM-CERFACS,CNRM-ESM2-1,historical,,Ofx,masscello,gn,,v20190125,,/glade/collections/cmip/CMIP6/CMIP/CNRM-CERFAC...
24507,CMIP,CNRM-CERFACS,CNRM-ESM2-1,historical,,Ofx,hfgeou,gn,,v20190125,,/glade/collections/cmip/CMIP6/CMIP/CNRM-CERFAC...
24508,CMIP,CNRM-CERFACS,CNRM-ESM2-1,historical,,Ofx,basin,gn,,v20190125,,/glade/collections/cmip/CMIP6/CMIP/CNRM-CERFAC...
202044,CMIP,MOHC,UKESM1-0-LL,piControl,,Ofx,sftof,gn,,v20190705,,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
202045,CMIP,MOHC,UKESM1-0-LL,piControl,,Ofx,hfgeou,gn,,v20190705,,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
202046,CMIP,MOHC,UKESM1-0-LL,piControl,,Ofx,basin,gn,,v20190705,,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
202047,CMIP,MOHC,UKESM1-0-LL,piControl,,Ofx,deptho,gn,,v20190705,,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
293683,CMIP,CCCma,CanESM5,abrupt-4xCO2,,Ofx,areacello,gn,,v20190429,,/glade/collections/cmip/CMIP6/CMIP/CCCma/CanES...


In [13]:
fx_dsets = cat_fx.to_dataset_dict(zarr_kwargs={'consolidated': True, 'decode_times': False}, 
                                  cdf_kwargs={'chunks': {}, 'decode_times': False})



xarray will load netCDF datasets with dask using a single chunk for all arrays.
                     For effective chunking, please provide chunks in cdf_kwargs.
                     For example: cdf_kwargs={'chunks': {'time': 36}}

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'

--> There will be 8 group(s)


In [14]:
fx_dsets.keys()

dict_keys(['CMIP.CCCma.CanESM5.abrupt-4xCO2.Ofx.gn', 'CMIP.CCCma.CanESM5.historical.Ofx.gn', 'CMIP.CNRM-CERFACS.CNRM-ESM2-1.abrupt-4xCO2.Ofx.gn', 'CMIP.CNRM-CERFACS.CNRM-ESM2-1.historical.Ofx.gn', 'CMIP.IPSL.IPSL-CM6A-LR.abrupt-4xCO2.Ofx.gn', 'CMIP.IPSL.IPSL-CM6A-LR.historical.Ofx.gn', 'CMIP.IPSL.IPSL-CM6A-LR.piControl.Ofx.gn', 'CMIP.MOHC.UKESM1-0-LL.piControl.Ofx.gn'])

In [15]:
for key, ds in fx_dsets.items():
    print(key)
    print(ds.data_vars)
    print()

CMIP.CCCma.CanESM5.abrupt-4xCO2.Ofx.gn
Data variables:
    latitude            (j, i) float64 dask.array<chunksize=(291, 360), meta=np.ndarray>
    longitude           (j, i) float64 dask.array<chunksize=(291, 360), meta=np.ndarray>
    vertices_latitude   (j, i, vertices) float64 dask.array<chunksize=(291, 360, 4), meta=np.ndarray>
    vertices_longitude  (j, i, vertices) float64 dask.array<chunksize=(291, 360, 4), meta=np.ndarray>
    areacello           (j, i) float32 dask.array<chunksize=(291, 360), meta=np.ndarray>

CMIP.CCCma.CanESM5.historical.Ofx.gn
Data variables:
    lev_bnds            (lev, bnds) float64 dask.array<chunksize=(45, 2), meta=np.ndarray>
    latitude            (j, i) float64 dask.array<chunksize=(291, 360), meta=np.ndarray>
    longitude           (j, i) float64 dask.array<chunksize=(291, 360), meta=np.ndarray>
    vertices_latitude   (j, i, vertices) float64 dask.array<chunksize=(291, 360, 4), meta=np.ndarray>
    vertices_longitude  (j, i, vertices) float64 