# Plot AOU versus O2 Saturation 
Compute delta AOU and delta O2 saturation for different regiobs (global, tropics, extratropics)

In [1]:
%matplotlib inline

from itertools import product

import numpy as np

import pandas as pd
import xarray as xr
import intake

# util.py is in the local directory
# it contains code that is common across project notebooks
# or routines that are too extensive and might otherwise clutter
# the notebook design
import util 

In [4]:
if util.is_ncar_host():
    from ncar_jobqueue import NCARCluster
    cluster = NCARCluster(project='UCGD0006')
    cluster.adapt(minimum_jobs=1, maximum_jobs=10)
else:
    from dask_kubernetes import KubeCluster
    cluster = KubeCluster()
    cluster.adapt(minimum=1, maximum=10)
cluster

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


VBox(children=(HTML(value='<h2>NCARCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .…

In [5]:
from dask.distributed import Client
client = Client(cluster) # Connect this local process to remote workers
client

0,1
Client  Scheduler: tcp://128.117.181.209:35951  Dashboard: https://jupyterhub.ucar.edu/dav/user/shawnee/proxy/38097/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [2]:
if util.is_ncar_host():
    col = intake.open_esm_datastore("../catalogs/glade-cmip6.json")
else:
    col = intake.open_esm_datastore("../catalogs/pangeo-cmip6.json")
col

glade-cmip6-ESM Collection with 698724 entries:
	> 13 activity_id(s)

	> 24 institution_id(s)

	> 47 source_id(s)

	> 68 experiment_id(s)

	> 162 member_id(s)

	> 35 table_id(s)

	> 1027 variable_id(s)

	> 12 grid_label(s)

	> 59 dcpp_init_year(s)

	> 248 version(s)

	> 6813 time_range(s)

	> 698724 path(s)

In [3]:
col.df.head()

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,dcpp_init_year,version,time_range,path
0,AerChemMIP,BCC,BCC-ESM1,ssp370,r2i1p1f1,day,pr,gn,,v20190702,20150101-20551231,/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
1,AerChemMIP,BCC,BCC-ESM1,ssp370,r2i1p1f1,Amon,hfls,gn,,v20190624,201501-205512,/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
2,AerChemMIP,BCC,BCC-ESM1,ssp370,r2i1p1f1,Amon,prsn,gn,,v20190624,201501-205512,/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
3,AerChemMIP,BCC,BCC-ESM1,ssp370,r2i1p1f1,Amon,va,gn,,v20190624,201501-205512,/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
4,AerChemMIP,BCC,BCC-ESM1,ssp370,r2i1p1f1,Amon,tas,gn,,v20190624,201501-205512,/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...


In [4]:
import pprint 
uni_dict = col.unique(['source_id', 'experiment_id','table_id'])
pprint.pprint(uni_dict, compact=True)

{'experiment_id': {'count': 68,
                   'values': ['ssp370', 'histSST-piNTCF', 'histSST',
                              'histSST-1950HC', 'hist-1950HC', 'hist-piNTCF',
                              'piClim-NTCF', 'ssp370SST-lowNTCF',
                              'ssp370-lowNTCF', 'ssp370SST', '1pctCO2-bgc',
                              'hist-bgc', 'esm-ssp585', 'amip-future4K',
                              'amip-m4K', 'a4SST', 'aqua-p4K', 'piSST',
                              'amip-4xCO2', 'a4SSTice', 'amip-p4K',
                              'aqua-control', 'aqua-4xCO2', 'abrupt-4xCO2',
                              'historical', 'piControl', 'amip', '1pctCO2',
                              'esm-hist', 'esm-piControl', 'ssp245', 'ssp585',
                              'ssp126', 'hist-GHG', 'hist-aer',
                              'dcppA-hindcast', 'dcppC-hindcast-noPinatubo',
                              'dcppC-hindcast-noElChichon', 'dcppA-assim',
                   

In [5]:
experiments = ['historical', 'ssp585']


def get_models(table_id, variable_id):
    # all models
    models = set(uni_dict['source_id']['values'])

    for experiment_id in experiments:
        query = dict(experiment_id=experiment_id, variable_id=variable_id, 
                     table_id=table_id, grid_label='gn')  
        cat = col.search(**query)
        models = models.intersection({model for model in cat.df.source_id.unique().tolist()})

    # ensure the CESM2 models are not included (oxygen was erroneously submitted to the archive)
    return models - {'CESM2-WACCM', 'CESM2'}
    

models = {}    

# look for models with O2, and O2sat
models['Omon.o2sat'] = get_models('Omon', 'o2sat')
have_sat = models['Omon.o2sat']

# find models with O2sat that also have O2
models['Omon.o2'] = get_models('Omon', 'o2').intersection(have_sat)

models_all = list(models['Omon.o2'])
models

{'Omon.o2sat': {'UKESM1-0-LL'}, 'Omon.o2': {'UKESM1-0-LL'}}

Only UKESM1-0-LL has O2, O2sat. Grab ones with conservative temperature (bigthetao), salinity (so), and pressure/depth to be able to calculate oxygen solubility?

In [6]:
df = pd.DataFrame()
for key, val in models.items():
    model_list = list(val)
    table_id = key.split('.')[0]
    variable_id = key.split('.')[1]
    
    cat = col.search(experiment_id=experiments, table_id=table_id, 
                     variable_id=variable_id, 
                     source_id=model_list, grid_label='gn')
    df = pd.concat((df, cat.df))

cat_o2 = col.search(experiment_id=['historical', 'ssp585'], table_id='Omon', 
                 variable_id='o2', grid_label='gn', source_id='UKESM1-0-LL')
cat_o2sat = col.search(experiment_id=['historical', 'ssp585'], table_id='Omon', 
                 variable_id='o2sat', grid_label='gn', source_id='UKESM1-0-LL')
df = pd.concat((cat_o2.df, cat_o2sat.df))
cat.df = df.copy()
cat.df

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,dcpp_init_year,version,time_range,path
195538,CMIP,MOHC,UKESM1-0-LL,historical,r3i1p1f2,Omon,o2,gn,,v20190708,185001-189912,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
195539,CMIP,MOHC,UKESM1-0-LL,historical,r3i1p1f2,Omon,o2,gn,,v20190708,195001-199912,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
195540,CMIP,MOHC,UKESM1-0-LL,historical,r3i1p1f2,Omon,o2,gn,,v20190708,200001-201412,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
198604,CMIP,MOHC,UKESM1-0-LL,historical,r1i1p1f2,Omon,o2,gn,,v20190627,195001-199912,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
198605,CMIP,MOHC,UKESM1-0-LL,historical,r1i1p1f2,Omon,o2,gn,,v20190627,190001-194912,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
200464,CMIP,MOHC,UKESM1-0-LL,historical,r2i1p1f2,Omon,o2,gn,,v20190708,195001-199912,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
200465,CMIP,MOHC,UKESM1-0-LL,historical,r2i1p1f2,Omon,o2,gn,,v20190708,200001-201412,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
200466,CMIP,MOHC,UKESM1-0-LL,historical,r2i1p1f2,Omon,o2,gn,,v20190708,185001-189912,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
637629,ScenarioMIP,MOHC,UKESM1-0-LL,ssp585,r3i1p1f2,Omon,o2,gn,,v20190813,205001-209912,/glade/collections/cmip/CMIP6/ScenarioMIP/MOHC...
637630,ScenarioMIP,MOHC,UKESM1-0-LL,ssp585,r3i1p1f2,Omon,o2,gn,,v20190813,201501-204912,/glade/collections/cmip/CMIP6/ScenarioMIP/MOHC...


In [7]:
# specify a list of queries to eliminate
bad_member = [dict(member_id='r4i1p1f2')
               ]

# copy the dataframe 
df = cat.df.copy()

# eliminate data
for elim in bad_member:
    condition = np.ones(len(df), dtype=bool)
    for key, val in elim.items():
        condition = condition & (df[key] == val)
    df = df.loc[~condition]

cat.df = df
df

#dset_dict = cat.to_dataset_dict(zarr_kwargs={'consolidated': True, 'decode_times': False}, 
#                                cdf_kwargs={'chunks': {'time': 48}, 'decode_times': False})

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,dcpp_init_year,version,time_range,path
195538,CMIP,MOHC,UKESM1-0-LL,historical,r3i1p1f2,Omon,o2,gn,,v20190708,185001-189912,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
195539,CMIP,MOHC,UKESM1-0-LL,historical,r3i1p1f2,Omon,o2,gn,,v20190708,195001-199912,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
195540,CMIP,MOHC,UKESM1-0-LL,historical,r3i1p1f2,Omon,o2,gn,,v20190708,200001-201412,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
198604,CMIP,MOHC,UKESM1-0-LL,historical,r1i1p1f2,Omon,o2,gn,,v20190627,195001-199912,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
198605,CMIP,MOHC,UKESM1-0-LL,historical,r1i1p1f2,Omon,o2,gn,,v20190627,190001-194912,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
200464,CMIP,MOHC,UKESM1-0-LL,historical,r2i1p1f2,Omon,o2,gn,,v20190708,195001-199912,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
200465,CMIP,MOHC,UKESM1-0-LL,historical,r2i1p1f2,Omon,o2,gn,,v20190708,200001-201412,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
200466,CMIP,MOHC,UKESM1-0-LL,historical,r2i1p1f2,Omon,o2,gn,,v20190708,185001-189912,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
637629,ScenarioMIP,MOHC,UKESM1-0-LL,ssp585,r3i1p1f2,Omon,o2,gn,,v20190813,205001-209912,/glade/collections/cmip/CMIP6/ScenarioMIP/MOHC...
637630,ScenarioMIP,MOHC,UKESM1-0-LL,ssp585,r3i1p1f2,Omon,o2,gn,,v20190813,201501-204912,/glade/collections/cmip/CMIP6/ScenarioMIP/MOHC...


In [None]:
#dset_dict.keys()

In [33]:
cat_fx = col.search(source_id=models_all, table_id='Ofx', grid_label='gn')
cat_fx.df

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,dcpp_init_year,version,time_range,path
202044,CMIP,MOHC,UKESM1-0-LL,piControl,r1i1p1f2,Ofx,sftof,gn,,v20190705,,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
202045,CMIP,MOHC,UKESM1-0-LL,piControl,r1i1p1f2,Ofx,hfgeou,gn,,v20190705,,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
202046,CMIP,MOHC,UKESM1-0-LL,piControl,r1i1p1f2,Ofx,basin,gn,,v20190705,,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
202047,CMIP,MOHC,UKESM1-0-LL,piControl,r1i1p1f2,Ofx,deptho,gn,,v20190705,,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...


Choose just one matching 

In [33]:
df_Omon_o2_sat = df[(cat.df.table_id == 'Omon') & (cat.df.source_id == 'UKESM1-0-LL') & (cat.df.member_id == 'r1i1p1f2') & (cat.df.variable_id == 'o2sat')]
df_Omon_o2 = df[(cat.df.table_id == 'Omon') & (cat.df.source_id == 'UKESM1-0-LL') & (cat.df.member_id == 'r1i1p1f2') & (cat.df.variable_id == 'o2')]

#df_Omon_o2_sat = df[(cat.df.table_id == 'Omon') & (cat.df.source_id == 'UKESM1-0-LL') & (cat.df.variable_id == 'o2sat')]
#df_Omon_o2 = df[(cat.df.table_id == 'Omon') & (cat.df.source_id == 'UKESM1-0-LL') & (cat.df.variable_id == 'o2')]

df = pd.concat((df_Omon_o2, df_Omon_o2_sat))
cat.df = df.copy()
cat.df

dset_dict = cat.to_dataset_dict(zarr_kwargs={'consolidated': True, 'decode_times': False}, 
                                cdf_kwargs={'chunks': {'time': 48}, 'decode_times': False})

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'

--> There will be 2 group(s)


In [34]:
dset_dict.keys()

dict_keys(['CMIP.MOHC.UKESM1-0-LL.historical.Omon.gn', 'ScenarioMIP.MOHC.UKESM1-0-LL.ssp585.Omon.gn'])

In [30]:
#o2_sat_member = {df_Omon_o2_sat.member_id}
#o2_member = {df_Omon_o2.member_id}
#member_intersect = list(set(o2_member) & set(o2_sat_member))

In [35]:
cat_fx = col.search(source_id='UKESM1-0-LL', table_id='Ofx', grid_label='gn')
cat_fx.df

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,dcpp_init_year,version,time_range,path
201685,CMIP,MOHC,UKESM1-0-LL,piControl,r1i1p1f2,Ofx,sftof,gn,,v20190705,,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
201686,CMIP,MOHC,UKESM1-0-LL,piControl,r1i1p1f2,Ofx,areacello,gn,,v20190705,,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
201687,CMIP,MOHC,UKESM1-0-LL,piControl,r1i1p1f2,Ofx,hfgeou,gn,,v20190705,,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
201688,CMIP,MOHC,UKESM1-0-LL,piControl,r1i1p1f2,Ofx,basin,gn,,v20190705,,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
201689,CMIP,MOHC,UKESM1-0-LL,piControl,r1i1p1f2,Ofx,deptho,gn,,v20190705,,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...


In [36]:
# specify a list of queries to eliminate
corrupt_data = [dict(variable_id='areacello', source_id='IPSL-CM6A-LR',
                     experiment_id='historical', member_id='r2i1p1f1')
               ]

# copy the dataframe 
df = cat_fx.df.copy()

# eliminate data
for elim in corrupt_data:
    condition = np.ones(len(df), dtype=bool)
    for key, val in elim.items():
        condition = condition & (df[key] == val)
    df = df.loc[~condition]

df.drop_duplicates(subset=['source_id', 'variable_id'], inplace=True)
df['member_id'] = np.nan
cat_fx.df = df
df

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,dcpp_init_year,version,time_range,path
201685,CMIP,MOHC,UKESM1-0-LL,piControl,,Ofx,sftof,gn,,v20190705,,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
201686,CMIP,MOHC,UKESM1-0-LL,piControl,,Ofx,areacello,gn,,v20190705,,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
201687,CMIP,MOHC,UKESM1-0-LL,piControl,,Ofx,hfgeou,gn,,v20190705,,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
201688,CMIP,MOHC,UKESM1-0-LL,piControl,,Ofx,basin,gn,,v20190705,,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...
201689,CMIP,MOHC,UKESM1-0-LL,piControl,,Ofx,deptho,gn,,v20190705,,/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1...


In [37]:
fx_dsets = cat_fx.to_dataset_dict(zarr_kwargs={'consolidated': True, 'decode_times': False}, 
                                  cdf_kwargs={'chunks': {}, 'decode_times': False})



xarray will load netCDF datasets with dask using a single chunk for all arrays.
For effective chunking, please provide chunks in cdf_kwargs.
For example: cdf_kwargs={'chunks': {'time': 36}}

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'

--> There will be 1 group(s)


In [38]:
fx_dsets.keys()

dict_keys(['CMIP.MOHC.UKESM1-0-LL.piControl.Ofx.gn'])

In [39]:
for key, ds in fx_dsets.items():
    print(key)
    print(ds.data_vars)
    print()

CMIP.MOHC.UKESM1-0-LL.piControl.Ofx.gn
Data variables:
    latitude            (j, i) float32 dask.array<chunksize=(330, 360), meta=np.ndarray>
    longitude           (j, i) float32 dask.array<chunksize=(330, 360), meta=np.ndarray>
    vertices_latitude   (j, i, vertices) float32 dask.array<chunksize=(330, 360, 4), meta=np.ndarray>
    vertices_longitude  (j, i, vertices) float32 dask.array<chunksize=(330, 360, 4), meta=np.ndarray>
    type                |S3 ...
    sftof               (j, i) float32 dask.array<chunksize=(330, 360), meta=np.ndarray>
    areacello           (j, i) float32 dask.array<chunksize=(330, 360), meta=np.ndarray>
    hfgeou              (j, i) float32 dask.array<chunksize=(330, 360), meta=np.ndarray>
    basin               (j, i) float64 dask.array<chunksize=(330, 360), meta=np.ndarray>
    deptho              (j, i) float32 dask.array<chunksize=(330, 360), meta=np.ndarray>



In [47]:
cat.df[variable_id == 'o2']

KeyError: True