# Get Domain

**Author:** Andrew Loeppky (Lots of code stolen from Jamie Byer)

**Project:** Land-surface-atmosphere coupling - CMIP6 intercomparison 

This notebook is meant to acquire a dataset from the CMIP6 data library, cut out a user-specified spatial slice (between the latitude/longitude bounds set in the first code cell), and save the dataset in Zarr format. It also adds a 3D pressure field, computed from the surface pressure $ps$ and the hybrid sigma coefficients $ap$ and $b$ as $p = ap + b \cdot ps$.

## Helpful Docs

https://docs.google.com/document/d/1yUx6jr9EdedCOLd--CPdTfGDwEwzPpCF6p1jRmqx-0Q/edit#

https://towardsdatascience.com/a-quick-introduction-to-cmip6-e017127a49d3

https://pcmdi.llnl.gov/CMIP6/Guide/dataUsers.html

http://proj.badc.rl.ac.uk/svn/exarch/CMIP6dreq/tags/latest/dreqPy/docs/CMIP6_MIP_tables.xlsx

https://esgf-node.llnl.gov/search/cmip6/

In [1]:
# Attributes of the model we want to analyze (put in csv later)
# Exactly one source_id is active; other candidate models stay commented out
# so the value that is actually used is unambiguous.
# source_id = 'ACCESS-CM2'
source_id = 'GFDL-ESM4'
experiment_id = 'piControl'
table_id = 'Amon'

# Domain we wish to study
lats = (10, 20) # lat min, lat max
lons = (20, 29) # lon min, lon max
times = ()      # placeholder; not read by any cell visible in this notebook
ceil = 500 # top of domain, hPa

# variables of interest
fields_of_interest = ("ps",  # surface pressure
                      "cl",  # cloud fraction
                      "ta",  # air temperature
                      "ts",  # surface temperature
                      "hus", # specific humidity
                      "hfls", # Surface Upward Latent Heat Flux
                      "hfss", # Surface Upward Sensible Heat Flux
                      "rlds",  # surface downwelling longwave
                      "rlus",  # surface upwelling longwave
                      "rsds", # downwelling short wave
                      "rsus", # upwelling short wave
                      "hurs",  # near surface RH
                      "pr", # precipitation, all phases
                      "evspsbl", # evaporation, sublimation, transpiration
                      "wap"  # omega (subsidence rate in pressure coords)
                     )

In [2]:
# hourly data (put in csv with monthly control set)
#source_id = 'GFDL-ESM4'
#experiment_id = 'piControl'
#table_id = 'CF3hr'

#lats = (10, 20) # lat min, lat max
#lons = (20, 29) # lon min, lon max
#ceil = 500 # top of domain, hPa

# variables of interest
#fields_of_interest = ("ps",  # surface pressure
#                      "ta",)  # air temperature
                     # "ts",  # surface temperature
                     # "hus", # specific humidity
                     # "hfls", # Surface Upward Latent Heat Flux
                     # "hfss", # Surface Upward Sensible Heat Flux
                     # "rlds",  # surface downwelling longwave
                     # "rlus",  # surface upwelling longwave
                     # "rsds", # downwelling short wave
                     # "rsus", # upwelling short wave
                     # "hurs",  # near surface RH
                     # "pr", # precipitation, all phases
                     # "evspsbl", # evaporation, sublimation, transpiration
                     #)

In [3]:
import xarray as xr
import pooch
import pandas as pd
import fsspec
from pathlib import Path
import time
import numpy as np
import json

In [4]:
# Get the ESM datastore: fetch the Pangeo CMIP6 catalogue CSV and load it into
# a DataFrame. Later cells search this table for the zarr store URL
# ("zstore" column) of each requested variable.
odie = pooch.create(
    path="./.cache",  # download directory, relative to the working directory
    base_url="https://storage.googleapis.com/cmip6/",
    registry={
        # no hash is registered for this file
        # NOTE(review): per the pooch docs a None hash skips checksum
        # verification -- confirm this is intended
        "pangeo-cmip6.csv": None
    },
)
# fetch() hands back the path that read_csv then opens
file_path = odie.fetch("pangeo-cmip6.csv")
df_og = pd.read_csv(file_path)

In [5]:
def fetch_var_exact(the_dict,df_og):
    """
    returns the rows of a DataFrame whose columns exactly match every
    key/value pair of a dictionary

    the_dict = {column name: required value}; an empty dict applies no
               filter, so every row is returned
    df_og    = pandas DataFrame to search (it is not modified)

    raises KeyError if a key of the_dict is not a column of df_og
    """
    # Start from "every row matches" and narrow the mask once per criterion.
    # This makes the empty-dict case well defined (previously an IndexError)
    # and removes the special handling of the first key.
    matches = np.ones(len(df_og), dtype=bool)
    for column, wanted in the_dict.items():
        matches &= (df_og[column] == wanted).to_numpy()
    return df_og[matches]

In [6]:
def get_field(variable_id,
              df,
              source_id=source_id,
              experiment_id=experiment_id,
              table_id=table_id):
    """
    extracts a single variable field from the model

    variable_id   = variable name to look up in the catalogue, e.g. "ps"
    df            = catalogue DataFrame; it is searched on the columns
                    source_id, variable_id, experiment_id and table_id, and
                    the store URL is read from its "zstore" column
    source_id, experiment_id, table_id
                  = remaining search keys; the defaults are the notebook-level
                    values as they were when this cell was executed (default
                    arguments are bound at definition time, so re-run this
                    cell after changing the configuration)

    returns the xarray Dataset opened from the first matching zarr store

    raises IndexError if no catalogue row matches the search keys
    """
    var_dict = dict(source_id = source_id, variable_id = variable_id,
                    experiment_id = experiment_id, table_id = table_id)
    catalogue_hits = fetch_var_exact(var_dict, df)

    # Fail with the search keys in the message instead of an opaque
    # "index 0 is out of bounds" from the lookup below.
    if len(catalogue_hits) == 0:
        raise IndexError(f"no catalogue entry matches {var_dict}")

    # More than one row may match; as before, the first one is used.
    zstore_url = catalogue_hits['zstore'].iloc[0]
    the_mapper = fsspec.get_mapper(zstore_url)
    return xr.open_zarr(the_mapper, consolidated=True)

In [7]:
def trim_field(df, lat, lon):
    """
    returns the part of an xarray field that lies inside a lat/lon window

    df  = object exposing .sel() with "lat" and "lon" coordinates
    lat = (minlat, maxlat)
    lon = (minlon, maxlon)

    the selection is delegated to df.sel() using one slice per coordinate
    NOTE(review): presumably the coordinates are stored in ascending order and
    lon uses the same convention as the data -- confirm against the model grid
    """
    lat_window = slice(lat[0], lat[1])
    lon_window = slice(lon[0], lon[1])
    window = {"lat": lat_window, "lon": lon_window}
    return df.sel(**window)

## Create one big dataset to represent our domain

In [8]:
# grab all fields of interest and combine
# 1) open one Dataset per requested variable, using get_field's default
#    source_id / experiment_id / table_id
my_fields = [get_field(field, df_og) for field in fields_of_interest]
# 2) cut every Dataset down to the lats/lons window set in the config cell
small_fields = [trim_field(field, lats, lons) for field in my_fields]
# 3) merge the trimmed Datasets into one Dataset
# NOTE(review): combine_attrs="drop_conflicts" should drop attributes whose
# values disagree between the inputs -- confirm against the xarray docs
my_ds = xr.combine_by_coords(small_fields, compat="broadcast_equals", combine_attrs="drop_conflicts")

  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)
  return np.asarray(array[self.key], dtype=None)
  return np.asarray(array[self.key], dtype=None)
  return np.asarray(array[self.key], dtype=None)
  return np.asarray(array[self.key], dtype=None)
  return np.asarray(array[self.key], dtype=None)
  return np.asarray(array[self.key], dtype=None)
  return np.asarray(array[self.key], dtype=None)
  return np.asarray(array[self.key], dtype=None)
  return np.asarray(array[self.key], dtype=None)
  return np.asarray(array[self.key], dtype=None)


  return np.asarray(array[self.key], dtype=None)
  return np.asarray(array[self.key], dtype=None)
  return np.asarray(array[self.key], dtype=None)
  return np.asarray(array[self.key], dtype=None)
  return np.asarray(array[self.key], dtype=None)


In [9]:
# pressure on model levels, rebuilt from the sigma coefficients
def press_from_sigma(ds):
    """
    adds the full pressure field to a Dataset, computed as p = ap + b * ps

    expects ds to provide the variables:
        "ps" - surface pressure (Pa)
        "ap" - sigma pressure coordinate (offset term)
        "b"  - sigma pressure coordinate (factor applied to ps)

    the result is stored on the SAME object under the name "p" (the input is
    modified in place) and that object is returned
    """
    offset_term = ds.ap
    scaled_surface = ds.b * ds.ps
    ds["p"] = offset_term + scaled_surface
    return ds

In [10]:
def p_lcl(ds):
    """
    placeholder, not implemented yet: the body is only `pass`, so calling
    this function returns None

    intended input is an xarray Dataset with variables:
        "p" - pressure
        "ts" - surface (2m) air temperature
        "hurs" - near surface relative humidity
        "hus" - specific humidity
        
    horizontally averaged over the domain of the Dataset

    NOTE(review): the name suggests the pressure at the lifting condensation
    level -- confirm the intended output before implementing
    NOTE(review): the fields_of_interest list describes "ts" as surface
    temperature, not 2 m air temperature -- confirm which one is wanted here
    """
    pass

In [11]:
#mean_tsurf = my_ds.ts.mean(dim=("lat", "lon"))
#mean_psurf = my_ds.ps.mean(dim=("lat", "lon"))

In [12]:
# apply functions defined above to the field
#my_ds = press_from_sigma(my_ds)

In [15]:
# bare expression as the last line of the cell: displays the combined Dataset
my_ds

Unnamed: 0,Array,Chunk
Bytes,1.33 kiB,1.33 kiB
Shape,"(85, 2)","(85, 2)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.33 kiB 1.33 kiB Shape (85, 2) (85, 2) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",2  85,

Unnamed: 0,Array,Chunk
Bytes,1.33 kiB,1.33 kiB
Shape,"(85, 2)","(85, 2)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.33 kiB,1.33 kiB
Shape,"(85, 2)","(85, 2)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.33 kiB 1.33 kiB Shape (85, 2) (85, 2) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",2  85,

Unnamed: 0,Array,Chunk
Bytes,1.33 kiB,1.33 kiB
Shape,"(85, 2)","(85, 2)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,680 B,680 B
Shape,"(85,)","(85,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 680 B 680 B Shape (85,) (85,) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",85  1,

Unnamed: 0,Array,Chunk
Bytes,680 B,680 B
Shape,"(85,)","(85,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,62.26 MiB,116.88 kiB
Shape,"(6000, 85, 8, 4)","(11, 85, 8, 4)"
Count,1093 Tasks,546 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 62.26 MiB 116.88 kiB Shape (6000, 85, 8, 4) (11, 85, 8, 4) Count 1093 Tasks 546 Chunks Type float32 numpy.ndarray",6000  1  4  8  85,

Unnamed: 0,Array,Chunk
Bytes,62.26 MiB,116.88 kiB
Shape,"(6000, 85, 8, 4)","(11, 85, 8, 4)"
Count,1093 Tasks,546 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,128 B,128 B
Shape,"(8, 4)","(8, 4)"
Count,3 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 128 B 128 B Shape (8, 4) (8, 4) Count 3 Tasks 1 Chunks Type float32 numpy.ndarray",4  8,

Unnamed: 0,Array,Chunk
Bytes,128 B,128 B
Shape,"(8, 4)","(8, 4)"
Count,3 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,750.00 kiB,62.50 kiB
Shape,"(6000, 8, 4)","(500, 8, 4)"
Count,25 Tasks,12 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 750.00 kiB 62.50 kiB Shape (6000, 8, 4) (500, 8, 4) Count 25 Tasks 12 Chunks Type float32 numpy.ndarray",4  8  6000,

Unnamed: 0,Array,Chunk
Bytes,750.00 kiB,62.50 kiB
Shape,"(6000, 8, 4)","(500, 8, 4)"
Count,25 Tasks,12 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,750.00 kiB,69.00 kiB
Shape,"(6000, 8, 4)","(552, 8, 4)"
Count,23 Tasks,11 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 750.00 kiB 69.00 kiB Shape (6000, 8, 4) (552, 8, 4) Count 23 Tasks 11 Chunks Type float32 numpy.ndarray",4  8  6000,

Unnamed: 0,Array,Chunk
Bytes,750.00 kiB,69.00 kiB
Shape,"(6000, 8, 4)","(552, 8, 4)"
Count,23 Tasks,11 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,750.00 kiB,70.25 kiB
Shape,"(6000, 8, 4)","(562, 8, 4)"
Count,23 Tasks,11 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 750.00 kiB 70.25 kiB Shape (6000, 8, 4) (562, 8, 4) Count 23 Tasks 11 Chunks Type float32 numpy.ndarray",4  8  6000,

Unnamed: 0,Array,Chunk
Bytes,750.00 kiB,70.25 kiB
Shape,"(6000, 8, 4)","(562, 8, 4)"
Count,23 Tasks,11 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,750.00 kiB,104.25 kiB
Shape,"(6000, 8, 4)","(834, 8, 4)"
Count,17 Tasks,8 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 750.00 kiB 104.25 kiB Shape (6000, 8, 4) (834, 8, 4) Count 17 Tasks 8 Chunks Type float32 numpy.ndarray",4  8  6000,

Unnamed: 0,Array,Chunk
Bytes,750.00 kiB,104.25 kiB
Shape,"(6000, 8, 4)","(834, 8, 4)"
Count,17 Tasks,8 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,13.92 MiB,68.88 kiB
Shape,"(6000, 19, 8, 4)","(29, 19, 8, 4)"
Count,415 Tasks,207 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 13.92 MiB 68.88 kiB Shape (6000, 19, 8, 4) (29, 19, 8, 4) Count 415 Tasks 207 Chunks Type float32 numpy.ndarray",6000  1  4  8  19,

Unnamed: 0,Array,Chunk
Bytes,13.92 MiB,68.88 kiB
Shape,"(6000, 19, 8, 4)","(29, 19, 8, 4)"
Count,415 Tasks,207 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,750.00 kiB,62.00 kiB
Shape,"(6000, 8, 4)","(496, 8, 4)"
Count,27 Tasks,13 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 750.00 kiB 62.00 kiB Shape (6000, 8, 4) (496, 8, 4) Count 27 Tasks 13 Chunks Type float32 numpy.ndarray",4  8  6000,

Unnamed: 0,Array,Chunk
Bytes,750.00 kiB,62.00 kiB
Shape,"(6000, 8, 4)","(496, 8, 4)"
Count,27 Tasks,13 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,750.00 kiB,72.75 kiB
Shape,"(6000, 8, 4)","(582, 8, 4)"
Count,23 Tasks,11 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 750.00 kiB 72.75 kiB Shape (6000, 8, 4) (582, 8, 4) Count 23 Tasks 11 Chunks Type float32 numpy.ndarray",4  8  6000,

Unnamed: 0,Array,Chunk
Bytes,750.00 kiB,72.75 kiB
Shape,"(6000, 8, 4)","(582, 8, 4)"
Count,23 Tasks,11 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,750.00 kiB,74.25 kiB
Shape,"(6000, 8, 4)","(594, 8, 4)"
Count,23 Tasks,11 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 750.00 kiB 74.25 kiB Shape (6000, 8, 4) (594, 8, 4) Count 23 Tasks 11 Chunks Type float32 numpy.ndarray",4  8  6000,

Unnamed: 0,Array,Chunk
Bytes,750.00 kiB,74.25 kiB
Shape,"(6000, 8, 4)","(594, 8, 4)"
Count,23 Tasks,11 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,750.00 kiB,68.62 kiB
Shape,"(6000, 8, 4)","(549, 8, 4)"
Count,23 Tasks,11 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 750.00 kiB 68.62 kiB Shape (6000, 8, 4) (549, 8, 4) Count 23 Tasks 11 Chunks Type float32 numpy.ndarray",4  8  6000,

Unnamed: 0,Array,Chunk
Bytes,750.00 kiB,68.62 kiB
Shape,"(6000, 8, 4)","(549, 8, 4)"
Count,23 Tasks,11 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,750.00 kiB,69.25 kiB
Shape,"(6000, 8, 4)","(554, 8, 4)"
Count,23 Tasks,11 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 750.00 kiB 69.25 kiB Shape (6000, 8, 4) (554, 8, 4) Count 23 Tasks 11 Chunks Type float32 numpy.ndarray",4  8  6000,

Unnamed: 0,Array,Chunk
Bytes,750.00 kiB,69.25 kiB
Shape,"(6000, 8, 4)","(554, 8, 4)"
Count,23 Tasks,11 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,750.00 kiB,71.62 kiB
Shape,"(6000, 8, 4)","(573, 8, 4)"
Count,23 Tasks,11 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 750.00 kiB 71.62 kiB Shape (6000, 8, 4) (573, 8, 4) Count 23 Tasks 11 Chunks Type float32 numpy.ndarray",4  8  6000,

Unnamed: 0,Array,Chunk
Bytes,750.00 kiB,71.62 kiB
Shape,"(6000, 8, 4)","(573, 8, 4)"
Count,23 Tasks,11 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,13.92 MiB,92.62 kiB
Shape,"(6000, 19, 8, 4)","(39, 19, 8, 4)"
Count,309 Tasks,154 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 13.92 MiB 92.62 kiB Shape (6000, 19, 8, 4) (39, 19, 8, 4) Count 309 Tasks 154 Chunks Type float32 numpy.ndarray",6000  1  4  8  19,

Unnamed: 0,Array,Chunk
Bytes,13.92 MiB,92.62 kiB
Shape,"(6000, 19, 8, 4)","(39, 19, 8, 4)"
Count,309 Tasks,154 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,750.00 kiB,83.12 kiB
Shape,"(6000, 8, 4)","(665, 8, 4)"
Count,21 Tasks,10 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 750.00 kiB 83.12 kiB Shape (6000, 8, 4) (665, 8, 4) Count 21 Tasks 10 Chunks Type float32 numpy.ndarray",4  8  6000,

Unnamed: 0,Array,Chunk
Bytes,750.00 kiB,83.12 kiB
Shape,"(6000, 8, 4)","(665, 8, 4)"
Count,21 Tasks,10 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,13.92 MiB,73.62 kiB
Shape,"(6000, 19, 8, 4)","(31, 19, 8, 4)"
Count,389 Tasks,194 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 13.92 MiB 73.62 kiB Shape (6000, 19, 8, 4) (31, 19, 8, 4) Count 389 Tasks 194 Chunks Type float32 numpy.ndarray",6000  1  4  8  19,

Unnamed: 0,Array,Chunk
Bytes,13.92 MiB,73.62 kiB
Shape,"(6000, 19, 8, 4)","(31, 19, 8, 4)"
Count,389 Tasks,194 Chunks
Type,float32,numpy.ndarray


In [14]:
# Summary of the request. The `{name = }` f-string form prints the variable
# name together with the repr of its current value, so the text reflects
# whatever these names hold in the kernel when this cell is run.
print(f"""Fetched domain:
          {source_id = }
          {experiment_id = }
          {table_id = }
          {lats = }
          {lons = }
          dataset name: my_ds (xarray Dataset)""")

Fetched domain:
          source_id = 'ACCESS-CM2'
          experiment_id = 'piControl'
          table_id = 'Amon'
          lats = (10, 20)
          lons = (20, 29)
          dataset name: my_ds (xarray Dataset)
