# Get Domain

**Author:** Andrew Loeppky (Lots of code stolen from Jamie Byer)

**Project:** Land-surface-atmosphere coupling - CMIP6 intercomparison 

This notebook is meant to acquire a dataset from the CMIP6 data library, chop out a pre-specified spatial slice (between coordinates specified by user), and save the dataset in Zarr format. Also adds a 3d pressure field variable, converting from surface pressure and sigma values $ap$ and $b$

## Helpful Docs

https://docs.google.com/document/d/1yUx6jr9EdedCOLd--CPdTfGDwEwzPpCF6p1jRmqx-0Q/edit#

https://towardsdatascience.com/a-quick-introduction-to-cmip6-e017127a49d3

https://pcmdi.llnl.gov/CMIP6/Guide/dataUsers.html

http://proj.badc.rl.ac.uk/svn/exarch/CMIP6dreq/tags/latest/dreqPy/docs/CMIP6_MIP_tables.xlsx

https://esgf-node.llnl.gov/search/cmip6/

In [1]:
# Attributes of the model we want to analyze (put in csv later)
source_id = 'AWI-CM-1-1-LR'
source_id = 'GFDL-ESM4'
experiment_id = 'piControl'
table_id = 'Amon'

# Domain we wish to study
lats = (10, 20) # lat min, lat max
lons = (20, 29) # lon min, lon max
times = ()
ceil = 500 # top of domain, hPa

# variables of interest
fields_of_interest = ("ps",  # surface pressure
                      "cl",  # cloud fraction
                      "ta",  # air temperature
                      "ts",  # surface temperature
                      "hus", # specific humidity
                      "hfls", # Surface Upward Latent Heat Flux
                      "hfss", # Surface Upward Sensible Heat Flux
                      "rlds",  # surface downwelling longwave
                      "rlus",  # surface upwelling longwave
                      "rsds", # downwelling short wave
                      "rsus", # upwelling short wave
                      "hurs",  # near surface RH
                      "pr", # precipitation, all phases
                      "evspsbl", # evaporation, sublimation, transpiration
                      "wap"  # omega (subsidence rate in pressure coords)
                     )

In [2]:
# hourly data (put in csv with monthly control set)
'''
source_id = 'GFDL-ESM4'
experiment_id = 'piControl'
#table_id = 'CF3hr'

lats = (10, 20) # lat min, lat max
lons = (20, 29) # lon min, lon max
ceil = 500 # top of domain, hPa

# variables of interest
fields_of_interest = ("ps",  # surface pressure
                      "ta",  # air temperature
                      "ts",  # surface temperature
                      "hus", # specific humidity
                      "hfls", # Surface Upward Latent Heat Flux
                      "hfss", # Surface Upward Sensible Heat Flux
                      "rlds",  # surface downwelling longwave
                      "rlus",  # surface upwelling longwave
                      "rsds", # downwelling short wave
                      "rsus", # upwelling short wave
                      "hurs",  # near surface RH
                      "pr",) # precipitation, all phases
                     # "evspsbl", # evaporation, sublimation, transpiration
                     #)
'''

'\nsource_id = \'GFDL-ESM4\'\nexperiment_id = \'piControl\'\n#table_id = \'CF3hr\'\n\nlats = (10, 20) # lat min, lat max\nlons = (20, 29) # lon min, lon max\nceil = 500 # top of domain, hPa\n\n# variables of interest\nfields_of_interest = ("ps",  # surface pressure\n                      "ta",  # air temperature\n                      "ts",  # surface temperature\n                      "hus", # specific humidity\n                      "hfls", # Surface Upward Latent Heat Flux\n                      "hfss", # Surface Upward Sensible Heat Flux\n                      "rlds",  # surface downwelling longwave\n                      "rlus",  # surface upwelling longwave\n                      "rsds", # downwelling short wave\n                      "rsus", # upwelling short wave\n                      "hurs",  # near surface RH\n                      "pr",) # precipitation, all phases\n                     # "evspsbl", # evaporation, sublimation, transpiration\n                     #)\n'

In [3]:
import xarray as xr
import pooch
import pandas as pd
import fsspec
from pathlib import Path
import time
import numpy as np
import json

In [4]:
#get esm datastore
odie = pooch.create(
    path="./.cache",
    base_url="https://storage.googleapis.com/cmip6/",
    registry={
        "pangeo-cmip6.csv": None
    },
)
file_path = odie.fetch("pangeo-cmip6.csv")
df_og = pd.read_csv(file_path)

In [27]:
df_og[df_og.source_id == source_id][df_og.experiment_id == experiment_id][df_og.table_id == table_id].variable_id

  df_og[df_og.source_id == source_id][df_og.experiment_id == experiment_id][df_og.table_id == table_id].variable_id


16494             vas
16711             wap
16713              zg
18342       ch4global
18344              ci
18345             cli
18346             n2o
18347            huss
18348             hus
18349            hurs
18350             hur
18351            hfss
18353            hfls
18355             cct
18356              va
18357             uas
18358              ua
18359              ts
18360            tauv
18361            tauu
18362          tasmin
18363          tasmax
18364             tas
18365            prsn
18366              pr
18367              o3
18368       n2oglobal
18369             ch4
18370     cfc12global
18371     cfc11global
18372    cfc113global
18375    hcfc22global
18376         co2mass
18395         evspsbl
18398             co2
18399           clwvi
18400             clw
18401             clt
18402           clivi
18403              cl
18461            rsdt
18462          rsdscs
18463            rsds
18464          rsutcs
18465          rlutcs
18466     

In [6]:
def fetch_var_exact(the_dict,df_og):
    the_keys = list(the_dict.keys())
    #print(the_keys)
    key0 = the_keys[0]
    #print(key0)
    #print(the_dict[key0])
    hit0 = df_og[key0] == the_dict[key0]
    if len(the_keys) > 1:
        hitnew = hit0
        for key in the_keys[1:]:
            hit = df_og[key] == the_dict[key]
            hitnew = np.logical_and(hitnew,hit)
            #print("total hits: ",np.sum(hitnew))
    else:
        hitnew = hit0
    df_result = df_og[hitnew]
    return df_result

In [7]:
def get_field(variable_id, 
              df,
              source_id=source_id,
              experiment_id=experiment_id,
              table_id=table_id):
    """
    extracts a single variable field from the model
    """

    var_dict = dict(source_id = source_id, variable_id = variable_id,
                    experiment_id = experiment_id, table_id = table_id)
    
    local_var = fetch_var_exact(var_dict, df)
    try:
        zstore_url = local_var['zstore'].array[0]
    except:
        print(f"failed on '{variable_id}'.")
        print(f"fields available in {local_var}")
    the_mapper=fsspec.get_mapper(zstore_url)
    local_var = xr.open_zarr(the_mapper, consolidated=True)
    return local_var

In [8]:
def trim_field(df, lat, lon):
    """
    cuts out a specified domain from an xarrray field
    
    lat = (minlat, maxlat)
    lon = (minlon, maxlon)
    """
    new_field = df.sel(lat=slice(lat[0],lat[1]), lon=slice(lon[0],lon[1]))
    return new_field

## Create one big dataset to represent our domain

In [9]:
# grab all fields of interest and combine
my_fields = [get_field(field, df_og) for field in fields_of_interest]
small_fields = [trim_field(field, lats, lons) for field in my_fields]
my_ds = xr.combine_by_coords(small_fields, compat="broadcast_equals", combine_attrs="drop_conflicts")

In [10]:
# add pressure field, convert from sigma pressure
def press_from_sigma(ds):
    """
    takes in an xarray Dataset with variables:
        "ps" - surface pressure (Pa)
        "ap" - sigma pressure coordinate
        "b"  - sigma pressure coordinate

    returns the Dataset with a variable "p", the full pressure (Pa)
    """
    ds["p"] = ds.ap + ds.b * ds.ps
    return ds

In [11]:
def p_lcl(ds):
    """
    takes in an xarray Dataset with variables:
        "p" - pressure
        "ts" - surface (2m) air temperature
        "hurs" - near surface relative humidity
        "hus" - specific humidity
        
    horizontally averaged over the domain of the Dataset
    """
    pass

In [12]:
#mean_tsurf = my_ds.ts.mean(dim=("lat", "lon"))
#mean_psurf = my_ds.ps.mean(dim=("lat", "lon"))

In [13]:
# apply functions defined above to the field
#my_ds = press_from_sigma(my_ds)

In [14]:
my_ds

Unnamed: 0,Array,Chunk
Bytes,784 B,784 B
Shape,"(49, 2)","(49, 2)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 784 B 784 B Shape (49, 2) (49, 2) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",2  49,

Unnamed: 0,Array,Chunk
Bytes,784 B,784 B
Shape,"(49, 2)","(49, 2)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,784 B,784 B
Shape,"(49, 2)","(49, 2)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 784 B 784 B Shape (49, 2) (49, 2) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",2  49,

Unnamed: 0,Array,Chunk
Bytes,784 B,784 B
Shape,"(49, 2)","(49, 2)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,784 B,784 B
Shape,"(49, 2)","(49, 2)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 784 B 784 B Shape (49, 2) (49, 2) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",2  49,

Unnamed: 0,Array,Chunk
Bytes,784 B,784 B
Shape,"(49, 2)","(49, 2)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,392 B,392 B
Shape,"(49,)","(49,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 392 B 392 B Shape (49,) (49,) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",49  1,

Unnamed: 0,Array,Chunk
Bytes,392 B,392 B
Shape,"(49,)","(49,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,392 B,392 B
Shape,"(49,)","(49,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 392 B 392 B Shape (49,) (49,) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",49  1,

Unnamed: 0,Array,Chunk
Bytes,392 B,392 B
Shape,"(49,)","(49,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,78.51 MiB,160.78 kiB
Shape,"(6000, 49, 10, 7)","(12, 49, 10, 7)"
Count,1001 Tasks,500 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 78.51 MiB 160.78 kiB Shape (6000, 49, 10, 7) (12, 49, 10, 7) Count 1001 Tasks 500 Chunks Type float32 numpy.ndarray",6000  1  7  10  49,

Unnamed: 0,Array,Chunk
Bytes,78.51 MiB,160.78 kiB
Shape,"(6000, 49, 10, 7)","(12, 49, 10, 7)"
Count,1001 Tasks,500 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.60 MiB,170.35 kiB
Shape,"(6000, 10, 7)","(623, 10, 7)"
Count,21 Tasks,10 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.60 MiB 170.35 kiB Shape (6000, 10, 7) (623, 10, 7) Count 21 Tasks 10 Chunks Type float32 numpy.ndarray",7  10  6000,

Unnamed: 0,Array,Chunk
Bytes,1.60 MiB,170.35 kiB
Shape,"(6000, 10, 7)","(623, 10, 7)"
Count,21 Tasks,10 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.60 MiB,84.77 kiB
Shape,"(6000, 10, 7)","(310, 10, 7)"
Count,41 Tasks,20 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.60 MiB 84.77 kiB Shape (6000, 10, 7) (310, 10, 7) Count 41 Tasks 20 Chunks Type float32 numpy.ndarray",7  10  6000,

Unnamed: 0,Array,Chunk
Bytes,1.60 MiB,84.77 kiB
Shape,"(6000, 10, 7)","(310, 10, 7)"
Count,41 Tasks,20 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.60 MiB,82.85 kiB
Shape,"(6000, 10, 7)","(303, 10, 7)"
Count,41 Tasks,20 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.60 MiB 82.85 kiB Shape (6000, 10, 7) (303, 10, 7) Count 41 Tasks 20 Chunks Type float32 numpy.ndarray",7  10  6000,

Unnamed: 0,Array,Chunk
Bytes,1.60 MiB,82.85 kiB
Shape,"(6000, 10, 7)","(303, 10, 7)"
Count,41 Tasks,20 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.60 MiB,164.06 kiB
Shape,"(6000, 10, 7)","(600, 10, 7)"
Count,21 Tasks,10 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.60 MiB 164.06 kiB Shape (6000, 10, 7) (600, 10, 7) Count 21 Tasks 10 Chunks Type float32 numpy.ndarray",7  10  6000,

Unnamed: 0,Array,Chunk
Bytes,1.60 MiB,164.06 kiB
Shape,"(6000, 10, 7)","(600, 10, 7)"
Count,21 Tasks,10 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,30.44 MiB,155.86 kiB
Shape,"(6000, 19, 10, 7)","(30, 19, 10, 7)"
Count,401 Tasks,200 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 30.44 MiB 155.86 kiB Shape (6000, 19, 10, 7) (30, 19, 10, 7) Count 401 Tasks 200 Chunks Type float32 numpy.ndarray",6000  1  7  10  19,

Unnamed: 0,Array,Chunk
Bytes,30.44 MiB,155.86 kiB
Shape,"(6000, 19, 10, 7)","(30, 19, 10, 7)"
Count,401 Tasks,200 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.60 MiB,164.06 kiB
Shape,"(6000, 10, 7)","(600, 10, 7)"
Count,21 Tasks,10 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.60 MiB 164.06 kiB Shape (6000, 10, 7) (600, 10, 7) Count 21 Tasks 10 Chunks Type float32 numpy.ndarray",7  10  6000,

Unnamed: 0,Array,Chunk
Bytes,1.60 MiB,164.06 kiB
Shape,"(6000, 10, 7)","(600, 10, 7)"
Count,21 Tasks,10 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.60 MiB,164.06 kiB
Shape,"(6000, 10, 7)","(600, 10, 7)"
Count,21 Tasks,10 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.60 MiB 164.06 kiB Shape (6000, 10, 7) (600, 10, 7) Count 21 Tasks 10 Chunks Type float32 numpy.ndarray",7  10  6000,

Unnamed: 0,Array,Chunk
Bytes,1.60 MiB,164.06 kiB
Shape,"(6000, 10, 7)","(600, 10, 7)"
Count,21 Tasks,10 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.60 MiB,101.72 kiB
Shape,"(6000, 10, 7)","(372, 10, 7)"
Count,35 Tasks,17 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.60 MiB 101.72 kiB Shape (6000, 10, 7) (372, 10, 7) Count 35 Tasks 17 Chunks Type float32 numpy.ndarray",7  10  6000,

Unnamed: 0,Array,Chunk
Bytes,1.60 MiB,101.72 kiB
Shape,"(6000, 10, 7)","(372, 10, 7)"
Count,35 Tasks,17 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.60 MiB,102.54 kiB
Shape,"(6000, 10, 7)","(375, 10, 7)"
Count,33 Tasks,16 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.60 MiB 102.54 kiB Shape (6000, 10, 7) (375, 10, 7) Count 33 Tasks 16 Chunks Type float32 numpy.ndarray",7  10  6000,

Unnamed: 0,Array,Chunk
Bytes,1.60 MiB,102.54 kiB
Shape,"(6000, 10, 7)","(375, 10, 7)"
Count,33 Tasks,16 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.60 MiB,97.89 kiB
Shape,"(6000, 10, 7)","(358, 10, 7)"
Count,35 Tasks,17 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.60 MiB 97.89 kiB Shape (6000, 10, 7) (358, 10, 7) Count 35 Tasks 17 Chunks Type float32 numpy.ndarray",7  10  6000,

Unnamed: 0,Array,Chunk
Bytes,1.60 MiB,97.89 kiB
Shape,"(6000, 10, 7)","(358, 10, 7)"
Count,35 Tasks,17 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.60 MiB,94.61 kiB
Shape,"(6000, 10, 7)","(346, 10, 7)"
Count,37 Tasks,18 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.60 MiB 94.61 kiB Shape (6000, 10, 7) (346, 10, 7) Count 37 Tasks 18 Chunks Type float32 numpy.ndarray",7  10  6000,

Unnamed: 0,Array,Chunk
Bytes,1.60 MiB,94.61 kiB
Shape,"(6000, 10, 7)","(346, 10, 7)"
Count,37 Tasks,18 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,30.44 MiB,155.86 kiB
Shape,"(6000, 19, 10, 7)","(30, 19, 10, 7)"
Count,401 Tasks,200 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 30.44 MiB 155.86 kiB Shape (6000, 19, 10, 7) (30, 19, 10, 7) Count 401 Tasks 200 Chunks Type float32 numpy.ndarray",6000  1  7  10  19,

Unnamed: 0,Array,Chunk
Bytes,30.44 MiB,155.86 kiB
Shape,"(6000, 19, 10, 7)","(30, 19, 10, 7)"
Count,401 Tasks,200 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.60 MiB,164.06 kiB
Shape,"(6000, 10, 7)","(600, 10, 7)"
Count,21 Tasks,10 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.60 MiB 164.06 kiB Shape (6000, 10, 7) (600, 10, 7) Count 21 Tasks 10 Chunks Type float32 numpy.ndarray",7  10  6000,

Unnamed: 0,Array,Chunk
Bytes,1.60 MiB,164.06 kiB
Shape,"(6000, 10, 7)","(600, 10, 7)"
Count,21 Tasks,10 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,30.44 MiB,77.93 kiB
Shape,"(6000, 19, 10, 7)","(15, 19, 10, 7)"
Count,801 Tasks,400 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 30.44 MiB 77.93 kiB Shape (6000, 19, 10, 7) (15, 19, 10, 7) Count 801 Tasks 400 Chunks Type float32 numpy.ndarray",6000  1  7  10  19,

Unnamed: 0,Array,Chunk
Bytes,30.44 MiB,77.93 kiB
Shape,"(6000, 19, 10, 7)","(15, 19, 10, 7)"
Count,801 Tasks,400 Chunks
Type,float32,numpy.ndarray


In [15]:
print(f"""Fetched domain:
          {source_id = }
          {experiment_id = }
          {table_id = }
          {lats = }
          {lons = }
          dataset name: my_ds (xarray Dataset)""")

Fetched domain:
          source_id = 'GFDL-ESM4'
          experiment_id = 'piControl'
          table_id = 'Amon'
          lats = (10, 20)
          lons = (20, 29)
          dataset name: my_ds (xarray Dataset)
