In [None]:
import pandas as pd
import gcsfs
import xarray as xr
import os

In [None]:
# Anonymous, read-only handle on Google Cloud Storage
fs = gcsfs.GCSFileSystem(access='read_only', token='anon')

# for Google Cloud: this file has locations of all CMIP6 data in the Pangeo Google Cloud Collection
catalog_url = "https://cmip6.storage.googleapis.com/pangeo-cmip6-noQC.csv.gz"
df = pd.read_csv(catalog_url)

In [None]:
# Report how many rows the catalog has and which columns it provides
column_names = list(df.columns)
print(len(df), 'datasets, with columns:\n', column_names)

In [None]:
# Distinct values found in the activity_id column of the catalog
df['activity_id'].unique()

In [None]:
# This is a useful function for finding the datasets of interest

def search_df(df, verbose=False, **search):
    """Filter a catalog DataFrame by column values.

    Each keyword argument names a column of ``df`` to filter on:

    * a ``str`` keeps the rows whose column contains that string.  The
      string is passed to ``Series.str.contains`` with pandas' default
      pattern handling; rows with a missing value in that column never match.
    * any other iterable keeps the rows whose column equals one of the given
      values exactly.  Rows come back grouped in the order the values are
      listed; an empty iterable yields an empty result.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to search.
    verbose : bool, optional
        If true, print the remaining unique values of every summary column
        that was not constrained by an exact (list) match.
    **search : str or iterable
        Column name -> substring or collection of exact values.

    Returns
    -------
    pandas.DataFrame
        The matching rows after ``reset_index()``, so the previous index is
        kept as a regular column.

    Raises
    ------
    KeyError
        If a keyword does not name a column of ``df``.
    """
    keys = ['activity_id','institution_id','source_id','experiment_id','member_id', 'table_id', 'variable_id', 'grid_label']
    d = df
    for skey, wanted in search.items():
        if isinstance(wanted, str):  # match a string as a substring
            # na=False: a missing value would otherwise put NaN in the mask
            # and make the boolean indexing raise.
            d = d[d[skey].str.contains(wanted, na=False)]
        else:                        # match a list of strings exactly
            parts = [d[d[skey] == value] for value in wanted]
            # pd.concat refuses an empty list, so fall back to an empty slice
            # that still carries the columns.
            d = pd.concat(parts) if parts else d.iloc[0:0]
            # Searches on columns outside the summary list (e.g. 'zstore')
            # are allowed; there is simply nothing to drop from the summary.
            if skey in keys:
                keys.remove(skey)
    if verbose:
        for key in keys:
            print(key,' = ',list(d[key].unique()))
    return d.reset_index()

In [None]:
# Search criteria: one catalog column per key, each with the list of values to keep
asearch = {
    'experiment_id': ['ssp585'],
    'table_id': ['Amon'],
    'variable_id': ['tasmin', 'tasmax'],
    # add more as needed
    #'grid_label': ['gn'],
}

# Find all datasets matching this search:
df_subset = search_df(df, **asearch)

# Print first 5 entries
df_subset.head()

In [None]:
# Summarize the search: number of matching rows and number of distinct source_id values among them
print(f'There are {len(df_subset)} datasets in {df_subset.source_id.nunique()} models matching this search')

In [None]:
# Download and save all datasets in netcdf format
#
# change path to where you want to keep these files
path = './'

for index, row in df_subset.iterrows():
    zstore = row['zstore']
    print(f'\navailable dataset: {zstore}')
    varname = row['variable_id']
    # Keep the part of the store path between the bucket prefix and the
    # variable directory, then drop its first two components for the file name.
    fullname = zstore.split('gs://cmip6/CMIP6/')[-1].split(f'/{varname}')[0]
    # NOTE(review): whatever follows the variable directory in the store path
    # (presumably grid label and version) is not part of the file name, so
    # stores differing only there would map to the same file and the later
    # one would be skipped as "already exists" - TODO confirm.
    # os.path.join tolerates a `path` given without a trailing separator.
    shortname = os.path.join(path, varname + '_' + '_'.join(fullname.split('/')[2:]))

    ncfile = f'{shortname}.nc'
    if os.path.exists(ncfile):
        print(f'{ncfile} already exists')
        continue

    print(f'saving {ncfile}')
    mapper = fs.get_mapper(zstore)
    # Write to a temporary name first: an interrupted transfer must not leave
    # a truncated file under the final name, because the existence check above
    # would then skip it on every later run.
    tmpfile = f'{shortname}.part.nc'
    try:
        # The with-block closes the dataset once it has been written.
        with xr.open_zarr(mapper, consolidated=True) as ds:
            ds.to_netcdf(tmpfile)
        os.replace(tmpfile, ncfile)
    finally:
        # Only still present if the write or the rename failed.
        if os.path.exists(tmpfile):
            os.remove(tmpfile)