*If you run this notebook at colab.research.google.com, you need to install packages with the following command*:

In [None]:
# !pip install --upgrade gcsfs intake intake-xarray zarr

# Access to data in the cloud (GCS)

In [2]:
import sys
import gcsfs
import xarray as xr
import intake
import pandas as pd

## Read data from Google Cloud Storage (gcsfs)

### Access and listing

In [2]:
# Anonymous, read-only handle on Google Cloud Storage:
fs = gcsfs.GCSFileSystem(
    project='alert-ground-261008',
    token='anon',
    access='read_only',
)

# Show what the bucket contains:
fs.ls('opendata_bdo2020')

['opendata_bdo2020/EN.4.2.1.f.analysis.g10.zarr',
 'opendata_bdo2020/GLOBAL_ARGO_SDL2000',
 'opendata_bdo2020/GLOB_HOMOGENEOUS_variables.zarr',
 'opendata_bdo2020/Global_Argo_VerticalMean_Temperature.zarr',
 'opendata_bdo2020/dt_global_allsat_phy_l4_mm']

However, data access with ``gcsfs`` is critically dependent on the GCS set-up. For instance, the following project does not allow listing the bucket content:

In [None]:
# Same anonymous, read-only access, but listing this bucket is expected to fail:
fs2 = gcsfs.GCSFileSystem(project='alert-ground-261008', token='anon', access='read_only')
try:
    fs2.ls('data_bdo2020')
except Exception as err:
    # The failure is the demonstration: report which exception type was raised.
    # Catching Exception (rather than a bare `except:`) lets KeyboardInterrupt
    # and SystemExit propagate, so the kernel can still be interrupted.
    print(type(err))

On the other hand, some datasets may not be free and may use a requester-pays model. 
In this case, you would have to properly manage authentication:

In [None]:
# Anonymous access attempt on another project's bucket:
fs3 = gcsfs.GCSFileSystem(
    project='poised-honor-358',
    token='anon',
)
try:
    fs3.ls('sonific01')
except ValueError as err:
    # Only the error message is shown, not the full traceback.
    print(err)

### Load data

In [None]:
# Expose the zarr store in the bucket as a key/value mapping, then open it with xarray:
gcsmap = fs.get_mapper("opendata_bdo2020/EN.4.2.1.f.analysis.g10.zarr")
ds = xr.open_zarr(gcsmap)

# Alternative kept for reference: go through xr.open_dataset with the zarr engine.
# ds = xr.open_dataset("gcs://opendata_bdo2020/EN.4.2.1.f.analysis.g10.zarr",
#                      backend_kwargs={"storage_options": {"project": "alert-ground-261008", "token": 'anon', 'access':'read_only'}},
#                     engine="zarr")

# Report the dataset size (ds.nbytes scaled by 1e9), then its text summary:
print("Size of the dataset:", ds.nbytes / 1e9, "Gb")
print(ds)

In [None]:
# Second example: this store is opened with consolidated=False.
gcsmap = fs.get_mapper('opendata_bdo2020/GLOBAL_ARGO_SDL2000')
ds = xr.open_zarr(
    gcsmap,
    consolidated=False,
)
print("Size of the dataset:", ds.nbytes / 1e9, "Gb")
print(ds)

## Use intake catalog of data

The catalog also uses the gcsfs entry point, but with intake it's transparent to the user:

### Access and listing of the catalog

In [3]:
from intake import open_catalog

In [4]:
# The catalog is a YAML file published on GitHub; iterating it yields the entry names.
catalog_url = 'https://raw.githubusercontent.com/obidam/ds2-2022/main/ds2_data_catalog.yml'
cat = open_catalog(catalog_url)
[entry for entry in cat]

['argo_global_sdl',
 'argo_global_sdl_homogeneous',
 'argo_global_vertical_mean',
 'en4',
 'sea_surface_height']

### Load data

In [None]:
# Fetch the 'en4' catalog entry with read_chunked():
ds = cat['en4'].read_chunked()
print("Size of the dataset:", ds.nbytes / 1e9, "Gb")
ds

In [None]:
# Same idea for the 'sea_surface_height' entry, this time through to_dask():
ds = cat["sea_surface_height"].to_dask()
print("Size of the dataset:", ds.nbytes / 1e9, "Gb")
ds

# Pangeo data

https://github.com/pangeo-data/pangeo-datastore

https://catalog.pangeo.io/

## Explore catalog

In [None]:
# Open the Pangeo master catalog and list its top-level entries.
# `intake` is already imported in the notebook's import cell, so the function is
# reached through the module instead of repeating an import in this cell.
pangeo_cat = intake.open_catalog("https://raw.githubusercontent.com/pangeo-data/pangeo-datastore/master/intake-catalogs/master.yaml")
list(pangeo_cat)

In [None]:
# Names of the entries found in the 'ocean' sub-catalog:
[entry for entry in pangeo_cat.ocean]

# Other things to try:
# print(list(pangeo_cat.atmosphere))
# print(list(pangeo_cat.hydro))
# pangeo_cat.walk(depth=5)

# CMIP6 data

In [4]:
# Anonymous GCS file system handle, reused by the CMIP6 cells below.
# This only needs to be created once.
gcs = gcsfs.GCSFileSystem(token='anon')

In [7]:
# Read the CSV index of the CMIP6 zarr stores, then peek at 10 random rows:
df_full = pd.read_csv(
    'https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv'
)
df_full.sample(n=10)

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year,version
348810,DCPP,NCAR,CESM1-1-CAM5-CMIP5,dcppA-hindcast,r15i1p1f1,Amon,rlds,gn,gs://cmip6/DCPP/NCAR/CESM1-1-CAM5-CMIP5/dcppA-...,1986.0,20191007
320103,DCPP,NCAR,CESM1-1-CAM5-CMIP5,dcppA-hindcast,r40i1p1f1,Amon,rsus,gn,gs://cmip6/DCPP/NCAR/CESM1-1-CAM5-CMIP5/dcppA-...,2017.0,20191007
375443,CMIP,CAS,FGOALS-f3-L,abrupt-4xCO2,r2i1p1f1,Amon,tauu,gr,gs://cmip6/CMIP6/CMIP/CAS/FGOALS-f3-L/abrupt-4...,,20191018
332594,DCPP,NCAR,CESM1-1-CAM5-CMIP5,dcppA-hindcast,r9i1p1f1,Amon,rlutcs,gn,gs://cmip6/DCPP/NCAR/CESM1-1-CAM5-CMIP5/dcppA-...,1970.0,20191007
88803,CMIP,CCCma,CanESM5-CanOE,historical,r3i1p2f1,Amon,rsdscs,gn,gs://cmip6/CMIP6/CMIP/CCCma/CanESM5-CanOE/hist...,,20190429
209025,CMIP,INM,INM-CM5-0,piControl,r1i1p1f1,Amon,rtmt,gr1,gs://cmip6/CMIP6/CMIP/INM/INM-CM5-0/piControl/...,,20190619
472968,CMIP,FIO-QLNM,FIO-ESM-2-0,piControl,r1i1p1f1,Amon,tasmin,gn,gs://cmip6/CMIP6/CMIP/FIO-QLNM/FIO-ESM-2-0/piC...,,20200921
381519,CMIP,NCC,NorESM2-MM,piControl,r1i1p1f1,Amon,ua,gn,gs://cmip6/CMIP6/CMIP/NCC/NorESM2-MM/piControl...,,20191108
235263,CMIP,MPI-M,MPI-ESM1-2-LR,historical,r2i1p1f1,Omon,o2min,gn,gs://cmip6/CMIP6/CMIP/MPI-M/MPI-ESM1-2-LR/hist...,,20190710
138901,ScenarioMIP,CCCma,CanESM5,ssp119,r25i1p2f1,Omon,tauvo,gn,gs://cmip6/CMIP6/ScenarioMIP/CCCma/CanESM5/ssp...,,20190429


In [None]:
# Narrow the full index in two steps, then order the result by member_id:
#   1. pick activity, table, institution and experiment;
#   2. keep one source model and the 'thetao' / 'so' variables.
df = (
    df_full
    .query("activity_id=='CMIP' & table_id == 'Omon' & institution_id == 'CNRM-CERFACS' & experiment_id == 'historical'")
    .query("source_id=='CNRM-ESM2-1' & (variable_id=='thetao' | variable_id=='so')")  # Horizontal resolution up to 1deg
    .sort_values('member_id')
)

# Alternative first-step selections:
# df = df_full.query("activity_id=='CMIP' & table_id == 'Omon' & variable_id == 'thetao' & experiment_id == 'historical' & member_id == 'r1i1p1f1'")
# df = df_full.query('institution_id == "CNRM-CERFACS" & member_id=="r1i1p1f2" & source_id=="CNRM-CM6-1"')
# df = df_full.query("activity_id=='CMIP' & table_id == 'Omon' & variable_id == 'thetao' & experiment_id == 'abrupt-4xCO2'")

# Alternative second-step selections:
# df = df.query("source_id=='CNRM-CM6-1-HR' & variable_id=='thetao'") # Horizontal resolution up to 1/4 deg
# df = df.query("source_id=='CNRM-ESM2-1' & variable_id=='thetao'") # Horizontal resolution up to 1deg

# Alternative ordering:
# df = df.sort_values('version')
df

In [None]:
# Get the path to one specific zarr store: the last 'thetao' row of the dataframe above.
# The explicit filter matters because df also holds 'so' stores, and the next cell
# reads ds['thetao']; taking the last row blindly could pick a store without it.
zstore = df.query("variable_id=='thetao'").zstore.values[-1]
print(zstore)

# create a mutable-mapping-style interface to the store
mapper = gcs.get_mapper(zstore)

# open it using xarray and zarr
ds = xr.open_zarr(mapper, consolidated=True)
print("Size of the dataset:", ds.nbytes/1e9,"Gb")

ds

In [None]:
# Take 'thetao' at the level nearest to lev=0:
sst = (
    ds['thetao']
    .sel(lev=0, method='nearest')
)
sst

In [None]:
def open_cmip6(df_row):
    """Open the zarr store referenced by one row of the CMIP6 index.

    Parameters
    ----------
    df_row : mapping-like (e.g. a row of the CMIP6 dataframe)
        Must provide the store path under the key ``"zstore"``.

    Returns
    -------
    The object returned by ``xr.open_zarr`` for that store, opened with
    ``consolidated=True``.
    """
    # Use the row that was passed in. The previous version ignored the argument
    # and always opened the last store of the global `df`.
    zstore = df_row["zstore"]

    # create a mutable-mapping-style interface to the store
    mapper = gcs.get_mapper(zstore)

    # open it using xarray and zarr
    return xr.open_zarr(mapper, consolidated=True)

# Try the helper on the first row of the selection:
ds = open_cmip6(df.iloc[0])
print("Size of the dataset:", ds.nbytes / 1e9, "Gb")
ds

In [None]:
# Add up the size (nbytes / 1e9) of every store referenced in df:
total_size = 0 # Gb
for index, row in df.iterrows():
    ds = open_cmip6(row)
    total_size = total_size + ds.nbytes / 1e9
print("Size of the selection of datasets:", total_size, "Gb")