# CMIP6 data

This notebook shows how to access CMIP6 data from the cloud.

## Import modules and libraries

*First, let's make sure the Python environment is set up correctly to run this notebook:*

In [None]:
import os, sys, urllib, tempfile
import urllib.request  # `import urllib` alone does not guarantee the `request` submodule is loaded

# Download the helper module `utils.py` from the course repository into a
# throw-away directory, import it from there and run its environment check.
with tempfile.TemporaryDirectory() as tmpdirname:
    # Make the temporary directory importable so `from utils import ...` finds the download
    sys.path.append(tmpdirname)
    repo = "https://raw.githubusercontent.com/obidam/ds2-2025/main/"
    # Build the URL by plain concatenation (repo ends with "/"): os.path.join
    # uses the OS path separator, which is a backslash on Windows and not valid in a URL.
    urllib.request.urlretrieve(repo + "utils.py",
                               os.path.join(tmpdirname, "utils.py"))
    from utils import check_up_env
    ds2tools = check_up_env(with_tuto=True)

In [None]:
import sys
import gcsfs
import xarray as xr
import intake
import zarr
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
create_map = ds2tools.create_map  # local shorthand for the map helper exposed by ds2tools

# this only needs to be created once
gcs = gcsfs.GCSFileSystem(token='anon')  # token='anon': no credentials are passed

# Notebook display settings: HTML repr for xarray objects, inline 'retina' figures
xr.set_options(display_style='html')
%matplotlib inline
%config InlineBackend.figure_format = 'retina' 

## Read the full CMIP6 catalog

In [None]:
# CSV listing of the CMIP6 zarr stores (one catalog row per store, judging by the file name)
catalog_url = 'https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv'
df_full = pd.read_csv(catalog_url)

# Preview 10 randomly drawn rows of the catalog
df_full.sample(n=10)

### Make a subset of it

In [None]:
# Other catalog selections that can be swapped in for the first .query() below:
# df = df_full.query("activity_id=='CMIP' & table_id == 'Omon' & variable_id == 'thetao' & experiment_id == 'historical' & member_id == 'r1i1p1f1'")
# df = df_full.query('institution_id == "CNRM-CERFACS" & member_id=="r1i1p1f2" & source_id=="CNRM-CM6-1"')
# df = df_full.query("activity_id=='CMIP' & table_id == 'Omon' & variable_id == 'thetao' & experiment_id == 'abrupt-4xCO2'")
#
# Other model/variable filters that can be swapped in for the second .query() below:
# df = df.query("source_id=='CNRM-CM6-1-HR' & variable_id=='thetao'") # Horizontal resolution up to 1/4 deg
# df = df.query("source_id=='CNRM-ESM2-1' & variable_id=='thetao'") # Horizontal resolution up to 1deg
#
# Other ordering:
# df = df.sort_values('version')
df = (
    df_full
    # 'historical' experiment, table 'Omon', institution CNRM-CERFACS
    .query("activity_id=='CMIP' & table_id == 'Omon' & institution_id == 'CNRM-CERFACS' & experiment_id == 'historical'")
    # one model, variables 'thetao' and 'so' only -- Horizontal resolution up to 1deg
    .query("source_id=='CNRM-ESM2-1' & (variable_id=='thetao' | variable_id=='so')")
    # order the remaining rows by ensemble member
    .sort_values('member_id')
)
df

## Read some data (1 row of the catalog)

In [None]:
def open_cmip6(df_row):
    """Open the CMIP6 zarr store referenced by one catalog row.

    Parameters
    ----------
    df_row : pandas.Series (or any mapping with a ``"zstore"`` key)
        One row of the CMIP6 catalog; its ``zstore`` entry is the path of
        the zarr store to open.

    Returns
    -------
    xarray.Dataset
        The dataset returned by ``xr.open_zarr`` for that store.

    Notes
    -----
    Uses the notebook-level ``gcs`` filesystem object. Prints the store path
    and the dataset size (``ds.nbytes / 1e9``) as a side effect.
    """
    # get the path to zarr store from the row that was passed in, not from the
    # global ``df``: every catalog row must open its own store
    zstore = df_row["zstore"]
    print(zstore)

    # create a mutable-mapping-style interface to the store
    mapper = gcs.get_mapper(zstore)

    # open it using xarray and zarr
    ds = xr.open_zarr(mapper, consolidated=True)
    print("Size of this dataset:", ds.nbytes/1e9,"Gb")

    return ds

# Open the store of the first row of the current selection and display it
ds = open_cmip6(df.iloc[0])
ds

### Play with it

In [None]:
# Add up the size (in Gb) of every dataset of the df selection.
# Note: `ds` ends up bound to the last dataset opened by the loop.
sizes_gb = []
for idx, row in df.iterrows():
    ds = open_cmip6(row)
    sizes_gb.append(ds.nbytes / 1e9)
total_size = sum(sizes_gb)
print("Size of the selection of datasets:", total_size, "Gb")

In [None]:
# 'thetao' on the vertical level closest to lev=0 (presumably the sea surface)
sst = ds['thetao'].sel({'lev': 0}, method='nearest')
sst

In [None]:
# Pick the time step closest to the requested date, then map it
snapshot = sst.sel(time='1978-05-28T12:00:00', method='nearest')
snapshot.plot()

In [None]:
# Blank out points with lat < 0 or lon < 360-275, then map the time step
# closest to the requested date with the axes restricted to xlim/ylim
lat_ok = sst['lat'] >= 0
lon_ok = sst['lon'] >= 360-275
masked = sst.where(lat_ok).where(lon_ok)
masked.sel(time='1978-05-28T12:00:00', method='nearest').plot(xlim=[0, 120], ylim=[140, 270])

### Horizontal resolution of the grid

In [None]:
# Difference between consecutive 'lat' values along y, taken at the first x index
dlat = ds['lat'].isel(x=0).diff('y')
dlat.plot()

In [None]:
# Difference between consecutive 'lon' values along x, taken at the first y index
dlon = ds['lon'].isel(y=0).diff('x')
dlon.plot()

## Connect to a cluster for large computation

In [None]:
import coiled
from dask.distributed import Client
# Connect to one of the existing class clusters (use one of the two names below):
# cluster = coiled.Cluster(name="ds2-highmem-binder", workspace="class-2025")
cluster = coiled.Cluster(name="ds2-highcpu-binder", workspace="class-2025")
# Dask client attached to that cluster
# NOTE(review): presumably this client then runs the dask computations of the following cells -- confirm
client = cluster.get_client()

## Compute climatology

In [None]:
# The current selection contains both 'thetao' and 'so' entries and the next
# cell reads ds['thetao']: pass a 'thetao' row explicitly instead of relying
# on the row order of df.
thetao_row = df[df['variable_id'] == 'thetao'].iloc[0]
ds = open_cmip6(thetao_row)  # open_cmip6 already prints the store path and the dataset size

In [None]:
%%time
# 'thetao' on the vertical level closest to lev=700
t = ds['thetao'].sel(lev=700, method='nearest')
# Optional sub-setting (uncomment to mask by latitude/longitude or by date):
# t = t.where(t['lat']>=30).where(t['lon']>=360-275)
# t = t.where(t['time']>=pd.to_datetime('2100-01-01'))
# t
# Mean over the 'time' dimension
# NOTE(review): presumably still lazy here, the actual computation being triggered in the next cell -- confirm
tm = t.mean(dim='time')
tm

In [None]:
%%time

# This should take about 9mins on ds2-highmem-binder and 4mins on ds2-highcpu-binder:
tm = tm.compute().persist()
tm

In [None]:
fig, proj, ax = create_map()

# Colour scale of the map: 18 levels between -2 and 15 with the 'bwr' colormap
color_opts = dict(vmin=-2, vmax=15, levels=18, cmap=mpl.colormaps.get_cmap('bwr'))
tm.plot(ax=ax, transform=proj, x='lon', y='lat', **color_opts)
# ax.gridlines(draw_labels=True)

# Land drawn in grey with a high zorder so that it sits above the plotted field
land_grey = [0.7]*3
ax.add_feature(cfeature.LAND, facecolor=land_grey, zorder=100)

# Title carrying the value of the selected 'lev' coordinate
title = "Ocean temperature climatology at z=%0.1f m" % tm['lev'].data
ax.set_title(title)

## Check out a high-resolution simulation

In [None]:
# Other catalog selections that can be swapped in for the first .query() below:
# df = df_full.query("activity_id=='CMIP' & table_id == 'Omon' & variable_id == 'thetao' & experiment_id == 'historical' & member_id == 'r1i1p1f1'")
# df = df_full.query("activity_id=='CMIP' & table_id == 'Omon' & institution_id == 'CNRM-CERFACS' & experiment_id == 'piControl'")
# df = df_full.query("activity_id=='CMIP' & table_id == 'Omon' & institution_id == 'CNRM-CERFACS'")
# df = df_full.query('institution_id == "CNRM-CERFACS" & member_id=="r1i1p1f2" & source_id=="CNRM-CM6-1"')
# df = df_full.query("activity_id=='CMIP' & table_id == 'Omon' & variable_id == 'thetao' & experiment_id == 'abrupt-4xCO2'")
#
# Other model/variable filters that can be swapped in for the second .query() below:
# df = df.query("source_id=='CNRM-CM6-1-HR'") # Horizontal resolution up to 1/4 deg
# df = df.query("source_id=='CNRM-ESM2-1' & variable_id=='thetao'") # Horizontal resolution up to 1deg
# df = df.query("source_id=='CNRM-ESM2-1' & (variable_id=='thetao' | variable_id=='so')") # Horizontal resolution up to 1deg
#
# Other ordering:
# df = df.sort_values('version')
df = (
    df_full
    # 'historical' experiment, table 'Omon', institution CNRM-CERFACS
    .query("activity_id=='CMIP' & table_id == 'Omon' & institution_id == 'CNRM-CERFACS' & experiment_id == 'historical'")
    # the 'HR' model, variable 'thetao' only -- Horizontal resolution up to 1/4 deg
    .query("source_id=='CNRM-CM6-1-HR' & variable_id=='thetao'")
    # order the remaining rows by ensemble member
    .sort_values('member_id')
)
df

In [None]:
# Boolean mask of the 'thetao' entries; keep the first matching catalog row
is_thetao = df['variable_id'] == 'thetao'
this_df = df[is_thetao].iloc[0]
# this_df = df[df['experiment_id'] == 'piControl'].iloc[0]

# Hand that row to open_cmip6 and display the resulting dataset
ds = open_cmip6(this_df)
ds

In [None]:
%%time
t = ds['thetao'].sel(lev=700, method='nearest')
tm = t.mean(dim='time')
tm = tm.compute().persist()
tm

In [None]:
fig, proj, ax = create_map()
# fig, proj, ax = create_map(extent=[-90, 0, 0, 80])

# Colour scale of the map: 18 levels between -2 and 15 with the 'bwr' colormap
color_opts = dict(vmin=-2, vmax=15, levels=18, cmap=mpl.colormaps.get_cmap('bwr'))
tm.plot(ax=ax, transform=proj, x='lon', y='lat', **color_opts)
# ax.gridlines(draw_labels=True)

# Land drawn in grey with a high zorder so that it sits above the plotted field
land_grey = [0.7]*3
ax.add_feature(cfeature.LAND, facecolor=land_grey, zorder=100)

# Title carrying the value of the selected 'lev' coordinate
title = "Ocean temperature climatology at z=%0.1f m" % tm['lev'].data
ax.set_title(title)