# Sea Surface Level
Notebook to migrate the sea level NetCDF files to CF-compliant NetCDF and Zarr.

In [1]:
# Optional: automatic code formatting of the cells. Enable at most one of the two extensions below.
# JupyterLab variant (lab_black extension):
#%load_ext lab_black
# Classic Jupyter Notebook variant (nb_black extension):
%load_ext nb_black

<IPython.core.display.Javascript object>

### Configure OS-independent paths

In [6]:
# Import standard packages
import os
import pathlib
import sys
import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
import netCDF4 as nc
import numpy.ma as ma

# Put the parent of the working directory on the import path so that
# packages living next to the notebook folder can be imported
cwd = pathlib.Path().resolve()
sys.path.append(str(cwd.parent))

# Home directory of the current user and the filesystem root it sits on
home = pathlib.Path.home()
root = home.root

# Import custom functionality
from coclicodata.drive_config import p_drive
from coclicodata.etl.cf_compliancy_checker import check_compliancy, save_compliancy

# Define (local and) remote drives
coclico_data_dir = p_drive.joinpath("11207608-coclico", "FASTTRACK_DATA")

# Workaround for the Windows (10) udunits error after installing cfchecker:
# https://github.com/SciTools/iris/issues/404
# Change the relative path below to the udunits2.xml file of your own Python installation.
udunits2_xml = home.joinpath(
    r"Anaconda3\pkgs\udunits2-2.2.28-h892ecd3_0\Library\share\udunits\udunits2.xml"
)
if udunits2_xml.is_file():
    os.environ["UDUNITS2_XML_PATH"] = str(udunits2_xml)
else:
    # Only override the variable when the file really exists: on other machines or
    # operating systems the hard-coded path is absent, and pointing UDUNITS2_XML_PATH
    # at a missing file would replace a working default with a broken one.
    print(f"UDUNITS2_XML_PATH left unchanged: {udunits2_xml} does not exist")

<IPython.core.display.Javascript object>

In [7]:
# Project paths & files (manual input)
ds_dir = coclico_data_dir / "03_sea_level_jrc"  # folder holding the JRC sea level files
ds_historical_path = ds_dir / "CoastAlRisk_Europe_ESL_Historical.nc"
ds_rcp45_path = ds_dir / "CoastAlRisk_Europe_ESL_RCP45.nc"
ds_rcp85_path = ds_dir / "CoastAlRisk_Europe_ESL_RCP85.nc"
ds_out_file = "CoastAlRisk_Europe_ESL"  # base name (no extension) of the merged output
CF_dir = coclico_data_dir / "CF"  # directory to save output CF check files

<IPython.core.display.Javascript object>

### Change NetCDF base files

In [None]:
# Reshape the historical dataset so it can be merged into the RCP45 and RCP85 ones:
# "esl" and "eewl" get an extra trailing "nsdec" axis of length 1 and a "decades"
# variable holding the single value 1995 is added.
dataset_new = "CoastAlRisk_Europe_ESL_Historical_new.nc"

# variables that are extended with the "nsdec" axis
extended_var = ["esl", "eewl"]

# Both files are opened in a `with` block so they are closed when the cell finishes;
# the new file has to be closed before the next cell reads it back with xarray.
with nc.Dataset(ds_historical_path) as ds, nc.Dataset(
    os.path.join(ds_dir, dataset_new), "w"
) as ds_new:
    # copy global attributes all at once via dictionary
    ds_new.setncatts(ds.__dict__)

    # copy dimensions (an unlimited dimension stays unlimited)
    for name, dimension in ds.dimensions.items():
        ds_new.createDimension(
            name, (len(dimension) if not dimension.isunlimited() else None)
        )

    # adding a new dimension
    ds_new.createDimension("nsdec", 1)  # only for 1995

    # copy all variables; the extended ones receive the additional "nsdec" axis
    for name, variable in ds.variables.items():
        if name in extended_var:
            # assumes the source variable is laid out as (npoints, nrp, nens) - TODO confirm
            new_dims = ("npoints", "nrp", "nens", "nsdec")
            new_values = np.expand_dims(variable[:], axis=3)
        else:
            new_dims = variable.dimensions
            new_values = variable[:]

        new_variable = ds_new.createVariable(name, variable.datatype, new_dims)
        # copy variable attributes all at once via dictionary
        new_variable.setncatts(variable.__dict__)
        new_variable[:] = new_values

    # adding the new "decades" variable along "nsdec"
    decades = ds_new.createVariable("decades", "float32", ("nsdec",))
    decades[:] = ma.masked_array(int(1995), mask=[0], dtype="float32")

In [None]:
# Combine the reshaped historical dataset with each RCP scenario along "nsdec".

# Path of the reshaped historical file ("<name>_new.nc" next to the original).
# Built with pathlib instead of str(path).split(".")[0], which would cut the path
# at the first dot anywhere in it (e.g. a dot in a directory or user name).
ds_historical_new_path = ds_historical_path.with_name(
    ds_historical_path.stem + "_new.nc"
)

ens_list_45 = [
    xr.open_mfdataset(str(ds_historical_new_path)),
    xr.open_mfdataset(ds_rcp45_path),
]
ds_comb_45 = xr.concat(
    ens_list_45, data_vars="different", dim="nsdec"
)  # only concat files that are different

ens_list_85 = [
    xr.open_mfdataset(str(ds_historical_new_path)),
    xr.open_mfdataset(ds_rcp85_path),
]
ds_comb_85 = xr.concat(
    ens_list_85, data_vars="different", dim="nsdec"
)  # only concat files that are different

In [None]:
# Save the combined datasets as "<original name>_new.nc" next to the original files.
# The target names are built with pathlib instead of str(path).split(".")[0], which
# would cut the path at the first dot anywhere in it (e.g. a dot in a directory name).
ds_comb_45.to_netcdf(
    str(ds_rcp45_path.with_name(ds_rcp45_path.stem + "_new.nc"))
)  # Export netcdf file
ds_comb_85.to_netcdf(
    str(ds_rcp85_path.with_name(ds_rcp85_path.stem + "_new.nc"))
)  # Export netcdf file

### Check CF compliancy of the original NetCDF files

In [None]:
# From here on the path variables point at the reshaped / combined "_new.nc" files
ds_historical_path = ds_dir / "CoastAlRisk_Europe_ESL_Historical_new.nc"
ds_rcp45_path = ds_dir / "CoastAlRisk_Europe_ESL_RCP45_new.nc"
ds_rcp85_path = ds_dir / "CoastAlRisk_Europe_ESL_RCP85_new.nc"

In [None]:
# Open the three NetCDF files as xarray datasets
ds_hist, ds_45rcp, ds_85rcp = (
    xr.open_dataset(nc_path)
    for nc_path in (ds_historical_path, ds_rcp45_path, ds_rcp85_path)
)

# Display the historical dataset as a first inspection
ds_hist

In [None]:
%%capture cap --no-stderr
# Run check_compliancy on the historical file before any CF alterations.
# The cell magic stores the cell's stdout in `cap` (stderr is not captured) so the
# next cell can hand it to save_compliancy.

check_compliancy(testfile=ds_historical_path, working_dir=CF_dir)

In [None]:
# Hand the output captured in `cap` by the previous cell to save_compliancy for the
# historical file (presumably written as a report into CF_dir - confirm in
# coclicodata.etl.cf_compliancy_checker). Must run directly after the capture cell.
save_compliancy(cap, testfile=ds_historical_path, working_dir=CF_dir)

In [None]:
%%capture cap --no-stderr
# Run check_compliancy on the RCP45 file before any CF alterations.
# The cell magic stores the cell's stdout in `cap` (stderr is not captured) so the
# next cell can hand it to save_compliancy.

check_compliancy(testfile=ds_rcp45_path, working_dir=CF_dir)

In [None]:
# Hand the output captured in `cap` by the previous cell to save_compliancy for the
# RCP45 file. Must run directly after the matching capture cell, because `cap` is
# overwritten by every capture cell.
save_compliancy(cap, testfile=ds_rcp45_path, working_dir=CF_dir)

In [None]:
%%capture cap --no-stderr
# Run check_compliancy on the RCP85 file before any CF alterations.
# The cell magic stores the cell's stdout in `cap` (stderr is not captured) so the
# next cell can hand it to save_compliancy.

check_compliancy(testfile=ds_rcp85_path, working_dir=CF_dir)

In [None]:
# Hand the output captured in `cap` by the previous cell to save_compliancy for the
# RCP85 file. Must run directly after the matching capture cell, because `cap` is
# overwritten by every capture cell.
save_compliancy(cap, testfile=ds_rcp85_path, working_dir=CF_dir)

### Make CF compliant alterations to the NetCDF files (dataset dependent)

In [None]:
# NetCDF attribute alterations, applied in place to the three opened datasets.
# Every step checks for the attribute / variable it touches instead of assuming it,
# so the cell can be re-run without raising on already-converted datasets.

# global attributes whose names contain a space -> underscore variant
renamed_global_attrs = {
    "Project Name": "Project_Name",
    "Project Acronym": "Project_Acronym",
}

for cf_ds in (ds_hist, ds_45rcp, ds_85rcp):
    # rename global attribute names
    for old_name, new_name in renamed_global_attrs.items():
        if old_name in cf_ds.attrs:
            cf_ds.attrs[new_name] = cf_ds.attrs.pop(old_name)

    # add global attributes
    cf_ds.attrs["Conventions"] = "CF-1.8"

    # remove certain variable attributes (re-added as global attributes below)
    for stale_attr in ("Starting date", "End date"):
        cf_ds["rp"].attrs.pop(stale_attr, None)

# period covered by each dataset, stored as global attributes
ds_hist.attrs["Starting_date"] = "01-Dec-1969"
ds_hist.attrs["End_date"] = "30-Nov-2004 21:00:00"
ds_45rcp.attrs["Starting_date"] = "01-Dec-2009"
ds_45rcp.attrs["End_date"] = "30-Nov-2099 21:00:00"
ds_85rcp.attrs["Starting_date"] = "01-Dec-2009"
ds_85rcp.attrs["End_date"] = "30-Nov-2099 21:00:00"

# add or change certain variable / coordinate attributes
dataset_attributes = {
    "decades": {"long_name": "decade window", "Format": "YYYY", "units": "yr"}
}  # specify custom (CF convention) attributes

# add / overwrite attributes; variables missing from a dataset are skipped explicitly
# (previously a bare `except` silently hid every kind of error here)
for k, v in dataset_attributes.items():
    for cf_ds in (ds_hist, ds_45rcp, ds_85rcp):
        if k in cf_ds.variables:
            cf_ds[k].attrs = v

In [None]:
# NetCDF variable and dimension alterations


def _restructure_for_cf(ds):
    """Return a copy of ``ds`` with renamed dimensions / variables and string ensemble labels.

    Steps, in order:
    1. rename the dimensions npoints -> stations, nens -> nensemble, nsdec -> time;
    2. swap the "nrp" dimension for the existing "rp" variable;
    3. rename the variables longitude -> lon, latitude -> lat, ensmbl -> ensemble,
       decades -> time;
    4. promote lon, lat, rp and time to coordinates to avoid duplication of
       dimensions in a later stage;
    5. replace the ensemble values by the byte strings "min", "mean", "max" along
       "nensemble", keeping the variable's attributes except "Contents".
    """
    ds = (
        ds.rename_dims({"npoints": "stations", "nens": "nensemble", "nsdec": "time"})
        .swap_dims({"nrp": "rp"})
        .rename_vars(
            {
                "longitude": "lon",
                "latitude": "lat",
                "ensmbl": "ensemble",
                "decades": "time",
            }
        )
        .set_coords(["lon", "lat", "rp", "time"])
    )

    # copy the attributes along, leaving out "Contents"; filtering (instead of `del`)
    # does not fail when the attribute is absent
    ensemble_attrs = {
        key: value for key, value in ds["ensemble"].attrs.items() if key != "Contents"
    }
    return ds.assign_coords(
        {
            "ensemble": (
                "nensemble",
                np.array(["min", "mean", "max"], dtype="S"),
                ensemble_attrs,
            )
        }
    )


# apply the same restructuring to all three datasets
ds_hist = _restructure_for_cf(ds_hist)
ds_45rcp = _restructure_for_cf(ds_45rcp)
ds_85rcp = _restructure_for_cf(ds_85rcp)

In [None]:
# Stack the two RCP datasets along a new "nscenarios" dimension and label the
# entries with a byte-string "scenarios" coordinate
scenario_labels = np.array(["RCP45", "RCP85"], dtype="S")
dataset = xr.concat([ds_45rcp, ds_85rcp], dim="nscenarios").assign_coords(
    scenarios=("nscenarios", scenario_labels)
)

# Alternative (not used): concatenate along a pandas index, e.g.
# pd.Index(["historical", "rcp45", "rcp85"], name="scenarios"), to also include the
# historical dataset as its own scenario.

# dataset["scenarios"].values.astype("U") # retrieve scenarios as string

In [None]:
# re-order shape of the data variables
ds_hist = ds_hist.transpose("stations", "rp", "time", "nensemble")
ds_45rcp = ds_45rcp.transpose("stations", "rp", "time", "nensemble")
ds_85rcp = ds_85rcp.transpose("stations", "rp", "time", "nensemble")
dataset = dataset.transpose("nscenarios", "stations", "rp", "time", "nensemble")

# add or change certain variable / coordinate attributes
dataset_attributes = {
    "scenarios": {"long_name": "climate scenarios"}
}  # specify custom (CF convention) attributes

# add / overwrite attributes; a variable missing from the dataset is skipped explicitly
# (previously a bare `except` silently hid every kind of error here)
for k, v in dataset_attributes.items():
    if k in dataset.variables:
        dataset[k].attrs = v

# add epsg code as global "crs" attribute to every dataset
for cf_ds in (ds_hist, ds_45rcp, ds_85rcp, dataset):
    cf_ds.attrs["crs"] = 4326

In [None]:
# Inspect the merged xarray dataset. Best practice is to have as many bold dimensions
# as possible in the repr (dimension name == coordinate name); the Front-End can then
# access the variable directly without having to index it first.

dataset
# dataset["scenarios"]  # uncomment to inspect the scenario labels instead

In [None]:
# Write each altered dataset next to its source file with a "_CF.nc" suffix
for cf_ds, source_path in (
    (ds_hist, ds_historical_path),
    (ds_45rcp, ds_rcp45_path),
    (ds_85rcp, ds_rcp85_path),
):
    cf_ds.to_netcdf(path=str(source_path).replace(".nc", "_CF.nc"))

# The merged scenario dataset gets its own file name based on ds_out_file
dataset.to_netcdf(path=ds_dir.joinpath(ds_out_file + "_CF.nc"))

In [None]:
# Re-open the CF-altered historical file for inspection. The name is derived exactly
# as in the save cell (".nc" -> "_CF.nc"); the former "_new_CF.nc" suffix pointed at
# a file that the save cell never writes.
ds = xr.open_dataset(str(ds_historical_path).replace(".nc", "_CF.nc"))

In [None]:
# Display the "ensemble" variable of the re-opened dataset
ds["ensemble"]

### Check CF compliancy of the altered NetCDF files

In [None]:
%%capture cap --no-stderr
# Run check_compliancy on the altered ("_CF.nc") historical file.
# The cell magic stores the cell's stdout in `cap` (stderr is not captured) so the
# next cell can hand it to save_compliancy.

check_compliancy(testfile=str(ds_historical_path).replace(".nc", "_CF.nc"), working_dir=CF_dir)

In [None]:
# Hand the output captured in `cap` by the previous cell to save_compliancy for the
# altered ("_CF.nc") historical file. Must run directly after the matching capture cell.
save_compliancy(
    cap, testfile=str(ds_historical_path).replace(".nc", "_CF.nc"), working_dir=CF_dir,
)

In [None]:
%%capture cap --no-stderr
# Run check_compliancy on the altered ("_CF.nc") RCP45 file.
# The cell magic stores the cell's stdout in `cap` (stderr is not captured) so the
# next cell can hand it to save_compliancy.

check_compliancy(testfile=str(ds_rcp45_path).replace(".nc", "_CF.nc"), working_dir=CF_dir)

In [None]:
# Hand the output captured in `cap` by the previous cell to save_compliancy for the
# altered ("_CF.nc") RCP45 file. Must run directly after the matching capture cell.
save_compliancy(
    cap, testfile=str(ds_rcp45_path).replace(".nc", "_CF.nc"), working_dir=CF_dir,
)

In [None]:
%%capture cap --no-stderr
# Run check_compliancy on the altered ("_CF.nc") RCP85 file.
# The cell magic stores the cell's stdout in `cap` (stderr is not captured) so the
# next cell can hand it to save_compliancy.

check_compliancy(testfile=str(ds_rcp85_path).replace(".nc", "_CF.nc"), working_dir=CF_dir)

In [None]:
# Hand the output captured in `cap` by the previous cell to save_compliancy for the
# altered ("_CF.nc") RCP85 file. Must run directly after the matching capture cell.
save_compliancy(
    cap, testfile=str(ds_rcp85_path).replace(".nc", "_CF.nc"), working_dir=CF_dir,
)

In [None]:
%%capture cap --no-stderr
# Run check_compliancy on the merged scenario file (ds_out_file + "_CF.nc").
# The cell magic stores the cell's stdout in `cap` (stderr is not captured) so the
# next cell can hand it to save_compliancy.

check_compliancy(testfile=ds_dir.joinpath(ds_out_file + "_CF.nc"), working_dir=CF_dir)

In [None]:
# Hand the output captured in `cap` by the previous cell to save_compliancy for the
# merged scenario file. Must run directly after the matching capture cell.
save_compliancy(
    cap, testfile=ds_dir.joinpath(ds_out_file + "_CF.nc"), working_dir=CF_dir,
)

### Write data to Zarr files

In [None]:
# Write the merged dataset as "<ds_out_file>.zarr" inside ds_dir; mode="w" replaces an existing store
dataset.to_zarr(ds_dir.joinpath(f"{ds_out_file}.zarr"), mode="w")

### Open written dataset

In [8]:
# Read the Zarr store back to verify what was written
check = xr.open_zarr(ds_dir.joinpath(f"{ds_out_file}.zarr"))

<IPython.core.display.Javascript object>

In [28]:
# Display the "scenarios" coordinate of the re-opened Zarr store
check.scenarios

Unnamed: 0,Array,Chunk
Bytes,10 B,10 B
Shape,"(2,)","(2,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,|S5 numpy.ndarray,|S5 numpy.ndarray
"Array Chunk Bytes 10 B 10 B Shape (2,) (2,) Dask graph 1 chunks in 2 graph layers Data type |S5 numpy.ndarray",2  1,

Unnamed: 0,Array,Chunk
Bytes,10 B,10 B
Shape,"(2,)","(2,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,|S5 numpy.ndarray,|S5 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,10 B,10 B
Shape,"(2,)","(2,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,|S5 numpy.ndarray,|S5 numpy.ndarray
"Array Chunk Bytes 10 B 10 B Shape (2,) (2,) Dask graph 1 chunks in 2 graph layers Data type |S5 numpy.ndarray",2  1,

Unnamed: 0,Array,Chunk
Bytes,10 B,10 B
Shape,"(2,)","(2,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,|S5 numpy.ndarray,|S5 numpy.ndarray


<IPython.core.display.Javascript object>

In [24]:
# Display the values of the "time" coordinate of the re-opened Zarr store
check.time.values

array([1995., 2010., 2020., 2030., 2040., 2050., 2060., 2070., 2080.,
       2090., 2100.], dtype=float32)

<IPython.core.display.Javascript object>