# Sea Level Rise AR6
Notebook environment to migrate NetCDF files to CF-compliant Zarr & CoG (cloud-optimized GeoTIFF).
Note: this notebook is still quite messy. TODO: clean up.

In [None]:
# Optional; code formatter, installed as jupyter lab extension
#%load_ext lab_black
# Optional; code formatter, installed as jupyter notebook extension
%load_ext nb_black

### Configure OS independent paths

In [None]:
# Import standard packages
import os
import pathlib
import sys
import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
import netCDF4 as nc
import numpy.ma as ma
import rasterio
import rioxarray as rio
from datacube.utils.cog import write_cog

# Expose the parent of the working directory to the import system, so that
# packages living next to the notebook folder can be imported
cwd = pathlib.Path(".").resolve()
sys.path.append(str(cwd.parent))

# Derive the user-level base paths
home = pathlib.Path.home()
root = home.root
tmp_dir = home / "data" / "tmp"

# Import custom functionality
from etl import p_drive
from etl.CF_compliancy_checker import check_compliancy, save_compliancy

# Remote project data directory on the shared drive
coclico_data_dir = p_drive.joinpath("11205479-coclico", "FASTTRACK_DATA")

# Workaround to the Windows OS (10) udunits error after installation of cfchecker: https://github.com/SciTools/iris/issues/404
# NOTE: change the relative path below to the udunits2.xml file dir in your Python installation
udunits_xml_path = home.joinpath(
    r"Anaconda3\pkgs\udunits2-2.2.28-h892ecd3_0\Library\share\udunits\udunits2.xml"
)
os.environ["UDUNITS2_XML_PATH"] = str(udunits_xml_path)

# Switch between the local copy (tmp_dir) and the remote copy of the data
use_local_data = False
ds_dirname = "17_AR6_SLP_IPCC"

data_root = tmp_dir if use_local_data else coclico_data_dir
ds_dir = data_root.joinpath(ds_dirname)

# Fail early when the selected data directory is missing
if not ds_dir.exists():
    raise FileNotFoundError("Directory with data does not exist.")

# Output directories; only the single-CoG directory is created here
cog_dir = ds_dir.joinpath("cog")
cog_dirs = ds_dir.joinpath("cogs")  # for making all files CF compliant
cog_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Project paths & files (manual input)
# The three input files differ only in the SSP code embedded in the file name
_NC_NAME_TEMPLATE = "total_ssp{code}_medium_confidence_values.nc"

ds_ssp26_path = ds_dir.joinpath(_NC_NAME_TEMPLATE.format(code="126"))
ds_ssp45_path = ds_dir.joinpath(_NC_NAME_TEMPLATE.format(code="245"))
ds_ssp85_path = ds_dir.joinpath(_NC_NAME_TEMPLATE.format(code="585"))

# Base name (no extension) of the combined output dataset
ds_out_file = "slr_medium_confidence_values"

# Directory to save output CF check files
CF_dir = coclico_data_dir.joinpath("CF")

### Check CF compliancy original NetCDF files

In [None]:
# open the original NetCDF files, one xarray dataset per SSP scenario file
ds_26ssp = xr.open_dataset(ds_ssp26_path)
ds_45ssp = xr.open_dataset(ds_ssp45_path)
ds_85ssp = xr.open_dataset(ds_ssp85_path)

# display one of the original datasets for inspection (last expression of the cell)
ds_45ssp

In [None]:
%%capture cap --no-stderr
# check CF compliancy of the original ssp126 file; the cell magic above captures
# this cell's output in `cap` so the next cell can hand it to save_compliancy

check_compliancy(testfile=ds_ssp26_path, working_dir=CF_dir)

In [None]:
# save CF compliancy of the original ssp126 file (captured output from the previous cell)
save_compliancy(cap, testfile=ds_ssp26_path, working_dir=CF_dir)

In [None]:
%%capture cap --no-stderr
# check CF compliancy of the original ssp245 file (output captured in `cap`)

check_compliancy(testfile=ds_ssp45_path, working_dir=CF_dir)

In [None]:
# save CF compliancy of the original ssp245 file (captured output from the previous cell)
save_compliancy(cap, testfile=ds_ssp45_path, working_dir=CF_dir)

In [None]:
%%capture cap --no-stderr
# check CF compliancy of the original ssp585 file (output captured in `cap`)

check_compliancy(testfile=ds_ssp85_path, working_dir=CF_dir)

In [None]:
# save CF compliancy of the original ssp585 file (captured output from the previous cell)
save_compliancy(cap, testfile=ds_ssp85_path, working_dir=CF_dir)

### Make CF compliant alterations to the NetCDF files (dataset dependent)

In [None]:
# plt.figure(figsize=(16,9))
# plt.scatter(ds['lon'][1030:], ds['lat'][1030:], s=1)
# #plt.scatter(ds['longitude'][2000:2100], ds['latitude'][2000:2100], s=1, c='r')
# # plt.xlim(-10,50)
# # plt.ylim(25,75)
# plt.grid()

In [None]:
# rework the datasets: move the flat "locations" axis onto a regular lat / lon grid

# index on the flat locations axis where the rasterized output starts
# (before this index the files hold arbitrary points, which are discarded here)
RASTER_START = 1030

# construct lon, lat grid arrays: unique values in order of first appearance,
# taken from the ssp126 file and reused for all three scenarios
lonl = list(dict.fromkeys(ds_26ssp.lon.values[RASTER_START:]))
latl = list(dict.fromkeys(ds_26ssp.lat.values[RASTER_START:]))

# permutations that put both axes in monotonically increasing order (computed once)
lon_order = np.argsort(lonl)
lat_order = np.argsort(latl)


def _grid_sea_level_change(ds):
    """Return ``sea_level_change`` of *ds* as a (quantiles, years, lat, lon) array.

    The rasterized tail of the flat locations axis is reshaped onto the
    ``latl`` x ``lonl`` grid and both spatial axes are re-ordered so that
    they increase monotonically.

    NOTE(review): the reshape assumes the flat axis is ordered with lon
    varying fastest within each lat -- confirm against the source files.
    """
    flat = ds["sea_level_change"].values[:, :, RASTER_START:]
    gridded = flat.reshape(
        len(ds.quantiles.values), len(ds.years.values), len(latl), len(lonl)
    )
    gridded = gridded[:, :, :, lon_order]
    return gridded[:, :, lat_order, :]


def _to_gridded_dataset(ds, gridded):
    """Return *ds* with the location-based variables replaced by gridded ``slr``."""
    # remove items that will be replaced
    ds = ds.drop_vars({"sea_level_change", "lat", "lon", "locations"})
    # sorted coordinates match the argsort re-ordering applied to the values
    ds = ds.assign_coords({"lat": sorted(latl), "lon": sorted(lonl)})
    return ds.assign(slr=(["quantiles", "years", "lat", "lon"], gridded))


# reshape and re-order the sea level change variable of every scenario
slc_26 = _grid_sea_level_change(ds_26ssp)
slc_45 = _grid_sea_level_change(ds_45ssp)
slc_85 = _grid_sea_level_change(ds_85ssp)

# substitute the new items
ds_26ssp = _to_gridded_dataset(ds_26ssp, slc_26)
ds_45ssp = _to_gridded_dataset(ds_45ssp, slc_45)
ds_85ssp = _to_gridded_dataset(ds_85ssp, slc_85)

In [None]:
# NetCDF variable and dimension alterations

# old name -> new name, applied first to the dimensions and then to the
# variables of the same name
_NAME_MAP = {"years": "time", "quantiles": "ensemble"}

ds_26ssp, ds_45ssp, ds_85ssp = (
    scenario.rename_dims(_NAME_MAP).rename_vars(_NAME_MAP)
    for scenario in (ds_26ssp, ds_45ssp, ds_85ssp)
)

# express the quantiles (fractions) as percentiles, rounded to two decimals
for scenario_ds in (ds_26ssp, ds_45ssp, ds_85ssp):
    percentiles = scenario_ds["ensemble"].values * 100
    scenario_ds["ensemble"] = np.around(percentiles, decimals=2)

In [None]:
# attributes to set per variable / coordinate, identical for every scenario dataset
_VARIABLE_ATTRS = {
    "time": {"long_name": "time", "units": "yr"},
    "ensemble": {"long_name": "ensemble", "units": "1"},
    "lat": {
        "long_name": "latitude",
        "standard_name": "latitude",
        "units": "degrees_north",
    },
    "lon": {
        "long_name": "longitude",
        "standard_name": "longitude",
        "units": "degrees_east",
    },
    "slr": {"long_name": "sea level rise", "units": "mm"},
}

for scenario_ds in (ds_26ssp, ds_45ssp, ds_85ssp):
    for field_name, field_attrs in _VARIABLE_ATTRS.items():
        # update in place so attributes that are already present are kept
        scenario_ds[field_name].attrs.update(field_attrs)

In [None]:
# stack the three scenario datasets along a new "nscenarios" dimension and
# label its entries with a byte-string "scenarios" coordinate
scenario_labels = np.array(["SSP1-26", "SSP2-45", "SSP5-85"], dtype="S")
scenario_datasets = [ds_26ssp, ds_45ssp, ds_85ssp]

dataset = xr.concat(scenario_datasets, dim="nscenarios")
dataset = dataset.assign_coords(scenarios=("nscenarios", scenario_labels))

# dataset = xr.concat(
#     [dataset_historical, dataset_45rcp, dataset_85rcp],
#     pd.Index(["historical", "rcp45", "rcp85"], name="scenarios"),
# )

# dataset["scenarios"].values.astype("U") # retrieve scenarios as string

In [None]:
# re-order shape of the data variables
ds_26ssp = ds_26ssp.transpose("time", "lat", "lon", "ensemble")
ds_45ssp = ds_45ssp.transpose("time", "lat", "lon", "ensemble")
ds_85ssp = ds_85ssp.transpose("time", "lat", "lon", "ensemble")
dataset = dataset.transpose("nscenarios", "time", "lat", "lon", "ensemble")

# add or change certain variable / coordinate attributes
dataset_attributes = {
    "scenarios": {"long_name": "climate scenarios"}
}  # specify custom (CF convention) attributes

# add / overwrite attributes
for k, v in dataset_attributes.items():
    try:
        dataset[k].attrs = v
    except KeyError:
        # the name is not a variable / coordinate of the combined dataset:
        # skip it, but let any other (unexpected) error surface instead of
        # hiding it behind a bare except
        continue

In [None]:
import json

# NetCDF attribute alterations by means of metadata template
# (context manager so the template file handle is closed right after reading)
with open(ds_dir.joinpath("metadata_AR6_slp.json")) as f_global:
    meta_global = json.load(f_global)
ds_list = [ds_26ssp, ds_45ssp, ds_85ssp, dataset]

# copy every template entry onto the global attributes of each dataset
for i in ds_list:
    for attr_name, attr_val in meta_global.items():
        if attr_name == 'PROVIDERS':
            # stored as a JSON string (presumably because the value is a nested
            # structure that cannot be written as a NetCDF attribute -- confirm)
            attr_val = json.dumps(attr_val)
        i.attrs[attr_name] = attr_val

    i.attrs['Conventions'] = "CF-1.8"

In [None]:
# Inspect the combined xarray dataset. Best practice is to have as many bold dimensions
# as possible in the displayed overview (a dimension is bold when dimension == coordinate name).
# In this way, the Front-End can access the variable directly without having to index the variable first.

dataset
# dataset["nscenarios"]

In [None]:
# write every reworked scenario dataset next to its source file, with a "_CF" suffix
for scenario_ds, source_path in (
    (ds_26ssp, ds_ssp26_path),
    (ds_45ssp, ds_ssp45_path),
    (ds_85ssp, ds_ssp85_path),
):
    scenario_ds.to_netcdf(path=str(source_path).replace(".nc", "_CF.nc"))

# write the combined (all scenarios) dataset
dataset.to_netcdf(path=ds_dir.joinpath(ds_out_file + "_CF.nc"))

### Check CF compliancy altered NetCDF files

In [None]:
%%capture cap --no-stderr
# check CF compliancy of the altered ssp126 file ("_CF.nc"); output is captured in `cap`

check_compliancy(testfile=str(ds_ssp26_path).replace(".nc", "_CF.nc"), working_dir=CF_dir)

In [None]:
# save CF compliancy of the altered ssp126 file (captured output from the previous cell)
save_compliancy(
    cap, testfile=str(ds_ssp26_path).replace(".nc", "_CF.nc"), working_dir=CF_dir,
)

In [None]:
%%capture cap --no-stderr
# check CF compliancy of the altered ssp245 file ("_CF.nc"); output is captured in `cap`

check_compliancy(testfile=str(ds_ssp45_path).replace(".nc", "_CF.nc"), working_dir=CF_dir)

In [None]:
# save CF compliancy of the altered ssp245 file (captured output from the previous cell)
save_compliancy(
    cap, testfile=str(ds_ssp45_path).replace(".nc", "_CF.nc"), working_dir=CF_dir,
)

In [None]:
%%capture cap --no-stderr
# check CF compliancy of the altered ssp585 file ("_CF.nc"); output is captured in `cap`

check_compliancy(testfile=str(ds_ssp85_path).replace(".nc", "_CF.nc"), working_dir=CF_dir)

In [None]:
# save CF compliancy of the altered ssp585 file (captured output from the previous cell)
save_compliancy(
    cap, testfile=str(ds_ssp85_path).replace(".nc", "_CF.nc"), working_dir=CF_dir,
)

In [None]:
%%capture cap --no-stderr
# check CF compliancy of the combined (all scenarios) "_CF.nc" file; output is captured in `cap`

check_compliancy(testfile=ds_dir.joinpath(ds_out_file + "_CF.nc"), working_dir=CF_dir)

In [None]:
# save CF compliancy of the combined "_CF.nc" file (captured output from the previous cell)
save_compliancy(
    cap, testfile=ds_dir.joinpath(ds_out_file + "_CF.nc"), working_dir=CF_dir,
)

### write data to Zarr files (not used)

In [None]:
# export to zarr in write mode (to overwrite if exists)
#dataset.to_zarr(ds_dir.joinpath("%s.zarr" % ds_out_file), mode="w")

In [None]:
# check dataset
#ds_26ssp = xr.open_dataset(r"P:\11205479-coclico\FASTTRACK_DATA\17_AR6_SLP_IPCC\total_ssp126_medium_confidence_values_CF.nc")

### Write data to CoG (CF compliant)

#### single CoG test

In [None]:
# Export one 2D slice of the combined dataset as a GeoTIFF, as a test

# hard-coded input params
ENSEMBLE = 50.0  # ensemble (percentile) value to select
TIME = 0  # timestep to select (index)
VARIABLE = "slr"  # data variable to select
SSP = 0  # scenario to select (index)

# open the combined CF dataset
ds_fp = ds_dir.joinpath(ds_out_file + "_CF.nc")
ds = xr.open_dataset(ds_fp)

# reduce to a 2D (lat, lon) array for the chosen ensemble, scenario and timestep
selected = ds.sel({"ensemble": ENSEMBLE, "nscenarios": SSP})
rds = selected.isel(time=TIME)[VARIABLE]

# fix spatial dimensions and crs
rds.rio.set_spatial_dims(x_dim="lon", y_dim="lat")
if not rds.rio.crs:
    rds = rds.rio.write_crs("EPSG:4326")

# convert to dataset
rdsd = rds.to_dataset()

# collect the global attributes (again), then apply them in one go
cog_attrs = {}
for key, value in meta_global.items():
    if key == "MEDIA_TYPE":
        # change media type to tiff, leave the rest as is
        cog_attrs[key] = "IMAGE/TIFF"
    elif key == "PROVIDERS":
        cog_attrs[key] = json.dumps(value)
    else:
        cog_attrs[key] = value
cog_attrs["Conventions"] = "CF-1.8"
rdsd.attrs.update(cog_attrs)

# export file; the scenario label is stored as bytes and decoded for the file name
ssp_str = rdsd["scenarios"].item().decode("utf-8")
fname = f"{VARIABLE}_{ssp_str}_ens{float(ENSEMBLE)}_time{TIME}_CF.GeoTiff"
outpath = cog_dir.joinpath(fname)
rdsd.rio.to_raster(outpath, driver="GTiff")

In [None]:
# export the single-CoG dataset to nc (same file name, ".nc" extension) for a quick CF compliancy check
rdsd.to_netcdf(path=cog_dir.joinpath(fname.replace(".GeoTiff", ".nc")))
# directory for the CF check output files (same value as assigned to CF_dir earlier in the notebook)
CF_dir = coclico_data_dir.joinpath(r"CF")

In [None]:
%%capture cap --no-stderr
# check CF compliancy of the nc export of the single CoG (not of an original file); output is captured in `cap`

check_compliancy(testfile=cog_dir.joinpath(fname.replace(".GeoTiff", ".nc")), working_dir=CF_dir)

In [None]:
# save CF compliancy of the nc export of the single CoG (captured output from the previous cell)
save_compliancy(cap, testfile=cog_dir.joinpath(fname.replace(".GeoTiff", ".nc")), working_dir=CF_dir)

##### Note: TIFFs are far less flexible than NetCDF in the variables they can hold, so no CF compliancy check is needed. The data will always be an array with band, y, x as dimensions and band, y, x, spatial_ref as coordinates.

In [None]:
# read the exported GeoTIFF back from disk and plot it as a visual check
# (masked=True: presumably reads nodata cells as NaN -- see the rioxarray open_rasterio docs)
data = rio.open_rasterio(outpath, masked=True)
data.plot()
# alternative: plot the in-memory array that was exported
#rds.plot()

In [None]:
data

#### Multiple CoGs

In [None]:
# do for all CoGs (CF compliant): write one CoG per scenario, data variable,
# ensemble member and timestep to cog_dirs/<ssp_name>/<variable>_ens<ensemble>/<time>.tif

# open the dataset
ds_fp = ds_dir.joinpath(ds_out_file + "_CF.nc")
ds = xr.open_dataset(ds_fp)

# loop-invariant lookups, computed once instead of once per scenario
variables = list(ds.data_vars)  # names of the data variables (coordinates excluded)
# Dataset.sizes is the documented mapping from dimension name to length;
# mapping-style access through Dataset.dims is deprecated
ntimes = ds.sizes["time"]

for idx, scen in enumerate(ds["scenarios"].values):
    # scenario labels are stored as bytes
    ssp = scen.decode("utf-8")

    # format ssp name for filenaming
    # NOTE: str.strip removes the characters "S" and "P" from both ends (not the
    # literal prefix "SSP"); that is sufficient for labels such as "SSP1-26"
    ssp_name = "ssp=%s"%ssp.strip("SSP")
    print(ssp_name)

    for ntime in range(ntimes):
        # isel already returns a new object, so no explicit copy is needed
        ds2 = ds.isel({"time": ntime})

        # extract time for use tif naming (dataset specific)
        time_name = str(ds2.time.values)

        for var_name in variables:
            da = ds2.sel({"nscenarios": idx})[var_name]

            for idv, ens in enumerate(da["ensemble"].values):
                # compose tif name
                fname = time_name + ".tif"
                blob_name = pathlib.Path(ssp_name, var_name + "_ens%s"%np.around(ens, decimals=2), fname)
                outpath = cog_dirs.joinpath(blob_name)

                # tifs should be unique: leave files from an earlier run untouched
                if outpath.exists():
                    continue

                da2 = da.isel({"ensemble": idv})

                # add crs and spatial dims
                da2.rio.set_spatial_dims(x_dim="lon", y_dim="lat")
                if not da2.rio.crs:
                    da2 = da2.rio.write_crs("EPSG:4326")

                # convert to dataset and save as geotiff & nc to check the CF compliancy
                # dads = da2.to_dataset()

                # # add all attributes (again)
                # for attr_name, attr_val in meta_global.items():
                #     if attr_name == 'PROVIDERS':
                #         attr_val = json.dumps(attr_val)
                #     if attr_name == "MEDIA_TYPE": # change media type to tiff, leave the rest as is
                #         attr_val = "IMAGE/TIFF"
                #     dads.attrs[attr_name] = attr_val

                # dads.attrs['Conventions'] = "CF-1.8"

                # save to .nc & geotiff
                # fname = f"{var_name}_{ssp}_ens{np.around(ens, decimals=2)}_time{ntime}_CF.GeoTiff"
                # outpath = cog_dir.joinpath(fname)
                # dads.rio.to_raster(outpath, driver="GTiff")
                # dads.to_netcdf(path=cog_dir.joinpath(fname.replace(".GeoTiff", ".nc")))
                # CF_dir = coclico_data_dir.joinpath(r"CF")

                # make parent dir if not exists
                outpath.parent.mkdir(parents=True, exist_ok=True)

                # overwrite is False because tifs should be unique; keep going on
                # an I/O failure (best effort), but report it instead of hiding it
                try:
                    write_cog(da2, fname=outpath, overwrite=False)
                except OSError as err:
                    print(f"Could not write {outpath}: {err}")

In [None]:
# %%capture cap --no-stderr
# # check original CF compliancy

# check_compliancy(testfile=cog_dir.joinpath(fname.replace(".GeoTiff", ".nc")), working_dir=CF_dir)

In [None]:
# save original CF compliancy
# save_compliancy(cap, testfile=cog_dir.joinpath(fname.replace(".GeoTiff", ".nc")), working_dir=CF_dir)