# Coastal Flood Risk

Notebook environment to migrate NetCDF files to Zarr and GeoJSON.

In [None]:
# Optional: code formatter for JupyterLab (installed as a JupyterLab extension).
# Uncomment the next line, and comment out nb_black below, when running in JupyterLab.
#%load_ext lab_black
# Optional: code formatter for the classic Jupyter notebook (installed as a notebook extension)
%load_ext nb_black

### Configure OS independent paths

In [None]:
# Import standard packages
import os
import pathlib
import sys
import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
import netCDF4 as nc
import numpy.ma as ma
import math
from shapely import wkb

# Add the parent of the current working directory to the module search path,
# so that the `etl` package imported below can be found
cwd = pathlib.Path().resolve()
sys.path.append(str(cwd.parent))

# Home directory of the current user and the root of its path
home = pathlib.Path.home()
root = home.root

# Project-specific helpers (importable thanks to the sys.path entry above)
from etl import p_drive
from etl.CF_compliancy_checker import check_compliancy, save_compliancy

# Data directory on the (local or remote) project drive
coclico_data_dir = p_drive.joinpath("11205479-coclico", "data")

# Workaround for the udunits error on Windows (10) after installing cfchecker,
# see https://github.com/SciTools/iris/issues/404.
# Change the relative path below to the udunits2.xml file of your own Python installation.
udunits_xml = (
    home
    / r"Anaconda3\pkgs\udunits2-2.2.28-h892ecd3_0\Library\share\udunits\udunits2.xml"
)
os.environ["UDUNITS2_XML_PATH"] = str(udunits_xml)

In [None]:
# Project paths & files (manual input)
ds_dir = coclico_data_dir / "07_flood_risk_jrc"

# The three source NetCDF files
ds_path1 = ds_dir / "ExpectedAnnualDamage.nc"
ds_path2 = ds_dir / "ExpectedAnnualDamageperGDP.nc"
ds_path3 = ds_dir / "ExpectedAnnualPeopleAffected.nc"

# Base name (without extension) of the merged output files
ds_out_file = "Coastal_Flood_risk_Europe"

# Directory in which the CF check output files are saved
CF_dir = coclico_data_dir / "CF"

### Check CF compliancy original NetCDF files

In [None]:
# Open the three source files as xarray datasets
ds1, ds2, ds3 = (
    xr.open_dataset(source_path) for source_path in (ds_path1, ds_path2, ds_path3)
)

# Display the first dataset to inspect the original structure
ds1

In [None]:
%%capture cap --no-stderr
# Check CF compliancy of the original ExpectedAnnualDamage.nc (ds_path1).
# The cell magic above stores the cell output in `cap` so it can be saved in the next cell.

check_compliancy(testfile=ds_path1, working_dir=CF_dir)

In [None]:
# Save the captured CF compliancy output (`cap`) of the original ExpectedAnnualDamage.nc
save_compliancy(cap, testfile=ds_path1, working_dir=CF_dir)

In [None]:
%%capture cap --no-stderr
# Check CF compliancy of the original ExpectedAnnualDamageperGDP.nc (ds_path2).
# The cell magic above stores the cell output in `cap` so it can be saved in the next cell.

check_compliancy(testfile=ds_path2, working_dir=CF_dir)

In [None]:
# Save the captured CF compliancy output (`cap`) of the original ExpectedAnnualDamageperGDP.nc
save_compliancy(cap, testfile=ds_path2, working_dir=CF_dir)

In [None]:
%%capture cap --no-stderr
# Check CF compliancy of the original ExpectedAnnualPeopleAffected.nc (ds_path3).
# The cell magic above stores the cell output in `cap` so it can be saved in the next cell.

check_compliancy(testfile=ds_path3, working_dir=CF_dir)

In [None]:
# Save the captured CF compliancy output (`cap`) of the original ExpectedAnnualPeopleAffected.nc
save_compliancy(cap, testfile=ds_path3, working_dir=CF_dir)

### Add NUTS regions

The NUTS regions are not included as attributes in the NetCDF files. The NetCDF files only contain lon and lat values at the centroids of the NUTS2 regions, so we retrieve the region information from Eurostat ("https://ec.europa.eu/eurostat/web/gisco/geodata/reference-data/administrative-units-statistical-units/nuts").

In [None]:
# Load the NUTS regions, keep only the country level and replace the geometry
# by buffered centroids
nuts_regions = gpd.read_file(
    coclico_data_dir.joinpath("XX_NUTS2", "NUTS_RG_20M_2021_4326.geojson")
)
nuts_regions = nuts_regions.to_crs("EPSG:4326")

# Take an explicit copy of the filtered rows: assigning new columns to a slice
# of `nuts_regions` raises pandas' SettingWithCopyWarning and has ambiguous
# copy/view semantics.
NUTS0 = nuts_regions[nuts_regions.LEVL_CODE == 0].copy()  # countries
NUTS0["polygons"] = NUTS0.geometry  # keep the original polygons in their own column
# Use the centroid, buffered by 0.5 (CRS units, i.e. degrees in EPSG:4326), as geometry
NUTS0["geometry"] = NUTS0.centroid.buffer(0.5)

In [None]:
# Build a point GeoDataFrame from the lon/lat centroids stored in the dataset
ds_d = gpd.GeoDataFrame(
    geometry=gpd.points_from_xy(ds1.lon, ds1.lat), crs="EPSG:4326"
)

# Left spatial join: attach to every point the NUTS0 entry whose buffered
# centroid contains it (rows without a match get NaN)
sjoins = gpd.sjoin(ds_d, NUTS0, how="left")

# Reduce the join result to the columns of interest and give them shorter names
sjoins["instance"] = sjoins.index_right.values
cropped_list = ["instance", "NUTS_ID", "NAME_LATN", "CNTR_CODE", "polygons"]
renamed_list = ["instance", "acronym", "name", "country", "geometry"]
sjoins = sjoins[cropped_list].rename(
    columns=dict(zip(cropped_list[1:], renamed_list[1:]))
)

# Supplement the rows for which the spatial join found no match.
# NOTE(review): the country codes are paired with the unmatched rows purely by
# order, and `zip` silently stops at the shorter sequence -- verify that the
# number and order of unmatched rows still correspond to `supplement` whenever
# the input data changes.
supplement = ["FR", "NO"]

# Collect the index *labels* of the unmatched rows, since `.at` below is label-based
nan_idx = sjoins.index[sjoins["instance"].isna()].tolist()

for row_label, nuts_id in zip(nan_idx, supplement):
    supp = NUTS0.loc[NUTS0["NUTS_ID"] == nuts_id]
    # "instance" holds the index of the matching NUTS0 row
    sjoins.at[row_label, "instance"] = supp.index.values[0]
    # the remaining columns are copied from the matching NUTS0 row
    for new_col, old_col in zip(renamed_list[1:], cropped_list[1:]):
        sjoins.at[row_label, new_col] = supp[old_col].values[0]

sjoins = gpd.GeoDataFrame(sjoins, crs="EPSG:4326")
sjoins.head()

In [None]:
# Add the region geometries to the datasets

# Serialise every geometry of the join result to well-known binary (WKB)
geoms = sjoins["geometry"].apply(wkb.dumps)

# Attach the WKB geometries as a "geometry" coordinate along the "row" dimension
ds1, ds2, ds3 = (
    dataset.assign_coords({"geometry": ("row", geoms)})
    for dataset in (ds1, ds2, ds3)
)

In [None]:
# Attributes per added coordinate, keyed by coordinate name.
# NOTE(review): "crs_wkt" is filled with the EPSG code, not a WKT string -- confirm
# that this is what downstream consumers expect.
# NOTE(review): the texts mention NUTS2 while the geometries come from the
# country-level (LEVL_CODE == 0) selection -- confirm the intended wording.
add_geom_attrs = {
    "geometry": {
        "long_name": "NUTS2 regions (polygons) in well-known binary format (wkb).",
        "geometry_type": "polygon",
        "units": "degree",
        "comment": "These NUTS2 regions (2021 version) are available at Eurostat.",
        "crs_wkt": str(sjoins.crs.to_epsg()),
    },
}

# Apply the attributes to the matching coordinate of each dataset
for coord_name, coord_attrs in add_geom_attrs.items():
    for dataset in (ds1, ds2, ds3):
        dataset[coord_name].attrs = coord_attrs

### Make CF compliant alterations to the NetCDF files (dataset dependent)

In [None]:
# NetCDF attribute, variable and dimension alterations

# Mark lon/lat as coordinates in each of the datasets
ds1, ds2, ds3 = [ds.set_coords(["lon", "lat"]) for ds in [ds1, ds2, ds3]]

# Add global attributes
for dataset in (ds1, ds2, ds3):
    dataset.attrs["Conventions"] = "CF-1.8"
    dataset.attrs["crs"] = 4326

# Rename the "row" dimension to "stations"
ds1, ds2, ds3 = [ds.rename_dims({"row": "stations"}) for ds in [ds1, ds2, ds3]]

# Alter variable attributes
ds_list = [ds1, ds2, ds3]

# Data variables present in every dataset: the 2000 baseline plus the 2050/2100
# values of the three scenario families
scenario_vars = [
    "base2000",
    "sust2050",
    "sust2100",
    "frag2050",
    "frag2100",
    "ffbd2050",
    "ffbd2100",
]

# Units of the data variables, in the order of `ds_list`:
# annual damage, annual damage per GDP, annual people affected
units_per_dataset = ["1e12", "1", "1e3"]

for dataset, units in zip(ds_list, units_per_dataset):

    # The original "standard_name" values become the long_name;
    # the coordinates then get CF standard names and units.
    dataset.lat.attrs["long_name"] = dataset.lat.attrs["standard_name"]
    dataset.lat.attrs["standard_name"] = "latitude"
    dataset.lat.attrs["units"] = "degrees_north"

    # Longitude needs its own standard name and units; it was previously
    # labelled as latitude / degrees_north.
    dataset.lon.attrs["long_name"] = dataset.lon.attrs["standard_name"]
    dataset.lon.attrs["standard_name"] = "longitude"
    dataset.lon.attrs["units"] = "degrees_east"

    for var_name in scenario_vars:
        var_attrs = dataset[var_name].attrs
        # Move the original "standard_name" text to "long_name" and drop "standard_name"
        var_attrs["long_name"] = var_attrs.pop("standard_name")
        var_attrs["units"] = units

In [None]:
# NetCDF variable and dimension alterations

# Combine, per dataset, the separate year/scenario variables into one variable
# with additional "time" and "nscenarios" dimensions
ds_list_new = []
var_list = [
    "expected annual damage",
    "expected annual damage per GDP",
    "expected annual people affected",
]
var_list_abb = ["ead", "ead_GDP", "eapa"]

# Variable-name prefixes of the scenario families, in the order of the scenario
# labels assigned below
scenario_prefixes = ("sust", "frag", "ffbd")

for i in range(3):
    source = ds_list[i]

    # One time series (2000 baseline, 2050, 2100) per scenario family
    per_scenario = []
    for prefix in scenario_prefixes:
        series = xr.concat(
            [source["base2000"], source[prefix + "2050"], source[prefix + "2100"]],
            dim="time",
        )
        series = series.assign_coords(
            {"time": ("time", np.array([2000, 2050, 2100]))}
        )
        series.time.attrs["long_name"] = "time"
        series.time.attrs["units"] = "yr"
        per_scenario.append(series)

    # Stack the scenario families and label them
    dsnew = xr.concat(per_scenario, dim="nscenarios")
    scenario_labels = np.array(
        ["RCP4.5-SSP1", "RCP8.5-SSP3", "RCP8.5-SSP5"], dtype="S"
    )
    dsnew = dsnew.assign_coords({"scenarios": ("nscenarios", scenario_labels)})
    dsnew.scenarios.attrs["long_name"] = "climate scenarios"

    # Wrap in a dataset under the abbreviated variable name
    dsnew = dsnew.to_dataset(name=var_list_abb[i])
    dsnew[var_list_abb[i]].attrs["long_name"] = var_list[i]  # variable attributes
    # NOTE(review): every new dataset receives the global attributes of ds1 --
    # confirm this is intended for the second and third dataset as well
    dsnew.attrs = ds1.attrs  # copy global attributes
    ds_list_new.append(dsnew)

# Merge into one dataset
ds = xr.merge(ds_list_new)

In [None]:
# Re-order the dimensions of the data variables in the individual and merged datasets
dim_order = ("nscenarios", "stations", "time")
ds1_new, ds2_new, ds3_new = (
    dataset.transpose(*dim_order) for dataset in ds_list_new
)
ds = ds.transpose(*dim_order)

In [None]:
# Inspect the merged xarray dataset. Best practice is to have as many bold dimensions
# as possible in the displayed output (bold means: dimension name == coordinate name).
# That way the Front-End can access the variable directly without having to index it first.

ds
# Uncomment to inspect the scenario labels instead:
# ds["scenarios"]

In [None]:
# Write each altered dataset next to its source file, with a "_CF" suffix
for altered, source_path in zip(
    (ds1_new, ds2_new, ds3_new), (ds_path1, ds_path2, ds_path3)
):
    altered.to_netcdf(path=str(source_path).replace(".nc", "_CF.nc"))

# Write the merged dataset under the configured output name
ds.to_netcdf(path=ds_dir.joinpath(ds_out_file + "_CF.nc"))

### Check CF compliancy altered NetCDF files

In [None]:
%%capture cap --no-stderr
# Check CF compliancy of the altered file derived from ds_path1 ("_CF.nc" suffix).
# The cell magic above stores the cell output in `cap` so it can be saved in the next cell.

check_compliancy(testfile=str(ds_path1).replace(".nc", "_CF.nc"), working_dir=CF_dir)

In [None]:
# Save the captured CF compliancy output (`cap`) of the altered file derived from ds_path1
save_compliancy(
    cap, testfile=str(ds_path1).replace(".nc", "_CF.nc"), working_dir=CF_dir,
)

In [None]:
%%capture cap --no-stderr
# Check CF compliancy of the altered file derived from ds_path2 ("_CF.nc" suffix).
# The cell magic above stores the cell output in `cap` so it can be saved in the next cell.

check_compliancy(testfile=str(ds_path2).replace(".nc", "_CF.nc"), working_dir=CF_dir)

In [None]:
# Save the captured CF compliancy output (`cap`) of the altered file derived from ds_path2
save_compliancy(
    cap, testfile=str(ds_path2).replace(".nc", "_CF.nc"), working_dir=CF_dir,
)

In [None]:
%%capture cap --no-stderr
# Check CF compliancy of the altered file derived from ds_path3 ("_CF.nc" suffix).
# The cell magic above stores the cell output in `cap` so it can be saved in the next cell.

check_compliancy(testfile=str(ds_path3).replace(".nc", "_CF.nc"), working_dir=CF_dir)

In [None]:
# Save the captured CF compliancy output (`cap`) of the altered file derived from ds_path3
save_compliancy(
    cap, testfile=str(ds_path3).replace(".nc", "_CF.nc"), working_dir=CF_dir,
)

In [None]:
%%capture cap --no-stderr
# Check CF compliancy of the merged output file (ds_out_file + "_CF.nc").
# The cell magic above stores the cell output in `cap` so it can be saved in the next cell.

check_compliancy(testfile=ds_dir.joinpath(ds_out_file + "_CF.nc"), working_dir=CF_dir)

In [None]:
# Save the captured CF compliancy output (`cap`) of the merged output file
save_compliancy(
    cap, testfile=ds_dir.joinpath(ds_out_file + "_CF.nc"), working_dir=CF_dir,
)

### Write data to Zarr files

In [None]:
# Export the merged dataset to a Zarr store; mode="w" overwrites an existing store
zarr_path = ds_dir.joinpath(f"{ds_out_file}.zarr")
ds.to_zarr(zarr_path, mode="w")