## File to make /Coastal risk csv cf compliant and export to zarr

Notebook environment to migrate Coastal risk csv file to CF compliant zarr

In [1]:
# Optional; code formatter, installed as jupyter lab extension
#%load_ext lab_black
# Optional; code formatter, installed as jupyter notebook extension
%load_ext nb_black

<IPython.core.display.Javascript object>

### Configure OS independent paths

In [2]:
#%pip install tqdm

<IPython.core.display.Javascript object>

In [3]:
# Import standard packages
import os
import pathlib

import sys
import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
import math
from tqdm import tqdm 

from coclicodata.drive_config import p_drive
from coclicodata.etl.cf_compliancy_checker import check_compliancy, save_compliancy

# Define (local and) remote drives
gca_data_dir = r"P:\11209197-018-global-coastal-atlas\MSc_students\ClenmarRowe\Data"


# Workaround to the Windows OS (10) udunits error after installation of cfchecker: https://github.com/SciTools/iris/issues/404
os.environ["UDUNITS2_XML_PATH"] = str(
    pathlib.Path().home().joinpath(  # change to the udunits2.xml file dir in your Python installation
        r"AppData\Local\miniconda3\pkgs\udunits2-2.2.28-h892ecd3_0\Library\share\udunits\udunits2.xml" ###Changed from anaconda to miniconda  - added the new package udunits file
    )
)

<IPython.core.display.Javascript object>

In [4]:
template_df=pd.read_csv(r"P:\11209197-018-global-coastal-atlas\MSc_students\ClenmarRowe\Data\All_Datasets\Orig_Datasets\04_Auxillary_files\Arjen_Vector_Template.csv")
Hazard_map=template_df
Hazard_map = gpd.GeoDataFrame(
    Hazard_map, geometry=gpd.points_from_xy(Hazard_map.Intersect_lon, Hazard_map.Intersect_lat), crs="EPSG:4326"
)
Hazard_map=Hazard_map.drop(columns=["Start_lon","Start_lat","End_lon","End_lat","Intersect_lon","Intersect_lat"])
Hazard_map
IPCC=gpd.read_file(r"P:\11209197-018-global-coastal-atlas\MSc_students\ClenmarRowe\Data\All_Datasets\Orig_Datasets\04_Auxillary_files\IPCC-WGI-reference-regions-v4.gpkg")
IPCC.head()
region_acronym= "CAR"
region_gdf=IPCC.loc[IPCC["Acronym"]==region_acronym,:]
# region_gdf=region_gdf.iloc[:,-1:]
region_gdf

Hazard_map=Hazard_map.sjoin(region_gdf)
Haz_map=Hazard_map
Hazard_map=Hazard_map.iloc[:,0:5]
Hazard_map["IPCC_REGION"]=Haz_map["Name"]
Hazard_map.index=range(0,len(Hazard_map.index))
Hazard_map

Unnamed: 0,transect_id,country_id,continent,country_name,geometry,IPCC_REGION
0,BOX_116_353_2,COL,South America,Colombia,POINT (-71.87980 12.25068),Caribbean
1,BOX_116_353_3,COL,South America,Colombia,POINT (-71.87698 12.24731),Caribbean
2,BOX_116_353_4,COL,South America,Colombia,POINT (-71.87563 12.24314),Caribbean
3,BOX_116_353_5,COL,South America,Colombia,POINT (-71.87523 12.23877),Caribbean
4,BOX_116_353_6,COL,South America,Colombia,POINT (-71.87483 12.23440),Caribbean
...,...,...,...,...,...,...
37504,BOX_139_002_61,DOM,North America,Dominican Republic,POINT (-71.66819 17.96526),Caribbean
37505,BOX_139_002_62,DOM,North America,Dominican Republic,POINT (-71.66623 17.96142),Caribbean
37506,BOX_139_002_63,DOM,North America,Dominican Republic,POINT (-71.66458 17.95744),Caribbean
37507,BOX_139_002_64,DOM,North America,Dominican Republic,POINT (-71.66294 17.95347),Caribbean


<IPython.core.display.Javascript object>

In [5]:
# Project paths & files (manual input)
dataset_dir = pathlib.Path().joinpath(gca_data_dir,"Processed_Risk_Levels_PC_FC_Merged")
dataset_dir_path = dataset_dir.joinpath("PC_FC_RL_merged_Carib_original.nc")
CF_dir = dataset_dir.joinpath("CF")  # directory to save output CF check files
dataset_dir_path

WindowsPath('P:/11209197-018-global-coastal-atlas/MSc_students/ClenmarRowe/Data/Processed_Risk_Levels_PC_FC_Merged/PC_FC_RL_merged_Carib_original.nc')

<IPython.core.display.Javascript object>

In [6]:
PC_Risk_path=r"P:\11209197-018-global-coastal-atlas\MSc_students\ClenmarRowe\Data\Processed_data_CVI_components\Risk\01 Present\Caribbean\Haz_Exp_Vul"
FC_Risk_2050_high=r"P:\11209197-018-global-coastal-atlas\MSc_students\ClenmarRowe\Data\Processed_data_CVI_components\Risk\02 Future\Caribbean\Haz_Exp_Vul\2050_High_emission"
FC_Risk_2050_Low=r"P:\11209197-018-global-coastal-atlas\MSc_students\ClenmarRowe\Data\Processed_data_CVI_components\Risk\02 Future\Caribbean\Haz_Exp_Vul\2050_Low_emission"
FC_Risk_2100_High=r"P:\11209197-018-global-coastal-atlas\MSc_students\ClenmarRowe\Data\Processed_data_CVI_components\Risk\02 Future\Caribbean\Haz_Exp_Vul\2100_High_emission"
FC_Risk_2100_Low=r"P:\11209197-018-global-coastal-atlas\MSc_students\ClenmarRowe\Data\Processed_data_CVI_components\Risk\02 Future\Caribbean\Haz_Exp_Vul\2100_Low_emission"

<IPython.core.display.Javascript object>

In [7]:
all_paths=[PC_Risk_path,FC_Risk_2050_high,FC_Risk_2050_Low,FC_Risk_2100_High,FC_Risk_2100_Low]  

df_All_Risk=[]
for path in all_paths:
    files = os.listdir(path)

    # Filter only CSV files
    csv_files = [file for file in files if file.endswith('.csv')]


    
    # Iterate over each CSV file
    for csv_file in csv_files:
        file_path = os.path.join(path, csv_file)
        df = pd.read_csv(file_path)
        df_All_Risk.append(df)
df_All_Risk

[          transect_id  ESL_100yrRP - Hazard level  \
 0       BOX_116_353_2                         1.0   
 1       BOX_116_353_3                         1.0   
 2       BOX_116_353_4                         1.0   
 3       BOX_116_353_5                         1.0   
 4       BOX_116_353_6                         1.0   
 ...               ...                         ...   
 37504  BOX_139_002_61                         2.0   
 37505  BOX_139_002_62                         2.0   
 37506  BOX_139_002_63                         2.0   
 37507  BOX_139_002_64                         2.0   
 37508  BOX_139_002_65                         2.0   
 
        Percentage of Built Up Area - Exposure level  \
 0                                                 1   
 1                                                 1   
 2                                                 1   
 3                                                 1   
 4                                                 1   
 ...          

<IPython.core.display.Javascript object>

In [8]:
len(df_All_Risk)

17

<IPython.core.display.Javascript object>

In [9]:
Hazard_map

Unnamed: 0,transect_id,country_id,continent,country_name,geometry,IPCC_REGION
0,BOX_116_353_2,COL,South America,Colombia,POINT (-71.87980 12.25068),Caribbean
1,BOX_116_353_3,COL,South America,Colombia,POINT (-71.87698 12.24731),Caribbean
2,BOX_116_353_4,COL,South America,Colombia,POINT (-71.87563 12.24314),Caribbean
3,BOX_116_353_5,COL,South America,Colombia,POINT (-71.87523 12.23877),Caribbean
4,BOX_116_353_6,COL,South America,Colombia,POINT (-71.87483 12.23440),Caribbean
...,...,...,...,...,...,...
37504,BOX_139_002_61,DOM,North America,Dominican Republic,POINT (-71.66819 17.96526),Caribbean
37505,BOX_139_002_62,DOM,North America,Dominican Republic,POINT (-71.66623 17.96142),Caribbean
37506,BOX_139_002_63,DOM,North America,Dominican Republic,POINT (-71.66458 17.95744),Caribbean
37507,BOX_139_002_64,DOM,North America,Dominican Republic,POINT (-71.66294 17.95347),Caribbean


<IPython.core.display.Javascript object>

In [10]:
Hazard_map.columns[1]

'country_id'

<IPython.core.display.Javascript object>

In [11]:
df_Exp_Vul=df_All_Risk[0]
df_Exp_Vul["lon"]=df_Exp_Vul["geometry"].apply(lambda point_str: float(point_str.strip('POINT ()').split()[0]))
df_Exp_Vul["lat"]=df_Exp_Vul["geometry"].apply(lambda point_str: float(point_str.strip('POINT ()').split()[1]))
df_Exp_Vul=df_Exp_Vul.drop(columns=[df_All_Risk[0].columns[1],df_All_Risk[0].columns[12],df_All_Risk[0].columns[13],df_All_Risk[0].columns[-3],"geometry"])
df_Exp_Vul[Hazard_map.columns[1]]=Hazard_map[Hazard_map.columns[1]]
df_Exp_Vul[Hazard_map.columns[2]]=Hazard_map[Hazard_map.columns[2]]
df_Exp_Vul[Hazard_map.columns[3]]=Hazard_map[Hazard_map.columns[3]]
df_Exp_Vul[Hazard_map.columns[5]]=Hazard_map[Hazard_map.columns[5]]
df_Exp_Vul["Uncertainty_Columns"]=df_Exp_Vul["Uncertainty_Columns"].apply(lambda x:x.replace("[","").replace("]","").replace("β","B"))
df_Exp_Vul

Unnamed: 0,transect_id,Percentage of Built Up Area - Exposure level,LECZ Area÷LECZ mean elevation - Exposure level,Neashore Slope (tanβ or m^-1) - Exposure level,Population Within 1km of the coastline - Exposure level,Sediment Type - Exposure level,Infant Mortality Rate (3_yr_Average) - Vulnerability level,GDP per Capita (3_yr_Average) - Vulnerability level,Mean Years of Schooling - Vulnerability level,Deaths from Recent Shocks (2004-2023) - Vulnerability level,World Protected Areas (Status and Governance) - Vulnerability level,Uncertainty_Columns,Uncertainty_Columns_length,Confindence %,lon,lat,country_id,continent,country_name
0,BOX_116_353_2,1,2,4.0,1,4.0,3.0,5,3.0,2,3,,0,100.0,-71.879805,12.250684,COL,South America,Colombia
1,BOX_116_353_3,1,3,4.0,1,2.0,3.0,5,3.0,2,3,,0,100.0,-71.876981,12.247308,COL,South America,Colombia
2,BOX_116_353_4,1,4,4.0,1,4.0,3.0,5,3.0,2,3,,0,100.0,-71.875632,12.243144,COL,South America,Colombia
3,BOX_116_353_5,1,4,4.0,1,2.0,3.0,5,3.0,2,3,,0,100.0,-71.875233,12.238772,COL,South America,Colombia
4,BOX_116_353_6,1,5,4.0,1,2.0,3.0,5,3.0,2,3,,0,100.0,-71.874835,12.234400,COL,South America,Colombia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37504,BOX_139_002_61,1,2,3.0,2,4.0,4.0,5,3.0,2,5,,0,100.0,-71.668186,17.965259,DOM,North America,Dominican Republic
37505,BOX_139_002_62,1,2,3.0,2,4.0,4.0,5,3.0,2,5,,0,100.0,-71.666228,17.961418,DOM,North America,Dominican Republic
37506,BOX_139_002_63,1,3,3.0,2,4.0,4.0,5,3.0,2,5,,0,100.0,-71.664583,17.957442,DOM,North America,Dominican Republic
37507,BOX_139_002_64,1,3,3.0,2,4.0,4.0,5,3.0,2,5,,0,100.0,-71.662937,17.953466,DOM,North America,Dominican Republic


<IPython.core.display.Javascript object>

In [12]:
df_Haz_Exp_Vul=df_Exp_Vul.copy()
for idx,df in enumerate(df_All_Risk):
    PC=""
    if idx<4:
        PC=" PC"
    haz=df.columns[1].replace(" - Hazard level",PC)
    df_Haz_Exp_Vul[haz+" - Hazard Level"]=df.iloc[:,1]
    df_Haz_Exp_Vul[haz+" - Risk Level"]=df.iloc[:,12]
    df_Haz_Exp_Vul[haz+" - Risk Level Class"]=df.iloc[:,13]
    df_Haz_Exp_Vul[haz+" value"]=df.iloc[:,-1]
df_Haz_Exp_Vul.columns

Index(['transect_id', 'Percentage of Built Up Area - Exposure level',
       'LECZ Area÷LECZ mean elevation - Exposure level',
       'Neashore Slope (tanβ or m^-1) - Exposure level',
       'Population Within 1km of the coastline - Exposure level',
       'Sediment Type - Exposure level',
       'Infant Mortality Rate (3_yr_Average) - Vulnerability level',
       'GDP per Capita (3_yr_Average) - Vulnerability level',
       'Mean Years of Schooling - Vulnerability level',
       'Deaths from Recent Shocks (2004-2023) - Vulnerability level',
       'World Protected Areas (Status and Governance) - Vulnerability level',
       'Uncertainty_Columns', 'Uncertainty_Columns_length', 'Confindence %',
       'lon', 'lat', 'country_id', 'continent', 'country_name',
       'ESL_100yrRP PC - Hazard Level', 'ESL_100yrRP PC - Risk Level',
       'ESL_100yrRP PC - Risk Level Class', 'ESL_100yrRP PC value',
       'EWH_100yrRP PC - Hazard Level', 'EWH_100yrRP PC - Risk Level',
       'EWH_100yrRP PC 

<IPython.core.display.Javascript object>

In [13]:
df_Haz_Exp_Vul.to_csv(r"P:\11209197-018-global-coastal-atlas\MSc_students\ClenmarRowe\Data\Processed_Risk_Levels_PC_FC_Merged\PC_FC_RL_merged_Carrib.csv",index=False)

<IPython.core.display.Javascript object>

In [14]:
ds=xr.Dataset.from_dataframe(df_Haz_Exp_Vul)
ds

<IPython.core.display.Javascript object>

In [15]:

os.environ['HDF5_USE_FILE_LOCKING'] = 'FALSE'
# Write the xarray dataset to a netCDF file
ds.to_netcdf(dataset_dir_path)

<IPython.core.display.Javascript object>

### Check CF compliancy original NetCDF files

In [16]:
# open datasets
ds = xr.open_dataset(dataset_dir_path)

# check original dataset
ds

<IPython.core.display.Javascript object>

In [17]:
%%capture cap --no-stderr
# check original CF compliancy

check_compliancy(testfile= dataset_dir_path, 
                 working_dir=CF_dir
                 )


<IPython.core.display.Javascript object>

In [18]:
# save original CF compliancy
save_compliancy(cap, testfile=dataset_dir_path, working_dir=CF_dir)



<IPython.core.display.Javascript object>

### Make CF compliant alterations to the NetCDF files (dataset dependent)

In [19]:
# open original datasets
ds = xr.open_dataset(dataset_dir_path)

# check original dataset
ds

<IPython.core.display.Javascript object>

In [20]:

# set some data variables to coordinates to *avoid duplication* of dimensions in later stage
ds = ds.set_coords(['transect_id', 'Uncertainty_Columns', 'Uncertainty_Columns_length', 'Confindence %', 'lat',"lon", 'country_name', 'continent','country_id',"IPCC_REGION"])
ds

ValueError: These variables cannot be found in this dataset: ['IPCC_REGION']

<IPython.core.display.Javascript object>

In [None]:
# rename or swap dimension names, the latter in case the name already exists as coordinate
ds = ds.rename_dims({"index": "nstations"})
ds


<IPython.core.display.Javascript object>

In [None]:
ds=ds.drop_vars(["index"])
ds

<IPython.core.display.Javascript object>

In [None]:
import json

# NetCDF attribute alterations
f_global = open(dataset_dir.joinpath("STOC_metadata.json"))
meta_global = json.load(f_global)

for attr_name, attr_val in meta_global.items():
    if attr_name == 'PROVIDERS':
        attr_val = json.dumps(attr_val)
    ds.attrs[attr_name] = attr_val

ds.attrs['Conventions'] = "CF-1.8"
ds

<IPython.core.display.Javascript object>

In [None]:
# rename variables, if necessary
ds = ds.rename_vars(
       {
        "country_name": "country" ,
       }
  )
ds

<IPython.core.display.Javascript object>

In [None]:
ds=ds.compute()
ds

<IPython.core.display.Javascript object>

In [None]:
np.unique(ds["Uncertainty_Columns"].values)

array(['',
       "'ESL_100yrRP - Hazard level', 'Mean Years of Schooling - Vulnerability level'",
       "'Infant Mortality Rate (3_yr_Average) - Vulnerability level', 'Mean Years of Schooling - Vulnerability level'",
       "'Mean Years of Schooling - Vulnerability level'",
       "'Neashore Slope (tanB or m^-1) - Exposure level'",
       "'Neashore Slope (tanB or m^-1) - Exposure level', 'Infant Mortality Rate (3_yr_Average) - Vulnerability level', 'Mean Years of Schooling - Vulnerability level'",
       "'Neashore Slope (tanB or m^-1) - Exposure level', 'Mean Years of Schooling - Vulnerability level'",
       "'Sediment Type - Exposure level'"], dtype='<U159')

<IPython.core.display.Javascript object>

In [None]:
# change dtypes from unsigned to signed
object_vars = list(ds.variables)
for i in object_vars:
    # if isinstance(ds[i].values[0],str) and (i not in ['Uncertainty_Columns'] ) :
    if isinstance(ds[i].values[0],str)  :
        ds[i] = ds[i].astype('S')
ds

<IPython.core.display.Javascript object>

In [None]:
i

'EWH_5_yr_RP 2100 SSP1-26 Ensemble mean value'

<IPython.core.display.Javascript object>

In [None]:
# add or change certain variable / coordinate attributes
### dataset attributes is a dictionary of dictionaries
dataset_attributes = {
    "lon": {"standard_name": "longitude", "long_name": "longitude", "units": "degrees_east"},
    "lat": {"standard_name": "latitude", "long_name": "latitude", "units": "degrees_north"},
    "transect_id": { "long_name": "Transect Identity", "units": "1"},
    "continent": { "long_name": "Continent", "units": "1"},
    "country": { "long_name": "Country", "units": "1"},
    "country_id": { "long_name": "Country Identification", "units": "1"},
    "IPCC_REGION": { "long_name": "IPCC Region", "units": "1"}
}  # specify custom (CF convention) attributes

 # add / overwrite attributes
for k, v in dataset_attributes.items():
    try:
        ds[k].attrs = dataset_attributes[k]
    except:
        continue


ds

<IPython.core.display.Javascript object>

In [None]:
dataset_attributes = {
    "Uncertainty_Columns": {"long_name": "Uncertainty Columns"},
    "Uncertainty_Columns_length": {"long_name": "Uncertainty Columns Length"},
    "Confindence %": {"long_name": "Confindence Percentage"},
    "Percentage of Built Up Area - Exposure level": {"long_name": "Percentage of Built Up Area - Exposure level"},
    "LECZ Area÷LECZ mean elevation - Exposure level": {"long_name": "LECZ Area÷LECZ mean elevation - Exposure level"},
    "Neashore Slope (tanβ or m^-1) - Exposure level": {"long_name": "Neashore Slope (tanβ or m^-1) - Exposure level"},
    "Population Within 1km of the coastline - Exposure level": {"long_name": "Population Within 1km of the coastline - Exposure level"},
    "Sediment Type - Exposure level": {"long_name": "Sediment Type - Exposure level"},
    "Infant Mortality Rate (3_yr_Average) - Vulnerability level": {"long_name": "Infant Mortality Rate (3_yr_Average) - Vulnerability level"},
    "GDP per Capita (3_yr_Average) - Vulnerability level": {"long_name": "GDP per Capita (3_yr_Average) - Vulnerability level"},
    "Mean Years of Schooling - Vulnerability level": {"long_name": "Mean Years of Schooling - Vulnerability level"},
    "Deaths from Recent Shocks (2004-2023) - Vulnerability level": {"long_name": "Deaths from Recent Shocks (2004-2023) - Vulnerability level"},
    "World Protected Areas (Status and Governance) - Vulnerability level": {"long_name": "World Protected Areas (Status and Governance) - Vulnerability level"},
    "ESL_100yrRP PC - Hazard Level": {"long_name": "ESL_100yrRP PC - Hazard Level"},
    "ESL_100yrRP PC - Risk Level": {"long_name": "ESL_100yrRP PC - Risk Level"},
    "ESL_100yrRP PC - Risk Level Class": {"long_name": "ESL_100yrRP PC - Risk Level Class"},
    "ESL_100yrRP PC value": {"long_name": "ESL_100yrRP PC value"},
    "EWH_100yrRP PC - Hazard Level": {"long_name": "EWH_100yrRP PC - Hazard Level"},
    "EWH_100yrRP PC - Risk Level": {"long_name": "EWH_100yrRP PC - Risk Level"},
    "EWH_100yrRP PC - Risk Level Class": {"long_name": "EWH_100yrRP PC - Risk Level Class"},
    "EWH_100yrRP PC value": {"long_name": "EWH_100yrRP PC value"},
    "Land_Subsidence_2010 PC - Hazard Level": {"long_name": "Land Subsidence 2010 PC - Hazard Level"},
    "Land_Subsidence_2010 PC - Risk Level": {"long_name": "Land Subsidence 2010 PC - Risk Level"},
    "Land_Subsidence_2010 PC - Risk Level Class": {"long_name": "Land Subsidence 2010 PC - Risk Level Class"},
    "Land_Subsidence_2010 PC value": {"long_name": "Land Subsidence 2010 PC value"},
    "Shoreline_Change_Rate_PC PC - Hazard Level": {"long_name": "Shoreline Change Rate PC - Hazard Level"},
    "Shoreline_Change_Rate_PC PC - Risk Level": {"long_name": "Shoreline Change Rate PC - Risk Level"},
    "Shoreline_Change_Rate_PC PC - Risk Level Class": {"long_name": "Shoreline Change Rate PC - Risk Level Class"},
    "Shoreline_Change_Rate_PC PC value": {"long_name": "Shoreline Change Rate PC value"},
    "Change_rate_SSP5_85_2050 - Hazard Level": {"long_name": "Change rate SSP5 85 2050 - Hazard Level"},
    "Change_rate_SSP5_85_2050 - Risk Level": {"long_name": "Change rate SSP5 85 2050 - Risk Level"},
    "Change_rate_SSP5_85_2050 - Risk Level Class": {"long_name": "Change rate SSP5 85 2050 - Risk Level Class"},
    "Change_rate_SSP5_85_2050 value": {"long_name": "Change rate SSP5 85 2050 value"},
    "ESL_100yrRP_8.5_2050 - Hazard Level": {"long_name": "ESL 100yrRP 8.5 2050 - Hazard Level"},
    "ESL_100yrRP_8.5_2050 - Risk Level": {"long_name": "ESL 100yrRP 8.5 2050 - Risk Level"},
    "ESL_100yrRP_8.5_2050 - Risk Level Class": {"long_name": "ESL 100yrRP 8.5 2050 - Risk Level Class"},
    "ESL_100yrRP_8.5_2050 value": {"long_name": "ESL 100yrRP 8.5 2050 value"},
    "EWH_5_yr_RP 2050 SSP5-85 Ensemble mean - Hazard Level": {"long_name": "EWH 5 yr RP 2050 SSP5 85 Ensemble mean - Hazard Level"},
    "EWH_5_yr_RP 2050 SSP5-85 Ensemble mean - Risk Level": {"long_name": "EWH 5 yr RP 2050 SSP5 85 Ensemble mean - Risk Level"},
    "EWH_5_yr_RP 2050 SSP5-85 Ensemble mean - Risk Level Class": {"long_name": "EWH 5 yr RP 2050 SSP5 85 Ensemble mean - Risk Level Class"},
    "EWH_5_yr_RP 2050 SSP5-85 Ensemble mean value": {"long_name": "EWH 5 yr RP 2050 SSP5 85 Ensemble mean value"},
    "Land_Subsidence_2040_high - Hazard Level": {"long_name": "Land Subsidence 2040 high - Hazard Level"},
    "Land_Subsidence_2040_high - Risk Level": {"long_name": "Land Subsidence 2040 high - Risk Level"},
    "Land_Subsidence_2040_high - Risk Level Class": {"long_name": "Land Subsidence 2040 high - Risk Level Class"},
    "Land_Subsidence_2040_high value": {"long_name": "Land Subsidence 2040 high value"},
    "Change_rate_SSP1_45_2050 - Hazard Level": {"long_name": "Change rate SSP1 45 2050 - Hazard Level"},
    "Change_rate_SSP1_45_2050 - Risk Level": {"long_name": "Change rate SSP1 45 2050 - Risk Level"},
    "Change_rate_SSP1_45_2050 - Risk Level Class": {"long_name": "Change rate SSP1 45 2050 - Risk Level Class"},
    "Change_rate_SSP1_45_2050 value": {"long_name": "Change rate SSP1 45 2050 value"},
    "ESL_100yrRP_4.5_2050 - Hazard Level": {"long_name": "ESL 100yrRP 4.5 2050 - Hazard Level"},
    "ESL_100yrRP_4.5_2050 - Risk Level": {"long_name": "ESL 100yrRP 4.5 2050 - Risk Level"},
    "ESL_100yrRP_4.5_2050 - Risk Level Class": {"long_name": "ESL 100yrRP 4.5 2050 - Risk Level Class"},
    "ESL_100yrRP_4.5_2050 value": {"long_name": "ESL 100yrRP 4.5 2050 value"},
    "EWH_5_yr_RP 2050 SSP1-26 Ensemble mean - Hazard Level": {"long_name": "EWH 5 yr RP 2050 SSP1 26 Ensemble mean - Hazard Level"},
    "EWH_5_yr_RP 2050 SSP1-26 Ensemble mean - Risk Level": {"long_name": "EWH 5 yr RP 2050 SSP1 26 Ensemble mean - Risk Level"},
    "EWH_5_yr_RP 2050 SSP1-26 Ensemble mean - Risk Level Class": {"long_name": "EWH 5 yr RP 2050 SSP1 26 Ensemble mean - Risk Level Class"},
    "EWH_5_yr_RP 2050 SSP1-26 Ensemble mean value": {"long_name": "EWH 5 yr RP 2050 SSP1 26 Ensemble mean value"},
    "Change_rate_SSP5_85_2100 - Hazard Level": {"long_name": "Change rate SSP5 85 2100 - Hazard Level"},
    "Change_rate_SSP5_85_2100 - Risk Level": {"long_name": "Change rate SSP5 85 2100 - Risk Level"},
    "Change_rate_SSP5_85_2100 - Risk Level Class": {"long_name": "Change rate SSP5 85 2100 - Risk Level Class"},
    "Change_rate_SSP5_85_2100 value": {"long_name": "Change rate SSP5 85 2100 value"},
    "ESL_100yrRP_8.5_2100 - Hazard Level": {"long_name": "ESL 100yrRP 8.5 2100 - Hazard Level"},
    "ESL_100yrRP_8.5_2100 - Risk Level": {"long_name": "ESL 100yrRP 8.5 2100 - Risk Level"},
    "ESL_100yrRP_8.5_2100 - Risk Level Class": {"long_name": "ESL 100yrRP 8.5 2100 - Risk Level Class"},
    "ESL_100yrRP_8.5_2100 value": {"long_name": "ESL 100yrRP 8.5 2100 value"},
    "EWH_5_yr_RP 2100 SSP5-85 Ensemble mean - Hazard Level": {"long_name": "EWH 5 yr RP 2100 SSP5 85 Ensemble mean - Hazard Level"},
    "EWH_5_yr_RP 2100 SSP5-85 Ensemble mean - Risk Level": {"long_name": "EWH 5 yr RP 2100 SSP5 85 Ensemble mean - Risk Level"},
    "EWH_5_yr_RP 2100 SSP5-85 Ensemble mean - Risk Level Class": {"long_name": "EWH 5 yr RP 2100 SSP5 85 Ensemble mean - Risk Level Class"},
    "EWH_5_yr_RP 2100 SSP5-85 Ensemble mean value": {"long_name": "EWH 5 yr RP 2100 SSP5 85 Ensemble mean value"},
    "Change_rate_SSP1_45_2100 - Hazard Level": {"long_name": "Change rate SSP1 45 2100 - Hazard Level"},
    "Change_rate_SSP1_45_2100 - Risk Level": {"long_name": "Change rate SSP1 45 2100 - Risk Level"},
    "Change_rate_SSP1_45_2100 - Risk Level Class": {"long_name": "Change rate SSP1 45 2100 - Risk Level Class"},
    "Change_rate_SSP1_45_2100 value": {"long_name": "Change rate SSP1 45 2100 value"},
    "ESL_RCP4.5_Time_Horizon2100 - Hazard Level": {"long_name": "ESL RCP4.5 Time Horizon2100 - Hazard Level"},
    "ESL_RCP4.5_Time_Horizon2100 - Risk Level": {"long_name": "ESL RCP4.5 Time Horizon2100 - Risk Level"},
    "ESL_RCP4.5_Time_Horizon2100 - Risk Level Class": {"long_name": "ESL RCP4.5 Time Horizon2100 - Risk Level Class"},
    "ESL_RCP4.5_Time_Horizon2100 value": {"long_name": "ESL RCP4.5 Time Horizon2100 value"},
    "EWH_5_yr_RP 2100 SSP1-26 Ensemble mean - Hazard Level": {"long_name": "EWH 5 yr RP 2100 SSP1 26 Ensemble mean - Hazard Level"},
    "EWH_5_yr_RP 2100 SSP1-26 Ensemble mean - Risk Level": {"long_name": "EWH 5 yr RP 2100 SSP1 26 Ensemble mean - Risk Level"},
    "EWH_5_yr_RP 2100 SSP1-26 Ensemble mean - Risk Level Class": {"long_name": "EWH 5 yr RP 2100 SSP1 26 Ensemble mean - Risk Level Class"},
    "EWH_5_yr_RP 2100 SSP1-26 Ensemble mean value": {"long_name": "EWH 5 yr RP 2100 SSP1 26 Ensemble mean value"}
}

# Add or update attributes in the xarray Dataset ds
for var in dataset_attributes:
    if var in ds:
        ds[var].attrs.update(dataset_attributes[var])
ds

<IPython.core.display.Javascript object>

In [None]:
# Update dataset_attributes with suitable standard, long names, and units for all attributes
for attribute in dataset_attributes:
    units = "1"  # Default units
    
    if "ESL" in attribute:
        units = "m"
    elif "EWH" in attribute:
        units = "m"
    elif "Change_rate" in attribute:
        units = "m/yr"
    elif "Land_Subsidence_" in attribute:
        units = "1"
    
    dataset_attributes[attribute]["units"] = units

# Add or update attributes in the xarray Dataset ds
for var in dataset_attributes:
    if var in ds:
        ds[var].attrs.update(dataset_attributes[var])
ds

<IPython.core.display.Javascript object>

In [None]:
renamed_vars = {var: var.replace("-", "_").replace(".", "_").replace(" ","_").replace("(","").replace(")","").replace("%","perc").replace("β","B").replace("^","").replace("÷","_") for var in ds.variables}
ds = ds.rename(renamed_vars)
ds

<IPython.core.display.Javascript object>

In [None]:
# set some data variables to coordinates to *avoid duplication* of dimensions in later stage
ds = ds.set_coords(['transect_id', 'Uncertainty_Columns', 'Uncertainty_Columns_length', 'Confindence_perc', 'lat',"lon", 'country', 'continent','country_id'])
ds

<IPython.core.display.Javascript object>

In [None]:
# Write the xarray dataset to a netCDF file
#Compliant netcdf
dataset_dir_path_CF=str(dataset_dir_path).replace("original","final")


ds.to_netcdf(path=dataset_dir_path_CF)

<IPython.core.display.Javascript object>

### Check CF compliancy altered NetCDF files

In [None]:
# open datasets (only first file, rest is the same)
ds = xr.open_dataset(dataset_dir_path_CF)

# check original dataset
ds

<IPython.core.display.Javascript object>

In [None]:
%%capture cap --no-stderr
# check original CF compliancy (for first file)

check_compliancy(testfile=dataset_dir_path_CF, 
                 working_dir=CF_dir
                 )

<IPython.core.display.Javascript object>

In [None]:
# save original CF compliancy (for first file)
save_compliancy(cap, testfile=dataset_dir_path_CF, working_dir=CF_dir)



<IPython.core.display.Javascript object>

### write data to Zarr files

In [None]:
# # export to zarr in write mode (to overwrite if exists)
ds.to_zarr(str(dataset_dir_path).replace("original","final").replace(".nc", ".zarr"), mode="w")

<xarray.backends.zarr.ZarrStore at 0x2699ee873c0>

<IPython.core.display.Javascript object>