## File to make /Coastal risk csv cf compliant and export to zar

In [1]:
# Optional; code formatter, installed as jupyter lab extension
#%load_ext lab_black
# Optional; code formatter, installed as jupyter notebook extension
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# Import standard packages
import os
import pathlib
import sys

import numpy as np
#import geopandas as gpd
 
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
import glob
import itertools
import json
import copy
from itertools import chain
from shapely import wkb

# Import custom functionality
sys.path.append('c:/Windows/System32/coclicodata/src')#CLENMAR ADDED

from coclicodata.drive_config import p_drive
from coclicodata.etl.cf_compliancy_checker import check_compliancy, save_compliancy

# Define (local and) remote drives
gca_data_dir = p_drive.joinpath("11209197-018-global-coastal-atlas","MSc_students","ClenmarRowe")

# Workaround to the Windows OS (10) udunits error after installation of cfchecker: https://github.com/SciTools/iris/issues/404
os.environ["UDUNITS2_XML_PATH"] = str(
    pathlib.Path().home().joinpath(  # change to the udunits2.xml file dir in your Python installation
        r"AppData\Local\miniconda3\pkgs\udunits2-2.2.28-h892ecd3_0\Library\share\udunits\udunits2.xml" ###Changed from anaconda to miniconda  - added the new package udunits file
    )
)


<IPython.core.display.Javascript object>

In [3]:
# Project paths & files (manual input)
dataset_dir = gca_data_dir.joinpath(r"Example_Coastalrisk_STAC")
dataset_dir_path = dataset_dir.joinpath("CoastalRisk.nc")
CF_dir = gca_data_dir.joinpath("Example_Coastalrisk_STAC","CFclen")  # directory to save output CF check files

<IPython.core.display.Javascript object>

In [4]:
filename=dataset_dir.joinpath("Coastalrisk example","Coastal risk.csv")

df = pd.read_csv(filename, index_col=None, header=0)

ds = xr.Dataset.from_dataframe(df)

<IPython.core.display.Javascript object>

In [5]:
df

Unnamed: 0,X,Y,RPresent,R2050,R2100
0,-26.3728,-58.4052,High Risk,Very High Risk,Very High Risk
1,-26.3890,-58.4079,High Risk,Very High Risk,Very High Risk
2,-26.4057,-58.4098,High Risk,Very High Risk,Very High Risk
3,-26.4227,-58.4112,High Risk,Very High Risk,Very High Risk
4,-26.4396,-58.4123,Very Low Risk,Very Low Risk,Very Low Risk
...,...,...,...,...,...
783433,48.1141,67.6111,Medium Risk,High Risk,Very High Risk
783434,48.1203,67.6145,Low Risk,Medium Risk,High Risk
783435,47.8598,67.5380,Low Risk,Medium Risk,High Risk
783436,47.8671,67.5357,Medium Risk,High Risk,Very High Risk


<IPython.core.display.Javascript object>

In [6]:
ds

<IPython.core.display.Javascript object>

In [7]:
# Write the xarray dataset to a netCDF file
ds.to_netcdf(dataset_dir_path)


<IPython.core.display.Javascript object>

### Check CF compliancy original Risk NetCDF file

In [8]:
ds=xr.open_dataset(dataset_dir_path)
ds

<IPython.core.display.Javascript object>

In [9]:
%%capture cap --no-stderr
# check original CF compliancy (for first file)

check_compliancy(dataset_dir_path, 
                 working_dir=CF_dir
                 )


<IPython.core.display.Javascript object>

In [10]:
# save original CF compliancy (for first file)
save_compliancy(cap, dataset_dir_path, working_dir=CF_dir)



<IPython.core.display.Javascript object>

In [11]:
import json

# NetCDF attribute alterations by means of metadata template

### create a metadata template file with all of the attributes, descriptions etc
### creates a loop through all the dictionaries key-value, where attr name is the key and attr_val is the value
### add these attributes to the xarray attribributes by using the .attrs[] notation

f_global = open(gca_data_dir.joinpath("Example_Coastalrisk_STAC", "metadata_CoastalRiskIndex.json"))
meta_global = json.load(f_global)

<IPython.core.display.Javascript object>

In [12]:
for attr_name, attr_val in meta_global.items():
    if attr_name == 'PROVIDERS':
        attr_val = json.dumps(attr_val)
    ds.attrs[attr_name] = attr_val
ds.attrs['Conventions'] = "CF-1.8"
ds

<IPython.core.display.Javascript object>

In [13]:
# NetCDF variable and dimension alterations (per dataset)

###iterates through the ds_list list of xarrays, renames the dimensions, renames and move long lat from variables to coordinates,
###  changes the shape of the data array from a 12*124 2D matrix to a 4*3*124
### gives the dimensions names the rows are called ensemble bounds L-Low C-Center and U-upper

# rename or swap dimension names, the latter in case the name already exists as coordinate
ds = ds.rename_dims({"index": "nstations"})

# rename variables, if necessary
ds = ds.rename_vars(
    {"X": "lon", "Y": "lat"} #"index":"nstations", 
)

# set some data variables to coordinates to avoid duplication of dimensions in later stage
ds = ds.set_coords(["lon", "lat"])
ds

<IPython.core.display.Javascript object>

In [14]:
 # remove variables
ds = ds.drop(["index"])
ds

  ds = ds.drop(["index"])


<IPython.core.display.Javascript object>

In [15]:
#ds = ds.expand_dims(dim={"Horizon": [2020, 2050, 2100]})
ds

<IPython.core.display.Javascript object>

In [16]:
# add or change certain variable / coordinate attributes
### dataset attributes is a dictionary of dictionaries
dataset_attributes = {
   # "Horizon": {"long_name": "horizon", "units": "yr"}, # set to 1 if no unit
    "lon": {"standard_name": "longitude", "long_name": "longitude", "units": "degrees_east"},
    "lat": {"standard_name": "latitude", "long_name": "latitude", "units": "degrees_north"},
    "RPresent": {"standard_name": "Present", "long_name": "Present_Climate", "units": "1"},
    "R2050": {"standard_name": "Future_2050", "long_name": "2050_Future_Climate", "units": "1"},
    "R2100": {"standard_name": "Future_2100", "long_name": "2100_Future_Climate", "units": "1"}
}  # specify custom (CF convention) attributes

 # add / overwrite attributes
for k, v in dataset_attributes.items():
    try:
        ds[k].attrs = dataset_attributes[k]
    except:
        continue
ds

<IPython.core.display.Javascript object>

In [17]:
# # add variable
# data_array = [ds['RPresent'], ds['R2050'], ds['R2100']]

# #ds = ds.assign(Time_Horizon=(["TimeHoz"], data_array))
# ds
# data_array

<IPython.core.display.Javascript object>

In [18]:
ds.to_netcdf(dataset_dir.joinpath("CFsemitest.nc"))

<IPython.core.display.Javascript object>

In [19]:
%%capture cap --no-stderr
# check original CF compliancy (for first file)

check_compliancy(dataset_dir.joinpath("CFsemitest.nc"), 
                 working_dir=CF_dir
                 )


<IPython.core.display.Javascript object>

In [20]:

# save original CF compliancy (for first file)
save_compliancy(cap, dataset_dir.joinpath("CFsemitest.nc"), working_dir=CF_dir)



<IPython.core.display.Javascript object>

### From the changes made, we can see that we are left with 0 warnings and 6 erors. 
### If we want to delve deeper into why there is an issue we have to open the .check file in a txt document
#### In this document we find that the variable type and variable names are the issue
##### Only bytes can be used, so we will first try to convert all the strings to a byte object
##### If that doesnt work we will just replace the strings with integers moving from 1-5 indicative of risk

In [21]:

ds['R2050'].values
len(ds['R2050'].values)
len(k)


5

<IPython.core.display.Javascript object>

In [22]:
# newrp2050=[]
# for i in ds['R2050'].values:
#     for k in i:
#         k= k.encode('utf-8')
#         newrp2050.append(k)
# newrp2050



RPresent_new=[]
for i in ds['RPresent'].values:
        i= i.encode('utf-8')
        RPresent_new.append(i)

R2050_new=[]
for i in ds['R2050'].values:
        i= i.encode('utf-8')
        R2050_new.append(i)

R2100_new=[]
for i in ds['R2100'].values:
        i= i.encode('utf-8')
        R2100_new.append(i)



# Assign 'newrp2050' as a new variable in the Dataset
ds = ds.assign(RPresent_new=(('nstations'),RPresent_new))
ds = ds.assign(R2050_new=(('nstations'),R2050_new))
ds = ds.assign(R2100_new=(('nstations'),R2100_new))
ds


<IPython.core.display.Javascript object>

In [23]:
ds = ds.drop(["RPresent", "R2050", "R2100"])
ds

  ds = ds.drop(["RPresent", "R2050", "R2100"])


<IPython.core.display.Javascript object>

In [24]:
dataset_variables = {
    "RPresent_new": {"standard_name": "Present", "long_name": "Present_Climate", "units": "1"},
    "R2050_new": {"standard_name": "Future_2050", "long_name": "2050_Future_Climate", "units": "1"},
    "R2100_new": {"standard_name": "Future_2100", "long_name": "2100_Future_Climate", "units": "1"}
}  # specify custom (CF convention) attributes

 # add / overwrite attributes
for k, v in dataset_variables.items():
    try:
        ds[k].attrs = dataset_variables[k]
    except:
        continue
ds

<IPython.core.display.Javascript object>

In [25]:
ds.to_netcdf(dataset_dir.joinpath("CFsemitest.nc"))

<IPython.core.display.Javascript object>

In [26]:
%%capture cap --no-stderr
# check original CF compliancy (for first file)

check_compliancy(dataset_dir.joinpath("CFsemitest.nc"), 
                 working_dir=CF_dir
                 )


<IPython.core.display.Javascript object>

In [28]:
%%capture cap --no-stderr
# check original CF compliancy (for first file)

check_compliancy(dataset_dir.joinpath("CFsemitest.nc"), 
                 working_dir=CF_dir
                 )


<IPython.core.display.Javascript object>

In [29]:
ds.to_netcdf(dataset_dir.joinpath("CFsemitest.nc"))

<IPython.core.display.Javascript object>

In [30]:
%%capture cap --no-stderr
# check original CF compliancy (for first file)

check_compliancy(dataset_dir.joinpath("CFsemitest.nc"), 
                 working_dir=CF_dir
                 )


<IPython.core.display.Javascript object>

In [31]:

# save original CF compliancy (for first file)
save_compliancy(cap, dataset_dir.joinpath("CFsemitest.nc"), working_dir=CF_dir)



<IPython.core.display.Javascript object>

### Yay!
##### Fortunately, converting from string to bytes work and now the only error we have is the variable's standard name
##### We can give them the fake name of "Time" until it works, along with the corresponding unit of measurement "yr

In [38]:
dataset_variables = {
    "RPresent_new": {"standard_name": "time", "long_name": "Present_Climate", "units": "yr"},
    "R2050_new": {"standard_name": "time", "long_name": "2050_Future_Climate", "units": "yr"},
    "R2100_new": {"standard_name": "time", "long_name": "2100_Future_Climate", "units": "yr"}
}  # specify custom (CF convention) attributes

 # add / overwrite attributes
for k, v in dataset_variables.items():
    try:
        ds[k].attrs = dataset_variables[k]
    except:
        continue
ds

<IPython.core.display.Javascript object>

In [39]:
ds.to_netcdf(dataset_dir.joinpath("CFfinaltest.nc"))


<IPython.core.display.Javascript object>

In [40]:

%%capture cap --no-stderr
# check original CF compliancy (for first file)

check_compliancy(dataset_dir.joinpath("CFfinaltest.nc"), 
                 working_dir=CF_dir
                 )


<IPython.core.display.Javascript object>

In [41]:


# save original CF compliancy (for first file)
save_compliancy(cap, dataset_dir.joinpath("CFfinaltest.nc"), working_dir=CF_dir)




<IPython.core.display.Javascript object>

In [43]:
# export to Zarr in one-liner (as rp is the temporal dimension)

# export to zarr in write mode (to overwrite if exists)
ds.to_zarr(str(dataset_dir.joinpath("CFFinaltest.nc")).replace(".nc", ".zarr"), mode="w")

<xarray.backends.zarr.ZarrStore at 0x1f5a0104740>

<IPython.core.display.Javascript object>