# Global Future Waves - Meucci

Notebook environment to migrate the Meucci wave height data (delivered as a MATLAB `.mat` file) to CF compliant NetCDF and zarr

In [1]:
# Optional: code formatter for JupyterLab (installed as a jupyter lab extension);
# uncomment the next line when working in JupyterLab.
#%load_ext lab_black
# Optional: code formatter for the classic notebook (installed as a jupyter notebook extension).
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# Import standard packages
import os
import pathlib
import sys

import numpy as np
#import geopandas as gpd
 
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
import glob
import itertools
import json
import copy
from itertools import chain
from shapely import wkb
import scipy.io

# Import custom functionality
# sys.path.append('c:/Windows/System32/coclicodata/src')#CLENMAR ADDED

from coclicodata.drive_config import p_drive
from coclicodata.etl.cf_compliancy_checker import check_compliancy, save_compliancy

# Root directory of the source data on the (local and) remote drives
gca_data_dir = r"P:\11209197-018-global-coastal-atlas\MSc_students\ClenmarRowe\Data\All_Datasets\Orig_Datasets\01_Hazards\02_Future\Extreme_Wave_Height"
# Alternative based on the coclicodata drive configuration:
# gca_data_dir = p_drive.joinpath("11209197-018-global-coastal-atlas","MSc_students","ClenmarRowe")

# Workaround for the Windows OS (10) udunits error after installation of cfchecker
# (https://github.com/SciTools/iris/issues/404): point UDUNITS2_XML_PATH at the
# udunits2.xml file of the local Python installation. The relative path below
# belongs to a miniconda install (it was previously an anaconda path); change it
# to the udunits2.xml location of your own installation.
udunits_xml = pathlib.Path.home() / (
    r"AppData\Local\miniconda3\pkgs\udunits2-2.2.28-h892ecd3_0\Library\share\udunits\udunits2.xml"
)
os.environ["UDUNITS2_XML_PATH"] = str(udunits_xml)


<IPython.core.display.Javascript object>

In [3]:
# Project paths & files (manual input)
# Folder holding the delivered data, built on top of the data root directory
dataset_dir = pathlib.Path(gca_data_dir) / "From Alberto_updated_fixing Low waves near NL issue"
# NetCDF file that this notebook writes as the "original" (pre-CF-fix) dataset
dataset_dir_path = dataset_dir / "Delta_Hs_21st_century_Meucci_original.nc"
# Directory to save output CF check files
CF_dir = dataset_dir / "CF"
CF_dir

WindowsPath('P:/11209197-018-global-coastal-atlas/MSc_students/ClenmarRowe/Data/All_Datasets/Orig_Datasets/01_Hazards/02_Future/Extreme_Wave_Height/From Alberto_updated_fixing Low waves near NL issue/CF')

<IPython.core.display.Javascript object>

In [4]:
# Load the delivered MATLAB .mat file into a dictionary
# (MATLAB header entries plus one array per stored variable)
filename = dataset_dir / "ToRosh_v2.mat"
mat = scipy.io.loadmat(filename)
mat

{'__header__': b'MATLAB 5.0 MAT-file, Platform: PCWIN64, Created on: Sun Nov  7 15:59:18 2021',
 '__version__': '1.0',
 '__globals__': [],
 'atGTSR_POT_EXP_RP100_ZtoHs_ens1000_CMIP5_ww3_hs_19792005': array([[  5.260417  ,  -4.042969  ,   3.50721715],
        [ 31.58854   , 131.3965    ,   7.18969115],
        [ 13.56771   , -89.85352   ,   2.8184764 ],
        ...,
        [-43.30729   , -65.00977   ,   6.87677158],
        [-47.23958   , -74.23828   ,  12.00625439],
        [-50.23437   , -68.61328   ,   5.23305381]]),
 'atGTSR_POT_EXP_RP100_ZtoHs_ens1000_CMIP5_ww3_hs_end21C_RCP45': array([[  5.260417  ,  -4.042969  ,   3.59824671],
        [ 31.58854   , 131.3965    ,   7.47399303],
        [ 13.56771   , -89.85352   ,   2.8693292 ],
        ...,
        [-43.30729   , -65.00977   ,   6.78598459],
        [-47.23958   , -74.23828   ,  12.47201505],
        [-50.23437   , -68.61328   ,   5.16333437]]),
 'atGTSR_POT_EXP_RP100_ZtoHs_ens1000_CMIP5_ww3_hs_end21C_RCP85': array([[  5.260417

<IPython.core.display.Javascript object>

In [5]:
# All keys of the loaded .mat dictionary (in file order) and how many there are
cols = list(mat)
cols, len(cols)

(['__header__',
  '__version__',
  '__globals__',
  'atGTSR_POT_EXP_RP100_ZtoHs_ens1000_CMIP5_ww3_hs_19792005',
  'atGTSR_POT_EXP_RP100_ZtoHs_ens1000_CMIP5_ww3_hs_end21C_RCP45',
  'atGTSR_POT_EXP_RP100_ZtoHs_ens1000_CMIP5_ww3_hs_end21C_RCP85',
  'atGTSR_POT_EXP_RP10_ZtoHs_ens1000_CMIP5_ww3_hs_19792005',
  'atGTSR_POT_EXP_RP10_ZtoHs_ens1000_CMIP5_ww3_hs_end21C_RCP85',
  'atGTSR_POT_EXP_RP20_ZtoHs_ens1000_CMIP5_ww3_hs_19792005',
  'atGTSR_POT_EXP_RP20_ZtoHs_ens1000_CMIP5_ww3_hs_end21C_RCP85',
  'atGTSR_POT_EXP_RP50_ZtoHs_ens1000_CMIP5_ww3_hs_19792005',
  'atGTSR_POT_EXP_RP50_ZtoHs_ens1000_CMIP5_ww3_hs_end21C_RCP85',
  'atGTSR_changes_RP100_Hs_hist_end21C_RCP85',
  'atGTSR_changes_RP10_Hs_hist_end21C_RCP85',
  'atGTSR_changes_RP20_Hs_hist_end21C_RCP85',
  'atGTSR_changes_RP50_Hs_hist_end21C_RCP85'],
 16)

<IPython.core.display.Javascript object>

In [6]:
# Preview the array stored under the last key of the .mat dictionary
mat[cols[-1]]

array([[ 5.26041700e+00, -4.04296900e+00,  1.60086459e-01],
       [ 3.15885400e+01,  1.31396500e+02,  2.42841191e-01],
       [ 1.35677100e+01, -8.98535200e+01,  1.98058293e-01],
       ...,
       [-4.33072900e+01, -6.50097700e+01, -3.91518899e-01],
       [-4.72395800e+01, -7.42382800e+01,  1.65603790e-01],
       [-5.02343700e+01, -6.86132800e+01,  1.14539074e-01]])

<IPython.core.display.Javascript object>

In [7]:
# Seed the table with the first data array. cols[0:3] are the MATLAB metadata
# keys, so cols[3] is the first variable; its three columns are labelled
# lat, lon and the variable's own name.
first_var = cols[3]
df = pd.DataFrame(mat[first_var], columns=["lat", "lon", first_var])
df

Unnamed: 0,lat,lon,atGTSR_POT_EXP_RP100_ZtoHs_ens1000_CMIP5_ww3_hs_19792005
0,5.260417,-4.042969,3.507217
1,31.588540,131.396500,7.189691
2,13.567710,-89.853520,2.818476
3,16.848960,-99.902340,3.246715
4,51.848960,-176.660200,14.488858
...,...,...,...
9861,-35.755210,-57.275390,1.825781
9862,-39.869790,-62.080080,3.377378
9863,-43.307290,-65.009770,6.876772
9864,-47.239580,-74.238280,12.006254


<IPython.core.display.Javascript object>

In [8]:
# Append the value column of every remaining array in the .mat file to `df`.
# cols[0:3] are the MATLAB metadata keys and cols[3] already seeded `df`,
# so the remaining variables start at position 4.
for var_name in cols[4:]:
    df_sub = pd.DataFrame(mat[var_name], columns=["lat", "lon", var_name])

    # The arrays are merged purely by row position, so make sure every array
    # describes the same stations in the same order before copying its values;
    # otherwise the values would silently end up at the wrong coordinates.
    same_stations = len(df_sub) == len(df) and np.allclose(
        df_sub[["lat", "lon"]].to_numpy(), df[["lat", "lon"]].to_numpy()
    )
    if not same_stations:
        raise ValueError(
            f"lat/lon of '{var_name}' do not match those of '{cols[3]}'; "
            "cannot merge the arrays by row position"
        )

    df[var_name] = df_sub[var_name]

df

Unnamed: 0,lat,lon,atGTSR_POT_EXP_RP100_ZtoHs_ens1000_CMIP5_ww3_hs_19792005,atGTSR_POT_EXP_RP100_ZtoHs_ens1000_CMIP5_ww3_hs_end21C_RCP45,atGTSR_POT_EXP_RP100_ZtoHs_ens1000_CMIP5_ww3_hs_end21C_RCP85,atGTSR_POT_EXP_RP10_ZtoHs_ens1000_CMIP5_ww3_hs_19792005,atGTSR_POT_EXP_RP10_ZtoHs_ens1000_CMIP5_ww3_hs_end21C_RCP85,atGTSR_POT_EXP_RP20_ZtoHs_ens1000_CMIP5_ww3_hs_19792005,atGTSR_POT_EXP_RP20_ZtoHs_ens1000_CMIP5_ww3_hs_end21C_RCP85,atGTSR_POT_EXP_RP50_ZtoHs_ens1000_CMIP5_ww3_hs_19792005,atGTSR_POT_EXP_RP50_ZtoHs_ens1000_CMIP5_ww3_hs_end21C_RCP85,atGTSR_changes_RP100_Hs_hist_end21C_RCP85,atGTSR_changes_RP10_Hs_hist_end21C_RCP85,atGTSR_changes_RP20_Hs_hist_end21C_RCP85,atGTSR_changes_RP50_Hs_hist_end21C_RCP85
0,5.260417,-4.042969,3.507217,3.598247,3.676720,2.942206,3.077181,3.111709,3.256101,3.334575,3.494661,0.169503,0.134975,0.144392,0.160086
1,31.588540,131.396500,7.189691,7.473993,7.491762,5.679337,5.791874,6.135405,6.301248,6.733624,6.976465,0.302071,0.112536,0.165843,0.242841
2,13.567710,-89.853520,2.818476,2.869329,3.032593,2.398272,2.564212,2.524065,2.706065,2.692683,2.890741,0.214117,0.165941,0.182000,0.198058
3,16.848960,-99.902340,3.246715,3.194796,3.301518,2.687143,2.756369,2.854438,2.920779,3.079420,3.137108,0.054803,0.069225,0.066341,0.057688
4,51.848960,-176.660200,14.488858,15.043621,15.314552,11.663437,12.166594,12.514934,13.108401,13.637361,14.372745,0.825694,0.503157,0.593467,0.735384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9861,-35.755210,-57.275390,1.825781,1.891209,1.836997,1.507984,1.511723,1.603323,1.608931,1.730442,1.739789,0.011216,0.003739,0.005608,0.009347
9862,-39.869790,-62.080080,3.377378,3.291737,3.317112,2.746170,2.695420,2.936484,2.882562,3.187064,3.129970,-0.060266,-0.050750,-0.053922,-0.057094
9863,-43.307290,-65.009770,6.876772,6.785985,6.445533,5.492270,5.191538,5.906486,5.571709,6.462556,6.071037,-0.431238,-0.300732,-0.334777,-0.391519
9864,-47.239580,-74.238280,12.006254,12.472015,12.233960,9.863755,9.915507,10.515820,10.608972,11.364540,11.530143,0.227705,0.051751,0.093152,0.165604


<IPython.core.display.Javascript object>

In [9]:
# Percentage change of the RP100 Hs at the end of the 21st century relative to
# the 1979-2005 values, for RCP45 and RCP85:
#     100 * (future - historical) / historical
# The columns are selected by name instead of by position so the result cannot
# silently change if the column order of `df` ever does.
rp100_hist = df["atGTSR_POT_EXP_RP100_ZtoHs_ens1000_CMIP5_ww3_hs_19792005"]
rp100_rcp45 = df["atGTSR_POT_EXP_RP100_ZtoHs_ens1000_CMIP5_ww3_hs_end21C_RCP45"]
rp100_rcp85 = df["atGTSR_POT_EXP_RP100_ZtoHs_ens1000_CMIP5_ww3_hs_end21C_RCP85"]

df["Meucci_RP100_percent_change_end21C_RCP45"] = 100 * (rp100_rcp45 - rp100_hist) / rp100_hist
df["Meucci_RP100_percent_change_end21C_RCP85"] = 100 * (rp100_rcp85 - rp100_hist) / rp100_hist
df

Unnamed: 0,lat,lon,atGTSR_POT_EXP_RP100_ZtoHs_ens1000_CMIP5_ww3_hs_19792005,atGTSR_POT_EXP_RP100_ZtoHs_ens1000_CMIP5_ww3_hs_end21C_RCP45,atGTSR_POT_EXP_RP100_ZtoHs_ens1000_CMIP5_ww3_hs_end21C_RCP85,atGTSR_POT_EXP_RP10_ZtoHs_ens1000_CMIP5_ww3_hs_19792005,atGTSR_POT_EXP_RP10_ZtoHs_ens1000_CMIP5_ww3_hs_end21C_RCP85,atGTSR_POT_EXP_RP20_ZtoHs_ens1000_CMIP5_ww3_hs_19792005,atGTSR_POT_EXP_RP20_ZtoHs_ens1000_CMIP5_ww3_hs_end21C_RCP85,atGTSR_POT_EXP_RP50_ZtoHs_ens1000_CMIP5_ww3_hs_19792005,atGTSR_POT_EXP_RP50_ZtoHs_ens1000_CMIP5_ww3_hs_end21C_RCP85,atGTSR_changes_RP100_Hs_hist_end21C_RCP85,atGTSR_changes_RP10_Hs_hist_end21C_RCP85,atGTSR_changes_RP20_Hs_hist_end21C_RCP85,atGTSR_changes_RP50_Hs_hist_end21C_RCP85,Meucci_RP100_percent_change_end21C_RCP45,Meucci_RP100_percent_change_end21C_RCP85
0,5.260417,-4.042969,3.507217,3.598247,3.676720,2.942206,3.077181,3.111709,3.256101,3.334575,3.494661,0.169503,0.134975,0.144392,0.160086,2.595492,4.832986
1,31.588540,131.396500,7.189691,7.473993,7.491762,5.679337,5.791874,6.135405,6.301248,6.733624,6.976465,0.302071,0.112536,0.165843,0.242841,3.954299,4.201443
2,13.567710,-89.853520,2.818476,2.869329,3.032593,2.398272,2.564212,2.524065,2.706065,2.692683,2.890741,0.214117,0.165941,0.182000,0.198058,1.804266,7.596909
3,16.848960,-99.902340,3.246715,3.194796,3.301518,2.687143,2.756369,2.854438,2.920779,3.079420,3.137108,0.054803,0.069225,0.066341,0.057688,-1.599124,1.687964
4,51.848960,-176.660200,14.488858,15.043621,15.314552,11.663437,12.166594,12.514934,13.108401,13.637361,14.372745,0.825694,0.503157,0.593467,0.735384,3.828894,5.698820
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9861,-35.755210,-57.275390,1.825781,1.891209,1.836997,1.507984,1.511723,1.603323,1.608931,1.730442,1.739789,0.011216,0.003739,0.005608,0.009347,3.583601,0.614332
9862,-39.869790,-62.080080,3.377378,3.291737,3.317112,2.746170,2.695420,2.936484,2.882562,3.187064,3.129970,-0.060266,-0.050750,-0.053922,-0.057094,-2.535731,-1.784404
9863,-43.307290,-65.009770,6.876772,6.785985,6.445533,5.492270,5.191538,5.906486,5.571709,6.462556,6.071037,-0.431238,-0.300732,-0.334777,-0.391519,-1.320198,-6.270940
9864,-47.239580,-74.238280,12.006254,12.472015,12.233960,9.863755,9.915507,10.515820,10.608972,11.364540,11.530143,0.227705,0.051751,0.093152,0.165604,3.879317,1.896555


<IPython.core.display.Javascript object>

In [10]:
# Convert the table to an xarray Dataset; the DataFrame index becomes the
# "index" dimension and every column becomes a data variable
ds = df.to_xarray()
ds

<IPython.core.display.Javascript object>

In [11]:
# Write the xarray dataset to the "original" NetCDF file.
# HDF5 file locking is switched off through the environment before writing.
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"

original_nc_path = str(dataset_dir_path)
ds.to_netcdf(original_nc_path)

<IPython.core.display.Javascript object>

### Check CF compliancy of the original NetCDF file

In [12]:
# Open the original (not yet CF-corrected) NetCDF file written above
ds = xr.open_dataset(dataset_dir_path)

# Display the original dataset for inspection
ds

<IPython.core.display.Javascript object>

In [13]:
%%capture cap --no-stderr
# Run the CF compliancy check on the original NetCDF file; the cell output is
# captured in `cap` so that it can be saved in the next cell

check_compliancy(testfile=dataset_dir_path, 
                 working_dir=CF_dir
                 )

<IPython.core.display.Javascript object>

In [14]:
# Save the captured CF compliancy output of the original NetCDF file (working directory: CF_dir)
save_compliancy(cap, testfile=dataset_dir_path, working_dir=CF_dir)



<IPython.core.display.Javascript object>

### Make CF compliant alterations to the NetCDF files (dataset dependent)

In [15]:
# Re-open the original NetCDF file as the starting point for the CF alterations below
ds = xr.open_dataset(dataset_dir_path)

ds

<IPython.core.display.Javascript object>

In [16]:
# NetCDF attribute alterations by means of metadata template
metadata_json = dataset_dir.joinpath("metadata_Meucci_21st_century_Wave_heights.json")

# Read the global-attribute template. The context manager closes the file
# handle again (it used to stay open), and the encoding is set explicitly
# because JSON is UTF-8 while the Windows default encoding is not.
with open(metadata_json, encoding="utf-8") as f_global:
    meta_global = json.load(f_global)

# Copy every template entry into the global attributes of the dataset
for attr_name, attr_val in meta_global.items():
    if attr_name == "PROVIDERS":
        # PROVIDERS is stored as a JSON string instead of the parsed object
        # (presumably because nested structures cannot be written as NetCDF
        # attributes - TODO confirm)
        attr_val = json.dumps(attr_val)
    ds.attrs[attr_name] = attr_val

ds.attrs["Conventions"] = "CF-1.8"
ds

<IPython.core.display.Javascript object>

In [17]:
# Rename the "index" dimension to "nstations"
ds = ds.rename_dims(index="nstations")
ds

<IPython.core.display.Javascript object>

In [18]:

# # rename variables, if necessary
# ds = ds.rename_vars(
#        {"X": "lon", "Y": "lat"
#          #,"index":"nstations" }
#        }
#   )
# ds

<IPython.core.display.Javascript object>

In [19]:

# Promote lon / lat from data variables to coordinates to avoid duplication of
# dimensions in a later stage
coordinate_names = ["lon", "lat"]
ds = ds.set_coords(coordinate_names)
ds

<IPython.core.display.Javascript object>

In [20]:
# Remove the leftover "index" variable from the coordinates list
ds = ds.drop_vars(["index"])
ds

<IPython.core.display.Javascript object>

In [21]:
# add or change certain variable / coordinate attributes
### dataset attributes is a dictionary of dictionaries: variable name -> attributes
dataset_attributes = {
    "lon": {"standard_name": "longitude", "long_name": "longitude", "units": "degrees_east"},
    "lat": {"standard_name": "latitude", "long_name": "latitude", "units": "degrees_north"},
    # NOTE(review): the dimension is called "nstations" and has no coordinate
    # variable, so this entry is never applied - confirm whether it is still needed
    "stations": {"long_name": "stations", "units": "1"}, # set to 1 if no unit
    "atGTSR_POT_EXP_RP100_ZtoHs_ens1000_CMIP5_ww3_hs_19792005": {"long_name": "RP100 Z to Hs 1979-2005", "units": "m"},
    "atGTSR_POT_EXP_RP100_ZtoHs_ens1000_CMIP5_ww3_hs_end21C_RCP45": {"long_name": "RP100 Z to Hs end 21C RCP45", "units": "m"},
    "atGTSR_POT_EXP_RP100_ZtoHs_ens1000_CMIP5_ww3_hs_end21C_RCP85": {"long_name": "RP100 Z to Hs end 21C RCP85", "units": "m"},
    "atGTSR_POT_EXP_RP10_ZtoHs_ens1000_CMIP5_ww3_hs_19792005": {"long_name": "RP10 Z to Hs 1979-2005", "units": "m"},
    "atGTSR_POT_EXP_RP10_ZtoHs_ens1000_CMIP5_ww3_hs_end21C_RCP85": {"long_name": "RP10 Z to Hs end 21C RCP85", "units": "m"},
    "atGTSR_POT_EXP_RP20_ZtoHs_ens1000_CMIP5_ww3_hs_19792005": {"long_name": "RP20 Z to Hs 1979-2005", "units": "m"},
    "atGTSR_POT_EXP_RP20_ZtoHs_ens1000_CMIP5_ww3_hs_end21C_RCP85": {"long_name": "RP20 Z to Hs end 21C RCP85", "units": "m"},
    "atGTSR_POT_EXP_RP50_ZtoHs_ens1000_CMIP5_ww3_hs_19792005": {"long_name": "RP50 Z to Hs 1979-2005", "units": "m"},
    "atGTSR_POT_EXP_RP50_ZtoHs_ens1000_CMIP5_ww3_hs_end21C_RCP85": {"long_name": "RP50 Z to Hs end 21C RCP85", "units": "m"},
    "atGTSR_changes_RP100_Hs_hist_end21C_RCP85": {"long_name": "Changes RP100 Hs hist end 21C RCP85", "units": "m"},
    "atGTSR_changes_RP10_Hs_hist_end21C_RCP85": {"long_name": "Changes RP10 Hs hist end 21C RCP85", "units": "m"},
    "atGTSR_changes_RP20_Hs_hist_end21C_RCP85": {"long_name": "Changes RP20 Hs hist end 21C RCP85", "units": "m"},
    "atGTSR_changes_RP50_Hs_hist_end21C_RCP85": {"long_name": "Changes RP50 Hs hist end 21C RCP85", "units": "m"},
    "Meucci_RP100_percent_change_end21C_RCP45": {"long_name": "Meucci RP100 percent change end 21C RCP45", "units": "%"},
    "Meucci_RP100_percent_change_end21C_RCP85": {"long_name": "Meucci RP100 percent change end 21C RCP85", "units": "%"}

}  # specify custom (CF convention) attributes

# add / overwrite attributes
for var_name, var_attrs in dataset_attributes.items():
    # Only names that are not in the dataset are skipped, and the skip is
    # reported; any other error now surfaces instead of being swallowed by a
    # bare `except`.
    if var_name not in ds.variables:
        print(f"Skipping attributes for '{var_name}': no such variable in the dataset")
        continue
    ds[var_name].attrs = var_attrs
ds

<IPython.core.display.Javascript object>

In [22]:
# Write the CF-compliant dataset to a NetCDF file next to the original one.
# Only the file name is changed ("original" -> "final"); the directory part of
# the path is left untouched even if it happens to contain the word "original".
# The result is kept as a string, as before.
dataset_dir_path_CF = str(
    dataset_dir_path.with_name(dataset_dir_path.name.replace("original", "final"))
)

ds.to_netcdf(path=dataset_dir_path_CF)

<IPython.core.display.Javascript object>

### Check CF compliancy of the modified NetCDF file

In [23]:
# Open the modified ("final") NetCDF file written in the previous step
ds = xr.open_dataset(dataset_dir_path_CF)

# Display the modified dataset for inspection
ds

<IPython.core.display.Javascript object>

In [24]:
%%capture cap --no-stderr
# Run the CF compliancy check on the modified ("final") NetCDF file; the cell
# output is captured in `cap` so that it can be saved in the next cell

check_compliancy(testfile=dataset_dir_path_CF, 
                 working_dir=CF_dir
                 )

<IPython.core.display.Javascript object>

In [25]:
# Save the captured CF compliancy output of the modified ("final") NetCDF file (working directory: CF_dir)
save_compliancy(cap, testfile=dataset_dir_path_CF, working_dir=CF_dir)



<IPython.core.display.Javascript object>

In [26]:
# Export to zarr in write mode (to overwrite if it exists).
# The store path is derived from the file name only: "original" -> "final" and
# the ".nc" suffix -> ".zarr", so no other part of the path can be altered by
# the text replacement.
zarr_path = dataset_dir_path.with_name(
    dataset_dir_path.name.replace("original", "final")
).with_suffix(".zarr")
ds.to_zarr(str(zarr_path), mode="w")

<xarray.backends.zarr.ZarrStore at 0x2efd98098c0>

<IPython.core.display.Javascript object>