# ERA5 1993- 2023 wave data

Notebook environment to migrate NetCDF files to CF compliant zarr

In [1]:
# Optional; code formatter, installed as jupyter lab extension
#%load_ext lab_black
# Optional; code formatter, installed as jupyter notebook extension
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# Import standard packages
import os
import pathlib
import sys

import numpy as np
#import geopandas as gpd
 
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
import glob
import itertools
import json
import copy
from itertools import chain
from shapely import wkb

# Import custom functionality
# sys.path.append('c:/Windows/System32/coclicodata/src')#CLENMAR ADDED

from coclicodata.drive_config import p_drive
from coclicodata.etl.cf_compliancy_checker import check_compliancy, save_compliancy
#P:\11209197-018-global-coastal-atlas\MSc_students\ClenmarRowe\Data\All_Datasets\Orig_Datasets\01_Hazards\01_Present\Extreme_Wave_Height
# Define (local and) remote drives
# gca_data_dir = pathlib.Path.home().joinpath(r"OneDrive - Stichting Deltares\Documents\GitHub")
gca_data_dir = p_drive.joinpath("11209197-018-global-coastal-atlas", r"MSc_students\ClenmarRowe\Data\All_Datasets", r"Orig_Datasets\01_Hazards\01_Present")

# Workaround to the Windows OS (10) udunits error after installation of cfchecker: https://github.com/SciTools/iris/issues/404
os.environ["UDUNITS2_XML_PATH"] = str(
    pathlib.Path().home().joinpath(  # change to the udunits2.xml file dir in your Python installation
        r"AppData\Local\miniconda3\pkgs\udunits2-2.2.28-h892ecd3_0\Library\share\udunits\udunits2.xml" ###Changed from anaconda to miniconda  - added the new package udunits file
    )
)


<IPython.core.display.Javascript object>

In [3]:
# Project paths & files (manual input)
dataset_dir = gca_data_dir.joinpath("Extreme_Wave_Height")
dataset_dir_path = gca_data_dir.joinpath("Extreme_Wave_Height","ERA5_Hourly_1993-2023.nc")
CF_dir = gca_data_dir.joinpath("Extreme_Wave_Height","CF")  # directory to save output CF check files

<IPython.core.display.Javascript object>

In [4]:
#For SSP1-26

import glob



# Get a list of all netCDF files in the specified directory
files = sorted(glob.glob(os.path.join(dataset_dir ,  "*.nc")))

# Open datasets using lazily in Chunks
datasets = [xr.open_dataset(f, chunks={'time':  1}) for f in files]
datasets


[<xarray.Dataset>
 Dimensions:    (longitude: 720, latitude: 361, time: 96408)
 Coordinates:
   * longitude  (longitude) float32 0.0 0.5 1.0 1.5 ... 358.0 358.5 359.0 359.5
   * latitude   (latitude) float32 90.0 89.5 89.0 88.5 ... -89.0 -89.5 -90.0
   * time       (time) datetime64[ns] 1993-01-01 ... 2003-12-31T23:00:00
 Data variables:
     swh        (time, latitude, longitude) float32 dask.array<chunksize=(1, 361, 720), meta=np.ndarray>
 Attributes:
     Conventions:  CF-1.6
     history:      2024-03-25 17:20:46 GMT by grib_to_netcdf-2.25.1: /opt/ecmw...,
 <xarray.Dataset>
 Dimensions:    (longitude: 720, latitude: 361, time: 48)
 Coordinates:
   * longitude  (longitude) float32 0.0 0.5 1.0 1.5 ... 358.0 358.5 359.0 359.5
   * latitude   (latitude) float32 90.0 89.5 89.0 88.5 ... -89.0 -89.5 -90.0
   * time       (time) datetime64[ns] 2008-01-01 ... 2008-01-02T23:00:00
 Data variables:
     swh        (time, latitude, longitude) float32 dask.array<chunksize=(1, 361, 720), meta=np.

<IPython.core.display.Javascript object>

In [5]:
# files

<IPython.core.display.Javascript object>

In [6]:

# Concatenate the datasets along the time dimension
All_ERA5_30yrs = xr.concat(datasets, dim='time')#.compute()
ds=All_ERA5_30yrs
ds

Unnamed: 0,Array,Chunk
Bytes,305.59 GiB,0.99 MiB
Shape,"(315600, 361, 720)","(1, 361, 720)"
Dask graph,315600 chunks in 11 graph layers,315600 chunks in 11 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 305.59 GiB 0.99 MiB Shape (315600, 361, 720) (1, 361, 720) Dask graph 315600 chunks in 11 graph layers Data type float32 numpy.ndarray",720  361  315600,

Unnamed: 0,Array,Chunk
Bytes,305.59 GiB,0.99 MiB
Shape,"(315600, 361, 720)","(1, 361, 720)"
Dask graph,315600 chunks in 11 graph layers,315600 chunks in 11 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


<IPython.core.display.Javascript object>

In [7]:
print(ds.time.values)

['1993-01-01T00:00:00.000000000' '1993-01-01T01:00:00.000000000'
 '1993-01-01T02:00:00.000000000' ... '2023-12-31T21:00:00.000000000'
 '2023-12-31T22:00:00.000000000' '2023-12-31T23:00:00.000000000']


<IPython.core.display.Javascript object>

In [8]:

# # Save the merged dataset to a new netCDF file in the current directory
# All_ERA5_30yrs.to_zarr('Annual_max_30yrs.nc')
# All_ERA5_30yrs

<IPython.core.display.Javascript object>

### Check CF compliancy original NetCDF files

In [9]:
# %%capture cap --no-stderr
# # check original CF compliancy (for first file)

# check_compliancy(testfile=dataset_dir_path, 
#                  working_dir=CF_dir
#                  )

<IPython.core.display.Javascript object>

In [10]:
# # save original CF compliancy (for first file)
# save_compliancy(cap, testfile=dataset_dir_path, working_dir=CF_dir)

<IPython.core.display.Javascript object>

### Make CF compliant alterations to the NetCDF files (dataset dependent)

In [11]:
ds = ds.sortby('time')
ds

Unnamed: 0,Array,Chunk
Bytes,305.59 GiB,0.99 MiB
Shape,"(315600, 361, 720)","(1, 361, 720)"
Dask graph,315600 chunks in 12 graph layers,315600 chunks in 12 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 305.59 GiB 0.99 MiB Shape (315600, 361, 720) (1, 361, 720) Dask graph 315600 chunks in 12 graph layers Data type float32 numpy.ndarray",720  361  315600,

Unnamed: 0,Array,Chunk
Bytes,305.59 GiB,0.99 MiB
Shape,"(315600, 361, 720)","(1, 361, 720)"
Dask graph,315600 chunks in 12 graph layers,315600 chunks in 12 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


<IPython.core.display.Javascript object>

In [12]:
import json

# NetCDF attribute alterations by means of metadata template
metadata_json=dataset_dir.joinpath("metadata_ERA5_copernicus_1993_2023.json")

f_global = open(metadata_json)
meta_global = json.load(f_global)


for attr_name, attr_val in meta_global.items():
    if attr_name == 'PROVIDERS':
        attr_val = json.dumps(attr_val)
    ds.attrs[attr_name] = attr_val

ds.attrs['Conventions'] = "CF-1.8"
ds

Unnamed: 0,Array,Chunk
Bytes,305.59 GiB,0.99 MiB
Shape,"(315600, 361, 720)","(1, 361, 720)"
Dask graph,315600 chunks in 12 graph layers,315600 chunks in 12 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 305.59 GiB 0.99 MiB Shape (315600, 361, 720) (1, 361, 720) Dask graph 315600 chunks in 12 graph layers Data type float32 numpy.ndarray",720  361  315600,

Unnamed: 0,Array,Chunk
Bytes,305.59 GiB,0.99 MiB
Shape,"(315600, 361, 720)","(1, 361, 720)"
Dask graph,315600 chunks in 12 graph layers,315600 chunks in 12 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


<IPython.core.display.Javascript object>

# Slice to reduce size of CF checked NetCDF

In [13]:
# ds_slice=ds.sel(time=slice('2008-01-01','2008-01-02'))
# ds_slice

<IPython.core.display.Javascript object>

In [14]:
# var=ds['swh'].isel(time=0)
# var.plot()
# plt.show()

<IPython.core.display.Javascript object>

In [15]:
# # Write the xarray dataset to a netCDF file
# #Compliant netcdf
# dataset_dir_path_CF=pathlib.Path().joinpath(str(dataset_dir_path)[0:-3]+"_CF"+".nc")


# # ds.to_netcdf(path=dataset_dir_path_CF)
# #ds_slice.to_netcdf(path=r"C:\Users\rowe\OneDrive - Stichting Deltares\Desktop\delete\era5_coastal_Hs_all_years_CF.nc")
# ds_slice.to_netcdf(path=dataset_dir_path_CF)

<IPython.core.display.Javascript object>

### Check CF compliancy modified NetCDF files

In [16]:
# # open datasets (only first file, rest is the same)
# ds_slice = xr.open_dataset(dataset_dir_path_CF)

# # check original dataset
# ds_slice

<IPython.core.display.Javascript object>

In [17]:
# %%capture cap --no-stderr
# # check original CF compliancy (for first file)

# check_compliancy(testfile=dataset_dir_path_CF, 
#                  working_dir=gca_data_dir.joinpath("Extreme_Wave_Height")
#                  )

<IPython.core.display.Javascript object>

In [18]:
# # save original CF compliancy (for first file)
# save_compliancy(cap, testfile=dataset_dir_path_CF, working_dir=CF_dir)

<IPython.core.display.Javascript object>

In [19]:
ds

Unnamed: 0,Array,Chunk
Bytes,305.59 GiB,0.99 MiB
Shape,"(315600, 361, 720)","(1, 361, 720)"
Dask graph,315600 chunks in 12 graph layers,315600 chunks in 12 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 305.59 GiB 0.99 MiB Shape (315600, 361, 720) (1, 361, 720) Dask graph 315600 chunks in 12 graph layers Data type float32 numpy.ndarray",720  361  315600,

Unnamed: 0,Array,Chunk
Bytes,305.59 GiB,0.99 MiB
Shape,"(315600, 361, 720)","(1, 361, 720)"
Dask graph,315600 chunks in 12 graph layers,315600 chunks in 12 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


<IPython.core.display.Javascript object>

In [20]:
# Resample the data annually and calculate the maximum value for each year
annual_maxima = ds.resample(time='1Y').max(dim='time')

# Print the annual maxima
annual_maxima

Unnamed: 0,Array,Chunk
Bytes,30.74 MiB,0.99 MiB
Shape,"(31, 361, 720)","(1, 361, 720)"
Dask graph,31 chunks in 328 graph layers,31 chunks in 328 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 30.74 MiB 0.99 MiB Shape (31, 361, 720) (1, 361, 720) Dask graph 31 chunks in 328 graph layers Data type float32 numpy.ndarray",720  361  31,

Unnamed: 0,Array,Chunk
Bytes,30.74 MiB,0.99 MiB
Shape,"(31, 361, 720)","(1, 361, 720)"
Dask graph,31 chunks in 328 graph layers,31 chunks in 328 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


<IPython.core.display.Javascript object>

In [21]:
ds_mean_period=ds.sel(time=slice("1993","2014"))
ds_mean_period

Unnamed: 0,Array,Chunk
Bytes,229.20 GiB,0.99 MiB
Shape,"(236712, 361, 720)","(1, 361, 720)"
Dask graph,236712 chunks in 13 graph layers,236712 chunks in 13 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 229.20 GiB 0.99 MiB Shape (236712, 361, 720) (1, 361, 720) Dask graph 236712 chunks in 13 graph layers Data type float32 numpy.ndarray",720  361  236712,

Unnamed: 0,Array,Chunk
Bytes,229.20 GiB,0.99 MiB
Shape,"(236712, 361, 720)","(1, 361, 720)"
Dask graph,236712 chunks in 13 graph layers,236712 chunks in 13 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


<IPython.core.display.Javascript object>

In [22]:
ds_mean=ds_mean_period.mean(dim='time', skipna=True)
ds_mean

Unnamed: 0,Array,Chunk
Bytes,0.99 MiB,0.99 MiB
Shape,"(361, 720)","(361, 720)"
Dask graph,1 chunks in 23 graph layers,1 chunks in 23 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 0.99 MiB 0.99 MiB Shape (361, 720) (361, 720) Dask graph 1 chunks in 23 graph layers Data type float32 numpy.ndarray",720  361,

Unnamed: 0,Array,Chunk
Bytes,0.99 MiB,0.99 MiB
Shape,"(361, 720)","(361, 720)"
Dask graph,1 chunks in 23 graph layers,1 chunks in 23 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


<IPython.core.display.Javascript object>

### Write data to Zarr files

In [23]:
from dask.diagnostics import ProgressBar


# # export to zarr in write mode (to overwrite if exists)

with ProgressBar():
    # ds_mean.to_zarr(r"P:\11209197-018-global-coastal-atlas\MSc_students\ClenmarRowe\Data\Processed_data\01_Hazards\01_Present\Extreme_wave_height\ERA5_Mean_1993-2014.zarr", mode="w")
    # annual_maxima.to_zarr(r"P:\11209197-018-global-coastal-atlas\MSc_students\ClenmarRowe\Data\Processed_data\01_Hazards\01_Present\Extreme_wave_height\ERA5_Ann_max_GLOBAL_1993-2023.zarr", mode="w")
    ds.to_zarr(str(dataset_dir_path).replace(".nc", "___.zarr"), mode="w")

[########################################] | 100% Completed | 24hr 27m


<IPython.core.display.Javascript object>

In [25]:
with ProgressBar():
    # ds_mean.to_zarr(r"P:\11209197-018-global-coastal-atlas\MSc_students\ClenmarRowe\Data\Processed_data\01_Hazards\01_Present\Extreme_wave_height\ERA5_Mean_1993-2014.zarr", mode="w")
    annual_maxima.to_zarr(r"P:\11209197-018-global-coastal-atlas\MSc_students\ClenmarRowe\Data\Processed_data\01_Hazards\01_Present\Extreme_wave_height\ERA5_Ann_max_GLOBAL_1993-2023.zarr", mode="w")

[########################################] | 100% Completed | 3hr 44m


  return np.nanmax(x_chunk, axis=axis, keepdims=keepdims)


<IPython.core.display.Javascript object>