# All CMIP6 Phase 2 GCM 2070-2100

Notebook environment to migrate NetCDF files to CF compliant zarr

In [1]:
# Optional; code formatter, installed as jupyter lab extension
#%load_ext lab_black
# Optional; code formatter, installed as jupyter notebook extension
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# Import standard packages
import os
import pathlib
import sys

import numpy as np
#import geopandas as gpd
 
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
import glob
import itertools
import json
import copy
from itertools import chain
from shapely import wkb

# Import custom functionality
# sys.path.append('c:/Windows/System32/coclicodata/src')#CLENMAR ADDED

from coclicodata.drive_config import p_drive
from coclicodata.etl.cf_compliancy_checker import check_compliancy, save_compliancy
#P:\11209197-018-global-coastal-atlas\MSc_students\ClenmarRowe\Data\All_Datasets\Orig_Datasets\01_Hazards\01_Present\Extreme_Wave_Height
# Define (local and) remote drives
# gca_data_dir = pathlib.Path.home().joinpath(r"OneDrive - Stichting Deltares\Documents\GitHub")
gca_data_dir = p_drive.joinpath("11209197-018-global-coastal-atlas", r"MSc_students\ClenmarRowe\Data\All_Datasets", r"Orig_Datasets\01_Hazards\02_Future")

# Workaround to the Windows OS (10) udunits error after installation of cfchecker: https://github.com/SciTools/iris/issues/404
os.environ["UDUNITS2_XML_PATH"] = str(
    pathlib.Path().home().joinpath(  # change to the udunits2.xml file dir in your Python installation
        r"AppData\Local\miniconda3\pkgs\udunits2-2.2.28-h892ecd3_0\Library\share\udunits\udunits2.xml" ###Changed from anaconda to miniconda  - added the new package udunits file
    )
)


<IPython.core.display.Javascript object>

In [3]:
GCM_all=["MRI-ESM2-0","MPI-ESM1-2-LR","KIOST-ESM","IPSL-CM6A-LR","CMCC-CM2-SR5","AWI-CM-1-1-MR"]

<IPython.core.display.Javascript object>

In [4]:
for i,GCM in enumerate({GCM_all[1]}):# Project paths & files (manual input)
    dataset_dir = gca_data_dir.joinpath(r"Extreme_Wave_Height\GCM_All_CMIP6-phase2_Projection_2071-2100",GCM)
    os.mkdir(dataset_dir)
    dataset_dir_path = gca_data_dir.joinpath(r"Extreme_Wave_Height\GCM_All_CMIP6-phase2_Projection_2071-2100",GCM,GCM+"_WW3_3Hourly_2071-2100.nc")
    CF_dir = gca_data_dir.joinpath(r"Extreme_Wave_Height\GCM_All_CMIP6-phase2_Projection_2071-2100",GCM,"CF")  # directory to save output CF check files

<IPython.core.display.Javascript object>

In [5]:
#For SSP1-26

import glob

orig_dataset_dir= gca_data_dir.joinpath(r"Extreme_Wave_Height\GCM_All_CMIP6-phase2_Projection_2071-2100")
# Specify the path to the directory containing the netCDF files
dir="SSP1-26"

# Get a list of all netCDF files in the specified directory
files = sorted(glob.glob(os.path.join(orig_dataset_dir , dir,  "*.nc")))

datasets=[]
for file_name in files:
    if GCM in file_name:
        # Open datasets using lazily in Chunks
        datasets.append(xr.open_dataset(file_name, chunks={'time':  1}))
datasets


[<xarray.Dataset>
 Dimensions:    (longitude: 720, latitude: 361, time: 248)
 Coordinates:
   * longitude  (longitude) float32 0.0 0.5 1.0 1.5 ... 358.0 358.5 359.0 359.5
   * latitude   (latitude) float32 -90.0 -89.5 -89.0 -88.5 ... 89.0 89.5 90.0
   * time       (time) datetime64[ns] 2071-01-01 ... 2071-01-31T21:00:00
 Data variables:
     MAPSTA     (latitude, longitude) int16 dask.array<chunksize=(361, 720), meta=np.ndarray>
     hs         (time, latitude, longitude) float32 dask.array<chunksize=(1, 361, 720), meta=np.ndarray>
 Attributes: (12/15)
     WAVEWATCH_III_version_number:  6.07
     WAVEWATCH_III_switches:        F90 NOGRB NC4 SCRIP SHRD PR3 UQ FLX4 LN1 S...
     product_name:                  ww3.207101_hs.nc
     area:                          Global regular grid output
     latitude_resolution:           0.
     longitude_resolution:          0.
     ...                            ...
     easternmost_longitude:         360.
     minimum_altitude:              -12000 

<IPython.core.display.Javascript object>

In [6]:

# Concatenate the datasets along the time dimension
merged_SSP1_RCP26 = xr.concat(datasets, dim='time')#.compute()

# Save the merged dataset to a new netCDF file in the current directory
# merged.to_netcdf('merged_output.nc')
merged_SSP1_RCP26

Unnamed: 0,Array,Chunk
Bytes,42.44 GiB,122.95 MiB
Shape,"(87656, 361, 720)","(248, 361, 720)"
Dask graph,360 chunks in 1081 graph layers,360 chunks in 1081 graph layers
Data type,int16 numpy.ndarray,int16 numpy.ndarray
"Array Chunk Bytes 42.44 GiB 122.95 MiB Shape (87656, 361, 720) (248, 361, 720) Dask graph 360 chunks in 1081 graph layers Data type int16 numpy.ndarray",720  361  87656,

Unnamed: 0,Array,Chunk
Bytes,42.44 GiB,122.95 MiB
Shape,"(87656, 361, 720)","(248, 361, 720)"
Dask graph,360 chunks in 1081 graph layers,360 chunks in 1081 graph layers
Data type,int16 numpy.ndarray,int16 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,84.88 GiB,0.99 MiB
Shape,"(87656, 361, 720)","(1, 361, 720)"
Dask graph,87656 chunks in 721 graph layers,87656 chunks in 721 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 84.88 GiB 0.99 MiB Shape (87656, 361, 720) (1, 361, 720) Dask graph 87656 chunks in 721 graph layers Data type float32 numpy.ndarray",720  361  87656,

Unnamed: 0,Array,Chunk
Bytes,84.88 GiB,0.99 MiB
Shape,"(87656, 361, 720)","(1, 361, 720)"
Dask graph,87656 chunks in 721 graph layers,87656 chunks in 721 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


<IPython.core.display.Javascript object>

In [7]:
#For SSP5-85

# Specify the path to the directory containing the netCDF files
dir="SSP5-85"

# Get a list of all netCDF files in the specified directory
files = sorted(glob.glob(os.path.join(orig_dataset_dir , dir,  "*.nc")))

datasets=[]
for file_name in files:
    if GCM in file_name:
        # Open datasets using lazily in Chunks
        datasets.append(xr.open_dataset(file_name, chunks={'time':  1}))


# Concatenate the datasets along the time dimension
merged_SSP5_RCP85 = xr.concat(datasets, dim='time')

# Save the merged dataset to a new netCDF file in the current directory
# merged.to_netcdf('merged_output.nc')
merged_SSP5_RCP85

Unnamed: 0,Array,Chunk
Bytes,42.44 GiB,122.95 MiB
Shape,"(87656, 361, 720)","(248, 361, 720)"
Dask graph,360 chunks in 1081 graph layers,360 chunks in 1081 graph layers
Data type,int16 numpy.ndarray,int16 numpy.ndarray
"Array Chunk Bytes 42.44 GiB 122.95 MiB Shape (87656, 361, 720) (248, 361, 720) Dask graph 360 chunks in 1081 graph layers Data type int16 numpy.ndarray",720  361  87656,

Unnamed: 0,Array,Chunk
Bytes,42.44 GiB,122.95 MiB
Shape,"(87656, 361, 720)","(248, 361, 720)"
Dask graph,360 chunks in 1081 graph layers,360 chunks in 1081 graph layers
Data type,int16 numpy.ndarray,int16 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,84.88 GiB,0.99 MiB
Shape,"(87656, 361, 720)","(1, 361, 720)"
Dask graph,87656 chunks in 721 graph layers,87656 chunks in 721 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 84.88 GiB 0.99 MiB Shape (87656, 361, 720) (1, 361, 720) Dask graph 87656 chunks in 721 graph layers Data type float32 numpy.ndarray",720  361  87656,

Unnamed: 0,Array,Chunk
Bytes,84.88 GiB,0.99 MiB
Shape,"(87656, 361, 720)","(1, 361, 720)"
Dask graph,87656 chunks in 721 graph layers,87656 chunks in 721 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


<IPython.core.display.Javascript object>

In [8]:
combined_ds= merged_SSP5_RCP85
combined_ds=combined_ds.rename({'hs': 'hs_5_85_'+GCM.replace("-","_")})

combined_ds["hs_1_26_"+GCM.replace("-","_")]=merged_SSP1_RCP26["hs"]
combined_ds=combined_ds.drop_vars("MAPSTA")
combined_ds

Unnamed: 0,Array,Chunk
Bytes,84.88 GiB,0.99 MiB
Shape,"(87656, 361, 720)","(1, 361, 720)"
Dask graph,87656 chunks in 721 graph layers,87656 chunks in 721 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 84.88 GiB 0.99 MiB Shape (87656, 361, 720) (1, 361, 720) Dask graph 87656 chunks in 721 graph layers Data type float32 numpy.ndarray",720  361  87656,

Unnamed: 0,Array,Chunk
Bytes,84.88 GiB,0.99 MiB
Shape,"(87656, 361, 720)","(1, 361, 720)"
Dask graph,87656 chunks in 721 graph layers,87656 chunks in 721 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,84.88 GiB,0.99 MiB
Shape,"(87656, 361, 720)","(1, 361, 720)"
Dask graph,87656 chunks in 721 graph layers,87656 chunks in 721 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 84.88 GiB 0.99 MiB Shape (87656, 361, 720) (1, 361, 720) Dask graph 87656 chunks in 721 graph layers Data type float32 numpy.ndarray",720  361  87656,

Unnamed: 0,Array,Chunk
Bytes,84.88 GiB,0.99 MiB
Shape,"(87656, 361, 720)","(1, 361, 720)"
Dask graph,87656 chunks in 721 graph layers,87656 chunks in 721 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


<IPython.core.display.Javascript object>

In [9]:
ds=combined_ds

<IPython.core.display.Javascript object>

### Check CF compliancy original NetCDF files

In [10]:
%%capture cap --no-stderr
# check original CF compliancy (for first file)

check_compliancy(testfile=files[0], 
                 working_dir=CF_dir
                 )

<IPython.core.display.Javascript object>

In [11]:
# # save original CF compliancy (for first file)
save_compliancy(cap, testfile=dataset_dir_path, working_dir=CF_dir)



<IPython.core.display.Javascript object>

### Make CF compliant alterations to the NetCDF files (dataset dependent)

In [12]:
import json

# NetCDF attribute alterations by means of metadata template
metadata_json=orig_dataset_dir.joinpath("metadata_"+GCM+"_2071-2100.json")

f_global = open(metadata_json)
meta_global = json.load(f_global)


for attr_name, attr_val in meta_global.items():
    if attr_name == 'PROVIDERS':
        attr_val = json.dumps(attr_val)
    ds.attrs[attr_name] = attr_val

ds.attrs['Conventions'] = "CF-1.8"
ds

Unnamed: 0,Array,Chunk
Bytes,84.88 GiB,0.99 MiB
Shape,"(87656, 361, 720)","(1, 361, 720)"
Dask graph,87656 chunks in 721 graph layers,87656 chunks in 721 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 84.88 GiB 0.99 MiB Shape (87656, 361, 720) (1, 361, 720) Dask graph 87656 chunks in 721 graph layers Data type float32 numpy.ndarray",720  361  87656,

Unnamed: 0,Array,Chunk
Bytes,84.88 GiB,0.99 MiB
Shape,"(87656, 361, 720)","(1, 361, 720)"
Dask graph,87656 chunks in 721 graph layers,87656 chunks in 721 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,84.88 GiB,0.99 MiB
Shape,"(87656, 361, 720)","(1, 361, 720)"
Dask graph,87656 chunks in 721 graph layers,87656 chunks in 721 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 84.88 GiB 0.99 MiB Shape (87656, 361, 720) (1, 361, 720) Dask graph 87656 chunks in 721 graph layers Data type float32 numpy.ndarray",720  361  87656,

Unnamed: 0,Array,Chunk
Bytes,84.88 GiB,0.99 MiB
Shape,"(87656, 361, 720)","(1, 361, 720)"
Dask graph,87656 chunks in 721 graph layers,87656 chunks in 721 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


<IPython.core.display.Javascript object>

# Slice to reduce size of CF checked NetCDF

In [13]:
ds_slice=ds.sel(time=slice('2065-01-01','2065-01-02'))
ds_slice

Unnamed: 0,Array,Chunk
Bytes,0 B,0 B
Shape,"(0, 361, 720)","(0, 361, 720)"
Dask graph,1 chunks in 722 graph layers,1 chunks in 722 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 0 B 0 B Shape (0, 361, 720) (0, 361, 720) Dask graph 1 chunks in 722 graph layers Data type float32 numpy.ndarray",,

Unnamed: 0,Array,Chunk
Bytes,0 B,0 B
Shape,"(0, 361, 720)","(0, 361, 720)"
Dask graph,1 chunks in 722 graph layers,1 chunks in 722 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,0 B,0 B
Shape,"(0, 361, 720)","(0, 361, 720)"
Dask graph,1 chunks in 722 graph layers,1 chunks in 722 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 0 B 0 B Shape (0, 361, 720) (0, 361, 720) Dask graph 1 chunks in 722 graph layers Data type float32 numpy.ndarray",,

Unnamed: 0,Array,Chunk
Bytes,0 B,0 B
Shape,"(0, 361, 720)","(0, 361, 720)"
Dask graph,1 chunks in 722 graph layers,1 chunks in 722 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


<IPython.core.display.Javascript object>

In [14]:
# var=ds['swh'].isel(time=0)
# var.plot()
# plt.show()

<IPython.core.display.Javascript object>

In [15]:
# Write the xarray dataset to a netCDF file
#Compliant netcdf
dataset_dir_path_CF=pathlib.Path().joinpath(str(dataset_dir_path)[0:-3]+"_CF"+".nc")


# ds.to_netcdf(path=dataset_dir_path_CF)
#ds_slice.to_netcdf(path=r"C:\Users\rowe\OneDrive - Stichting Deltares\Desktop\delete\era5_coastal_Hs_all_years_CF.nc")
ds_slice.to_netcdf(path=dataset_dir_path_CF)

<IPython.core.display.Javascript object>

### Check CF compliancy modified NetCDF files

In [16]:
# open datasets (only first file, rest is the same)
ds_slice = xr.open_dataset(dataset_dir_path_CF)

# check original dataset
ds_slice

<IPython.core.display.Javascript object>

In [17]:
%%capture cap --no-stderr
# check original CF compliancy (for first file)

check_compliancy(testfile=dataset_dir_path_CF, 
                 working_dir=CF_dir
                 )

<IPython.core.display.Javascript object>

In [18]:
# save original CF compliancy (for first file)
save_compliancy(cap, testfile=dataset_dir_path_CF, working_dir=CF_dir)



<IPython.core.display.Javascript object>

### Write data to Zarr files

In [18]:
from dask.diagnostics import ProgressBar


# # export to zarr in write mode (to overwrite if exists)

with ProgressBar():
    ds.to_zarr(str(dataset_dir_path).replace(".nc", ".zarr"), mode="w")

if i == 1:
    combined_all = ds
else:
    combined_all['hs_5_85_' + GCM.replace("-", "_")] = ds['hs_5_85_' + GCM.replace("-", "_")]
    combined_all['hs_1_26_' + GCM.replace("-", "_")] = ds['hs_1_26_' + GCM.replace("-", "_")]
combined_all
    

[########################################] | 100% Completed | 15hr 58m


<IPython.core.display.Javascript object>

In [19]:
# # export to zarr in write mode (to overwrite if exists)
# ds.to_zarr(str(dataset_dir_path).replace(".nc", ".zarr"), mode="w")

<IPython.core.display.Javascript object>