# 02 - Wave Energy Dataset JRC

This script performs the following tasks:
1. [Auth] writes data to Zarr files (cloud-native file format) (AUTh)
2. [Deltares] checks and uploads the Zarr to a Google Cloud Storage (GCS) bucket 
3. [Deltares] creates a geoJSON from Zarr data 
4. [Deltares] uploads the geoJSON to Mapbox 
5. [Deltares] updates the STAC

TODO: 
1. [Auth] make zarr consistent with cf conventions 
2. [Deltares] come up with checks (cf conventions) for Zarr file before uploading to GCS

In [1]:
# Optional; code formatter, installed as jupyter lab extension
#%load_ext lab_black
# Optional; code formatter, installed as jupyter notebook extension
%load_ext nb_black

# imports
import sys
import os
import geojson
import netCDF4 as nc
import pathlib
import platform
import xarray as xr
import pandas as pd
import subprocess
import warnings
import numpy as np
from google.cloud import storage
from dotenv import dotenv_values
from itertools import product

warnings.filterwarnings("ignore")

# make root directories importable by appending root to path
cwd = pathlib.Path().resolve()
sys.path.append(os.path.dirname(cwd))

# OS independent path configurations
if platform.system() == "Windows":
    root = pathlib.Path("P:/")
else:  # linux or other
    root = pathlib.Path("/p/")

coclico_data_dir = root.joinpath("11205479-coclico", "data")



<IPython.core.display.Javascript object>

In [2]:
# paths to the dataset, manual input
dataset_dir = coclico_data_dir.joinpath("02_wave_energy_jrc")
dataset_path = dataset_dir.joinpath("CoastAlRisk_Global_WEF_RCP85.nc")
dataset_out_file = "CoastAlRisk_Global_WEF_RCP85"

# GCS and mapbox private access keys
GCS_token_path = coclico_data_dir.joinpath(
    "google_credentials.json"
)  # path name (including json file name)
config = dotenv_values(".env")
mapbox_token = config["MAPBOX_TOKEN"]  # mapbox private key

# STAC custom functions
local_STAC = r"../../coclicodata"  # path to local GitHub STAC clone
sys.path.insert(-1, local_STAC)
import generate as gr

<IPython.core.display.Javascript object>

# 1. write data to Zarr files

In [3]:
# open datasets
dataset = xr.open_dataset(dataset_path)

# check original dataset
# dataset

<IPython.core.display.Javascript object>

In [4]:
# rename or swap dimension names, the latter in case the name already exists as coordinate
dataset = dataset.rename_dims({"npoints": "stations", "nrp": "RP", "nsdec": "year"})
# dataset = dataset.swap_dims({"r": "RP"})

# rename variables, if necessary
dataset = dataset.rename_vars({"rp": "RP", "decades": "year"})

# set some data variables to coordinates to avoid duplication of dimensions in later stage
dataset = dataset.set_coords(["longitude", "latitude", "RP", "year"])

<IPython.core.display.Javascript object>

In [5]:
# concat datasets along new dimension with index values and name derived from pandas index object, if necessary
# dataset = xr.concat(
#    [dataset_historical, dataset_45rcp, dataset_85rcp],
#    pd.Index(["Historical", "RCP45", "RCP85"], name="scenario"),
# )

<IPython.core.display.Javascript object>

In [7]:
# re-order shape of the data variables
dataset = dataset.transpose("stations", "RP", "year")

<IPython.core.display.Javascript object>

In [8]:
# check the xarray dataset, best practice is to have as many as possible bold dimensions (dimension == coordinate name).
# in this way, the Front-End can access the variable directly without having to index the variable first

dataset

<IPython.core.display.Javascript object>

In [9]:
# export to zarr in write mode (to overwrite if exists)
dataset.to_zarr(dataset_dir.joinpath("%s.zarr" % dataset_out_file), mode="w")

<xarray.backends.zarr.ZarrStore at 0x1cf5dddd040>

<IPython.core.display.Javascript object>

# 2. check and upload Zarr to GCS bucket

In [10]:
# check zarr data

# TODO Come up with checks

<IPython.core.display.Javascript object>

In [11]:
# upload zarr folder to GCS
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = str(GCS_token_path)

# function to upload zarr folder to GCS
storage_client = storage.Client()


def upload_from_directory(directory_path, dest_bucket_name, dest_blob_name):
    rel_paths = directory_path.glob("**/*")
    bucket = storage_client.bucket(dest_bucket_name)
    for local_file in rel_paths:
        remote_path = f'{dest_blob_name}/{"/".join(str(local_file).split(os.sep)[5:])}'  # note 5: is hardcoded and might lead to problems
        if os.path.isfile(local_file):
            blob = bucket.blob(remote_path)
            blob.upload_from_filename(local_file)

    # print status
    print("Folder uploaded to GCS")


# specification of directory, bucket and file name to feed into the function
directory_path = dataset_dir.joinpath("%s.zarr" % dataset_out_file)
dest_bucket_name = "dgds-data-public"
dest_blob_name = "coclico/" + dataset_out_file + ".zarr"
folder_upload = upload_from_directory(directory_path, dest_bucket_name, dest_blob_name)

Folder uploaded to GCS


<IPython.core.display.Javascript object>

# 3. create geoJSON from Zarr data

In [12]:
# load (locally or cloud) stored Zarr
# zarr_fn = str(dataset_dir.joinpath("%s.zarr" % dataset_out_file))  # local file path
zarr_fn = (
    r"https://storage.googleapis.com/dgds-data-public/coclico/%s.zarr"
    % dataset_out_file  # cloud file path, requires a http link
)
nddata = xr.open_zarr(zarr_fn)

# specify input variables
mapbox_url = "mapbox://global-data-viewer.%s" % dataset_out_file
template = "deltares-coclico-xssl"
variable = "wef"  # note, variable is set as different variables might not have the same dimensions
datasetid = f"{variable}-mapbox"

<IPython.core.display.Javascript object>

In [87]:
# write geoJSON

# automated variable retrieval (without hidden files)
variables = list(nddata.variables)

# autmated dimension retrieval
dimensions = list(nddata["%s" % (variable)].dims)

# write data to flattened GeoJSON file - Mapbox styling uses this (aligned with STAC)
cube_dimensions = {}
for dimension in dimensions:  # loop over dimensions of variable

    if (
        dimension == "stations"
    ):  # when dimension equal to stations (assumed present and independent), only read extent to keep json file size low

        dim = nddata[
            dimension
        ]  # get dimensions based on dimension itself (just indices)

        dimdict = {
            "type": "stations",
            "extent": [int(min(dim[:].values)), int(max(dim[:].values))],
            "unit": "-",
        }

    else:  # when dimension unequal to stations (assumed present and independent), match dimensions to variables to read
        # correct values to be appended to the dictionary (this also works in case coordinate != dimension)
        for var in variables:  # identify variable with dimension
            var_dim = nddata["%s" % var].dims
            if (
                len(var_dim) == 1 and var_dim[0] == dimension
            ):  # only take the variable with a single dimension and where dimension is equal to the variable dimensions

                dim = nddata[var]  # get dimensions based on variable (values)

                dimdict = {
                    "type": "temporal",  # TODO To be customized based on variable?
                    "values": dim[:].values.tolist(),
                    "unit": dim.attrs.get("units", "-"),
                }

    dim = gr.Dimension.from_dict(dimdict)
    cube_dimensions[dimension] = dim

dimvals = {k: v.values for k, v in cube_dimensions.items() if v.values}

# dot product of variables keys
keys = []
for values in product(*dimvals.values()):
    keys.append(
        "-".join(map(lambda x: "-".join(x), zip(dimvals.keys(), map(str, values))))
    )

# flatten single data values over specific dimension keys
features = []
for j, (lon, lat) in enumerate(
    zip(nddata["longitude"][:].values, nddata["latitude"][:].values)
):  # assumes longitude and latitude are present and independent
    point = geojson.Point((float(lon), float(lat)))
    feature = geojson.Feature(geometry=point)
    feature["properties"]["locationId"] = j

    for a, b in zip(
        nddata.sel({"stations": j})["%s" % variable].values.flatten(), keys
    ):  # flattened along dimensions
        feature["properties"][b] = a

    features.append(feature)

# store the features in a GeoJSON file
collection = geojson.FeatureCollection(features)
with open(
    os.path.join(dataset_dir, "platform", r"%s.geojson" % dataset_out_file), "w",
) as f:
    geojson.dump(collection, f)

<IPython.core.display.Javascript object>

In [100]:
nddata

Unnamed: 0,Array,Chunk
Bytes,37.72 kiB,37.72 kiB
Shape,"(4828,)","(4828,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 37.72 kiB 37.72 kiB Shape (4828,) (4828,) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",4828  1,

Unnamed: 0,Array,Chunk
Bytes,37.72 kiB,37.72 kiB
Shape,"(4828,)","(4828,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,37.72 kiB,37.72 kiB
Shape,"(4828,)","(4828,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 37.72 kiB 37.72 kiB Shape (4828,) (4828,) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",4828  1,

Unnamed: 0,Array,Chunk
Bytes,37.72 kiB,37.72 kiB
Shape,"(4828,)","(4828,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.24 MiB,452.62 kiB
Shape,"(4828, 8, 11)","(2414, 4, 6)"
Count,9 Tasks,8 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 3.24 MiB 452.62 kiB Shape (4828, 8, 11) (2414, 4, 6) Count 9 Tasks 8 Chunks Type float64 numpy.ndarray",11  8  4828,

Unnamed: 0,Array,Chunk
Bytes,3.24 MiB,452.62 kiB
Shape,"(4828, 8, 11)","(2414, 4, 6)"
Count,9 Tasks,8 Chunks
Type,float64,numpy.ndarray


<IPython.core.display.Javascript object>

In [103]:
# check written geojson

# check shape
with open(os.path.join(dataset_dir, "platform", "%s.geojson" % dataset_out_file)) as f:
    check = geojson.load(f)

print(check["features"][2])

# check the minima and maxima for the colormap boundaries
# year = 0
# for idx, i in enumerate(nddata["RP"][:].values):
#     print(
#         i,
#         round(min(nddata["%s" % variable][:, idx, year].values), 2),
#         round(max(nddata["%s" % variable][:, idx, year].values), 2),
#     )

{"geometry": {"coordinates": [-179.987, 71.867], "type": "Point"}, "properties": {"RP-10.0-year-1995.0": 205850.50820902662, "RP-10.0-year-2010.0": 219014.60409213914, "RP-10.0-year-2020.0": 225416.78152235728, "RP-10.0-year-2030.0": 229143.52012785547, "RP-10.0-year-2040.0": 235024.46446463515, "RP-10.0-year-2050.0": 241363.3113429508, "RP-10.0-year-2060.0": 246371.8859040228, "RP-10.0-year-2070.0": 248509.71378579538, "RP-10.0-year-2080.0": 251987.9714307226, "RP-10.0-year-2090.0": 256223.00909818543, "RP-10.0-year-2100.0": 260335.6567895622, "RP-100.0-year-1995.0": 322616.0514124408, "RP-100.0-year-2010.0": 343211.5404803857, "RP-100.0-year-2020.0": 353270.6438084154, "RP-100.0-year-2030.0": 359185.7322859595, "RP-100.0-year-2040.0": 368430.14857902075, "RP-100.0-year-2050.0": 378250.5827647725, "RP-100.0-year-2060.0": 385985.8158455367, "RP-100.0-year-2070.0": 389215.4863091807, "RP-100.0-year-2080.0": 394615.0789788287, "RP-100.0-year-2090.0": 401154.0805733845, "RP-100.0-year-210

<IPython.core.display.Javascript object>

# 4. upload geoJSON to Mapbox

In [104]:
# ingest geoJSON into mapbox tilesets

# python way of running CLI to upload to mapbox automatically
subprocess.run(
    [
        "mapbox",
        "--access-token",
        mapbox_token,
        "upload",
        r"global-data-viewer.%s" % dataset_out_file.split(".")[0],
        os.path.join(
            dataset_dir, "platform", r"%s.geojson" % dataset_out_file.split(".")[0]
        ),
    ],
    shell=True,
    check=True,
)

# notebook version of CLI
#!mapbox --access-token {mapbox_token} upload {filename} {source}

# CLI command (example)
# mapbox --access-token **write out mapbox_token** upload global-data-viewer.CoastAlRisk_Europe_EESSL p:\11205479-coclico\data\01_storm_surge_jrc\platform\CoastAlRisk_Europe_EESSL.geojson

CompletedProcess(args=['mapbox', '--access-token', 'sk.eyJ1IjoiZ2xvYmFsLWRhdGEtdmlld2VyIiwiYSI6ImNsMnB0dmlscTFrZnEzY211cWxna3Bxb3AifQ.OQ3P9PEjZUiErvjrwVbsag', 'upload', 'global-data-viewer.CoastAlRisk_Global_WEF_RCP85', 'P:\\11205479-coclico\\data\\02_wave_energy_jrc\\platform\\CoastAlRisk_Global_WEF_RCP85.geojson'], returncode=0)

<IPython.core.display.Javascript object>

# 5. Update the STAC

In [107]:
# update STAC

# Get initial STAC
collection = gr.Collection.from_file(
    os.path.join(local_STAC, "current/collection.json")
)
# collection.describe()  # display hierarchy

# Get template and set items
templatedataset = collection.get_child(template)
dataset = templatedataset.full_copy()
dataset.id = datasetid
dataset.title = variable
dataset.description = variable

# Drop existing items, dimensions and summaries
dataset._resolved_objects
dataset.set_root(None)
dataset.clear_items()
dataset.assets = {}
dataset.extra_fields = gr.deepcopy(
    dataset.extra_fields
)  # workaround for https://github.com/stac-utils/pystac/issues/787
dataset.summaries = None
dataset.extra_fields.pop("cube:dimensions", None)
dataset.extra_fields.pop("cube:variables", None)
dataset.extra_fields.pop("summaries", None)

# Add zarr asset
dataset.add_asset("data", gr.gen_zarr_asset(variable, zarr_fn))

# Add dimension info
dc_ext = gr.DatacubeExtension.ext(dataset)
dc_ext.apply(cube_dimensions)

var = gr.Variable({})
var.description = ""
var.dimensions = list(cube_dimensions.keys())
var.type = "data"
var.unit = nddata["%s" % variable].attrs["units"]
dc_ext.variables = {variable: var}

# Add summaries
dataset.summaries = gr.Summaries(summaries=dimvals)

# Add children
layout = gr.Layout()
for values, key in zip(product(*dimvals.values()), keys):
    feature = gr.gen_default_item(f"{variable}-mapbox-{key}")
    feature.add_asset("mapbox", gr.gen_mapbox_asset(mapbox_url, dataset_out_file))
    feature.properties = gr.gen_default_props(key=key)
    for (k, v) in zip(dimvals.keys(), values):
        feature.properties[k] = v
    dataset.add_item(feature, strategy=layout)

# Set extra link properties
gr.extend_links(dataset, cube_dimensions.keys())

# Save and limit number of folders
collection.add_child(dataset)
dataset.normalize_hrefs(
    os.path.join(local_STAC, f"current/{variable}"), strategy=layout
)
collection.save(
    catalog_type=gr.CatalogType.SELF_CONTAINED,
    dest_href=os.path.join(local_STAC, f"current"),
    stac_io=gr.IO(),
)

<IPython.core.display.Javascript object>