# Shoreline Monitor High Resolution

Hotspots + monthly temporal resolution

Notebook environment for migrating NetCDF files to CF-compliant Zarr stores.

In [35]:
# Optional; code formatter, installed as jupyter lab extension
# %load_ext lab_black
# Optional; code formatter, installed as jupyter notebook extension
%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

### Configure OS independent paths

In [36]:
# Import standard packages
import os
import pathlib

import sys
import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
import math

# Make project code importable: add the parent of the working directory and the
# coclicodata directory to the module search path
cwd = pathlib.Path().resolve()
sys.path.extend(
    [
        str(cwd.parent),
        r"P:\1000545-054-globalbeaches\15_GlobalCoastalAtlas\coclicodata",
    ]
)

# User home directory and the root component of that path
home = pathlib.Path.home()
root = home.root

# Import custom functionality
from etl import p_drive
from etl.CF_compliancy_checker import check_compliancy, save_compliancy

# Dataset root below p_drive (local or remote drive)
gca_path_parts = ("1000545-054-globalbeaches", "15_GlobalCoastalAtlas", "datasets")
gca_data_dir = p_drive.joinpath(*gca_path_parts)

# Workaround to the Windows OS (10) udunits error after installation of cfchecker:
# https://github.com/SciTools/iris/issues/404
# Change the relative path below to the udunits2.xml file dir in your Python installation.
udunits2_xml = home.joinpath(
    r"Anaconda3\pkgs\udunits2-2.2.28-h892ecd3_0\Library\share\udunits\udunits2.xml"
)
os.environ["UDUNITS2_XML_PATH"] = str(udunits2_xml)

<IPython.core.display.Javascript object>

In [37]:
# Project paths & files (manual input)
dataset_out_file = "ShorelineMonitor_HR"  # base name of the Zarr output
dataset_dir = gca_data_dir.joinpath("02_Shorelinemonitor_monthly")
dataset_dir_shorelinemonitor_hr = dataset_dir.joinpath("shorelinemonitor_monthly.nc")
CF_dir = gca_data_dir.joinpath("CF")  # directory to save output CF check files

<IPython.core.display.Javascript object>

In [38]:
# # # write csv to netcdf

# # # Load data from CSV file into a pandas dataframe
# csv_dir = r'P:\11202268-hydraulic-engineering\MSc_students\Dante_van_der_Heijden\02. Data\shorelineMonitor_highres.csv'
# df = pd.read_csv(csv_dir)

# # Convert the pandas dataframe to an xarray dataset
# ds = xr.Dataset.from_dataframe(df)

# # Write the xarray dataset to a netCDF file
# ds.to_netcdf(dataset_dir_shorelinemonitor_hr)

<IPython.core.display.Javascript object>

### Check CF compliancy original NetCDF files

In [39]:
# open datasets
dataset_shorelinemonitor_hr = xr.open_dataset(dataset_dir_shorelinemonitor_hr)

# check original dataset
dataset_shorelinemonitor_hr

<IPython.core.display.Javascript object>

In [40]:
%%capture cap --no-stderr
# check original CF compliancy

check_compliancy(testfile=dataset_dir_shorelinemonitor_hr, 
                 working_dir=CF_dir
                 )


<IPython.core.display.Javascript object>

In [41]:
# save original CF compliancy
save_compliancy(cap, testfile=dataset_dir_shorelinemonitor_hr, working_dir=CF_dir)



<IPython.core.display.Javascript object>

### Make CF compliant alterations to the NetCDF files (dataset dependent)

In [42]:
import json

# NetCDF attribute alterations
# Read the global attributes from the metadata JSON; the context manager makes
# sure the file handle is closed again after loading.
with open(
    r"P:\1000545-054-globalbeaches\15_GlobalCoastalAtlas\datasets\02_Shorelinemonitor_monthly\metadata_shorelinemonitor_hr.json"
) as f_global:
    meta_global = json.load(f_global)

for attr_name, attr_val in meta_global.items():
    if attr_name == "PROVIDERS":
        # PROVIDERS is stored as a JSON string rather than as the parsed object
        attr_val = json.dumps(attr_val)
    dataset_shorelinemonitor_hr.attrs[attr_name] = attr_val

dataset_shorelinemonitor_hr.attrs["Conventions"] = "CF-1.8"

<IPython.core.display.Javascript object>

In [43]:
# Remove the "Unnamed: 0" variable; drop_vars replaces the deprecated Dataset.drop
dataset_shorelinemonitor_hr = dataset_shorelinemonitor_hr.drop_vars("Unnamed: 0")

<IPython.core.display.Javascript object>

In [44]:
dataset_shorelinemonitor_hr

<IPython.core.display.Javascript object>

In [45]:
from datetime import datetime


def round_to_nearest_month(date):
    """Snap a date to the first day of a month.

    Days 1-15 map to the first day of the same month; days 16 and later map to
    the first day of the following month (December rolls over to January of
    the next year). Any time-of-day component is discarded.

    Parameters
    ----------
    date : object exposing ``year``, ``month`` and ``day`` attributes
        For example ``datetime.datetime`` or ``pandas.Timestamp``.

    Returns
    -------
    pandas.Timestamp
        Midnight on the first day of the selected month.
    """
    if date.day > 15:
        # Past mid-month: move on to the next month, wrapping at year end
        if date.month == 12:
            target_year, target_month = date.year + 1, 1
        else:
            target_year, target_month = date.year, date.month + 1
    else:
        target_year, target_month = date.year, date.month

    return pd.to_datetime(datetime(target_year, target_month, 1))

<IPython.core.display.Javascript object>

In [46]:
# Find the latest month-rounded observation date over all transects.
max_dt = pd.to_datetime('1984-01-01')
# Read the start dates once instead of on every iteration
sdate_values = dataset_shorelinemonitor_hr['Sdate'].values

for i, x in enumerate(dataset_shorelinemonitor_hr['dt'].values):
    try:
        lst = json.loads(x)
        if len(lst) > 0:
            max_v = int(max(lst))
            # start date rounded to a month, plus the largest day offset, rounded again
            end_date = round_to_nearest_month(
                round_to_nearest_month(pd.to_datetime(sdate_values[i]))
                + pd.DateOffset(days=max_v)
            )
            if end_date > max_dt:
                max_dt = end_date
    except (TypeError, ValueError, OverflowError):
        # Skip transects with missing or malformed values; unlike a bare
        # `except:` this no longer swallows KeyboardInterrupt or programming errors.
        continue


<IPython.core.display.Javascript object>

In [47]:
# Earliest start date found in the "Sdate" variable
sdate = pd.to_datetime(min(dataset_shorelinemonitor_hr["Sdate"].values))
# edate = sdate + pd.DateOffset(max_dt)
# End of the time axis: the latest month-rounded observation date (max_dt)
edate = max_dt

# Display both bounds as the cell output
sdate, edate

(Timestamp('1984-04-19 00:00:00'), Timestamp('2021-12-01 00:00:00'))

<IPython.core.display.Javascript object>

In [48]:
# drop time xarray

# dataset_shorelinemonitor = dataset_shorelinemonitor.drop_vars('dt')
# TODO: this needs to be changed as start dates are different now and not always at 1984-01

# Monthly axis (month starts). create_shorelinepos places observations relative to
# the month-rounded start date, which lies before `sdate` when its day is <= 15.
# Start the axis at the rounded date so that the first month cannot be missing.
date_range = pd.date_range(
    start=round_to_nearest_month(sdate), end=edate, freq="MS"
)
dataset_shorelinemonitor_hr["time_step"] = xr.DataArray(
    date_range, dims=["time"]
)  # TODO: altered time_range into time_step

<IPython.core.display.Javascript object>

In [49]:
# Parse the transect origin latitudes into floats
start_lats = []
for x in dataset_shorelinemonitor_hr['transect_origin_y'].values:
    try:
        x2 = float(x)
    except (TypeError, ValueError):
        # Fallback: retry without the last two characters
        # (presumably stray trailing characters in the source data; TODO confirm)
        x2 = float(str(x)[:-2])
    # Append only after a successful parse. The previous `finally` appended the
    # value of the preceding iteration when the fallback failed as well.
    start_lats.append(x2)


<IPython.core.display.Javascript object>

In [50]:
# combine start and end coordinates into a transect
from shapely.geometry import LineString

# One two-point line per transect: (origin_x, origin_y) -> (end_x, end_y)
start_lons = dataset_shorelinemonitor_hr["transect_origin_x"].values
end_lons = dataset_shorelinemonitor_hr["transect_end_x"].values
end_lats = dataset_shorelinemonitor_hr["transect_end_y"].values
origins = zip(start_lons, start_lats)
ends = zip(end_lons, end_lats)
coords = zip(origins, ends)

# Store the string form of every shapely LineString along the "index" dimension
transect_lines = []
for segment in coords:
    transect_lines.append(str(LineString(segment)))

dataset_shorelinemonitor_hr["geometry"] = (["index"], transect_lines)
dataset_shorelinemonitor_hr["geometry"].attrs["long_name"] = "Geometry"

<IPython.core.display.Javascript object>

In [51]:
dataset_shorelinemonitor_hr = dataset_shorelinemonitor_hr.assign_coords(
    transect_origin_y=start_lats
)

<IPython.core.display.Javascript object>

In [52]:
dataset_shorelinemonitor_hr

<IPython.core.display.Javascript object>

In [53]:
def create_shorelinepos(time_range, positions, timesteps, sdates):
    """Place per-transect shoreline positions onto a common time axis.

    Parameters
    ----------
    time_range : sequence of date-likes
        Time axis of the output; every entry is converted with ``pd.to_datetime``.
    positions : sequence of str
        One JSON list of shoreline positions per transect.
    timesteps : sequence of str
        One JSON list of day offsets per transect, relative to the transect's
        month-rounded start date.
    sdates : sequence of date-likes
        One start date per transect.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(positions), len(time_range))`` holding each
        position in the column of its month-rounded observation date and NaN
        everywhere else. Rows whose JSON cannot be parsed stay all-NaN.

    Raises
    ------
    ValueError
        If a transect has a different number of positions and time steps, if an
        observation date is not part of ``time_range``, or if two observations
        of one transect round to the same month.
    """
    time_range = [pd.to_datetime(date_str) for date_str in time_range]
    # Date -> column lookup, so every position is written to the column of its
    # own date instead of relying on the dates being sorted.
    column_of = {month: col for col, month in enumerate(time_range)}
    shoreline = np.full((len(positions), len(time_range)), np.nan)

    for i in range(len(sdates)):
        try:
            positions_lst = np.atleast_1d(json.loads(positions[i]))
            timesteps_lst = np.atleast_1d(json.loads(timesteps[i]))
        except (TypeError, ValueError):
            # Missing or malformed JSON: leave this transect's row as NaN
            continue

        if len(positions_lst) != len(timesteps_lst):
            raise ValueError(
                "transect %d: %d positions but %d time steps"
                % (i, len(positions_lst), len(timesteps_lst))
            )

        sdate = round_to_nearest_month(pd.to_datetime(sdates[i]))
        dates = [
            round_to_nearest_month(x)
            for x in sdate + pd.to_timedelta(timesteps_lst, unit="D")
        ]

        try:
            columns = np.array([column_of[d] for d in dates], dtype=int)
        except KeyError as err:
            raise ValueError(
                "transect %d: observation date %s is not part of time_range"
                % (i, err.args[0])
            ) from err

        if len(np.unique(columns)) != len(columns):
            raise ValueError(
                "transect %d: several observations round to the same month" % i
            )

        shoreline[i, columns] = positions_lst

    return shoreline


# Build the shoreline-position matrix: one row per transect, one column per
# entry of time_step (see create_shorelinepos).
# NOTE(review): every argument is passed as a plain NumPy array (.values), not as
# an xarray object. Confirm that apply_ufunc and the core-dims / dask settings
# below add anything over calling create_shorelinepos(...) directly.
new_var1 = xr.apply_ufunc(
    create_shorelinepos,
    dataset_shorelinemonitor_hr["time_step"].values,
    dataset_shorelinemonitor_hr["dist"].values,
    dataset_shorelinemonitor_hr["dt"].values,
    dataset_shorelinemonitor_hr["Sdate"].values,
    input_core_dims=[["time_step"], ["index"], ["index"], ["index"]],
    dask="parallelized",
    output_dtypes=[float],
    output_core_dims=[["time_step"]],
    vectorize=False,
)

<IPython.core.display.Javascript object>

In [54]:
def combine_outliers(positions, outliers):
    """Flag the outlier observations of one transect's time series.

    Parameters
    ----------
    positions : numpy.ndarray
        1-D series of shoreline positions; NaN marks time steps without an
        observation.
    outliers : str
        JSON list of outlier indices. The indices count along the non-NaN
        entries of ``positions`` only, not along the full time axis.

    Returns
    -------
    numpy.ndarray
        Array shaped like ``positions`` with 1.0 at outlier observations and
        0.0 everywhere else. Missing or malformed ``outliers`` input is treated
        as "no outliers".
    """
    flags = np.zeros(positions.shape)
    try:
        outlier_idx = np.asarray(json.loads(outliers), dtype=int).ravel()
    except (TypeError, ValueError):
        # Same best-effort policy as create_shorelinepos: input that cannot be
        # parsed contributes nothing instead of aborting the whole computation.
        return flags

    if outlier_idx.size > 0:
        # Translate "n-th observation" into a position on the full time axis
        observed = np.where(~np.isnan(positions))[0]
        flags[observed[outlier_idx]] = 1

    return flags


# Outlier flags on the same grid as new_var1.
# With vectorize=True and the core dims below, combine_outliers is presumably
# called once per transect with one row of new_var1 and that transect's
# "outliers" entry; verify against the xarray apply_ufunc documentation.
new_var2 = xr.apply_ufunc(
    combine_outliers,
    new_var1,
    dataset_shorelinemonitor_hr["outliers"],
    input_core_dims=[["time_step"], []],
    output_core_dims=[["time_step"]],
    vectorize=True,
    dask="parallelized",
    output_dtypes=[np.float32],
)

<IPython.core.display.Javascript object>

In [55]:
# Replace the original "dist" / "outliers" variables (and "Sdate") by the
# gridded arrays computed above. drop_vars replaces the deprecated Dataset.drop
# and removes all listed variables in one call.
delete_vars = ["dist", "outliers", "Sdate"]
dataset_shorelinemonitor_hr = dataset_shorelinemonitor_hr.drop_vars(delete_vars)

dataset_shorelinemonitor_hr["dist"] = (["index", "time"], new_var1.data)
dataset_shorelinemonitor_hr["outliers"] = (["index", "time"], new_var2.data)

<IPython.core.display.Javascript object>

In [56]:
from scipy.stats import linregress
# TODO: Apply again after correct timesteps
# Define function to calculate changerate for sub-array
def calc_changerate(vary):
    """Fit a linear trend through one time series, ignoring NaN values.

    Parameters
    ----------
    vary : numpy.ndarray
        1-D series with one value per time step; the element index is used as
        the x-coordinate of the regression.

    Returns
    -------
    tuple
        ``(changerate, intercept)``: the fitted slope multiplied by 12
        ([m/month -> m/yr]) and the fitted intercept. ``(-999, -999)`` is
        returned when fewer than two non-NaN values are available.
    """
    no_fit = -999
    valid = ~np.isnan(vary)
    if np.count_nonzero(valid) < 2:
        # No line can be fitted. Return the sentinel unscaled; previously it
        # was multiplied by 12 as well and came back as -11988.
        return no_fit, no_fit

    step_index = np.arange(0, len(vary))
    fit = linregress(step_index[valid], vary[valid])
    return fit.slope * 12, fit.intercept  # [m/month -> m/yr]


# Apply calc_changerate function to each sub-array of data in parallel
# Options for running calc_changerate along the "time" dimension; the function
# returns two scalars per series, hence two empty output core-dim lists.
changerate_options = dict(
    input_core_dims=[["time"]],
    output_core_dims=[[], []],
    vectorize=True,
    dask="parallelized",
    output_dtypes=[np.float32, np.float32],
)
changerate, intercept = xr.apply_ufunc(
    calc_changerate,
    dataset_shorelinemonitor_hr["dist"],
    **changerate_options,
)

<IPython.core.display.Javascript object>

In [57]:
# Store the regression slope with its metadata
cr_meta = {"long_name": "Changerate", "units": "m/yr"}
dataset_shorelinemonitor_hr["changerate"] = ("index", changerate.data)
dataset_shorelinemonitor_hr["changerate"].attrs.update(cr_meta)

# Store the regression intercept with its metadata
itcp_meta = {"long_name": "Intercept", "units": "m"}
dataset_shorelinemonitor_hr["intercept"] = ("index", intercept.data)
dataset_shorelinemonitor_hr["intercept"].attrs.update(itcp_meta)

<IPython.core.display.Javascript object>

In [60]:
dataset_shorelinemonitor_hr

<IPython.core.display.Javascript object>

In [61]:
# NetCDF variable and dimension alterations

# rename or swap dimension names, the latter in case the name already exists as coordinate
# dataset_shorelinemonitor_hr = dataset_shorelinemonitor_hr.rename_dims(
#     {"index": "stations"}
# )

# Read the per-variable metadata; the context manager makes sure the file
# handle is closed again after loading.
with open(
    r"P:\1000545-054-globalbeaches\15_GlobalCoastalAtlas\datasets\02_Shorelinemonitor_monthly\vars_shorelinemonitor_hr.json"
) as f_vars:
    meta_vars = json.load(f_vars)

# Rename every listed variable to its "name" entry and copy the remaining
# entries onto the renamed variable as attributes.
for var_name, var_dict in meta_vars.items():
    if var_name in list(dataset_shorelinemonitor_hr.keys()):
        dataset_shorelinemonitor_hr = dataset_shorelinemonitor_hr.rename_vars(
            {var_name: var_dict["name"]}
        )
        for key, value in var_dict.items():
            if key != "name":
                dataset_shorelinemonitor_hr[var_dict["name"]].attrs[key] = value

# drop_vars replaces the deprecated Dataset.drop and takes the whole list at once
delete_vars = ["transect_end_x", "transect_end_y", "dt"]
dataset_shorelinemonitor_hr = dataset_shorelinemonitor_hr.drop_vars(delete_vars)

# change dtypes
object_vars = ["transect_id", "hotspot_id", "geometry"]
for ov in object_vars:
    dataset_shorelinemonitor_hr[ov] = dataset_shorelinemonitor_hr[ov].astype("S")

all_vars = list(dataset_shorelinemonitor_hr.keys())
data_vars = ["sp", "outliers"]
# set some data variables to coordinates to avoid duplication of dimensions in later stage
dataset_shorelinemonitor_hr = dataset_shorelinemonitor_hr.set_coords(
    [v for v in all_vars if v not in data_vars]
)

# drop index xarray
dataset_shorelinemonitor_hr = dataset_shorelinemonitor_hr.drop_vars("index")

<IPython.core.display.Javascript object>

In [62]:
dataset_shorelinemonitor_hr

<IPython.core.display.Javascript object>

In [79]:
# Remove the "time_step" variable; drop_vars replaces the deprecated Dataset.drop
dataset_shorelinemonitor_hr = dataset_shorelinemonitor_hr.drop_vars("time_step")

<IPython.core.display.Javascript object>

In [80]:
# dataset_shorelinemonitor = dataset_shorelinemonitor.rename_vars({'time_step' : 'time'})
# Attributes for the time axis
rename = {"long_name": "Time", "standard_name": "time", "units": "yr"}

# NOTE(review): "time" is only used as a dimension name elsewhere in this
# notebook; confirm that a "time" coordinate exists so these attributes persist.
dataset_shorelinemonitor_hr["time"].attrs.update(rename)

<IPython.core.display.Javascript object>

In [81]:
# save new .nc files
# Remove "units" from the time attributes (if present) before writing
dataset_shorelinemonitor_hr["time"].attrs.pop("units", None)

# Output path: the input file name with ".nc" replaced by "_CF.nc"
cf_nc_path = str(dataset_dir_shorelinemonitor_hr).replace(".nc", "_CF.nc")
dataset_shorelinemonitor_hr.to_netcdf(path=cf_nc_path)

<IPython.core.display.Javascript object>

In [82]:
dataset_shorelinemonitor_hr = dataset_shorelinemonitor_hr.swap_dims(
    {"transect_origin_y": "stations"}
)

<IPython.core.display.Javascript object>

In [83]:
dataset_shorelinemonitor_hr

<IPython.core.display.Javascript object>

### Check CF compliancy altered NetCDF files

In [84]:
%%capture cap --no-stderr
# check altered CF compliancy

check_compliancy(testfile=str(dataset_dir_shorelinemonitor_hr).replace(".nc", "_CF.nc"), working_dir=CF_dir)

<IPython.core.display.Javascript object>

In [85]:
# save altered CF compliancy
save_compliancy(
    cap,
    testfile=str(dataset_dir_shorelinemonitor_hr).replace(".nc", "_CF.nc"),
    working_dir=CF_dir,
)



<IPython.core.display.Javascript object>

### write data to Zarr files

In [86]:
# export to zarr in write mode (to overwrite if exists)
dataset_shorelinemonitor_hr.to_zarr(
    dataset_dir.joinpath(f"{dataset_out_file}.zarr"), mode="w"
)

<xarray.backends.zarr.ZarrStore at 0x155d91499e0>

<IPython.core.display.Javascript object>

In [87]:
# Re-open the written store to check the result
test = xr.open_zarr(dataset_dir.joinpath(f"{dataset_out_file}.zarr"))

<IPython.core.display.Javascript object>

In [88]:
test

Unnamed: 0,Array,Chunk
Bytes,236.86 kiB,236.86 kiB
Shape,"(60637,)","(60637,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 236.86 kiB 236.86 kiB Shape (60637,) (60637,) Dask graph 1 chunks in 2 graph layers Data type float32 numpy.ndarray",60637  1,

Unnamed: 0,Array,Chunk
Bytes,236.86 kiB,236.86 kiB
Shape,"(60637,)","(60637,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.53 MiB,451.54 kiB
Shape,"(60637,)","(7580,)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,|S61 numpy.ndarray,|S61 numpy.ndarray
"Array Chunk Bytes 3.53 MiB 451.54 kiB Shape (60637,) (7580,) Dask graph 8 chunks in 2 graph layers Data type |S61 numpy.ndarray",60637  1,

Unnamed: 0,Array,Chunk
Bytes,3.53 MiB,451.54 kiB
Shape,"(60637,)","(7580,)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,|S61 numpy.ndarray,|S61 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.79 MiB,229.47 kiB
Shape,"(60637,)","(7580,)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,|S31 numpy.ndarray,|S31 numpy.ndarray
"Array Chunk Bytes 1.79 MiB 229.47 kiB Shape (60637,) (7580,) Dask graph 8 chunks in 2 graph layers Data type |S31 numpy.ndarray",60637  1,

Unnamed: 0,Array,Chunk
Bytes,1.79 MiB,229.47 kiB
Shape,"(60637,)","(7580,)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,|S31 numpy.ndarray,|S31 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,236.86 kiB,236.86 kiB
Shape,"(60637,)","(60637,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 236.86 kiB 236.86 kiB Shape (60637,) (60637,) Dask graph 1 chunks in 2 graph layers Data type float32 numpy.ndarray",60637  1,

Unnamed: 0,Array,Chunk
Bytes,236.86 kiB,236.86 kiB
Shape,"(60637,)","(60637,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,473.73 kiB,236.87 kiB
Shape,"(60637,)","(30319,)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 473.73 kiB 236.87 kiB Shape (60637,) (30319,) Dask graph 2 chunks in 2 graph layers Data type float64 numpy.ndarray",60637  1,

Unnamed: 0,Array,Chunk
Bytes,473.73 kiB,236.87 kiB
Shape,"(60637,)","(30319,)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,473.73 kiB,236.87 kiB
Shape,"(60637,)","(30319,)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 473.73 kiB 236.87 kiB Shape (60637,) (30319,) Dask graph 2 chunks in 2 graph layers Data type float64 numpy.ndarray",60637  1,

Unnamed: 0,Array,Chunk
Bytes,473.73 kiB,236.87 kiB
Shape,"(60637,)","(30319,)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,888.24 kiB,222.07 kiB
Shape,"(60637,)","(15160,)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,|S15 numpy.ndarray,|S15 numpy.ndarray
"Array Chunk Bytes 888.24 kiB 222.07 kiB Shape (60637,) (15160,) Dask graph 4 chunks in 2 graph layers Data type |S15 numpy.ndarray",60637  1,

Unnamed: 0,Array,Chunk
Bytes,888.24 kiB,222.07 kiB
Shape,"(60637,)","(15160,)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,|S15 numpy.ndarray,|S15 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,104.55 MiB,843.87 kiB
Shape,"(60637, 452)","(3790, 57)"
Dask graph,128 chunks in 2 graph layers,128 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 104.55 MiB 843.87 kiB Shape (60637, 452) (3790, 57) Dask graph 128 chunks in 2 graph layers Data type float32 numpy.ndarray",452  60637,

Unnamed: 0,Array,Chunk
Bytes,104.55 MiB,843.87 kiB
Shape,"(60637, 452)","(3790, 57)"
Dask graph,128 chunks in 2 graph layers,128 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,209.11 MiB,1.65 MiB
Shape,"(60637, 452)","(3790, 57)"
Dask graph,128 chunks in 2 graph layers,128 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 209.11 MiB 1.65 MiB Shape (60637, 452) (3790, 57) Dask graph 128 chunks in 2 graph layers Data type float64 numpy.ndarray",452  60637,

Unnamed: 0,Array,Chunk
Bytes,209.11 MiB,1.65 MiB
Shape,"(60637, 452)","(3790, 57)"
Dask graph,128 chunks in 2 graph layers,128 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


<IPython.core.display.Javascript object>