In [None]:
import xarray as xr
import numpy as np

# Load the 4D SINMOD dataset
ds = xr.open_dataset("/cluster/projects/itk-SINMOD/coral-mapping/midnor/PhysStates_2019.nc")

#Load the 2D midnor SINMOD dataset to get "gridLons" variable
ds_2d = xr.open_dataset("/cluster/projects/itk-SINMOD/coral-mapping/midnor/samp_2D_jan_jun.nc")
gridLons = ds_2d["gridLons"]

# Check the dataset structure
#print(ds)

#Add gridLons to ds
ds['gridLons'] = gridLons

# Check the result
print(ds)

# List of variables for processing
variables = ['temperature', 'salinity', 'u_velocity', 'v_velocity']


In [2]:
import xarray as xr
import dask
import time 

def process_bottom_layer(
    file_path,
    variable_name,
    chunks={"time":-1, "zc": -1, "yc": 50, "xc": 50},
    output_path=None
):
    """
    Process bottom layer data for a specified variable in a NetCDF file.
    
    Parameters:
    - file_path (str): Path to the NetCDF file.
    - variable_name (str): Name of the variable to process.
    - chunks (dict): Chunking strategy for the dataset.
    - slice_dict (dict): Slicing parameters for the data (e.g., {"zc": slice(0, 10), "yc": slice(0, 10), "xc": slice(65, 75)}).
    - output_path (str): Path to save the processed file (optional). If None, the result is not saved.
    
    Returns:
    - xarray.DataArray: The time-averaged bottom layer data.
    """
    time_start = time.time()

    # Open the dataset with chunking
    ds = xr.open_dataset(file_path, chunks=chunks)

    print(f"\nAccessed the dataset in {time.time() - time_start:.2f} seconds")
    
    # Extract the variable
    if variable_name == "current_speed":
        data_var = ds["u_velocity"]
    else:
        data_var = ds[variable_name]
    
    # Extract the first time step
    time_slice = data_var.isel(time=0)
    
    # Step 1: Create a mask for valid values in first time step
    valid_mask = ~time_slice.isnull()
    
    # Step 2: Find the index of the bottom-most valid layer for each (yc, xc)
    # Subtract 1 to get the correct index for the bottom layer
    bottom_layer_idx = valid_mask.argmin(dim="zc") - 1

    # Ensure bottom_layer_idx does not go negative (e.g., if all values are invalid in a column)
    bottom_layer_idx = bottom_layer_idx.clip(min=0)
    
    # Step 3: Extract the bottom layer data across all time steps
    if variable_name == "current_speed":
        bottom_layer_data = (data_var.isel(zc=bottom_layer_idx)**2 + ds["v_velocity"].isel(zc=bottom_layer_idx)**2)**0.5
    else:
        bottom_layer_data = data_var.isel(zc=bottom_layer_idx.load())

    print(f"\nExtracted the bottom layer data in {time.time() - time_start:.2f} seconds.\n\nStarting computation of statistics...")

    # CONSIDER CHUNKING THE BOTTOM LAYER DATA FOR EFFICIENT COMPUTATION
    # bottom_layer_data = bottom_layer_data.chunk({"yc": 50, "xc": 50})

    # Step 4: Calculate statistics across time
    time_avg_bottom_layer = bottom_layer_data.mean(dim="time", skipna=True)

    # Calculate both 10th and 90th percentiles
    time_percentiles = bottom_layer_data.quantile([0.1, 0.9], dim="time", skipna=True)

    print(f"\nComputed statistics after {time.time() - time_start:.2f} seconds")

    # Create a new DataArray with the (mean, 10th, 90th) percentiles and explicitly define the 'stat' dimension
    # Concatenate mean and percentiles in one line, drop 'quantile' and concatenate all together
    stats_array = xr.concat([time_avg_bottom_layer, time_percentiles.sel(quantile=0.1).drop_vars("quantile"), time_percentiles.sel(quantile=0.9).drop_vars("quantile")], dim="stat").rename(f"{variable_name}_features")

    # Name each value of the first dimension
    stats_array = stats_array.assign_coords(stat=["mean", "10th_percentile", "90th_percentile"])
    # Save to output file if specified
    if output_path:
        stats_array.to_netcdf(output_path)
    
    return stats_array, bottom_layer_idx

In [6]:
speed_array, idx = process_bottom_layer("/cluster/projects/itk-SINMOD/coral-mapping/midnor/PhysStates_2019.nc", "current_speed")
speed_array


Accessed the dataset in 0.02 seconds


  ds = xr.open_dataset(file_path, chunks=chunks)
  ds = xr.open_dataset(file_path, chunks=chunks)


ValueError: Vectorized indexing with Dask arrays is not supported. Please pass a numpy array by calling ``.compute``. See https://github.com/dask/dask/issues/8958.

In [3]:
import xarray as xr
import time 

def process_bottom_layer_no_dask(
    file_path,
    variable_name,
    #chunks={"time":-1, "zc": -1, "yc": 50, "xc": 50},
    output_path=None
):
    """
    Process bottom layer data for a specified variable in a NetCDF file.
    
    Parameters:
    - file_path (str): Path to the NetCDF file.
    - variable_name (str): Name of the variable to process.
    - chunks (dict): Chunking strategy for the dataset.
    - slice_dict (dict): Slicing parameters for the data (e.g., {"zc": slice(0, 10), "yc": slice(0, 10), "xc": slice(65, 75)}).
    - output_path (str): Path to save the processed file (optional). If None, the result is not saved.
    
    Returns:
    - xarray.DataArray: The time-averaged bottom layer data.
    """
    time_start = time.time()

    # Open the dataset
    ds = xr.open_dataset(file_path)

    print(f"\nAccessed the dataset after {time.time() - time_start:.2f} seconds")
    
    # Extract the variable
    if variable_name == "current_speed":
        data_var = ds["u_velocity"][:,:,0:10,65:75]
    else:
        data_var = ds[variable_name]
    
    # Extract the first time step
    time_slice = data_var.isel(time=0)
    
    # Step 1: Create a mask for valid values in first time step
    valid_mask = ~time_slice.isnull()
    
    # Step 2: Find the index of the bottom-most valid layer for each (yc, xc)
    # Subtract 1 to get the correct index for the bottom layer
    bottom_layer_idx = valid_mask.argmin(dim="zc") - 1

    # Ensure bottom_layer_idx does not go negative (e.g., if all values are invalid in a column)
    bottom_layer_idx = bottom_layer_idx.clip(min=0)
    
    # Step 3: Extract the bottom layer data across all time steps
    if variable_name == "current_speed":
        bottom_layer_data = (data_var.isel(zc=bottom_layer_idx)**2 + ds["v_velocity"][:,:,:10,65:75].isel(zc=bottom_layer_idx)**2)**0.5
    else:
        bottom_layer_data = data_var.isel(zc=bottom_layer_idx)

    ds.close()

    print(f"\nExtracted the bottom layer data after {time.time() - time_start:.2f} seconds.\n\nStarting computation of statistics...")

    # Step 4: Calculate statistics across time
    time_avg_bottom_layer = bottom_layer_data.mean(dim="time", skipna=True)

    # Calculate both 10th and 90th percentiles
    time_percentiles = bottom_layer_data.quantile([0.1, 0.9], dim="time", skipna=True)

    print(f"\nComputed statistics after {time.time() - time_start:.2f} seconds")

    # Create a new DataArray with the (mean, 10th, 90th) percentiles and explicitly define the 'stat' dimension
    # Concatenate mean and percentiles in one line, drop 'quantile' and concatenate all together
    stats_array = xr.concat([time_avg_bottom_layer, time_percentiles.sel(quantile=0.1).drop_vars("quantile"), time_percentiles.sel(quantile=0.9).drop_vars("quantile")], dim="stat").rename(f"{variable_name}_features")

    # Name each value of the first dimension
    stats_array = stats_array.assign_coords(stat=["mean", "10th_percentile", "90th_percentile"])

    # Save to output file if specified
    if output_path:
        stats_array.to_netcdf(output_path)
    
    return stats_array, bottom_layer_idx

In [None]:
current_array_loaded, idx_no_dask = process_bottom_layer_no_dask("/cluster/projects/itk-SINMOD/coral-mapping/midnor/PhysStates_2019.nc", "temperature")


Accessed the dataset after 0.18 seconds


In [None]:
print(current_array_loaded)

In [None]:
import matplotlib.pyplot as plt

variable_name = "current_speed"
data_array = current_array_loaded

# Visualize the results for the entire data
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Plot mean, 10th percentile, and 90th percentile for the entire dataset
data_array.sel(stat="mean").plot(ax=axes[0])
axes[0].set_title(f'{variable_name} - Mean')

data_array.sel(stat="10th_percentile").plot(ax=axes[1])
axes[1].set_title(f'{variable_name} - 10th Percentile')

data_array.sel(stat="90th_percentile").plot(ax=axes[2])
axes[2].set_title(f'{variable_name} - 90th Percentile')

plt.tight_layout()
plt.show()


In [4]:
import numpy as np

data_slice = temp_array_no_dask.isel(xc=slice(65, 75), yc=slice(0, 10))

# 1. Assert that if one of the statistics is NaN, then all three (mean, 10th, 90th percentiles) should also be NaN
nan_mask = data_slice.sel(stat="mean").isnull() | data_slice.sel(stat="10th_percentile").isnull() | data_slice.sel(stat="90th_percentile").isnull()
assert (nan_mask == (data_slice.sel(stat="mean").isnull() & data_slice.sel(stat="10th_percentile").isnull() & data_slice.sel(stat="90th_percentile").isnull())).all(), "If one of the stats is NaN, all of them should be NaN for that grid point."

# 2. Assert that the mean should be greater than or equal to the 10th percentile, and less than or equal to the 90th percentile, ignoring NaNs
assert (data_slice.sel(stat="mean").notnull() >= data_slice.sel(stat="10th_percentile").notnull()).all(), "Mean should be greater than or equal to the 10th percentile"
assert (data_slice.sel(stat="mean").notnull() <= data_slice.sel(stat="90th_percentile").notnull()).all(), "Mean should be less than or equal to the 90th percentile"


NameError: name 'temp_array_no_dask' is not defined

In [None]:
data_slice.where(~nan_mask)