# Box Concatenate Test

This notebook tests concatenating many separate NetCDF files into a single NetCDF file or Zarr store. The functionality is exercised on the smaller-scale test boxes located in `/swot/SUM05/amf2288/sync-boxes/test_boxes`.

In [1]:
import xarray as xr
import matplotlib.pyplot as plt
import numpy as np
import argopy
import scipy.ndimage as filter
import scipy
import matplotlib
import gsw
from pathlib import Path

import dask
from dask.diagnostics import ProgressBar

In [2]:
def concatenate_netcdf(input_dir: str, first_dim: str, second_dim: str, output_dir: str, output_file: str,
                       chunk_size: int = 256):
    """Concatenate every ``*.nc`` file in a directory into a single NetCDF file.

    The input files are opened lazily, concatenated along ``first_dim``,
    rechunked along ``first_dim`` and ``second_dim``, and written to
    ``output_dir / output_file``.

    Parameters
    ----------
    input_dir : str
        Directory searched (non-recursively) for ``*.nc`` files.
    first_dim : str
        Dimension along which the datasets are concatenated.
    second_dim : str
        Second dimension; only used when rechunking the combined dataset.
    output_dir : str
        Directory for the output file. Created if it does not exist.
    output_file : str
        File name of the combined NetCDF file inside ``output_dir``.
    chunk_size : int, optional
        Chunk length applied along both ``first_dim`` and ``second_dim``.
        Defaults to 256.

    Raises
    ------
    ValueError
        If ``input_dir`` contains no ``*.nc`` files.
    """
    # glob() yields files in arbitrary (filesystem) order; sort so that the
    # order of the concatenated data is reproducible between runs.
    netcdf_files = sorted(Path(input_dir).glob("*.nc"))
    if not netcdf_files:
        raise ValueError(f"No NetCDF files (*.nc) found in {input_dir!r}")

    # Create the output directory if it doesn't exist
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    output_file_path = output_path / output_file

    datasets = []
    try:
        # Open datasets lazily with dask
        for file in netcdf_files:
            datasets.append(xr.open_dataset(str(file), chunks={}))

        # Concatenate along the first dimension
        combined_first_dim = xr.concat(datasets, dim=first_dim)

        # Rechunk the data to ensure uniform chunk sizes
        combined_rechunked = combined_first_dim.chunk({first_dim: chunk_size, second_dim: chunk_size})

        # Save to NetCDF; compute=True means the write has finished before
        # the input files are closed below.
        with ProgressBar():
            combined_rechunked.to_netcdf(output_file_path, compute=True)
    finally:
        # Release the handles of every input file that was opened, even if
        # concatenation or writing failed part-way through.
        for dataset in datasets:
            dataset.close()

def concatenate_zarr(input_dir: str, first_dim: str, second_dim: str, output_dir: str, output_file: str,
                     chunk_size: int = 256):
    """Concatenate every ``*.nc`` file in a directory into a single Zarr store.

    The input files are opened lazily, concatenated along ``first_dim``,
    rechunked along ``first_dim`` and ``second_dim``, and written to
    ``output_dir / output_file``. An existing store at that location is
    overwritten, so the function can be re-run.

    Parameters
    ----------
    input_dir : str
        Directory searched (non-recursively) for ``*.nc`` files.
    first_dim : str
        Dimension along which the datasets are concatenated.
    second_dim : str
        Second dimension; only used when rechunking the combined dataset.
    output_dir : str
        Directory for the output store. Created if it does not exist.
    output_file : str
        Name of the combined Zarr store inside ``output_dir``.
    chunk_size : int, optional
        Chunk length applied along both ``first_dim`` and ``second_dim``.
        Defaults to 256.

    Raises
    ------
    ValueError
        If ``input_dir`` contains no ``*.nc`` files.
    """
    # glob() yields files in arbitrary (filesystem) order; sort so that the
    # order of the concatenated data is reproducible between runs.
    netcdf_files = sorted(Path(input_dir).glob("*.nc"))
    if not netcdf_files:
        raise ValueError(f"No NetCDF files (*.nc) found in {input_dir!r}")

    # Create the output directory if it doesn't exist
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    output_file_path = output_path / output_file

    datasets = []
    try:
        # Open datasets lazily with dask
        for file in netcdf_files:
            datasets.append(xr.open_dataset(str(file), chunks={}))

        # Concatenate along the first dimension
        combined_first_dim = xr.concat(datasets, dim=first_dim)

        # Rechunk the data to ensure uniform chunk sizes
        combined_rechunked = combined_first_dim.chunk({first_dim: chunk_size, second_dim: chunk_size})

        # Save to Zarr. mode="w" replaces an existing store; without it a
        # second run fails because the store already exists, whereas the
        # NetCDF variant overwrites its output.
        with ProgressBar():
            combined_rechunked.to_zarr(output_file_path, mode="w", compute=True)
    finally:
        # Release the handles of every input file that was opened, even if
        # concatenation or writing failed part-way through.
        for dataset in datasets:
            dataset.close()

In [3]:
# Example usage: paths and dimension names shared by the cells below
input_directory = "/swot/SUM05/amf2288/sync-boxes/test_boxes"  # directory scanned for *.nc input files
output_directory = "/swot/SUM05/amf2288/sync-boxes/test_concat"  # created by the concatenate functions if missing
output_netcdf = "test_out.nc"  # file name of the combined NetCDF output
output_zarr = "test_out.zarr"  # name of the combined Zarr store
first_dim = "N_PROF"  # dimension the input files are concatenated along
second_dim = "PRES_INTERPOLATED"  # second dimension; only used for rechunking

In [4]:
# Concatenate the input files along first_dim (rechunking along both dimensions) and save to NetCDF
concatenate_netcdf(input_directory, first_dim, second_dim, output_directory, output_netcdf)

[########################################] | 100% Completed | 1.05 s


In [7]:
# Concatenate the input files along first_dim (rechunking along both dimensions) and save to Zarr
concatenate_zarr(input_directory, first_dim, second_dim, output_directory, output_zarr)

[########################################] | 100% Completed | 1.14 s


In [6]:
# Inspect the combined NetCDF file; the path is built from the configuration
# cell so it cannot drift from where concatenate_netcdf wrote its output.
xr.open_dataset(str(Path(output_directory) / output_netcdf))

In [8]:
# Inspect the combined Zarr store; the path is built from the configuration
# cell so it cannot drift from where concatenate_zarr wrote its output.
xr.open_zarr(str(Path(output_directory) / output_zarr))

Unnamed: 0,Array,Chunk
Bytes,18.23 kiB,2.00 kiB
Shape,"(2334,)","(256,)"
Dask graph,10 chunks in 2 graph layers,10 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 18.23 kiB 2.00 kiB Shape (2334,) (256,) Dask graph 10 chunks in 2 graph layers Data type float64 numpy.ndarray",2334  1,

Unnamed: 0,Array,Chunk
Bytes,18.23 kiB,2.00 kiB
Shape,"(2334,)","(256,)"
Dask graph,10 chunks in 2 graph layers,10 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,18.23 kiB,2.00 kiB
Shape,"(2334,)","(256,)"
Dask graph,10 chunks in 2 graph layers,10 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 18.23 kiB 2.00 kiB Shape (2334,) (256,) Dask graph 10 chunks in 2 graph layers Data type float64 numpy.ndarray",2334  1,

Unnamed: 0,Array,Chunk
Bytes,18.23 kiB,2.00 kiB
Shape,"(2334,)","(256,)"
Dask graph,10 chunks in 2 graph layers,10 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,18.23 kiB,2.00 kiB
Shape,"(2334,)","(256,)"
Dask graph,10 chunks in 2 graph layers,10 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 18.23 kiB 2.00 kiB Shape (2334,) (256,) Dask graph 10 chunks in 2 graph layers Data type int64 numpy.ndarray",2334  1,

Unnamed: 0,Array,Chunk
Bytes,18.23 kiB,2.00 kiB
Shape,"(2334,)","(256,)"
Dask graph,10 chunks in 2 graph layers,10 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,18.23 kiB,2.00 kiB
Shape,"(2334,)","(256,)"
Dask graph,10 chunks in 2 graph layers,10 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 18.23 kiB 2.00 kiB Shape (2334,) (256,) Dask graph 10 chunks in 2 graph layers Data type int64 numpy.ndarray",2334  1,

Unnamed: 0,Array,Chunk
Bytes,18.23 kiB,2.00 kiB
Shape,"(2334,)","(256,)"
Dask graph,10 chunks in 2 graph layers,10 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,18.23 kiB,2.00 kiB
Shape,"(2334,)","(256,)"
Dask graph,10 chunks in 2 graph layers,10 chunks in 2 graph layers
Data type,datetime64[ns] numpy.ndarray,datetime64[ns] numpy.ndarray
"Array Chunk Bytes 18.23 kiB 2.00 kiB Shape (2334,) (256,) Dask graph 10 chunks in 2 graph layers Data type datetime64[ns] numpy.ndarray",2334  1,

Unnamed: 0,Array,Chunk
Bytes,18.23 kiB,2.00 kiB
Shape,"(2334,)","(256,)"
Dask graph,10 chunks in 2 graph layers,10 chunks in 2 graph layers
Data type,datetime64[ns] numpy.ndarray,datetime64[ns] numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,18.23 kiB,2.00 kiB
Shape,"(2334,)","(256,)"
Dask graph,10 chunks in 2 graph layers,10 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 18.23 kiB 2.00 kiB Shape (2334,) (256,) Dask graph 10 chunks in 2 graph layers Data type float64 numpy.ndarray",2334  1,

Unnamed: 0,Array,Chunk
Bytes,18.23 kiB,2.00 kiB
Shape,"(2334,)","(256,)"
Dask graph,10 chunks in 2 graph layers,10 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,18.23 kiB,2.00 kiB
Shape,"(2334,)","(256,)"
Dask graph,10 chunks in 2 graph layers,10 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 18.23 kiB 2.00 kiB Shape (2334,) (256,) Dask graph 10 chunks in 2 graph layers Data type float64 numpy.ndarray",2334  1,

Unnamed: 0,Array,Chunk
Bytes,18.23 kiB,2.00 kiB
Shape,"(2334,)","(256,)"
Dask graph,10 chunks in 2 graph layers,10 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,17.82 MiB,512.00 kiB
Shape,"(2334, 1001)","(256, 256)"
Dask graph,40 chunks in 2 graph layers,40 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 17.82 MiB 512.00 kiB Shape (2334, 1001) (256, 256) Dask graph 40 chunks in 2 graph layers Data type float64 numpy.ndarray",1001  2334,

Unnamed: 0,Array,Chunk
Bytes,17.82 MiB,512.00 kiB
Shape,"(2334, 1001)","(256, 256)"
Dask graph,40 chunks in 2 graph layers,40 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,18.23 kiB,2.00 kiB
Shape,"(2334,)","(256,)"
Dask graph,10 chunks in 2 graph layers,10 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 18.23 kiB 2.00 kiB Shape (2334,) (256,) Dask graph 10 chunks in 2 graph layers Data type int64 numpy.ndarray",2334  1,

Unnamed: 0,Array,Chunk
Bytes,18.23 kiB,2.00 kiB
Shape,"(2334,)","(256,)"
Dask graph,10 chunks in 2 graph layers,10 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,9.12 kiB,1.00 kiB
Shape,"(2334,)","(256,)"
Dask graph,10 chunks in 2 graph layers,10 chunks in 2 graph layers
Data type,,
"Array Chunk Bytes 9.12 kiB 1.00 kiB Shape (2334,) (256,) Dask graph 10 chunks in 2 graph layers Data type",2334  1,

Unnamed: 0,Array,Chunk
Bytes,9.12 kiB,1.00 kiB
Shape,"(2334,)","(256,)"
Dask graph,10 chunks in 2 graph layers,10 chunks in 2 graph layers
Data type,,

Unnamed: 0,Array,Chunk
Bytes,9.12 kiB,1.00 kiB
Shape,"(2334,)","(256,)"
Dask graph,10 chunks in 2 graph layers,10 chunks in 2 graph layers
Data type,,
"Array Chunk Bytes 9.12 kiB 1.00 kiB Shape (2334,) (256,) Dask graph 10 chunks in 2 graph layers Data type",2334  1,

Unnamed: 0,Array,Chunk
Bytes,9.12 kiB,1.00 kiB
Shape,"(2334,)","(256,)"
Dask graph,10 chunks in 2 graph layers,10 chunks in 2 graph layers
Data type,,

Unnamed: 0,Array,Chunk
Bytes,18.23 kiB,2.00 kiB
Shape,"(2334,)","(256,)"
Dask graph,10 chunks in 2 graph layers,10 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 18.23 kiB 2.00 kiB Shape (2334,) (256,) Dask graph 10 chunks in 2 graph layers Data type int64 numpy.ndarray",2334  1,

Unnamed: 0,Array,Chunk
Bytes,18.23 kiB,2.00 kiB
Shape,"(2334,)","(256,)"
Dask graph,10 chunks in 2 graph layers,10 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,8.91 MiB,256.00 kiB
Shape,"(2334, 1001)","(256, 256)"
Dask graph,40 chunks in 2 graph layers,40 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 8.91 MiB 256.00 kiB Shape (2334, 1001) (256, 256) Dask graph 40 chunks in 2 graph layers Data type float32 numpy.ndarray",1001  2334,

Unnamed: 0,Array,Chunk
Bytes,8.91 MiB,256.00 kiB
Shape,"(2334, 1001)","(256, 256)"
Dask graph,40 chunks in 2 graph layers,40 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,8.91 MiB,256.00 kiB
Shape,"(2334, 1001)","(256, 256)"
Dask graph,40 chunks in 2 graph layers,40 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 8.91 MiB 256.00 kiB Shape (2334, 1001) (256, 256) Dask graph 40 chunks in 2 graph layers Data type float32 numpy.ndarray",1001  2334,

Unnamed: 0,Array,Chunk
Bytes,8.91 MiB,256.00 kiB
Shape,"(2334, 1001)","(256, 256)"
Dask graph,40 chunks in 2 graph layers,40 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,17.82 MiB,512.00 kiB
Shape,"(2334, 1001)","(256, 256)"
Dask graph,40 chunks in 2 graph layers,40 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 17.82 MiB 512.00 kiB Shape (2334, 1001) (256, 256) Dask graph 40 chunks in 2 graph layers Data type float64 numpy.ndarray",1001  2334,

Unnamed: 0,Array,Chunk
Bytes,17.82 MiB,512.00 kiB
Shape,"(2334, 1001)","(256, 256)"
Dask graph,40 chunks in 2 graph layers,40 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,17.82 MiB,512.00 kiB
Shape,"(2334, 1001)","(256, 256)"
Dask graph,40 chunks in 2 graph layers,40 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 17.82 MiB 512.00 kiB Shape (2334, 1001) (256, 256) Dask graph 40 chunks in 2 graph layers Data type float64 numpy.ndarray",1001  2334,

Unnamed: 0,Array,Chunk
Bytes,17.82 MiB,512.00 kiB
Shape,"(2334, 1001)","(256, 256)"
Dask graph,40 chunks in 2 graph layers,40 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,17.82 MiB,512.00 kiB
Shape,"(2334, 1001)","(256, 256)"
Dask graph,40 chunks in 2 graph layers,40 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 17.82 MiB 512.00 kiB Shape (2334, 1001) (256, 256) Dask graph 40 chunks in 2 graph layers Data type float64 numpy.ndarray",1001  2334,

Unnamed: 0,Array,Chunk
Bytes,17.82 MiB,512.00 kiB
Shape,"(2334, 1001)","(256, 256)"
Dask graph,40 chunks in 2 graph layers,40 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,8.91 MiB,256.00 kiB
Shape,"(2334, 1001)","(256, 256)"
Dask graph,40 chunks in 2 graph layers,40 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 8.91 MiB 256.00 kiB Shape (2334, 1001) (256, 256) Dask graph 40 chunks in 2 graph layers Data type float32 numpy.ndarray",1001  2334,

Unnamed: 0,Array,Chunk
Bytes,8.91 MiB,256.00 kiB
Shape,"(2334, 1001)","(256, 256)"
Dask graph,40 chunks in 2 graph layers,40 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,8.91 MiB,256.00 kiB
Shape,"(2334, 1001)","(256, 256)"
Dask graph,40 chunks in 2 graph layers,40 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 8.91 MiB 256.00 kiB Shape (2334, 1001) (256, 256) Dask graph 40 chunks in 2 graph layers Data type float32 numpy.ndarray",1001  2334,

Unnamed: 0,Array,Chunk
Bytes,8.91 MiB,256.00 kiB
Shape,"(2334, 1001)","(256, 256)"
Dask graph,40 chunks in 2 graph layers,40 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
