### Imports

In [1]:
import glob
import numpy as np
import xarray as xr
import pandas as pd
import metpy.calc as mpcalc

In [2]:
from dask_jobqueue import PBSCluster
from distributed import Client
from dask import delayed

### Scratch directory

In [3]:
lustre_scratch  = "/lustre/desc1/scratch/myasears"

### Spin up a cluster

In [4]:
cluster = PBSCluster(
        job_name = 'dask-eol-25',
        cores = 1,
        memory = '4GiB',
        processes = 1,
        local_directory = lustre_scratch + '/dask/spill',
        log_directory = lustre_scratch + '/dask/logs/',
        resource_spec = 'select=1:ncpus=1:mem=4GB',
        queue = 'casper',
        walltime = '3:00:00',
        interface = 'ext')

In [5]:
client = Client(cluster)

In [6]:
# Scale the cluster and display cluster dashboard URL
n_workers = 5
cluster.scale(n_workers)
client.wait_for_workers(n_workers = n_workers)
cluster

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/myasears/proxy/8787/status,Workers: 5
Total threads: 5,Total memory: 20.00 GiB

0,1
Comm: tcp://128.117.208.100:37311,Workers: 5
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/myasears/proxy/8787/status,Total threads: 5
Started: Just now,Total memory: 20.00 GiB

0,1
Comm: tcp://128.117.208.174:38541,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/myasears/proxy/33061/status,Memory: 4.00 GiB
Nanny: tcp://128.117.208.174:41851,
Local directory: /lustre/desc1/scratch/myasears/dask/spill/dask-scratch-space/worker-s2c7qgdm,Local directory: /lustre/desc1/scratch/myasears/dask/spill/dask-scratch-space/worker-s2c7qgdm
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 57.88 MiB,Spilled bytes: 0 B
Read bytes: 28.38 MiB,Write bytes: 12.82 MiB

0,1
Comm: tcp://128.117.208.174:42279,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/myasears/proxy/36783/status,Memory: 4.00 GiB
Nanny: tcp://128.117.208.174:36575,
Local directory: /lustre/desc1/scratch/myasears/dask/spill/dask-scratch-space/worker-zwa7lm9n,Local directory: /lustre/desc1/scratch/myasears/dask/spill/dask-scratch-space/worker-zwa7lm9n
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 57.90 MiB,Spilled bytes: 0 B
Read bytes: 8.92 MiB,Write bytes: 4.08 MiB

0,1
Comm: tcp://128.117.208.174:42661,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/myasears/proxy/41077/status,Memory: 4.00 GiB
Nanny: tcp://128.117.208.174:37813,
Local directory: /lustre/desc1/scratch/myasears/dask/spill/dask-scratch-space/worker-s4k_pvai,Local directory: /lustre/desc1/scratch/myasears/dask/spill/dask-scratch-space/worker-s4k_pvai
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 57.92 MiB,Spilled bytes: 0 B
Read bytes: 38.61 MiB,Write bytes: 4.14 MiB

0,1
Comm: tcp://128.117.208.175:39139,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/myasears/proxy/38059/status,Memory: 4.00 GiB
Nanny: tcp://128.117.208.175:42895,
Local directory: /lustre/desc1/scratch/myasears/dask/spill/dask-scratch-space/worker-bvrlgsos,Local directory: /lustre/desc1/scratch/myasears/dask/spill/dask-scratch-space/worker-bvrlgsos
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 57.91 MiB,Spilled bytes: 0 B
Read bytes: 1.44 GiB,Write bytes: 3.61 MiB

0,1
Comm: tcp://128.117.208.174:46103,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/myasears/proxy/35599/status,Memory: 4.00 GiB
Nanny: tcp://128.117.208.174:44295,
Local directory: /lustre/desc1/scratch/myasears/dask/spill/dask-scratch-space/worker-o2dcdps6,Local directory: /lustre/desc1/scratch/myasears/dask/spill/dask-scratch-space/worker-o2dcdps6
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 57.91 MiB,Spilled bytes: 0 B
Read bytes: 27.69 MiB,Write bytes: 14.76 MiB


### Load ERA5 data

In [7]:
era5_path = '/glade/campaign/collections/rda/data/d633000/e5.oper.an.pl'

In [8]:
target_lat = 38.0
target_lon = 243.0

start_date = pd.Timestamp("2023-07-11T00:00:00")
end_date = pd.Timestamp("2023-09-27T23:59:59")
yyyymm = ["202307", "202308", "202309"]

var_map = {"Z": "e5.oper.an.pl.128_129_z",
           "U": "e5.oper.an.pl.128_131_u",
           "V": "e5.oper.an.pl.128_132_v",
           "W": "e5.oper.an.pl.128_135_w"
           }

In [9]:
def open_variable(file_prefix, yyyymm):
    files = []
    for month in yyyymm:
        files.extend(sorted(glob.glob(f'{era5_path}/{month}/{file_prefix}*')))

    ds = xr.open_mfdataset(files, combine="by_coords", parallel=True)
    ds_point = ds.sel(latitude=target_lat, longitude=target_lon, time=slice(start_date, end_date))
    
    return ds_point

In [10]:
# Open and subset all variables
datasets = [open_variable(file_prefix, yyyymm) for file_prefix in var_map.values()]

# Merge them together (preserves attrs if consistent)
combined_era5 = xr.merge(datasets, compat="override", combine_attrs="override")

In [11]:
# Convert geopotential to geometric height (m above MSL)
height = mpcalc.geopotential_to_height(combined_era5["Z"])
combined_era5["height_msl"] = height
combined_era5.height_msl.attrs.update({"long_name": "Height above mean sea level", "units": "meters"})

# Drop utc_date variable
combined_era5 = combined_era5.drop_vars("utc_date")

# Change variable names to standardize with other datasets
name_mapping = {"level": "pressure", "Z": "geopotential", "U": "u_wind", "V": "v_wind", "W": "w_wind"}
combined_era5 = combined_era5.rename(name_mapping)
combined_era5

Unnamed: 0,Array,Chunk
Bytes,274.03 kiB,148 B
Shape,"(1896, 37)","(1, 37)"
Dask graph,1896 chunks in 186 graph layers,1896 chunks in 186 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 274.03 kiB 148 B Shape (1896, 37) (1, 37) Dask graph 1896 chunks in 186 graph layers Data type float32 numpy.ndarray",37  1896,

Unnamed: 0,Array,Chunk
Bytes,274.03 kiB,148 B
Shape,"(1896, 37)","(1, 37)"
Dask graph,1896 chunks in 186 graph layers,1896 chunks in 186 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,274.03 kiB,148 B
Shape,"(1896, 37)","(1, 37)"
Dask graph,1896 chunks in 186 graph layers,1896 chunks in 186 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 274.03 kiB 148 B Shape (1896, 37) (1, 37) Dask graph 1896 chunks in 186 graph layers Data type float32 numpy.ndarray",37  1896,

Unnamed: 0,Array,Chunk
Bytes,274.03 kiB,148 B
Shape,"(1896, 37)","(1, 37)"
Dask graph,1896 chunks in 186 graph layers,1896 chunks in 186 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,274.03 kiB,148 B
Shape,"(1896, 37)","(1, 37)"
Dask graph,1896 chunks in 186 graph layers,1896 chunks in 186 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 274.03 kiB 148 B Shape (1896, 37) (1, 37) Dask graph 1896 chunks in 186 graph layers Data type float32 numpy.ndarray",37  1896,

Unnamed: 0,Array,Chunk
Bytes,274.03 kiB,148 B
Shape,"(1896, 37)","(1, 37)"
Dask graph,1896 chunks in 186 graph layers,1896 chunks in 186 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,274.03 kiB,148 B
Shape,"(1896, 37)","(1, 37)"
Dask graph,1896 chunks in 186 graph layers,1896 chunks in 186 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 274.03 kiB 148 B Shape (1896, 37) (1, 37) Dask graph 1896 chunks in 186 graph layers Data type float32 numpy.ndarray",37  1896,

Unnamed: 0,Array,Chunk
Bytes,274.03 kiB,148 B
Shape,"(1896, 37)","(1, 37)"
Dask graph,1896 chunks in 186 graph layers,1896 chunks in 186 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Magnitude,"Array Chunk Bytes 274.03 kiB 148 B Shape (1896, 37) (1, 37) Dask graph 1896 chunks in 189 graph layers Data type float32 numpy.ndarray 37  1896",
"Array Chunk Bytes 274.03 kiB 148 B Shape (1896, 37) (1, 37) Dask graph 1896 chunks in 189 graph layers Data type float32 numpy.ndarray",37  1896,
,Array,Chunk
Bytes,274.03 kiB,148 B
Shape,"(1896, 37)","(1, 37)"
Dask graph,1896 chunks in 189 graph layers,1896 chunks in 189 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
Units,meter,

Unnamed: 0,Array,Chunk
Bytes,274.03 kiB,148 B
Shape,"(1896, 37)","(1, 37)"
Dask graph,1896 chunks in 189 graph layers,1896 chunks in 189 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 274.03 kiB 148 B Shape (1896, 37) (1, 37) Dask graph 1896 chunks in 189 graph layers Data type float32 numpy.ndarray",37  1896,

Unnamed: 0,Array,Chunk
Bytes,274.03 kiB,148 B
Shape,"(1896, 37)","(1, 37)"
Dask graph,1896 chunks in 189 graph layers,1896 chunks in 189 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


### Load 449 data

In [12]:
prof449_path = "/glade/campaign/collections/gdex/data/special_projects/pythia_2025/eol-cookbook/m2hats_iss2_data/prof449Mhz_30min_winds"

In [13]:
files = []
files.extend(sorted(glob.glob(f'{prof449_path}/*.nc')))

In [14]:
# Get min and max altitude from campaign 

def get_minmax_alt(f):
    with xr.open_dataset(f, decode_cf=False) as tmp:
        return float(tmp['height'].min()), float(tmp['height'].max())

min_heights, max_heights = zip(*[get_minmax_alt(f) for f in files])
min_height, max_height = min(min_heights), max(max_heights)

In [15]:
# Retrieve common height grid (with a step of 100m) using max and min values
step = 100
# Create common height grid
common_agl = np.arange(min_height, max_height + step, step)

# Retrieve altitude value from fifth file (after setup -- checked manually)
altitude = xr.open_dataset(files[5]).alt.values

# Use alt to create common MSL grid
common_msl = common_agl + altitude

In [16]:
def open_and_regrid(f, common_agl, common_msl):
    ds = xr.open_dataset(f, chunks="auto")

    # Calculate MSL height from AGL
    msl_height = ds['height'].isel(time=0) + altitude

    # Make height coordinate 1-dimensional (same at every time step)
    height_1d = ds['height'].isel(time=0).values
    ds = ds.assign_coords(height=("height", height_1d))

    # Reindex height coords to span min + max from entire campaign
    ds = ds.reindex(height=common_agl)
    
    # Update coords to the reindexed grid
    ds = ds.assign_coords(
        height_agl=("height", common_agl),
        height_msl=("height", common_msl)
    )
    
    ds.height_msl.attrs.update({"long_name": "Height above mean sea level", "units": "meters"})
    
    # Swap to make geopotential the vertical coordinate
    ds = ds.swap_dims({"height": "height_msl"}).drop_vars("height")

    return ds

In [17]:
datasets = [delayed(open_and_regrid)(f, common_agl, common_msl) for f in files[2:]]
datasets = [d.compute() for d in datasets]
combined_profiler = xr.concat(datasets, dim="time", combine_attrs="override")

In [18]:
combined_profiler = combined_profiler.assign_coords(
    latitude=combined_profiler["lat"].isel(time=0).item(),
    longitude=combined_profiler["lon"].isel(time=0).item(),
    altitude=combined_profiler["alt"].isel(time=0).item()
).drop_vars(["lat", "lon", "alt"])

name_mapping = {
    "u": "u_wind",
    "v": "v_wind",
    "wvert": "w_wind"
}

vars_to_keep = [var for var in name_mapping if var in combined_profiler.data_vars]
combined_profiler = combined_profiler[vars_to_keep]
combined_profiler = combined_profiler.rename(name_mapping)

combined_profiler

Unnamed: 0,Array,Chunk
Bytes,1.37 MiB,8.62 kiB
Shape,"(3696, 97)","(48, 46)"
Dask graph,385 chunks in 404 graph layers,385 chunks in 404 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.37 MiB 8.62 kiB Shape (3696, 97) (48, 46) Dask graph 385 chunks in 404 graph layers Data type float32 numpy.ndarray",97  3696,

Unnamed: 0,Array,Chunk
Bytes,1.37 MiB,8.62 kiB
Shape,"(3696, 97)","(48, 46)"
Dask graph,385 chunks in 404 graph layers,385 chunks in 404 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.37 MiB,8.62 kiB
Shape,"(3696, 97)","(48, 46)"
Dask graph,385 chunks in 404 graph layers,385 chunks in 404 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.37 MiB 8.62 kiB Shape (3696, 97) (48, 46) Dask graph 385 chunks in 404 graph layers Data type float32 numpy.ndarray",97  3696,

Unnamed: 0,Array,Chunk
Bytes,1.37 MiB,8.62 kiB
Shape,"(3696, 97)","(48, 46)"
Dask graph,385 chunks in 404 graph layers,385 chunks in 404 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.37 MiB,8.62 kiB
Shape,"(3696, 97)","(48, 46)"
Dask graph,385 chunks in 404 graph layers,385 chunks in 404 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.37 MiB 8.62 kiB Shape (3696, 97) (48, 46) Dask graph 385 chunks in 404 graph layers Data type float32 numpy.ndarray",97  3696,

Unnamed: 0,Array,Chunk
Bytes,1.37 MiB,8.62 kiB
Shape,"(3696, 97)","(48, 46)"
Dask graph,385 chunks in 404 graph layers,385 chunks in 404 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


I intend to keep more variables -- only keeping wspd for now to align with ERA5; others are not needed.

### Combine the datasets

ERA5 data measures on constant pressure; profiler data is measured on constant height. Will interpolate ERA5 to a constant height before combination, but wanted to keep the dataset true to its original data, at least for now. Want to store both of these campaign files, cleaned, in Zarr format and make a second notebook to read in + compare easily. 