### 9:30 am, Monday June 16th: Taking the mostly working section plots and making the new Calvert cubes using the modified interpolation technique ###

In [None]:
import xarray as xr
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import numpy as np
import os
import matplotlib.dates as mdates
import cmocean as cm
import waypoint_distance as wd
import pandas as pd
from pathlib import Path

def plot_section(ds, topo, xlim=(77,0), max_depth=420):
    """
    Plot a temperature section with isopycnals and bathymetry for one transect.

    Parameters:
    - ds: xarray.Dataset for one transect. Reads 'along', 'depth',
      'temperature', 'potential_density', 'longitude', 'latitude' and 'time'.
      'along' is divided by 1000 before plotting and labelled in km; the 2D
      fields are handed to matplotlib as (depth, along) arrays.
    - topo: xarray.Dataset whose 'Band1' variable is sampled (nearest
      neighbour) at the transect positions; its negated value is drawn as the
      sea floor.
    - xlim: tuple (left, right) of x-axis limits in km. The default (77, 0)
      draws the axis reversed.
    - max_depth: deepest depth shown on the y-axis, also the lower edge of the
      grey sea-floor fill (defaults to the previously hard-coded 420).

    Returns nothing; the figure is left on the current pyplot state so the
    caller can plt.show() or save it. Note that plt.rcParams is updated
    globally.
    """

    # ─── Styling ─────────────────────────────────────────
    plt.rcParams.update({
        'font.size': 12,
        'axes.titlesize': 20,
        'axes.labelsize': 20,
        'xtick.labelsize': 15,
        'ytick.labelsize': 15,
        'legend.fontsize': 20,
        'figure.titlesize': 20})

    # ─── Extract variables ──────────────────────────────
    along_km = ds['along'].values / 1000
    depth = ds['depth'].values
    temperature = ds['temperature'].values
    pdens = ds['potential_density'].values - 1000  # sigma-theta
    lon = ds['longitude'].values
    lat = ds['latitude'].values
    time_top = ds['time'].values  # shape (along,)

    # ─── Interpolate Bathymetry ─────────────────────────
    interp_bathy = topo['Band1'].interp(
        lon=xr.DataArray(lon, dims='along'),
        lat=xr.DataArray(lat, dims='along'),
        method='nearest')
    ocean_floor = -interp_bathy.values

    # ─── Plotting ───────────────────────────────────────
    fig, ax = plt.subplots(figsize=(1.5 * 1.5 * 6.4, 1.5 * 4.8))

    # Fill below bathymetry down to max_depth
    ax.fill_between(along_km, ocean_floor, max_depth,
                    where=~np.isnan(ocean_floor),
                    facecolor='grey', zorder=1)

    # Plot temperature
    cf = ax.pcolormesh(along_km, depth, temperature,
                       shading='auto', cmap=cm.cm.thermal,
                       vmin=5.3, vmax=10, zorder=2)

    # Plot bathymetry
    ax.plot(along_km, ocean_floor, color='black', linewidth=2)

    # Isopycnals
    for levels, color, lw in [
        (np.linspace(24, 27, 7), 'black', 0.5),
        # ([26.6], 'white', 2),
        ([26.7], 'lime', 2),
        ([26.8], 'red', 2),
        ([26.9], 'blue', 2)]:
        cf_iso = ax.contour(along_km, depth, pdens, levels=levels,
                            colors=color, linewidths=lw, linestyles='-')
        # Label only the highlighted isopycnals. The previous test
        # (lw != 0.3) could never be false with the widths used here, so it
        # also labelled the thin background set; this now matches
        # plot_oxygen_section.
        if lw > 0.5:
            ax.clabel(cf_iso, fmt='%1.2f')

    # ─── Top Axis with Time Labels ──────────────────────
    nticks = 8
    idx_ticks = np.linspace(0, len(along_km) - 1, nticks, dtype=int)
    tick_locs = along_km[idx_ticks]
    tick_times = time_top[idx_ticks]

    # Remove NaT
    valid_mask = ~pd.isna(tick_times)
    tick_locs = tick_locs[valid_mask]
    tick_times = tick_times[valid_mask]
    tick_labels = [pd.to_datetime(t).strftime('%b %d %H:%M') for t in tick_times]

    # Every sampled time can be NaT; skip the time axis rather than fail.
    if len(tick_locs) > 0:
        ax_top = ax.secondary_xaxis('top')
        ax_top.set_xticks(tick_locs)
        ax_top.set_xticklabels(tick_labels, rotation=30, ha='center', fontsize=10)

    # ─── Labels and Limits ──────────────────────────────
    ax.set_xlabel('Along-Transect Distance (km)')
    ax.set_ylabel('Depth (m)')
    # Passing (max_depth, 0) already puts the surface at the top, so no
    # separate invert_yaxis() call is needed.
    ax.set_ylim(max_depth, 0)
    ax.set_xlim(xlim)

    if len(tick_times) > 0:
        tstr = pd.to_datetime(tick_times.min()).strftime('%Y-%m-%d')
        ax.set_title(f'Temperature Section ({tstr})')
    else:
        # min() of an empty sequence raises, so fall back to an undated title.
        ax.set_title('Temperature Section')

    plt.colorbar(cf, ax=ax, label='Temperature (°C)')
    plt.tight_layout()

def interpolate(ds, step=50, extrapolate=True):
    """
    Regrid one transect leg onto a regular along-track grid and gap-fill it.

    Processing order:
    1. Drop duplicate 'along' values (this also sorts the leg by ascending
       'along') and swap the 'time' dimension for 'along'.
    2. Linearly interpolate every variable onto a grid of spacing `step`, and
       interpolate 'time' separately through its float representation.
    3. Trim the depth axis to the deepest level that has any temperature data
       between 0 and 20000 along-track units (the original comment calls this
       the 0–20 km region).
    4. Record, per column, the last depth level with valid temperature in
       'valid_temp_depth' (computed before any gap filling).
    5. Fill NaNs along 'along' with nearest-neighbour values in every variable
       that has both 'depth' and 'along' dimensions.

    Parameters:
    - ds: xarray.Dataset with a 'time' dimension, an 'along' variable on
      'time', a 'depth' dimension and a 'temperature' variable on
      depth and time. Assumes 'depth' is stored in ascending order.
    - step: grid spacing, in the same units as 'along'. Must be positive.
    - extrapolate: when True (default, the previous behaviour) the
      nearest-neighbour fill also extends past the first/last valid sample of
      each depth row; when False only interior gaps are filled.

    Returns:
    - xarray.Dataset on the regular 'along' grid, including 'time' and
      'valid_temp_depth' as along-indexed variables.

    Raises:
    - ValueError: if `step` is not positive or there is no valid temperature
      inside the 0–20000 reference region.
    """
    if step <= 0:
        raise ValueError("step must be positive")

    # Remove duplicate along values (np.unique returns indices of the sorted
    # unique values, so the leg ends up ordered by ascending 'along').
    _, index_unique = np.unique(ds['along'], return_index=True)
    ds = ds.isel(time=index_unique)

    # Build regular along grid. Using half a step as the stop margin keeps the
    # last node included without overshooting when step is smaller than 1.
    min_along = np.floor(ds['along'].min().item() / step) * step
    max_along = np.ceil(ds['along'].max().item() / step) * step
    along_grid = np.arange(min_along, max_along + step / 2, step)

    # Swap time with along for interpolation
    ds = ds.swap_dims({'time': 'along'})
    ds_interp = ds.interp(along=along_grid)

    # Interpolate time manually (since it's not numeric by default). Grid
    # nodes outside the sampled range become NaN and therefore NaT.
    interp_time = np.interp(
        along_grid,
        ds['along'].values,
        ds['time'].values.astype('datetime64[ns]').astype('float64'),
        left=np.nan,
        right=np.nan
    )
    ds_interp['time'] = ('along', interp_time.astype('datetime64[ns]'))

    # ─── Trim depth axis using the 0–20 km reference region ─────
    in_reference = ((ds_interp['along'] >= 0) & (ds_interp['along'] <= 20000)).values
    temp_reference = ds_interp['temperature'].transpose('depth', 'along').values[:, in_reference]
    depth_has_data = ~np.isnan(temp_reference).all(axis=1)
    if not depth_has_data.any():
        # Previously this surfaced as an opaque "zero-size array" reduction
        # error from max(); the exception type is unchanged.
        raise ValueError("No valid temperature data between 0 and 20 km along-track")
    max_valid_depth = ds_interp['depth'].values[depth_has_data].max()

    # Limit to valid depth range
    ds_interp = ds_interp.sel(depth=ds_interp['depth'] <= max_valid_depth)

    # ─── Vectorized valid_temp_depth assignment ─────
    temp = ds_interp['temperature'].transpose('depth', 'along').values
    depth_vals = ds_interp['depth'].values
    valid_mask = ~np.isnan(temp)
    # argmax on the depth-reversed mask gives, per column, how many levels up
    # from the last depth index the first valid sample sits.
    first_valid_idx_from_bottom = valid_mask[::-1, :].argmax(axis=0)
    has_valid_data = valid_mask.any(axis=0)
    deepest_valid = np.where(has_valid_data,
                             depth_vals[-1 - first_valid_idx_from_bottom],
                             np.nan)
    ds_interp['valid_temp_depth'] = ('along', deepest_valid)

    # ─── Fill NaNs along each depth row using nearest ─────
    # interpolate_na works on each 1D slice along 'along', so one call per
    # variable replaces the former per-row Python loop and keeps attributes.
    fill_kwargs = {'fill_value': 'extrapolate'} if extrapolate else {}
    # Snapshot the names: the dataset is reassigned inside the loop.
    for var in list(ds_interp.data_vars):
        da = ds_interp[var]
        if 'along' not in da.dims or 'depth' not in da.dims:
            continue
        ds_interp[var] = da.transpose('depth', 'along').interpolate_na(
            dim='along',
            method='nearest',
            **fill_kwargs)

    return ds_interp

def clean_and_interpolate(file_pathway, topo):
    """
    Split one mission file into outbound and return legs, then clean, regrid
    and mask each leg.

    The ship/glider positions are projected onto the hard-coded waypoint line
    with wd.get_simple_distance, whose first two outputs are stored as 'along'
    and 'across'. The record is cut at the largest 'along' value: everything
    up to and including it is the outbound leg, the rest is the return leg.
    Each leg is then repeatedly filtered until 'along' is strictly increasing
    (outbound) or strictly decreasing (return), passed through interpolate()
    and mask_dataset().

    Parameters:
    - file_pathway: path of the NetCDF file to open with xarray.
    - topo: bathymetry xarray.Dataset, forwarded to mask_dataset().

    Returns:
    - (ds_out_cleaned, ds_return_cleaned); either element is None when that
      leg is too short (fewer than 10 valid points before filtering or fewer
      than 100 after) or when processing it raised an exception, in which case
      a message is printed.
    """
    # Load everything into memory and close the file straight away so the
    # handle is not leaked; the legs are fully computed below anyway.
    with xr.open_dataset(file_pathway) as source:
        ds = source.load()

    waypoint_lon = np.array([-127.950, -128.115, -128.243, -128.514, -128.646, -128.798])
    waypoint_lat = np.array([51.757, 51.705, 51.715, 51.450, 51.4165, 51.408])
    central_lat = 51.715

    alongx, acrossx, _ = wd.get_simple_distance(
        shiplon=ds['longitude'].values,
        shiplat=ds['latitude'].values,
        wplon=waypoint_lon,
        wplat=waypoint_lat,
        central_lat=central_lat)

    ds = ds.assign(along=('time', alongx), across=('time', acrossx))

    along_all = ds['along'].values
    if np.all(np.isnan(along_all)):
        # No usable positions at all: neither leg can be built.
        return None, None

    # np.argmax treats NaN as the maximum and would return the first NaN
    # position, cutting the legs at the wrong place; nanargmax ignores NaNs.
    peak_idx = int(np.nanargmax(along_all))
    ds_out = ds.isel(time=slice(0, peak_idx + 1))
    ds_return = ds.isel(time=slice(peak_idx + 1, None))

    results = {}

    for leg, name in [(ds_out, "out"), (ds_return, "return")]:
        try:
            if leg['time'].size < 2 or np.count_nonzero(~np.isnan(leg['along'])) < 10:
                continue

            # Drop points that move the wrong way along the line, and repeat
            # until a pass removes nothing (removing a point changes the
            # gradient of its neighbours).
            prev_len = -1
            while prev_len != leg.sizes['time']:
                if leg.sizes['time'] < 3:
                    raise ValueError("Too short for gradient")
                prev_len = leg.sizes['time']
                grad = np.gradient(leg['along'].values)
                keep_mask = grad > 0 if name == "out" else grad < 0
                # Positional boolean mask, so isel is the matching indexer.
                leg = leg.isel(time=keep_mask)

            if np.count_nonzero(~np.isnan(leg['along'])) < 100:
                continue

            # Interpolate before assigning to results
            leg_interp = interpolate(leg, step=50)
            leg_interp = mask_dataset(leg_interp, topo)
            results[name] = leg_interp

        except Exception as e:
            # Best-effort by design: one bad leg must not stop the other.
            print(f"⚠️ Skipping {name} leg in {file_pathway}: {e}")
            continue

    return results.get("out"), results.get("return")

def mask_dataset(ds, topo):
    """
    Applies depth-based masking to all 2D (depth, along) variables using
    valid_temp_depth and bathymetry clearance logic.

    For every along-track column the "clearance" is the sea-floor depth minus
    the deepest valid temperature depth. The mean clearance over the trusted
    window (20000 <= along <= 77000) sets how close to the bottom data is
    allowed to reach elsewhere:
    - columns whose own clearance is at or below the mean keep data down to
      their own valid_temp_depth;
    - other columns keep data down to (sea floor - mean clearance);
    - nothing is kept below the sea floor, and columns with an undefined limit
      (e.g. no bathymetry value) are blanked entirely.
    If the trusted window holds no finite clearance at all, each column is
    simply limited to its own valid_temp_depth instead of blanking the whole
    dataset.

    Parameters:
    - ds: xarray.Dataset, must include 'valid_temp_depth', 'along', 'depth',
      'longitude' and 'latitude'. It is modified in place.
    - topo: xarray.Dataset, bathymetry; 'Band1' is sampled by nearest
      neighbour on 'lon'/'lat' and negated to get the sea-floor depth.

    Returns:
    - the same Dataset object, with every variable whose dimensions are
      exactly {'depth', 'along'} set to NaN below the computed limit.

    Raises:
    - ValueError: if 'valid_temp_depth' is missing.
    """
    if 'valid_temp_depth' not in ds:
        raise ValueError("Dataset must include 'valid_temp_depth'")

    # Extract variables
    along = ds['along'].values
    depth = ds['depth'].values
    lon = ds['longitude'].values
    lat = ds['latitude'].values

    # Interpolate bathymetry
    ocean_floor = -topo['Band1'].interp(
        lon=xr.DataArray(lon, dims='along'),
        lat=xr.DataArray(lat, dims='along'),
        method='nearest').values

    valid_depths = ds['valid_temp_depth'].values
    local_clearance = ocean_floor - valid_depths

    # Average clearance in trusted region
    trusted = (along >= 20000) & (along <= 77000)
    trusted_clearance = local_clearance[trusted]

    if np.isfinite(trusted_clearance).any():
        mean_clearance = np.nanmean(trusted_clearance)
        # valid_depths is used directly rather than
        # ocean_floor - local_clearance: the round trip through floating point
        # can land a hair above the grid depth and clip the deepest valid row.
        mask_depth = np.minimum(
            np.where(local_clearance <= mean_clearance,
                     valid_depths,
                     ocean_floor - mean_clearance),
            ocean_floor
        )
    else:
        # A NaN mean would turn every limit into NaN and blank everything.
        mask_depth = np.minimum(valid_depths, ocean_floor)

    # Build one (depth, along) keep-mask instead of looping over columns.
    with np.errstate(invalid='ignore'):
        below_limit = depth[:, np.newaxis] > mask_depth[np.newaxis, :]
    no_limit = np.isnan(mask_depth)[np.newaxis, :]
    keep = xr.DataArray(~(below_limit | no_limit), dims=('depth', 'along'))

    # Apply mask to all 2D (depth, along) variables. Broadcasting is by
    # dimension name, so either dimension order is handled and variable
    # attributes are preserved.
    for var in list(ds.data_vars):
        da = ds[var]
        if set(da.dims) == {'depth', 'along'}:
            ds[var] = da.where(keep)

    return ds

#############################################
## _____________ Processing ______________ ##
#############################################
if False:

    def process_and_plot(file_pathway):
        """Clean one mission file, plot a temperature section per leg, return the return leg."""
        bathy_file = os.path.expanduser('~/Desktop/british_columbia_3_msl_2013.nc')
        topo = xr.open_dataset(bathy_file)

        outbound, inbound = clean_and_interpolate(file_pathway, topo)

        # Report a missing leg instead of plotting it.
        if outbound is None:
            print("No outbound leg found.")
        else:
            plot_section(outbound, topo)

        if inbound is None:
            print("No return leg found.")
        else:
            plot_section(inbound, topo)

        return inbound

    file_pathway = '~/CalvertLine_reprocessed/dfo-hal1002-20240702_grid_delayed.nc'
    ds_return_cleaned = process_and_plot(file_pathway)
if False:
    # Load bathymetry once
    topo = xr.open_dataset(os.path.expanduser('~/Desktop/british_columbia_3_msl_2013.nc'))

    # NOTE(review): this used to read "~/Users/martinwilliamson/Desktop/...",
    # which expands to <home>/Users/martinwilliamson/Desktop/... and can never
    # match. "~/Desktop" is an assumption based on the other paths in this
    # notebook — confirm before enabling this cell.
    input_dir = Path("~/Desktop/dfo-hal1002-20250506_grid.nc").expanduser()
    output_dir = Path("~/Desktop/cleaned_transects").expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    # The path above names a single file, and globbing inside a file yields
    # nothing, so accept either one file or a directory of *_grid.nc files.
    if input_dir.is_file():
        all_files = [input_dir]
    else:
        all_files = sorted(input_dir.glob("*_grid.nc"))

    for file_path in all_files:
        print(f"🧼 Cleaning & interpolating: {file_path.name}")

        try:
            ds_out, ds_return = clean_and_interpolate(str(file_path), topo)

            for ds, leg in [(ds_out, "out"), (ds_return, "return")]:
                if ds is None:
                    print(f"⚠️ No {leg} leg found in {file_path.name}")
                    continue

                # pd.notna is the NaT-aware test for datetime arrays.
                times = ds['time'].values
                valid_times = times[pd.notna(times)]
                if len(valid_times) == 0:
                    print(f"⚠️ Skipping {file_path.name} — no valid times in {leg} leg.")
                    continue

                # Output name is the date of the first valid time plus the leg.
                timestamp = pd.to_datetime(valid_times[0]).strftime('%Y%m%d')
                out_path = output_dir / f"{timestamp}_{leg}.nc"
                ds.to_netcdf(out_path)
                print(f"✅ Saved {leg}: {out_path.name}")

        except Exception as e:
            print(f"❌ Failed on {file_path.name}: {e}")
        
file_path = Path("~/Desktop/dfo-hal1002-20220804_grid_delayed.nc").expanduser()
topo = xr.open_dataset(os.path.expanduser('~/Desktop/Summer 2025 Python/british_columbia_3_msl_2013.nc'))
output_dir = Path("~/Desktop/Summer 2025 Python/cleaned_transects").expanduser()
# to_netcdf does not create missing folders, so make sure the target exists.
output_dir.mkdir(parents=True, exist_ok=True)
ds_out, ds_return = clean_and_interpolate(str(file_path), topo)

# Save each leg that was produced, named by the date of its first valid time:
for ds, leg in [(ds_out, "out"), (ds_return, "return")]:
    if ds is None:
        print(f"⚠️ No {leg} leg found in {file_path.name}")
        continue

    # pd.notna is the NaT-aware test for datetime arrays.
    times = ds['time'].values
    valid_times = times[pd.notna(times)]
    if len(valid_times) == 0:
        print(f"⚠️ Skipping {file_path.name} — no valid times in {leg} leg.")
        continue

    timestamp = pd.to_datetime(valid_times[0]).strftime('%Y%m%d')
    out_path = output_dir / f"{timestamp}_{leg}.nc"
    ds.to_netcdf(out_path)
    print(f"✅ Saved {leg}: {out_path.name}")

# Adding new glider files to the cube without remaking full cube:

In [None]:
from pathlib import Path
import xarray as xr

def demote_coords(ds):
    """Return ds with 'longitude'/'latitude' turned into data variables if they are coordinates."""
    present = [name for name in ('longitude', 'latitude') if name in ds.coords]
    if present:
        ds = ds.reset_coords(present)
    return ds


existing_cube = xr.open_dataset("~/Desktop/Summer 2025 Python/calvert_cube.nc")

# Cleaned transect files to append (one entry per file)
new_files = [
    Path("~/Desktop/Summer 2025 Python/cleaned_transects/20220804_out.nc").expanduser(),
    # Path("~/Desktop/Summer 2025 Python/cleaned_transects/20250625_return.nc").expanduser()
]

# Give each new file a length-1 'transect' dimension labelled with its file stem
new_datasets = []
for nc_file in new_files:
    transect_label = nc_file.stem
    ds = xr.open_dataset(nc_file).expand_dims(transect=[transect_label])
    new_datasets.append(ds)
    print(f"✅ Prepared: {transect_label}")

# longitude/latitude must be plain variables in every dataset before concatenating
existing_cube = demote_coords(existing_cube)
new_datasets_cleaned = [demote_coords(ds) for ds in new_datasets]

updated_cube = xr.concat(
    [existing_cube, *new_datasets_cleaned],
    dim="transect",
    compat="no_conflicts",
)

updated_cube.to_netcdf("~/Desktop/Summer 2025 Python/transect_cube.nc")
print("✅ Saved updated cube to ~/Desktop/Summer 2025 Python/transect_cube.nc")

In [None]:
from pathlib import Path
import xarray as xr

cube_path = Path("~/Desktop/Summer 2025 Python/calvert_cube.nc").expanduser()

# This cell overwrites the cube it reads. xr.open_dataset is lazy and keeps the
# file open, so the cube is loaded into memory and closed before writing back.
with xr.open_dataset(cube_path) as cube_on_disk:
    existing_cube = cube_on_disk.load()

# Cleaned transect files to append (one entry per file)
new_files = [
    Path("~/Desktop/Summer 2025 Python/cleaned_transects/20220804_out.nc").expanduser(),
    # Path("~/Desktop/Summer 2025 Python/cleaned_transects/20250625_return.nc").expanduser()
]

# Always demote longitude and latitude to variables in ALL datasets

def demote_coords(ds):
    """Return ds with 'longitude'/'latitude' reset to data variables when they are coordinates."""
    for coord in ['longitude', 'latitude']:
        if coord in ds.coords:
            ds = ds.reset_coords(coord)
    return ds

# Demote in existing cube
existing_cube = demote_coords(existing_cube)

# Labels already stored, so re-running this cell does not append duplicates.
if 'transect' in existing_cube.coords:
    existing_labels = set(existing_cube['transect'].values.tolist())
else:
    existing_labels = set()

new_datasets = []
for nc_file in new_files:
    transect_label = nc_file.stem
    if transect_label in existing_labels:
        print(f"⚠️ Skipping {transect_label}: already in cube.")
        continue

    with xr.open_dataset(nc_file) as opened:
        ds = opened.load()
    ds = ds.expand_dims(transect=[transect_label])
    new_datasets.append(ds)
    print(f"✅ Prepared: {transect_label}")

# Demote in new datasets
new_datasets_cleaned = [demote_coords(ds) for ds in new_datasets]

if new_datasets_cleaned:
    updated_cube = xr.concat(
        [existing_cube] + new_datasets_cleaned,
        dim="transect",
        compat="no_conflicts")

    # Write next to the cube and swap it in afterwards, so a failed write
    # cannot leave a truncated calvert_cube.nc behind.
    tmp_path = cube_path.with_name(cube_path.name + ".tmp")
    updated_cube.to_netcdf(tmp_path)
    tmp_path.replace(cube_path)
    print("✅ Saved updated cube to ~/Desktop/Summer 2025 Python/calvert_cube.nc")
else:
    updated_cube = existing_cube
    print("⚠️ No new transects to add; cube left unchanged.")

In [None]:
import xarray as xr
from pathlib import Path

input_dir = Path("~/Desktop/cleaned_transects").expanduser()
all_files = sorted(input_dir.glob("*.nc"))

datasets = []

for nc_file in all_files:
    try:
        ds = xr.open_dataset(nc_file)

        # Turn lon/lat into ordinary data variables when they are stored as coordinates
        coords_to_reset = [name for name in ('longitude', 'latitude') if name in ds.coords]
        if coords_to_reset:
            ds = ds.reset_coords(coords_to_reset)

        # Label the transect with the file name (without extension)
        transect_label = nc_file.stem
        datasets.append(ds.expand_dims(transect=[transect_label]))

        print(f"✅ Added: {transect_label}")

    except Exception as e:
        print(f"⚠️ Skipping {nc_file.name}: {e}")

# Stack every transect that opened cleanly along a new 'transect' dimension
if not datasets:
    print("⚠️ No valid datasets to combine.")
else:
    cube = xr.concat(datasets, dim='transect')
    cube.to_netcdf("~/Desktop/transect_cube.nc")
    print("✅ Saved cube to ~/Desktop/transect_cube.nc")

In [None]:
from pathlib import Path
import xarray as xr
import matplotlib.pyplot as plt

# Load bathymetry once (Path is used because this cell does not import os)
topo = xr.open_dataset(Path('~/Desktop/british_columbia_3_msl_2013.nc').expanduser())

input_dir = Path("~/Desktop/cleaned_transects").expanduser()

# Loop through all cleaned transect files
for nc_file in sorted(input_dir.glob("*.nc")):
    try:
        # The context manager closes each file once its figure has been
        # shown, so handles do not pile up over a long directory listing.
        with xr.open_dataset(nc_file) as ds:
            print(f"📈 Plotting: {nc_file.name}")

            plot_section(ds, topo)
            plt.show()
    except Exception as e:
        # Keep going: one unreadable or incomplete file should not stop the rest.
        print(f"⚠️ Failed to plot {nc_file.name}: {e}")

In [None]:
import xarray as xr
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import numpy as np
import os
import matplotlib.dates as mdates
import cmocean as cm
import waypoint_distance as wd
import pandas as pd
from pathlib import Path
def plot_oxygen_section(ds, topo, xlim=(77, 0), max_depth=420):
    """
    Plot oxygen concentration section for a given transect dataset.

    Parameters:
    - ds: xarray.Dataset for the transect. Reads 'along', 'depth',
      'oxygen_concentration', 'potential_density', 'longitude', 'latitude'
      and 'time'. 'along' is divided by 1000 before plotting and labelled in
      km; the 2D fields are handed to matplotlib as (depth, along) arrays.
    - topo: xarray.Dataset with bathymetry; 'Band1' is sampled by nearest
      neighbour at the transect positions and negated to draw the sea floor.
    - xlim: tuple of (max, min) along-track km for x-axis limits
    - max_depth: deepest depth shown on the y-axis, also the lower edge of the
      grey sea-floor fill (defaults to the previously hard-coded 420).

    Returns nothing; the figure is left on the current pyplot state. Note
    that plt.rcParams is updated globally.
    """

    # ─── Styling ─────────────────────────────────────────
    plt.rcParams.update({
        'font.size': 12,
        'axes.titlesize': 20,
        'axes.labelsize': 20,
        'xtick.labelsize': 15,
        'ytick.labelsize': 15,
        'legend.fontsize': 20,
        'figure.titlesize': 20})

    # ─── Extract variables ──────────────────────────────
    along_km = ds['along'].values / 1000
    depth = ds['depth'].values
    oxygen = ds['oxygen_concentration'].values
    pdens = ds['potential_density'].values - 1000
    lon = ds['longitude'].values
    lat = ds['latitude'].values
    time_top = ds['time'].values

    # ─── Interpolate Bathymetry ─────────────────────────
    # (The full-size depth/bathymetry mask that used to be built here was
    # never used, so it is no longer allocated.)
    interp_bathy = topo['Band1'].interp(
        lon=xr.DataArray(lon, dims='along'),
        lat=xr.DataArray(lat, dims='along'),
        method='nearest')
    ocean_floor = -interp_bathy.values

    # ─── Plotting ───────────────────────────────────────
    fig, ax = plt.subplots(figsize=(1.5 * 1.5 * 6.4, 1.5 * 4.8))

    # Fill below bathymetry down to max_depth
    ax.fill_between(along_km, ocean_floor, max_depth,
                    where=~np.isnan(ocean_floor),
                    facecolor='grey', zorder=1)

    # Oxygen colormap
    cf = ax.pcolormesh(along_km, depth, oxygen,
                       shading='auto', cmap='inferno',
                       vmin=0, vmax=200, zorder=2)

    # Bathymetry line
    ax.plot(along_km, ocean_floor, color='black', linewidth=2)

    # Isopycnals: thin background set plus three highlighted levels
    for levels, color, lw in [
        (np.linspace(24, 27, 7), 'black', 0.5),
        ([26.7], 'lime', 2),
        ([26.8], 'red', 2),
        ([26.9], 'blue', 2)]:
        iso = ax.contour(along_km, depth, pdens, levels=levels,
                         colors=color, linewidths=lw, linestyles='-')
        # Only the highlighted (thick) contours get inline labels.
        if lw > 0.5:
            ax.clabel(iso, fmt='%1.2f')

    # ─── Top Axis with Time ─────────────────────────────
    nticks = 8
    idx_ticks = np.linspace(0, len(along_km) - 1, nticks, dtype=int)
    tick_locs = along_km[idx_ticks]
    tick_times = time_top[idx_ticks]

    # Drop NaT entries so they are not formatted as labels
    valid_mask = ~pd.isna(tick_times)
    tick_locs = tick_locs[valid_mask]
    tick_times = tick_times[valid_mask]
    tick_labels = [pd.to_datetime(t).strftime('%b %d %H:%M') for t in tick_times]

    if len(tick_locs) > 0:
        ax_top = ax.secondary_xaxis('top')
        ax_top.set_xticks(tick_locs)
        ax_top.set_xticklabels(tick_labels, rotation=30, ha='center', fontsize=10)

    # ─── Labels & Limits ────────────────────────────────
    ax.set_xlabel('Along-Transect Distance (km)')
    ax.set_ylabel('Depth (m)')
    # Passing (max_depth, 0) already puts the surface at the top, so no
    # separate invert_yaxis() call is needed.
    ax.set_ylim(max_depth, 0)
    ax.set_xlim(xlim)

    # Title with timestamp (first valid sampled time), undated if none exist
    if len(tick_times) > 0:
        tstr = pd.to_datetime(tick_times[0]).strftime('%Y-%m-%d')
        ax.set_title(f'Oxygen Section ({tstr})')
    else:
        ax.set_title('Oxygen Section')

    # Colorbar
    plt.colorbar(cf, ax=ax, label='Oxygen (μmol/kg)')
    plt.tight_layout()

# Load bathymetry once
topo = xr.open_dataset(os.path.expanduser('~/Desktop/british_columbia_3_msl_2013.nc'))

input_dir = Path("~/Desktop/cleaned_transects").expanduser()

# Loop through all cleaned transect files
for nc_file in sorted(input_dir.glob("*.nc")):
    try:
        # Open each file in a context manager so it is closed as soon as its
        # figure has been shown instead of staying open for the whole loop.
        with xr.open_dataset(nc_file) as ds:
            print(f"📈 Plotting: {nc_file.name}")

            plot_oxygen_section(ds, topo)
            plt.show()
    except Exception as e:
        # Keep going: one unreadable or incomplete file should not stop the rest.
        print(f"⚠️ Failed to plot {nc_file.name}: {e}")

In [None]:
file_pathway = '~/Desktop/dfo-hal1002-20250506_grid_delayed.nc'

if True:

    def process_and_plot(file_pathway):
        """
        Clean and interpolate one mission file, then plot a temperature
        section for every leg that was produced.

        Prints a notice for each leg that is missing and returns the cleaned
        return leg (None when there is none).
        """
        topo = xr.open_dataset(os.path.expanduser('~/Desktop/british_columbia_3_msl_2013.nc'))
        legs = clean_and_interpolate(file_pathway, topo)
        notices = ("No outbound leg found.", "No return leg found.")

        # legs is (outbound, return); handle both the same way.
        for leg_ds, notice in zip(legs, notices):
            if leg_ds is None:
                print(notice)
            else:
                plot_section(leg_ds, topo)

        return legs[1]

    ds_return_cleaned = process_and_plot(file_pathway)

In [None]:
# # testing cube creation

# import xarray as xr
# import numpy as np
# import os
# import pandas as pd
# import waypoint_distance as wd

# def clean_mission(file_pathway):
#     ds = xr.open_dataset(file_pathway)

#     # Project onto along/across
#     waypoint_lon = np.array([-127.950, -128.115, -128.243, -128.514, -128.646, -128.798])
#     waypoint_lat = np.array([51.757, 51.705, 51.715, 51.450, 51.4165, 51.408])
#     central_lat = 51.715

#     alongx, acrossx, _ = wd.get_simple_distance(
#         shiplon=ds['longitude'].values,
#         shiplat=ds['latitude'].values,
#         wplon=waypoint_lon,
#         wplat=waypoint_lat,
#         central_lat=central_lat)

#     ds = ds.assign(along=('time', alongx), across=('time', acrossx))

#     peak_idx = int(np.argmax(ds['along'].values))
#     ds_out = ds.isel(time=slice(0, peak_idx + 1))
#     ds_return = ds.isel(time=slice(peak_idx + 1, None))

#     for leg, name in [(ds_out, "out"), (ds_return, "return")]:
#         if leg['time'].size < 2 or np.count_nonzero(~np.isnan(leg['along'])) < 10:
#             print(f"Skipping {name} leg: not enough points")
#             if name == "out": ds_out = None
#             else: ds_return = None
#             continue

#         prev_len = -1
#         while prev_len != len(leg['time']):
#             prev_len = len(leg['time'])
#             grad = np.gradient(leg['along'])
#             keep_mask = grad > 0 if name == "out" else grad < 0
#             leg = leg.sel(time=keep_mask)

#         if np.count_nonzero(~np.isnan(leg['along'])) < 100:
#             print(f"Dropping {name} leg: too few valid points")
#             if name == "out": ds_out = None
#             else: ds_return = None
#             continue

#         if name == "out": ds_out = interpolate(leg.set_coords('along'))
#         else: ds_return = interpolate(leg.set_coords('along'))

#     if ds_out is not None: ds_out = ds_out.assign_coords(mission_type='Outbound')
#     if ds_return is not None: ds_return = ds_return.assign_coords(mission_type='Return')

#     return ds_out, ds_return

# def interpolate(ds):
#     _, index_unique = np.unique(ds['along'], return_index=True)
#     ds = ds.isel(time=index_unique)

#     min_along = np.floor(ds['along'].min().item() / 50) * 50
#     max_along = np.ceil(ds['along'].max().item() / 50) * 50
#     along_grid = np.arange(min_along, max_along + 1, 50)

#     ds = ds.swap_dims({'time': 'along'})
#     ds_interp = ds.interp(along=along_grid)

#     interp_time = np.interp(
#         along_grid,
#         ds['along'].values,
#         ds['time'].values.astype('datetime64[ns]').astype('float64'),
#         left=np.nan,
#         right=np.nan
#     )
#     ds_interp['time'] = ('along', interp_time.astype('datetime64[ns]'))

#     return ds_interp

# def append_to_calvert_cube(ds_transect, cube_path='~/Desktop/calvert_cube.nc', source_file=None):
#     cube_path = os.path.expanduser(cube_path)

#     if 'transect' not in ds_transect.dims:
#         ds_transect = ds_transect.expand_dims(transect=[0])

#     if 'transect_time' not in ds_transect.coords:
#         valid_times = pd.to_datetime(ds_transect['time'].values, errors='coerce').dropna()
#         transect_time = np.datetime64(valid_times[0]) if len(valid_times) > 0 else np.datetime64('NaT')
#         ds_transect = ds_transect.assign_coords(transect_time=('transect', [transect_time]))

#     if source_file:
#         ds_transect.attrs['source_file'] = os.path.basename(source_file)

#     if not os.path.exists(cube_path):
#         ds_transect.to_netcdf(cube_path)
#         print(f"🟢 Created new cube: {cube_path}")
#         return

#     cube = xr.open_dataset(cube_path)
#     cube.load(); cube.close()

#     processed_files = cube.attrs.get('source_file', [])
#     if isinstance(processed_files, str):
#         processed_files = [processed_files]
#     if os.path.basename(source_file) in processed_files:
#         print(f"⚠️ Skipping {source_file}: already in cube.")
#         return

#     new_index = cube.sizes['transect']
#     ds_transect = ds_transect.assign_coords(transect=[new_index])
#     ds_combined = xr.concat([cube, ds_transect], dim='transect')
#     ds_combined = ds_combined.sortby('transect_time')

#     ds_combined.attrs['source_file'] = processed_files + [os.path.basename(source_file)]
#     ds_combined.to_netcdf(cube_path, mode='w')
#     print(f"✅ Appended to cube: {source_file}")

# def add_file_to_cube(filepath):
#     ds_out, ds_return = clean_mission(filepath)
#     if ds_out is not None:
#         append_to_calvert_cube(ds_out, source_file=filepath + "_out")
#     else:
#         print(f"Skipped outbound for {filepath}")
#     if ds_return is not None:
#         append_to_calvert_cube(ds_return, source_file=filepath + "_return")
#     else:
#         print(f"Skipped return for {filepath}")

In [None]:
# # Trying again
# import xarray as xr
# import numpy as np
# import os
# from pathlib import Path
# import waypoint_distance as wd

# def interpolate(ds, step=50):
#     _, index_unique = np.unique(ds['along'], return_index=True)
#     ds = ds.isel(time=index_unique)

#     min_along = np.floor(ds['along'].min().item() / step) * step
#     max_along = np.ceil(ds['along'].max().item() / step) * step
#     along_grid = np.arange(min_along, max_along + 1, step)

#     ds = ds.swap_dims({'time': 'along'})
#     ds_interp = ds.interp(along=along_grid)

#     interp_time = np.interp(
#         along_grid,
#         ds['along'].values,
#         ds['time'].values.astype('datetime64[ns]').astype('float64'),
#         left=np.nan,
#         right=np.nan
#     )
#     ds_interp['time'] = ('along', interp_time.astype('datetime64[ns]'))

#     return ds_interp

# def clean_and_interpolate(file_pathway):
#     ds = xr.open_dataset(file_pathway)

#     waypoint_lon = np.array([-127.950, -128.115, -128.243, -128.514, -128.646, -128.798])
#     waypoint_lat = np.array([51.757, 51.705, 51.715, 51.450, 51.4165, 51.408])
#     central_lat = 51.715

#     alongx, acrossx, _ = wd.get_simple_distance(
#         shiplon=ds['longitude'].values,
#         shiplat=ds['latitude'].values,
#         wplon=waypoint_lon,
#         wplat=waypoint_lat,
#         central_lat=central_lat)

#     ds = ds.assign(along=('time', alongx), across=('time', acrossx))

#     peak_idx = int(np.argmax(ds['along'].values))
#     ds_out = ds.isel(time=slice(0, peak_idx + 1))
#     ds_return = ds.isel(time=slice(peak_idx + 1, None))

#     results = {}
#     for leg, name in [(ds_out, "out"), (ds_return, "return")]:
#         if leg['time'].size < 2 or np.count_nonzero(~np.isnan(leg['along'])) < 10:
#             continue

#         prev_len = -1
#         while prev_len != len(leg['time']):
#             prev_len = len(leg['time'])
#             grad = np.gradient(leg['along'])
#             keep_mask = grad > 0 if name == "out" else grad < 0
#             leg = leg.sel(time=keep_mask)

#         if np.count_nonzero(~np.isnan(leg['along'])) < 100:
#             continue

#         results[name] = interpolate(leg.set_coords('along'))

#     return results

# def save_cleaned_transects(input_dir, output_dir):
#     input_dir = Path(input_dir).expanduser()
#     output_dir = Path(output_dir).expanduser()
#     output_dir.mkdir(parents=True, exist_ok=True)

#     for i, f in enumerate(sorted(input_dir.glob("*_grid_delayed.nc"))):
#         print(f"🧼 Cleaning & interpolating: {f.name}")
#         try:
#             results = clean_and_interpolate(str(f))

#             for leg in results:
#                 ds = results[leg]
#                 ds = ds.expand_dims('transect')
#                 ds = ds.assign_coords(transect=("transect", [i]))

#                 out_path = output_dir / f"{f.stem}_{leg}.nc"
#                 ds.to_netcdf(out_path)
#                 print(f"✅ Saved {leg}: {out_path.name}")

#         except Exception as e:
#             print(f"❌ Failed on {f.name}: {e}")
# Disabled batch run over the reprocessed Calvert Line files.
# NOTE(review): in this file save_cleaned_transects is defined only inside the
# commented-out code above, so switching this guard to True raises NameError
# unless that definition is restored (or already exists in the session).
if False:
    save_cleaned_transects(
        input_dir="~/CalvertLine_reprocessed",
        output_dir="~/Desktop/cleaned_transects")

In [None]:
# import xarray as xr
# import numpy as np
# import pandas as pd
# import os
# import re
# from pathlib import Path

# # === Directories ===
# input_dir = Path("~/Desktop/cleaned_transects").expanduser()
# output_dir = Path("~/Desktop/CalvertLine_cubes").expanduser()
# # output_dir.mkdir(parents=True, exist_ok=True)

# # === Extract year from filename ===
# def extract_year(filename):
#     match = re.search(r'(\d{8})', filename)
#     return match.group(1)[:4] if match else None

# # === Interpolation and cube appending logic (unchanged) ===
# def append_to_calvert_cube(ds_transect, cube_path, source_file=None):
#     cube_path = os.path.expanduser(cube_path)

#     if 'transect' not in ds_transect.dims:
#         ds_transect = ds_transect.expand_dims(transect=[0])

#     if 'transect_time' not in ds_transect.coords:
#         valid_times = pd.to_datetime(ds_transect['time'].values, errors='coerce').dropna()
#         transect_time = np.datetime64(valid_times[0]) if len(valid_times) > 0 else np.datetime64('NaT')
#         ds_transect = ds_transect.assign_coords(transect_time=('transect', [transect_time]))

#     if source_file:
#         ds_transect.attrs['source_file'] = os.path.basename(source_file)

#     if not os.path.exists(cube_path):
#         ds_transect.to_netcdf(cube_path)
#         print(f"🟢 Created new cube: {cube_path}")
#         return

#     cube = xr.open_dataset(cube_path)
#     cube.load(); cube.close()

#     processed_files = cube.attrs.get('source_file', [])
#     if isinstance(processed_files, str):
#         processed_files = [processed_files]
#     if os.path.basename(source_file) in processed_files:
#         print(f"⚠️ Skipping {source_file}: already in cube.")
#         return

#     new_index = cube.sizes['transect']
#     ds_transect = ds_transect.assign_coords(transect=[new_index])
#     ds_combined = xr.concat([cube, ds_transect], dim='transect')
#     ds_combined = ds_combined.sortby('transect_time')

#     ds_combined.attrs['source_file'] = processed_files + [os.path.basename(source_file)]
#     ds_combined.to_netcdf(cube_path, mode='w')
#     print(f"✅ Appended: {source_file}")

# if False:
#     # === Main loop: group by year and build each cube ===
#     all_nc_files = sorted(input_dir.glob("*.nc"))

#     files_by_year = {}
#     for f in all_nc_files:
#         year = extract_year(f.name)
#         if year:
#             files_by_year.setdefault(year, []).append(f)

#     for year, files in sorted(files_by_year.items()):
#         print(f"\n📅 Processing year {year} with {len(files)} files")
#         cube_path = str(output_dir / f"cube_{year}.nc")
#         for f in files:
#             try:
#                 ds = xr.open_dataset(f)
#                 append_to_calvert_cube(ds, cube_path=cube_path, source_file=str(f))
#             except Exception as e:
#                 print(f"❌ Failed to process {f.name}: {e}")

In [None]:
# cube_2025 = xr.open_dataset('/Users/martinwilliamson/Desktop/CalvertLine_cubes/cube_2025.nc')
# cube_2025

# cube_2024 = xr.open_dataset('/Users/martinwilliamson/Desktop/CalvertLine_cubes/cube_2024.nc')
# cube_2024

# transect_times = cube_2024['transect_time'].values
# transect_times

In [None]:
# cube_2024

In [None]:
# from collections import defaultdict
# import re

# urls = [
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-bb046/dfo-bb046-20210511/L0-gridfiles/dfo-bb046-20210511_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-bb046/dfo-bb046-20220507/L0-gridfiles/dfo-bb046-20220507_grid.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-bb046/dfo-bb046-20210413/L0-gridfiles/dfo-bb046-20210413_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-bb046/dfo-bb046-20200717/L0-gridfiles/dfo-bb046-20200717_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-bb046/dfo-bb046-20201006/L0-gridfiles/dfo-bb046-20201006_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-bb046/dfo-bb046-20210212/L0-gridfiles/dfo-bb046-20210212_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-bb046/dfo-bb046-20220707/L0-gridfiles/dfo-bb046-20220707_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-bb046/dfo-bb046-20201103/L0-gridfiles/dfo-bb046-20201103_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-bb046/dfo-bb046-20200810/L0-gridfiles/dfo-bb046-20200810_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-bb046/dfo-bb046-20210324/L0-gridfiles/dfo-bb046-20210324_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-bb046/dfo-bb046-20200908/L0-gridfiles/dfo-bb046-20200908_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-bb046/dfo-bb046-20220608/L0-gridfiles/dfo-bb046-20220608_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-k999/dfo-k999-20230320/L0-gridfiles/dfo-k999-20230320_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-k999/dfo-k999-20230811/L0-gridfiles/dfo-k999-20230811_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-k999/dfo-k999-20250114/L0-gridfiles/dfo-k999-20250114_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-k999/dfo-k999-20241023/L0-gridfiles/dfo-k999-20241023_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-k999/dfo-k999-20230915/L0-gridfiles/dfo-k999-20230915_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-k999/dfo-k999-20250317/L0-gridfiles/dfo-k999-20250317_grid.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-k999/dfo-k999-20241119/L0-gridfiles/dfo-k999-20241119_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-k999/dfo-k999-20230516/L0-gridfiles/dfo-k999-20230516_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-k999/dfo-k999-20230418/L0-gridfiles/dfo-k999-20230418_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-hal1002/dfo-hal1002-20220914/L0-gridfiles/dfo-hal1002-20220914_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-hal1002/dfo-hal1002-20220804/L0-gridfiles/dfo-hal1002-20220804_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-hal1002/dfo-hal1002-20240723/L0-gridfiles/dfo-hal1002-20240723_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-hal1002/dfo-hal1002-20250311/L0-gridfiles/dfo-hal1002-20250311_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-hal1002/dfo-hal1002-20240702/L0-gridfiles/dfo-hal1002-20240702_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-hal1002/dfo-hal1002-20240924/L0-gridfiles/dfo-hal1002-20240924_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-hal1002/dfo-hal1002-20250506/L0-gridfiles/dfo-hal1002-20250506_grid.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-eva035/dfo-eva035-20190612/L0-gridfiles/dfo-eva035-20190612_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-eva035/dfo-eva035-20230915/L0-gridfiles/dfo-eva035-20230915_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-eva035/dfo-eva035-20230811/L0-gridfiles/dfo-eva035-20230811_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-eva035/dfo-eva035-20230720/L0-gridfiles/dfo-eva035-20230720_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-eva035/dfo-eva035-20230620/L0-gridfiles/dfo-eva035-20230620_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-eva035/dfo-eva035-20230518/L0-gridfiles/dfo-eva035-20230518_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-eva035/dfo-eva035-20231019/L0-gridfiles/dfo-eva035-20231019_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-mike579/dfo-mike579-20190611/L0-gridfiles/dfo-mike579-20190611_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-mike579/dfo-mike579-20210704/L0-gridfiles/dfo-mike579-20210704_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-colin1142/dfo-colin1142-20240312/L0-gridfiles/dfo-colin1142-20240312_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-marvin1003/dfo-marvin1003-20240416/L0-gridfiles/dfo-marvin1003-20240416_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-marvin1003/dfo-marvin1003-20221129/L0-gridfiles/dfo-marvin1003-20221129_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-marvin1003/dfo-marvin1003-20240516/L0-gridfiles/dfo-marvin1003-20240516_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-marvin1003/dfo-marvin1003-20221018/L0-gridfiles/dfo-marvin1003-20221018_grid_delayed.nc",
#     "https://cproof.uvic.ca/gliderdata/deployments/./dfo-rosie713/dfo-rosie713-20190615/L0-gridfiles/dfo-rosie713-20190615_grid_delayed.nc"
# ]

# import re
# from collections import defaultdict

# by_year = defaultdict(list)
# for url in urls:
#     filename = url.split('/')[-1]
#     match = re.search(r'(\d{4})(\d{4})', filename)
#     if match:
#         year = match.group(1)
#         full_date = match.group(1) + match.group(2)
#         by_year[year].append(full_date)

# # Sort years and timestamps within each year
# sorted_by_year = {year: sorted(timestamps) for year, timestamps in sorted(by_year.items())}
# sorted_by_year

# Here I am attempting to figure out why some missions didn't get plotted:

In [None]:
import xarray as xr
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import numpy as np
import os
import matplotlib.dates as mdates
import cmocean as cm
import waypoint_distance as wd
import pandas as pd

def plot_all_sections(cube, xlim=77):
    """
    Plot one temperature section figure per transect in the data cube.

    Each figure shows temperature as a pcolormesh with a fixed 5.3-10 colour
    range, sigma-theta contours (potential density minus 1000), the
    bathymetry line with a grey mask below it, and a secondary top axis
    labelled with sample times.

    Parameters:
    - cube: xarray.Dataset with a 'transect' coordinate. Reads 'along',
      'depth', 'temperature', 'potential_density', 'longitude', 'latitude',
      'time' and 'transect_time' for each transect.
    - xlim: float, left-hand x-axis limit in km; the axis runs from xlim
      down to 0.

    Bathymetry is read from '~/Desktop/british_columbia_3_msl_2013.nc'
    (variable 'Band1'). Figures are created but not shown, saved or closed
    here.
    """

    plt.rcParams.update({
        'font.size': 12,
        'axes.titlesize': 20,
        'axes.labelsize': 20,
        'xtick.labelsize': 15,
        'ytick.labelsize': 15,
        'legend.fontsize': 20,
        'figure.titlesize': 20})

    # Settings shared by every figure, hoisted out of the transect loop.
    temp_bounds = (5.3, 10)
    isopycnal_styles = [
        (np.linspace(24, 27, 7), 'black', 0.5),
        ([26.6], 'white', 2),
        ([26.7], 'lime', 2),
        ([26.8], 'red', 2),
        ([26.9], 'blue', 2)]
    nticks = 8  # number of time ticks requested on the top axis

    topo_file = os.path.expanduser('~/Desktop/british_columbia_3_msl_2013.nc')

    # Context manager so the bathymetry file is closed once plotting is done.
    with xr.open_dataset(topo_file) as topo:
        for transect in cube.transect:
            ds = cube.sel(transect=transect)

            # Extract coordinates and data
            along = ds['along'].values
            depth = ds['depth'].values
            temperature = ds['temperature'].values
            pdens = ds['potential_density'].values - 1000  # Sigma-theta
            along_km = along / 1000

            # Nearest-neighbour bathymetry lookup at every along-track position
            full_lon = ds['longitude'].values
            full_lat = ds['latitude'].values
            interp_bathy = topo['Band1'].interp(
                lon=xr.DataArray(full_lon, dims='along'),
                lat=xr.DataArray(full_lat, dims='along'),
                method='nearest')
            bottom_depths = -interp_bathy.values  # sign flipped: positive down

            fig, ax = plt.subplots(figsize=(1.5 * 1.5 * 6.4, 1.5 * 4.8))

            # Boolean (depth, along) mask of grid cells below the sea floor
            depth_grid, along_grid = np.meshgrid(depth, along, indexing='ij')
            bathymetry_floor = np.tile(bottom_depths, (len(depth), 1))
            mask = depth_grid > bathymetry_floor

            # Grey background below the bottom, then temperature on top
            ax.contourf(along_km, depth, mask,
                        levels=[0.5, 1.5], colors='grey', zorder=1)
            cf = ax.pcolormesh(along_km, depth, temperature, shading='auto',
                               cmap=cm.cm.thermal,
                               vmin=temp_bounds[0], vmax=temp_bounds[1])

            # Bathymetry
            ax.plot(along_km, bottom_depths, color='black', linewidth=2)

            # Isopycnal contours. Every set is labelled: the previous
            # `lw != 0.3` guard could never be false for the widths used here.
            for levels, color, lw in isopycnal_styles:
                cf_iso = ax.contour(along_km, depth, pdens, levels=levels,
                                    colors=color, linewidths=lw,
                                    linestyles='-')
                ax.clabel(cf_iso, fmt='%1.2f')

            # Evenly spaced sample indices for the time ticks; NaT entries
            # are dropped so they do not produce labels.
            time_top = ds['time'].values  # shape (along,)
            idx_ticks = np.linspace(0, len(along_km) - 1, nticks, dtype=int)
            tick_locs = along_km[idx_ticks]
            tick_times = time_top[idx_ticks]
            valid_mask = ~pd.isna(tick_times)
            tick_locs = tick_locs[valid_mask]
            tick_times = tick_times[valid_mask]

            # Format with hour and date
            tick_labels = [pd.to_datetime(t).strftime('%b %d %H:%M')
                           for t in tick_times]

            # Add secondary x-axis with formatted time ticks
            ax_top = ax.secondary_xaxis('top')
            ax_top.set_xticks(tick_locs)
            ax_top.set_xticklabels(tick_labels, rotation=30, ha='center',
                                   fontsize=10)

            # Labels and formatting
            ax.set_xlabel('Along-Transect Distance (km)')
            ax.set_ylabel('Depth (m)')
            ax.invert_yaxis()
            ax.set_ylim(410, 0)
            ax.set_xlim(xlim, 0)
            tstr = str(ds['transect_time'].values.astype('datetime64[D]'))
            ax.set_title(f'Temperature Section ({tstr})')
            plt.colorbar(cf, ax=ax, label='Temperature (°C)')
            plt.tight_layout()

In [None]:
# NOTE(review): placeholder — the path is an empty string; supply the path to
# a cube file before running this cell.
cube = xr.open_dataset('')

In [None]:
# TRYING AGAIN
import xarray as xr
import numpy as np
import os
from pathlib import Path
import waypoint_distance as wd

def interpolate(ds, step=50):
    """
    Re-grid a leg onto a regular 'along' axis with spacing `step`.

    Samples with a repeated 'along' value are dropped (first occurrence
    kept), which also orders the samples by increasing 'along'. The grid
    runs from the minimum 'along' rounded down to a multiple of `step` up to
    the maximum rounded up. 'time' is interpolated separately on its float64
    nanosecond representation; grid points outside the sampled range get NaN
    there before the cast back to datetime64[ns].

    Parameters:
    - ds: xarray.Dataset indexed by 'time' that carries an 'along' variable.
    - step: grid spacing, in the same units as 'along'.

    Returns:
    - xarray.Dataset indexed by the regular 'along' grid.
    """
    first_seen = np.unique(ds['along'], return_index=True)[1]
    deduped = ds.isel(time=first_seen)

    # Snap the grid limits outward to whole multiples of `step`.
    along_lo = deduped['along'].min().item()
    along_hi = deduped['along'].max().item()
    grid_start = np.floor(along_lo / step) * step
    grid_stop = np.ceil(along_hi / step) * step
    along_grid = np.arange(grid_start, grid_stop + 1, step)

    by_along = deduped.swap_dims({'time': 'along'})
    ds_interp = by_along.interp(along=along_grid)

    # Time is interpolated by hand on its numeric form.
    time_as_float = (
        by_along['time'].values.astype('datetime64[ns]').astype('float64'))
    gridded_time = np.interp(
        along_grid,
        by_along['along'].values,
        time_as_float,
        left=np.nan,
        right=np.nan)
    ds_interp['time'] = ('along', gridded_time.astype('datetime64[ns]'))

    return ds_interp

# def clean_and_interpolate(file_pathway):
#     ds = xr.open_dataset('~/CalvertLine_reprocessed/dfo-colin1142-20240312_grid_delayed.nc')

#     waypoint_lon = np.array([-127.950, -128.115, -128.243, -128.514, -128.646, -128.798])
#     waypoint_lat = np.array([51.757, 51.705, 51.715, 51.450, 51.4165, 51.408])
#     central_lat = 51.715

#     alongx, acrossx, _ = wd.get_simple_distance(
#         shiplon=ds['longitude'].values,
#         shiplat=ds['latitude'].values,
#         wplon=waypoint_lon,
#         wplat=waypoint_lat,
#         central_lat=central_lat)

#     ds = ds.assign(along=('time', alongx), across=('time', acrossx))

#     peak_idx = int(np.argmax(ds['along'].values))
#     ds_out = ds.isel(time=slice(0, peak_idx + 1))
#     ds_return = ds.isel(time=slice(peak_idx + 1, None))

#     grad = np.gradient(ds_out['along'].values)
#     grad
#     results = {}
#     for leg, name in [(ds_out, "out"), (ds_return, "return")]:
#         if leg['time'].size < 2 or np.count_nonzero(~np.isnan(leg['along'])) < 10:
#             continue

#         prev_len = -1
#         while prev_len != len(leg['time']):
#             # if len(leg['along']) < 3:
#             #     break  # not enough for gradient
#             prev_len = len(leg['time'])
#             grad = np.gradient(leg['along'])
#             keep_mask = grad > 0 if name == "out" else grad < 0
#             leg = leg.sel(time=keep_mask)

#         if len(leg['along']) < 3 or np.count_nonzero(~np.isnan(leg['along'])) < 10:
#             continue

#         results[name] = interpolate(leg.set_coords('along'))

#         return results
    
def clean_and_interpolate(file_pathway):
    """
    Split one glider grid file into outbound/return legs and grid each leg.

    Along- and across-track distances are computed with
    `wd.get_simple_distance` against the fixed waypoint line below. The
    record is split at the sample with the largest 'along' value: "out" is
    everything up to and including that sample, "return" is everything after
    it. Each leg is then filtered repeatedly, keeping only samples whose
    'along' gradient is positive ("out") or negative ("return"), until the
    leg stops shrinking. Legs with fewer than 100 non-NaN 'along' values
    after filtering are dropped.

    Parameters:
    - file_pathway: path of the NetCDF grid file to open.

    Returns:
    - dict mapping "out" and/or "return" to the dataset returned by
      `interpolate`; a leg that is too short is simply absent.
    """
    ds = xr.open_dataset(file_pathway)

    waypoint_lon = np.array([-127.950, -128.115, -128.243, -128.514, -128.646, -128.798])
    waypoint_lat = np.array([51.757, 51.705, 51.715, 51.450, 51.4165, 51.408])
    central_lat = 51.715

    alongx, acrossx, _ = wd.get_simple_distance(
        shiplon=ds['longitude'].values,
        shiplat=ds['latitude'].values,
        wplon=waypoint_lon,
        wplat=waypoint_lat,
        central_lat=central_lat)

    ds = ds.assign(along=('time', alongx), across=('time', acrossx))

    # Turn-around point: the sample with the largest along value.
    peak_idx = int(np.argmax(ds['along'].values))
    ds_out = ds.isel(time=slice(0, peak_idx + 1))
    ds_return = ds.isel(time=slice(peak_idx + 1, None))

    results = {}
    for leg, name in [(ds_out, "out"), (ds_return, "return")]:
        if leg['time'].size < 2 or np.count_nonzero(~np.isnan(leg['along'])) < 10:
            continue

        # Filter until a pass removes nothing.
        prev_len = -1
        while prev_len != len(leg['time']):
            prev_len = len(leg['time'])
            if prev_len < 2:
                # np.gradient needs at least two samples and would raise
                # here, aborting the other leg as well. Stop filtering; the
                # size check below discards this leg.
                break
            grad = np.gradient(leg['along'].values)
            keep_mask = grad > 0 if name == "out" else grad < 0
            leg = leg.sel(time=keep_mask)

        if np.count_nonzero(~np.isnan(leg['along'])) < 100:
            continue

        results[name] = interpolate(leg.set_coords('along'))

    return results

# def save_cleaned_transects(input_dir, output_dir):
#     input_dir = Path(input_dir).expanduser()
#     output_dir = Path(output_dir).expanduser()
#     output_dir.mkdir(parents=True, exist_ok=True)

#     for i, f in enumerate(sorted(input_dir.glob("*_grid_delayed.nc"))):
#         print(f"🧼 Cleaning & interpolating: {f.name}")
#         try:
#             results = clean_and_interpolate(str(f))

#             for leg in results:
#                 ds = results[leg]
#                 ds = ds.expand_dims('transect')
#                 ds = ds.assign_coords(transect=("transect", [i]))

#                 out_path = output_dir / f"{f.stem}_{leg}.nc"
#                 ds.to_netcdf(out_path)
#                 print(f"✅ Saved {leg}: {out_path.name}")

#         except Exception as e:
#             print(f"❌ Failed on {f.name}: {e}")
# if False:
#     save_cleaned_transects(
#         input_dir="~/CalvertLine_reprocessed",
#         output_dir="~/Desktop/cleaned_transects")

In [None]:
# from pathlib import Path
# import xarray as xr

# # ⬇️ Step 1: Point to files you want to process
# files_to_run = [
#     "~/CalvertLine_reprocessed/dfo-eva035-20231019_grid_delayed.nc",
#     "~/CalvertLine_reprocessed/dfo-hal1002-20240924_grid_delayed.nc",
#     "~/CalvertLine_reprocessed/dfo-marvin1003-20221129_grid_delayed.nc",
#     "~/CalvertLine_reprocessed/dfo-marvin1003-20240516_grid_delayed.nc"
# ]

# # ⬇️ Step 2: Destination for cleaned outbound files
# output_dir = Path("~/Desktop/cleaned_transects").expanduser()
# output_dir.mkdir(parents=True, exist_ok=True)

# # ⬇️ Step 3: Loop through files and run your function
# for i, file_path in enumerate(files_to_run):
#     file_path = Path(file_path).expanduser()
#     print(f"🧼 Cleaning outbound: {file_path.name}")
    
#     try:
#         results = clean_and_interpolate(str(file_path))

#         if "out" in results:
#             ds = results["out"]
#             ds = ds.expand_dims("transect")
#             ds = ds.assign_coords(transect=("transect", [i]))

#             out_path = output_dir / f"{file_path.stem}_out.nc"
#             ds.to_netcdf(out_path)
#             print(f"✅ Saved outbound: {out_path.name}")
#         else:
#             print("⚠️ No outbound leg found.")

#     except Exception as e:
#         print(f"❌ Failed on {file_path.name}: {e}")

In [None]:
def plot_multiple_missions_map(glider_files, long_bounds=None, lat_bounds=None,
                               topo_file=os.path.expanduser('~/Desktop/british_columbia_3_msl_2013.nc')):
    """
    Plot multiple glider paths over bathymetry for the Calvert Line.

    Every track is drawn as a scatter coloured by time on one colour scale
    spanning all missions, on top of filled bathymetry contours.

    Parameters:
    - glider_files: list of str, paths to NetCDF glider grid files ('~' is
      expanded). Must contain at least one file.
    - long_bounds, lat_bounds: optional (min, max) map bounds; any
      two-element sequence works. When omitted, the data extent padded by
      0.5 degrees is used.
    - topo_file: str, path to the topo NetCDF file (variable 'Band1' on
      'lon'/'lat' coordinates).

    Raises:
    - ValueError: if glider_files is empty.
    """

    # Read what is needed from each file and close it straight away, so no
    # file handles stay open after the call.
    tracks = []
    for path in glider_files:
        with xr.open_dataset(os.path.expanduser(path)) as ds:
            tracks.append((
                os.path.basename(path).split('_')[0],  # legend label
                ds['longitude'].values,
                ds['latitude'].values,
                ds['time'].values))
    if not tracks:
        raise ValueError("glider_files must contain at least one file")

    all_lons = np.concatenate([lons for _, lons, _, _ in tracks])
    all_lats = np.concatenate([lats for _, _, lats, _ in tracks])

    # Auto bounding box if not provided
    if long_bounds is None:
        long_bounds = [all_lons.min() - 0.5, all_lons.max() + 0.5]
    if lat_bounds is None:
        lat_bounds = [all_lats.min() - 0.5, all_lats.max() + 0.5]

    # Load the topo subset into memory, then close the file.
    with xr.open_dataset(topo_file) as topo_full:
        topo = topo_full.sel(
            lon=slice(long_bounds[0], long_bounds[1]),
            lat=slice(lat_bounds[0], lat_bounds[1])
        )
        topo_var = (-topo['Band1']).load()  # sign flipped: positive down
    topo_lon = topo_var['lon']
    topo_lat = topo_var['lat']

    # Set up plot
    fig, ax = plt.subplots(figsize=(12, 9), subplot_kw={'projection': ccrs.PlateCarree()})
    # Unpack instead of `long_bounds + lat_bounds`: `+` only concatenates
    # when both are the same sequence type (list+tuple raises, and numpy
    # arrays would be added element-wise).
    ax.set_extent([*long_bounds, *lat_bounds], crs=ccrs.PlateCarree())

    # Gridlines with lat/lon ticks only
    gl = ax.gridlines(draw_labels=True, linestyle='--', alpha=0)
    gl.top_labels = False
    gl.right_labels = False

    # Bathymetry shading
    levels = np.linspace(0, 410, 51)
    contourf = ax.contourf(topo_lon, topo_lat, topo_var,
                           levels=levels, cmap=cm.cm.deep, extend='both')
    fig.colorbar(contourf, ax=ax, label='Depth (m)')

    # 0 m contour (coastline)
    ax.contour(topo_lon, topo_lat, topo_var, levels=[0.5], colors='black', linewidths=1)

    # Time scaling for consistent colorbar
    all_time_nums = mdates.date2num(
        np.concatenate([times for _, _, _, times in tracks]))
    vmin = all_time_nums.min()
    vmax = all_time_nums.max()

    for label, lons, lats, time_vals in tracks:
        sc = ax.scatter(lons, lats, c=mdates.date2num(time_vals), cmap='seismic',
                        vmin=vmin, vmax=vmax, s=5, transform=ccrs.PlateCarree(),
                        zorder=5, label=label)

    # Colorbar: all scatters share vmin/vmax, so the last one stands for all.
    cbar = plt.colorbar(sc, ax=ax, orientation='vertical', pad=0.01, extend='both')
    cbar.set_label('Date')
    cbar.ax.yaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))

    # Waypoint track overlay (optional); only used by the disabled line below.
    waypoint_lon = [-127.950, -128.115, -128.243, -128.514, -128.646, -128.798]
    waypoint_lat = [51.757, 51.705, 51.715, 51.450, 51.4165, 51.408]
    # ax.plot(waypoint_lon, waypoint_lat, color='black', linestyle='-', linewidth=2, label='Transect')

    ax.legend(title='Glider Missions')
    ax.set_title('Glider Missions Map')
    ax.set_aspect(1 / np.cos(np.deg2rad(np.mean(lat_bounds))))

# Map a single mission with explicit longitude/latitude bounds.
plot_multiple_missions_map(
    ['~/CalvertLine_reprocessed/dfo-colin1142-20240312_grid_delayed.nc'],
    long_bounds=(-128.3, -128),
    lat_bounds=(51.6, 51.8),
)

In [None]:
from pathlib import Path
import xarray as xr
import numpy as np
import pandas as pd

def clean_and_interpolate(file_pathway):
    """
    Build gridded outbound/return legs from one glider grid file.

    The file is opened, along/across-track distances are attached using
    `wd.get_simple_distance` against the fixed waypoint line, and the record
    is cut at the sample with the largest 'along' value ("out" includes that
    sample, "return" starts after it). Each leg is filtered repeatedly so
    only samples with a positive ("out") or negative ("return") 'along'
    gradient remain, then passed to `interpolate`.

    A leg is left out of the result when it is too short to begin with, has
    fewer than 100 non-NaN 'along' values after filtering, or raises while
    being processed (a message is printed in that case).

    Parameters:
    - file_pathway: path of the NetCDF grid file to open.

    Returns:
    - dict with zero, one or both of the keys "out" and "return".
    """
    ds = xr.open_dataset(file_pathway)

    waypoint_lon = np.array([-127.950, -128.115, -128.243, -128.514, -128.646, -128.798])
    waypoint_lat = np.array([51.757, 51.705, 51.715, 51.450, 51.4165, 51.408])
    central_lat = 51.715

    alongx, acrossx, _ = wd.get_simple_distance(
        shiplon=ds['longitude'].values,
        shiplat=ds['latitude'].values,
        wplon=waypoint_lon,
        wplat=waypoint_lat,
        central_lat=central_lat)

    ds = ds.assign(along=('time', alongx), across=('time', acrossx))

    # Cut the record at the turn-around point (largest along value).
    turn_idx = int(np.argmax(ds['along'].values))
    legs = {
        "out": ds.isel(time=slice(0, turn_idx + 1)),
        "return": ds.isel(time=slice(turn_idx + 1, None)),
    }

    results = {}

    for name, leg in legs.items():
        outbound = name == "out"
        try:
            n_valid = np.count_nonzero(~np.isnan(leg['along']))
            if leg['time'].size < 2 or n_valid < 10:
                continue

            # Repeat the gradient filter until a pass removes nothing.
            n_prev = -1
            while True:
                if n_prev == len(leg['time']):
                    break
                if len(leg['along']) < 3:
                    raise ValueError("Too short for gradient")
                n_prev = len(leg['time'])
                slope = np.gradient(leg['along'].values)
                if outbound:
                    right_direction = slope > 0
                else:
                    right_direction = slope < 0
                leg = leg.sel(time=right_direction)

            if np.count_nonzero(~np.isnan(leg['along'])) < 100:
                continue

            results[name] = interpolate(leg.set_coords('along'))

        except Exception as e:
            # Best effort: report the problem and carry on with the other leg.
            print(f"⚠️ Skipping {name} leg in {file_pathway}: {e}")
            continue

    return results
# Batch run: clean and grid every delayed-mode grid file, writing one NetCDF
# per leg into the cleaned-transects folder.
input_dir = Path("~/CalvertLine_reprocessed").expanduser()
output_dir = Path("~/Desktop/cleaned_transects").expanduser()
output_dir.mkdir(parents=True, exist_ok=True)

all_files = sorted(input_dir.glob("*_grid_delayed.nc"))

# Output names are built from the leg's date and direction only, so two input
# files can map to the same name. Track what this run has written so an
# overwrite is reported instead of happening silently.
written_paths = set()

for file_path in all_files:
    print(f"🧼 Cleaning & interpolating: {file_path.name}")

    try:
        results = clean_and_interpolate(str(file_path))

        for leg in ("out", "return"):
            if leg not in results:
                print(f"⚠️ No {leg} leg found in {file_path.name}")
                continue

            ds = results[leg]
            ds = ds.expand_dims("transect")

            # Date string from the first non-NaT time on the along grid
            valid_times = ds['time'].values[~np.isnan(ds['time'].values)]
            if len(valid_times) == 0:
                print(f"⚠️ Skipping {file_path.name} — no valid times in {leg} leg.")
                continue
            timestamp = pd.to_datetime(valid_times[0]).strftime('%Y%m%d')

            # Assign coordinate and build filename
            ds = ds.assign_coords(transect=("transect", [timestamp]))
            out_path = output_dir / f"{timestamp}_{leg}.nc"

            if out_path in written_paths:
                print(f"⚠️ Overwriting {out_path.name}: already written earlier in this run "
                      f"(now from {file_path.name}).")
            written_paths.add(out_path)

            ds.to_netcdf(out_path)
            print(f"✅ Saved {leg}: {out_path.name}")

    except Exception as e:
        # Best effort: report the failure and move on to the next file.
        print(f"❌ Failed on {file_path.name}: {e}")

In [None]:
def quick_plot(ds, var='temperature'):
    """
    Show one variable of a dataset on a new 10x4 figure.

    The title is the variable name followed by the dataset's 'title'
    attribute (empty when the attribute is missing).
    """
    title_attr = ds.attrs.get('title', '')
    figure, axis = plt.subplots(figsize=(10, 4))
    ds[var].plot(ax=axis)
    axis.set_title(f"{var} — {title_attr}")
    plt.show()

In [None]:
def clean_bad_data(ds, file_id):
    """
    Apply file-specific fixes for known bad data.

    - '20200717': keep only values where salinity is above 26, dropping
      coordinates that end up entirely masked.
    - '20200810': drop the samples at 2020-08-10 04:33:00 and 04:34:00.
    - any other file_id: return ds unchanged.
    """
    if file_id == '20200717':
        return ds.where(ds.salinity > 26, drop=True)

    if file_id == '20200810':
        bad_times = [
            np.datetime64("2020-08-10T04:33:00"),
            np.datetime64("2020-08-10T04:34:00"),
        ]
        is_bad = ds.time.isin(bad_times)
        return ds.sel(time=~is_bad)

    return ds
# NOTE(review): `file_path` and `ds` are not defined in this cell; they are
# whatever earlier cells left behind — confirm both refer to the same file
# before running.
file_id = file_path.name.split('-')[-1].split('_')[0]  # e.g., '20200810'
ds = clean_bad_data(ds, file_id)

# August 6th fixing some deep casts and changing masking logic slightly #

In [None]:
import xarray as xr
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import numpy as np
import os
import matplotlib.dates as mdates
import cmocean as cm
import waypoint_distance as wd
import pandas as pd
from pathlib import Path

# def plot_section(ds, topo, xlim=(77,0)):
#     """
#     Plot temperature section for a given file

#     Parameters:
#     - ds: xarray.Dataset, a .nc file for each transect
#     - xlim: float, maximum distance (km) along transect to plot
#     """

#     # ─── Styling ─────────────────────────────────────────
#     plt.rcParams.update({
#         'font.size': 12,
#         'axes.titlesize': 20,
#         'axes.labelsize': 20,
#         'xtick.labelsize': 15,
#         'ytick.labelsize': 15,
#         'legend.fontsize': 20,
#         'figure.titlesize': 20})

#     # ─── Extract variables ──────────────────────────────
#     along = ds['along'].values
#     depth = ds['depth'].values
#     temperature = ds['temperature'].values
#     pdens = ds['potential_density'].values - 1000  # sigma-theta
#     lon = ds['longitude'].values
#     lat = ds['latitude'].values
#     time_top = ds['time'].values  # shape (along,)

#     # ─── Interpolate Bathymetry ─────────────────────────
#     interp_bathy = topo['Band1'].interp(
#         lon=xr.DataArray(lon, dims='along'),
#         lat=xr.DataArray(lat, dims='along'),
#         method='nearest')
#     ocean_floor = -interp_bathy.values
#     depth_grid, along_grid = np.meshgrid(depth, along, indexing='ij')  # shape: (depth, along)

#     # Expand bottom_depths to match depth grid shape
#     bathymetry_floor = np.tile(ocean_floor, (len(depth), 1))

#     # Create mask where depth > bottom
#     mask = depth_grid > bathymetry_floor

#     # ------- Plotting ------- #
#     fig, ax = plt.subplots(figsize=(1.5 * 1.5 * 6.4, 1.5 * 4.8))
    
#     # Fill below bathymetry to 420 m
#     ax.fill_between(along / 1000, ocean_floor, 420,
#                     where=~np.isnan(ocean_floor),
#                     facecolor='grey', zorder=1)
    
#     # Plot temperature
#     cf = ax.pcolormesh(along / 1000, depth, temperature, 
#                     shading='auto', cmap=cm.cm.thermal, 
#                     vmin=5.3, vmax=10, zorder=2)

#     # Plot bathymetry
#     ax.plot(along / 1000, ocean_floor, color='black', linewidth=2)

#     # Isopycnals
#     for levels, color, lw in [
#         (np.linspace(24, 27, 7), 'black', 0.5),
#         # ([26.6], 'white', 2),
#         ([26.7], 'lime', 2),
#         ([26.8], 'red', 2),
#         ([26.9], 'blue', 2)]:
#         cf_iso = ax.contour(along / 1000, depth, pdens, levels=levels,
#                             colors=color, linewidths=lw, linestyles='-')
#         if lw != 0.3:
#             ax.clabel(cf_iso, fmt='%1.2f')

#     # ─── Top Axis with Time Labels ──────────────────────
#     along_km = along / 1000
#     nticks = 8
#     idx_ticks = np.linspace(0, len(along_km) - 1, nticks, dtype=int)
#     tick_locs = along_km[idx_ticks]
#     tick_times = time_top[idx_ticks]

#     # Remove NaT
#     valid_mask = ~pd.isna(tick_times)
#     tick_locs = tick_locs[valid_mask]
#     tick_times = tick_times[valid_mask]
#     tick_labels = [pd.to_datetime(t).strftime('%b %d %H:%M') for t in tick_times]

#     ax_top = ax.secondary_xaxis('top')
#     ax_top.set_xticks(tick_locs)
#     ax_top.set_xticklabels(tick_labels, rotation=30, ha='center', fontsize=10)

#     # ─── Labels and Limits ──────────────────────────────
#     ax.set_xlabel('Along-Transect Distance (km)')
#     ax.set_ylabel('Depth (m)')
#     ax.invert_yaxis()
#     ax.set_ylim(420, 0)
#     ax.set_xlim(xlim)

#     earliest_time = pd.to_datetime(min(tick_times))
#     tstr = earliest_time.strftime('%Y-%m-%d')
#     ax.set_title(f'Temperature Section ({tstr})')

#     cf = ax.pcolormesh(along / 1000, depth, temperature, 
#                     shading='auto', cmap=cm.cm.thermal, 
#                    vmin=5.3, vmax=10, zorder=2)

#     plt.colorbar(cf, ax=ax, label='Temperature (°C)')
#     plt.tight_layout()

def interpolate(ds, step=50, extrapolate=True):
    """
    Re-grid a leg onto a regular 'along' axis, trim depths, and fill gaps.

    Steps:
    1. Drop samples with a repeated 'along' value (first occurrence kept)
       and interpolate every variable onto a grid with spacing `step`.
    2. Interpolate 'time' separately on its float64 nanosecond form; grid
       points outside the sampled range get NaN before the cast back.
    3. Cut the depth axis at the deepest level that has any valid
       temperature between along = 0 and along = 20000, and keep only depths
       on the 0.5, 1.5, 2.5, ... sequence.
    4. Store, per along position, the deepest depth with a valid
       temperature as 'valid_temp_depth' (NaN where a column has none).
    5. Fill NaNs in every (depth, along) variable along each depth row with
       nearest-neighbour interpolation.

    Parameters:
    - ds: xarray.Dataset indexed by 'time' with 'along', 'depth' and
      'temperature'. NOTE(review): step 3 reduces over axis 1, which assumes
      'temperature' is laid out as (depth, along) — confirm.
    - step: spacing of the along grid, in the units of 'along'.
    - extrapolate: when True (default, the behaviour this function always
      had), the gap filling also extends past the first/last valid sample
      of a row. When False, only the nearest-neighbour fill without
      `fill_value='extrapolate'` is applied, so leading/trailing NaNs stay.
      This flag used to be accepted but ignored.

    Returns:
    - xarray.Dataset on the regular along grid. Filled variables are rebuilt
      with only 'depth' and 'along' coordinates.
    """
    # Remove duplicate along values (np.unique also returns them sorted)
    _, index_unique = np.unique(ds['along'], return_index=True)
    ds = ds.isel(time=index_unique)

    # Build regular along grid
    min_along = np.floor(ds['along'].min().item() / step) * step
    max_along = np.ceil(ds['along'].max().item() / step) * step
    along_grid = np.arange(min_along, max_along + 1, step)

    # Swap time with along for interpolation
    ds = ds.swap_dims({'time': 'along'})
    ds_interp = ds.interp(along=along_grid)

    # Interpolate time manually (since it's not numeric by default)
    interp_time = np.interp(
        along_grid,
        ds['along'].values,
        ds['time'].values.astype('datetime64[ns]').astype('float64'),
        left=np.nan,
        right=np.nan
    )
    ds_interp['time'] = ('along', interp_time.astype('datetime64[ns]'))

    # Get along mask for the 0–20000 region
    mask_along = (ds_interp['along'] >= 0) & (ds_interp['along'] <= 20000)

    # Subset temperature in that range
    temp_sub = ds_interp['temperature'].where(mask_along, drop=True)

    # Find the deepest depth where at least one valid (non-NaN) temperature exists
    valid_depths = ds_interp['depth'][~np.all(np.isnan(temp_sub), axis=1)]
    max_valid_depth = valid_depths.max().item()

    # Limit to valid depth range
    ds_interp = ds_interp.sel(depth=ds_interp['depth'] <= max_valid_depth)
    # Drop any depths that aren't exactly on a 1.0 m spacing starting from 0.5
    depths = ds_interp['depth'].values
    uniform_depths = np.arange(0.5, np.max(depths) + 1, 1.0)
    ds_interp = ds_interp.sel(depth=np.isin(ds_interp['depth'], uniform_depths))

    # ─── Vectorized valid_temp_depth assignment ─────
    temp = ds_interp['temperature'].values  # (depth, along)
    depth_vals = ds_interp['depth'].values
    valid_mask = ~np.isnan(temp)
    # argmax on the flipped mask gives, per column, how far up from the
    # bottom row the first valid sample sits.
    reversed_mask = valid_mask[::-1, :]
    first_valid_idx_from_bottom = reversed_mask.argmax(axis=0)
    has_valid_data = valid_mask.any(axis=0)
    deepest_valid = np.where(has_valid_data,
                             depth_vals[-1 - first_valid_idx_from_bottom],
                             np.nan)
    ds_interp['valid_temp_depth'] = ('along', deepest_valid)

    # ─── Fill NaNs along each depth row using nearest ─────
    fill_kwargs = {'fill_value': 'extrapolate'} if extrapolate else {}
    depths = ds_interp['depth'].values
    along_vals = ds_interp['along'].values

    # Snapshot the names: variables are reassigned while looping.
    for var in list(ds_interp.data_vars):
        da = ds_interp[var]
        if 'along' not in da.dims or 'depth' not in da.dims:
            continue

        filled_rows = [
            da.isel(depth=i).interpolate_na(
                dim='along',
                method='nearest',
                **fill_kwargs).values
            for i in range(len(depths))]

        # Rebuild variable
        ds_interp[var] = xr.DataArray(
            data=np.array(filled_rows),
            dims=('depth', 'along'),
            coords={'depth': depths, 'along': along_vals})

    return ds_interp

def clean_and_interpolate(file_pathway, topo, *, deep_cast_min_depth=300,
                          deep_cast_max_along=17000, step=50):
    """
    Split one transect file into outbound/return legs, clean each leg,
    then interpolate and mask it.

    Parameters:
    - file_pathway: str or path-like, gridded .nc file for a single transect
    - topo: xarray.Dataset, bathymetry (handed to mask_dataset)
    - deep_cast_min_depth: float, a cast counts as "deep" when its deepest
      non-NaN temperature sample lies below this depth
    - deep_cast_max_along: float, deep casts are only protected from the
      gradient filter when their `along` value is smaller than this
      (same units as `along`)
    - step: passed to interpolate() as `step`

    Returns:
    - (ds_out_cleaned, ds_return_cleaned): the interpolated and masked legs;
      an entry is None when that leg was skipped or failed
    """

    # Read everything into memory and close the file straight away, so that
    # looping over many transects does not pile up open file handles.
    with xr.open_dataset(file_pathway) as opened:
        ds = opened.load()

    waypoint_lon = np.array([-127.932, -128.013, -128.086415 , -128.154, -128.243, -128.345, -128.514, -128.646, -128.798])
    waypoint_lat = np.array([51.775, 51.7415, 51.71175, 51.70317, 51.715, 51.70172, 51.450, 51.4165, 51.408])
    central_lat = 51.715

    alongx, acrossx, _ = wd.get_simple_distance(
        shiplon=ds['longitude'].values,
        shiplat=ds['latitude'].values,
        wplon=waypoint_lon,
        wplat=waypoint_lat,
        central_lat=central_lat)

    ds = ds.assign(along=('time', alongx), across=('time', acrossx))

    # Nothing to split when there is no usable along-track distance at all.
    along_all = ds['along'].values
    if along_all.size == 0 or np.isnan(along_all).all():
        return None, None

    # The turnaround point is the largest along value. nanargmax is required:
    # plain argmax returns the index of the first NaN when NaNs are present,
    # which would put the out/return split in the wrong place.
    peak_idx = int(np.nanargmax(along_all))
    ds_out = ds.isel(time=slice(0, peak_idx + 1))
    ds_return = ds.isel(time=slice(peak_idx + 1, None))

    results = {}

    for leg, name in [(ds_out, "out"), (ds_return, "return")]:
        try:
            if leg['time'].size < 2 or np.count_nonzero(~np.isnan(leg['along'])) < 10:
                continue

            # ─── Preserve deep casts only if along < deep_cast_max_along ───
            depths = leg['depth'].values.reshape(-1, 1)
            temps = leg['temperature'].values
            along = leg['along'].values

            # Deepest depth with a non-NaN temperature, per cast
            deep_cast_mask = (
                np.nanmax(depths * ~np.isnan(temps), axis=0) > deep_cast_min_depth)

            # Only protect deep casts that sit inside the along-track cutoff
            within_range_mask = along < deep_cast_max_along
            final_mask = deep_cast_mask & within_range_mask

            # Times of the casts that must survive the gradient filter
            preserve_times = leg['time'].values[final_mask]

            # ─── Gradient filtering loop ───
            # Repeatedly drop profiles where `along` moves the wrong way for
            # this leg, until a pass removes nothing.
            prev_len = -1
            while prev_len != len(leg['time']):
                if len(leg['along']) < 3:
                    raise ValueError("Too short for gradient")
                prev_len = len(leg['time'])
                grad = np.gradient(leg['along'].values)
                keep_mask = grad > 0 if name == "out" else grad < 0
                leg = leg.sel(time=keep_mask)

            # ─── Add back preserved deep profiles if dropped ───
            times_to_add = [t for t in preserve_times if t not in leg['time'].values]
            if times_to_add:
                preserved = ds.sel(time=times_to_add)
                leg = leg.sortby('along')
                # NOTE(review): the preserved casts are appended after the
                # sort, so the result is not ordered by `along`; presumably
                # interpolate() copes with that — confirm.
                leg = xr.concat([leg, preserved], dim='time')

            if np.count_nonzero(~np.isnan(leg['along'])) < 100:
                continue

            # Interpolate and mask before assigning to results
            leg_interp = interpolate(leg, step=step)
            leg_interp = mask_dataset(leg_interp, topo)
            results[name] = leg_interp

        except Exception as e:
            # Best effort: one bad leg must not abort the other leg.
            print(f"⚠️ Skipping {name} leg in {file_pathway}: {e}")

    # A leg that was skipped or failed is simply missing from results
    ds_out_cleaned = results.get("out")
    ds_return_cleaned = results.get("return")

    return ds_out_cleaned, ds_return_cleaned

def mask_dataset(ds, topo):
    """
    Applies depth-based masking to all 2D (depth, along) variables using valid_temp_depth
    and bathymetry clearance logic.

    The dataset is modified in place and also returned. A 'mask_depth'
    variable (per along position) is added to it.

    Parameters:
    - ds: xarray.Dataset, must include 'valid_temp_depth', 'along', 'depth',
      'longitude' and 'latitude'
    - topo: xarray.Dataset, bathymetry with a 'Band1' variable on lon/lat

    Returns:
    - ds: the same xarray.Dataset with every (depth, along) variable set to
      NaN below its column's mask depth

    Raises:
    - ValueError: if 'valid_temp_depth' is missing from ds
    """
    import warnings

    if 'valid_temp_depth' not in ds:
        raise ValueError("Dataset must include 'valid_temp_depth'")

    # Extract variables
    along = ds['along'].values
    depth = ds['depth'].values
    lon = ds['longitude'].values
    lat = ds['latitude'].values

    # Nearest-neighbour bathymetry lookup at each along position, sign-flipped
    ocean_floor = -topo['Band1'].interp(
        lon=xr.DataArray(lon, dims='along'),
        lat=xr.DataArray(lat, dims='along'),
        method='nearest').values

    valid_depths = ds['valid_temp_depth'].values
    local_clearance = ocean_floor - valid_depths

    # Average clearance in trusted region
    trusted = (along >= 40000) & (along <= 77000)
    trusted_clearance = local_clearance[trusted]
    if np.any(~np.isnan(trusted_clearance)):
        mean_clearance = np.nanmean(trusted_clearance)
    else:
        # Same numerical outcome as nanmean on an empty/all-NaN slice, but
        # with an explicit message: a NaN mean makes every mask depth NaN,
        # which blanks every column below.
        warnings.warn(
            "mask_dataset: no valid clearance in the trusted along range; "
            "all (depth, along) values will be masked",
            RuntimeWarning,
            stacklevel=2)
        mean_clearance = np.nan

    # Compute adaptive mask depth: use the column's own clearance when it is
    # no larger than the trusted mean, otherwise the mean; never go below
    # the ocean floor.
    mask_depth = np.minimum(
        np.where(local_clearance <= mean_clearance,
                 ocean_floor - local_clearance,
                 ocean_floor - mean_clearance),
        ocean_floor)

    # ─── Extra masking of 5 m where 40 <= along/1000 <= 75 ───
    along_km = along / 1000
    extra_mask_zone = (along_km >= 40) & (along_km <= 75)
    mask_depth[extra_mask_zone] = np.maximum(mask_depth[extra_mask_zone] - 5, 0)

    # One (depth, along) boolean mask shared by every variable: a cell is
    # removed when it lies deeper than its column's limit, and whole columns
    # are removed where the limit is NaN.
    below_limit = (
        (depth[:, np.newaxis] > mask_depth[np.newaxis, :])
        | np.isnan(mask_depth)[np.newaxis, :])

    # Apply mask to all 2D (depth, along) variables
    for var in list(ds.data_vars):
        da = ds[var]
        if set(da.dims) == {'depth', 'along'}:
            # Transpose so the array layout matches the mask whatever the
            # stored dimension order is.
            arr = da.transpose('depth', 'along').values.copy()
            arr[below_limit] = np.nan
            # Save masked version
            ds[var] = (('depth', 'along'), arr)

    # Stored once, independent of how many 2D variables were masked
    ds['mask_depth'] = ('along', mask_depth)

    return ds

In [None]:
from pathlib import Path
# Reprocess one transect file and plot each leg that survived cleaning.
file_path = Path("~/CalvertLine_reprocessed/dfo-hal1002-20240702_grid_delayed.nc").expanduser()
topo = xr.open_dataset("~/Desktop/Summer 2025 Python/british_columbia_3_msl_2013.nc")

ds_out, ds_return = clean_and_interpolate(str(file_path), topo)

if ds_out is not None:
    plot_section(ds_out, topo)
else:
    print("⚠️ No outbound leg found.")

# The return-leg branch reports the return leg (it used to say "outbound").
if ds_return is not None:
    plot_section(ds_return, topo)
else:
    print("⚠️ No return leg found.")

For fun, I'll test some of the new data now

In [None]:
# Run the same cleaning on a newer transect file and plot each surviving leg.
file_path = Path("~/Desktop/dfo-hal1002-20250701_grid.nc").expanduser()
topo = xr.open_dataset("~/Desktop/Summer 2025 Python/british_columbia_3_msl_2013.nc")

ds_out, ds_return = clean_and_interpolate(str(file_path), topo)

if ds_out is not None:
    plot_section(ds_out, topo)
else:
    print("⚠️ No outbound leg found.")

# The return-leg branch reports the return leg (it used to say "outbound").
if ds_return is not None:
    plot_section(ds_return, topo)
else:
    print("⚠️ No return leg found.")

# The above reprocessing looks good, so I'm moving on to making a 2024–2025 cube now #

In [None]:
from pathlib import Path
import xarray as xr
import pandas as pd

# ─── Where the raw transects are read from and the cleaned legs are written ───
input_dir = Path("~/CalvertLine_reprocessed").expanduser()
output_dir = Path("~/Desktop/Summer 2025 Python/cleaned_transects_2024_2025").expanduser()
output_dir.mkdir(parents=True, exist_ok=True)

# ─── Bathymetry is shared by every transect, so it is opened a single time ───
topo = xr.open_dataset("~/Desktop/Summer 2025 Python/british_columbia_3_msl_2013.nc")

# ─── Keep gridded files whose name mentions 2024 or 2025 ───
all_files = sorted(input_dir.glob("*_grid*.nc"))
target_files = [f for f in all_files if "2024" in f.name or "2025" in f.name]

print(f"🔍 Found {len(target_files)} files from 2024–2025")

# ─── Clean each file and write one NetCDF per surviving leg ───
for file_path in target_files:
    print(f"\n🧼 Cleaning: {file_path.name}")

    try:
        ds_out, ds_return = clean_and_interpolate(str(file_path), topo)

        for leg, ds in zip(("out", "return"), (ds_out, ds_return)):
            if ds is None:
                print(f"⚠️ No {leg} leg in {file_path.name}")
                continue

            # The output file is named after the first non-missing time
            leg_times = ds['time'].values
            valid_times = leg_times[~np.isnan(leg_times)]
            if not len(valid_times):
                print(f"⚠️ Skipping {file_path.name} — no valid times in {leg} leg.")
                continue

            timestamp = pd.to_datetime(valid_times[0]).strftime('%Y%m%d')
            out_path = output_dir / f"{timestamp}_{leg}.nc"
            ds.to_netcdf(out_path)
            print(f"✅ Saved {leg}: {out_path.name}")

    except Exception as e:
        # Report the failure and carry on with the next file
        print(f"❌ Failed on {file_path.name}: {e}")

In [None]:
import xarray as xr
from pathlib import Path

# Stack every cleaned leg file into one cube along a new 'transect' dimension.
input_dir = Path("~/Desktop/Summer 2025 Python/cleaned_transects_2024_2025").expanduser()
all_files = sorted(input_dir.glob("*.nc"))

datasets = []

for nc_file in all_files:
    try:
        # Load the leg into memory and close the file immediately, so the
        # loop does not leave one open handle per transect behind.
        with xr.open_dataset(nc_file) as opened_ds:
            ds = opened_ds.load()

        # Promote lon/lat to regular variables if they are coordinates
        for coord in ['longitude', 'latitude']:
            if coord in ds.coords:
                ds = ds.reset_coords(coord)

        # Add transect label (the file name without its extension)
        transect_label = nc_file.stem
        ds = ds.expand_dims(transect=[transect_label])
        datasets.append(ds)

        print(f"✅ Added: {transect_label}")

    except Exception as e:
        # Best effort: an unreadable file is reported and left out of the cube
        print(f"⚠️ Skipping {nc_file.name}: {e}")

# Combine all into one cube
if datasets:
    cube = xr.concat(datasets, dim='transect')
    cube.to_netcdf("~/Desktop/2024_2025_transect_cube.nc")
    print("✅ Saved cube to ~/Desktop/2024_2025_transect_cube.nc")
else:
    print("⚠️ No valid datasets to combine.")

That worked! Signing off for now as of Wed Aug 6 @ 14:04.

Edit: something was wrong — the depths weren't all on 1 m spacing, so the plots looked very strange. I fixed that, and I also only kept deep casts within 20 km along-track, because one of the dates did some weird stuff and this fixed it. Made a new cube as of 15:08 and am trying to plot again. I've been working at this for 3 hours straight — wow, processing data can be quite the adventure.

Okay, yes, it worked. But why are some casts not showing up? Because my 0 km wasn't long enough.

As of about 14:07, I've redefined my waypoints. Let's see if this new cube works. Gah, now a data point that was previously being masked for the weird stuff isn't being masked, so I changed the deep casts to be within a range of 17 km, which is just within that new one.