## Set up

In [1]:
import xarray as xr
import numpy as np
import pandas as pd
import time

## Download data

Necessary inputs:

* *depth* `wget -O ocean_static.deptho.nc https://psl.noaa.gov/thredds/fileServer/Projects/CEFI/regional_mom6/cefi_derivative/northeast_pacific/full_domain/hindcast/monthly/regrid/r20250509/static/ocean_static.deptho.nc`
* *MLD_003* `wget -O MLD_003.nep.full.hcast.monthly.regrid.r20250509.199301-202412.nc https://psl.noaa.gov/thredds/fileServer/Projects/CEFI/regional_mom6/cefi_portal/northeast_pacific/full_domain/hindcast/monthly/regrid/r20250509/MLD_003.nep.full.hcast.monthly.regrid.r20250509.199301-202412.nc`
* *thetao* `wget -O thetao.nep.full.hcast.monthly.regrid.r20250509.199301-202412.nc https://psl.noaa.gov/thredds/fileServer/Projects/CEFI/regional_mom6/cefi_portal/northeast_pacific/full_domain/hindcast/monthly/regrid/r20250509/thetao.nep.full.hcast.monthly.regrid.r20250509.199301-202412.nc`
* *uo_rotate* `wget -O uo_rotate.nep.full.hcast.monthly.regrid.r20250509.199301-202412.nc https://psl.noaa.gov/thredds/fileServer/Projects/CEFI/regional_mom6/cefi_portal/northeast_pacific/full_domain/hindcast/monthly/regrid/r20250509/uo_rotate.nep.full.hcast.monthly.regrid.r20250509.199301-202412.nc`
* *vo_rotate* `wget -O vo_rotate.nep.full.hcast.monthly.regrid.r20250509.199301-202412.nc https://psl.noaa.gov/thredds/fileServer/Projects/CEFI/regional_mom6/cefi_portal/northeast_pacific/full_domain/hindcast/monthly/regrid/r20250509/vo_rotate.nep.full.hcast.monthly.regrid.r20250509.199301-202412.nc`

This script assumes you have saved these files in an `OcModels/Data/MOM6` folder; the notebook reads them through the relative path `../../Data/MOM6`.

Depth was based on a link Chia-Wei shared with me. The remaining queries were generated on the [CEFI Data Portal](https://psl.noaa.gov/cefi_portal/#data_access) with the following selections:
* Region: NEP
* Subregion: full domain
* Experiment Type: hindcast
* Output Frequency: monthly regrid
* Release: r20250509
* Data Category: ocean_monthly_z

## Life stage dictionary

Based on [Data Availability](https://docs.google.com/document/d/1P8D0kH2xn4NYBc0ib3rYfSO0KAiyDNQ_kZgG4Qub7qY/edit?tab=t.0#heading=h.z5x77oqxpj4i), define filters for each life stage for time, bottom depth, and latitude.

For northern yellowtail, the bottom depth is used to determine the longitudinal extent, and the average is taken across the whole water column. For example, if the longitudinal extent is bottom depths 90-180m, the netCDF is filtered for isobaths where the depth is \[90, 180], and the average is calculated for all the water \[0, 180] at those depths.

In [2]:
# Life-stage filters: months, latitudes, and depths (meters); all values are integers.
# Every range is a closed interval, so e.g. depth 0-180 keeps 0, 180,
#   and every depth in between.
# NOTE: the key "partuition" (sic) is looked up by this exact spelling later in
#   the notebook, so the spelling is kept unchanged here.

lifestage_dict = dict(
    partuition=dict(
        min_month=1, max_month=4,
        min_lat=40, max_lat=48,
        min_depth=0, max_depth=180,
    ),
    larvae=dict(
        min_month=2, max_month=3,
        min_lat=40, max_lat=48,
        min_depth=0, max_depth=90,
    ),
)

## Load datasets

In [3]:
# open the static depth file and the monthly MLD_003 file as xarray Datasets
ds_depth = xr.open_dataset(filename_or_obj="../../Data/MOM6/ocean_static.deptho.nc")
ds_mld = xr.open_dataset(
    filename_or_obj="../../Data/MOM6/MLD_003.nep.full.hcast.monthly.regrid.r20250509.199301-202412.nc"
)

In [4]:
def print_non_nans(dataset, var_name):
    """Print the number and percent of records of one variable that are not np.nan.

    Inputs:
    - dataset: object indexed by variable name; the selected variable must
      expose `.data` (its values) and `.size` (its total record count)
    - var_name: string with the name of the variable to inspect

    Assumes numpy has already been imported as np. Returns None.
    """
    variable = dataset[var_name]
    total = variable.size
    # np.isfinite is False for np.nan (and also for +/-inf)
    n_valid = np.isfinite(variable.data).sum()
    share = n_valid / total * 100
    print(f'{n_valid} of {total} records are not np.nan')
    print(f'{share:.2f}% records')

In [5]:
def annual_mean_2d(var_dataset, depth_dataset, ds_var_name, df_var_name, filter_dict, print_counts=False):
    '''
    Calculate annual mean for a 2-D variable

    Inputs:
    - var_dataset: xarray Dataset with the variable to average
    - depth_dataset: xarray Dataset with depth (variable `deptho`) to filter
    - ds_var_name: string with the variable name, used to extract from var_dataset
    - df_var_name: string with the variable name, used to rename column in returned dataframe
    - filter_dict: dictionary with relevant filters
        - min_month, max_month: minimum and maximum month (integers, inclusive)
        - min_lat, max_lat: minimum and maximum latitude (integers or floats, inclusive)
        - min_depth, max_depth: minimum and maximum depth to filter the longitudinal extent
    - print_counts: boolean, to print record count and % at each step (default False)

    Outputs:
    - ds_df: pandas dataframe with an annual average of the variable

    Raises:
    - ValueError: if min_month is greater than max_month, because the month
      list would be empty and no records would be selected
    '''

    # validate the month window first: an inverted window used to produce an
    # empty month list and silently select nothing
    min_month = filter_dict["min_month"]
    max_month = filter_dict["max_month"]
    if min_month > max_month:
        raise ValueError(
            f"min_month ({min_month}) must be <= max_month ({max_month}); "
            "month windows that wrap around the end of the year are not supported"
        )

    if print_counts:
        print("---" + df_var_name + "---")
        print_non_nans(var_dataset, ds_var_name)

    # merge files so the depth variable shares the variable's dataset
    ds_merged = xr.merge([var_dataset, depth_dataset])
    if print_counts:
        print_non_nans(ds_merged, ds_var_name)

    # create list with months (inclusive of both ends)
    month_list = list(range(min_month, max_month + 1))
    # select times and latitudes
    # NOTE(review): the label-based lat slice presumably relies on `lat` being
    #   stored in ascending order - confirm against the input files
    ds_selected = ds_merged.sel(
        time=ds_merged.time.dt.month.isin(month_list),
        lat=slice(filter_dict["min_lat"], filter_dict["max_lat"]),
    )
    if print_counts:
        print_non_nans(ds_selected, ds_var_name)

    # mask depths: values outside [min_depth, max_depth] become NaN
    depth_in_range = (
        (ds_selected.deptho >= filter_dict["min_depth"]) &
        (ds_selected.deptho <= filter_dict["max_depth"])
    )
    ds_masked = ds_selected.where(depth_in_range)
    if print_counts:
        print_non_nans(ds_masked, ds_var_name)

    # calculate mean per calendar year over all remaining times and grid cells
    # NOTE(review): this is a plain (unweighted) mean; if grid cells differ in
    #   area, an area-weighted mean may be more appropriate - confirm
    ds_mean = ds_masked[ds_var_name].groupby('time.year').mean(dim=['time', 'lon', 'lat'])

    # create pandas dataframe
    ds_df = ds_mean.to_dataframe()

    # rename column from dataset to dataframe variable name (returns a new
    # frame instead of mutating in place)
    return ds_df.rename(columns={ds_var_name: df_var_name})

In [6]:
# annual mean of MLD_003 for each life stage, printing record counts at each step
MLDpart = annual_mean_2d(
    var_dataset=ds_mld,
    depth_dataset=ds_depth,
    ds_var_name="MLD_003",
    df_var_name="MLDpart",
    filter_dict=lifestage_dict['partuition'],
    print_counts=True,
)
MLDlarv = annual_mean_2d(
    var_dataset=ds_mld,
    depth_dataset=ds_depth,
    ds_var_name="MLD_003",
    df_var_name="MLDlarv",
    filter_dict=lifestage_dict['larvae'],
    print_counts=True,
)

---MLDpart---
38069760 of 106719360 records are not np.nan
35.67% records
38069760 of 106719360 records are not np.nan
35.67% records
1804288 of 4102912 records are not np.nan
43.98% records
12416 of 4102912 records are not np.nan
0.30% records
---MLDlarv---
38069760 of 106719360 records are not np.nan
35.67% records
38069760 of 106719360 records are not np.nan
35.67% records
902144 of 2051456 records are not np.nan
43.98% records
2944 of 2051456 records are not np.nan
0.14% records


In [9]:
# outer-join the two annual dataframes on their index so each becomes one column
output_df = MLDpart.merge(MLDlarv, how="outer", left_index=True, right_index=True)
output_df

Unnamed: 0_level_0,MLDpart,MLDlarv
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1993,11.830518,10.904479
1994,11.643274,11.090589
1995,10.15039,8.718237
1996,8.544428,7.829722
1997,8.017573,7.474869
1998,11.557767,8.623075
1999,10.01516,8.873915
2000,9.018702,7.800598
2001,15.199876,14.3034
2002,10.175895,8.582202


In [10]:
# write the merged dataframe to a CSV file in the MOM6 data folder
output_df.to_csv(path_or_buf="../../Data/MOM6/output.csv")