# Part 1: Take the full land surface model dataset generated for Trail Valley Creek and create the subsets relevant to this study

Benoit Montpetit, CPS/CRD/ECCC, 2025  
Nicolas Leroux, RPN-E/MRD/ECCC, 2025  
Mike Brady, CPS/CRD/ECCC, 2025

This notebook takes the full time series of multi-layered snowpacks from land surface models (Soil Vegetation Snow version 2, SVS-2; [Woolley et al. (2024)](https://doi.org/10.5194/tc-18-5685-2024); [Vionnet et al. (2022)](https://agupubs.onlinelibrary.wiley.com/doi/10.1029/2021WR031778); [Vionnet et al. (Preprint)](https://doi.org/10.5194/egusphere-2025-3396)) and sub-samples it to the time period relevant to this study. A second subset, containing only the top 30 ensemble members identified by [Woolley et al. (2024)](https://doi.org/10.5194/tc-18-5685-2024), is also created.  
  
The dataset used directly with these codes can be found here: [TVC SVS-2 (Montpetit et al., Preprint)](ZenodoLink), to avoid duplicating large datasets on Zenodo.  
A different version of the same dataset, originally published by [Woolley et al. (2024)](https://doi.org/10.5194/tc-18-5685-2024) can be found here: [TVC SVS-2 (Woolley et al., Preprint)](link)

In [None]:
from pathlib import Path
import xarray as xr
import pandas as pd

In [None]:
def load_data_subset_time(svs2_netcdf, start='2018-12-01', end='2019-01-31'):
    """Load one SVS-2 NetCDF file and keep only a given time window.

    Parameters
    ----------
    svs2_netcdf : str or path-like
        Path of the NetCDF file, handed to ``xr.open_dataset``.
    start, end : str, optional
        Bounds of the label-based slice applied to the ``time`` coordinate.
        The defaults reproduce the study period that was previously
        hard-coded in this helper.

    Returns
    -------
    xr.Dataset
        The temporal subset, fully loaded into memory.
    """
    # The context manager closes the file once the subset has been read, so
    # looping over many ensemble files does not accumulate open handles.
    with xr.open_dataset(svs2_netcdf) as ds:
        # .load() must run before the file is closed: without it the returned
        # subset would still point lazily at a closed file.
        return ds.sel(time=slice(start, end)).load()

In [None]:
def load_ensemble_data(svs2_netcdf_files, model_option_list):
    """Load the time subset of the NetCDF file matching a set of model options.

    A file matches when the name of its parent directory contains every
    entry of ``model_option_list`` as a substring.

    Parameters
    ----------
    svs2_netcdf_files : iterable of pathlib.Path
        Candidate NetCDF files.
    model_option_list : iterable of str
        Option codes that must all appear in the parent directory name.

    Returns
    -------
    xr.Dataset
        Result of ``load_data_subset_time`` for the first matching file.

    Raises
    ------
    IndexError
        If no file matches (same exception type as the original list
        indexing, now with a message naming the options).
    """
    import warnings

    # materialise once so the options can be re-checked for every file
    options = list(model_option_list)
    matches = [
        nc for nc in svs2_netcdf_files
        if all(opt in nc.parent.name for opt in options)
    ]
    if not matches:
        raise IndexError(
            f"no SVS-2 NetCDF file found whose parent directory name contains all of {options}"
        )
    if len(matches) > 1:
        # NOTE(review): matching is by substring, so a code that is contained
        # in another code (or reused by two option groups) can select an
        # unintended directory -- verify against the directory naming scheme.
        warnings.warn(
            f"{len(matches)} files match model options {options}; using the first one: {matches[0]}",
            stacklevel=2,
        )
    return load_data_subset_time(matches[0])

In [None]:
# Base folder, relative to the working directory, under which the SVS-2 inputs are
# searched and the NetCDF subsets are written.
DATA_ROOT = Path('..') / 'Data'

Arctic SVS-2 Data from Zenodo: https://doi.org/10.5281/zenodo.15690838

In [None]:
# Collect every Arctic NetCDF file in sorted order. Only regular files are kept (as is
# done for the Default tree), since a directory whose name ends in '.nc' would also match
# the glob but cannot be opened as a dataset.
filepaths = sorted(
    nc for nc in (DATA_ROOT / 'SVS-2' / 'Arctic').rglob('*.nc')
    if nc.is_file()
)
# fail early, and say where we looked, if the data has not been downloaded
assert len(filepaths) > 0, f"no NetCDF files found under {DATA_ROOT / 'SVS-2' / 'Arctic'}"

In [None]:
# Subset every Arctic file to the study period, join the results along the 'ensemble'
# dimension (in the order of `filepaths`) and save the combined dataset.
arctic = xr.concat(
    list(map(load_data_subset_time, filepaths)),
    dim='ensemble',
)
arctic.to_netcdf(DATA_ROOT / 'SVS-2_ArcticEnsembles_TVC02.nc')

### The table below corresponds to Appendix D: Table D2 of [Woolley et al., 2024](https://doi.org/10.5194/tc-18-5685-2024)

In [None]:
# Top-30 Arctic ensemble members, derived from Woolley et al. (2024) Appendix D: Table D2.
# 'Ensemble' numbers the rows 1-30. The 'SD', 'FS', 'TC', 'LWC', 'C' and 'TF' columns hold the
# option codes that are matched against the parent directory name of each NetCDF file when
# the top-30 subset is built. 'CRPS' is the score listed for each row
# (presumably the continuous ranked probability score from the paper -- confirm).
arctic_ensembles = pd.DataFrame(
    data={
        'Ensemble': range(1, 31),
        'SD': [
            'R21W', 'R21W', 'R21W', 'R21W', 'R21W', 'R21W', 'R21W', 'R21W', 'R21W', 'R21W', 'R21W', 'R21W', 'R21W', 'R21W', 'R21W',
            'R21W', 'R21W', 'R21W', 'R21R', 'R21W', 'R21R', 'R21W', 'R21W', 'R21W', 'R21R', 'R21W', 'R21R', 'R21R', 'R21W', 'R21W'
        ],
        'FS': [
            'R21', 'R21', 'GW1', 'R21', 'R21', 'R21', 'R21', 'R21', 'GW1', 'R21', 'GW1', 'GW2', 'GW1', 'GW1', 'GW2', 
            'GW1', 'R21', 'GW1', 'R21', 'GW1', 'R21', 'R21', 'R21', 'R21', 'R21', 'R21', 'R21', 'GW1', 'GW2', 'GW1'
        ],
        'TC': [
            'C11', 'C11', 'C11', 'C11', 'F21', 'F21', 'F21', 'S97', 'C11', 'C11', 'C11', 'S97', 'F21', 'C11', 'S97',
            'S97', 'C11', 'C11', 'F21', 'S97', 'F21', 'F21', 'S97', 'S97', 'S97', 'F21', 'C11', 'C11', 'F21', 'F21'
        ],
        'LWC': [
            'B02', 'O04', 'O04', 'B02', 'O04', 'B92', 'SPK', 'O04', 'O04', 'B92', 'B02', 'SPK', 'O04', 'SPK', 'O04',
            'B02', 'B92', 'B02', 'B02', 'B02', 'O04', 'SPK', 'B02', 'SPK', 'SPK', 'O04', 'B92', 'O04', 'SPK', 'B92'
        ],
        'C': [
            'R2V', 'R2V', 'R21', 'R21', 'R21', 'R21', 'R21', 'R21', 'R2V', 'R2V', 'R21', 'R21', 'R21', 'R2V', 'R21',
            'R2V', 'R2D', 'R21', 'R2V', 'R2V', 'R2V', 'R2D', 'R2D', 'R2D', 'R2V', 'R2D', 'R2V', 'R21', 'R21', 'R2V'
        ],
        'TF': [
            'M98', 'RIL', 'RIL', 'M98', 'DEF', 'DEF', 'DEF', 'RIL', 'M98', 'DEF', 'RIL', 'M98', 'RIL', 'RIL', 'DEF',
            'M98', 'M98', 'M98', 'DEF', 'RIL', 'RIL', 'M98', 'RIL', 'DEF', 'RIL', 'DEF', 'DEF', 'DEF', 'RIL', 'DEF'
        ],
        'CRPS': [
            74.04, 74.28, 74.53, 74.53, 74.88, 75.16, 75.48, 75.49, 75.69, 75.7, 75.84, 76.6, 76.69, 76.71, 76.84,
            77.06, 77.11, 77.28, 77.34, 77.36, 77.55, 77.63, 77.79, 77.84, 77.9, 77.92, 77.95, 77.99, 78.02, 78.08
        ]
    }
)

In [None]:
# For each row of the top-30 table, look up the Arctic file whose directory matches the row's
# option codes and subset it in time. The members are then joined along 'ensemble'
# (in table order) and written to NetCDF.
arctic_top = xr.concat(
    [
        load_ensemble_data(filepaths, row)
        for _, row in arctic_ensembles.loc[:, ['SD', 'FS', 'TC', 'LWC', 'C', 'TF']].iterrows()
    ],
    dim='ensemble',
)
arctic_top.to_netcdf(DATA_ROOT / 'SVS-2_ArcticTop30Ensembles_TVC02.nc')

Default SVS-2 Data from Zenodo: https://doi.org/10.5281/zenodo.15690838

In [None]:
# The Default NetCDFs sit in a slightly different subdirectory structure than the Arctic
# ones, so anything matched by the glob that is not a regular file is dropped.
filepaths = sorted(
    filter(Path.is_file, (DATA_ROOT / 'SVS-2' / 'Default').rglob('*.nc'))
)
assert filepaths

In [None]:
# Subset every Default file to the study period, join the results along the 'ensemble'
# dimension (in the order of `filepaths`) and save the combined dataset.
default = xr.concat(
    list(map(load_data_subset_time, filepaths)),
    dim='ensemble',
)
default.to_netcdf(DATA_ROOT / 'SVS-2_DefaultEnsembles_TVC02.nc')

### The table below corresponds to Appendix D: Table D1 of [Woolley et al., 2024](https://doi.org/10.5194/tc-18-5685-2024)

In [None]:
# Top-30 Default ensemble members, derived from Woolley et al. (2024) Appendix D: Table D1.
# 'Ensemble' numbers the rows 1-30. The 'SD', 'FS', 'TC', 'LWC', 'C' and 'TF' columns hold the
# option codes that are matched against the parent directory name of each NetCDF file when
# the top-30 subset is built. 'CRPS' is the score listed for each row
# (presumably the continuous ranked probability score from the paper -- confirm).
default_ensembles = pd.DataFrame(
    data={
        'Ensemble': range(1, 31),
        'SD': [
            'DFLT', 'DFLT', 'DFLT', 'DFLT', 'GA01', 'GA01', 'GA01', 'GA01', 'VI13', 'DFLT', 'DFLT', 'GA01', 'DFLT', 'DFLT', 'GA01', 
            'VI13', 'DFLT', 'DFLT', 'DFLT', 'DFLT', 'DFLT', 'DFLT', 'VI13', 'DFLT', 'DFLT', 'DFLT', 'DFLT', 'GA01', 'VI13', 'DFLT'
        ],
        'FS': [
            'S02', 'S02', 'S02', 'S02', 'S02', 'S02', 'S02', 'S02', 'S02', 'S02', 'V12', 'S02', 'V12', 'V12', 'S02',
            'S02', 'V12', 'V12', 'V12', 'P75', 'V12', 'A76', 'V12', 'A76', 'P75', 'A76', 'A76', 'V12', 'V12', 'A76'
        ],
        'TC': [
            'I02', 'I02', 'Y81', 'Y81', 'I02', 'Y81', 'I02', 'Y81', 'I02', 'C11', 'I02', 'C11', 'Y81', 'C11', 'Y81',
            'C11', 'Y81', 'Y81', 'I02', 'I02', 'C11', 'Y81', 'I02', 'Y81', 'Y81', 'Y81', 'Y81', 'Y81', 'C11', 'Y81'
        ],
        'LWC': [
            'B92', 'B92', 'B02', 'SPK', 'B02', 'SPK', 'B92', 'O04', 'O04', 'B02', 'B02', 'B02', 'O04', 'B02', 'B02',
            'O04', 'SPK', 'O04', 'O04', 'B02', 'SPK', 'O04', 'SPK', 'B02', 'SPK', 'SPK', 'SPK', 'O04', 'SPK', 'B92'
        ],
        'C': [
            'S14', 'S14', 'S14', 'B92', 'S14', 'S14', 'S14', 'S14', 'S14', 'B92', 'S14', 'S14', 'S14', 'S14', 'S14',
            'B92', 'B92', 'B92', 'B92', 'S14', 'B92', 'S14', 'B92', 'S14', 'S14', 'B92', 'B92', 'S14', 'B92', 'B92'
        ],
        'TF': [
            'M98', 'RIL', 'M98', 'RIL', 'RIL', 'RIL', 'RIL', 'DEF', 'RIL', 'DEF', 'DEF', 'DEF', 'M98', 'M98', 'RIL',
            'M98', 'RIL', 'RIL', 'RIL', 'M98', 'RIL', 'DEF', 'M98', 'DEF', 'RIL', 'DEF', 'RIL', 'RIL', 'M98', 'DEF'
        ],
        'CRPS': [
            89.50, 89.89, 89.92, 91.18, 91.33, 91.38, 91.61, 91.67, 91.88, 91.93, 92.37, 93.21, 93.49, 93.52, 93.64,
            93.67, 94.22, 94.57, 94.60, 94.62, 94.73, 94.83, 95.06, 95.27, 95.31, 95.47, 95.47, 95.48, 95.55, 95.73
        ]
    }
)

In [None]:
# For each row of the top-30 table, look up the Default file whose directory matches the row's
# option codes and subset it in time. The members are then joined along 'ensemble'
# (in table order) and written to NetCDF.
default_top = xr.concat(
    [
        load_ensemble_data(filepaths, row)
        for _, row in default_ensembles.loc[:, ['SD', 'FS', 'TC', 'LWC', 'C', 'TF']].iterrows()
    ],
    dim='ensemble',
)
default_top.to_netcdf(DATA_ROOT / 'SVS-2_DefaultTop30Ensembles_TVC02.nc')