# Making training data

Here we take the various experiments that have been run and combine them into a single training dataset. The final section also prepares the gridded inference data.

# Imports

In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# general libraries
import glob
import os
import pathlib
import site
import re
import sys
import warnings
from loguru import logger
# data handling libraries
import geopandas as gpd
import numpy as np
import pandas as pd
import rioxarray as rxr
import xarray as xr
from tqdm.dask import TqdmCallback as ProgressBarDask
from tqdm.notebook import tqdm as ProgressBar
# plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
# machine learning libraries
import gpytorch
from sklearn import gaussian_process as gp
import torch

import cryogrid_pytools as cg
import pamir_mlpermafrost as pamir

# Loading data

In [4]:
# Directory that holds the CryoGrid experiment outputs.
path_output = pathlib.Path("../../pamir-CryoGrid/output/")

# Gather every top-level "*.zarr" entry and put the paths in sorted order so
# that the processing order is deterministic.
fnames = list(path_output.glob("*.zarr"))
fnames.sort()

In [5]:
# Splits a file stem into its alphanumeric tokens, using '-' and '_' as the
# (optional) separators around each token.
pattern = re.compile(r"[-_]?([a-zA-Z0-9]+)[_-]?")

tables = []
for fname in fnames:
    logger.info(f"Processing file: {fname}")
    ds = xr.open_zarr(fname, consolidated=True)

    # The second-to-last token of the stem is used as the experiment label.
    # NOTE(review): for a stem such as "cluster_config-k1500-pamir_N180-exp1"
    # this selects "N180", not "exp1" — confirm that this is the intended label.
    stem_tokens = re.findall(pattern, fname.stem)
    exp = stem_tokens[-2]

    df = pamir.data.process_permafrost.get_training_data_table(
        ds,
        experiment_name=exp,
    )
    tables.append(df)

# Stack the per-file tables row-wise into the single training table.
table = pd.concat(tables)

[32m2025-07-31 17:33:46.347[0m | [1mINFO    [0m | [36m1912776063:<module>:6[0m - [1mProcessing file: ../../pamir-CryoGrid/output/cluster_config-k1500-pamir_N180-exp1.zarr[0m
[32m2025-07-31 17:34:00.007[0m | [1mINFO    [0m | [36m1912776063:<module>:6[0m - [1mProcessing file: ../../pamir-CryoGrid/output/cluster_config-k1500-pamir_N180-exp4.zarr[0m
[32m2025-07-31 17:34:14.408[0m | [1mINFO    [0m | [36m1912776063:<module>:6[0m - [1mProcessing file: ../../pamir-CryoGrid/output/cluster_config-k1500-pamir_S180-exp1.zarr[0m
[32m2025-07-31 17:34:26.104[0m | [1mINFO    [0m | [36m1912776063:<module>:6[0m - [1mProcessing file: ../../pamir-CryoGrid/output/cluster_config-k1500-pamir_S180-exp2.zarr[0m
[32m2025-07-31 17:34:38.214[0m | [1mINFO    [0m | [36m1912776063:<module>:6[0m - [1mProcessing file: ../../pamir-CryoGrid/output/cluster_config-k1500-pamir_S180-exp4.zarr[0m
[32m2025-07-31 17:34:50.068[0m | [1mINFO    [0m | [36m1912776063:<module>:6[0m - [1

# Collocating variables from spatial data

In [None]:
# Location of the spatial-variables Zarr store on S3, accessed through fsspec's
# `simplecache::` protocol chaining. This is a plain string: the original used
# an f-string prefix although the literal contains no placeholders.
fname_spatial = 'simplecache::s3://spi-pamir-cryogrid/processed-cluster_config/spatial_variables-710w365s750e400n-100m.zarr/'

# Any warning raised while opening the remote store is deliberately silenced;
# the filter is restored when the `with` block exits.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # Open the spatial variables dataset
    ds_spatial = xr.open_zarr(fname_spatial, storage_options=pamir.data.s3_utils.fsspec_kwargs)

# Collocate the spatial variables with the rows of `table` (project helper),
# then discard every row that has at least one missing value.
df_spatial = (
    pamir.data.process_permafrost
    .get_collocated_spatial_data(ds_spatial, table)
    .dropna()
)



In [6]:
# Copy the collocated spatial predictors into the training table, one column at
# a time. Rows of `table` without a counterpart in `df_spatial` (e.g. the rows
# removed by its dropna) presumably end up as NaN through index alignment.
for column in (
    'land_cover',
    'temperature',
    'temperature_downscaled',
    'precipitation',
    'snow_melt_doy',
):
    table[column] = df_spatial[column]

## ERA5 seasonal temperature and precipitation quantiles

In [None]:
# Keyword arguments for `xr.open_zarr` on the ERA5 stores: read consolidated
# metadata and point the S3 filesystem at the project's endpoint.
s3_endpoint_url = pamir.data.s3_utils.fsspec_kwargs['s3']['endpoint_url']
props = dict(
    consolidated=True,
    storage_options=dict(endpoint_url=s3_endpoint_url),
)

In [8]:
url_template = "s3://spi-pamir-c7-sdsc/era5_data/central_asia/central_asia-{year}.zarr/"

# One store per calendar year, 2000 through 2024 (range end is exclusive).
yearly_urls = [url_template.format(year=y) for y in range(2000, 2025)]
era5_central_asia_list = [xr.open_zarr(url, **props) for url in yearly_urls]

# Join the yearly datasets end-to-end along the time axis.
era5_central_asia = xr.concat(era5_central_asia_list, dim='time')

# Bounding box and period to keep. The latitude slice runs from 40 down to 36.5,
# presumably because the ERA5 latitude coordinate is stored north-to-south.
subset = dict(
    latitude=slice(40, 36.5),
    longitude=slice(70, 75),
    time=slice('2000', None),
)
# Short variable names and x/y dimension names used by the rest of the notebook.
new_names = dict(latitude='y', longitude='x', tp='precip', t2m='temp2m')

era5_tajik = era5_central_asia[['t2m', 'tp']].sel(**subset).rename(**new_names)

In [9]:
def get_seasonal_quantiles(da, quantiles=(0.05, 0.5, 0.95), dim='time'):
    """
    Calculate seasonal quantiles for a given DataArray.

    The array is grouped by ``{dim}.season`` and the requested quantiles are
    computed along ``dim`` within each season. Every (quantile, season) pair is
    then turned into its own data variable named
    ``"{name}_{season}_q{pct:02d}"``, where ``name`` is ``da.name`` and ``pct``
    is the quantile expressed as a whole-number percentage
    (e.g. quantile 0.05 -> ``q05``).

    Parameters:
        da (xr.DataArray): Named input data array with a ``dim`` dimension
            that supports the ``.season`` accessor.
        quantiles (sequence of float): Quantiles (fractions in [0, 1]) to compute.
        dim (str): Dimension along which to group by season and compute the
            quantiles.

    Returns:
        xr.Dataset: Dataset with one data variable per (quantile, season)
        combination.
    """
    name = da.name

    out = (
        da.groupby(f'{dim}.season')
        .quantile(quantiles, dim=dim)
        # Collapse the two new dimensions into one so that each combination
        # can be split off as a separate data variable keyed by (q, season).
        .stack(stacked=['quantile', 'season'])
        .to_dataset(dim='stacked')
        .drop_vars(['quantile', 'season']))

    # Round before converting to int: products such as 0.29 * 100 evaluate to
    # 28.999999999999996 in floating point, which plain int() would truncate
    # to 28 and mislabel the variable.
    out = out.rename({
        (q, seas): f"{name}_{seas}_q{int(round(q * 100)):02d}"
        for q, seas in out.data_vars
    })

    return out

In [10]:
with ProgressBarDask(desc="Calculating seasonal quantiles for ERA5 data"):
    # Seasonal quantiles for temperature and precipitation, one dataset each.
    era5_quantile_parts = [
        get_seasonal_quantiles(era5_tajik[var_name])
        for var_name in ('temp2m', 'precip')
    ]
    # Merge both, interpolate onto the coordinates of the spatial-variables
    # dataset, and only then trigger the actual computation.
    ds_era5_quantiles = xr.merge(era5_quantile_parts).interp_like(ds_spatial).compute()

Calculating seasonal quantiles for ERA5 data:   0%|          | 0/3896 [00:00<?, ?it/s]

In [11]:
# Disabled on purpose: would append (mode='a') the ERA5 seasonal quantiles to
# the local spatial-variables store.
# NOTE(review): the inference section opens this same path — confirm whether
# the store is expected to already contain these variables.
# ds_era5_quantiles.to_zarr('../../pamir-CryoGrid/forcing/spatial_variables-710w365s750e400n-100m.zarr/', mode='a')

In [None]:
# Collocate the ERA5 quantile fields with the rows of `table` ...
df_era5 = pamir.data.process_permafrost.get_collocated_spatial_data(ds_era5_quantiles, table)
# ... and keep only the columns whose names start with "temp2m_" or "precip_".
df_era5 = df_era5.filter(regex='^temp2m_|^precip_')



In [13]:
# Add every ERA5 quantile column to the training table under the same name.
for key in df_era5.columns:
    table[key] = df_era5[key]

# Saving table

In [21]:
# Write the training table to Parquet, keeping only rows without any missing
# value (dropna returns a new frame, so `table` itself is left unchanged).
table.dropna().to_parquet('../data/training/training_data-k1500-pamir_ns180-expX.parquet')

# Creating inference data

In [13]:
# NOTE: this rebinds `fname_spatial` and `ds_spatial`, which pointed at the S3
# store in the training section, to the local forcing store.
fname_spatial = '../../pamir-CryoGrid/forcing/spatial_variables-710w365s750e400n-100m.zarr/'

ds_spatial = xr.open_zarr(fname_spatial)
# Remove the 'spatial_ref' variable before casting everything to float32.
ds_spatial = ds_spatial.drop_vars('spatial_ref')
ds_spatial = ds_spatial.astype('float32')
# Pull the whole dataset into memory.
ds_spatial = ds_spatial.load()

In [31]:
from numcodecs.zarr3 import Blosc

In [34]:
# Reset the metadata of every data variable before writing: request float32 as
# the stored dtype with a default-configured Blosc compressor (a new instance
# per variable), and clear all attributes.
for key in ds_spatial.data_vars:
    ds_spatial[key].encoding = {'dtype': 'float32', 'compressors': Blosc()}
    ds_spatial[key].attrs = {}

# Coordinates get an empty encoding and empty attributes.
for key in ds_spatial.coords:
    ds_spatial[key].encoding = {}
    ds_spatial[key].attrs = {}

In [35]:
# Re-chunk to 1000 x 1000 tiles along x/y and make the chunking consistent
# across variables, then overwrite (mode='w') the inference store.
ds_inference_chunked = ds_spatial.chunk({'x': 1000, 'y': 1000}).unify_chunks()
ds_inference_chunked.to_zarr(
    '../data/inference/inference_variables-710w365s750e400n-100m.zarr',
    mode='w',
)

<xarray.backends.zarr.ZarrStore at 0x165957060>

In [None]:
# Build the inference table from the gridded variables via the project helper
# (presumably a DataFrame, given the to_parquet call that follows).
# NOTE(review): ds_spatial was already loaded into memory with .load() when it
# was opened, so .compute() here looks redundant — confirm.
df_inference = pamir.models.datasets.load_inference_data_from_zarr(ds_spatial.compute())

In [14]:
# Persist the inference table as Parquet next to the inference Zarr store.
df_inference.to_parquet('../data/inference/inference_data-pamir_N180.parquet')