In [10]:
import os
from os.path import join

from omegaconf import OmegaConf
import pandas as pd
import xarray as xr
from itertools import product

from sklearn.model_selection import KFold, train_test_split

In [11]:
# Read the notebook settings from the YAML configuration file.
config_file = "config/config.yaml"
config = OmegaConf.load(config_file)

## Load data

In [12]:
# Open the dataset referenced by the config, then restrict it to the
# configured time range; `ds` is the name used in the cells that follow.
ds_full = xr.open_dataset(config.path)
time_window = slice(config.start_date, config.end_date)
ds = ds_full.sel(time=time_window)
ds

In [13]:
# Export train/test CSVs for every (seed, fold) combination.
# Splits are made over the station coordinate values, so each station's rows
# end up entirely in either the train file or the test file of a given fold.
# Output layout: data/<start_date>_<end_date>/seed_<s>/fold_<k>/{train,test}.csv
unique_stations = ds.station.values

seeds = list(range(config.n_seeds))

path = join("data", "_".join([config.start_date, config.end_date]))
os.makedirs(path, exist_ok=True)

for seed in seeds:
    seed_path = join(path, f"seed_{seed}")
    os.makedirs(seed_path, exist_ok=True)

    # Shuffled K-fold whose random state is the current seed.
    splitter = KFold(n_splits=config.n_folds, shuffle=True, random_state=seed)

    for fold_id, (train_idx, test_idx) in enumerate(splitter.split(unique_stations)):
        fold_path = join(seed_path, f"fold_{fold_id}")
        os.makedirs(fold_path, exist_ok=True)

        # Insertion order keeps "train" written (and reported) before "test".
        index_by_kind = {"train": train_idx, "test": test_idx}
        for kind, idx in index_by_kind.items():
            subset = ds.sel(station=unique_stations[idx])
            df = subset.to_dataframe().reset_index()

            csv_file = join(fold_path, f"{kind}.csv")
            df.to_csv(csv_file, index=False)
            print(f"Saved {kind} data for fold {fold_id} with seed {seed} of shape df={df.shape}")

Saved train data for fold 0 with seed 0 of shape df=(22320, 25)
Saved test data for fold 0 with seed 0 of shape df=(7440, 25)
Saved train data for fold 1 with seed 0 of shape df=(22320, 25)
Saved test data for fold 1 with seed 0 of shape df=(7440, 25)
Saved train data for fold 2 with seed 0 of shape df=(22320, 25)
Saved test data for fold 2 with seed 0 of shape df=(7440, 25)
Saved train data for fold 3 with seed 0 of shape df=(22320, 25)
Saved test data for fold 3 with seed 0 of shape df=(7440, 25)
Saved train data for fold 0 with seed 1 of shape df=(22320, 25)
Saved test data for fold 0 with seed 1 of shape df=(7440, 25)
Saved train data for fold 1 with seed 1 of shape df=(22320, 25)
Saved test data for fold 1 with seed 1 of shape df=(7440, 25)
Saved train data for fold 2 with seed 1 of shape df=(22320, 25)
Saved test data for fold 2 with seed 1 of shape df=(7440, 25)
Saved train data for fold 3 with seed 1 of shape df=(22320, 25)
Saved test data for fold 3 with seed 1 of shape df=(74