In [1]:
import subprocess

import os
from os.path import join

import yaml
from addict import Dict
import pandas as pd
import xarray as xr
from itertools import product

from sklearn.model_selection import KFold, train_test_split

In [2]:
# Home directory of the current user. Resolved in-process rather than by
# spawning a shell for `echo $HOME`, so it also works where no POSIX shell exists.
HOME = os.path.expanduser("~")

# Top-level directory of the git checkout this notebook runs in. The command is
# passed as an argument list, so no shell is involved; raises CalledProcessError
# when run outside a git work tree.
REPO_ROOT = subprocess.check_output(
    ["git", "rev-parse", "--show-toplevel"], text=True
).strip()
CONFIG_ROOT = join(REPO_ROOT, "config")

# Read config/data.yaml into an attribute-accessible Dict. The `with` block
# closes the file handle as soon as parsing is done.
# NOTE(review): FullLoader is kept to preserve existing parsing behaviour;
# yaml.safe_load would be enough if the config only holds plain YAML types.
with open(join(CONFIG_ROOT, "data.yaml"), "r") as config_file:
    DATA_CONFIG = Dict(yaml.load(config_file, Loader=yaml.FullLoader))

# Directory (relative to the current working directory) that receives the exported splits.
ARTIFACTS_PATH = "artifacts"

## Load data

In [3]:
# Open the dataset located at DATA_CONFIG.path (joined onto HOME) and keep
# only the time range between the configured start and end dates.
dataset_path = join(HOME, DATA_CONFIG.path)
time_window = slice(DATA_CONFIG.start_date, DATA_CONFIG.end_date)

ds = xr.open_dataset(dataset_path).sel(time=time_window)
ds

In [4]:
# Column selections for the exported CSVs, taken from the data config.
features = DATA_CONFIG.features
target = DATA_CONFIG.target

# Not referenced anywhere else in this cell.
categorical_features = list()

# Values of the dataset's `station` coordinate; the folds are drawn over these.
unique_stations = ds.station.values

# One K-fold shuffle per seed: 0, 1, ..., n_seeds - 1.
seeds = [*range(DATA_CONFIG.n_seeds)]

# The output directory is named after the sorted feature names followed by
# the start and end dates, all joined with underscores.
name_parts = sorted(features) + [DATA_CONFIG.start_date, DATA_CONFIG.end_date]
path = join(ARTIFACTS_PATH, "_".join(name_parts))
os.makedirs(path, exist_ok=True)
# For every seed, split the stations into folds and write each fold's
# train/test features and target to CSV under <path>/seed_<seed>/fold_<fold_id>/.
for seed in seeds:
    seed_path = join(path, f"seed_{seed}")
    os.makedirs(seed_path, exist_ok=True)

    # The split is made over station labels, not over individual rows, so all
    # rows of a given station land on the same side of a fold.
    splitter = KFold(n_splits=DATA_CONFIG.n_folds, shuffle=True, random_state=seed)

    for fold_id, (train_idx, test_idx) in enumerate(splitter.split(unique_stations)):
        fold_path = join(seed_path, f"fold_{fold_id}")
        os.makedirs(fold_path, exist_ok=True)

        for kind, idx in (("train", train_idx), ("test", test_idx)):
            # Select this side's stations and flatten the dataset into a table.
            stations = unique_stations[idx]
            df = ds.sel(station=stations).to_dataframe().reset_index()

            X, y = df[features], df[target]

            X.to_csv(join(fold_path, f"{kind}_x.csv"), index=False)
            y.to_csv(join(fold_path, f"{kind}_y.csv"), index=False)
            print(f"Saved {kind} data for fold {fold_id} with seed {seed} of shape X={X.shape} and y={y.shape}")

Saved train data for fold 0 with seed 0 of shape X=(22320, 3) and y=(22320,)
Saved test data for fold 0 with seed 0 of shape X=(7440, 3) and y=(7440,)
Saved train data for fold 1 with seed 0 of shape X=(22320, 3) and y=(22320,)
Saved test data for fold 1 with seed 0 of shape X=(7440, 3) and y=(7440,)
Saved train data for fold 2 with seed 0 of shape X=(22320, 3) and y=(22320,)
Saved test data for fold 2 with seed 0 of shape X=(7440, 3) and y=(7440,)
Saved train data for fold 3 with seed 0 of shape X=(22320, 3) and y=(22320,)
Saved test data for fold 3 with seed 0 of shape X=(7440, 3) and y=(7440,)
Saved train data for fold 0 with seed 1 of shape X=(22320, 3) and y=(22320,)
Saved test data for fold 0 with seed 1 of shape X=(7440, 3) and y=(7440,)
Saved train data for fold 1 with seed 1 of shape X=(22320, 3) and y=(22320,)
Saved test data for fold 1 with seed 1 of shape X=(7440, 3) and y=(7440,)
Saved train data for fold 2 with seed 1 of shape X=(22320, 3) and y=(22320,)
Saved test data f