In [1]:
import pandas as pd
import numpy as np
import sys, os

from matplotlib import pyplot as plt
sys.path.insert(1, os.path.join(sys.path[0], '../../'))  # for importing local packages from src
sys.path.insert(1, os.path.join(sys.path[0], '../'))  # for importing model config

%reload_ext autoreload
%autoreload 2 

## Datasets

In [3]:
from src.model.rc_dataset import ExperimentDataset

# Relative path of the HydroShoot dataset CSV used throughout this notebook.
DATASET_PATH = '../datasets/hydroshoot_large_trimmed.csv'

# Loaded once; later cells query it for the run ids and the reservoir state size.
dataset = ExperimentDataset(csv_path=DATASET_PATH)

## Model Configuration

In [45]:
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GroupKFold
from src.learning.scorers import nmse_scorer
from pipeline_hydroshoot import GroupGenerator, TimeGenerator
from pipeline_base import TrainTestSplitter, DirectTransform, WarmupTransform

# constants
DATASET_ID = 'HydroShoot_large'  # stored in every pipeline's metadata under 'dataset_id'
STATE_SIZE = 32  # number of state indices drawn per reservoir sample
N_STATE_SAMPLES = 16  # number of independent state samples evaluated in the execution cell
state_ids = None  # placeholder; a concrete sample is assigned before any pipelines are built


# Reservoir generation
np.random.seed(42)


def generate_state_sample():
  """Draw one random subset of reservoir state indices.

  Samples ``STATE_SIZE`` distinct values (no replacement) via
  ``np.random.choice`` using NumPy's global random state, so results are only
  reproducible when ``np.random.seed(...)`` is called beforehand.

  Assumes ``dataset.state_size()`` returns an integer count, in which case the
  values are drawn from ``range(dataset.state_size())`` -- TODO confirm.

  Returns:
    A 1-D array of length ``STATE_SIZE``.
  """
  return np.random.choice(dataset.state_size(), size=STATE_SIZE, replace=False)


# readout model
# Single-step pipeline: a ridge regressor fitted with an intercept. The step name
# 'ridge_regression' is the prefix referenced by the 'ridge_regression__alpha'
# key of model_param_grid.
# NOTE(review): alpha=1 is presumably only a starting value that the parameter
# grid search overrides -- confirm in RCPipeline / execute_pipeline.
readout_model = Pipeline([
  ('ridge_regression', Ridge(alpha=1, fit_intercept=True))
])
# Candidate regularisation strengths: 50 values evenly spaced on a log10 scale
# from 1e-4 to 1e2 (both endpoints included). The key targets the `alpha`
# parameter of the 'ridge_regression' pipeline step.
model_param_grid = [{
  'ridge_regression__alpha': np.logspace(-4, 2, num=50)
}]


# Regression task pipeline
# Settings common to every RCPipeline built in this notebook: each pipeline
# generator passes this dict to the RCPipeline constructor via
# **shared_pipeline_params.
datasets = (dataset,)  # one-element tuple: a single dataset is used here
run_ids = dataset.get_run_ids()
shared_pipeline_params = {
  # Data generation
  'datasets': datasets,
  'groups': GroupGenerator(day_length=24, run_ids=run_ids, days_between_runs=1),
  'time': TimeGenerator(day_length=24, run_ids=run_ids),
  
  # Model training and validation
  'readout_model': readout_model,
  'model_param_grid': model_param_grid,
  'model_scorer': nmse_scorer,
  'folds': GroupKFold(n_splits=5),  # 5-fold CV in which a group never spans two folds
  'train_test_split': TrainTestSplitter(block_size=4, test_ratio=0.5),
}

## Regression Pipelines

In [15]:
from pipeline_base import (
  RCPipeline, 
  Rescale, 
  DaylightMask
)

from pipeline_hydroshoot import (
  TargetGenerator, 
  SingleReservoirGenerator,
  MultiReservoirGenerator,
  TargetReservoirGenerator,
  GroupRescale
)


from model_config_hydroshoot import (
  targets, 
  measurable_reservoirs,
  baseline_reservoirs,
  heterogeneous_reservoirs, 
)

### Input and Physiological Tasks

In [26]:
# Every (target, single state variable) combination.
target_state_pairs = [
  (tgt, reservoir_var)
  for tgt in targets
  for reservoir_var in measurable_reservoirs
]
# Every (baseline name, target, environmental inputs) combination; the name
# comes first in the tuple although targets drive the outer loop.
target_env_pairs = [
  (label, tgt, env_inputs)
  for tgt in targets
  for label, env_inputs in baseline_reservoirs
]
# Every (reservoir name, target, state variables) combination.
target_het_pairs = [
  (label, tgt, reservoir_vars)
  for tgt in targets
  for label, reservoir_vars in heterogeneous_reservoirs
]

# Show the four configuration lists, one per line.
print(targets, measurable_reservoirs, baseline_reservoirs, heterogeneous_reservoirs, sep='\n')

['input_Tac', 'input_u', 'input_hs', 'input_Rg', 'output_Rg', 'output_An', 'output_E', 'output_Tleaf']
['state_An', 'state_Tlc', 'state_gs', 'state_E', 'state_Flux', 'state_psi_head']
[('env_all', ['input_Tac', 'input_u', 'input_hs', 'input_Rg']), ('env_temp', ['input_Tac']), ('env_humidity', ['input_hs']), ('env_PAR', ['input_Rg']), ('env_wind', ['input_u'])]
[('state_all', ('state_An', 'state_Tlc', 'state_gs', 'state_E', 'state_Flux', 'state_psi_head'))]


In [27]:
# Transform chain handed (as `transforms=`) to every pipeline built by
# generate_single, generate_multi and generate_env; the same list object is
# shared by all of them.
# NOTE(review): WarmupTransform presumably drops the first 4 days (24 steps
# each) of every run -- confirm in pipeline_base.
input_phys_transforms = [
  WarmupTransform(warmup_days=4, day_length=24),
  DirectTransform()
]

In [33]:
def generate_single(target_state_pairs, state_ids):
  """Lazily build one RCPipeline per (target, state variable) pair.

  Args:
    target_state_pairs: iterable of ``(target, state_var)`` tuples.
    state_ids: state indices forwarded to ``SingleReservoirGenerator``.

  Yields:
    An ``RCPipeline`` whose reservoir is a single state variable, preprocessed
    with ``DaylightMask`` followed by ``Rescale(per_feature=False)``. All other
    settings come from ``shared_pipeline_params``.
  """
  for tgt, var in target_state_pairs:
    meta = {'target_id': tgt, 'reservoir_id': var, 'dataset_id': DATASET_ID}
    # Data generation
    target_gen = TargetGenerator(target=tgt, run_ids=run_ids)
    reservoir_gen = SingleReservoirGenerator(state_var=var, run_ids=run_ids, state_ids=state_ids)
    # Data preprocessing
    preprocessing_steps = [
      DaylightMask(day_length=24, start=5, end=21),
      Rescale(per_feature=False),
    ]
    yield RCPipeline(
      metadata=meta,
      target=target_gen,
      reservoir=reservoir_gen,
      transforms=input_phys_transforms,
      preprocessing=preprocessing_steps,
      **shared_pipeline_params,
    )


def generate_multi(target_het_pairs, state_ids):
  """Lazily build one RCPipeline per (reservoir name, target, state variables) triple.

  Args:
    target_het_pairs: iterable of ``(name, target, state_vars)`` tuples.
    state_ids: state indices forwarded to ``MultiReservoirGenerator`` and
      ``GroupRescale``.

  Yields:
    An ``RCPipeline`` whose reservoir combines several state variables,
    preprocessed with ``DaylightMask`` followed by ``GroupRescale``. All other
    settings come from ``shared_pipeline_params``.
  """
  for label, tgt, reservoir_vars in target_het_pairs:
    meta = {'target_id': tgt, 'reservoir_id': label, 'dataset_id': DATASET_ID}
    # Data generation
    target_gen = TargetGenerator(target=tgt, run_ids=run_ids)
    reservoir_gen = MultiReservoirGenerator(state_vars=reservoir_vars, run_ids=run_ids, state_ids=state_ids)
    # Data preprocessing
    preprocessing_steps = [
      DaylightMask(day_length=24, start=5, end=21),
      GroupRescale(datasets=datasets, state_vars=reservoir_vars, state_ids=state_ids),
    ]
    yield RCPipeline(
      metadata=meta,
      target=target_gen,
      reservoir=reservoir_gen,
      transforms=input_phys_transforms,
      preprocessing=preprocessing_steps,
      **shared_pipeline_params,
    )


def generate_env(target_env_pairs):
  """Lazily build one baseline RCPipeline per (baseline name, target, inputs) triple.

  Args:
    target_env_pairs: iterable of ``(name, target, env_targets)`` tuples, where
      ``env_targets`` are passed to ``TargetReservoirGenerator`` as the
      reservoir signals.

  Yields:
    An ``RCPipeline`` preprocessed with ``DaylightMask`` followed by
    ``Rescale(per_feature=True)``. All other settings come from
    ``shared_pipeline_params``.
  """
  for label, tgt, env_inputs in target_env_pairs:
    meta = {'target_id': tgt, 'reservoir_id': label, 'dataset_id': DATASET_ID}
    # Data generation
    target_gen = TargetGenerator(target=tgt, run_ids=run_ids)
    reservoir_gen = TargetReservoirGenerator(targets=env_inputs, run_ids=run_ids)
    # Data preprocessing
    preprocessing_steps = [
      DaylightMask(day_length=24, start=5, end=21),
      Rescale(per_feature=True),
    ]
    yield RCPipeline(
      metadata=meta,
      target=target_gen,
      reservoir=reservoir_gen,
      transforms=input_phys_transforms,
      preprocessing=preprocessing_steps,
      **shared_pipeline_params,
    )

In [43]:
from itertools import chain


def generate_pipelines(state_ids):
  """Build the full list of pipelines for one sample of state indices.

  The returned list is ordered: single-variable reservoir pipelines first,
  then multi-variable reservoir pipelines, then the environmental baselines
  (which do not take ``state_ids``).
  """
  single_pipes = generate_single(target_state_pairs, state_ids)
  multi_pipes = generate_multi(target_het_pairs, state_ids)
  env_pipes = generate_env(target_env_pairs)
  # Unpacking drains the three generators one after another, in this order.
  return [*single_pipes, *multi_pipes, *env_pipes]


# Re-seed so this sample does not depend on earlier use of the global RNG.
np.random.seed(42)
state_ids = generate_state_sample()
pipelines = generate_pipelines(state_ids)
# Sanity check: number of pipelines built for a single state sample.
print(f'Total pipelines: {len(pipelines)}')

Total pipelines: 96


### Computational Benchmarks

In [44]:
# TODO: define the computational benchmark pipelines (nothing is implemented in this section yet)

## Execution

In [49]:
# Build the pipelines for the current `state_ids`; only their count is used
# afterwards (as n_pipelines) to size the progress bar of the execution cell.
pipes = generate_pipelines(state_ids)
n_pipelines = len(pipes)
print(n_pipelines)

96


In [50]:
from tqdm import tqdm
from itertools import chain
from pipeline_base import execute_pipeline


RESULTS_FILE = 'results_hydroshoot_input_phys_16sample.csv'

# Re-seed so the set of state samples is reproducible regardless of earlier RNG use.
np.random.seed(42)
all_state_samples = [generate_state_sample() for _ in range(N_STATE_SAMPLES)]

# One progress tick per (state sample, pipeline) combination.
total_loops = n_pipelines * N_STATE_SAMPLES
results = []


with tqdm(total=total_loops) as pbar:
  # The loop names are deliberately distinct from the notebook-level
  # `state_ids` and `pipelines`, so running this cell does not overwrite the
  # values that earlier cells defined and read.
  for i_state_sample, sample_state_ids in enumerate(all_state_samples):
    for pipeline in generate_pipelines(sample_state_ids):
      result = execute_pipeline(pipeline)
      result['state_sample'] = i_state_sample  # tag the record with its sample index
      results.append(result)
      pbar.update(1)


# `results` is a list of per-pipeline records, so the plain DataFrame
# constructor is the right entry point (from_dict is meant for dict input).
results_df = pd.DataFrame(results)
results_df.to_csv(RESULTS_FILE)
print(f'Saved scores to {RESULTS_FILE}')

100%|██████████| 1536/1536 [09:01<00:00,  2.84it/s]

Saved scores to results_hydroshoot_input_phys_16sample.csv



