# Multirun experiment pipeline (HydroShoot)

The following notebook establishes a generalized pipeline for evaluating a computing reservoir against a given task, given multiple experimental runs of the same reservoir.


In [3]:
import os
import sys

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# `os` and `sys` must be imported here: the line below uses both, and without
# these imports the notebook fails with a NameError on a fresh kernel.
# The directory two levels above sys.path[0] is added to the module search
# path so that the local `src` packages can be imported by later cells.
sys.path.insert(1, os.path.join(sys.path[0], '../../'))  # for importing local packages from src

### Loading the datasets

Currently we are loading the HydroShoot dataset generated during the first semester.

In [4]:
# Relative path to the HydroShoot CSV file that is loaded into the experiment dataset.
DATASET_PATH = '../datasets/hydroshoot_large_trimmed.csv'

In [5]:
from src.model.rc_dataset import ExperimentDataset

# Build the dataset object from the CSV file; `dataset` is reused by the
# configuration, manifest and fitting cells further down.
dataset = ExperimentDataset(csv_path=DATASET_PATH)
# Printing the dataset shows its run/step/state-size summary and the
# available target and state variable names.
print(dataset)

Dataset properties:
	n_runs:      84
	n_steps:    168
	state_size: 360

Available targets: 
	input_Tac, input_u, input_hs, input_Rg, output_Rg, output_An, output_E, output_Tleaf

Available state variables: 
	state_An, state_E, state_Eabs, state_Ei, state_Flux, state_FluxC, state_Tlc, state_gb, state_gs, state_psi_head, state_u



### Defining targets and observed state variables

In [7]:
%reload_ext autoreload
%autoreload 2 

from model_config import targets, state_variables

print(f'Targets:')
for target in targets:
  print(f'\t- {target}')

print(f'\nState variables:')
for state_var in state_variables:
  print(f'\t- {state_var}')

Targets:
	- input_Tac
	- input_u
	- input_hs
	- input_Rg
	- output_Rg
	- output_An
	- output_E
	- output_Tleaf

State variables:
	- state_An
	- state_E
	- state_Eabs
	- state_Ei
	- state_Flux
	- state_FluxC
	- state_Tlc
	- state_gb
	- state_gs
	- state_psi_head
	- state_u


### Data preprocessing, grouping and train-test splitting

#### Preprocessing

In [16]:
from hydroshoot_pipeline_utils import preprocess_data

# Print the helper's docstring so the preprocessing steps are documented
# directly in the notebook output.
print(preprocess_data.__doc__)


    Preprocessing performed: 

    1. The target signal for each run is computed.
        - Target and reservoir are cast into a ndarray.
    2. Target and reservoir signals are trimmed.
        - A warmup mask is applied to target and reservoir.
        - A night-time mask is applied to target and reservoir.
    3. Target and reservoir are rescaled to zero-mean and unit variance
        - Normalizing transform is fitted on the entire dataset of included experiment runs.
    


#### Grouping strategy

In [18]:
from hydroshoot_pipeline_utils import group_by_day

# Print the helper's docstring so the grouping strategy is documented
# directly in the notebook output.
print(group_by_day.__doc__)

Simulation state from the same calendar day of simulation inputs, 
    across all runs, are grouped together per day. Shape of X is assumed to be (runs, time_steps, nodes)

    ```
    GROUP 1 | GROUP 2 | GROUP 3 | GROUP 4 | ...
    --------+---------+---------+---------+----
    sim1/d1  sim1/d2   sim1/d3   /         /
    /        sim2/d2   sim2/d3   sim2/d4   /       ...
    /        /         sim3/d3   sim3/d4   sim3/d5 
                                ...                ...
    ```
    


#### Train-test split

In [19]:
from hydroshoot_pipeline_utils import train_test_split_blocks

# Print the helper's docstring so the block-wise train/test split is
# documented directly in the notebook output.
print(train_test_split_blocks.__doc__)


    - Train-test splitting is done at group scope (i.e. by calendar day)
    - Training and testing ranges are chosen as contiguous blocks rather 
      than randomly selected.

    e.g. for `interval_length = 8` and `test_ratio = 0.25`, 
    the consecutive groups are assigned as follows:

    ```
    g1     g2      g3      g4      g5      g6      g7     g8                   
    ------+-------+-------+-------+-------+-------+------+------+
    Train | Train | Train | Train | Train | Train | Test | Test | ... (repeat)
    ```
    


### Model definition

- Readout model is a standard RidgeRegression model with intercept term and CV-tuned regularization strength $\alpha$.
- CV search grid is a progression of logarithmically spaced values for regularization strength $\alpha$.
- CV and testing metric is NMSE.

In [20]:
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GroupKFold

from src.learning.scorers import nmse_scorer

# Readout model: a single-step pipeline wrapping ridge regression with an
# intercept term. alpha=1 is only the starting value; the search grid below
# supplies the candidates that are presumably tried during grid search.
ridge_step = Ridge(fit_intercept=True, alpha=1)
readout = Pipeline(steps=[('ridge_regression', ridge_step)])

# Search grid: 50 logarithmically spaced regularization strengths, with
# exponents running evenly from log10(1e-4) to log10(1e2).
alpha_exponents = np.linspace(np.log10(1e-4), np.log10(1e2), 50)
search_grid = [{'ridge_regression__alpha': 10 ** alpha_exponents}]

# The same NMSE scorer is used for scoring the fitted model on the test set.
scorer = nmse_scorer

## Generating test results

In [27]:
from src.learning.preprocessing import generate_mask
from sklearn.model_selection import GroupKFold


# --- Train/test split -------------------------------------------------------
TRAIN_TEST_RATIO = 0.5  # 50% of samples are reserved for the test set.
BLOCK_SIZE = 4  # Train test split in blocks of 4: 1111 0000 1111 0000 ...


# --- Reservoir and runs -----------------------------------------------------
STATE_SIZE = 32                         # Number of reservoir nodes used as readouts
RUN_IDS = np.arange(dataset.n_runs())   # All runs are used


# --- Trimming ---------------------------------------------------------------
WARMUP_STEPS = 4 * 24                   # First 4 days of each simulation are discarded
# Mask built for the 5h-21h window. Its sum is later used as the number of
# retained steps per day, so samples outside the window (night-time) are the
# ones discarded. NOTE(review): confirm whether the 21h bound is inclusive.
DAY_MASK = generate_mask(5, 21)


def _group_kfold_factory(n_splits=None):
    """Return a callable that maps group labels to a GroupKFold splitter.

    With an integer `n_splits` the splitter always uses that many folds.
    With `n_splits=None` it uses one fold per distinct group label, i.e.
    each group is held out exactly once.
    """
    def make_folds(groups):
        fold_count = np.unique(groups).shape[0] if n_splits is None else n_splits
        return GroupKFold(n_splits=fold_count)
    return make_folds


# Cross-validation strategies compared in this experiment: fixed 3/4/5-fold
# group CV, followed by leave-one-group-out.
STRATEGIES = [
    {'strat_name': f'{fold_count}_folds', 'fold_generator': _group_kfold_factory(fold_count)}
    for fold_count in (3, 4, 5)
]
STRATEGIES.append({
    'strat_name': 'LeaveOneOut',
    'fold_generator': _group_kfold_factory(),
})

#### Generating a manifest of model combinations to be fitted:

In [34]:
from hydroshoot_pipeline_utils import direct_target_generator

# Target names are read from the dataset itself. The former aliased import of
# `input_targets` / `output_targets` from model_config was dead code: one alias
# was overwritten on the very next statement and the other (misspelled
# `output_target_Names`) was never read, so it has been removed.

# Input targets: pair every input variable name with the materialized list
# produced by direct_target_generator for the selected runs.
input_target_names = dataset.get_input_variables()
input_targets_runs = [list(direct_target_generator(dataset, name, RUN_IDS)) for name in input_target_names]
input_targets = list(zip(input_target_names, input_targets_runs))


# Output targets: same construction for the output variable names.
output_target_names = dataset.get_output_variables()
output_target_runs = [list(direct_target_generator(dataset, name, RUN_IDS)) for name in output_target_names]
output_targets = list(zip(output_target_names, output_target_runs))

# Manifest of (target name, per-run target list) pairs, inputs first.
TARGETS = all_targets = [*input_targets, *output_targets]
# State variables come from model_config (imported in the cell that lists
# the targets and state variables).
STATE_VARS = state_variables

In [37]:
from tqdm import tqdm
from src.learning.training import perform_gridsearch
from hydroshoot_pipeline_utils import direct_reservoir_generator


# One readout is fitted per (CV strategy, state variable, target) combination.
total_loops = len(STRATEGIES) * len(TARGETS) * len(STATE_VARS)
print(f'Performing {total_loops} fits...')

models, results = {}, []

with tqdm(total=total_loops) as pbar:
    for strategy in STRATEGIES:
        for state_var in STATE_VARS:
            # The reservoir only depends on the state variable, so it is built
            # once here and shared by every target in the inner loop.
            reservoir_run_list = list(
                direct_reservoir_generator(dataset, state_var, RUN_IDS, STATE_SIZE, random_state=42)
            )

            for target_name, target_generator in TARGETS:
                # Build the (X, y) pair for this target / reservoir combination,
                # applying the warmup and day masks.
                X, y = preprocess_data(
                    dataset, RUN_IDS, target_generator, reservoir_run_list, STATE_SIZE,
                    warmup_steps=WARMUP_STEPS, day_mask=DAY_MASK,
                )

                # Label samples by day, then split into train and test blocks.
                # DAY_MASK.sum() is presumably the number of retained steps per day.
                days_per_run = X.shape[1] // DAY_MASK.sum()
                groups = group_by_day(X, days_per_run)
                train, test = train_test_split_blocks(X, y, groups, TRAIN_TEST_RATIO, BLOCK_SIZE * 2)
                X_train, y_train, groups_train = train
                X_test, y_test, _ = test

                # Grid-search the readout with the folds of the current strategy.
                folds = strategy['fold_generator'](groups_train)
                model, scores = perform_gridsearch(
                    readout, X_train, y_train, groups_train, folds, search_grid, verbose=False
                )
                (train_mean, train_std), (cv_mean, cv_std) = scores

                # Score the selected model on the held-out test blocks and
                # record one result row for this fit.
                test_score = scorer(model, X_test, y_test)
                models[(target_name, state_var)] = model
                results.append({
                    'target': target_name,
                    'state_var': state_var,
                    'test_score': test_score,
                    'train_mean': train_mean,
                    'train_std': train_std,
                    'cv_mean': cv_mean,
                    'cv_std': cv_std,
                    'strategy': strategy['strat_name'],
                    'dataset': 'HydroShoot_large'
                })
                pbar.update(1)

        # Write the scores of this strategy to their own CSV file, then reset
        # the accumulators before the next strategy starts.
        results_df = pd.DataFrame(results)
        filename = f'scores_{strategy["strat_name"]}.csv'
        results_df.to_csv(filename)
        print(f'Saved scores to {filename}')
        models, results = {}, []


Performing 352 fits...


 25%|██▌       | 89/352 [00:14<00:41,  6.38it/s]

Saved scores to scores_3_folds.csv


 50%|█████     | 176/352 [00:28<00:29,  5.88it/s]

Saved scores to scores_4_folds.csv


 75%|███████▌  | 264/352 [00:45<00:17,  5.05it/s]

Saved scores to scores_5_folds.csv


100%|██████████| 352/352 [02:06<00:00,  2.79it/s]

Saved scores to scores_LeaveOneOut.csv



