# Multirun experiment pipeline (WheatFspm)

The following notebook establishes a generalized pipeline for evaluating a computing reservoir against a given task, given multiple experimental runs of the same reservoir.


In [1]:
import os
import sys

import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

# Make local packages (e.g. `src`) importable: add the directory two levels
# above sys.path[0] to the module search path. `os` and `sys` have to be
# imported explicitly, otherwise this cell fails with a NameError on a fresh
# kernel. The membership check keeps sys.path free of duplicates when the
# cell is re-run.
_local_package_root = os.path.join(sys.path[0], '../../')
if _local_package_root not in sys.path:
    sys.path.insert(1, _local_package_root)  # for importing local packages from src

In [2]:
# Relative paths to the CSV exports of the three WheatFspm NEMA simulations
# (H0, H3 and H15); each one is loaded into an ExperimentDataset further down.
DATASET_NEMA_H0 = '../datasets/dataset_NEMA_NEMA_H0.csv'
DATASET_NEMA_H3 = '../datasets/dataset_NEMA_NEMA_H3.csv'
DATASET_NEMA_H15 = '../datasets/dataset_NEMA_NEMA_H15.csv'

### Loading the datasets

These datasets were collected and converted in the WheatFspm repository.

There are three simulations available in the WheatFspm repository that are usable for reservoir computing (RC) experiments: NEMA H0, H3 and H15.

We can try using these datasets in two different ways:

1. Treat every dataset as a separate plant, training a readout for each simulation run.
2. Concatenate the three datasets and treat them as the observed behavior of a single plant.

In [3]:
from src.model.rc_dataset import ExperimentDataset

# One ExperimentDataset per simulation CSV.
dataset_nema_h0 = ExperimentDataset(csv_path=DATASET_NEMA_H0)
dataset_nema_h3 = ExperimentDataset(csv_path=DATASET_NEMA_H3)
dataset_nema_h15 = ExperimentDataset(csv_path=DATASET_NEMA_H15)

# List of (name, dataset) tuples in the order H0, H3, H15; this is the
# `datasets` argument handed to generate_X_y_groups later on.
datasets = list(zip(
  ('NEMA_H0', 'NEMA_H3', 'NEMA_H15'),
  (dataset_nema_h0, dataset_nema_h3, dataset_nema_h15),
))

### Defining targets and observed state variables

These were selected in a previous notebook, `2022_03_23_wheatfspm_dataset_inspection.ipynb` and are defined in a config file for reuse among notebooks.

In [4]:
%reload_ext autoreload
%autoreload 2 

from model_config import targets, state_variables

print(f'Targets:')
for target in targets:
  print(f'\t- {target}')

print(f'\nState variables:')
for state_var in state_variables:
  print(f'\t- {state_var}')

Targets:
	- input_air_temperature
	- input_humidity
	- input_PARi
	- output__axes__Total_Transpiration
	- output__axes__C_exudated
	- output__axes__SAM_temperature
	- output__axes__delta_teq
	- output__axes__sum_respi_shoot
	- output__organ_roots__N_exudation

State variables:
	- state__An
	- state__Transpiration
	- state__S_Sucrose
	- state__Ts
	- state__gs
	- state__Ag
	- state__Tr
	- state__sucrose
	- state__Rd
	- state__sum_respi
	- state__Photosynthesis
	- state__PARa


### Data preprocessing, grouping and train-test splitting

The available datasets will be processed into 4 datasets:

- NEMA_H0
- NEMA_H3
- NEMA_H15
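- NEMA_COMBINED (concatenated as data from the same plant)

Note: the cells below call `generate_X_y_groups` with `combined_only=True` and `add_env=True`, so the printed output only lists the `combined` dataset and its `combined_env_*` variants.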

In [5]:
from src.learning.preprocessing import generate_mask


# Passed as `warmup_steps` to generate_X_y_groups in the cells below.
# NOTE(review): 4 * 24 presumably means 4 days of hourly samples — confirm.
WARMUP_STEPS = 4 * 24
# Passed as `day_mask` to generate_X_y_groups in the cells below.
# NOTE(review): presumably keeps hours 5..21 of each day, i.e. 16 samples per
# day, which would match the "groups of 16 samples" remark next to
# TRAIN_TEST_RATIO — confirm the bound semantics in generate_mask.
DAY_MASK = generate_mask(5, 21)

In [6]:
from wheatfspm_pipeline_utils import preprocess_data, group_by_day, train_test_split_alternating

# NOTE: everything below in this cell is commented-out legacy code. It is an
# inline draft of `generate_X_y_groups`; the version actually used by this
# notebook is imported from `wheatfspm_pipeline_utils` in the next cell and is
# called with extra keyword arguments (`warmup_steps`, `day_mask`,
# `combined_only`, `add_env`) that this draft does not have.
# TODO(review): delete this block once it is no longer needed for reference.

# from wheatfspm_pipeline_utils import direct_target_generator, direct_reservoir_generator
# from wheatfspm_pipeline_utils import preprocess_raw_X
# from model_config import max_time_step

# def generate_X_y_groups(datasets, target, state_var, target_generator, state_generator):
#   """Generates X, y and groups arrays for each dataset, plus a concatenated dataset.
#      NOTE: The groups in the concatenated dataset are such that the same calendar day is in the same group.

#      Also generates a baseline dataset where the reservoir is just a combination of all environmental inputs.
#   """
#   data = {}

#   # Preprocess the data for each dataset
#   for name, dataset in datasets:
#     target_data = next(target_generator(dataset, target, name))
#     reservoir_data = next(state_generator(dataset, state_var, name))
#     X_raw, y_raw = preprocess_data(target_data, reservoir_data, WARMUP_STEPS, DAY_MASK)
#     X, y = X_raw[0, :, :], y_raw[0, :]
#     groups = group_by_day(X, DAY_MASK)
#     data[name] = (X, y, groups)

#   # Generate the concatenated dataset
#   all_arrays = list(data.values())
#   X_combined = np.concatenate(list(map(lambda x : x[0], all_arrays)))
#   y_combined = np.concatenate(list(map(lambda x : x[1], all_arrays)))
#   groups_combined = np.concatenate(list(map(lambda x : x[2], all_arrays)))
#   data['combined'] = (X_combined, y_combined, groups_combined)

#   return data

In [7]:
from wheatfspm_pipeline_utils import generate_X_y_groups, direct_target_generator, direct_reservoir_generator


# Hand-picked (target, state variable) pair for a single preview run.
# NOTE(review): these are not targets[0] / state_variables[0]; per the lists
# printed above, those would be input_air_temperature and state__An.
TARGET = 'output__axes__C_exudated'
STATE_VAR = 'state__Ts'

# Same keyword arguments as the call inside the fitting loop further down,
# but for one fixed target / state variable.
preprocessed_data = generate_X_y_groups(
  datasets=datasets,
  target=TARGET,
  state_var=STATE_VAR,
  target_generator=direct_target_generator,
  state_generator=direct_reservoir_generator,
  warmup_steps=WARMUP_STEPS,
  day_mask=DAY_MASK,
  combined_only=True,
  add_env=True,
)

# Report array shapes and the number of distinct groups per generated dataset.
for name, (X, y, groups) in preprocessed_data.items():
  n_groups = len(np.unique(groups))
  print(
    f'{name}:\n'
    f'\tX: {X.shape}\n'
    f'\ty: {y.shape}\n'
    f'\tgroups: {n_groups} (shape {groups.shape})'
  )

combined:
	X: (1424, 10)
	y: (1424,)
	groups: 33 (shape (1424,))
combined_env_reservoir:
	X: (1424, 3)
	y: (1424,)
	groups: 33 (shape (1424,))
combined_env_temp_only:
	X: (1424, 1)
	y: (1424,)
	groups: 33 (shape (1424,))


### Model definition

- Readout model is a standard RidgeRegression model with intercept term and CV-tuned regularization strength $\alpha$.
- CV search grid is a progression of logarithmically spaced values for regularization strength $\alpha$.
- CV and testing metric is NMSE.

In [8]:
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline

from src.learning.scorers import nmse_scorer

# Readout model: a one-step pipeline wrapping ridge regression with an
# intercept term. The step name is what the search grid keys refer to.
readout = Pipeline(steps=[
  ('ridge_regression', Ridge(alpha=1, fit_intercept=True)),
])

# Hyperparameter grid: 50 log-spaced values of alpha between 1e-4 and 1e2.
search_grid = [
  {'ridge_regression__alpha': np.logspace(np.log10(1e-4), np.log10(1e2), num=50)},
]

# Metric used for cross-validation and for the final test score.
scorer = nmse_scorer

### Generating a manifest of all experiments to run

Currently we are only benchmarking direct target prediction, but in the future there will be other tasks generated from the base targets as well. These will be generated in this section.


In [9]:
# Ratio of training to testing samples (in groups of 16 samples belonging to the same day).
# Passed as `ratio` to train_test_split_alternating in the fitting loop.
TRAIN_TEST_RATIO = 1

# Number of consecutive groups that are taken together as training or testing groups.
# Passed as `blocks` to train_test_split_alternating in the fitting loop.
# e.g. for BLOCKS=4, the split is 1111 0000 1111 0000 ...
BLOCKS = 4

We will test different cross-validation folding strategies:

In [10]:
from sklearn.model_selection import GroupKFold


# Cross-validation strategies to benchmark. Each entry has:
#   strat_name     - label stored with the scores and used in the CSV filename
#   fold_generator - callable taking the training groups and returning the
#                    splitter handed to perform_gridsearch
STRATEGIES = [
  # Fixed-k alternatives, currently disabled:
  # dict(strat_name='3_folds', fold_generator=lambda groups: GroupKFold(n_splits=3)),
  # dict(strat_name='4_folds', fold_generator=lambda groups: GroupKFold(n_splits=4)),
  # dict(strat_name='5_folds', fold_generator=lambda groups: GroupKFold(n_splits=5)),
  dict(
    strat_name='LeaveOneOut',
    # One fold per distinct group, so every group is held out exactly once.
    fold_generator=lambda groups: GroupKFold(n_splits=len(np.unique(groups))),
  ),
]

### Fitting all readout functions

Process:
- For each fitting strategy:
  - For each target:
    - For each observed state variable:
      - For each dataset:
        1. Preprocess the data
        2. Fit for each dataset
        3. Store the resulting training, cross-validation and test scores.

In [11]:
from tqdm import tqdm
from src.learning.training import perform_gridsearch
from wheatfspm_pipeline_utils import train_test_split_alternating


# Pair every target / state variable name with the generator used to extract it.
TARGETS = [(target_name, direct_target_generator) for target_name in targets]
STATE_VARS = [(state_var, direct_reservoir_generator) for state_var in state_variables]

# Layout of the per-strategy score table; the first three columns form the index.
RESULT_INDEX = ['target', 'state_var', 'dataset']
RESULT_COLUMNS = RESULT_INDEX + [
    'test_score', 'train_mean', 'train_std', 'cv_mean', 'cv_std', 'strategy',
]

# Progress-bar total. This relies on `preprocessed_data` from the preview cell
# and assumes every generate_X_y_groups call below yields the same number of
# datasets (the combined_only / add_env flags are identical).
total_loops = len(STRATEGIES) * len(TARGETS) * len(STATE_VARS) * len(preprocessed_data)
print(f'Performing {total_loops} fits...')


models = {}
results = []


with tqdm(total=total_loops) as pbar:

    for strategy in STRATEGIES:
        # Start every strategy with fresh containers. Resetting here, rather
        # than after the CSV is written, keeps the fitted models and raw
        # scores of the last strategy available once the cell has finished.
        models = {}
        results = []

        for target_name, target_generator in TARGETS:
            for state_var, state_generator in STATE_VARS:

                # Preprocess data for model fit
                preprocessed_data = generate_X_y_groups(
                    datasets=datasets,
                    target=target_name,
                    state_var=state_var,
                    target_generator=target_generator,
                    state_generator=state_generator,
                    warmup_steps=WARMUP_STEPS,
                    day_mask=DAY_MASK,
                    combined_only=True,
                    add_env=True,
                )

                # For each dataset combination
                for dataset_name, (X, y, groups) in preprocessed_data.items():
                    train, test = train_test_split_alternating(X, y, groups, ratio=TRAIN_TEST_RATIO, blocks=BLOCKS)

                    # Fit the readout with a grid search over the training groups
                    X_train, y_train, groups_train = train
                    folds = strategy['fold_generator'](groups_train)
                    model, scores = perform_gridsearch(readout, X_train, y_train, groups_train, folds, search_grid, verbose=False)
                    (train_mean, train_std), (cv_mean, cv_std) = scores

                    # Score the fitted model on the held-out test groups
                    X_test, y_test, _ = test
                    test_score = scorer(model, X_test, y_test)
                    models[(target_name, state_var, dataset_name)] = model
                    results.append({
                        'target': target_name,
                        'state_var': state_var,
                        'dataset': dataset_name,
                        'test_score': test_score,
                        'train_mean': train_mean,
                        'train_std': train_std,
                        'cv_mean': cv_mean,
                        'cv_std': cv_std,
                        'strategy': strategy['strat_name']
                    })
                    pbar.update(1)

        # set_index returns a new frame, so its result must be kept; the
        # original call discarded it and the CSV was written with a plain
        # integer index. Passing `columns` keeps the frame well-formed even
        # when no fits were run.
        results_df = pd.DataFrame(results, columns=RESULT_COLUMNS).set_index(RESULT_INDEX)
        filename = f'scores_{strategy["strat_name"]}.csv'
        results_df.to_csv(filename)
        print(f'Saved scores to {filename}')


Performing 324 fits...


100%|██████████| 324/324 [01:55<00:00,  2.80it/s]

Saved scores to scores_LeaveOneOut.csv



