## RC Demo single run all scores

This notebook iterates on the methods for plant RC experiments and focuses on techniques for scaling to larger datasets.

- Just a replica of Olivier’s experiment
- Extend to larger datasets of multiple experiments

In [2]:
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt

import os
import sys

# Register the directory two levels above sys.path[0] on the import path so
# that the local `src` packages can be imported by the cells below.
sys.path.insert(1, os.path.join(sys.path[0], '../../'))  # for importing local packages from src

In [None]:
from src.model.rc_dataset import ExperimentDataset 

# Relative path to the large HydroShoot CSV file.
csv_path = '../datasets/hydroshoot_large.csv'
# Load the experiment data; `dataset` is queried by the remaining cells.
dataset = ExperimentDataset(csv_path=csv_path)

In [6]:
# Summarise the loaded dataset: first the names of the available targets and
# state variables, then its size (runs, steps per run, state size).
# Both paragraphs go through one print call; `sep` supplies the line break
# that previously came from the end of the first print.
print(
  f'Available targets: \n\t{", ".join(dataset.get_targets())}\n'
  f'Available state variables: \n\t{", ".join(dataset.get_state_variables())}\n',
  f'Dataset properties:\n'
  f'\tn_runs:     {dataset.n_runs():>3}\n'
  f'\tn_steps:    {dataset.n_steps():>3}\n'
  f'\tstate_size: {dataset.state_size():>3}\n',
  sep='\n',
)

Available targets: 
	input_Tac, input_u, input_hs, input_Rg, output_Rg, output_An, output_E, output_Tleaf
Available state variables: 
	state_Ci, state_gb, state_Ei, state_FluxC, state_Tlc, state_An, state_Flux, state_psi_head, state_u, state_E, state_Eabs, state_gs, state_par_photo.dHd

Dataset properties:
	n_runs:      84
	n_steps:    168
	state_size: 360



### Selecting a single run to use

Base this on the original HydroShoot experiment: select the run that starts on `2012-08-01 00:00:00` because this is the original data shown in the HydroShoot paper.

In [7]:
def find_run(iso_date, dataset):
  """Return the index of the first run that starts on `iso_date`.

  A run matches when the string form of the first index entry of its
  'input_Tac' target starts with `iso_date`. Runs are checked in order and the
  search stops at the first match. Returns -1 when no run matches.
  """
  matching_runs = (
    run
    for run in range(dataset.n_runs())
    if str(dataset.get_target('input_Tac', run).index[0]).startswith(iso_date)
  )
  return next(matching_runs, -1)

run_id = find_run('2012-08-01', dataset)
# find_run reports "no match" as -1. Stop here instead of letting that
# sentinel be used as a run index by the cells below.
assert run_id >= 0, 'No run starting on 2012-08-01 was found in the dataset.'
print(f'We will use run {run_id} for this notebook.')

We will use run 58 for this notebook.


### Model definition

In [8]:
 """Define readout model pipeline and hyperparameter search grid."""

from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline

# Readout model: a pipeline with a single ridge-regression step. The step is
# named so that its hyperparameters can be addressed as `ridge_regression__<param>`.
readout = Pipeline(steps=[
  ('ridge_regression', Ridge(fit_intercept=True, alpha=1)),
])

# define search grid
# 100 logarithmically spaced ridge penalties between 1e-4 and 1e6.
search_grid = [{
  'ridge_regression__alpha': np.logspace(-4, 6, num=100)
}]

### Training and scoring code

In [17]:
import numpy as np
import pandas as pd

from tqdm import tqdm
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.base import BaseEstimator
from src.learning.preprocessing import generate_mask
from src.learning.scorers import nmse_scorer
from src.learning.training import perform_gridsearch


def scalar_normal_scale_df(df: pd.DataFrame) -> pd.DataFrame:
  """Scale a whole DataFrame to zero mean and unit variance.

  A single mean and standard deviation are computed over all values of `df`
  (not per column), so the relative scale between columns is preserved.
  Columns and index are carried over to the result.

  If all values are identical the standard deviation is zero; the data is then
  only centred (giving all zeros) instead of being divided by zero, which
  would turn every value into NaN. Non-finite input values are not handled and
  still propagate into the result.
  """
  array = df.to_numpy()
  mean = array.mean()
  std = array.std()
  if std == 0:
    # Constant input: skip the division, the centred values are already 0.
    std = 1.0
  array = (array - mean) / std
  return pd.DataFrame(array, columns=df.columns, index=df.index)
  

def scalar_normal_scale_series(series: pd.Series) -> pd.Series:
  """Scale a Series to zero mean and unit variance.

  Index and name of `series` are carried over to the result.

  If all values are identical the standard deviation is zero; the data is then
  only centred (giving all zeros) instead of being divided by zero, which
  would turn every value into NaN. Non-finite input values are not handled and
  still propagate into the result.
  """
  array = series.to_numpy()
  mean = array.mean()
  std = array.std()
  if std == 0:
    # Constant input: skip the division, the centred values are already 0.
    std = 1.0
  array = (array - mean) / std
  return pd.Series(array, index=series.index, name=series.name)


def split_train_test(array: np.ndarray, day_length: int) -> tuple:
  """Split the data into a train and a test set by alternating days.

  The first axis is cut into consecutive days of `day_length` samples:
  (samples, *dims) -> (days, day_length, *dims). Days 0, 2, 4, ... form the
  train set and days 1, 3, 5, ... the test set.

  Args:
    array: data with samples along the first axis.
    day_length: number of samples that make up one day.

  Returns:
    Tuple `(train_days, test_days)`, each of shape (n_days, day_length, *dims).

  Raises:
    ValueError: if `day_length` is not positive or the number of samples is
      not a whole multiple of `day_length`.
  """
  n_samples = array.shape[0]
  # Check explicitly: letting reshape infer the day length can silently
  # produce days of the wrong length when the sizes do not match.
  if day_length <= 0 or n_samples % day_length != 0:
    raise ValueError(
      f'Cannot split {n_samples} samples into whole days of {day_length} samples.')
  n_days = n_samples // day_length
  array_days = array.reshape((n_days, day_length, *array.shape[1:]))
  return array_days[::2], array_days[1::2]


def flatten_data(*array: np.ndarray) -> np.ndarray:
  """Merge the day axis and the within-day axis of every given array.

  Each input is reshaped from (n_days, day_length, *dims) to
  (n_days * day_length, *dims). The results are returned as a list, in the
  order the arrays were passed in.
  """
  return [grouped.reshape((-1, *grouped.shape[2:])) for grouped in array]


def get_state_random_subset(state: pd.DataFrame, state_size: int) -> pd.DataFrame:
  """Return `state_size` distinct, randomly chosen columns of `state`.

  Column positions are drawn without replacement from NumPy's global random
  generator, so the result depends on the current `np.random` seed. Columns
  appear in the order they were drawn.
  """
  n_columns = state.shape[1]
  picked = np.random.choice(n_columns, size=state_size, replace=False)
  return state.iloc[:, picked]


def perform_N_fits(n_samples: int, dataset: ExperimentDataset, estimator: BaseEstimator,
                  search_grid: dict, run_id: int, target: str, state_var: str, state_size: int):
  """Fit `estimator` on `n_samples` random state subsets and return the test scores.

  Randomly samples the state space n_samples times (may overlap between samples).
  Fits a model with Pieters et al. train-test split and grouping strategy, + discarding nighttime data.

  For every sample, `state_size` random columns of the state are drawn, the
  estimator is tuned on the train days with leave-one-day-out cross-validation
  and the tuned model is scored on the test days.

  Args:
    n_samples: number of random state subsets to draw and fit.
    dataset: source of the state and target data.
    estimator: model that is tuned and fitted for every subset.
    search_grid: hyperparameter grid, forwarded unchanged to `perform_gridsearch`.
    run_id: run whose data is used.
    target: name of the target to predict.
    state_var: name of the state variable used as model input.
    state_size: number of state columns in each random subset.

  Returns:
    Array of shape (n_samples,) holding the `nmse_scorer` test score of each fit.
  """
  state = dataset.get_state(state_var, run_id).sort_index()
  target_data = dataset.get_target(target, run_id)

  # Normalize target and state data to zero mean and unit variance.
  state = scalar_normal_scale_df(state)
  target_data = scalar_normal_scale_series(target_data)

  # Apply daylight mask to discard night time samples.
  # NOTE(review): the mask is tiled once per 24 samples, so it presumably holds
  # one boolean per hour of the day - confirm against generate_mask.
  daylight_mask = generate_mask(5, 21)
  day_length = daylight_mask.sum()
  daylight_mask_run = np.tile(daylight_mask, target_data.shape[0] // 24)
  state = state.iloc[daylight_mask_run, :]
  target_data = target_data.iloc[daylight_mask_run]

  # Reshape target data into days; alternate days go to train and test.
  y_train, y_test = split_train_test(target_data.to_numpy(), day_length)

  # Assign CV grouping strategy: every train day is its own group, so
  # LeaveOneGroupOut holds out one day per fold.
  folds = LeaveOneGroupOut()
  groups = np.arange(len(y_train)).repeat(day_length).reshape(y_train.shape)

  # Flatten group and target data
  y_train, y_test, groups = flatten_data(y_train, y_test, groups)

  test_scores = np.zeros(n_samples)

  for i_sample in tqdm(range(n_samples)):
    state_sample = get_state_random_subset(state, state_size)
    X_train, X_test = split_train_test(state_sample.to_numpy(), day_length)
    X_train, X_test = flatten_data(X_train, X_test)
    # Tune the estimator that was passed in, not the notebook-level `readout`.
    tuned_estimator, _ = perform_gridsearch(estimator, X_train, y_train, groups, folds, search_grid, verbose=False)
    test_scores[i_sample] = nmse_scorer(tuned_estimator, X_test, y_test)

  return test_scores

# Trial run for a single pairing: seed NumPy's global generator so the random
# state subsets are repeatable, then fit 16 subsets of 64 `state_An` columns
# against the `input_Rg` target.
np.random.seed(42)
perform_N_fits(
  n_samples=16,
  dataset=dataset,
  estimator=readout,
  search_grid=search_grid,
  run_id=run_id,
  target='input_Rg',
  state_var='state_An',
  state_size=64,
)

100%|██████████| 16/16 [00:03<00:00,  4.44it/s]


array([-0.69123517, -0.69736065, -0.70161125, -0.70625365, -0.69400503,
       -0.69188041, -0.7048022 , -0.69645837, -0.70032674, -0.70454831,
       -0.70180841, -0.69469726, -0.69486581, -0.69761427, -0.68930291,
       -0.70204317])

### Collecting data for every pairing

- We want to generate $T$ test scores (`N_SAMPLES` in the code below) for each pairing of target and state variable.

In [20]:
from collections import defaultdict
from tqdm import tqdm

N_SAMPLES = 25
STATE_SIZE = 16

targets = dataset.get_targets()
variables = dataset.get_state_variables()

# One vector of N_SAMPLES test scores per (target, state variable) pairing.
# np.zeros allocates an array of the given shape; np.array on the shape tuple
# would only produce a 3-element vector holding the sizes themselves.
test_scores = np.zeros((len(targets), len(variables), N_SAMPLES))

# Wrap the sequence itself rather than the enumerate object, so that tqdm
# knows the total and can show real progress.
for i_target, target in enumerate(tqdm(targets)):
  for i_var, var in enumerate(variables):
    test_scores[i_target, i_var, :] = perform_N_fits(N_SAMPLES, dataset, readout, search_grid, run_id, target, var, STATE_SIZE)


  x = asanyarray(arr - arrmean)
  array = (array - mean) / std
  0%|          | 0/25 [00:00<?, ?it/s]
0it [00:00, ?it/s]


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').