## RC Demo single run all scores

Iterate upon the methods for plant RC experiments and focus on techniques for scaling to larger datasets.

- Just a replica of Olivier’s experiment
- Extend to larger datasets of multiple experiments

In [1]:
import sys, os
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt

sys.path.insert(1, os.path.join(sys.path[0], '../../'))  # for importing local packages from src

In [2]:
from model import load_experiment

# Load the experiment dataset from the CSV file below (path is relative to the notebook's working directory).
csv_path = '../datasets/hydroshoot_large_trimmed.csv'
dataset = load_experiment(csv_path)
# Print the dataset's text representation as a quick sanity check of what was loaded.
print(dataset)

### Selecting a single run to use

Base this on the original HydroShoot experiment: select the run that starts on `2012-08-01 00:00:00` because this is the original data shown in the HydroShoot paper.

In [22]:
%load_ext autoreload
%autoreload 2

import importlib
import sys

def reload_utils():
  """Re-executes the already imported `model` module so that edits to it take effect.

  Raises KeyError when `model` has not been imported yet.
  """
  loaded_model = sys.modules['model']
  importlib.reload(loaded_model)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
reload_utils()
from model import find_run

# Call the imported function directly: only `find_run` is bound by the import above,
# the bare name `model` is not, so `model.find_run(...)` fails on a fresh kernel.
run_id = find_run('2012-08-01', dataset)
print(f'We will use run {run_id} for this notebook.')

We will use run 58 for this notebook.


### Model definition

In [5]:
"""Define readout model pipeline and hyperparameter search grid."""

from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline

# Readout model: a pipeline with a single ridge-regression step (named so the grid can address its alpha).
readout = Pipeline([('ridge_regression', Ridge(alpha=1, fit_intercept=True))])

# Candidate regularisation strengths: 100 values evenly spaced in log10 between 1e-4 and 1e6.
log_alphas = np.linspace(np.log10(1e-4), np.log10(1e6), 100)
search_grid = [{'ridge_regression__alpha': 10 ** log_alphas}]

### Training and scoring code

In [6]:
import numpy as np
import pandas as pd

from tqdm import tqdm
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.base import BaseEstimator
from src.learning.preprocessing import generate_mask
from src.learning.scorers import nmse_scorer
from src.learning.training import perform_gridsearch


def scalar_normal_scale_df(df: pd.DataFrame) -> pd.DataFrame:
  """Standardises a frame using ONE mean and ONE std computed over all of its values.

  The statistics are scalars taken over the whole array (not per column), so the
  relative scale between columns is preserved. Index and column labels are kept.
  """
  values = df.to_numpy()
  standardised = (values - values.mean()) / values.std()
  return pd.DataFrame(standardised, index=df.index, columns=df.columns)
  

def scalar_normal_scale_series(series: pd.Series) -> pd.Series:
  """Standardises a series to zero mean and unit (population) standard deviation.

  The index is kept; the returned series is built without a name.
  """
  values = series.to_numpy()
  standardised = (values - values.mean()) / values.std()
  return pd.Series(standardised, index=series.index)


def split_train_test(array: np.ndarray, day_length: int) -> 'tuple[np.ndarray, np.ndarray]':
  """Splits the data in a train and test set by alternating days.

  The samples are first grouped per day:
  (samples, *state_vars) -> (day, day_samples, *state_vars)
  Days 0, 2, 4, ... form the train set and days 1, 3, 5, ... the test set.

  Returns the pair (train_days, test_days).

  Raises ValueError when day_length is not positive or does not evenly divide
  the number of samples.
  """
  n_samples = array.shape[0]
  if day_length <= 0 or n_samples % day_length != 0:
    raise ValueError(
      f'Cannot split {n_samples} samples into whole days of {day_length} samples.')
  # Use day_length explicitly instead of -1: an inferred dimension would silently
  # pick a different day length whenever the sample count is not a whole number of days.
  array_days = array.reshape((n_samples // day_length, day_length, *array.shape[1:]))
  return array_days[::2], array_days[1::2]


def flatten_data(*array: np.ndarray) -> np.ndarray:
  """Merges the leading (day, sample-in-day) axes of every array passed in:
  (n_days, day_length, *dims) -> (n_days * day_length, *dims)

  Returns a list with one flattened array per argument, in argument order.
  """
  return [grouped.reshape((-1, *grouped.shape[2:])) for grouped in array]


def get_state_random_subset(state: pd.DataFrame, state_size: int) -> pd.DataFrame:
  """Returns `state_size` distinct columns of `state`, picked with NumPy's global RNG.

  Columns are drawn without replacement and returned in the drawn order.
  """
  n_columns = state.shape[1]
  picked = np.random.choice(n_columns, size=state_size, replace=False)
  return state.iloc[:, picked]


def perform_N_fits(n_samples: int, dataset: 'ExperimentDataset', estimator: BaseEstimator,
                  search_grid: dict, run_id: int, target: str, state_var: str, state_size: int):
  """Randomly samples the state space n_samples times (may overlap between samples).
  Fits a model with Pieters et al. train-test split and grouping strategy, + discarding nighttime data.

  Parameters:
    n_samples:   number of random state subsets to draw; one fit and one test score each.
    dataset:     experiment dataset providing `get_state` and `get_target`.
    estimator:   model that is tuned and fitted for every state subset.
    search_grid: hyperparameter grid handed to `perform_gridsearch`.
    run_id:      id of the run whose state and target are used.
    target:      name of the target signal.
    state_var:   name of the state variable used as reservoir state.
    state_size:  number of state columns drawn (without replacement) per sample.

  Returns array of test scores, shape (n_samples,).
  """
  state = dataset.get_state(state_var, run_id).sort_index()
  # NOTE(review): the state is sort_index()-ed but the target is not; this assumes
  # get_target already returns rows in the matching order — confirm.
  target = dataset.get_target(target, run_id)

  # Normalize target and state data to zero mean and unit variance.
  state = scalar_normal_scale_df(state)
  target = scalar_normal_scale_series(target)

  # Apply daylight mask to discard night time samples.
  # The mask is tiled once per block of 24 samples (presumably one sample per hour — confirm).
  daylight_mask = generate_mask(5, 21)
  day_length = daylight_mask.sum()
  daylight_mask_run = np.tile(daylight_mask, target.shape[0] // 24)
  state = state.iloc[daylight_mask_run, :]
  target = target.iloc[daylight_mask_run]

  # Reshape target data and generate groups
  y_train, y_test = split_train_test(target.to_numpy(), day_length)

  # Assign CV grouping strategy: every training day is its own group, so each fold leaves one day out.
  folds = LeaveOneGroupOut()
  groups = np.arange(len(y_train)).repeat(day_length).reshape(y_train.shape)

  # Flatten group and target data
  y_train, y_test, groups = flatten_data(y_train, y_test, groups)

  test_scores = np.zeros((n_samples))

  for i_sample in tqdm(range(n_samples)):
    state_sample = get_state_random_subset(state, state_size)
    X_train, X_test = split_train_test(state_sample.to_numpy(), day_length)
    X_train, X_test = flatten_data(X_train, X_test)
    # Tune the estimator that was passed in (not the notebook-global `readout`).
    tuned_readout, _ = perform_gridsearch(estimator, X_train, y_train, groups, folds, search_grid, verbose=False)
    test_scores[i_sample] = nmse_scorer(tuned_readout, X_test, y_test)

  return test_scores

# Seed NumPy's global RNG so the random state subsets drawn inside perform_N_fits are repeatable.
np.random.seed(42)
scores = perform_N_fits(
  n_samples=16, dataset=dataset, estimator=readout, search_grid=search_grid,
  run_id=run_id, target='input_Rg', state_var='state_An', state_size=64,
)

100%|██████████| 16/16 [00:08<00:00,  1.90it/s]


In [7]:
mean_score, std_score = scores.mean(), scores.std()

# Half-width of the 95% confidence interval of the mean (normal approximation, z = 1.96).
# random sampler confidence interval: https://sphweb.bumc.bu.edu/otlt/mph-modules/bs/bs704_confidence_intervals/bs704_confidence_intervals_print.html
conf95_score = 1.96 * std_score / np.sqrt(len(scores))

print(f'test score: {mean_score:.3f} ± {conf95_score:.3f}')

test score: -0.698 ± 0.002


### Collecting data for every pairing

- We want to generate $T$ test scores for each pairing of target and state variable ($T$ is `N_SAMPLES` in the code below).

In [None]:
from collections import defaultdict
from tqdm import tqdm

# Experiment size: number of random state subsets per pairing, and columns per subset.
N_SAMPLES = 25
STATE_SIZE = 16

targets = dataset.get_targets()
variables = dataset.get_state_variables()

# Score cube indexed as [target, state variable, sample].
test_scores = np.zeros((len(targets), len(variables), N_SAMPLES))

for i_target, target in enumerate(targets):
  for i_var, var in enumerate(variables):
    print(f'({target}, {var})')
    pair_scores = perform_N_fits(N_SAMPLES, dataset, readout, search_grid, run_id, target, var, STATE_SIZE)
    test_scores[i_target, i_var, :] = pair_scores


In [21]:

# Per (target, state variable) pairing: negated mean score ± 95% confidence half-width.
for i_target, target in enumerate(targets):
  for i_var, var in enumerate(variables):
    tuple_scores = test_scores[i_target, i_var]
    score_mean = tuple_scores.mean()
    standard_error = tuple_scores.std() / np.sqrt(N_SAMPLES)
    scores_95conf = standard_error * 1.96
    print(f'{target:<16} | {var:<16}: {-score_mean:.3f} ± {scores_95conf:.3f}')

input_Tac        | state_An        : 0.782 ± 0.012
input_Tac        | state_E         : 0.721 ± 0.022
input_Tac        | state_Eabs      : 0.779 ± 0.005
input_Tac        | state_Ei        : 0.785 ± 0.004
input_Tac        | state_Flux      : 0.715 ± 0.015
input_Tac        | state_FluxC     : 0.775 ± 0.018
input_Tac        | state_Tlc       : 0.545 ± 0.012
input_Tac        | state_gb        : 0.870 ± 0.001
input_Tac        | state_gs        : 0.753 ± 0.019
input_Tac        | state_psi_head  : 0.720 ± 0.011
input_Tac        | state_u         : 0.902 ± 0.000
input_u          | state_An        : 0.916 ± 0.025
input_u          | state_E         : 0.973 ± 0.010
input_u          | state_Eabs      : 0.955 ± 0.017
input_u          | state_Ei        : 0.965 ± 0.014
input_u          | state_Flux      : 0.966 ± 0.012
input_u          | state_FluxC     : 0.903 ± 0.015
input_u          | state_Tlc       : 0.948 ± 0.008
input_u          | state_gb        : 0.689 ± 0.001
input_u          | state_gs    

In [28]:

# Collapse the (target, state variable) axes into one row axis; each row keeps its N_SAMPLES scores.
test_scores_table = test_scores.reshape(-1, N_SAMPLES)
print(f'{test_scores.shape} -> {test_scores_table.shape}')



(8, 11, 25) -> (88, 25)


In [37]:
test_scores_df = pd.DataFrame(test_scores_table)

# The flattened table is target-major: every target spans one consecutive row per state
# variable, so the target labels repeat in runs while the variable labels cycle.
targets_col = [target for target in targets for _ in range(len(variables))]
variables_col = variables * len(targets)

# Put the two label columns in front of the numbered score columns.
test_scores_df.insert(0, 'target', targets_col)
test_scores_df.insert(1, 'state_variable', variables_col)

test_scores_df.head(10)

Unnamed: 0,target,state_variable,0,1,2,3,4,5,6,7,...,15,16,17,18,19,20,21,22,23,24
0,input_Tac,state_An,-0.760317,-0.75996,-0.771159,-0.863952,-0.763795,-0.773623,-0.826801,-0.795141,...,-0.749038,-0.799609,-0.773265,-0.769829,-0.826512,-0.84257,-0.777824,-0.768252,-0.759944,-0.764784
1,input_Tac,state_E,-0.93009,-0.689037,-0.728648,-0.693773,-0.704166,-0.881873,-0.706255,-0.699777,...,-0.715987,-0.696443,-0.691448,-0.714793,-0.710691,-0.710226,-0.702552,-0.702413,-0.701105,-0.691715
2,input_Tac,state_Eabs,-0.775248,-0.760211,-0.786536,-0.770726,-0.787909,-0.773117,-0.765655,-0.791152,...,-0.792459,-0.779637,-0.769522,-0.766486,-0.784973,-0.775157,-0.771141,-0.798198,-0.770514,-0.780886
3,input_Tac,state_Ei,-0.804914,-0.779989,-0.779155,-0.786934,-0.773832,-0.79741,-0.772771,-0.778218,...,-0.784338,-0.788909,-0.779139,-0.778623,-0.791658,-0.778057,-0.776094,-0.792732,-0.783696,-0.760749
4,input_Tac,state_Flux,-0.703959,-0.713499,-0.710165,-0.71561,-0.695987,-0.692387,-0.70561,-0.698665,...,-0.721573,-0.725445,-0.693214,-0.699579,-0.703595,-0.703103,-0.718593,-0.73187,-0.896186,-0.70284
5,input_Tac,state_FluxC,-0.955925,-0.744357,-0.786487,-0.759415,-0.791661,-0.759661,-0.722842,-0.750689,...,-0.75791,-0.761974,-0.723181,-0.775759,-0.7546,-0.767431,-0.744699,-0.770101,-0.778229,-0.752096
6,input_Tac,state_Tlc,-0.521541,-0.556333,-0.547321,-0.570723,-0.532064,-0.548163,-0.532804,-0.55401,...,-0.547921,-0.536527,-0.520437,-0.544666,-0.528601,-0.559788,-0.522727,-0.553543,-0.671201,-0.565174
7,input_Tac,state_gb,-0.871071,-0.874426,-0.868231,-0.874001,-0.873401,-0.873531,-0.867268,-0.87205,...,-0.866756,-0.869793,-0.875839,-0.874713,-0.865666,-0.869078,-0.865257,-0.866041,-0.866405,-0.875543
8,input_Tac,state_gs,-0.772697,-0.696871,-0.726618,-0.81157,-0.723199,-0.831867,-0.704357,-0.816936,...,-0.757907,-0.731079,-0.879122,-0.798976,-0.781132,-0.738842,-0.694592,-0.682442,-0.68584,-0.736336
9,input_Tac,state_psi_head,-0.708819,-0.730024,-0.767883,-0.710394,-0.711923,-0.725855,-0.696039,-0.703334,...,-0.714873,-0.70737,-0.696209,-0.713373,-0.724529,-0.716994,-0.824313,-0.698239,-0.715189,-0.710845


In [38]:
# Write the labelled score table to a CSV file in the current working directory.
# index=False leaves out the row index, which carries no information here.
test_scores_df.to_csv('test_scores_single_2022_03_03.csv', index=False)