## New dataset storage format and loader class

All data for an experiment will be cast into a **single CSV file**.  Each row starts with a **type column** whose value is one of `INPUT | OUTPUT | STATE` (`STATE` rows hold the reservoir observations). Each row also has the following columns, which are common to all types:

- `seq_id`: unique identifier of the experiment
- `run_id`: unique id within the experiment of the simulation run
- `time`: record time of this data entry

The other populated columns are dependent on the value of the type column:

- `input_{VARNAME}` for INPUT type variables.
- `output_{VARNAME}` for OUTPUT type variables.
- `state_{VARNAME}` for reservoir state variables.
- `state_id` for a unique identifier of the observed point (e.g. leaf_01)
- `state_type` for metadata about the observed state (e.g. leaf, stem, …)


In [103]:
import pandas as pd
import numpy as np

### Restructuring existing HydroShoot data

Need to save time!

Pragmatic solution for HydroShoot: load the existing data from process_experiment, then rearrange it into the new format and store it.

Hydroshoot experiment format -> Generalized intermediate representation -> Final representation -> CSV

In [9]:
# `sys` and `os` are needed for the path manipulation below; import them here so
# the cell does not depend on an earlier cell having done so.
import os
import sys

# Add the parent of the first `sys.path` entry to the module search path.
sys.path.insert(1, os.path.join(sys.path[0], '..'))  # for importing local packages from src
from src.model.hydroshoot_experiment import HydroShootExperiment

# Results directory handed to the existing HydroShoot loader.
test_dir = 'results/gdc_can1_grapevine'

hs_data = HydroShootExperiment(test_dir)

Reformatting input data

In [35]:
def HydroShoot_input_df(experiment: HydroShootExperiment):
  """Return the experiment inputs as INPUT-typed rows.

  Every column of `experiment.inputs` except `time` is renamed with an
  `input_` prefix, the index is replaced by a fresh 0..n-1 range, and a
  constant `type` column holding 'INPUT' is inserted at position 1.
  The frame stored on the experiment is left untouched (`rename` returns a
  new frame).
  """
  source = experiment.inputs
  prefixed = {name: f'input_{name}' for name in source.columns if name != 'time'}
  reformatted = source.rename(columns=prefixed).reset_index(drop=True)
  reformatted.insert(1, 'type', 'INPUT')
  return reformatted

# Preview the first rows of the reformatted input table.
print(HydroShoot_input_df(hs_data).head())

                 time   type  input_Tac  input_u  input_hs  input_Rg
0 2012-08-01 00:00:00  INPUT      20.74    0.030     53.70       0.0
1 2012-08-01 01:00:00  INPUT      20.47    0.097     55.87       0.0
2 2012-08-01 02:00:00  INPUT      20.07    0.334     58.04       0.0
3 2012-08-01 03:00:00  INPUT      19.60    0.378     61.50       0.0
4 2012-08-01 04:00:00  INPUT      18.93    0.198     66.18       0.0


Reformatting output data

In [36]:
# Renaming the columns
def HydroShoot_output_df(experiment: HydroShootExperiment):
  """Return the experiment outputs as OUTPUT-typed rows.

  All columns of `experiment.outputs` other than `time` receive an
  `output_` prefix. The result has a fresh 0..n-1 index and a constant
  `type` column equal to 'OUTPUT' inserted at position 1.
  """
  original = experiment.outputs

  # Build the old-name -> new-name mapping, leaving `time` as it is.
  renames = {}
  for column in original.columns:
    if column != 'time':
      renames[column] = f'output_{column}'

  table = original.rename(columns=renames)
  table = table.reset_index(drop=True)
  table.insert(1, 'type', 'OUTPUT')
  return table

# Preview the first rows of the reformatted output table.
print(HydroShoot_output_df(hs_data).head())

                 time    type  output_Rg  output_An      output_E  \
0 2012-08-01 00:00:00  OUTPUT        0.0  -1.960767  4.164749e-11   
1 2012-08-01 01:00:00  OUTPUT        0.0  -2.153141  5.554039e-11   
2 2012-08-01 02:00:00  OUTPUT        0.0  -2.275122  5.372938e-11   
3 2012-08-01 03:00:00  OUTPUT        0.0  -2.224711  5.707320e-11   
4 2012-08-01 04:00:00  OUTPUT        0.0  -2.063881  4.088476e-11   

   output_Tleaf  
0     16.620123  
1     17.877212  
2     18.561438  
3     18.195865  
4     17.112675  


Reformatting reservoir state data

In [34]:
# Report what the loaded experiment's state container holds: the variable
# names, the state size and the number of steps.
print(f'State variables: {hs_data.states.get_variables()}')
print(f'State size: {hs_data.states.state_size()}')
print(f'Steps: {hs_data.states.n_steps()}')

State variables: ('u', 'gs', 'E', 'An', 'Ei', 'Flux', 'Tlc', 'gb', 'psi_head', 'FluxC', 'Eabs', 'Ci', 'par_photo.dHd')
State size: 360
Steps: 96


In [37]:
"""
Final shape should be of form: 
| time | state_id | state_type | state_{VARNAME} |
"""

def HydroShoot_state_df(experiment: HydroShootExperiment):
  """Flatten the reservoir state of an experiment into STATE-typed rows.

  One row is produced per (step, state id) pair, iterating over steps first
  and state ids second. Each row holds the `time` entry of
  `experiment.inputs` at that step position, the constant type 'STATE', the
  state id, the constant state type 'leaf', and one `state_{VARNAME}` value
  per variable reported by `experiment.states.get_variables()`.
  """
  variables = experiment.states.get_variables()
  store = experiment.states._states
  # The ids recorded for the first variable are used for every variable.
  observed_ids = list(store[variables[0]].keys())
  n_steps = experiment.n_steps()

  records = [
    {
      'time': experiment.inputs['time'].iloc[step],
      'type': 'STATE',
      'state_id': state_id,
      'state_type': 'leaf',
      **{f'state_{name}': store[name][state_id][step] for name in variables},
    }
    for step in range(n_steps)
    for state_id in observed_ids
  ]
  return pd.DataFrame(records)

# Preview the first rows of the reformatted state table. The loaded experiment
# is bound to `hs_data`; no top-level name `experiment` is defined in this file.
print(HydroShoot_state_df(hs_data).head())

        time   type  state_id state_type  state_u      state_gs       state_E  \
0 2012-08-01  STATE       500       leaf     0.03 -3.233394e-14  0.000000e+00   
1 2012-08-01  STATE       503       leaf     0.03 -1.057920e-13  0.000000e+00   
2 2012-08-01  STATE       505       leaf     0.03 -3.648206e-14  0.000000e+00   
3 2012-08-01  STATE       507       leaf     0.03  3.847517e-14  2.946443e-16   
4 2012-08-01  STATE       508       leaf     0.03  5.148654e-14  4.060563e-16   

   state_An  state_Ei    state_Flux  state_Tlc  state_gb  state_psi_head  \
0 -0.516222       0.0  0.000000e+00  17.860072  0.230126       -0.204458   
1 -0.498186       0.0  0.000000e+00  17.934616  0.229895       -0.204572   
2 -0.526013       0.0  0.000000e+00  18.465255  0.201632       -0.204345   
3 -0.513748       0.0  2.319772e-20  18.197775  0.181748       -0.204253   
4 -0.537541       0.0  1.392108e-20  18.373517  0.201632       -0.204361   

   state_FluxC  state_Eabs      state_Ci  state_par_phot

Combining into a single dataframe and dumping to CSV

In [55]:
import os

# Convert each part of the experiment to the row-typed layout.
input_df = HydroShoot_input_df(hs_data)
output_df = HydroShoot_output_df(hs_data)
state_df = HydroShoot_state_df(hs_data)

# Stack the three tables; columns a row type does not populate are filled with NaN.
experiment_df = pd.concat([input_df, output_df, state_df])

# `to_csv` does not create missing directories, so make sure the target exists.
os.makedirs('NEW_STORAGE', exist_ok=True)
experiment_df.to_csv('NEW_STORAGE/test.csv', index=False)


### Loading dataset from CSV

In [57]:
# Path of the combined dataset CSV to read back.
csv_path = 'NEW_STORAGE/test.csv'

dataset_df = pd.read_csv(csv_path)
# Count the non-null entries of every column, separately for each row type.
dataset_df.groupby('type').count()

Unnamed: 0_level_0,time,input_Tac,input_u,input_hs,input_Rg,output_Rg,output_An,output_E,output_Tleaf,state_id,...,state_An,state_Ei,state_Flux,state_Tlc,state_gb,state_psi_head,state_FluxC,state_Eabs,state_Ci,state_par_photo.dHd
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
INPUT,96,96,96,96,96,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OUTPUT,96,0,0,0,0,96,96,96,96,0,...,0,0,0,0,0,0,0,0,0,0
STATE,34560,0,0,0,0,0,0,0,0,34560,...,34560,34560,34560,34560,34560,34560,34560,34560,34560,34560


In [171]:
class ExperimentRun:
  """Wrapper class for handling a dataset from a single RC experiment run.

  The dataset is read from one CSV file whose `type` column marks every row
  as 'INPUT', 'OUTPUT' or 'STATE'. The rows are split by type into three
  frames; columns that are entirely empty for a type are dropped from that
  type's frame.
  """

  def __init__(self, path: str):
    """Load an experiment dataset from the CSV file at `path`."""
    self._inputs: pd.DataFrame = None
    self._outputs: pd.DataFrame = None
    self._state: pd.DataFrame = None
    self.load_data(path)

  @staticmethod
  def _rows_of_type(dataset_df: pd.DataFrame, row_type: str) -> pd.DataFrame:
    """Return an independent copy of the `row_type` rows without all-empty columns."""
    subset = dataset_df[dataset_df['type'] == row_type].dropna(how='all', axis=1)
    # Copy so that later column assignments never write into a slice of `dataset_df`.
    return subset.copy()

  def load_data(self, path: str):
    """Read the CSV at `path` and split it into input, output and state frames.

    Raises:
      AssertionError: if the number of INPUT rows differs from the number of
        OUTPUT rows.
    """
    # Read from the argument, not from a module-level variable.
    dataset_df = pd.read_csv(path)
    self._inputs = self._rows_of_type(dataset_df, 'INPUT')
    self._outputs = self._rows_of_type(dataset_df, 'OUTPUT')
    self._state = self._rows_of_type(dataset_df, 'STATE')
    # `state_id` is empty on non-STATE rows, so it is cast back to int here.
    # The column is absent when the file has no STATE rows.
    if 'state_id' in self._state.columns:
      self._state['state_id'] = self._state['state_id'].astype(int)
    assert (len(self._inputs) == len(self._outputs)), 'Input and output set have different lengths.'

  def get_input_variables(self) -> tuple:
    """Get the input keys available (input frame columns starting with `input_`)."""
    return tuple(col for col in self._inputs.columns if col.startswith('input_'))

  def get_output_variables(self) -> tuple:
    """Get the output keys available (output frame columns starting with `output_`)."""
    return tuple(col for col in self._outputs.columns if col.startswith('output_'))

  def get_targets(self) -> tuple:
    """Get the target keys available: all input keys followed by all output keys."""
    return (*self.get_input_variables(), *self.get_output_variables())

  def get_state_variables(self) -> tuple:
    """Get the state variables available.

    These are the state frame columns starting with `state_`, excluding the
    metadata columns `state_id` and `state_type`.
    """
    metadata = ('state_id', 'state_type')
    return tuple(
      col for col in self._state.columns
      if col.startswith('state_') and col not in metadata
    )

  def n_steps(self) -> int:
    """Return the number of INPUT rows."""
    return len(self._inputs)

  def state_size(self) -> int:
    """Return the number of distinct `state_id` values (0 if there are no STATE rows)."""
    if 'state_id' not in self._state.columns:
      return 0
    return self._state['state_id'].nunique()

  def get_target(self, target_key) -> pd.Series:
    """Get a target signal as pandas Series by the target key.

    The returned Series is indexed by the `time` column of the frame it comes
    from. It is built from a re-indexed copy, so the stored frames keep their
    own index.

    Raises:
      AssertionError: if `target_key` is not one of `get_targets()`.
    """
    assert (target_key in self.get_targets()), f"{target_key} not in available targets."
    source = self._inputs if target_key.startswith('input_') else self._outputs
    return source.set_index('time')[target_key]

  def get_state(self, state_key) -> pd.DataFrame:
    """Get the entire reservoir state of a variable as pandas DataFrame by the state key.

    The result has one row per `time` value and one column per `state_id`.

    Raises:
      AssertionError: if `state_key` is not one of `get_state_variables()`.
    """
    assert (state_key in self.get_state_variables()), f"{state_key} not in available state variables."
    return self._state.pivot(index='time', columns=['state_id'], values=state_key)




    
# Exercise the loader: load the CSV, then print the shapes of the three
# internal frames, the discovered variable names, the step count and state
# size, and the shapes of one input target, one output target and one state.
dataset = ExperimentRun(csv_path)  
print(dataset._inputs.shape)
print(dataset._outputs.shape)
print(dataset._state.shape)
print()
print(f'Inputs: {dataset.get_input_variables()}')
print(f'Outputs: {dataset.get_output_variables()}')
print(f'Targets: {dataset.get_targets()}')
print(f'State: {dataset.get_state_variables()}')
print(f'n_steps: {dataset.n_steps()}')
print(f'state_size: {dataset.state_size()}')
print()
print('Input target:', dataset.get_target('input_Tac').shape)
print('Output target:', dataset.get_target('output_An').shape)
print('Reservoir state:', dataset.get_state('state_E').shape)






(96, 6)
(96, 6)
(34560, 17)

Inputs: ('input_Tac', 'input_u', 'input_hs', 'input_Rg')
Outputs: ('output_Rg', 'output_An', 'output_E', 'output_Tleaf')
Targets: ('input_Tac', 'input_u', 'input_hs', 'input_Rg', 'output_Rg', 'output_An', 'output_E', 'output_Tleaf')
State: ('state_u', 'state_gs', 'state_E', 'state_An', 'state_Ei', 'state_Flux', 'state_Tlc', 'state_gb', 'state_psi_head', 'state_FluxC', 'state_Eabs', 'state_Ci', 'state_par_photo.dHd')
n_steps: 96
state_size: 360

Input target: (96,)
Output target: (96,)
Reservoir state: (96, 360)
