## Convert old datasets to new storage format

This notebook is meant as a utility to convert old pickled data to CSV format.

In [1]:
import os
import sys

import numpy as np
import pandas as pd

# Make the parent of the notebook directory importable (for importing local packages from src).
# The membership check keeps the cell idempotent: re-running it no longer adds a
# duplicate entry to sys.path each time.
_parent_dir = os.path.join(sys.path[0], '..')
if _parent_dir not in sys.path:
  sys.path.insert(1, _parent_dir)

In [2]:
"""Configure data sources"""

# Root directory of the experiment results that load_runs reads. The location is
# machine-specific, so it can be overridden with the EXPERIMENT_ROOT environment variable
# instead of editing this cell, e.g.
#   EXPERIMENT_ROOT='/thesis-cyborg-plants/results/fixed_predawn_7days_jun_jul_aug'
EXPERIMENT_ROOT = os.environ.get(
  'EXPERIMENT_ROOT',
  '/media/mcndt/thesis/thesis-cyborg-plants/results/fixed_predawn_7days_jun_jul_aug',
)

In [3]:
"""Loading dataset"""
# Imported in this cell rather than at the top: the `src` package presumably only resolves
# after the sys.path adjustment made in the import cell.
from src.model.hydroshoot.hydroshoot_experiment import load_runs, HydroShootExperiment

# Fail early with an actionable message instead of a bare, empty AssertionError.
assert os.path.isdir(EXPERIMENT_ROOT), (
  f'EXPERIMENT_ROOT is not an existing directory: {EXPERIMENT_ROOT!r}'
)
runs = load_runs(EXPERIMENT_ROOT)
# The summary below (and later cells) index runs[0], so an empty result is an error.
assert len(runs) > 0, f'No runs were loaded from {EXPERIMENT_ROOT!r}'

# Quick summary of what was loaded, using the first run as a sample.
print(len(runs))
print(runs[0])
print('Targets:', runs[0].get_targets())
print('Variables:', runs[0].get_state_variables())

AssertionError: 

We also need to define a meta dataset structure to store experiments with multiple runs, offset in time.

In [12]:
"""Methods for converting data to DataFrame format (from previous notebook)"""

def _prefixed_rows_df(frame: pd.DataFrame, prefix: str, row_type: str) -> pd.DataFrame:
  """Return a relabelled copy of `frame` tagged with a constant 'type' column.

  Every column except 'time' is renamed to f'{prefix}_{col}', the index is replaced by a
  fresh 0..n-1 range, and a 'type' column holding `row_type` is inserted at position 1.
  `frame` itself is left untouched.
  """
  # Column labels are read directly; no need to slice (and copy) the data to get them.
  renamed = {col: f'{prefix}_{col}' for col in frame.columns if col != 'time'}
  result = frame.rename(columns=renamed).reset_index(drop=True)
  # Position 1 places 'type' right after the first column (assumed to be 'time' — TODO confirm).
  result.insert(1, 'type', row_type)
  return result


def HydroShoot_input_df(experiment: HydroShootExperiment):
  """Convert `experiment.inputs` to a frame of rows with type 'INPUT'.

  All columns other than 'time' get an 'input_' prefix; the index is reset.
  """
  return _prefixed_rows_df(experiment.inputs, 'input', 'INPUT')


def HydroShoot_output_df(experiment: HydroShootExperiment):
  """Convert `experiment.outputs` to a frame of rows with type 'OUTPUT'.

  All columns other than 'time' get an 'output_' prefix; the index is reset.
  """
  return _prefixed_rows_df(experiment.outputs, 'output', 'OUTPUT')


def HydroShoot_state_df(experiment: HydroShootExperiment) -> pd.DataFrame:
  """Flatten the per-element state series of an experiment into long-format rows.

  One row is produced per (step, state id) pair. Each row carries the 'time' value of
  `experiment.inputs` at that step, type 'STATE', the state id, a fixed state_type of
  'leaf', and one 'state_<variable>' column per state variable.

  The state ids are taken from the first variable only; this assumes every variable is
  keyed by the same ids (TODO confirm against the states container).

  Returns an empty frame with just the four base columns when the experiment has no
  state variables.
  """
  varnames = experiment.states.get_variables()
  if len(varnames) == 0:
    # Nothing to flatten; previously this case crashed on varnames[0].
    return pd.DataFrame(columns=['time', 'type', 'state_id', 'state_type'])

  # NOTE(review): this reaches into the private `_states` mapping, indexed as
  # [variable][state id][step].
  states = experiment.states._states
  state_ids = list(states[varnames[0]].keys())

  rows = []
  for step in range(experiment.n_steps()):
    # The timestamp only depends on the step, so look it up once per step
    # instead of once per state id.
    timestamp = experiment.inputs['time'].iloc[step]
    for state_id in state_ids:
      row = {'time': timestamp, 'type': 'STATE', 'state_id': state_id, 'state_type': 'leaf'}
      for varname in varnames:
        row[f'state_{varname}'] = states[varname][state_id][step]
      rows.append(row)

  # Build the frame once from the collected records.
  return pd.DataFrame(rows)

In [17]:
"""Method for creating a single DataFrame for an experiment, including run id"""

def HydroShoot_experiment_df(experiment: HydroShootExperiment, run_id=0) -> pd.DataFrame:
  """Stack the input, output and state rows of one experiment into a single frame.

  The three per-kind frames are concatenated in the order input, output, state; columns
  that a given kind does not have are left as NaN by the concatenation. A 'run_id' column
  holding `run_id` is inserted as the first column.

  Args:
    experiment: the experiment to convert.
    run_id: identifier written to every row of the result (default 0).

  Returns:
    The combined frame with a fresh 0..n-1 index.
  """
  parts = [
    HydroShoot_input_df(experiment),
    HydroShoot_output_df(experiment),
    HydroShoot_state_df(experiment),
  ]
  # ignore_index gives every row a unique label; without it each part restarts at 0,
  # so the combined frame carried repeated index labels.
  experiment_df = pd.concat(parts, ignore_index=True)
  experiment_df.insert(0, 'run_id', run_id)
  return experiment_df

# Try the conversion on the first loaded run (run id 0) and display its first row.
dataset_df = HydroShoot_experiment_df(experiment=runs[0], run_id=0)
dataset_df.head(n=1)

Unnamed: 0,run_id,time,type,input_Tac,input_u,input_hs,input_Rg,output_Rg,output_An,output_E,...,state_FluxC,state_Tlc,state_An,state_Flux,state_psi_head,state_u,state_E,state_Eabs,state_gs,state_par_photo.dHd
0,0,2012-06-01,INPUT,23.44,0.558,51.39,0.0,,,,...,,,,,,,,,,


In [32]:
"""Create a single dataframe for all experiment data across all runs."""
from typing import List

def HydroShoot_multirun_df(experiments: List[HydroShootExperiment]) -> pd.DataFrame:
  """Combine the rows of several experiment runs into one frame.

  Each experiment is converted with HydroShoot_experiment_df, using its position in
  `experiments` as the run id, and the per-run frames are concatenated under a single
  fresh 0..n-1 index.
  """
  per_run_frames = []
  for position, experiment in enumerate(experiments):
    per_run_frames.append(HydroShoot_experiment_df(experiment, run_id=position))
  combined = pd.concat(per_run_frames, ignore_index=True)
  return combined

# Convert every loaded run; a run's position in `runs` becomes its run_id.
dataset_df = HydroShoot_multirun_df(experiments=runs)

In [33]:
# Show the last five rows of the combined frame.
dataset_df.tail(n=5)

Unnamed: 0,run_id,time,type,input_Tac,input_u,input_hs,input_Rg,output_Rg,output_An,output_E,...,state_FluxC,state_Tlc,state_An,state_Flux,state_psi_head,state_u,state_E,state_Eabs,state_gs,state_par_photo.dHd
5108539,83,2012-06-16 23:00:00,STATE,,,,,,,,...,-0.009519,16.522982,-0.561885,2.0948349999999998e-20,-0.41647,0.367,6.863859000000001e-17,0.0,9.103505e-15,200.0
5108540,83,2012-06-16 23:00:00,STATE,,,,,,,,...,-0.008719,16.78607,-0.564809,0.0,-0.415877,0.367,0.0,0.0,-4.637179e-15,200.0
5108541,83,2012-06-16 23:00:00,STATE,,,,,,,,...,-0.007909,16.628627,-0.563778,1.2888859999999998e-19,-0.415554,0.367,5.099782e-16,0.0,6.654002e-14,200.0
5108542,83,2012-06-16 23:00:00,STATE,,,,,,,,...,-0.007401,16.832534,-0.57958,1.47707e-20,-0.415125,0.367,6.420931e-17,0.0,8.120019e-15,200.0
5108543,83,2012-06-16 23:00:00,STATE,,,,,,,,...,-0.006373,16.823023,-0.550544,1.3736559999999998e-19,-0.414652,0.367,6.586839e-16,0.0,8.342089e-14,200.0


In [34]:
# to_csv does not create missing parent directories, so make sure the target
# directory exists before writing.
output_csv = 'NEW_STORAGE/test_large.csv'
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
# index=False: the row index is not written to the file.
dataset_df.to_csv(output_csv, index=False)