In [1]:
import os
import sys

import numpy as np
import pandas as pd

# Put the parent of the first sys.path entry on the import path so local
# packages from src can be imported. `os` and `sys` must be imported here:
# on a fresh kernel nothing else brings them into scope.
src_parent = os.path.join(sys.path[0], '..')
if src_parent not in sys.path:
    # Guarded so that re-running this cell does not stack duplicate entries.
    sys.path.insert(1, src_parent)  # for importing local packages from src

In [3]:
# Location of the experiment CSV, relative to the notebook's working directory.
csv_path = "NEW_STORAGE/test_large.csv"
# Read the full table into memory; later cells refer to it as `dataset_df`.
dataset_df = pd.read_csv(filepath_or_buffer=csv_path)

In [13]:
# Display the column index of the loaded frame to see which fields it contains.
dataset_df.columns

Index(['run_id', 'time', 'type', 'input_Tac', 'input_u', 'input_hs',
       'input_Rg', 'output_Rg', 'output_An', 'output_E', 'output_Tleaf',
       'state_id', 'state_type', 'state_Ci', 'state_gb', 'state_Ei',
       'state_FluxC', 'state_Tlc', 'state_An', 'state_Flux', 'state_psi_head',
       'state_u', 'state_E', 'state_Eabs', 'state_gs', 'state_par_photo.dHd'],
      dtype='object')

In [39]:
class ExperimentDataset:
    """Wrapper class for handling a dataset from a single RC experiment run.

    The source table is in long format: its ``type`` column tags every row as
    ``"INPUT"``, ``"OUTPUT"`` or ``"STATE"``. On load the table is split into
    one frame per row type, and columns that are entirely empty for that row
    type are dropped.
    """

    def __init__(self, dataset_df=None, csv_path=None):
        """Load an experiment dataset from a DataFrame or from a csv file.

        Args:
            dataset_df: Already-loaded table. Takes precedence over
                ``csv_path`` when both are given.
            csv_path: Source handed to ``pd.read_csv`` when ``dataset_df``
                is not given.

        Raises:
            ValueError: If neither ``dataset_df`` nor ``csv_path`` is given.
        """
        self._inputs: pd.DataFrame = None
        self._outputs: pd.DataFrame = None
        self._state: pd.DataFrame = None

        if dataset_df is not None:
            self.load_data(dataset_df)
        elif csv_path is not None:
            self.load_dataframe(csv_path)
        else:
            # ValueError subclasses Exception, so existing handlers still match.
            raise ValueError('Must set kwarg "dataset_df" or "csv_path"')

    def load_dataframe(self, csv_path):
        """Read ``csv_path`` with ``pd.read_csv`` and load the resulting table."""
        df = pd.read_csv(csv_path)
        self.load_data(df)

    def load_data(self, dataset_df):
        """Split ``dataset_df`` by its ``type`` column into inputs, outputs and state.

        The instance is only updated after the length check passes.

        Raises:
            AssertionError: If the number of INPUT rows differs from the
                number of OUTPUT rows.
        """

        def rows_of_type(row_type):
            # Keep rows of one type and drop columns that are empty for them.
            selected = dataset_df[dataset_df["type"] == row_type]
            return selected.dropna(how="all", axis=1)

        inputs = rows_of_type("INPUT")
        outputs = rows_of_type("OUTPUT")
        state = rows_of_type("STATE")

        # Without STATE rows the "state_id" column is dropped as all-empty,
        # so only cast it when it survived. `assign` builds a new frame rather
        # than writing a column into a filtered one in place.
        if "state_id" in state.columns:
            state = state.assign(state_id=state["state_id"].astype(int))

        assert len(inputs) == len(
            outputs
        ), "Input and output set have different lengths."

        self._inputs = inputs
        self._outputs = outputs
        self._state = state

    def get_input_variables(self) -> tuple:
        """Get the input keys available (input columns prefixed ``input_``)."""
        return tuple(
            name for name in self._inputs.columns if name.startswith("input_")
        )

    def get_output_variables(self) -> tuple:
        """Get the output keys available (output columns prefixed ``output_``)."""
        return tuple(
            name for name in self._outputs.columns if name.startswith("output_")
        )

    def get_targets(self) -> tuple:
        """Get the target keys available: all input keys, then all output keys."""
        return (*self.get_input_variables(), *self.get_output_variables())

    def get_state_variables(self) -> tuple:
        """Get the state variables available.

        These are the state columns prefixed ``state_``, excluding the
        bookkeeping columns ``state_id`` and ``state_type``.
        """
        bookkeeping = ("state_id", "state_type")
        return tuple(
            name
            for name in self._state.columns
            if name not in bookkeeping and name.startswith("state_")
        )

    def n_runs(self) -> int:
        """Number of distinct ``run_id`` values among the input rows (0 if none)."""
        if "run_id" not in self._inputs.columns:
            return 0
        return int(self._inputs["run_id"].nunique())

    def n_steps(self) -> int:
        """Number of input rows in the first run, in sorted ``run_id`` order.

        Returns 0 when there are no input rows. The lookup is positional, so
        it also works when no run carries the id ``0``.
        """
        if "run_id" not in self._inputs.columns:
            return 0
        run_lengths = self._inputs.groupby("run_id").size()
        if run_lengths.empty:
            return 0
        return int(run_lengths.iloc[0])

    def state_size(self) -> int:
        """Number of distinct ``state_id`` values among the state rows (0 if none)."""
        if "state_id" not in self._state.columns:
            return 0
        return int(self._state["state_id"].nunique())

    def get_target(self, target_key, run_id) -> pd.Series:
        """Get a target signal as pandas Series by the target key.

        Args:
            target_key: One of the keys returned by ``get_targets()``.
            run_id: Value of ``run_id`` selecting the run.

        Returns:
            The ``target_key`` column of that run, indexed by ``time``.

        Raises:
            AssertionError: If ``target_key`` is not an available target.
            KeyError: If no rows exist for ``run_id``.
        """
        assert (
            target_key in self.get_targets()
        ), f"{target_key} not in available targets."
        source = self._inputs if target_key.startswith("input_") else self._outputs
        run_rows = source.groupby("run_id").get_group(run_id)
        # Index by time via set_index instead of mutating the column's index.
        return run_rows.set_index("time")[target_key]

    def get_state(self, state_key, run_id) -> pd.DataFrame:
        """Get the entire reservoir state of a variable as pandas DataFrame.

        Args:
            state_key: One of the keys returned by ``get_state_variables()``.
            run_id: Value of ``run_id`` selecting the run.

        Returns:
            A frame with one row per ``time`` and one column per ``state_id``.

        Raises:
            AssertionError: If ``state_key`` is not an available state variable.
            KeyError: If no rows exist for ``run_id``.
        """
        assert (
            state_key in self.get_state_variables()
        ), f"{state_key} not in available state variables."

        run_rows = self._state.groupby("run_id").get_group(run_id)
        return run_rows.pivot(index="time", columns=["state_id"], values=state_key)


# Wrap the already-loaded frame and print a quick sanity summary.
dataset = ExperimentDataset(dataset_df=dataset_df)

# Shapes of the three per-type partitions, in order: inputs, outputs, state.
for partition in (dataset._inputs, dataset._outputs, dataset._state):
    print(partition.shape)
print()

# Available keys and overall dimensions.
print(f'Inputs: {dataset.get_input_variables()}')
print(f'Outputs: {dataset.get_output_variables()}')
print(f'Targets: {dataset.get_targets()}')
print(f'State: {dataset.get_state_variables()}')
print(f'n_runs: {dataset.n_runs()}')
print(f'n_steps: {dataset.n_steps()}')
print(f'state_size: {dataset.state_size()}')
print()

# Pull one signal of each kind for run 0 and report its shape.
input_target = dataset.get_target('input_Tac', 0)
print('Input target:', input_target.shape)
output_target = dataset.get_target('output_An', 0)
print('Output target:', output_target.shape)
reservoir_state = dataset.get_state('state_E', 0)
print('Reservoir state:', reservoir_state.shape)

(14112, 7)
(14112, 7)
(5080320, 18)

Inputs: ('input_Tac', 'input_u', 'input_hs', 'input_Rg')
Outputs: ('output_Rg', 'output_An', 'output_E', 'output_Tleaf')
Targets: ('input_Tac', 'input_u', 'input_hs', 'input_Rg', 'output_Rg', 'output_An', 'output_E', 'output_Tleaf')
State: ('state_Ci', 'state_gb', 'state_Ei', 'state_FluxC', 'state_Tlc', 'state_An', 'state_Flux', 'state_psi_head', 'state_u', 'state_E', 'state_Eabs', 'state_gs', 'state_par_photo.dHd')
n_runs: 84
n_steps: 168
state_size: 360

Input target: (168,)
Output target: (168,)
Reservoir state: (168, 360)
