In [1]:
import pandas as pd
import numpy as np
import sys, os
import seaborn as sns

from matplotlib import pyplot as plt
sys.path.insert(1, os.path.join(sys.path[0], '../../'))  # for importing local packages from src
sys.path.insert(1, os.path.join(sys.path[0], '../'))  # for importing model config

%reload_ext autoreload
%autoreload 2 

# Matplotlib styling for every figure in this notebook: text is rendered through
# LaTeX with a Palatino serif face, and default font sizes are set once here.
plt.rcParams.update({
    "text.usetex": True,
    "font.family": "serif",
    "font.serif": ["Palatino"],
    # LaTeX packages loaded in the preamble used for text rendering.
    "text.latex.preamble": r'\usepackage{amsmath} \usepackage{gensymb} \usepackage{siunitx}',
    "font.size": 15,
    "legend.fontsize": 12,
})

## Datasets

In [2]:
from src.model.rc_dataset import ExperimentDataset


# CSV exports of the three NEMA experiments (H0, H3 and H15).
DATASET_NEMA_H0 = '../datasets/dataset_NEMA_NEMA_H0.csv'
DATASET_NEMA_H3 = '../datasets/dataset_NEMA_NEMA_H3.csv'
DATASET_NEMA_H15 = '../datasets/dataset_NEMA_NEMA_H15.csv'

# Load the experiments in a fixed order (H0, H3, H15); this list is what the
# pipelines receive as their `datasets` argument.
datasets = [
  ExperimentDataset(csv_path=csv_path)
  for csv_path in (DATASET_NEMA_H0, DATASET_NEMA_H3, DATASET_NEMA_H15)
]

# Individual handles to the same dataset objects held in `datasets`.
dataset_nema_h0, dataset_nema_h3, dataset_nema_h15 = datasets

## Model configuration

In [3]:
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.model_selection import LeaveOneGroupOut
from src.learning.scorers import nmse_scorer
from pipeline_cnwheat import GroupGenerator, TimeGenerator
from pipeline_base import TrainTestSplitter, DirectTransform, WarmupTransform

# constants
DATASET_ID = 'NEMA_combined'
STATE_SIZE = 7  # number of reservoir state indices drawn per sample
N_STATE_SAMPLES = 16
# NOTE: all measurable reservoirs considered have the same reservoir size of 10.
RESERVOIR_SIZE = 10
state_ids = None

# Reservoir generation
# Seed the global NumPy RNG so that successive samples are reproducible.
np.random.seed(42)


def generate_state_sample():
  """Draw one random selection of reservoir state indices.

  Returns:
    A 1-D integer array holding STATE_SIZE distinct indices taken from
    ``range(RESERVOIR_SIZE)``. The draw uses (and advances) the global NumPy
    random state, so the sequence of samples depends on the seed set above.
  """
  return np.random.choice(RESERVOIR_SIZE, size=STATE_SIZE, replace=False)


# Readout model
# Single-step scikit-learn pipeline: ridge regression with an intercept.
# NOTE(review): alpha=1 is presumably just a starting value that gets replaced
# by the candidates in model_param_grid during model selection; confirm in RCPipeline.
readout_model = Pipeline(steps=[
  ('ridge_regression', Ridge(alpha=1, fit_intercept=True)),
])
# 50 logarithmically spaced alpha candidates spanning 1e-4 to 1e2.
model_param_grid = [
  dict(ridge_regression__alpha=10 ** np.linspace(np.log10(1e-4), np.log10(1e2), 50)),
]

# Regression task pipeline
# Keyword arguments that every RCPipeline in this notebook is constructed with.
shared_pipeline_params = dict(
  # Data generation
  datasets=datasets,
  groups=GroupGenerator(day_length=24),
  time=TimeGenerator(day_length=24),

  # Model training and validation
  readout_model=readout_model,
  model_param_grid=model_param_grid,
  model_scorer=nmse_scorer,
  folds=LeaveOneGroupOut(),
  train_test_split=TrainTestSplitter(block_size=4, test_ratio=0.5),
)

In [4]:
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.model_selection import LeaveOneGroupOut
from src.learning.scorers import nmse_scorer
from pipeline_cnwheat import GroupGenerator, TimeGenerator
from pipeline_base import TrainTestSplitter, DirectTransform, WarmupTransform

# constants
DATASET_ID = 'NEMA_combined'
STATE_SIZE = 7

# Reservoir generation
# Re-seed the global NumPy RNG so the draw is reproducible, then pick
# STATE_SIZE distinct indices out of 10 as the fixed set of state ids.
np.random.seed(seed=42)
state_ids = np.random.choice(a=10, size=STATE_SIZE, replace=False)

# Readout model
# Same settings as the configuration cell above; running this cell rebinds
# readout_model, model_param_grid and shared_pipeline_params to fresh objects.
readout_model = Pipeline(steps=[
  ('ridge_regression', Ridge(alpha=1, fit_intercept=True)),
])
# 50 logarithmically spaced alpha candidates spanning 1e-4 to 1e2.
model_param_grid = [
  dict(ridge_regression__alpha=10 ** np.linspace(np.log10(1e-4), np.log10(1e2), 50)),
]

# Regression task pipeline
# Keyword arguments that every RCPipeline in this notebook is constructed with.
shared_pipeline_params = dict(
  # Data generation
  datasets=datasets,
  groups=GroupGenerator(day_length=24),
  time=TimeGenerator(day_length=24),

  # Model training and validation
  readout_model=readout_model,
  model_param_grid=model_param_grid,
  model_scorer=nmse_scorer,
  folds=LeaveOneGroupOut(),
  train_test_split=TrainTestSplitter(block_size=4, test_ratio=0.5),
)

## Regression pipelines

In [5]:
from pipeline_base import (
  RCPipeline, 
  Rescale, 
  DaylightMask
)

from pipeline_cnwheat import (
  TargetGenerator, 
  SingleReservoirGenerator,
  MultiReservoirGenerator,
  TargetReservoirGenerator,
  GroupRescale
)

from model_config_cnwheat import (
  baseline_reservoirs,
  heterogeneous_reservoirs, 
  final_targets, 
  measurable_reservoirs,
)

In [12]:
# Selection of reservoirs and targets used in this notebook. These assignments
# rebind the names of the same spelling imported from model_config_cnwheat.
measurable_reservoirs = ['state__Ts', 'state__Tr', 'state__gs']
final_targets = ['input_PARi', 'output__custom__PARa']
narma_targets = ['input_PARi']

# Every (target, reservoir) combination, target-major order.
target_state_pairs = [
  (target_id, reservoir_id)
  for target_id in final_targets
  for reservoir_id in measurable_reservoirs
]
narma_pairs = [
  (target_id, reservoir_id)
  for target_id in narma_targets
  for reservoir_id in measurable_reservoirs
]

# The environmental-baseline and heterogeneous-reservoir runs are switched off
# here (empty lists); the commented lines show how to enable them again.
target_env_pairs, target_het_pairs = [], []
# target_env_pairs = [(name, target, env_targets) for target in final_targets for (name, env_targets) in baseline_reservoirs]
# target_het_pairs = [(name, target, state_vars) for target in final_targets for (name, state_vars) in heterogeneous_reservoirs]

# Show the active configuration, one collection per output line.
print(
  final_targets,
  measurable_reservoirs,
  baseline_reservoirs,
  heterogeneous_reservoirs,
  sep='\n',
)

['input_PARi', 'output__custom__PARa']
['state__Ts', 'state__Tr', 'state__gs']
[('env_all', ['input_air_temperature', 'input_humidity', 'input_PARi']), ('env_temp', ['input_air_temperature']), ('env_humidity', ['input_humidity']), ('env_PAR', ['input_PARi'])]
[('state__het_all', ('state__An', 'state__Transpiration', 'state__Ts', 'state__gs', 'state__Ag', 'state__Tr', 'state__Rd', 'state__sum_respi', 'state__PARa'))]


In [18]:
from pipeline_base import NarmaTargetTransform

# Transform chain shared by the single, multi and env pipelines below:
# a 4-day warmup (24 steps per day) followed by the direct transform.
# The same list object is handed to every pipeline that uses it.
input_phys_transforms = [WarmupTransform(warmup_days=4, day_length=24), DirectTransform()]


def generate_single(target_state_pairs, state_ids):
  """Lazily build one RCPipeline per (target, state variable) pair.

  Args:
    target_state_pairs: iterable of ``(target, state_var)`` tuples.
    state_ids: passed through unchanged to ``SingleReservoirGenerator``.

  Yields:
    An ``RCPipeline`` whose metadata records the target, the state variable
    (as ``reservoir_id``) and the module-level ``DATASET_ID``. Each pipeline
    uses the shared ``input_phys_transforms`` and ``shared_pipeline_params``.
  """
  for target, state_var in target_state_pairs:
    metadata = {'target_id': target, 'reservoir_id': state_var, 'dataset_id': DATASET_ID}
    # Data generation
    target_gen = TargetGenerator(target=target)
    reservoir_gen = SingleReservoirGenerator(state_var=state_var, state_ids=state_ids)
    # Data preprocessing: daylight mask first, then one rescale over all features.
    preprocessing = [
      DaylightMask(day_length=24, start=5, end=21),
      Rescale(per_feature=False),
    ]
    yield RCPipeline(
      metadata=metadata,
      target=target_gen,
      reservoir=reservoir_gen,
      transforms=input_phys_transforms,
      preprocessing=preprocessing,
      **shared_pipeline_params,
    )


def generate_multi(target_het_pairs, state_ids):
  """Lazily build one RCPipeline per named multi-variable reservoir.

  Args:
    target_het_pairs: iterable of ``(name, target, state_vars)`` tuples.
    state_ids: passed through unchanged to ``MultiReservoirGenerator``.

  Yields:
    An ``RCPipeline`` whose metadata uses ``name`` as ``reservoir_id``.
    Rescaling is done with ``GroupRescale`` over the module-level ``datasets``
    and the reservoir's ``state_vars``.
  """
  for name, target, state_vars in target_het_pairs:
    metadata = {'target_id': target, 'reservoir_id': name, 'dataset_id': DATASET_ID}
    # Data generation
    target_gen = TargetGenerator(target=target)
    reservoir_gen = MultiReservoirGenerator(state_vars=state_vars, state_ids=state_ids)
    # Data preprocessing: daylight mask first, then group-wise rescaling.
    preprocessing = [
      DaylightMask(day_length=24, start=5, end=21),
      GroupRescale(datasets=datasets, state_vars=state_vars),
    ]
    yield RCPipeline(
      metadata=metadata,
      target=target_gen,
      reservoir=reservoir_gen,
      transforms=input_phys_transforms,
      preprocessing=preprocessing,
      **shared_pipeline_params,
    )


def generate_env(target_env_pairs):
  """Lazily build one RCPipeline per named set of environmental inputs.

  Args:
    target_env_pairs: iterable of ``(name, target, env_targets)`` tuples.

  Yields:
    An ``RCPipeline`` whose "reservoir" is a ``TargetReservoirGenerator`` over
    ``env_targets`` and whose metadata uses ``name`` as ``reservoir_id``.
    Features are rescaled individually (``Rescale(per_feature=True)``).
  """
  for name, target, env_targets in target_env_pairs:
    metadata = {'target_id': target, 'reservoir_id': name, 'dataset_id': DATASET_ID}
    # Data generation
    target_gen = TargetGenerator(target=target)
    reservoir_gen = TargetReservoirGenerator(targets=env_targets)
    # Data preprocessing: daylight mask first, then per-feature rescaling.
    preprocessing = [
      DaylightMask(day_length=24, start=5, end=21),
      Rescale(per_feature=True),
    ]
    yield RCPipeline(
      metadata=metadata,
      target=target_gen,
      reservoir=reservoir_gen,
      transforms=input_phys_transforms,
      preprocessing=preprocessing,
      **shared_pipeline_params,
    )


def narma_benchmark(target_state_pairs, state_ids, *, n):
  """Lazily build one NARMA benchmark RCPipeline per (target, state variable) pair.

  Args:
    target_state_pairs: iterable of ``(target, state_var)`` tuples.
    state_ids: passed through unchanged to ``SingleReservoirGenerator``.
    n: keyword-only; forwarded to ``NarmaTargetTransform`` and recorded in the
      metadata as ``narma_n`` and in the ``target_id`` suffix ``_NARMA_{n}``.

  Yields:
    An ``RCPipeline`` tagged with ``'benchmark': 'NARMA'`` in its metadata.
  """
  for target, state_var in target_state_pairs:
    metadata = {
      'target_id': f'{target}_NARMA_{n}',
      'reservoir_id': state_var,
      'dataset_id': DATASET_ID,
      'benchmark': 'NARMA',
      'narma_n': n,
    }
    # Data generation
    target_gen = TargetGenerator(target=target)
    reservoir_gen = SingleReservoirGenerator(state_var=state_var, state_ids=state_ids)
    # Data transformation
    # NOTE: the warmup is removed before the NARMA transform; otherwise the
    # warmup steps would be used as input to the NARMA system.
    transforms = [
      WarmupTransform(warmup_days=4, day_length=24),
      NarmaTargetTransform(n=n, scale=1),
    ]
    # Data preprocessing: daylight mask first, then one rescale over all features.
    preprocessing = [
      DaylightMask(day_length=24, start=5, end=21),
      Rescale(per_feature=False),
    ]
    yield RCPipeline(
      metadata=metadata,
      target=target_gen,
      reservoir=reservoir_gen,
      transforms=transforms,
      preprocessing=preprocessing,
      **shared_pipeline_params,
    )

In [19]:
from itertools import chain


def generate_pipelines(state_ids):
  """Materialise every configured pipeline into a single list.

  Args:
    state_ids: forwarded to the single, multi and NARMA pipeline generators.

  Returns:
    A list with, in order: the single-reservoir pipelines, the multi-reservoir
    pipelines, the environmental pipelines and the NARMA (n=8) benchmark
    pipelines. The pair lists are read from module-level variables.
  """
  sources = (
    generate_single(target_state_pairs, state_ids),
    generate_multi(target_het_pairs, state_ids),
    generate_env(target_env_pairs),
    narma_benchmark(narma_pairs, state_ids, n=8),
  )
  return list(chain.from_iterable(sources))


# Build all pipelines for the currently selected state ids and report the count.
pipelines = generate_pipelines(state_ids=state_ids)
print(f'Total pipelines: {len(pipelines)}')

Total pipelines: 9


## Model fitting

In [20]:
from tqdm import tqdm
from itertools import chain
from pipeline_base import execute_pipeline



# Execute every pipeline once. `results` and `model_data` are parallel lists:
# entry i of each belongs to pipelines[i].
total_loops = len(pipelines)
results, model_data = [], []

with tqdm(total=total_loops) as pbar:
  for pipeline in pipelines:
    try:
      result, md = execute_pipeline(pipeline, return_model_data=True)
      results.append(result)
      model_data.append(md)
      pbar.update(1)
    except Exception as e:
      # Identify the failing pipeline before letting the error propagate.
      print('An exception occured executing the pipeline with the following metadata:')
      print(f'{pipeline.metadata}')
      raise e



100%|██████████| 9/9 [00:04<00:00,  1.92it/s]


## Prediction

In [21]:
# Collect, per pipeline run, the test-set target values and the predictions of
# the final model. Keys are '<target_id>__true' and '<target_id>__<reservoir_id>'.
cnwheat_pred = {}

for result, md in zip(results, model_data):
  model = md['final_model']
  X_test, y_test = md['test_data']
  target_id, reservoir_id = result['target_id'], result['reservoir_id']
  y_pred = model.predict(X_test)
  # The '__true' entry is rewritten for every reservoir sharing this target;
  # the last run's y_test is the one that remains.
  cnwheat_pred.update({
    f'{target_id}__true': y_test,
    f'{target_id}__{reservoir_id}': y_pred,
  })


list(cnwheat_pred)

['input_PARi__true',
 'input_PARi__state__Ts',
 'input_PARi__state__Tr',
 'input_PARi__state__gs',
 'output__custom__PARa__true',
 'output__custom__PARa__state__Ts',
 'output__custom__PARa__state__Tr',
 'output__custom__PARa__state__gs',
 'input_PARi_NARMA_8__true',
 'input_PARi_NARMA_8__state__Ts',
 'input_PARi_NARMA_8__state__Tr',
 'input_PARi_NARMA_8__state__gs']

In [24]:
# Pad every series in `cnwheat_pred` with trailing NaN up to a common length so
# that all of them can become columns of one DataFrame.
# The common length is the longest series present, rather than a hard-coded
# row count, so the cell keeps working when the test-set size changes.
cn_pred_fixed = {}
n_rows = max((len(pred) for pred in cnwheat_pred.values()), default=0)

for name, pred in cnwheat_pred.items():
  # Use np.nan: the `np.NaN` alias was removed in NumPy 2.0.
  new_pred = np.full(n_rows, np.nan)
  new_pred[:len(pred)] = pred
  cn_pred_fixed[name] = new_pred
    

In [26]:
# One column per entry of `cn_pred_fixed`; write the table to CSV in the
# current working directory and display its dimensions.
pred_df = pd.DataFrame(data=cn_pred_fixed)
pred_df.to_csv('results_cnwheat_input_phys_predictions.csv')

pred_df.shape

(736, 12)