# HydroShoot: Nonlinear and memory-bound tasks

In [2]:
import pandas as pd
import numpy as np
import sys, os

from matplotlib import pyplot as plt
sys.path.insert(1, os.path.join(sys.path[0], '../'))  # for importing local packages from src

%reload_ext autoreload
%autoreload 2 

## Datasets

In [3]:
from src.model.rc_dataset import ExperimentDataset


# Location of the experiment CSV (a relative path).
DATASET_PATH = 'datasets/hydroshoot_large_trimmed.csv'

# Project dataset wrapper built from the CSV; later cells call
# `get_run_ids()` and `state_size()` on it.
dataset = ExperimentDataset(csv_path=DATASET_PATH)

## Model definition

Shared model parameters for all experiments:

- Ridge regression model
  - Fitted bias term
  - Tuned regularization parameter
  - Grouped 5-fold cross-validation (`GroupKFold(n_splits=5)`) to tune the regularization parameter
- Datasets
  - All runs of the HydroShoot dataset (`hydroshoot_large_trimmed.csv`) combined as one dataset
  - Grouped by calendar day of the model inputs
- Model preprocessing
  - ~~4 days of each run are discarded as warmup days~~ **Transforms must now be applied as a pipeline with the benchmark-specific transforms!**

In [4]:
# Identifier written into every pipeline's metadata under 'dataset_id'.
DATASET_ID = 'HydroShoot_large'
# NOTE(review): STATE_SIZE is not referenced by any cell visible in this
# notebook (state ids are sampled with size=16) — confirm it is still needed.
STATE_SIZE = 32

In [9]:
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GroupKFold
from src.learning.scorers import nmse_scorer
from pipeline_hydroshoot import GroupGenerator, TimeGenerator
from pipeline_base import TrainTestSplitter, DirectTransform, WarmupTransform


# Readout model: a single ridge regression step with a fitted intercept.
# `alpha=1` is the constructor value; candidate values are listed in the
# parameter grid below (presumably searched by the pipeline — confirm).
readout_model = Pipeline(steps=[
  ('ridge_regression', Ridge(alpha=1, fit_intercept=True)),
])

# 50 logarithmically spaced regularization strengths from 1e-4 to 1e2.
model_param_grid = [
  dict(
    ridge_regression__alpha=np.power(
      10, np.linspace(np.log10(1e-4), np.log10(1e2), 50)
    ),
  ),
]

datasets = (dataset,)
run_ids = dataset.get_run_ids()

# Seed first so the same 16 state ids are sampled on every execution.
np.random.seed(42)
state_ids = np.random.choice(dataset.state_size(), size=16, replace=False)

# NOTE(review): the benchmark definitions in this notebook pass a literal
# `warmup_days=4` to WarmupTransform rather than reading this variable.
warmup_days = 4

# Keyword arguments passed unchanged to every RCPipeline.
shared_pipeline_params = dict(
  # Data generation
  datasets=datasets,
  groups=GroupGenerator(day_length=24, run_ids=run_ids, days_between_runs=1),
  time=TimeGenerator(day_length=24, run_ids=run_ids),

  # Model training and validation
  readout_model=readout_model,
  model_param_grid=model_param_grid,
  model_scorer=nmse_scorer,
  folds=GroupKFold(n_splits=5),
  train_test_split=TrainTestSplitter(block_size=4, test_ratio=0.5),
)

## Benchmark pipelines

### Pipeline definitions

#### Imports

In [10]:
from pipeline_base import (
  RCPipeline, 
  Rescale, 
  DaylightMask
)

from pipeline_hydroshoot import (
  TargetGenerator, 
  SingleReservoirGenerator,
  MultiReservoirGenerator,
  TargetReservoirGenerator,
  GroupRescale
)

from model_config_hydroshoot import (
  baseline_reservoirs,
  heterogeneous_reservoirs, 
  targets, 
  state_variables
)

#### Delay line benchmark

In [12]:
from pipeline_base import DelayLineTransform, CustomWarmupTransform

def delay_line_benchmark(target_gen, target_id, res_gen, res_id, rescale, *, delay_steps: [int]):
  """Yield one RCPipeline per entry of `delay_steps` for the delay-line task.

  Args:
    target_gen: target generator passed to each pipeline as `target`.
    target_id: label stored in the metadata under 'target_id'.
    res_gen: reservoir generator passed to each pipeline as `reservoir`.
    res_id: label stored in the metadata under 'reservoir_id'.
    rescale: preprocessing step appended after the daylight mask.
    delay_steps: iterable of delays; each one produces one pipeline.

  Yields:
    An RCPipeline configured with the given delay and the notebook's
    `shared_pipeline_params`.
  """
  for delay in delay_steps:
    metadata = {
      'target_id': target_id,
      'reservoir_id': res_id,
      'dataset_id': DATASET_ID,
      'benchmark': 'delay_line',
      'delay': delay,
    }

    # Warmup removal must come before the delay; otherwise the warmup
    # steps would be used as delayed prediction targets.
    transforms = [
      WarmupTransform(warmup_days=4, day_length=24),
      DelayLineTransform(delay_steps=delay),
    ]

    preprocessing = [
      DaylightMask(day_length=24, start=5, end=21),
      rescale,
    ]

    yield RCPipeline(
      metadata=metadata,
      target=target_gen,
      reservoir=res_gen,
      transforms=transforms,
      preprocessing=preprocessing,
      **shared_pipeline_params,
    )

#### Polynomial benchmark

In [13]:
from pipeline_base import PolynomialTargetTransform

def polynomial_benchmark(target_gen, target_id, res_gen, res_id, rescale, *, exponents: [int]):
  """Yield one RCPipeline per entry of `exponents` for the polynomial task.

  Args:
    target_gen: target generator passed to each pipeline as `target`.
    target_id: label stored in the metadata under 'target_id'.
    res_gen: reservoir generator passed to each pipeline as `reservoir`.
    res_id: label stored in the metadata under 'reservoir_id'.
    rescale: preprocessing step appended after the daylight mask.
    exponents: iterable of exponents; each one produces one pipeline.

  Yields:
    An RCPipeline whose target transform receives a coefficient list of
    `exponent` zeros followed by a single 1.
  """
  for exponent in exponents:
    metadata = {
      'target_id': target_id,
      'reservoir_id': res_id,
      'dataset_id': DATASET_ID,
      'benchmark': 'polynomial',
      'exponent': exponent,
    }

    transforms = [
      WarmupTransform(warmup_days=4, day_length=24),
      # Coefficient list: `exponent` zeros, then a trailing 1.
      PolynomialTargetTransform(poly_coefs=[0] * exponent + [1]),
    ]

    preprocessing = [
      DaylightMask(day_length=24, start=5, end=21),
      rescale,
    ]

    yield RCPipeline(
      metadata=metadata,
      target=target_gen,
      reservoir=res_gen,
      transforms=transforms,
      preprocessing=preprocessing,
      **shared_pipeline_params,
    )

#### NARMA benchmark

In [14]:
from pipeline_base import NarmaTargetTransform

def narma_benchmark(target_gen, target_id, res_gen, res_id, rescale, *, n_values: [int]):
  """Yield one RCPipeline per entry of `n_values` for the NARMA task.

  Args:
    target_gen: target generator passed to each pipeline as `target`.
    target_id: label stored in the metadata under 'target_id'.
    res_gen: reservoir generator passed to each pipeline as `reservoir`.
    res_id: label stored in the metadata under 'reservoir_id'.
    rescale: preprocessing step appended after the daylight mask.
    n_values: iterable of NARMA `n` settings; each one produces one pipeline.

  Yields:
    An RCPipeline configured with `NarmaTargetTransform(n=..., scale=1)` and
    the notebook's `shared_pipeline_params`.
  """
  for narma_n in n_values:
    metadata = {
      'target_id': target_id,
      'reservoir_id': res_id,
      'dataset_id': DATASET_ID,
      'benchmark': 'NARMA',
      'narma_n': narma_n,
    }

    # Warmup removal must come before the NARMA transform; otherwise the
    # warmup steps would be used as input to the NARMA system.
    transforms = [
      WarmupTransform(warmup_days=4, day_length=24),
      NarmaTargetTransform(n=narma_n, scale=1),
    ]

    preprocessing = [
      DaylightMask(day_length=24, start=5, end=21),
      rescale,
    ]

    yield RCPipeline(
      metadata=metadata,
      target=target_gen,
      reservoir=res_gen,
      transforms=transforms,
      preprocessing=preprocessing,
      **shared_pipeline_params,
    )

#### Near future prediction

In [15]:
def future_pred_benchmark(target_gen, target_id, res_gen, res_id, rescale, *, lookahead_steps: [int]):
  """Yield one RCPipeline per entry of `lookahead_steps` for near-future prediction.

  The lookahead is implemented by passing the negated step count to
  `DelayLineTransform`.

  Args:
    target_gen: target generator passed to each pipeline as `target`.
    target_id: label stored in the metadata under 'target_id'.
    res_gen: reservoir generator passed to each pipeline as `reservoir`.
    res_id: label stored in the metadata under 'reservoir_id'.
    rescale: preprocessing step appended after the daylight mask.
    lookahead_steps: iterable of lookaheads; each one produces one pipeline.

  Yields:
    An RCPipeline configured with the given lookahead and the notebook's
    `shared_pipeline_params`.
  """
  for lookahead in lookahead_steps:
    metadata = {
      'target_id': target_id,
      'reservoir_id': res_id,
      'dataset_id': DATASET_ID,
      'benchmark': 'near_future_pred',
      'lookahead': lookahead,
    }

    # Warmup removal must come before the shift; otherwise the warmup
    # steps would be used as shifted prediction targets.
    transforms = [
      WarmupTransform(warmup_days=4, day_length=24),
      DelayLineTransform(delay_steps=-lookahead),
    ]

    preprocessing = [
      DaylightMask(day_length=24, start=5, end=21),
      rescale,
    ]

    yield RCPipeline(
      metadata=metadata,
      target=target_gen,
      reservoir=res_gen,
      transforms=transforms,
      preprocessing=preprocessing,
      **shared_pipeline_params,
    )

In [17]:
# Smoke test: build near-future-prediction pipelines for the first target and
# the first state variable, with three lookahead values.
test_target = TargetGenerator(target=targets[0], run_ids=run_ids)
test_reservoir = SingleReservoirGenerator(
  state_var=state_variables[0],
  run_ids=run_ids,
)
rescale = Rescale(per_feature=False)

lookahead_steps = [0, 1, 2]

# NOTE(review): despite its name, this list holds near-future-prediction
# pipelines (it is built with `future_pred_benchmark`), not NARMA pipelines.
test_narma_pipelines = [
  *future_pred_benchmark(
    test_target, 'target', test_reservoir, 'res', rescale,
    lookahead_steps=lookahead_steps,
  )
]

### Generate pipelines

In [29]:
from itertools import chain

# Benchmarks to run. Each entry pairs a pipeline generator function
# ('benchmark') with the keyword arguments ('kwargs') it is called with.
# Entries that are commented out are skipped in this run; only the
# near-future prediction benchmark is active.
benchmarks = [
  # {
  #   'benchmark': delay_line_benchmark,
  #   'kwargs': {
  #     'delay_steps': np.arange(0, 13)  # 0-12 hours
  #   }
  # },
  # {
  #   'benchmark': polynomial_benchmark,
  #   'kwargs': {
  #     'exponents': np.arange(1, 10)  # exponents 1-9
  #   }
  # },
  # {
  #   'benchmark': narma_benchmark,
  #   'kwargs': {
  #     'n_values': [2, 4, 6, 8, 10, 12, 18, 20, 24, 32, 40, 48, 50]
  #   }
  # },
  {
    'benchmark': future_pred_benchmark,
    'kwargs': {
      'lookahead_steps': np.arange(0, 7)  # 0-6 hours
    }
  },
]


def generate_benchmark_pipelines(target_gen, target_id, reservoir_gen, reservoir_id, rescale, benchmark_specs=None):
  """Chain the pipelines of every configured benchmark into one iterator.

  Args:
    target_gen: target generator forwarded to each benchmark function.
    target_id: target label forwarded to each benchmark function.
    reservoir_gen: reservoir generator forwarded to each benchmark function.
    reservoir_id: reservoir label forwarded to each benchmark function.
    rescale: rescaling step forwarded to each benchmark function.
    benchmark_specs: optional sequence of dicts, each with a 'benchmark'
      callable and a 'kwargs' dict. When omitted, the module-level
      `benchmarks` list is used, which is the original behavior.

  Returns:
    An iterator over the pipelines of all benchmarks, in spec order.
  """
  if benchmark_specs is None:
    # Backward-compatible default: read the notebook-level configuration.
    benchmark_specs = benchmarks

  # Build every generator up front so the spec list is read at call time,
  # not lazily while the chained iterator is consumed.
  pipeline_generators = [
    spec['benchmark'](
      target_gen, target_id, reservoir_gen, reservoir_id, rescale,
      **spec['kwargs'],
    )
    for spec in benchmark_specs
  ]
  return chain.from_iterable(pipeline_generators)


In [30]:
# Enumerate every (target, reservoir) combination, one list per reservoir kind.
target_state_pairs = [
  (target, state_var)
  for target in targets
  for state_var in state_variables
]
target_env_pairs = [
  (name, target, env_targets)
  for target in targets
  for (name, env_targets) in baseline_reservoirs
]
target_het_pairs = [
  (name, target, state_vars)
  for target in targets
  for (name, state_vars) in heterogeneous_reservoirs
]


all_pipelines = []

# Single-state-variable reservoirs, rescaled with per_feature=False.
for target, state_var in target_state_pairs:
  target_gen = TargetGenerator(target=target, run_ids=run_ids)
  res_gen = SingleReservoirGenerator(state_var=state_var, run_ids=run_ids, state_ids=state_ids)
  pipelines = generate_benchmark_pipelines(
    target_gen=target_gen,
    target_id=target,
    reservoir_gen=res_gen,
    reservoir_id=state_var,
    rescale=Rescale(per_feature=False),
  )
  all_pipelines.extend(pipelines)

# Heterogeneous reservoirs built from several state variables, rescaled with GroupRescale.
for name, target, state_vars in target_het_pairs:
  target_gen = TargetGenerator(target=target, run_ids=run_ids)
  res_gen = MultiReservoirGenerator(state_vars=state_vars, run_ids=run_ids, state_ids=state_ids)
  pipelines = generate_benchmark_pipelines(
    target_gen=target_gen,
    target_id=target,
    reservoir_gen=res_gen,
    reservoir_id=name,
    rescale=GroupRescale(datasets=datasets, state_vars=state_vars),
  )
  all_pipelines.extend(pipelines)

# Baseline reservoirs built from target series, rescaled with per_feature=True.
for name, target, env_targets in target_env_pairs:
  target_gen = TargetGenerator(target=target, run_ids=run_ids)
  res_gen = TargetReservoirGenerator(targets=env_targets, run_ids=run_ids)
  pipelines = generate_benchmark_pipelines(
    target_gen=target_gen,
    target_id=target,
    reservoir_gen=res_gen,
    reservoir_id=name,
    rescale=Rescale(per_feature=True),
  )
  all_pipelines.extend(pipelines)


print(len(all_pipelines))

952


### Execution

In [31]:
from tqdm import tqdm
from itertools import chain
from pipeline_base import execute_pipeline


# CSV file the collected scores are written to after all pipelines have run.
RESULTS_FILE = 'results_hydroshoot_future.csv'

total_loops = len(all_pipelines)
results = []


# Run every pipeline in order; the progress bar advances once per success.
with tqdm(total=total_loops) as pbar:
  for pipeline in all_pipelines:

    # Keep the try body to the one call expected to fail, so the message
    # below is only printed for genuine pipeline execution errors.
    try:
      result = execute_pipeline(pipeline)
    except Exception:
      print('An exception occured executing the pipeline with the following metadata:')
      print(f'{pipeline.metadata}')
      # Bare raise re-raises the active exception with its original traceback.
      raise

    results.append(result)
    pbar.update(1)


# Scores are written only after every pipeline has succeeded; a failure above
# aborts the cell before anything is saved.
results_df = pd.DataFrame.from_dict(results)
results_df.to_csv(RESULTS_FILE)
print(f'Saved scores to {RESULTS_FILE}')

  X_g = (X_g - X_g.mean()) / (X_g.std() + 1e-12)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)
  X_g = (X_g - X_g.mean()) / (X_g.std() + 1e-12)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)
  X_g = (X_g - X_g.mean()) / (X_g.std() + 1e-12)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)
  X_g = (X_g - X_g.mean()) / (X_g.std() + 1e-12)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype

Saved scores to results_hydroshoot_future.csv



