# Within-site run times
Version of the within-site ablation tests that gets the run-times for the final model and architecture ablation tests.
#### Notes
- Only one fold model is created for each run
- Only 20 runs per test - i.e. timings are for one ensemble of twenty runs.
- No parallel running of tests
- The first test (test0) is the proposed model, so other tests are offset by 1 compared to the full ablation test (e.g. the dropout test is test4 here and test3 in the ablation test)
- By default, training times are output to the `train_stats.csv` file, so these could also be obtained from the ablation tests' model directories. This notebook allows testing under more controlled conditions - e.g. if the full runs are run in a shared environment, the timings may not be accurate/consistent due to the server workload. So this is a cut-down version that can run in a small dedicated environment.

In [None]:
import os
import json
import numpy as np
import pandas as pd

import initialise
import common
from model_utils import reshape_data
from modelling_functions import create_models, run_experiment
from architecture_within_site import model_params

## Directories and Input files
Change these settings as required
- `modis_csv`: The file containing extracted MODIS data for each sample, created by `Extract MODIS Data.ipynb`
- `prism_csv`: The file containing extracted PRISM data for each sample, created by `Extract PRISM Data.ipynb`
- `aux_csv`: The file containing extracted sample labels, DEM, climate zone and other auxiliary data, created by `Extract Auxiliary Data.ipynb`.

In [None]:
# Input datasets: all three extracts are read from the datasets directory.
modis_csv, prism_csv, aux_csv = (
    os.path.join(common.DATASETS_DIR, csv_name)
    for csv_name in ('modis_365days.csv', 'prism_365days.csv', 'samples_365days.csv')
)

## Set up experiment parameters
If the experiment dictionary contains a 'tests' key that is not 'falsy' (False, None, 0, empty list) it is assumed to be a list of tests to run. Each test will run with the specified model parameters. Model parameters not specified will be the same for each test, as set in the main model_params dictionary. A failed run can be restarted by setting the 'restart' key to the test that failed. This test and the remaining tests will then be run.

If 'tests' is 'falsy' then a single test will be run using the parameters in the main model_params dictionary.

Other settings are:
- layerTypes: specifies which layers to include in the model
- Layer parameters should be specified as a list. The first entry in the list will be used for the first layer, etc.
- If the experiment includes changes to the layers, all non-default layer parameters need to be included. The parameters that are kept constant can be specified by including a key for the layer type in the experiment dictionary, and the value set to a dictionary of the constant parameters.

Model parameters that cannot be changed in tests are:
- \*Filename
- \*Channels
- targetColumn

Example of setting layer parameters:
```
{'name': 'Filters',
 'description': 'Test effect of different filter sizes on conv layers',
 'tests': [{'conv': {'filters': [32, 32, 32]}},
           {'conv': {'filters': [8, 8, 8]}},
           {'conv': {'filters': [32, 8, 8]}},
           {'conv': {'filters': [8, 32, 8]}},
           {'conv': {'filters': [8, 8, 32]}},
           {'conv': {'filters': [8, 16, 32]}},
           {'conv': {'filters': [32, 16, 8]}}],
 'conv': {'numLayers': 3, 'poolSize': [2, 3, 4]},
 'restart': 0}
```

In [None]:
# Definition of the timing experiment.
# - 'tests': one dict per test holding only the settings that differ for that
#   test; an empty dict overrides nothing.
# - 'testNames': one display label per test, in the same order as 'tests';
#   used as the row index of the training-time table at the end of the notebook.
# - 'restart': None here; the save step below raises if the experiment
#   directory already exists and 'restart' is None.
experiment = {
    'name': 'within_site_timings',
    'description': 'Timings for within-site architecture changes',
    'layerTypes': ['modisConv', 'prismConv', 'fc'],
    'tests': [
        # test0 - 'Proposed model': no overrides
        {},
        # test1 - 'Conv filters: 32': same conv settings for the MODIS and PRISM branches
        {'modisConv': {'numLayers': 5, 'filters': [32] * 5, 'poolSize': [0, 5, 2, 3, 4]},
         'prismConv': {'numLayers': 5, 'filters': [32] * 5, 'poolSize': [0, 5, 2, 3, 4]}
        },
        # test2 - 'Conv layers: 3': same conv settings for the MODIS and PRISM branches
        {'modisConv': {'numLayers': 3, 'filters': [8, 8, 8], 'poolSize': [2, 3, 4]},
         'prismConv': {'numLayers': 3, 'filters': [8, 8, 8], 'poolSize': [2, 3, 4]}
        },
        # test3 - 'Dense layers: 2'
        {'fc': {'numLayers': 2, 'units': [512, 512]}},
        # test4 - 'Dense units: 256'
        {'fc': {'numLayers': 3, 'units': [256, 256, 256]}},
        # test5 - 'Dropout: 0.5'
        {'dropoutRate': 0.5},
        # test6 - 'Batch size: 32'
        {'batchSize': 32},
    ],
    'restart': None,
    # Labels must stay aligned with the order of 'tests' above
    'testNames': [
        'Proposed model',
        'Conv filters: 32',
        'Conv layers: 3',
        'Dense layers: 2',
        'Dense units: 256',
        'Dropout: 0.5',
        'Batch size: 32',
    ]
}

# Write the experiment definition into its own directory, then echo it.
experiment_dir = os.path.join(common.MODELS_DIR, experiment['name'])
restart = experiment.get('restart')
if os.path.exists(experiment_dir):
    # An existing directory is only acceptable when a restart was requested
    if restart is None:
        raise FileExistsError(f'{experiment_dir} exists but restart not requested')
else:
    os.makedirs(experiment_dir)
# A truthy restart value gets its own file; None or 0 (re)writes experiment.json
if restart:
    experiment_file = f'experiment{restart}.json'
else:
    experiment_file = 'experiment.json'
with open(os.path.join(experiment_dir, experiment_file), 'w') as f:
    json.dump(experiment, f, indent=2)
experiment

## Set up model parameters

### Model parameters settings
To find out more about any parameter, run `model_params.help('<parameter>')`. 

In [None]:
# Configure the shared model parameters for this notebook. Per-test overrides
# come from the experiment's 'tests' entries.

# Name and describe the model after the experiment
model_params['modelName'] = experiment['name']
model_params['description'] = experiment['description']
# Input CSV files defined in the "Directories and Input files" cell
model_params['modisFilename'] = modis_csv
model_params['prismFilename'] = prism_csv
model_params['auxFilename'] = aux_csv
model_params['splitYear'] = 2017
model_params['splitFolds'] = 1  # consistent with the note "only one fold model is created for each run"
model_params['tempDir'] = common.TEMP_DIR
# Same path as experiment_dir above: MODELS_DIR joined with the model/experiment name
model_params['modelDir'] = os.path.join(common.MODELS_DIR, model_params['modelName'])
model_params['derivedModels'] = common.DERIVED_MODELS
# 50 seeds are listed, while the notes at the top state 20 runs per test.
# NOTE(review): how many seeds are actually consumed is decided outside this
# notebook - confirm against the run-count setting in model_params.
model_params['seedList'] = [
    441, 780, 328, 718, 184, 372, 346, 363, 701, 358,
    566, 451, 795, 237, 788, 185, 397, 530, 758, 633,
    632, 941, 641, 519, 162, 215, 578, 919, 917, 585,
    914, 326, 334, 366, 336, 413, 111, 599, 416, 230,
    191, 700, 697, 332, 910, 331, 771, 539, 575, 457
]

# Make sure the model directory exists before anything is written to it
if not os.path.exists(model_params['modelDir']):
    os.makedirs(model_params['modelDir'])

# Display the final parameter set as the cell output
model_params

## Prepare the data

In [None]:
# Settings that control how each input table is reshaped / which label is used
modis_channels = model_params['modisChannels']
prism_channels = model_params['prismChannels']
target_column = model_params['targetColumn']

# MODIS predictors: the first CSV column is used as the row index
modis_data = pd.read_csv(model_params['modisFilename'], index_col=0)
x_modis = reshape_data(np.array(modis_data), modis_channels)
print(f'Modis shape: {x_modis.shape}')

# PRISM predictors, handled the same way as MODIS
prism_data = pd.read_csv(model_params['prismFilename'], index_col=0)
x_prism = reshape_data(np.array(prism_data), prism_channels)
print(f'Prism shape: {x_prism.shape}')

# Auxiliary data; the target column of this table is the label series
aux_data = pd.read_csv(model_params['auxFilename'], index_col=0)
y = aux_data[target_column]

## Build and run the models
Builds and trains the LFMC models.

All models, predictions, evaluation statistics, and plots of test results are saved to `model_dir`, with each test and run saved to a separate sub-directory. For each model created, predictions and evaluation statistics are also returned as attributes of the `model` object. These are stored as nested lists; the structure for a full experiment is:
- Tests (omitted if not an experiment)
  - Runs (omitted for a single run)
    - Folds (for k-fold splitting)

In [None]:
def is_experiment():
    """Report whether the notebook-level ``experiment`` defines tests to run.

    Returns:
        bool: True when ``experiment['tests']`` is truthy. False when it is
        falsy (False, None, 0, empty list), or when it cannot be read because
        ``experiment`` is undefined, has no 'tests' key, or is not
        subscriptable.
    """
    try:
        tests = experiment['tests']
    except (NameError, KeyError, TypeError):
        # Only "there are no tests to read" failures mean "not an experiment";
        # anything else (e.g. KeyboardInterrupt) must propagate to the caller.
        return False
    return bool(tests)

In [None]:
# Model inputs keyed by data source
X = {'modis': x_modis, 'prism': x_prism}
if not is_experiment():
    # No tests defined: save the parameters and build models for a single test
    print('Running a single test')
    params_file = os.path.join(model_params['modelDir'], 'model_params.json')
    with open(params_file, 'w') as f:
        model_params.save(f)
    models = create_models(model_params, aux_data, X, y)
else:
    # Tests defined: run the whole experiment
    ex_models = run_experiment(experiment, model_params, aux_data, X, y)

## Display the training times
Times are TensorFlow/Keras model training times and exclude data preparation time.

In [None]:
# Build one summary row per test: total training time over all runs, mean
# training time per run (both divided by 60 - presumably seconds to minutes;
# confirm the unit of trainTime), and the trainable-weight count.
train_times = []
for test in ex_models:
    # Each run directory holds its own train_stats.csv
    run_stats = [
        pd.read_csv(os.path.join(run.model_dir, 'train_stats.csv'))
        for run in test
    ]
    total_time = sum(stats.trainTime[0] for stats in run_stats)
    # The weight count is read from the last run's stats
    weights = run_stats[-1].trainableWeights[0]
    ensemble_time = total_time / 60
    train_times.append([ensemble_time, ensemble_time / len(test), weights])
pd.DataFrame(
    train_times,
    index=experiment['testNames'],
    columns=['ensemble_time', 'single_time', 'num_params'],
).round(2)