# Create Ensembles for Comparison Models
- Generates ensembles for both the within-site and out-of-site models
- Ensemble size is 20
- 50 ensembles are created by sampling (without replacement) from the pool of models
- Save the ensemble predictions and statistics

## Imports

In [None]:
import glob
import json
import os
import pandas as pd
import random

import initialise
import common
from analysis_utils import calc_statistics, samples_with_historical_data
from model_utils import generate_ensembles

## Directories and other settings
- Update the model directories as required

In [None]:
# --- Input locations ---
model_dir = os.path.join(common.MODELS_DIR, 'comparison_models')
samples_file = os.path.join(common.DATASETS_DIR, 'samples_365days.csv')

# --- Ensemble configuration, read from the shared `common` settings ---
ensemble_runs = common.ENSEMBLE_RUNS
ensemble_size = common.ENSEMBLE_SIZE
ensemble_model = common.ANALYSIS_MODEL

# --- Output formatting and reproducibility ---
precision = 3  # floating point precision for saved predictions
random_seed = 9876
random.seed(random_seed)  # seed Python's global random number generator

# Tests 0 & 1 are the Multi-tempCNN comparisons, which need ensembles.
ensemble_range = range(0, 2)

# Tests 2 & 3 are the Modis-tempCNN comparisons, which need single models.
single_model = common.MODIS_TEMPCNN_MODEL
single_range = range(2, 4)

In [None]:
# Read the model parameters stored alongside the comparison models.
params_path = os.path.join(model_dir, 'model_params.json')
with open(params_path, 'r') as params_file:
    model_params = json.load(params_file)

# Load the full sample table and the predictions of one reference run
# (test0/run0); both are indexed by their first CSV column.
samples1 = pd.read_csv(samples_file, index_col=0)
reference_run = os.path.join(model_dir, 'test0', 'run0', 'predictions.csv')
temp_predict = pd.read_csv(reference_run, index_col=0)

# samples2 is the first value returned by samples_with_historical_data
# (presumably the samples that have historical data - confirm against
# analysis_utils).
site_col = model_params['siteColumn']
year_col = model_params['yearColumn']
samples2, _ = samples_with_historical_data(samples1, temp_predict, site_col, year_col)

# Sample index to use for each test, positioned by test number:
# tests 0 and 2 use samples2, tests 1 and 3 use samples1.
samples_index = [samples2.index, samples1.index] * 2

### Generate the ensembles
Generate ensembles of the configured size for each of the ensemble tests. For each test, randomly select the runs to ensemble, then create the ensembles.

In [None]:
# Load the predictions of every run of each ensemble test, restricted to the
# samples used by that test.
model_predicts = []
for i in ensemble_range:
    # glob returns paths in arbitrary order. Sort them so that the seeded
    # ensemble sampling is reproducible across machines and file systems.
    # The (length, path) key keeps unpadded run numbers in numeric order
    # (run2 before run10).
    run_dirs = sorted(
        glob.glob(os.path.join(model_dir, f'test{i}', 'run*')),
        key=lambda path: (len(path), path),
    )
    test_predicts = [
        pd.read_csv(os.path.join(run_dir, 'predictions.csv'), index_col=0).loc[samples_index[i]]
        for run_dir in run_dirs
    ]
    model_predicts.append(test_predicts)
num_models = len(model_predicts[0])

# Use the notebook-level settings so that changes made in the settings cell
# take effect here.
predict, all_stats = generate_ensembles(model_predicts, ensemble_runs, ensemble_size, random_seed=random_seed)

### Save Ensemble Predictions
Save the predictions made by the ensembles using the model of interest. Columns are the individual ensemble predictions and rows are the samples.

In [None]:
# The output file names do not depend on the test, so build them once.
file_name = f"ensemble{ensemble_size}_{ensemble_model}.csv"
stats_fname = f"ensemble{ensemble_size}_stats.csv"

for num in ensemble_range:
    test_name = f'test{num}'
    predictions_path = os.path.join(model_dir, test_name, file_name)
    stats_path = os.path.join(model_dir, test_name, stats_fname)
    print(predictions_path)
    print(stats_path)

    # One column per ensemble, one row per sample. Select the column with the
    # same `ensemble_model` setting that names the output file, so the file
    # name and its contents cannot disagree.
    df = pd.concat([pred_[ensemble_model] for pred_ in predict[num]], axis=1, ignore_index=True).round(precision)
    df.to_csv(predictions_path)

    # One column per ensemble; each ensemble's statistics table is stacked
    # into a single column.
    df = pd.DataFrame([run.stack() for run in all_stats[num]]).T
    df.to_csv(stats_path)

### Merge all model Predictions
Merge the predictions from each run of the Modis-tempCNN models

In [None]:
# Load the predictions of every run of each single-model test, restricted to
# the samples used by that test.
model_predicts = []
for i in single_range:
    # glob returns paths in arbitrary order. Sort them so that the column
    # order of the saved predictions and statistics is deterministic.
    # The (length, path) key keeps unpadded run numbers in numeric order
    # (run2 before run10).
    run_dirs = sorted(
        glob.glob(os.path.join(model_dir, f'test{i}', 'run*')),
        key=lambda path: (len(path), path),
    )
    test_predicts = [
        pd.read_csv(os.path.join(run_dir, 'predictions.csv'), index_col=0).loc[samples_index[i]]
        for run_dir in run_dirs
    ]
    model_predicts.append(test_predicts)
num_models = len(model_predicts[0])


### Save Modis-tempCNN Predictions
Save the predictions made by the Modis-tempCNN models. Columns are the individual model predictions and rows are the samples.

In [None]:
# The output file names do not depend on the test, so build them once.
file_name = f"predictions_{single_model}.csv"
stats_fname = "predictions_stats.csv"

# model_predicts is ordered like single_range, so enumerate gives the
# position of each test's predictions in that list.
for offset, num in enumerate(single_range):
    test_name = f'test{num}'
    predictions_path = os.path.join(model_dir, test_name, file_name)
    stats_path = os.path.join(model_dir, test_name, stats_fname)
    print(predictions_path)
    print(stats_path)

    preds = model_predicts[offset]

    # One column per run, one row per sample.
    df = pd.concat([pred_[single_model] for pred_ in preds], axis=1, ignore_index=True).round(precision)
    df.to_csv(predictions_path)

    run_stats = []
    for p in preds:
        # DataFrame.iteritems() was removed in pandas 2.0; items() yields the
        # same (column name, Series) pairs.
        # Every run is scored against the 'y' column of the first run
        # (assumes all runs hold the same 'y' values - confirm).
        s = {
            name: calc_statistics(preds[0].y, column)
            for name, column in p.drop('y', axis=1).items()
        }
        run_stats.append(pd.DataFrame.from_dict(s, orient='index'))

    # One column per run; each run's statistics table is stacked into a
    # single column.
    stats_df = pd.DataFrame([run.stack() for run in run_stats]).T
    stats_df.to_csv(stats_path)