# Create Ensembles for Ablations and Omit-One tests
- Generates ensembles for both the within-site and out-of-site models
- Ensemble size is 20
- 50 ensembles are created by sampling (without replacement) from the pool of models
- Save the ensemble predictions and statistics

## Imports

In [None]:
import glob
import json
import os
import pandas as pd

import initialise
import common
from model_utils import generate_ensembles
from analysis_utils import samples_with_historical_data

## Directories and other settings
- Update the model directories as required

In [None]:
# CSV file of samples; read in a later cell to build the sample indexes.
samples_file = os.path.join(common.DATASETS_DIR, 'samples_365days.csv')

# One directory per experiment. The order matters: later cells refer to
# entries of this list by position.
model_dirs = [
    os.path.join(common.MODELS_DIR, experiment)
    for experiment in (
        'out-of-site_ablation',
        'within-site_ablation',
        'out-of-site_omit_one',
        'within-site_omit_one',
    )
]

precision = 3       # floating point precision for saved predictions
random_seed = 9876  # seed passed to generate_ensembles

In [None]:
# Load the model parameters saved with the second experiment (model_dirs[1]).
# The encoding is given explicitly so that decoding the JSON file does not
# depend on the platform's default locale encoding.
with open(os.path.join(model_dirs[1], 'model_params.json'), 'r', encoding='utf-8') as f:
    model_params = json.load(f)

In [None]:
# Location of the first run's predictions, relative to an experiment directory.
first_run_predictions = os.path.join('test0', 'run0', 'predictions.csv')

# All samples, indexed by the first column of the samples file.
samples1 = pd.read_csv(samples_file, index_col=0)

# Predictions of the third experiment, aligned to the full samples index.
# NOTE(review): predict1 is not used by any cell visible here - confirm whether it is still needed.
predict1 = pd.read_csv(
    os.path.join(model_dirs[2], first_run_predictions),
    index_col=0,
).reindex(samples1.index)

# Predictions of the second experiment, used only to derive the second sample set.
temp_predict = pd.read_csv(
    os.path.join(model_dirs[1], first_run_predictions),
    index_col=0,
)

# presumably keeps only the samples that have historical data - verify against analysis_utils
samples2, _ = samples_with_historical_data(
    samples1,
    temp_predict,
    model_params['siteColumn'],
    model_params['yearColumn'],
)

# One index per entry of model_dirs, in the same order.
samples_index = [
    samples1.index,
    samples2.index,
    samples1.index,
    samples2.index,
]

### Read Test Predictions
Read the prediction files for every run of each test and retain the predictions that match the samples index. The prediction statistics are not calculated at this stage; they are produced later, when the ensembles are generated.

In [None]:
def read_predictions(model_dir, samples_index):
    """Read the predictions of every run of every test found under ``model_dir``.

    Parameters
    ----------
    model_dir
        Directory containing ``test*`` sub-directories, each of which contains
        ``run*`` sub-directories holding a ``predictions.csv`` file.
    samples_index
        Row labels to retain from each predictions file (passed to ``.loc``,
        so a label missing from a file raises ``KeyError``).

    Returns
    -------
    model_predicts
        List with one entry per test; each entry is a list with one
        DataFrame per run.
    test_dirs
        The test directories, in the same order as ``model_predicts``.
    """
    # Sorting is lexicographic on the path (so e.g. 'test10' precedes 'test2').
    test_dirs = sorted(glob.glob(os.path.join(model_dir, 'test*')))
    model_predicts = []
    for test_dir in test_dirs:
        # glob returns matches in arbitrary, filesystem-dependent order; sort the
        # runs so that the pool order (and hence any seeded sampling from it)
        # is the same on every machine.
        run_dirs = sorted(glob.glob(os.path.join(test_dir, 'run*')))
        model_predicts.append([
            pd.read_csv(os.path.join(run_dir, 'predictions.csv'), index_col=0).loc[samples_index]
            for run_dir in run_dirs
        ])
    return model_predicts, test_dirs

### Save Ensemble Predictions
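Save the predictions made by the ensembles using the model of interest. For each test, the ensemble predictions are stored in one CSV file, where columns are the individual ensemble predictions and rows are the samples. The ensemble statistics are stored in a separate CSV file in the same test directory.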

In [None]:
def write_ensembles(test_dirs, model_predicts, model_stats, precision):
    """Write the ensemble predictions and statistics of each test to CSV files.

    Two files are written into each test directory:
    ``ensemble{ENSEMBLE_SIZE}_{ANALYSIS_MODEL}.csv`` (predictions) and
    ``ensemble{ENSEMBLE_SIZE}_stats.csv`` (statistics). Both paths are printed
    before writing.

    Parameters
    ----------
    test_dirs
        Output directory of each test, indexed like ``model_predicts``.
    model_predicts
        One entry per test; each entry is an iterable of per-ensemble
        prediction frames containing a ``common.ANALYSIS_MODEL`` column.
    model_stats
        One entry per test; each entry is an iterable of objects providing
        ``.stack()`` (presumably DataFrames of statistics - confirm against
        ``generate_ensembles``).
    precision
        Number of decimal places the saved predictions are rounded to.
    """
    # The file names depend only on constants, so build them once.
    file_name = f"ensemble{common.ENSEMBLE_SIZE}_{common.ANALYSIS_MODEL}.csv"
    stats_fname = f"ensemble{common.ENSEMBLE_SIZE}_stats.csv"
    for num, ensemble_predicts in enumerate(model_predicts):
        # The output directory comes from the test_dirs argument only; the
        # function does not depend on any notebook-level variable.
        test_dir = test_dirs[num]
        predicts_path = os.path.join(test_dir, file_name)
        stats_path = os.path.join(test_dir, stats_fname)
        print(predicts_path)
        print(stats_path)
        # One column per ensemble (renumbered 0..n-1), one row per sample.
        predicts_df = pd.concat(
            [pred_[common.ANALYSIS_MODEL] for pred_ in ensemble_predicts],
            axis=1,
            ignore_index=True,
        ).round(precision)
        predicts_df.to_csv(predicts_path)
        # Flatten each ensemble's statistics with stack(), then transpose so
        # that each ensemble becomes a column.
        stats_df = pd.DataFrame([run.stack() for run in model_stats[num]]).T
        stats_df.to_csv(stats_path)

### Generate the ensembles
Generate the ensembles for each of the models. For each model, randomly select the runs to ensemble, then create the ensemble. This is repeated for the desired number of ensembles, and the resulting predictions and statistics are saved.

In [None]:
# Process every experiment directory in turn: read the per-run predictions,
# build the ensembles, and save the results into the experiment's test directories.
# test_num is the position of the experiment in model_dirs; it selects the
# matching entry of samples_index.
for test_num, model_dir in enumerate(model_dirs):
    print(f'Processing experiment {test_num}: {model_dir}')
    # Keep only the rows belonging to this experiment's sample index.
    model_predicts, test_dirs = read_predictions(model_dir, samples_index[test_num])
    # The same random_seed is passed for every experiment.
    predict, all_stats = generate_ensembles(model_predicts, common.ENSEMBLE_RUNS, common.ENSEMBLE_SIZE, random_seed=random_seed)
    write_ensembles(test_dirs, predict, all_stats, precision)