# Create Ensembles for Main Models
- Generates ensembles for both the within-site and out-of-site models
- Ensemble sizes are 5, 10, 15, 20, and 25
- 50 ensembles of each size are created by sampling (without replacement) from the pool of models
- Saves the ensemble predictions and statistics
- Equivalent prediction and statistics files for the single models are also created

## Imports

In [1]:
import glob
import json
import numpy as np
import os
import pandas as pd
import random
from scipy import stats

import initialise
import common
from model_utils import generate_ensembles
from analysis_utils import samples_with_historical_data

## Directories and other settings
- Update the model directories as required

In [2]:
# Samples file used to determine which samples each model's predictions are evaluated on.
samples_file = os.path.join(common.DATASETS_DIR, 'samples_365days.csv')

# Model directories and their display names. The two lists are parallel: entry i of
# model_names labels entry i of model_dirs.
out_of_site_dir = os.path.join(common.MODELS_DIR, 'out-of-site_models')
within_site_dir = os.path.join(common.MODELS_DIR, 'within-site_models')
model_dirs = [out_of_site_dir, within_site_dir]
model_names = ['Out-of-site', 'Within-site']

# Floating point precision (decimal places) applied to the saved predictions.
precision = 3
# Seed handed to generate_ensembles.
random_seed = 9876

In [3]:
# The second entry of model_dirs supplies both the model parameters and a reference
# predictions file (taken from its 'run0' sub-directory).
reference_model_dir = model_dirs[1]
with open(os.path.join(reference_model_dir, 'model_params.json')) as params_file:
    model_params = json.load(params_file)

samples1 = pd.read_csv(samples_file, index_col=0)
temp_predict = pd.read_csv(
    os.path.join(reference_model_dir, 'run0', 'predictions.csv'),
    index_col=0,
)
# NOTE(review): presumably returns the subset of samples that have historical data for
# their site - confirm against analysis_utils.samples_with_historical_data.
samples2, _ = samples_with_historical_data(
    samples1,
    temp_predict,
    model_params['siteColumn'],
    model_params['yearColumn'],
)
# One index per entry of model_dirs, in the same order; used to select the prediction
# rows for each model.
samples_index = [samples1.index, samples2.index]

### Generate the ensembles and save predictions
Generate ensembles of various sizes for each of the models. For each ensemble size, randomly select the runs to ensemble, then create the ensembles. This is repeated for the desired number of ensembles of each size.

In [4]:
# For each model type: load the predictions from every run, build the ensembles, then save
# one predictions file and one statistics file per entry of common.ENSEMBLE_NAMES.
for m, model_dir in enumerate(model_dirs):
    print(f"Processing {model_names[m]} models; model directory {model_dir}")

    # glob returns matches in arbitrary order. Sort them so the pool of runs passed to
    # generate_ensembles (which uses a fixed random seed) is in the same order on every
    # machine and filesystem, making the ensembles reproducible.
    run_dirs = sorted(glob.glob(os.path.join(model_dir, 'run*')))
    if not run_dirs:
        raise FileNotFoundError(f"No run directories found in {model_dir}")

    # Keep only the rows for the samples that apply to this model.
    model_samples = samples_index[m]
    model_predicts = [
        pd.read_csv(os.path.join(run_dir, 'predictions.csv'), index_col=0).loc[model_samples]
        for run_dir in run_dirs
    ]

    predict, all_stats = generate_ensembles(model_predicts, common.ENSEMBLE_RUNS, common.ENSEMBLE_SIZES, random_seed=random_seed)

    print("Saving ensembles ...")
    for num, ens_name in enumerate(common.ENSEMBLE_NAMES):
        # Entry 0 is saved as the "single" model files; later entries are named after
        # the ensemble size (zero-padded to two digits).
        prefix = "single" if num == 0 else f"ensemble{common.ENSEMBLE_SIZES[num]:02}"
        file_name = f"{prefix}_{common.ANALYSIS_MODEL}.csv"
        stats_fname = f"{prefix}_stats.csv"
        print(f"   {ens_name}; predictions: {file_name}, stats: {stats_fname}")

        # Side-by-side ANALYSIS_MODEL predictions, one entry of predict[num] after another,
        # with columns renumbered from 0 and values rounded for storage.
        predictions_df = pd.concat(
            [pred_[common.ANALYSIS_MODEL] for pred_ in predict[num]],
            axis=1,
            ignore_index=True,
        ).round(precision)
        predictions_df.to_csv(os.path.join(model_dir, file_name))

        # Each entry of all_stats[num] is flattened with stack(); after the transpose
        # there is one column per entry.
        stats_df = pd.DataFrame([run.stack() for run in all_stats[num]]).T
        stats_df.to_csv(os.path.join(model_dir, stats_fname))

Processing Out-of-site models; model directory G:\My Drive\LFMC Data\multi_modal_LFMC\Models\out-of-site_models
Generating ensembles - size 1: 
Generating ensembles - size 5: .........10.........20.........30.........40.........50
Generating ensembles - size 10: .........10.........20.........30.........40.........50
Generating ensembles - size 15: .........10.........20.........30.........40.........50
Generating ensembles - size 20: .........10.........20.........30.........40.........50
Generating ensembles - size 25: .........10.........20.........30.........40.........50
Saving ensembles ...
   Single model; predictions: single_base.csv, stats: single_stats.csv
   Ensemble 5; predictions: ensemble05_base.csv, stats: ensemble05_stats.csv
   Ensemble 10; predictions: ensemble10_base.csv, stats: ensemble10_stats.csv
   Ensemble 15; predictions: ensemble15_base.csv, stats: ensemble15_stats.csv
   Ensemble 20; predictions: ensemble20_base.csv, stats: ensemble20_stats.csv
   Ensemble 25