# Create Ensembles for Main Models
- Generates ensembles for both the yearly and out-of-site models
- Ensemble sizes are 5, 10, 15, 20, and 25
- 50 ensembles of each size are created by sampling (without replacement) from the pool of models
- Saves the ensemble predictions and statistics
- Equivalent prediction and statistics files for the single models are also created

## Imports

In [1]:
import glob
import numpy as np
import os
import pandas as pd
import random
from scipy import stats

import initialise
from model_utils import generate_ensembles
from display_utils import display_frames

## Directories and other settings
- Update the model directories as required

In [6]:
# Root directories holding the individual model runs, one per model type.
model_dirs = [
    r'G:\My Drive\LFMC Data\LFMC_ensembles\Models\out-of-site_models',
    r'G:\My Drive\LFMC Data\LFMC_ensembles\Models\yearly_models',
]
# Display names for the model types ('Yearly' is currently left out).
# NOTE(review): this list is shorter than model_dirs, so code that indexes
# model_names by a position in model_dirs will fail for the yearly directory.
model_names = ['Out-of-site']

# Settings for the individual models and the ensembles built from them
model = 'base'       # the base model is used, not any of the derived ones
model_runs = 50      # total number of individual models created
ensemble_runs = 50   # number of ensembles generated for each size
precision = 3        # decimal places kept in the saved predictions
random_seed = 9876

# Required ensemble sizes. Size 1 (a single model) comes first so that
# single-model results are available for comparison with the ensembles.
ensemble_sizes = [1, *range(5, 30, 5)]
ensemble_names = ['Single model', *(f'Ensemble {size}' for size in ensemble_sizes[1:])]

### Generate the ensembles and save predictions
Generate ensembles of various sizes for each of the models. For each ensemble size, randomly select the runs to ensemble, then create the ensembles. This is repeated for the desired number of ensembles of each size.

In [7]:
# For each model type: read the predictions and statistics of every run,
# generate the ensembles, then write one predictions file and one statistics
# file per ensemble size into the model directory.
#
# Names and directories are paired with zip(), so a directory without a
# matching entry in model_names is skipped. Indexing model_names by the
# position in model_dirs raised IndexError when model_names was shorter.
for model_name, model_dir in zip(model_names, model_dirs):
    print(f"Processing {model_name} models; model directory {model_dir}")
    model_predicts = []
    model_stats = []
    # glob returns paths in arbitrary order; sorting makes the order of the
    # runs handed to the seeded generate_ensembles call independent of the
    # file system.
    for run_dir in sorted(glob.glob(os.path.join(model_dir, 'run*'))):
        model_predicts.append(pd.read_csv(os.path.join(run_dir, 'predictions.csv'), index_col=0))
        model_stats.append(pd.read_csv(os.path.join(run_dir, 'predict_stats.csv'), index_col=0))

    predict, all_stats = generate_ensembles(model_predicts, model_stats, ensemble_runs, ensemble_sizes, random_seed=random_seed)

    print("Saving ensembles ...")
    for num, ens_name in enumerate(ensemble_names):
        # The first entry holds the single models; the others are ensembles
        # whose file names carry the zero-padded ensemble size.
        prefix = "single" if num == 0 else f"ensemble{ensemble_sizes[num]:02}"
        file_name = f"{prefix}_{model}.csv"
        stats_fname = f"{prefix}_stats.csv"
        print(f"   {ens_name}; predictions: {file_name}, stats: {stats_fname}")

        # Predictions: the selected model's column from each ensemble,
        # placed side by side and rounded before saving.
        predict_df = pd.concat([pred_[model] for pred_ in predict[num]], axis=1, ignore_index=True).round(precision)
        predict_df.to_csv(os.path.join(model_dir, file_name))

        # Statistics: each ensemble's stats frame is stacked into a Series;
        # after the transpose each ensemble is one column.
        stats_df = pd.DataFrame([run.stack() for run in all_stats[num]]).T
        stats_df.to_csv(os.path.join(model_dir, stats_fname))

Processing Out-of-site, model directory G:\My Drive\LFMC Data\LFMC_ensembles\Models\out-of-site_models
Generating ensembles - size 1: 
Generating ensembles - size 5: .........10.........20.........30.........40.........50
Generating ensembles - size 10: .........10.........20.........30.........40.........50
Generating ensembles - size 15: .........10.........20.........30.........40.........50
Generating ensembles - size 20: .........10.........20.........30.........40.........50
Generating ensembles - size 25: .........10.........20.........30.........40.........50
Saving ensembles ...
   Single model; predictions: single_base.csv, stats: single_stats.csv
   Ensemble 5; predictions: ensemble05_base.csv, stats: ensemble05_stats.csv
   Ensemble 10; predictions: ensemble10_base.csv, stats: ensemble10_stats.csv
   Ensemble 15; predictions: ensemble15_base.csv, stats: ensemble15_stats.csv
   Ensemble 20; predictions: ensemble20_base.csv, stats: ensemble20_stats.csv
   Ensemble 25; predict