# Create and Analyse Out-of-site Ensembles
- Uses the results obtained after training for 50 epochs. The models were trained for 100 epochs, with results saved every 25 epochs, so the results must be taken from the epoch50 directories, not the base run directories
- Ensemble sizes are 5, 10, 15, 20, and 25
- 50 ensembles of each size are created by sampling (without replacement) from the pool of models
- Ensemble results are saved, so once generated, re-runs can load the saved results.

In [1]:
import glob
import numpy as np
import os
import pandas as pd
import random
from scipy import stats

import initialise
from model_utils import generate_ensembles
from display_utils import display_frames

In [2]:
# Directory that holds the run* sub-directories of model results and receives
# the generated ensemble CSV files.
model_dir = r'G:\My Drive\LFMC Data\LFMC_ensembles\Models\out-of-site_models'

# Name of the model whose predictions and statistics are analysed.
model = 'base'

# Number of ensembles created for each ensemble size.
ensemble_runs = 50
# NOTE(review): not referenced by the cells visible here - presumably the
# number of trained models in the pool; confirm before relying on it.
model_runs = 50

# Ensemble sizes 5, 10, ..., 25, and the display label of every test
# (the single-model baseline first, then one label per ensemble size).
ensemble_sizes = [5 * multiple for multiple in range(1, 6)]
tests = ['Single model', *(f'Ensemble {size}' for size in ensemble_sizes)]

### Generate the ensembles
Run this section if the ensembles have not been generated

For each ensemble size, randomly select the runs to ensemble, then create the ensembles. This is repeated for the desired number of ensembles of each size.

In [3]:
# Load the predictions and the prediction statistics of every model run found
# under model_dir.
# glob returns its matches in an arbitrary, file-system dependent order, so the
# run directories are sorted (lexicographically) to hand the models to the
# seeded generate_ensembles call in the same order on every machine.
model_predicts = []
model_stats = []
for run_dir in sorted(glob.glob(os.path.join(model_dir, 'run*'))):
    # The first CSV column is used as the row index of both frames.
    model_predicts.append(pd.read_csv(os.path.join(run_dir, 'predictions.csv'), index_col=0))
    model_stats.append(pd.read_csv(os.path.join(run_dir, 'predict_stats.csv'), index_col=0))
# Size of the model pool that was loaded.
num_models = len(model_predicts)

In [5]:
# Fixed seed passed to generate_ensembles.
random_seed = 9876

# Size 1 is prepended so the single-model case is produced by the same call as
# the real ensemble sizes.
predict, all_stats = generate_ensembles(
    model_predicts,
    model_stats,
    ensemble_runs,
    [1] + ensemble_sizes,
    random_seed=random_seed,
)

# For each size: flatten every run's statistics frame with stack() and place
# the runs side by side, giving one column per run.
stacked_by_size = []
for size_stats in all_stats:
    flattened_runs = [run_stats.stack() for run_stats in size_stats]
    stacked_by_size.append(pd.DataFrame(flattened_runs).T)
all_stats = stacked_by_size

# Mean and variance of each statistic across the runs, reshaped back to a
# two-dimensional table per size.
means = [frame.mean(axis=1).unstack() for frame in all_stats]
variances = [frame.var(axis=1).unstack() for frame in all_stats]

Generating ensembles - size 1: 
Generating ensembles - size 5: .........10.........20.........30.........40.........50
Generating ensembles - size 10: .........10.........20.........30.........40.........50
Generating ensembles - size 15: .........10.........20.........30.........40.........50
Generating ensembles - size 20: .........10.........20.........30.........40.........50
Generating ensembles - size 25: .........10.........20.........30.........40.........50


#### Save Ensemble Predictions
Save the predictions made by the ensembles using the model of interest. The predictions for each ensemble size are stored in a separate CSV file. Columns are the individual ensemble predictions and rows are the samples.

In [8]:
# Number of decimal places kept in the saved predictions.
precision = 3
for num, test in enumerate(tests):
    # Entry 0 is the single-model case; entry k > 0 belongs to ensemble_sizes[k - 1].
    if num != 0:
        file_name = f"ensemble{ensemble_sizes[num-1]:02}_{model}.csv"
        stats_fname = f"ensemble{ensemble_sizes[num-1]:02}_stats.csv"
    else:
        file_name = f"single_{model}.csv"
        stats_fname = f"single_stats.csv"
    print(file_name)

    # One column per ensemble (renumbered 0..n-1), one row per sample.
    model_columns = [pred_[model] for pred_ in predict[num]]
    df = pd.concat(model_columns, axis=1, ignore_index=True).round(precision)

    predictions_path = os.path.join(model_dir, file_name)
    df.to_csv(predictions_path)

    # The statistics are written unrounded next to the predictions.
    stats_path = os.path.join(model_dir, stats_fname)
    all_stats[num].to_csv(stats_path)

single_base.csv
ensemble05_base.csv
ensemble10_base.csv
ensemble15_base.csv
ensemble20_base.csv
ensemble25_base.csv


### Load Ensemble Predictions
Run this section if the ensembles have already been generated

In [10]:
# The `y` column of run0's predictions file is reused as the `y` column of
# every reloaded prediction frame.
y = pd.read_csv(os.path.join(model_dir, 'run0', 'predictions.csv'), index_col=0).y
predict = []
all_stats = []
for num, test in enumerate(tests):
    # Entry 0 is the single-model case; entry k > 0 belongs to ensemble_sizes[k - 1].
    if num == 0:
        file_name = f'single_{model}.csv'
        stats_fname = 'single_stats.csv'
    else:
        file_name = f"ensemble{ensemble_sizes[num-1]:02}_{model}.csv"
        stats_fname = f"ensemble{ensemble_sizes[num-1]:02}_stats.csv"
    df = pd.read_csv(os.path.join(model_dir, file_name), index_col=0)
    # DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
    # (column label, column Series) pairs and exists in older releases too.
    predict.append([pd.DataFrame({'y': y, model: column}) for _label, column in df.items()])
    # Statistics are reloaded from the saved files rather than recomputed from
    # the predictions; the first two CSV columns form the row MultiIndex.
    all_stats.append(pd.read_csv(os.path.join(model_dir, stats_fname), index_col=[0, 1]))
# Mean and variance of each statistic across the runs (the columns).
means = [stats_.mean(axis=1).unstack() for stats_ in all_stats]
variances = [stats_.var(axis=1).unstack() for stats_ in all_stats]

### Means and Variances of test prediction statistics
Which test performed best?

In [11]:
# Build, for every test, a table holding the mean of each statistic and the
# bounds of its 95% confidence interval (Student's t) across the runs.
ci_dict = {}
for n, t in enumerate(all_stats):
    # Select the statistics of the chosen model and transpose, so that each
    # column of the stored frame (one run) becomes a row.
    a = t.loc[model].T
    # Use an explicit column-wise mean: np.mean(DataFrame) reduces over both
    # axes to a single scalar in newer pandas releases, which would centre
    # every statistic's interval on the same value.
    ci = stats.t.interval(0.95, len(a) - 1, loc=a.mean(axis=0), scale=stats.sem(a, axis=0))
    # Index the means with `model` (not a hard-coded 'base') so this row always
    # describes the same model as the interval bounds.
    ci_dict[tests[n]] = pd.DataFrame(
        [means[n].loc[model].array, ci[0], ci[1]],
        index=['mean', 'lower', 'upper'],
        columns=a.columns,
    )
display_frames(ci_dict.values(), tests, precision=2)

Unnamed: 0,Bias,R,R2,RMSE,ubRMSE
mean,0.62,0.7,0.48,26.75,26.73
lower,0.36,0.7,0.48,26.7,26.68
upper,0.88,0.7,0.49,26.8,26.78

Unnamed: 0,Bias,R,R2,RMSE,ubRMSE
mean,0.65,0.73,0.53,25.59,25.58
lower,0.55,0.73,0.53,25.57,25.56
upper,0.76,0.73,0.53,25.61,25.6

Unnamed: 0,Bias,R,R2,RMSE,ubRMSE
mean,0.63,0.73,0.53,25.44,25.43
lower,0.58,0.73,0.53,25.43,25.42
upper,0.69,0.73,0.53,25.46,25.45

Unnamed: 0,Bias,R,R2,RMSE,ubRMSE
mean,0.65,0.73,0.53,25.4,25.39
lower,0.59,0.73,0.53,25.39,25.38
upper,0.71,0.73,0.53,25.41,25.4

Unnamed: 0,Bias,R,R2,RMSE,ubRMSE
mean,0.58,0.73,0.54,25.36,25.35
lower,0.53,0.73,0.53,25.35,25.34
upper,0.63,0.73,0.54,25.37,25.36

Unnamed: 0,Bias,R,R2,RMSE,ubRMSE
mean,0.62,0.73,0.54,25.36,25.35
lower,0.59,0.73,0.54,25.35,25.34
upper,0.66,0.73,0.54,25.36,25.36


### Results Summary
Create a data frame containing the results to show in the paper.

In [12]:
# Number of decimal places used for the values in the summary table.
precision = 2
df_list = []
for num, test in enumerate(tests):
    ci_frame = ci_dict[test]

    # RMSE and R2 mean / lower / upper values, keyed as e.g. 'RMSE_mean'.
    rmse_entries = ci_frame['RMSE'].round(precision).add_prefix('RMSE_').to_dict()
    r2_entries = ci_frame['R2'].round(precision).add_prefix('R2_').to_dict()

    # Variance of the (rounded) predictions across ensembles for each sample,
    # averaged over all samples.
    model_columns = [pred_[model] for pred_ in predict[num]]
    df = pd.concat(model_columns, axis=1, ignore_index=True).round(precision)
    prediction_variance = round(df.var(axis=1).mean(), precision)

    # Key order fixes the column order of the table: RMSE, Pred_var, R2.
    df_list.append({**rmse_entries, 'Pred_var': prediction_variance, **r2_entries})
pd.DataFrame(df_list, index=tests)

Unnamed: 0,RMSE_mean,RMSE_lower,RMSE_upper,Pred_var,R2_mean,R2_lower,R2_upper
Single model,26.75,26.7,26.8,76.19,0.48,0.48,0.49
Ensemble 5,25.59,25.57,25.61,13.97,0.53,0.53,0.53
Ensemble 10,25.44,25.43,25.46,6.07,0.53,0.53,0.53
Ensemble 15,25.4,25.39,25.41,3.61,0.53,0.53,0.53
Ensemble 20,25.36,25.35,25.37,2.26,0.54,0.53,0.54
Ensemble 25,25.36,25.35,25.36,1.52,0.54,0.54,0.54
