# Detailed Analysis of Results for Bound Vanish Synthetic Data Generated from a Single Window

In [None]:
import red
import numpy as np
import os
import pickle as pkl
from tqdm import tqdm
import pandas as pd
import seaborn as sns

from matplotlib.axes import Axes
from matplotlib.figure import Figure
from matplotlib import gridspec
import matplotlib.pyplot as plt
plt.style.use('ggplot')

colors = sns.color_palette('colorblind')

def get_subplots(systems: list[str], scale_x: float = 1, scale_y: float =1) -> tuple[plt.Figure, list[plt.Axes]]:
    """
    Create a grid of axes (at most two columns wide) with one panel per system.

    Each populated panel is titled with its system name. Any surplus panel
    in the grid is deleted from the figure, but it is still present in the
    returned collection of axes.

    Parameters
    ----------
    systems : list[str]
        Names of the systems; one panel is created for each.
    scale_x, scale_y : float
        Multipliers applied to the default panel size of 3 x 3.

    Returns
    -------
    tuple
        The figure and a flat collection of its axes.
    """
    n_systems = len(systems)
    n_cols = min(2, n_systems)
    n_rows = int(np.ceil(n_systems / n_cols))
    fig_size = (scale_x*3 * n_cols, scale_y*3*n_rows)
    fig, axs = plt.subplots(n_rows, n_cols, figsize=fig_size)

    # A 1 x 1 grid comes back as a bare Axes rather than an array
    axs = [axs] if n_systems == 1 else axs.flatten()

    # Label the populated panels
    for panel, system in zip(axs, systems):
        panel.set_title(system)

    # Drop whatever is left over at the end of the grid
    for spare_panel in axs[n_systems:]:
        fig.delaxes(spare_panel)

    return fig, axs

def sanitise_name(name: str) -> str:
    """
    Return a display-friendly version of ``name``.

    Underscores are replaced by spaces, runs of whitespace within a line are
    collapsed to a single space, and the first character of every word is
    upper-cased (the rest of the word is left untouched). Line breaks in the
    input are preserved, so multi-line labels stay multi-line.

    Parameters
    ----------
    name : str
        The raw name, e.g. ``"fixed_fraction\\nmethod"``.

    Returns
    -------
    str
        The sanitised name.
    """
    sanitised_lines = []
    # Work line by line: a plain str.split() on the whole string would also
    # swallow the newlines, making it impossible to restore them afterwards.
    for line in name.replace('_', ' ').split('\n'):
        capitalised_words = [word[0].upper() + word[1:] for word in line.split()]
        sanitised_lines.append(' '.join(capitalised_words))

    return '\n'.join(sanitised_lines)


In [None]:
# NOTE(review): the next code cell starts with exactly the same load, so this
# cell looks redundant - confirm before removing it.
with open("../compute_equil_times/output_single/synthetic_data_bound_vanish_with_equil_times.pkl", "rb") as f:
    data = pkl.load(f)

## Load and process data <a id="load"></a>

In [None]:
# Load the pickled results dictionary, which is keyed by dataset name
with open("../compute_equil_times/output_single/synthetic_data_bound_vanish_with_equil_times.pkl", "rb") as f:
    data = pkl.load(f)

datasets = list(data.keys())
# Sort datasets to put standard first. The key is False only for "standard"
# and sorted() is stable, so all other datasets keep their original order.
datasets = sorted(datasets, key=lambda x: x != "standard")
# Every key of a dataset other than "times" is treated as a system
systems = [system for system in data[datasets[0]].keys() if system != "times"]
# Method names are read from entry 0 of the first system, skipping the "data" key
# (presumably the raw series rather than a method's results - TODO confirm)
methods = [method for method in data[datasets[0]][systems[0]][0].keys() if method != "data"]

# Create version of the data with MIF only
mif_data = {dataset: data[dataset]["MIF"] for dataset in datasets}

# Separate out automated and fixed methods; fixed methods are recognised by
# "Discard" appearing in their name
automated_methods = [method for method in methods if "Discard" not in method]
fixed_methods = [method for method in methods if "Discard" in method]

def bootstrap(data: np.ndarray, fn: callable = lambda x: x, n_bootstraps: int = 10_000) -> tuple[np.ndarray, np.ndarray]:
    """
    Bootstrap the mean of ``fn(data)`` and return its 95 % confidence interval.

    For every bootstrap iteration the data are resampled with replacement
    (using the global NumPy random state), ``fn`` is applied to the resample
    and the mean of the result is recorded. If ``fn`` already reduces the
    resample to a single number (e.g. ``np.var``), taking the mean leaves it
    unchanged.

    Parameters
    ----------
    data : np.ndarray
        One-dimensional sample to resample from.
    fn : callable
        Function applied to each resample before averaging.
    n_bootstraps : int
        Number of bootstrap resamples.

    Returns
    -------
    tuple
        The 2.5th and 97.5th percentiles of the bootstrap statistics, and the
        array of all bootstrap statistics.
    """
    n_samples = len(data)

    def _resampled_statistic() -> float:
        # Draw values (not indices) with replacement, then reduce
        resample = np.random.choice(data, size=n_samples, replace=True)
        return np.mean(fn(resample))

    boot_stats = np.fromiter(
        (_resampled_statistic() for _ in range(n_bootstraps)),
        dtype=float,
        count=n_bootstraps,
    )
    ci_bounds = np.percentile(boot_stats, [2.5, 97.5])
    return ci_bounds, boot_stats

# Skip slow bootstrap calculations if FIGURES_ONLY is set
# NOTE(review): any non-empty value of FIGURES_ONLY (including "0") is truthy here.
if not os.environ.get("FIGURES_ONLY", False):

    # Compute statistics for all datasets.
    # overall_results holds summary numbers and confidence intervals,
    # distributions holds the per-entry values and the bootstrap samples.
    overall_results = {}
    distributions = {}

    # NOTE(review): an earlier comment here referred to defining picklable
    # functions for multiprocessing, but nothing is defined at this point.

    for dataset in tqdm(datasets):
        overall_results[dataset] = {}
        distributions[dataset] = {}
        for system in systems:
            overall_results[dataset][system] = {}
            distributions[dataset][system] = {}

            for method in methods:
                overall_results[dataset][system][method] = {}
                distributions[dataset][system][method] = {}
                # Results of this method for every entry stored under the system
                local_data = {i: data[dataset][system][i][method] for i in data[dataset][system]}

                # Basic stats
                means = np.array([local_data[i]["mean"] for i in local_data])
                bias = means.mean()
                variance = means.var()

                # MUEs and RMSEs are easy to calculate as the true answer is 0
                mues = abs(means)
                mue = mues.mean()
                mses = means ** 2
                rmse = np.sqrt(mses.mean())

                # Check time taken for the calculations
                times = np.array([local_data[i]["time"] for i in local_data])
                time = times.mean()

                # Fraction of data discarded
                fracs = np.array([local_data[i]["frac_discarded"] for i in local_data])
                frac_discarded = fracs.mean()

                # Get the 95 % confidence intervals
                # (the order of these calls fixes how the global NumPy random
                # stream is consumed)
                mean_ci, mean_distr = bootstrap(means)
                frac_ci, frac_distr = bootstrap(fracs)
                time_ci, time_distr = bootstrap(times)
                var_ci, var_distr = bootstrap(means, np.var)
                mue_ci, mue_distr = bootstrap(means, abs)
                # CIs for the RMSE - need to square root to get the RMSE
                # (the bootstrap itself is done on the mean squared error)
                rmse_ci, rmse_distr = bootstrap(means, np.square)
                rmse_ci = np.sqrt(rmse_ci)
                rmse_distr = np.sqrt(rmse_distr)

                # Save the results
                overall_results[dataset][system][method]["bias"] = bias
                overall_results[dataset][system][method]["variance"] = variance
                overall_results[dataset][system][method]["mue"] = mue
                overall_results[dataset][system][method]["rmse"] = rmse
                overall_results[dataset][system][method]["time"] = time
                overall_results[dataset][system][method]["frac_discarded"] = frac_discarded
                overall_results[dataset][system][method]["mean_ci"] = mean_ci
                overall_results[dataset][system][method]["frac_ci"] = frac_ci
                overall_results[dataset][system][method]["time_ci"] = time_ci
                overall_results[dataset][system][method]["var_ci"] = var_ci
                overall_results[dataset][system][method]["mue_ci"] = mue_ci
                overall_results[dataset][system][method]["rmse_ci"] = rmse_ci

                # Save the distributions
                distributions[dataset][system][method]["means"] = means
                distributions[dataset][system][method]["fracs"] = fracs
                distributions[dataset][system][method]["times"] = times
                distributions[dataset][system][method]["mues"] = mues
                distributions[dataset][system][method]["mses"] = mses

                # Save the bootstrap distributions
                distributions[dataset][system][method]["mean_boot"] = mean_distr
                distributions[dataset][system][method]["frac_boot"] = frac_distr
                distributions[dataset][system][method]["time_boot"] = time_distr
                distributions[dataset][system][method]["var_boot"] = var_distr
                distributions[dataset][system][method]["mue_boot"] = mue_distr
                distributions[dataset][system][method]["rmse_boot"] = rmse_distr


In [None]:
# Persist freshly computed statistics, or reload the cached ones when the
# slow calculations were skipped (FIGURES_ONLY set)
if not os.environ.get("FIGURES_ONLY", False):
    with open("output_single/overall_results.pkl", "wb") as f:
        pkl.dump(overall_results, f)

    with open("output_single/distributions.pkl", "wb") as f:
        pkl.dump(distributions, f)

else:
    with open("output_single/overall_results.pkl", "rb") as f:
        overall_results = pkl.load(f)

    with open("output_single/distributions.pkl", "rb") as f:
        distributions = pkl.load(f)

## Summary Tables

In [None]:
# Write one LaTeX table of RMSEs per dataset: a row for each automated method
# and a column for each system. Cells show the RMSE with the lower/upper
# confidence bounds as sub-/superscripts.

for dataset in datasets:
    table_rows = []
    for method in automated_methods:
        table_row = {"Method": method}
        for system in systems:
            stats = overall_results[dataset][system][method]
            rmse = stats["rmse"]
            lower_ci, upper_ci = stats["rmse_ci"][0], stats["rmse_ci"][1]
            table_row[system] = f"${rmse:.2f}_{{{lower_ci:.2f}}}^{{{upper_ci:.2f}}}$"
        table_rows.append(table_row)

    # Left-aligned method column followed by one centred column per system
    column_format = "l" + "c" * len(systems)
    pd.DataFrame(table_rows).to_latex(f"output_single/rmse_table_{dataset}.tex", index=False, escape=False, column_format=column_format)

## Load Data Creation Parameters and Plot Bias, SEM and RMSE <a id="plot"></a>

In [None]:
# Load the pickled parameters written by the synthetic data creation step.
# Later cells read params[system]["exp_params"], ["fast_exp_params"] and
# ["autocov_convex"].
with open("../synthetic_data_creation/output_single/synthetic_data_params.pkl", "rb") as f:
    params = pkl.load(f)

In [None]:
def exp_decay(x: np.ndarray, a: float, b: float) -> np.ndarray:
    """Evaluate the exponential decay ``a * exp(-b * x)`` at ``x``."""
    decay_factor = np.exp(-b * x)
    return a * decay_factor

def compute_bias(times: np.ndarray,
                 exp_params: tuple[float, float],
                 fast_exp_params: tuple[float, float]) -> np.ndarray:
    """
    Compute the bias of the mean for every possible truncation point.

    The instantaneous bias at each time is the sum of two exponential decays
    (``exp_params`` and ``fast_exp_params``, each an ``(a, b)`` pair passed to
    ``exp_decay``). Element ``i`` of the result is the average of the
    instantaneous bias over points ``i`` to the end of the series, i.e. the
    bias of the mean if everything before point ``i`` is discarded.

    Parameters
    ----------
    times : np.ndarray
        Times at which the series was sampled.
    exp_params, fast_exp_params : tuple[float, float]
        Amplitude and rate of the slow and fast exponential decays.

    Returns
    -------
    np.ndarray
        Mean bias for each truncation point; same length as ``times``.
    """
    # First, compute the bias at each point in the series
    bias = exp_decay(times, *exp_params) + exp_decay(times, *fast_exp_params)

    # Tail sums via a reversed cumulative sum: O(n) rather than re-summing
    # bias[i:] from scratch for every i
    tail_sums = np.cumsum(bias[::-1])[::-1]
    n_points = np.arange(len(bias), 0, -1)

    return tail_sums / n_points

def compute_mean_variance(times: np.ndarray,
                          autocov_series: np.ndarray) -> float:
    """
    Compute the mean (correlation-inflated) variance over the times passed.

    For each point, the autocovariances of the lags assigned to it are summed,
    doubled (backward correlations mirror the forward ones) and added to the
    lag-0 variance. The result is the average of this quantity over all points.

    Parameters
    ----------
    times : np.ndarray
        The times of the (possibly truncated) series. Only its length is used.
    autocov_series : np.ndarray
        Autocovariance as a function of lag, with lag 0 first.

    Returns
    -------
    float
        The mean variance over all points. If ``autocov_series`` contains only
        the lag-0 term, this is simply that variance.
    """
    # Precompute cumulative sums: entry k is the sum of lags 1..k+1
    cumsum_autocov = np.cumsum(autocov_series[1:])
    uncor_variance = autocov_series[0]

    if cumsum_autocov.size == 0:
        # No correlated lags available: every point contributes only the lag-0 term
        forward_cor_variances = np.zeros(len(times))
    else:
        # Point i has len(times) - i points remaining (itself included)
        remaining_points = len(times) - np.arange(len(times))
        # Clamp the lookup so that points with more remaining data than known
        # lags use the full cumulative sum
        # NOTE(review): a point with r remaining points has only r - 1 later
        # points, yet index r - 1 sums lags 1..r - confirm the window is intended.
        lookup = np.minimum(remaining_points - 1, len(cumsum_autocov) - 1)
        forward_cor_variances = cumsum_autocov[lookup].astype(float)

    # Backward correlations are just the same as forward correlations, but in reverse,
    # so simply double and add uncorrelated variance
    return np.mean(2*forward_cor_variances + uncor_variance)

def compute_sem(times: np.ndarray,
                autocov_series: np.ndarray) -> np.ndarray:
    """
    Compute the standard error of the mean for every truncation point.

    It is assumed that the times passed represent the entire series, with the
    same frequency as the original data. Element ``i`` of the result is the
    standard error of the mean of the series once everything before point
    ``i`` has been discarded.
    """
    n_total = len(times)
    sems = np.zeros(n_total)
    for start in tqdm(range(n_total), total=n_total, desc="Processing Times"):
        n_points = n_total - start
        tail_variance = compute_mean_variance(times[start:], autocov_series)
        sems[start] = np.sqrt(tail_variance / n_points)
    return sems

def compute_rmse(times: np.ndarray,
                 exp_params: tuple[float, float],
                 fast_exp_params: tuple[float, float],
                 autocov_series: np.ndarray) -> np.ndarray:
    """
    Compute the RMSE of the mean for every truncation point.

    The bias (from the two exponential decays) and the standard error of the
    mean (from the autocovariance series) are combined in quadrature. It is
    assumed that the times passed represent the entire series, and that the
    number of data points matches that sampled originally.
    """
    bias_series = compute_bias(times, exp_params, fast_exp_params)
    sem_series = compute_sem(times, autocov_series)
    squared_error = np.square(sem_series) + np.square(bias_series)
    return np.sqrt(squared_error)

In [None]:
# Skip slow calculations if FIGURES_ONLY is set
if not os.environ.get("FIGURES_ONLY", False):

    # Calculate the theoretical bias, SEM, and RMSE as a function of truncation
    # point for every dataset and system
    fixed_trunc_error_series = {}

    for dataset in datasets:
        fixed_trunc_error_series[dataset] = {}
        for system in systems:
            fixed_trunc_error_series[dataset][system] = {}

            # Get the parameters used to generate the series
            exp_params = params[system]["exp_params"]
            fast_exp_params = params[system]["fast_exp_params"]
            variance_fac = 1 if dataset != "noisy" else 5
            autocov_series = params[system]["autocov_convex"] * variance_fac

            # If this is the subsampled dataset, then subsample the autocovariance series to account for this
            autocov_series = autocov_series[::100] if dataset == "subsampled" else autocov_series

            # If the dataset is block averaged, the stats are the same as the standard dataset
            # If dataset is subsampled, we'll just subsample the standard times.
            dataset_lookup = dataset if dataset not in ["block_averaged", "subsampled"] else "standard"
            test_data = data[dataset_lookup][system][0]["data"]
            tot_time = 8 if dataset != "short" else 0.2
            delta_t = 0.0008 # 200 steps energy frequency times 4 fs per step
            # First datum is at 0.8 ps, and last datum is at tot_time - 0.8 ps due to the way the data were generated
            times = np.linspace(delta_t, tot_time-delta_t, len(test_data)) # The times at which the data were sampled
            if dataset == "subsampled":
                times = times[::100]

            # Compute the bias, sem, and rmse
            bias = compute_bias(times, exp_params, fast_exp_params)
            sem = compute_sem(times, autocov_series)
            rmse = np.sqrt(sem ** 2 + bias ** 2)

            # Get the optimal truncation point. This is an index into `times`,
            # i.e. into the series before any block averaging below.
            trunc_point = np.argmin(rmse)
            trunc_time = times[trunc_point]

            # If this is block averaged data, then we need to downsample the series
            if dataset == "block_averaged":
                # Blocks of size 100. This is an approximate way of doing things, but only
                # slightly affects points at very start/ end
                # Use a separate name for the reshape length so that the optimal
                # truncation point found above is not overwritten before it is saved
                n_usable = (bias.shape[0] // 100)*100
                bias = np.sqrt(np.mean(bias[:n_usable].reshape(-1, 100)**2, axis=1))
                sem = np.sqrt(np.mean(sem[:n_usable].reshape(-1, 100)**2, axis=1))
                rmse = np.sqrt(np.mean(rmse[:n_usable].reshape(-1, 100)**2, axis=1))

            assert len(sem) == len(data[dataset][system][0]["data"]), f"Length of SEM series {len(sem)} does not match length of data {len(data[dataset][system][0]['data'])}"

            # Save the results
            fixed_trunc_error_series[dataset][system]["bias_series"] = bias
            fixed_trunc_error_series[dataset][system]["sem_series"] = sem
            fixed_trunc_error_series[dataset][system]["rmse_series"] = rmse
            fixed_trunc_error_series[dataset][system]["optimal_trunc_point"] = trunc_point
            fixed_trunc_error_series[dataset][system]["optimal_trunc_time"] = trunc_time

In [None]:
# Persist the freshly computed error series, or reload the cached copy when
# the slow calculations were skipped (FIGURES_ONLY set)
if not os.environ.get("FIGURES_ONLY", False):
    with open("output_single/fixed_trunc_error_series_stats.pkl", "wb") as f:
        pkl.dump(fixed_trunc_error_series, f)
else:
    with open("output_single/fixed_trunc_error_series_stats.pkl", "rb") as f:
        fixed_trunc_error_series = pkl.load(f)

In [None]:
def plot_theoretical_rmse_on_axis(ax: Axes, dataset: str, system: str) -> None:
    """
    Plot the theoretical RMSE against truncation time, together with the
    empirical RMSEs of the fixed-truncation methods, on ``ax``.

    The theoretical curve comes from ``fixed_trunc_error_series``; the
    empirical points (with confidence-interval error bars) come from
    ``overall_results`` for every method in ``fixed_methods``. The fraction
    discarded by each fixed method is parsed from the last space-separated
    token of its name.
    """
    delta_t = 0.0008 # 200 steps energy frequency times 4 fs per step
    sim_length = 0.2 if dataset == "short" else 8

    # Block-averaged and subsampled series are derived from the standard time grid
    if dataset in ["block_averaged", "subsampled"]:
        lookup_dataset = "standard"
    else:
        lookup_dataset = dataset
    n_samples = len(data[lookup_dataset][system][0]["data"])

    # The times at which the data were sampled
    test_times = np.linspace(delta_t, sim_length-delta_t, n_samples)
    if dataset == "subsampled":
        test_times = test_times[::100]
    elif dataset == "block_averaged":
        # Keep whole blocks of 100 only and take one time from inside each block
        n_usable = (len(test_times) // 100)*100
        test_times = test_times[:n_usable][49::100]
    rmse_series = fixed_trunc_error_series[dataset][system]["rmse_series"]

    # Empirical RMSEs for the fixed methods, with CIs expressed as offsets from the RMSE
    method_stats = [overall_results[dataset][system][method] for method in fixed_methods]
    real_rmse = [stats["rmse"] for stats in method_stats]
    ci_bounds = np.array([stats["rmse_ci"] for stats in method_stats])
    rmse_array = np.array(real_rmse)
    cis_lower = np.abs(ci_bounds[:, 0] - rmse_array)
    cis_upper = np.abs(ci_bounds[:, 1] - rmse_array)

    # Convert each discarded fraction into a truncation time
    usable_time = sim_length - delta_t
    times = [(float(method.split(" ")[-1]) * usable_time)+delta_t for method in fixed_methods]

    # Plot the RMSE
    ax.plot(test_times, rmse_series, label="Theoretical", zorder=2, alpha=0.7)
    ax.errorbar(times, real_rmse, yerr=[cis_lower, cis_upper], fmt='-', label="Empirical", zorder=1, ecolor='black')
    ax.set_xlabel("Truncation Time / ns")
    ax.set_ylabel("$\\mathrm{RMSE}(\\langle \\Delta G \\rangle_{[n_{0},N]})$ / kcal mol$^{-1}$")
    ax.set_title(system)

    # y limits: 10 % below the lowest and 10 % above the highest empirical RMSE,
    # ignoring the final fixed method
    rmse_for_limits = real_rmse[:-1]
    lower_limit = np.min(rmse_for_limits) * 0.9
    upper_limit = np.max(rmse_for_limits) * 1.1
    ax.set_ylim(lower_limit, upper_limit)

# One panel per system, comparing theoretical and empirical RMSE for the standard dataset
fig, axs = get_subplots(systems)

for i, system in enumerate(systems):
    plot_theoretical_rmse_on_axis(axs[i], "standard", system)

fig.tight_layout()

# Attach the legend to the second-to-last axes, anchored just outside its
# right-hand edge. With an odd number of systems this is the last populated
# panel, so the legend sits where the spare panel was deleted.
axs[-2].legend(bbox_to_anchor=(1.05, 1), loc='upper left')

fig.savefig("output_single/theoretical_vs_empirical_rmse.png", dpi=300, bbox_inches='tight')

In [None]:
def plot_error_components_on_ax(ax: Axes, dataset: str, system: str, show_min: bool = True) -> None:
    """
    Plot the theoretical RMSE, bias and standard deviation of the mean against
    truncation time on ``ax``.

    The final ~1 % of each series is left out so that the large errors at the
    very end do not dominate the y-axis. If ``show_min`` is true, the optimal
    truncation time and the minimum RMSE are marked with dashed lines and a
    red dot.
    """
    n_samples = len(data[dataset][system][0]["data"])
    tot_time = 8 if dataset != "short" else 0.2
    # The times at which the data was sampled (the point at t = 0 is dropped)
    test_times = np.linspace(0, tot_time, n_samples + 1)[1:]

    series_stats = fixed_trunc_error_series[dataset][system]
    rmse_series = series_stats["rmse_series"]
    bias_series = series_stats["bias_series"]
    sem_series = series_stats["sem_series"]

    # Truncate the last 1 % of the data to avoid large RMSE scaling the y-axis
    n_truncate = round(len(test_times) * 0.01)
    shown = slice(None, -(n_truncate + 1))
    curves = [
        (rmse_series, "$\\mathrm{RMSE}(\\langle \\Delta G \\rangle_{[n_{0},N]})$", 2),
        (bias_series, "$\\mathrm{Bias}(\\langle \\Delta G \\rangle_{[n_{0},N]})$", 1),
        (sem_series, "$\\mathrm{SD}(\\langle \\Delta G \\rangle_{[n_{0},N]})$", 1),
    ]
    for series, curve_label, curve_zorder in curves:
        ax.plot(test_times[shown], series[shown], label=curve_label, zorder=curve_zorder, alpha=0.7)

    if show_min:
        # Dashed vertical line at the optimal truncation time
        trunc_time = series_stats["optimal_trunc_time"]
        ax.axvline(trunc_time, color='black', linestyle='--', label="Optimal Truncation Time")

        # Dashed horizontal line at the minimum of the RMSE series
        min_index = np.argmin(rmse_series)
        trunc_rmse = rmse_series[min_index]
        ax.axhline(trunc_rmse, color='black', linestyle='--')

        # Small red dot at the minimum RMSE
        ax.plot(trunc_time, trunc_rmse, 'ro', alpha=0.7)

    ax.set_xlabel("Truncation Time / ns")
    ax.set_ylabel("Error / kcal mol$^{-1}$")
    ax.set_title(system)

# Plot the components of the RMSE (one panel per system, standard dataset,
# without the markers for the optimal truncation point)
fig, axs = get_subplots(systems)

for i, system in enumerate(systems):
    ax = axs[i]
    plot_error_components_on_ax(ax, "standard", system, show_min=False)
    # These labels repeat what plot_error_components_on_ax already sets
    ax.set_xlabel("Truncation Time / ns")
    ax.set_ylabel("Error / kcal mol$^{-1}$")
    ax.set_title(system)

fig.tight_layout()

# Attach the legend to the second-to-last axes, anchored outside its
# right-hand edge (the last populated panel when the number of systems is odd)
axs[-2].legend(bbox_to_anchor=(1.2, 0.7), loc='upper left')

fig.savefig("output_single/error_components.png", dpi=300, bbox_inches='tight')

## Plot Discard Times



In [None]:
def plot_discard_times_on_ax(ax: Axes, dataset: str, system: str, n_truncate: int = 100) -> None:
    """
    Draw horizontal violin plots of the truncation times chosen by each
    automated method on ``ax``, with a dashed line at the optimal truncation time.

    The truncation time is the discarded fraction multiplied by the total
    simulation time. ``n_truncate`` is accepted but not used.
    """
    tot_time = 0.2 if dataset == "short" else 8

    # One column of truncation times per automated method
    times_by_method = {}
    for method in automated_methods:
        times_by_method[method] = distributions[dataset][system][method]["fracs"]*tot_time
    df_times = pd.DataFrame(times_by_method)

    sns.violinplot(data=df_times, ax=ax, orient="h",palette=colors, alpha=1)
    ax.set_xlabel("Truncation Time / ns")

    # Plot dashed vertical line at the optimal truncation point
    optimal_time = fixed_trunc_error_series[dataset][system]["optimal_trunc_time"]
    ax.axvline(optimal_time, color='black', linestyle='--', label="Optimal Truncation Time")

    ax.set_title(system)

# Plot the discard times (one panel per system, standard dataset)
fig, axs = get_subplots(systems)

for i, system in enumerate(systems):
    ax = axs[i]
    plot_discard_times_on_ax(ax, "standard", system, 100)
    # Remove y tick labels from right hand column
    if i % 2 == 1:
        ax.set_yticklabels([])
    # These repeat what plot_discard_times_on_ax already sets
    ax.set_xlabel("Truncation Time / ns")
    ax.set_title(system)

# Only adjust the vertical spacing between rows (tight_layout is not used here)
fig.subplots_adjust(hspace=0.5)

# Attach the legend to the second-to-last axes, anchored outside its
# right-hand edge (the last populated panel when the number of systems is odd)
axs[-2].legend(bbox_to_anchor=(1.2, 0.7), loc='upper left')

fig.savefig("output_single/discard_times_standard.png", dpi=300, bbox_inches='tight')
    

## Plot RMSEs

In [None]:
def plot_rmses_on_ax(ax: Axes, dataset: str, system: str, n_truncate: int = 100) -> None:
    """
    Draw a bar chart of the RMSE of every automated method on ``ax``.

    Error bars show the bootstrapped confidence interval of each RMSE, and a
    dashed horizontal line marks the minimum of the theoretical fixed-time
    RMSE series. ``n_truncate`` is accepted but not used.
    """
    # Get RMSEs and confidence intervals
    rmse = [overall_results[dataset][system][method]["rmse"] for method in automated_methods]
    cis = [overall_results[dataset][system][method]["rmse_ci"] for method in automated_methods]

    # Convert CIs from absolute bounds to offsets from the RMSE, as needed for yerr
    cis_lower = np.array(rmse) - np.array(cis)[:,0]
    cis_upper = np.array(cis)[:,1] - np.array(rmse)
    error_bar_settings = {"capsize": 0, "alpha": 1, "elinewidth": 1}
    ax.bar(automated_methods, rmse, yerr=[cis_lower, cis_upper], capsize=5,
           alpha=1, error_kw=error_bar_settings, color=colors, edgecolor='black', linewidth=1)

    # Get the minimum possible fixed-time RMSE and plot a horizontal to show it
    min_rmse = np.min(fixed_trunc_error_series[dataset][system]["rmse_series"])
    ax.axhline(min_rmse, color='black', linestyle='--', label="Minimum Fixed-Time RMSE")

    # Label the bars with the methods that were actually plotted. Using the
    # full `methods` list here would attach labels that do not correspond to
    # the bars whenever its order differs from `automated_methods`.
    ax.set_xticklabels(automated_methods, rotation=90)
    ax.set_title(system)

    # Remove the vertical grid lines
    ax.xaxis.grid(False)

# Plot the RMSEs (one panel per system, standard dataset)
fig, axs = get_subplots(systems)

for i, system in enumerate(systems):
    ax = axs[i]
    plot_rmses_on_ax(ax, "standard", system, 100)
    ax.set_ylabel("$\\mathrm{RMSE}(\\langle \\Delta G \\rangle)$ \n/ kcal mol$^{-1}$")
    ax.set_title(system)

    # Remove x labels from first 3 plots
    # NOTE(review): the hard-coded 3 presumably assumes five systems in two
    # columns, so that only the bottom panel of each column keeps its labels.
    if i < 3:
        ax.set_xticklabels([])

    # Remove y labels from right column
    if i % 2 == 1:
        ax.set_ylabel("")

    # Add the legend to the last plot
    if i == len(systems) - 1:
        ax.legend(bbox_to_anchor=(1.05, -0.3), loc='upper left')

fig.tight_layout()

fig.savefig("output_single/rmses_standard.png", dpi=300, bbox_inches='tight')

## Distributions of Unsigned Errors

In [None]:
def plot_unsigned_error_distribution_on_ax(ax: Axes, dataset: str, system: str) -> None:
    """
    Draw violin plots of the unsigned errors of every automated method on ``ax``.

    A dashed horizontal line marks the minimum of the theoretical fixed-time
    RMSE series for comparison.
    """
    # Get the unsigned errors (stored under "mues"), one column per automated method
    df_unsigned = pd.DataFrame({method: distributions[dataset][system][method]["mues"] for method in automated_methods})
    sns.violinplot(data=df_unsigned, ax=ax, palette=colors)
    ax.set_title(system)
    ax.set_ylabel("Unsigned Error / kcal mol$^{-1}$")
    # Label the violins with the methods that were actually plotted; the full
    # `methods` list also contains the fixed methods, which are not shown here
    ax.set_xticklabels(automated_methods, rotation=90)

    # Get the minimum possible fixed-time RMSE and plot a horizontal to show it
    min_rmse = np.min(fixed_trunc_error_series[dataset][system]["rmse_series"])
    ax.axhline(min_rmse, color='black', linestyle='--', label="Minimum Fixed-Time RMSE")

# Plot the distributions of unsigned errors (one panel per system, standard dataset)
fig, axs = get_subplots(systems)

for i, system in enumerate(systems):
    ax = axs[i]
    plot_unsigned_error_distribution_on_ax(ax, "standard", system)
    ax.set_title(system)

    # Remove x labels from first 3 plots
    # NOTE(review): the hard-coded 3 presumably assumes five systems in two columns.
    if i < 3:
        ax.set_xticklabels([])

    # Remove y labels from right column
    if i % 2 == 1:
        ax.set_ylabel("")

fig.tight_layout()

fig.savefig("output_single/unsigned_errors_standard.png", dpi=300, bbox_inches='tight')

## Contour Plots of Components of Error

In [None]:
# Based on the code above
def plot_contour_plot_on_axis(ax: Axes, dataset: str, system: str) -> None:
    """
    Plot each automated method in the (bias, standard deviation) plane on ``ax``.

    The background is a filled contour plot of the distance from the origin,
    which equals the RMSE for a given bias and standard deviation. The
    theoretical fixed-truncation-time curve is drawn as a line, and every
    automated method is drawn as a point with bootstrapped confidence
    intervals on both axes. A horizontal colour bar is attached above ``ax``.
    """
    # Get the fixed-time bias and sem
    bias = fixed_trunc_error_series[dataset][system]["bias_series"]
    sem = fixed_trunc_error_series[dataset][system]["sem_series"]
    ax.plot(bias, sem, label="Fixed Truncation\n Time Limit", zorder=2)

    # Now, get the biases, variances, and associated CIs for all of the methods.
    # We need these to decide how big the grid needs to be
    biases = [overall_results[dataset][system][method]["bias"] for method in automated_methods]
    sems = [overall_results[dataset][system][method]["variance"]**0.5 for method in automated_methods]
    bias_cis_upper = [overall_results[dataset][system][method]["mean_ci"][1] - overall_results[dataset][system][method]["bias"] for method in automated_methods]
    bias_cis_lower = [overall_results[dataset][system][method]["bias"] - overall_results[dataset][system][method]["mean_ci"][0] for method in automated_methods]
    sem_cis_upper = [overall_results[dataset][system][method]["var_ci"][1]**0.5 - overall_results[dataset][system][method]["variance"]**0.5 for method in automated_methods]
    sem_cis_lower = [overall_results[dataset][system][method]["variance"]**0.5 - overall_results[dataset][system][method]["var_ci"][0]**0.5 for method in automated_methods]

    # Create a grid of points extending 10 % beyond the largest bias or SD
    max_bias_or_sem = max(max(biases), max(sems))
    limit = max_bias_or_sem + max_bias_or_sem * 0.1
    x = np.linspace(-limit, limit, 1000)
    y = np.linspace(-limit, limit, 1000)
    X, Y = np.meshgrid(x, y)

    # Calculate distance from origin
    Z = np.sqrt(X**2 + Y**2)

    # Space the contour levels an eighth of the limit apart, rounded to 2 d.p.
    d_error = limit / 8
    d_error = round(d_error, 2)
    contour_levels = np.arange(0, np.max(Z), d_error)
    contourf = ax.contourf(X, Y, Z, levels=contour_levels, cmap='viridis')

    # Add on the equilibration detection results
    for i, method in enumerate(automated_methods):
        ax.errorbar(biases[i], sems[i], xerr=[[bias_cis_lower[i]], [bias_cis_upper[i]]], yerr=[[sem_cis_lower[i]], [sem_cis_upper[i]]],
                                              fmt='none', ecolor='black', capsize=5, markerfacecolor='white', zorder=1)

        ax.scatter(biases[i], sems[i], label=method, edgecolors='black', linewidth=1, s=50, zorder=2, color=colors[i])

    # Set x and y limits, leaving a small margin below zero
    negative_limit = -limit * 0.02
    ax.set_xlim([negative_limit, limit])
    ax.set_ylim([negative_limit, limit])

    # Add a horizontal colour bar above the plot. Pass ax explicitly so that the
    # bar is always attached to this axes rather than relying on matplotlib's
    # implicit choice of parent axes.
    cbar = plt.colorbar(contourf, ax=ax, orientation='horizontal', location='top')
    cbar.set_label("$\\mathrm{RMSE}(\\langle \\Delta G \\rangle )$ / kcal mol$^{-1}$")

    # Set x and y labels, and force aspect ratio to be equal
    ax.set_xlabel("$\\langle \\mathrm{Bias}(\\langle \\Delta G \\rangle) \\rangle$ / kcal mol$^{-1}$")
    ax.set_ylabel("$\\mathrm{SD}(\\langle \\Delta G \\rangle)$ / kcal mol$^{-1}$")
    ax.set_aspect('equal', adjustable='box')

# Plot the contour plots (one panel per system, standard dataset); panels are
# made 20 % taller than the default
fig, axs = get_subplots(systems, scale_y=1.2)

for i, system in enumerate(systems):
    ax = axs[i]
    plot_contour_plot_on_axis(ax, "standard", system)
    # Extra padding lifts the title clear of the colour bar drawn above each panel
    ax.set_title(system, pad=60)

fig.tight_layout()

# Attach the legend to the second-to-last axes, anchored outside its
# right-hand edge (the last populated panel when the number of systems is odd)
axs[-2].legend(loc='center left', bbox_to_anchor=(1.1, 0.5))

fig.savefig("output_single/bias_vs_sem_contour_plots.png", dpi=300, bbox_inches='tight')

## Overall Plots

Combine the plots of error components, discard times, and RMSEs.

In [None]:
# Create a row of 5 subfigures, one for each system. Each subfigure holds a
# 2 x 2 grid: truncation times (top left), error components (bottom left) and
# RMSEs (bottom right). The top-right cell is left empty.

fig = plt.figure(figsize=(30, 6))

# One row of five subfigures
subfigs = fig.subfigures(1, 5)

axs = []
for i, system in enumerate(systems):

    # Get subfigure, grid spec, and add title
    subfig = subfigs[i]
    gs = gridspec.GridSpec(2, 2, figure=subfig, hspace=0.05, wspace=0.05)
    subfig.suptitle(system, fontsize=16, fontweight='bold')

    # Create subplots with axes shared as required
    components_ax = subfig.add_subplot(gs[1,0])
    discard_ax = subfig.add_subplot(gs[0,0], sharex=components_ax)
    rmse_ax = subfig.add_subplot(gs[1,1], sharey=components_ax)
    unused_ax = subfig.add_subplot(gs[0,1])

    # Plot/ delete axes
    # The fourth parameter of plot_error_components_on_ax is the boolean
    # show_min, so request the optimum markers explicitly rather than passing 100
    plot_error_components_on_ax(components_ax, "standard", system, show_min=True)
    plot_discard_times_on_ax(discard_ax, "standard", system, 100)
    plot_rmses_on_ax(rmse_ax, "standard", system, 100)
    subfig.delaxes(unused_ax)

    # Remove unnecessary labels/ titles
    components_ax.set_title("")
    if i == 0:
        components_ax.legend(bbox_to_anchor=(-0.05, -0.3), loc='upper left')
    rmse_ax.set_title("$\\mathrm{RMSE}(\\langle \\Delta G \\rangle)$")
    rmse_ax.set_ylabel("")
    discard_ax.set_title("Truncation Time")
    discard_ax.set_xlabel("")
    # Hide the numbers on the x axis for the discard times plot, but don't set labels to empty as this affects the
    # error components plot that shares the axis
    discard_ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    rmse_ax.tick_params(axis='y', which='both', left=False, right=False, labelleft=False)

    # If this isn't the first system, remove the labels from the discard times plot
    if i != 0:
        discard_ax.set_yticklabels([])


fig.savefig("output_single/combined_plots.png", dpi=300, bbox_inches='tight')


In [None]:
# Create a 2 x 3 grid of subfigures, one for each system. Each subfigure holds
# a 2 x 2 grid: truncation times (top left), error components (bottom left)
# and RMSEs (bottom right). The top-right cell is left empty.

fig = plt.figure(figsize=(13.5, 9))

# Two rows of three subfigures, flattened so they can be indexed by system
subfigs = fig.subfigures(2, 3, wspace=0.1, hspace=0.1)
subfigs = subfigs.flatten()

axs = []
for i, system in enumerate(systems):

    # Get subfigure, grid spec, and add title
    subfig = subfigs[i]
    gs = gridspec.GridSpec(2, 2, figure=subfig, hspace=0.05, wspace=0.05)
    subfig.suptitle(system, fontsize=16, fontweight='bold', y=1.02)

    # Create subplots with axes shared as required
    components_ax = subfig.add_subplot(gs[1,0])
    discard_ax = subfig.add_subplot(gs[0,0], sharex=components_ax)
    rmse_ax = subfig.add_subplot(gs[1,1], sharey=components_ax)
    unused_ax = subfig.add_subplot(gs[0,1])

    # Plot/ delete axes
    # The fourth parameter of plot_error_components_on_ax is the boolean
    # show_min, so request the optimum markers explicitly rather than passing 100
    plot_error_components_on_ax(components_ax, "standard", system, show_min=True)
    plot_discard_times_on_ax(discard_ax, "standard", system, 100)
    plot_rmses_on_ax(rmse_ax, "standard", system, 100)
    subfig.delaxes(unused_ax)

    # Remove unnecessary labels/ titles
    components_ax.set_title("")
    if i == 2:
        # Anchor the legend of the third subfigure far below its axes
        components_ax.legend(bbox_to_anchor=(0.05, -2.0), loc='upper left')

    # Remove x labels from first two plots
    if i < 2:
        rmse_ax.set_xticklabels([])

    rmse_ax.set_title("$\\mathrm{RMSE}(\\langle \\Delta G \\rangle)$")
    rmse_ax.set_ylabel("")
    discard_ax.set_title("Truncation Time")
    discard_ax.set_xlabel("")
    # Hide the numbers on the x axis for the discard times plot, but don't set labels to empty as this affects the
    # error components plot that shares the axis
    discard_ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    rmse_ax.tick_params(axis='y', which='both', left=False, right=False, labelleft=False)

    # If this isn't the first system in a row, remove the labels from the discard times plot
    if i not in [0, 3]:
        discard_ax.set_yticklabels([])


fig.savefig("output_single/combined_plots_reformatted.png", dpi=300, bbox_inches='tight')


In [None]:
# Same layout idea as the combined RMSE figure, but the bottom-right panel
# shows the distribution of unsigned errors rather than the RMSE of the
# dataset. Subfigures are arranged in a single row of five.

fig = plt.figure(figsize=(30, 6))
subfigs = fig.subfigures(1, 5)

axs = []
for i, system in enumerate(systems):

    # Subfigure for this system, titled with the system name
    subfig = subfigs[i]
    subfig.suptitle(system, fontsize=16, fontweight='bold')
    gs = gridspec.GridSpec(2, 2, figure=subfig, hspace=0.05, wspace=0.05)

    # The truncation-time panel shares its x axis with the components panel,
    # and the right-hand panel shares its y axis with it
    components_ax = subfig.add_subplot(gs[1, 0])
    discard_ax = subfig.add_subplot(gs[0, 0], sharex=components_ax)
    rmse_ax = subfig.add_subplot(gs[1, 1], sharey=components_ax)
    unused_ax = subfig.add_subplot(gs[0, 1])

    # Draw the three panels, then drop the top-right placeholder.
    # Note that rmse_ax holds the unsigned error distribution in this figure.
    plot_error_components_on_ax(components_ax, "standard", system, 100)
    plot_discard_times_on_ax(discard_ax, "standard", system, 100)
    plot_unsigned_error_distribution_on_ax(rmse_ax, "standard", system)
    subfig.delaxes(unused_ax)

    # Error components panel: no title; one legend, attached to the first
    # system and placed below its axes
    components_ax.set_title("")
    if i == 0:
        components_ax.legend(loc='upper left', bbox_to_anchor=(-0.05, -0.3))

    # Truncation time panel: the x tick marks and numbers are hidden through
    # tick_params rather than by emptying the tick labels, because this x axis
    # is shared (the original note warns that emptying them disturbs another panel)
    discard_ax.set_title("Truncation Time")
    discard_ax.set_xlabel("")
    discard_ax.tick_params(
        axis='x',
        which='both',
        bottom=False,
        top=False,
        labelbottom=False,
    )
    # Only the first system keeps its y tick labels
    if i > 0:
        discard_ax.set_yticklabels([])

    # Unsigned errors panel: y ticks are hidden since the y axis is shared
    # with the components panel
    rmse_ax.set_title("Unsigned Errors")
    rmse_ax.set_ylabel("")
    rmse_ax.tick_params(
        axis='y',
        which='both',
        left=False,
        right=False,
        labelleft=False,
    )


fig.savefig("output_single/combined_plots_unsigned_errors.png", dpi=300, bbox_inches='tight')


In [None]:
# Unsigned-error version of the combined figure on a 2 x 3 grid of subfigures,
# one subfigure per system. Each subfigure carries a 2 x 2 GridSpec holding the
# truncation times (top-left), the error components (bottom-left) and the
# distribution of unsigned errors (bottom-right); the top-right slot is
# created and then removed.

fig = plt.figure(figsize=(13.5, 9))

subfigs = fig.subfigures(2, 3, wspace=0.1, hspace=0.1)
subfigs = subfigs.flatten()

# Never appended to or read in this cell; kept so the notebook-level name is
# reset exactly as before.
axs = []
for i, system in enumerate(systems):

    # Get subfigure, grid spec, and add title
    subfig = subfigs[i]
    gs = gridspec.GridSpec(2, 2, figure=subfig, hspace=0.05, wspace=0.05)
    subfig.suptitle(system, fontsize=16, fontweight='bold', y=1.02)

    # Create subplots with axes shared as required: the truncation-time panel
    # shares x with the components panel, the right-hand panel shares y with it
    components_ax = subfig.add_subplot(gs[1, 0])
    discard_ax = subfig.add_subplot(gs[0, 0], sharex=components_ax)
    rmse_ax = subfig.add_subplot(gs[1, 1], sharey=components_ax)
    unused_ax = subfig.add_subplot(gs[0, 1])

    # Plot the three panels and delete the top-right placeholder.
    # Despite its name, rmse_ax holds the unsigned error distribution here.
    plot_error_components_on_ax(components_ax, "standard", system, 100)
    plot_discard_times_on_ax(discard_ax, "standard", system, 100)
    plot_unsigned_error_distribution_on_ax(rmse_ax, "standard", system)
    subfig.delaxes(unused_ax)

    # Remove unnecessary labels/ titles
    components_ax.set_title("")
    if i == 2:
        # Single legend, attached to the third system and pushed far below its axes
        components_ax.legend(bbox_to_anchor=(0.05, -2.0), loc='upper left')

    # Remove x tick labels from the right-hand panel of the first two systems
    if i < 2:
        rmse_ax.set_xticklabels([])

    # Title the panel after what is actually plotted on it. It previously
    # carried the RMSE title copied from the RMSE figure, which mislabelled
    # the unsigned error distribution.
    rmse_ax.set_title("Unsigned Errors")
    rmse_ax.set_ylabel("")
    discard_ax.set_title("Truncation Time")
    discard_ax.set_xlabel("")
    # Hide the numbers on the x axis for the discard times plot, but don't set
    # the labels to empty as this affects the other plots (this x axis is shared)
    discard_ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    rmse_ax.tick_params(axis='y', which='both', left=False, right=False, labelleft=False)

    # If this isn't the first system in a row, remove the labels from the discard times plot
    if i not in [0, 3]:
        discard_ax.set_yticklabels([])


fig.savefig("output_single/combined_plots_unsigned_errors_reformatted.png", dpi=300, bbox_inches='tight')
