# Preprocessing of Data from Adaptive Runs vs Non-Adaptive Runs

The cells below are examples of the slow analyses (generally run in tmux sessions using IPython) that were performed to generate the data for the final analyses.

In [1]:
import a3fe as a3 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams, rcParamsDefault
import pickle
import scipy.stats as stats
from sklearn import metrics
# Reset matplotlib to its defaults before applying the notebook's plotting style
rcParams.update(rcParamsDefault)
# The bundled seaborn styles were renamed to "seaborn-v0_8-*" in matplotlib 3.6 (the old
# names are deprecated), so use the new name when this matplotlib provides it and fall
# back to the original name otherwise.
_colorblind_style = (
    "seaborn-v0_8-colorblind"
    if "seaborn-v0_8-colorblind" in plt.style.available
    else "seaborn-colorblind"
)
plt.style.use(_colorblind_style)
# Make amsmath available whenever LaTeX text rendering is used
plt.rc('text.latex', preamble=r'\usepackage{amsmath}')
from typing import List, Tuple, Dict, Callable, Union, Optional, Any, Dict
%matplotlib inline
from scipy.stats import linregress, kruskal, t, sem, wilcoxon
from matplotlib import gridspec
import pymbar
print(pymbar.version.version)
import logging

# Ligand ID -> name of the directory holding that ligand's calculation
ligs = {
    "2": "lig_2", "3": "lig_3", "4": "lig_4",
    "8": "lig_8", "14": "lig_14", "16": "lig_16",
    "27": "lig_27", "39": "lig_39_cry_pose", "40": "lig_40",
}

# The adaptive runs use the same directories; kept as an independent copy of ligs
adaptive_paths = dict(ligs)

# Each non-adaptive run is found at ../non_adaptive/<adaptive dir>_5000ps
non_adapt_paths = {
    lig: f"../non_adaptive/{adapt_path}_5000ps"
    for lig, adapt_path in adaptive_paths.items()
}

REF_COST = 0.21 # GPU hours per ns



INFO:rdkit:Enabling RDKit 2022.09.1 jupyter extensions


3.0.5.dev-Unknown


  plt.style.use("seaborn-colorblind")


## Long Analyses to be Run in TMUX Sessions

In [None]:
# Run in tmux ipython session

import a3fe as a3
import pickle

# Adaptive runs: ligand ID -> calculation directory
adaptive_paths = {
    "2": "lig_2", "3": "lig_3", "4": "lig_4",
    "8": "lig_8", "14": "lig_14", "16": "lig_16",
    "27": "lig_27", "39": "lig_39_cry_pose", "40": "lig_40",
}

# Non-adaptive counterpart of each run: ../non_adaptive/<adaptive dir>_5000ps
non_adapt_paths = {
    lig: f"../non_adaptive/{adapt_path}_5000ps"
    for lig, adapt_path in adaptive_paths.items()
}

# Compare convergence of the overall adaptive and non-adaptive simulations
comparitive_conv_data = {}

for lig in adaptive_paths:
    comparitive_conv_data[lig] = {}

    # Adaptive calculation first, non-adaptive second; the labels below rely on this order
    calc_iterator = a3.run._utils.SimulationRunnerIterator(
        [adaptive_paths[lig], non_adapt_paths[lig]], a3.Calculation
    )
    conv_data = a3.analyse.compare.get_comparitive_convergence_data(
        calc_iterator, equilibrated=False, mode="block"
    )

    # For each calculation, element 0 is stored as the times and element 1 as the dGs
    for k, label in enumerate(["Adaptive", "Non-adaptive"]):
        comparitive_conv_data[lig][label] = dict(
            dgs=conv_data[k][1], times=conv_data[k][0]
        )

    # Re-pickle after every ligand so completed work survives an interrupted session
    with open("final_analysis/comparitive_conv_data.pkl", "wb") as f:
        pickle.dump(comparitive_conv_data, f)

In [None]:
# Get the blocked free energy time series (used for the convergence analysis) for
# every stage of every leg of each run listed in ligs
import a3fe as a3
import pickle
# Imported from a3fe, consistent with the other cells of this notebook, so that the
# analysis function comes from the same package as the Calculation objects it receives
from a3fe.analyse.process_grads import get_time_series_multiwindow_mbar as get_ts

# Ligand ID -> calculation directory
ligs = {
    "2": "lig_2",
    "3": "lig_3",
    "4": "lig_4",
    "8": "lig_8",
    "14": "lig_14",
    "16": "lig_16",
    "27": "lig_27",
    "39": "lig_39_cry_pose",
    "40": "lig_40",
}

# Nested result: equil_dgs[lig][leg][stage] = {"dgs": ..., "times": ...}
equil_dgs = {}
for lig in ligs:
    equil_dgs[lig] = {}
    calc = a3.Calculation(base_dir=ligs[lig])
    for leg in calc.legs:
        # Last dotted component of the type's string form, lower-cased, is used as the key
        leg_str = str(leg.leg_type).split(".")[-1].lower()
        equil_dgs[lig][leg_str] = {}
        for stage in leg.stages:
            stage_str = str(stage.stage_type).split(".")[-1].lower()
            equil_dgs[lig][leg_str][stage_str] = {}
            # equilibrated=False: presumably uses all data rather than only the
            # post-equilibration portion -- confirm against the a3fe documentation
            dgs, times = get_ts(stage.lam_windows, stage.output_dir, equilibrated=False)
            equil_dgs[lig][leg_str][stage_str]["dgs"] = dgs
            equil_dgs[lig][leg_str][stage_str]["times"] = times
    # Close the calculation's log handlers before dropping the object
    calc._close_logging_handlers()
    del calc

    # Pickle the current data (rewritten after every ligand)
    with open("final_analysis/equil_dgs.pkl", "wb") as f:
        pickle.dump(equil_dgs, f)

In [None]:
import a3fe as a3
import pickle
from a3fe.analyse.process_grads import get_time_series_multiwindow_mbar as get_ts

# Ligand ID -> calculation directory
ligs = {
    "2": "lig_2",
    "3": "lig_3",
    "4": "lig_4",
    "8": "lig_8",
    "14": "lig_14",
    "16": "lig_16",
    "27": "lig_27",
    "39": "lig_39_cry_pose",
    "40": "lig_40",
}

# Get the relative simulation cost of each leg: costs[lig][leg name] = cost
costs = {}
for lig in ligs:
    costs[lig] = {}
    calc = a3.Calculation(base_dir=ligs[lig])
    for leg in calc.legs:
        # Key by the lower-case leg name (e.g. "bound", "free"): the cells that read
        # costs.pkl index it this way, whereas str(leg.leg_type) would presumably
        # include the enum class prefix and so never match those lookups.
        costs[lig][leg.leg_type.name.lower()] = leg.relative_simulation_cost
    # Close the calculation's log handlers before dropping the object
    calc._close_logging_handlers()
    del calc

with open("final_analysis/costs.pkl", "wb") as f:
    pickle.dump(costs, f)

In [None]:
# Calculate overall costs from costs - weight according to simulation time
import a3fe as a3
import pickle
from a3fe.analyse.process_grads import get_time_series_multiwindow_mbar as get_ts

# Ligand ID -> calculation directory
ligs = {
    "2": "lig_2",
    "3": "lig_3",
    "4": "lig_4",
    "8": "lig_8",
    "14": "lig_14",
    "16": "lig_16",
    "27": "lig_27",
    "39": "lig_39_cry_pose",
    "40": "lig_40",
}

# Defined here so that the cell does not depend on variables left over from other cells
adaptive_paths = dict(ligs)
non_adapt_paths = {lig: f"../non_adaptive/{adapt_path}_5000ps" for lig, adapt_path in adaptive_paths.items()}

with open("final_analysis/costs.pkl", "rb") as f:
    costs = pickle.load(f)

# overall_costs[method][lig] = simulation-time-weighted mean of the bound and free leg costs
overall_costs = {"Adaptive": {}, "Non-adaptive": {}}
for lig in ligs:
    # Accept keys stored either as plain leg names ("bound") or as the string form of
    # the leg type (last dotted component), so any existing costs.pkl can be read
    leg_costs = {str(key).split(".")[-1].lower(): val for key, val in costs[lig].items()}
    bound_cost = leg_costs["bound"]
    free_cost = leg_costs["free"]
    for method in overall_costs:
        base_dirs = adaptive_paths if method == "Adaptive" else non_adapt_paths
        calc = a3.Calculation(base_dir=base_dirs[lig])
        # Look the legs up by type rather than relying on their position in calc.legs
        simtimes = {leg.leg_type.name.lower(): leg.tot_simtime for leg in calc.legs}
        calc._close_logging_handlers()
        del calc
        simtime_bound = simtimes["bound"]
        simtime_free = simtimes["free"]
        # Get the overall cost as a weighted sum
        overall_cost = (bound_cost*simtime_bound + free_cost*simtime_free) / (simtime_bound + simtime_free)
        overall_costs[method][lig] = overall_cost

with open("final_analysis/overall_costs.pkl", "wb") as f:
    pickle.dump(overall_costs, f)

In [None]:
# Get all the results

import a3fe as a3
import logging
import pickle

# Read from final_analysis/, which is where costs.pkl is written earlier in this notebook
with open("final_analysis/costs.pkl", "rb") as f:
    costs = pickle.load(f)

# Ligand ID -> calculation directory
ligs = {
    "2": "lig_2",
    "3": "lig_3",
    "4": "lig_4",
    "8": "lig_8",
    "14": "lig_14",
    "16": "lig_16",
    "27": "lig_27",
    "39": "lig_39_cry_pose",
    "40": "lig_40",
}

adaptive_paths = {
    "2": "lig_2",
    "3": "lig_3",
    "4": "lig_4",
    "8": "lig_8",
    "14": "lig_14",
    "16": "lig_16",
    "27": "lig_27",
    "39": "lig_39_cry_pose",
    "40": "lig_40",
}

non_adapt_paths = {lig: f"../non_adaptive/{adapt_path}_5000ps" for lig, adapt_path in adaptive_paths.items()}

# Collect the stored free energies at calculation, leg and stage level:
#   final_dGs_all[system][method]["dgs"]             -> calc._delta_g
#   final_dGs_all[system][method][leg]["dg"]         -> leg._delta_g
#   final_dGs_all[system][method][leg][stage]["dg"]  -> stage._delta_g
# NOTE(review): _delta_g is a private attribute; presumably it is only populated once
# the analysis has been run for that object -- confirm against a3fe.
path_dicts = {"adaptive": adaptive_paths, "non_adaptive": non_adapt_paths}
final_dGs_all = {}
for system in ligs:
    final_dGs_all[system] = {}
    for method in path_dicts:
        print(f"Analysing {system} {method}")
        calc = a3.Calculation(base_dir=path_dicts[method][system], stream_log_level=logging.CRITICAL)
        method_results = {"dgs": calc._delta_g}
        final_dGs_all[system][method] = method_results
        for leg in calc.legs:
            print(f"Analysing {leg.leg_type}")
            leg_results = {"dg": leg._delta_g}
            method_results[str(leg.leg_type)] = leg_results
            for stage in leg.stages:
                leg_results[str(stage.stage_type)] = {"dg": stage._delta_g}
        # Close the calculation's log handlers before dropping the object
        calc._close_logging_handlers()
        del calc

        # Rewrite the pickle after every calculation so partial results are kept
        with open("final_analysis/final_dGs_all.pkl", "wb") as f:
            pickle.dump(final_dGs_all, f)

In [None]:
# Get the total costs of all runs

import a3fe as a3
import logging
import pickle

# Read from final_analysis/, which is where costs.pkl is written earlier in this notebook
with open("final_analysis/costs.pkl", "rb") as f:
    costs = pickle.load(f)

REF_COST = 0.21 # GPU hours per ns

# Ligand ID -> calculation directory
ligs = {
    "2": "lig_2",
    "3": "lig_3",
    "4": "lig_4",
    "8": "lig_8",
    "14": "lig_14",
    "16": "lig_16",
    "27": "lig_27",
    "39": "lig_39_cry_pose",
    "40": "lig_40",
}

# Adaptive runs use the same directories as ligs
adaptive_paths = {
    "2": "lig_2",
    "3": "lig_3",
    "4": "lig_4",
    "8": "lig_8",
    "14": "lig_14",
    "16": "lig_16",
    "27": "lig_27",
    "39": "lig_39_cry_pose",
    "40": "lig_40",
}

# Each non-adaptive run is found at ../non_adaptive/<adaptive dir>_5000ps
non_adapt_paths = {lig: f"../non_adaptive/{adapt_path}_5000ps" for lig, adapt_path in adaptive_paths.items()}
# Restraint corrections and restraint parameters
# NOTE(review): this section repeats the standalone restraint cell further down the notebook.

import a3fe as a3
import pickle

ligs = {
    "2": "lig_2", "3": "lig_3", "4": "lig_4",
    "8": "lig_8", "14": "lig_14", "16": "lig_16",
    "27": "lig_27", "39": "lig_39_cry_pose", "40": "lig_40",
}

# Get the restraint corrections for each system
restraint_corrections = {}
restraint_dicts = {}
for lig in ligs:
    print(f"Analysing {lig}")
    calc = a3.Calculation(base_dir = ligs[lig])
    # First restraint of the first leg (presumably the bound leg -- confirm)
    restr = calc.legs[0].restraints[0]
    restr_corr = restr.getCorrection().value()
    restraint_corrections[lig] = restr_corr
    print(f"Getting restraint parameters for {lig}")
    # The SOMD string has the form "<name> = <dict literal>"; keep the part after "="
    # NOTE(review): eval executes that text as Python. It is produced by our own
    # calculation here, but ast.literal_eval would be the safer choice.
    somd_dict_text = restr.toString("SOMD").split("=")[1]
    restr_dict = eval(somd_dict_text)
    restraint_dicts[lig] = restr_dict
    calc._close_logging_handlers()
    del calc

# Save both dictionaries once all ligands have been processed
for pkl_path, pkl_data in (
    ("final_analysis/restraint_corrections.pkl", restraint_corrections),
    ("final_analysis/restraint_dicts.pkl", restraint_dicts),
):
    with open(pkl_path, "wb") as f:
        pickle.dump(pkl_data, f)

# Total cost of each run: sum over legs of
#   relative leg cost * REF_COST (GPU hours per ns) * total simulation time of the leg
# (tot_simtime is presumably in ns -- confirm against a3fe)
path_dicts = {"adaptive": adaptive_paths, "non_adaptive": non_adapt_paths}
total_costs = {}
for method in path_dicts:
    total_costs[method] = {}
    for system in ligs:
        print(f"Analysing {system} {method}")
        calc = a3.Calculation(base_dir=path_dicts[method][system], stream_log_level=logging.CRITICAL)
        # Accept keys stored either as plain leg names ("bound") or as the string form
        # of the leg type (last dotted component), so any existing costs.pkl can be read
        leg_costs = {str(key).split(".")[-1].lower(): val for key, val in costs[system].items()}
        total_cost_sys = 0
        for leg in calc.legs:
            leg_cost = leg_costs[leg.leg_type.name.lower()] * REF_COST * leg.tot_simtime
            total_cost_sys += leg_cost
        total_costs[method][system] = total_cost_sys

        # Close the calculation's log handlers before dropping the object
        calc._close_logging_handlers()
        del calc

        # Rewrite the pickle after every calculation so partial results are kept
        with open("final_analysis/total_costs.pkl", "wb") as f:
            pickle.dump(total_costs, f)


In [5]:
# Process Alibay results
import pandas as pd
import numpy as np
from typing import Tuple
from scipy.stats import sem, t

def get_95_ci(data: np.ndarray) -> float:
    """
    Get the half-width of the 95 % confidence interval of the mean of ``data``.

    The interval comes from Student's t-distribution with ``len(data) - 1``
    degrees of freedom, centred on the sample mean and scaled by the standard
    error of the mean (``scipy.stats.sem``).

    Parameters
    ----------
    data : np.ndarray
        The repeat values to summarise (expected to be one-dimensional).

    Returns
    -------
    float
        The distance from the sample mean to the upper bound of the 95 %
        confidence interval, i.e. a single value and not a (lower, upper) pair.
    """
    mean_free_energy = np.mean(data)
    upper_bound = t.interval(
        0.95,
        len(data) - 1,
        loc=mean_free_energy,
        scale=sem(data),
    )[1]
    # The t-interval is symmetric about the mean, so one half-width describes it fully
    return upper_bound - mean_free_energy

import pickle  # used to save the results below; the imports at the top of this cell omit it

alibay_results = pd.read_csv("final_analysis/alibay_results.csv", index_col=0)

# Cyclod ligands we want
cyclod_ligs = ["2", "3", "4", "8", "14", "16", "27", "39", "40"]

# pandas may parse a purely numeric ligand_ID column as integers; compare as strings
# so that the IDs match the string IDs in cyclod_ligs either way
ligand_ids = alibay_results["ligand_ID"].astype(str)

# Save a dict of overall results: overall_results_alibay[lig] = {"mean_dG", "95_ci"}
overall_results_alibay = {}
for lig in cyclod_ligs:
    overall_results_alibay[lig] = {}
    # All calc_dG values recorded for this ligand
    dgs = alibay_results.loc[ligand_ids == lig, "calc_dG"].values
    mean_dgs = np.mean(dgs)
    ci_95 = get_95_ci(dgs)
    overall_results_alibay[lig]["mean_dG"] = mean_dgs
    overall_results_alibay[lig]["95_ci"] = ci_95

with open("final_analysis/overall_results_alibay.pkl", "wb") as f:
    pickle.dump(overall_results_alibay, f)

In [None]:
# Restraint corrections and restraint parameters

import a3fe as a3
import pickle

# Ligand ID -> calculation directory
ligs = {
    "2": "lig_2", "3": "lig_3", "4": "lig_4",
    "8": "lig_8", "14": "lig_14", "16": "lig_16",
    "27": "lig_27", "39": "lig_39_cry_pose", "40": "lig_40",
}

# Get the restraint corrections for each system
restraint_corrections = {}
restraint_dicts = {}
for lig in ligs:
    print(f"Analysing {lig}")
    calc = a3.Calculation(base_dir = ligs[lig])
    # First restraint of the first leg (presumably the bound leg -- confirm)
    restr = calc.legs[0].restraints[0]
    restr_corr = restr.getCorrection().value()
    restraint_corrections[lig] = restr_corr
    print(f"Getting restraint parameters for {lig}")
    # The SOMD string has the form "<name> = <dict literal>"; keep the part after "="
    # NOTE(review): eval executes that text as Python. It is produced by our own
    # calculation here, but ast.literal_eval would be the safer choice.
    somd_dict_text = restr.toString("SOMD").split("=")[1]
    restr_dict = eval(somd_dict_text)
    restraint_dicts[lig] = restr_dict
    calc._close_logging_handlers()
    del calc

# Save both dictionaries once all ligands have been processed
for pkl_path, pkl_data in (
    ("final_analysis/restraint_corrections.pkl", restraint_corrections),
    ("final_analysis/restraint_dicts.pkl", restraint_dicts),
):
    with open(pkl_path, "wb") as f:
        pickle.dump(pkl_data, f)



In [None]:
# Get lambda values

import a3fe as a3
import pickle


import logging  # needed for stream_log_level below; not imported at the top of this cell

# Ligand ID -> calculation directory. Defined here so that the cell does not depend
# on variables left over from other cells.
ligs = {
    "2": "lig_2",
    "3": "lig_3",
    "4": "lig_4",
    "8": "lig_8",
    "14": "lig_14",
    "16": "lig_16",
    "27": "lig_27",
    "39": "lig_39_cry_pose",
    "40": "lig_40",
}

# lam_vals[lig][leg type][stage type] = lambda values of that stage
lam_vals = {}
for lig in ligs:
    print(f"Analysing {lig}")
    lam_vals[lig] = {}
    calc = a3.Calculation(base_dir = ligs[lig], stream_log_level=logging.CRITICAL)
    for leg in calc.legs:
        lam_vals[lig][str(leg.leg_type)] = {}
        for stage in leg.stages:
            lam_vals[lig][str(leg.leg_type)][str(stage.stage_type)] = stage.lam_vals
    # Close the calculation's log handlers before dropping the object
    calc._close_logging_handlers()
    del calc

    # Write most recent version of the dictionary to a pickle
    with open("final_analysis/lam_vals.pkl", "wb") as f:
        pickle.dump(lam_vals, f)

In [None]:
# Get the sampling times

import a3fe as a3
import pickle

import logging  # needed for stream_log_level below; not imported at the top of this cell

# Ligand ID -> calculation directory. Defined here so that the cell does not depend
# on variables left over from other cells.
ligs = {
    "2": "lig_2",
    "3": "lig_3",
    "4": "lig_4",
    "8": "lig_8",
    "14": "lig_14",
    "16": "lig_16",
    "27": "lig_27",
    "39": "lig_39_cry_pose",
    "40": "lig_40",
}

# sampling_times_ns[lig][leg][stage] = {"times", "equil_times", "lam_vals"}, each a
# list with one entry per lambda window of the stage
sampling_times_ns = {}
for lig in ligs:
    print(f"Analysing {lig}")
    sampling_times_ns[lig] = {}
    calc = a3.Calculation(base_dir = ligs[lig], stream_log_level=logging.CRITICAL)
    for leg in calc.legs:
        leg_str = str(leg.leg_type).split(".")[1].lower()
        sampling_times_ns[lig][leg_str] = {}
        for stage in leg.stages:
            stage_str = str(stage.stage_type).split(".")[1].lower()
            sampling_times_ns[lig][leg_str][stage_str] = {}
            sampling_times_ns[lig][leg_str][stage_str]["times"] = [lam.tot_simtime for lam in stage.lam_windows]
            # Equilibration time multiplied by the ensemble size (presumably to make it
            # comparable with the window's total simulation time -- confirm)
            sampling_times_ns[lig][leg_str][stage_str]["equil_times"] = [lam.equil_time * lam.ensemble_size for lam in stage.lam_windows]
            sampling_times_ns[lig][leg_str][stage_str]["lam_vals"] = [lam.lam for lam in stage.lam_windows]

    # Close the calculation's log handlers before dropping the object
    calc._close_logging_handlers()
    del calc

    # Write most recent version of the dictionary to a pickle
    with open("final_analysis/sampling_times.pkl", "wb") as f:
        pickle.dump(sampling_times_ns, f)

In [None]:
# Get the sampling times for the non-adaptive runs
import a3fe as a3
import pickle

import logging  # needed for stream_log_level below; not imported at the top of this cell

# Ligand ID -> calculation directory
ligs = {
    "2": "lig_2",
    "3": "lig_3",
    "4": "lig_4",
    "8": "lig_8",
    "14": "lig_14",
    "16": "lig_16",
    "27": "lig_27",
    "39": "lig_39_cry_pose",
    "40": "lig_40",
}

adaptive_paths = {
    "2": "lig_2",
    "3": "lig_3",
    "4": "lig_4",
    "8": "lig_8",
    "14": "lig_14",
    "16": "lig_16",
    "27": "lig_27",
    "39": "lig_39_cry_pose",
    "40": "lig_40",
}

# Each non-adaptive run is found at ../non_adaptive/<adaptive dir>_5000ps
non_adapt_paths = {lig: f"../non_adaptive/{adapt_path}_5000ps" for lig, adapt_path in adaptive_paths.items()}

# sampling_times_ns[lig][leg][stage] = {"times", "equil_times", "lam_vals"}, each a
# list with one entry per lambda window of the stage (non-adaptive runs)
sampling_times_ns = {}
for lig in ligs:
    print(f"Analysing {lig}")
    sampling_times_ns[lig] = {}
    calc = a3.Calculation(base_dir = non_adapt_paths[lig], stream_log_level=logging.CRITICAL)
    for leg in calc.legs:
        leg_str = str(leg.leg_type).split(".")[1].lower()
        sampling_times_ns[lig][leg_str] = {}
        for stage in leg.stages:
            stage_str = str(stage.stage_type).split(".")[1].lower()
            sampling_times_ns[lig][leg_str][stage_str] = {}
            sampling_times_ns[lig][leg_str][stage_str]["times"] = [lam.tot_simtime for lam in stage.lam_windows]
            # Equilibration time multiplied by the ensemble size (presumably to make it
            # comparable with the window's total simulation time -- confirm)
            sampling_times_ns[lig][leg_str][stage_str]["equil_times"] = [lam.equil_time * lam.ensemble_size for lam in stage.lam_windows]
            sampling_times_ns[lig][leg_str][stage_str]["lam_vals"] = [lam.lam for lam in stage.lam_windows]

    # Close the calculation's log handlers before dropping the object
    calc._close_logging_handlers()
    del calc

    # Write most recent version of the dictionary to a pickle
    with open("final_analysis/sampling_times_nonadapt.pkl", "wb") as f:
        pickle.dump(sampling_times_ns, f)

In [None]:
import a3fe as a3
import logging
import pickle

# Ligand ID -> calculation directory
ligs = {
    "2": "lig_2", "3": "lig_3", "4": "lig_4",
    "8": "lig_8", "14": "lig_14", "16": "lig_16",
    "27": "lig_27", "39": "lig_39_cry_pose", "40": "lig_40",
}

# Adaptive runs use the same directories (independent copy of ligs)
adaptive_paths = dict(ligs)

# Each non-adaptive run is found at ../non_adaptive/<adaptive dir>_5000ps
non_adapt_paths = {
    lig: f"../non_adaptive/{adapt_path}_5000ps"
    for lig, adapt_path in adaptive_paths.items()
}

# Build, per method, a dictionary of the blocked free energy changes for each stage of
# each leg of each system: dgs_conv[lig][leg type][stage type] = {"fracts", "dgs"}
for method in ["adaptive", "non_adaptive"]:
    paths = non_adapt_paths if method != "adaptive" else adaptive_paths
    pkl_path = f"final_analysis/dgs_conv_{method}_nonequil.pkl"

    # Resume from an existing pickle when there is one, otherwise start from scratch
    try:
        with open(pkl_path, "rb") as f:
            dgs_conv = pickle.load(f)
    except FileNotFoundError:
        dgs_conv = {}

    for lig in ligs:
        # Ligands already present in the loaded pickle are not recomputed
        if lig in dgs_conv:
            continue
        lig_results = {}
        dgs_conv[lig] = lig_results
        print(f"Analysing {lig}")
        calc = a3.Calculation(base_dir = paths[lig], stream_log_level=logging.CRITICAL)
        for leg in calc.legs:
            leg_results = {}
            lig_results[str(leg.leg_type)] = leg_results
            for stage in leg.stages:
                stage_results = {}
                leg_results[str(stage.stage_type)] = stage_results
                fracts, dgs = stage.analyse_convergence(mode="block", equilibrated=False)
                stage_results["fracts"] = fracts
                stage_results["dgs"] = dgs
        calc._close_logging_handlers()
        del calc

        # Write most recent version of the dictionary to a pickle
        with open(pkl_path, "wb") as f:
            pickle.dump(dgs_conv, f)