# Analysis of Alibay ABFE Results

In [our manuscript](https://doi.org/10.26434/chemrxiv-2024-3ft7f), we point out that most component simulations of the ABFE calculations are not strictly converged, because the distributions of perturbed energies sampled by replicate runs are significantly different. To illustrate that this issue is widespread, we analyse gradients from ABFE calculations from [Alibay et al.'s work](https://www.nature.com/articles/s42004-022-00721-4). We arbitrarily choose to analyse the Cyclophilin D data. We are grateful to Alibay et al. for providing the raw gradient data.

In [None]:
import a3fe as a3
import os
from pathlib import Path
import matplotlib.pyplot as plt
from scipy.stats import kruskal
from pymbar.timeseries import statisticalInefficiency
from tqdm import tqdm
import numpy as np
plt.style.use("seaborn-v0_8-colorblind")

## Cyclophilin D: Data Extraction

In [None]:
# Get the data from zenodo (record 5906019): one archive for the bound
# ("complex") leg and one for the free ("ligand") leg.
# NOTE(review): the downloads appear to be saved under names that include the
# "?download=1" query string (the unzip cell renames them) - confirm.

! wget https://zenodo.org/records/5906019/files/complex.zip\?download\=1
! wget https://zenodo.org/records/5906019/files/ligand.zip\?download\=1

In [None]:
# Unzip the data

! mv complex.zip\?download\=1.1 complex.zip && unzip complex.zip
! mv ligand.zip\?download\=1 ligand.zip && unzip ligand.zip

In [None]:
# Root folder that holds the extracted 'complex' and 'ligand' directories
base_dir = Path('.')

# Nested mapping: leg -> ligand -> run -> stage -> lambda label -> .xvg path
xvg_paths = {}
for leg in ['complex', 'ligand']:
    leg_dir = base_dir / leg
    if not leg_dir.exists():
        print(f"Directory {leg_dir} does not exist.")
        continue

    # The ligand leg has no restraint stage
    if leg == 'ligand':
        leg_stages = ["coul", "vdw"]
    else:
        leg_stages = ["restraints", "coul", "vdw"]

    leg_entry = xvg_paths[leg] = {}
    for lig_dir in leg_dir.iterdir():
        # Only directories correspond to ligands; stray files are ignored
        if not lig_dir.is_dir():
            continue

        lig_entry = leg_entry[lig_dir.name] = {}
        for run in range(1, 6):
            run_entry = lig_entry[run] = {}
            for stage in leg_stages:
                stage_dir = lig_dir / f"run{run}" / f"{stage}-xvg"
                if not stage_dir.exists():
                    print(f"Directory {stage_dir} does not exist.")
                    continue

                # The lambda label is the second dot-separated field of the
                # file stem
                run_entry[stage] = {
                    candidate.stem.split(".")[1]: candidate
                    for candidate in stage_dir.iterdir()
                    if candidate.is_file() and candidate.suffix == '.xvg'
                }

In [None]:
def read_grads(xvg_path: Path) -> list[float]:
    """Read the gradient column from an .xvg file.

    Header/metadata lines (starting with "#" or "@", optionally preceded by
    whitespace) and blank lines are skipped. For every remaining line, the
    second whitespace-separated column is parsed as a float.

    Args:
        xvg_path: Path to the .xvg file to read.

    Returns:
        The gradients in file order; an empty list if the file contains no
        data lines.

    Raises:
        IndexError: If a data line has fewer than two columns.
        ValueError: If the second column of a data line is not a number.
    """
    grads = []
    for raw_line in xvg_path.read_text().splitlines():
        line = raw_line.strip()
        # Skip blank lines as well as headers so that stray empty lines in
        # the file do not raise an IndexError below
        if not line or line.startswith(("#", "@")):
            continue
        # The gradients are the second column
        grads.append(float(line.split()[1]))
    return grads

In [None]:
# Sanity check on a single window: report the simulation time implied by the
# number of gradient data points in one example file
NRG_FREQ = 100  # presumably MD steps between successive gradient outputs - confirm
TIMESTEP = 4E-6 # ns

example_xvg_path = xvg_paths['complex']['ligand-27'][1]['coul']['0']
example_xvgs = read_grads(example_xvg_path)

# N data points span N - 1 sampling intervals
n_intervals = len(example_xvgs) - 1
total_time = n_intervals * NRG_FREQ * TIMESTEP
print(f"Total time: {total_time} ns")

In [None]:
# Load every gradient time series and thin it out: the statistical
# inefficiency (pymbar timeseries), rounded to the nearest integer, is used
# as the stride between retained samples

grads_subsampled = {}
for leg in ['complex', 'ligand']:
    leg_out = grads_subsampled[leg] = {}
    for lig in tqdm(xvg_paths[leg], desc=leg):
        lig_out = leg_out[lig] = {}
        for run, stage_paths in xvg_paths[leg][lig].items():
            run_out = lig_out[run] = {}
            for stage, lam_paths in stage_paths.items():
                stage_out = run_out[stage] = {}
                for lam, xvg_path in lam_paths.items():
                    series = read_grads(xvg_path)
                    stride = round(statisticalInefficiency(series))
                    stage_out[lam] = series[::stride]

## Cyclophilin D: Data Analysis

In [None]:
def get_sig_diff_grads(
    grads_subsampled: dict, leg: str, lig: str, stage: str, alpha: float = 0.05
) -> tuple[int, int]:
    """
    Count the lambda windows whose gradient distributions differ
    significantly between replicate runs, using the Kruskal-Wallis test.

    Args:
        grads_subsampled: Nested mapping
            leg -> ligand -> run -> stage -> lambda label -> gradients.
        leg: Leg to analyse (key into ``grads_subsampled``).
        lig: Ligand to analyse.
        stage: Stage to analyse.
        alpha: Significance threshold; a window is counted when the
            Kruskal-Wallis p-value is strictly below this value.

    Returns:
        ``(n_lam, n_sig_diff)``: the number of lambda windows and the number
        of those windows with significant inter-run differences. These are
        counts; the caller converts them to a percentage.

    Raises:
        KeyError: If a run lacks ``stage`` or one of the lambda windows
            present in the first run.
    """
    # Use every run present for this ligand rather than assuming runs 1-5
    per_run = [run_data[stage] for run_data in grads_subsampled[leg][lig].values()]
    # The lambda windows of the first run define the set to test; iterating
    # over the labels avoids assuming they are exactly "0".."n-1"
    lam_labels = list(per_run[0])

    n_sig_diff = 0
    for lam in lam_labels:
        _, p = kruskal(*(run_grads[lam] for run_grads in per_run))
        if p < alpha:
            n_sig_diff += 1

    return len(lam_labels), n_sig_diff

In [None]:
# Collect, for every ligand, leg and stage, the (n_lam, n_sig_diff) tuple
# returned by get_sig_diff_grads. The stages are those present in run 1.
sig_diff_grads = {
    lig: {
        leg: {
            stage: get_sig_diff_grads(grads_subsampled, leg, lig, stage)
            for stage in grads_subsampled[leg][lig][1]
        }
        for leg in ['complex', 'ligand']
    }
    for lig in grads_subsampled["complex"]
}


In [None]:
# Plot the percentage of lambda windows where the gradients are significantly different all on one bar plot
fig, ax = plt.subplots(figsize=(12, 4), dpi=300)
x = np.arange(len(sig_diff_grads))
width = 0.2
stage_name_map = {"restraints": "Restrain", "coul": "Discharge", "vdw": "Vanish"}
ligand_labels = [lig_name.replace("-", " ") for lig_name in sig_diff_grads]

# One bar position per stage. The free-leg bar of a stage is drawn at the same
# position as the bound-leg bar, as a hatched, semi-transparent overlay.
for i, stage in enumerate(['restraints', 'coul', 'vdw']):
    # "CN" is matplotlib's public way to pick the N-th colour of the default
    # property cycle (avoids the private ax._get_lines API)
    color = f"C{i}"
    stage_name = stage_name_map[stage]
    offsets = x + i * width

    # Bound leg: each entry is (n_lam, n_sig_diff), converted to a percentage
    y_complex = [100 * (sig_diff_grads[lig]["complex"][stage][1] / sig_diff_grads[lig]["complex"][stage][0]) for lig in sig_diff_grads]
    ax.bar(offsets, y_complex, width, label=f"Bound {stage_name}", edgecolor="k", alpha=1, color=color)

    # Free leg (has no restraint stage)
    if stage != 'restraints':
        y_lig = [100 * (sig_diff_grads[lig]["ligand"][stage][1] / sig_diff_grads[lig]["ligand"][stage][0]) for lig in sig_diff_grads]
        ax.bar(offsets, y_lig, width, label=f"Free {stage_name}", edgecolor="k", alpha=0.8, color=color, hatch="///////")

# Centre the ticks on the middle of the three bar positions and rotate the labels 90
ax.set_xticks(x + width)
ax.set_xticklabels(ligand_labels, rotation=90)
ax.set_ylabel("% Windows with Significant\n Inter-run Differences Between\n Gradient Distributions")
# Put label off to right of plot
ax.legend(bbox_to_anchor=(1.03, 0.7))
fig.tight_layout(pad=-2)

# Make sure the output directory exists so that savefig does not fail on a
# fresh checkout
output_path = Path("final_analysis/gradient_sig_diffs_cyclod_alibay.png")
output_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(output_path, dpi=300, bbox_inches="tight")


In [None]:
# Remove all the downloaded data: the 'complex' and 'ligand' directories and
# the renamed zip archives

! rm -r complex ligand complex.zip ligand.zip