In [None]:
import urllib
import urllib.request

import mdtraj as md
import nglview
import numpy as np
from openff.toolkit.topology import Molecule, Topology
from openff.toolkit.typing.engines.smirnoff import ForceField
from openff.units import unit
from openff.units.openmm import to_openmm
from openmm import app
from openmm import unit as openmm_unit

from openff.interchange.components.interchange import Interchange
from openff.interchange.drivers import get_amber_energies, get_openmm_energies
from openff.interchange.drivers.all import get_summary_data

In [None]:
def get_packed_coordinates(structure: str, n_waters: int):
    """Pack water molecules around an existing structure using `pack_box`.

    Parameters
    ----------
    structure
        Path to a structure file. It is loaded with `mdtraj.load` to estimate
        a box size and is also passed to `pack_box` as `structure_to_solvate`.
    n_waters
        Number of water molecules to add.

    Returns
    -------
    tuple
        `(packed_trj.xyz[0], packed_trj.unitcell_lengths)`: the coordinates of
        the first frame of the packed trajectory and its unit cell lengths
        (returned for all frames, not only the first).
    """
    # Imported locally, presumably so that the notebook's top-level imports do
    # not require openff-evaluator -- TODO confirm
    from openff.evaluator import unit as evaluator_unit
    from openff.evaluator.utils.packmol import pack_box

    water = Molecule.from_smiles("O")
    water.generate_conformers(n_conformers=1)

    trj = md.load(structure)

    # `unitcell_lengths` is either None or an array, so it must be compared
    # against None explicitly: the truth value of a multi-element NumPy array
    # raises ValueError.
    if trj.unitcell_lengths is not None:
        # Use the first frame so that both branches yield a length-3 vector
        box_size = 1.2 * trj.unitcell_lengths[0]
    else:
        # No unit cell in the file: fall back to the coordinate extent of the
        # first frame, padded by 20%
        box_size = 1.2 * (np.max(trj.xyz, axis=1) - np.min(trj.xyz, axis=1))[0]

    packed_trj, _ = pack_box(
        molecules=[water],
        number_of_copies=[n_waters],
        structure_to_solvate=structure,
        # The box lengths computed above are interpreted as nanometers here
        box_size=box_size * evaluator_unit.nanometer,
    )

    return (packed_trj.xyz[0], packed_trj.unitcell_lengths)

This example uses sample data from the [Protein Ligand Benchmark](https://github.com/openforcefield/protein-ligand-benchmark#proteinligandbenchmarks) data set curated by the Open Force Field Initiative. Specifically, the [MCL1](https://github.com/openforcefield/protein-ligand-benchmark/tree/8c94c0dcc892dfd77992567294b1ff31c62e8695/plbenchmark/sample_data/2020-08-26_mcl1_sample) data is used. Conveniently for the purposes of this example, the ligand is already docked and the protein is relatively small (~2000 atoms). Follow the links for details or to swap out ligand(s).

In [None]:
url = (
    "https://raw.githubusercontent.com/openforcefield/protein-ligand-benchmark/"
    "8c94c0dcc892dfd77992567294b1ff31c62e8695/plbenchmark/sample_data/2020-08-26_mcl1_sample/"
)

urllib.request.urlretrieve(url + "/01_protein/crd/protein.pdb", "protein.pdb")
urllib.request.urlretrieve(url + "02_ligands/lig_23/crd/lig_23.sdf", "lig_23.sdf")

# These two files (`protein.pdb` and `lig_23.sdf`) should be in the local path now
!ls -lhrt

The PDB file includes a few waters; the OpenFF Toolkit currently does not explicitly support parsing multi-component PDB files, so we'll use [MDTraj](https://mdtraj.org/) to parse the protein and save it to a new file.

In [None]:
# Load the downloaded PDB, keep only the atoms selected by "chainid 0"
# (which this example treats as the protein), and write them to a new file.
protein_with_waters = md.load("protein.pdb")
protein_atom_indices = protein_with_waters.top.select("chainid 0")
protein_pdb = protein_with_waters.atom_slice(protein_atom_indices)
protein_pdb.save("sliced.pdb")

Now, we can use the OpenFF Toolkit to load the protein and ligand from PDB and SDF files, respectively

In [None]:
%%capture
# This will take more than a few seconds, but it should take less than a minute
# `%%capture` above suppresses whatever this cell prints to the output area.
# The protein is read from the sliced PDB written earlier ...
protein = Molecule.from_pdb("sliced.pdb")
# ... and the ligand from the downloaded SDF file.
ligand = Molecule.from_file("lig_23.sdf")

From these `Molecule` objects, we can make a `Topology` object that represents the protein-ligand complex with no water. Later, we'll visualize this topology as a subset of the solvated complex.

In [None]:
# Topology of the complex without solvent: protein first, then ligand.
docked_topology = Topology.from_molecules([protein, ligand])

# TODO: There may be a simpler way to process positions after openff-toolkit #1207
# Stack the coordinates in the same order as the topology; both arrays are
# handled as nanometers before being wrapped in an OpenMM quantity.
protein_xyz_nm = protein_pdb.xyz[0]
ligand_xyz_nm = ligand.conformers[0].m_as(unit.nanometer)
docked_positions = openmm_unit.Quantity(
    np.concatenate([protein_xyz_nm, ligand_xyz_nm]),
    openmm_unit.nanometer,
)

docked_topology.to_file(filename="docked.pdb", positions=docked_positions)

Next, let's add an arbitrary number of waters to the system and visualize the result. The density here will be wrong; use your imagination to act like the right number of waters were added.

In [None]:
# Number of waters to add; arbitrary, not chosen to match a target density.
n_waters = 1000

water = Molecule.from_smiles("O")
water.generate_conformers(n_conformers=1)

packed_coordinates, box_vectors = get_packed_coordinates("docked.pdb", n_waters)

# The molecule order must match the coordinate order: protein, ligand, waters.
all_molecules = [protein, ligand] + n_waters * [water]
final_topology = Topology.from_molecules(all_molecules)
final_topology.to_file(
    filename="packed.pdb",
    positions=packed_coordinates * openmm_unit.nanometer,
)

In [None]:
# Show the packed system; atoms whose indices follow the protein's within the
# docked topology (i.e. the ligand) are drawn as green spheres.
packed_view_trj = md.load("packed.pdb")
w = nglview.show_mdtraj(packed_view_trj)
ligand_atom_indices = list(range(protein.n_atoms, docked_topology.n_atoms))
w.add_representation("spacefill", selection=ligand_atom_indices, color="green")
w

In [None]:
# Ask the widget created in the previous cell to render an image of its view.
# NOTE(review): presumably so a static picture is stored in the saved notebook -- confirm
w.render_image()

Now that we've prepared the topology of the system, we can apply force fields and generate inputs for simulation engines. Here, we'll use [OpenFF 2.0.0 "Sage"](https://openforcefield.org/community/news/general/sage2.0.0-release/) as a small molecule force field for the ligand and [OpenFF's port of Amber's ff14SB](https://github.com/openforcefield/amber-ff-porting/releases/tag/0.0.1) for the protein. Sage happens to include TIP3P parameters, which we'll use for the waters. Because of some bugs/performance issues, we have to remove the improper torsions from the protein force field and constraints from both force fields for now.

In [None]:
# Small-molecule force field (unconstrained variant), used for ligand and water
sage = ForceField("openff_unconstrained-2.0.0.offxml")

# This will take 1-2 minutes to load
ff14sb = ForceField("ff14sb_off_impropers_0.0.1.offxml")
# Drop the improper torsion handler from the protein force field
ff14sb.deregister_parameter_handler("ImproperTorsions")
bond_handler = ff14sb["Bonds"]
bond_handler.fractional_bondorder_method = "AM1-Wiberg"

For now, OpenFF's force field lines are not unified because a SMIRNOFF-based biopolymer force field is not yet released; in the future a self-consistent force field can describe both biopolymers and small molecules in one pass. But until then, we need to apply each force field to its respective components, generating an `Interchange` object for each, and then combine them using the `+` operator. This operator uses custom code that attempts to handle combining the chemical topologies, physical forces, and positions; it's not haphazardly squishing the objects together. (In this example, we're setting the positions on each topology before adding them together and then overwriting those positions later using the packed results. This is to get around a bug that has not been fixed yet.) However, this is still a sharp edge and likely to produce strange behavior - please do not use it in production work!

In [None]:
# Set the fractional bond order method on the protein force field's Bonds handler.
# NOTE(review): the same assignment is already made in the cell that loads
# `ff14sb`, so this cell looks redundant -- confirm before removing.
ff14sb["Bonds"].fractional_bondorder_method = "AM1-Wiberg"

In [None]:
# Parametrize the protein on its own with the ff14SB port.
# This might take a few minutes, some debug code should
# print each step to the cell's output
protein_topology = protein.to_topology()
protein_interchange = Interchange.from_smirnoff(ff14sb, protein_topology)

In [None]:
# Parametrize the ligand and all waters together with Sage; the molecule
# order (ligand, then waters) matches the tail of the packed topology.
sage_topology = Topology.from_molecules([ligand] + n_waters * [water])
sage_interchange = Interchange.from_smirnoff(sage, sage_topology)

Since we have already prepared the positions of the final system, which contains all components, we won't track positions in the intermediate `Interchange` objects and instead just use the setter on the final object. This will produce a warning (`Setting positions to None ...`) but that's fine.

In [None]:
# Merge the two parametrized systems, then attach the packed coordinates and
# the box to the combined object.
combined_interchange = protein_interchange + sage_interchange
combined_interchange.positions = unit.Quantity(packed_coordinates, unit.nanometer)

# Multiplying by the identity places the box lengths on the diagonal of a 3x3 matrix
box_matrix_nm = box_vectors * np.eye(3)
combined_interchange.box = unit.Quantity(box_matrix_nm, unit.nanometer)

Now that we've prepared all atomic positions, applied each force field, and combined the results, we can visualize the result to verify that at least the positions and topology are not mangled:

In [None]:
# Round-trip the combined system through a PDB file and view it, again
# drawing the ligand atoms as green spheres.
combined_interchange.to_pdb(file_path="out.pdb")

combined_view_trj = md.load("out.pdb")
w = nglview.show_mdtraj(combined_view_trj)
ligand_atom_indices = list(range(protein.n_atoms, docked_topology.n_atoms))
w.add_representation("spacefill", selection=ligand_atom_indices, color="green")
w

In [None]:
# Ask the widget created in the previous cell to render an image of its view.
# NOTE(review): presumably so a static picture is stored in the saved notebook -- confirm
w.render_image()

Finally, we can export the final `Interchange` object to models understood by various simulation engines. Some of these exports are not yet optimized for large files.

In [None]:
# Export both the OpenMM system and the matching OpenMM topology, then print
# their types as a quick sanity check.
openmm_system = combined_interchange.to_openmm()

combined_topology = combined_interchange.topology
openmm_topology = combined_topology.to_openmm(ensure_unique_atom_names=False)

print(type(openmm_system), type(openmm_topology))

In [None]:
# Write the combined system to Amber-format files: coordinates to
# `out.inpcrd` and parameters/topology to `out.prmtop`.
combined_interchange.to_inpcrd("out.inpcrd")
combined_interchange.to_prmtop("out.prmtop")

In [None]:
# These exports in particular are unfortunately too slow
# to use for large systems at the moment, so they are disabled by default.
# Set this flag to True to also write the GROMACS and LAMMPS files.
EXPORT_SLOW_FORMATS = False

if EXPORT_SLOW_FORMATS:
    combined_interchange.to_gro("out.gro")
    combined_interchange.to_top("out.top")

    combined_interchange.to_lammps("out.lmp")

In order to verify the accuracy of each export, we can use functions in the `drivers` module to call out to each engine to evaluate single-point energies. Under the hood, each function uses the export functions just as we did in the above cells. 

In [None]:
# Evaluate single-point energies with each engine and print them with a label.
openmm_energies = get_openmm_energies(combined_interchange)
print("OpenMM", openmm_energies)

amber_energies = get_amber_energies(combined_interchange)
print("AMBER", amber_energies)

Note that some of these functions are not yet performant for systems of this size, so we are only evaluating the OpenMM and Amber interfaces. In the future, GROMACS and LAMMPS exports can be included above, and the function `get_summary_data` can be called on it. As a sneak peek, below is the result of calling that function on an `Interchange` that contains only the ligand. The data is presented as a Pandas DataFrame, which includes convenient methods for summary statistics.

In [None]:
# Build a ligand-only system with Sage, using the ligand's first conformer as
# positions and the same box as the packed system.
ligand_topology = ligand.to_topology()
ligand_interchange = Interchange.from_smirnoff(sage, ligand_topology)
ligand_interchange.positions = ligand.conformers[0]

ligand_box_nm = box_vectors * np.eye(3)
ligand_interchange.box = unit.Quantity(ligand_box_nm, unit.nanometer)

In [None]:
# Collect the energy summary for the ligand-only system and display it as
# the cell's output.
summary = get_summary_data(ligand_interchange)
summary

In [None]:
# Display summary statistics of the table produced in the previous cell.
summary.describe()