Merge pull request #193 from openforcefield/connectivity-check

Deal with connectivity changes in QC generation
openforcefield · Sep 21, 2022 · 5663b52 · 5663b52
2 parents 17edf9c + f112f24
commit 5663b52
Show file tree

Hide file tree

Showing 6 changed files with 281 additions and 5 deletions.
diff --git a/devtools/conda-envs/docs-env.yaml b/devtools/conda-envs/docs-env.yaml
@@ -37,6 +37,7 @@ dependencies:
   - chemper
   - geometric
   - torsiondrive
+  - pymbar <4
 
     # Executor
   - uvicorn

diff --git a/devtools/conda-envs/no_openeye.yaml b/devtools/conda-envs/no_openeye.yaml
@@ -39,6 +39,7 @@ dependencies:
   - chemper
   - geometric
   - torsiondrive
+  - pymbar <4
 
     # Executor
   - uvicorn

diff --git a/devtools/conda-envs/test-env.yaml b/devtools/conda-envs/test-env.yaml
@@ -41,6 +41,7 @@ dependencies:
   - chemper
   - geometric
   - torsiondrive
+  - pymbar <4
 
     # Executor
   - uvicorn

diff --git a/openff/bespokefit/executor/services/coordinator/stages.py b/openff/bespokefit/executor/services/coordinator/stages.py
@@ -51,6 +51,7 @@
     ProperTorsionSMIRKS,
     VdWSMIRKS,
 )
+from openff.bespokefit.schema.targets import TargetSchema
 from openff.bespokefit.schema.tasks import Torsion1DTask
 from openff.bespokefit.utilities.pydantic import BaseModel
 from openff.bespokefit.utilities.smirks import (
@@ -512,7 +513,9 @@ async def _inject_bespoke_qc_data(
         input_schema: BespokeOptimizationSchema,
     ):
 
-        targets = [target for stage in input_schema.stages for target in stage.targets]
+        targets: List[TargetSchema] = [
+            target for stage in input_schema.stages for target in stage.targets
+        ]
         for i, target in enumerate(targets):
 
             if not isinstance(target.reference_data, BespokeQCData):

diff --git a/openff/bespokefit/schema/targets.py b/openff/bespokefit/schema/targets.py
@@ -1,14 +1,17 @@
 import abc
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, Optional, TypeVar, Union
 
 from openff.qcsubmit.results import (
     BasicResultCollection,
     OptimizationResultCollection,
     TorsionDriveResultCollection,
 )
-from pydantic import Field, PositiveFloat
+from openff.toolkit.topology import Molecule
+from pydantic import Field, PositiveFloat, validator
 from qcelemental.models import AtomicResult
+from qcelemental.models import Molecule as QCEMolecule
 from qcelemental.models.procedures import OptimizationResult, TorsionDriveResult
+from qcelemental.molutil import guess_connectivity
 from typing_extensions import Literal
 
 from openff.bespokefit.schema.data import BespokeQCData, LocalQCData
@@ -20,6 +23,130 @@
 from openff.bespokefit.utilities.pydantic import SchemaBase
 
 
+def _check_connectivity(
+    qcschema: QCEMolecule,
+    name: str,
+    fragment: Optional[Molecule] = None,
+):
+    """
+    Raise an exception if the geometry of ``qcschema`` does not match ``fragment``
+
+    Parameters
+    ==========
+
+    qcschema
+        A QCElemental ``Molecule`` representing the final geometry of a QC
+        computation
+    name
+        A name for the current calculation. Used in the exception raised by this
+        method.
+    fragment
+        An OpenFF ``Molecule`` representing the true chemical identity of the
+        fragment.
+
+    Returns
+    =======
+
+    None
+
+    Raises
+    ======
+
+    ValueError
+        If the connectivity does not match.
+    """
+    # If expected connectivity is not provided, compute it from the fragment
+    if fragment is None:
+        fragment = Molecule.from_qcschema(qcschema)
+
+    # Get expected connectivity from bonds
+    expected_connectivity = {
+        tuple(sorted([bond.atom1_index + 1, bond.atom2_index + 1]))
+        for bond in fragment.bonds
+    }
+
+    # Guess found connectivity from the output geometry
+    actual_connectivity = {
+        tuple(sorted([a + 1, b + 1]))
+        for a, b in guess_connectivity(qcschema.symbols, qcschema.geometry)
+    }
+
+    if expected_connectivity != actual_connectivity:
+        # Pydantic validators must raise ValueError, TypeError or AssertionError
+        raise ValueError(
+            f"Target record {name}: "
+            + "Reference data does not match target.\n"
+            + f"Expected mapped SMILES: {fragment.to_smiles(mapped=True)}\n"
+            + "The following connections were expected but not found: "
+            + f"{expected_connectivity - actual_connectivity}\n"
+            + "The following connections were found but not expected: "
+            + f"{actual_connectivity - expected_connectivity}\n"
+            + f"The reference geometry is: {qcschema.geometry}"
+        )
+
+
+R = TypeVar(
+    "R",
+    None,
+    LocalQCData[TorsionDriveResult],
+    LocalQCData[OptimizationResult],
+    BespokeQCData[Torsion1DTaskSpec],
+    BespokeQCData[OptimizationTaskSpec],
+    OptimizationResultCollection,
+    TorsionDriveResultCollection,
+)
+
+
+def _validate_connectivity(
+    cls,
+    ref_data: R,
+) -> R:
+    """
+    Check that connectivity has not changed over the course of QC computation.
+
+    This function can be used as a validator for the ``reference_data`` field of
+    a target schema if the connectivity may change over the course of computing
+    the target:
+
+        def __init__(...):
+            ...
+            _reference_data_connectivity = validator("reference_data", allow_reuse=True)(
+                _check_connectivity
+            )
+            ...
+
+    """
+    if ref_data is None or isinstance(ref_data, BespokeQCData):
+        # Reference data has not been computed, so the connectivity is intact
+        return ref_data
+    elif isinstance(ref_data, LocalQCData):
+        for qc_record in ref_data.qc_records:
+            # Some qc records (eg, TorsionDriveResult) use .final_molecules (plural),
+            # others (eg, OptimizationResult) use .final_molecule (singular)
+            try:
+                final_molecules = qc_record.final_molecules
+            except AttributeError:
+                final_molecules = {"opt": qc_record.final_molecule}
+
+            for name, qcschema in final_molecules.items():
+                _check_connectivity(qcschema, name)
+
+    elif hasattr(ref_data, "to_records"):
+        for qc_record, fragment in ref_data.to_records():
+            # Some qc records (eg, TorsionDriveRecord) use .get_final_molecules() (plural),
+            # others (eg, OptimizationRecord) use .get_final_molecule() (singular)
+            try:
+                final_molecules = qc_record.get_final_molecules()
+            except AttributeError:
+                final_molecules = {"opt": qc_record.get_final_molecule()}
+
+            for name, qcschema in final_molecules.items():
+                _check_connectivity(qcschema, name, fragment)
+
+    # No connectivity changes found, so return the unchanged input as validated
+    return ref_data
+
+
 class BaseTargetSchema(SchemaBase, abc.ABC):
     """The base class for models which store information about fitting targets."""
 
@@ -67,6 +194,9 @@ class TorsionProfileTargetSchema(BaseTargetSchema):
         description="The reference QC data (either existing or to be generated on the "
         "fly) to fit against.",
     )
+    _reference_data_connectivity = validator("reference_data", allow_reuse=True)(
+        _validate_connectivity
+    )
     calculation_specification: Optional[Torsion1DTaskSpec] = Field(
         None,
         description="The specification for the reference torsion drive calculation, also acts as a provenance source.",
@@ -102,6 +232,9 @@ class AbInitioTargetSchema(BaseTargetSchema):
         description="The reference QC data (either existing or to be generated on the "
         "fly) to fit against.",
     )
+    _reference_data_connectivity = validator("reference_data", allow_reuse=True)(
+        _validate_connectivity
+    )
     calculation_specification: Optional[Torsion1DTaskSpec] = Field(
         None,
         description="The specification for the reference torsion drive calculation, also acts as a provenance source.",
@@ -169,6 +302,9 @@ class OptGeoTargetSchema(BaseTargetSchema):
         description="The reference QC data (either existing or to be generated on the "
         "fly) to fit against.",
     )
+    _reference_data_connectivity = validator("reference_data", allow_reuse=True)(
+        _validate_connectivity
+    )
     calculation_specification: Optional[OptimizationTaskSpec] = Field(
         None,
         description="The specification for the reference optimisation calculation, also acts as a provenance source.",

diff --git a/openff/bespokefit/tests/schema/test_targets.py b/openff/bespokefit/tests/schema/test_targets.py
@@ -2,8 +2,12 @@
 from pydantic import ValidationError
 from qcelemental.models.common_models import Model
 
-from openff.bespokefit.schema.data import BespokeQCData
-from openff.bespokefit.schema.targets import TorsionProfileTargetSchema
+from openff.bespokefit.schema.data import BespokeQCData, LocalQCData
+from openff.bespokefit.schema.targets import (
+    AbInitioTargetSchema,
+    OptGeoTargetSchema,
+    TorsionProfileTargetSchema,
+)
 from openff.bespokefit.schema.tasks import HessianTaskSpec, Torsion1DTaskSpec
 
 
@@ -31,3 +35,133 @@ def test_check_reference_data(qc_torsion_drive_results):
                 )
             )
         )
+
+
+@pytest.mark.parametrize(
+    "TargetSchema",
+    [
+        TorsionProfileTargetSchema,
+        AbInitioTargetSchema,
+        OptGeoTargetSchema,
+    ],
+)
+class TestCheckConnectivity:
+    @pytest.fixture()
+    def expected_err(self, TargetSchema) -> str:
+        return (
+            r"1 validation error for "
+            + TargetSchema.__name__
+            + r"\n"
+            + r"reference_data\n"
+            + r"  Target record (opt|\[-165\]): Reference data "
+            + r"does not match target\.\n"
+            + r"Expected mapped SMILES: "
+            # This regex for the mapped SMILES is probably extremely fragile;
+            # if this test breaks after an RDkit/OpenEye update, try replacing it
+            # with something like `+ r".*"`
+            + r"(\(?[-+]?\[("
+            + r"H:13|c:1|c:3|c:7|c:11|c:8|c:4|H:16|H:20|c:12|c:9|c:5|c:2|c:6|c:10"
+            + r"|H:22|H:18|H:14|H:17|H:21|H:19|H:15"
+            + r")\]1?2?\)?){22}"
+            # End fragile regex
+            + r"\n"
+            + r"The following connections were expected but not found: "
+            + r"{\(1, 13\), \(1, 3\), \(1, 4\)}\n"
+            + r"The following connections were found but not expected: "
+            + r"{\(1, 6\), \(1, 2\), \(1, 14\), \(1, 5\)}\n"
+            + r"The reference geometry is: \[(\[.*\]\n ){21}\[.*\]\]"
+            # + r"The reference geometry is: \[.*\]"
+            + r" \(type=value_error\)"
+        )
+
+    @pytest.fixture()
+    def ref_data_local(self, TargetSchema, request):
+        ref_data_fixture = {
+            TorsionProfileTargetSchema: "qc_torsion_drive_qce_result",
+            AbInitioTargetSchema: "qc_torsion_drive_qce_result",
+            OptGeoTargetSchema: "qc_optimization_qce_result",
+        }[TargetSchema]
+        result, _ = request.getfixturevalue(ref_data_fixture)
+        return [result]
+
+    @pytest.fixture()
+    def ref_data_qcfractal(self, TargetSchema, request):
+        ref_data_fixture = {
+            TorsionProfileTargetSchema: "qc_torsion_drive_results",
+            AbInitioTargetSchema: "qc_torsion_drive_results",
+            OptGeoTargetSchema: "qc_optimization_results",
+        }[TargetSchema]
+        return request.getfixturevalue(ref_data_fixture)
+
+    def test_check_connectivity_local_positive(
+        self,
+        TargetSchema,
+        ref_data_local,
+    ):
+        TargetSchema(reference_data=LocalQCData(qc_records=ref_data_local))
+
+    def test_check_connectivity_local_negative(
+        self,
+        TargetSchema,
+        ref_data_local,
+        expected_err,
+    ):
+        # Swap the first two atoms' coordinates to break their connectivity
+        torsiondrive_result_disconnection = ref_data_local[0]
+        try:
+            geom = next(
+                iter(torsiondrive_result_disconnection.final_molecules.values())
+            ).geometry
+        except AttributeError:
+            geom = torsiondrive_result_disconnection.final_molecule.geometry
+        geom[0], geom[1] = geom[1], geom[0]
+
+        with pytest.raises(ValidationError, match=expected_err):
+            TargetSchema(
+                reference_data=LocalQCData(
+                    qc_records=[torsiondrive_result_disconnection]
+                )
+            )
+
+    def test_check_connectivity_qcfractal_positive(
+        self,
+        TargetSchema,
+        ref_data_qcfractal,
+    ):
+        TargetSchema(reference_data=ref_data_qcfractal)
+
+    def test_check_connectivity_qcfractal_negative(
+        self,
+        TargetSchema,
+        ref_data_qcfractal,
+        expected_err,
+    ):
+        [(record, _)] = ref_data_qcfractal.to_records()
+
+        # Get the first of the final molecules, and prepare an update function so
+        # that changes are preserved across calls to get_final_molecule(s)()
+        try:
+            final_molecules = record.get_final_molecules()
+            key, first_final_mol = next(iter(record.cache["final_molecules"].items()))
+
+            def update_record(updated_mol):
+                final_molecules.update({key: updated_mol})
+                record.__dict__["get_final_molecules"] = lambda: final_molecules
+
+        except AttributeError:
+            first_final_mol = record.get_final_molecule()
+
+            def update_record(updated_mol):
+                record.__dict__["get_final_molecule"] = lambda: updated_mol
+
+        # Swap the first two atoms' coordinates to break their connectivity
+        geom = first_final_mol.geometry.copy()
+        geom[0], geom[1] = geom[1], geom[0]
+        updated_mol = first_final_mol.copy(update={"geometry": geom})
+
+        # Update the record with the new geometry
+        update_record(updated_mol)
+
+        # Create the target schema, which should fail to validate
+        with pytest.raises(ValidationError, match=expected_err):
+            TargetSchema(reference_data=ref_data_qcfractal)