In [1]:
import datasets
import tqdm

## Load dataset

These datasets are stored in the [Hugging Face `datasets`](https://huggingface.co/docs/datasets/en/index) format.

In [2]:
# Load the optimization and torsion-drive datasets from their on-disk directories.
opt_dataset, td_dataset = map(
    datasets.Dataset.load_from_disk,
    ("../data/additional-optimizations/", "../data/additional-torsiondrives/"),
)

Datasets are represented with the features in each entry.

In [3]:
# Bare expression: the notebook renders the dataset summary (feature names and row count).
opt_dataset

Dataset({
    features: ['smiles', 'coords', 'energy', 'forces'],
    num_rows: 70
})

Datasets can be indexed to get a single entry. "coords", "forces", etc. are stored as flat lists of floats.

In [4]:
# Pull out the first optimization record to see what a single entry looks like.
opt_dataset[0]

{'smiles': '[H:14][C:3]1=[C:2]([N:11]([N:10]=[C:4]1[C:5]23[C:6]([C:7]([C:8]2([H:18])[H:19])([C:9]3([H:20])[H:21])[H:17])([H:15])[H:16])[H:22])[N:1]([H:12])[H:13]',
 'coords': tensor([-1.7365,  1.3097,  5.1288, -1.5198,  1.1527,  3.7460, -0.6614,  0.3525,
          3.0057, -0.8821,  0.7326,  1.6502, -0.2310,  0.1866,  0.4400, -0.4846,
          0.6215, -1.0329,  0.5960, -0.5005, -1.1146,  1.2970,  0.1879,  0.0944,
         -0.2535, -1.2926, -0.0766, -1.8106,  1.6883,  1.5537, -2.1830,  1.9305,
          2.8444, -1.0775,  0.7501,  5.6606, -2.6807,  1.0490,  5.4048,  0.0104,
         -0.4076,  3.3820, -1.4734,  0.3898, -1.4397, -0.1691,  1.6355, -1.2960,
          1.0740, -0.8972, -2.0142,  1.7470,  1.1697, -0.0845,  1.9392, -0.4433,
          0.7182,  0.2762, -2.0313,  0.5348, -1.2254, -1.6684, -0.4125, -2.8092,
          2.6975,  3.0476]),
 'energy': tensor([-298509.5312]),
 'forces': tensor([ 8.6516e-03,  9.9962e-03, -2.1068e-02, -6.0739e-02, -7.5098e-02,
          1.4797e-02,  6.7018e

We'll need to convert the numeric columns back to PyTorch tensors.

In [5]:
# Have both datasets return their numeric columns as torch tensors.
# `output_all_columns=True` keeps the un-formatted columns (e.g. "smiles")
# in the returned entries as well.
for _dataset in (opt_dataset, td_dataset):
    _dataset.set_format(
        "torch", columns=["energy", "coords", "forces"], output_all_columns=True
    )

In [6]:
# Look at the first entry again now that the output format has been set.
opt_dataset[0]

{'coords': tensor([-1.7365,  1.3097,  5.1288, -1.5198,  1.1527,  3.7460, -0.6614,  0.3525,
          3.0057, -0.8821,  0.7326,  1.6502, -0.2310,  0.1866,  0.4400, -0.4846,
          0.6215, -1.0329,  0.5960, -0.5005, -1.1146,  1.2970,  0.1879,  0.0944,
         -0.2535, -1.2926, -0.0766, -1.8106,  1.6883,  1.5537, -2.1830,  1.9305,
          2.8444, -1.0775,  0.7501,  5.6606, -2.6807,  1.0490,  5.4048,  0.0104,
         -0.4076,  3.3820, -1.4734,  0.3898, -1.4397, -0.1691,  1.6355, -1.2960,
          1.0740, -0.8972, -2.0142,  1.7470,  1.1697, -0.0845,  1.9392, -0.4433,
          0.7182,  0.2762, -2.0313,  0.5348, -1.2254, -1.6684, -0.4125, -2.8092,
          2.6975,  3.0476]),
 'energy': tensor([-298509.5312]),
 'forces': tensor([ 8.6516e-03,  9.9962e-03, -2.1068e-02, -6.0739e-02, -7.5098e-02,
          1.4797e-02,  6.7018e-02,  6.8418e-02,  1.4632e-02, -1.8251e-02,
         -5.5096e-02,  2.0181e-02, -4.5286e-02,  5.9464e-02, -4.5025e-02,
         -3.8120e-03, -6.7527e-03,  5.1948e-03

In [7]:
# Number of entries in the optimization dataset.
len(opt_dataset)

70

In [8]:
# Length of the flat "coords" tensor of the first torsion-drive entry.
len(td_dataset[0]["coords"])

1584

## Fitting

For how to fit a force field to optimization data from a SMIRNOFF force field, here's an [example I put together for the IRL Irvine meeting](https://openforcefield.atlassian.net/wiki/spaces/MEET/pages/3440508935/Hackathon+How+to+train+your+force+field+with+smee) (`run-smee-fit-from-qca-data-commented.ipynb`), where you can largely follow on from the "Assign parameters to molecules in the dataset" heading.

The only note is that the `descent.targets.energy.predict` function would have to be re-written to not include forces in the objective and prediction if they're not in the data.

Let's try fitting as a 3-step procedure:

1. Identify a place where the way the existing 
2. Use BESMARTS to 

In [9]:
from utils import ff_to_csys

# Build the chemical system used throughout the rest of the notebook from the
# "openff-2.2.1.offxml" force field.
# NOTE(review): `ff_to_csys` comes from the local `utils` module; presumably it
# converts an OpenFF force field into a BESMARTS chemical system — confirm there.
chemical_system = ff_to_csys("openff-2.2.1.offxml")

In [35]:
from besmarts.core.assignments import graph_db as GraphDb
from besmarts.core.assignments import graph_db_add_single_molecule_state
from besmarts.core.assignments import graph_db_address as GraphDbAddress
from besmarts.mechanics.fits import (
    objective_config_energy_total as ObjectiveConfigEnergyTotal,
)
from besmarts.mechanics.fits import objective_config_gradient as ObjectiveConfigGradient
from besmarts.mechanics.fits import objective_config_position as ObjectiveConfigPosition
from besmarts.mechanics.fits import objective_tier as ObjectiveTier
from utils import data_to_graph_assignment

graph_db = GraphDb()
eid = 0
n_entries = 0  # number of entries from each dataset to include, 0 for all

# First, the optimizations: one geometry per entry, with an energy and forces.
for entry in list(opt_dataset)[: n_entries or len(opt_dataset)]:
    smiles = entry["smiles"]

    # "coords" and "forces" are flat tensors; reshape them to rows of 3 values
    # and map each row onto the graph implied by the mapped SMILES.
    positions = data_to_graph_assignment(
        smiles=smiles,
        data=entry["coords"].reshape(-1, 3),
    )
    # NOTE(review): the dataset column is called "forces" but is stored below
    # as `gradients`. Forces and gradients conventionally differ by a sign —
    # confirm which one the dataset holds and which one BESMARTS expects.
    gradients = data_to_graph_assignment(
        smiles=smiles,
        data=entry["forces"].reshape(-1, 3),
    )
    # "energy" is a one-element tensor for optimization entries.
    energy = entry["energy"][0]

    # Add the data to the graph
    graph_db_add_single_molecule_state(
        graph_db,
        positions=positions,
        gradients=gradients,
        energy=energy,
    )

    # Increment the entry index into the graph
    eid += 1

# Next, the torsion drives: each entry holds several frames, and only
# coordinates and energies are added (no gradients).
# The slice bound must come from `td_dataset` itself; bounding it by
# `len(opt_dataset)` would silently truncate the torsion-drive data whenever
# it has more entries than the optimization dataset.
for entry in list(td_dataset)[: n_entries or len(td_dataset)]:
    smiles = entry["smiles"]

    # One energy per frame, so the energy count gives the number of frames.
    n_frames = len(entry["energy"])

    # Split the flat coordinate tensor into per-frame blocks of 3-value rows
    # and pair each frame with its energy.
    for coords, energy in zip(
        entry["coords"].reshape(n_frames, -1, 3), entry["energy"]
    ):
        positions = data_to_graph_assignment(
            smiles=smiles,
            data=coords,
        )

        graph_db_add_single_molecule_state(
            graph_db,
            positions=positions,
            energy=energy,
        )

        eid += 1

Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending energies
Appending en

In [36]:
from besmarts.mechanics.fits import gdb_to_physical_systems

# Parameterize the entries of the graph database with the chemical system.
# The result is used below as a mapping (its `.values()` are iterated).
physical_systems = gdb_to_physical_systems(graph_db, chemical_system)

2025-03-14 17:43:30.153988 Starting parameterization
2025-03-14 17:43:30.154020 Parameterizing..      1/2038[H:14][C:3]1=[C:2]([N:11]([N:10]=[C:4]1[C:5]23[C:6]([C:7]([C:8]2([H:18])[H:19])([C:9]3([H:20])[H:21])[H:17])([H:15])[H:16])[H:22])[N:1]([H:12])[H:13]
Charges: [H:14][C:3]1=[C:2]([N:11]([N:10]=[C:4]1[C:5]23[C:6]([C:7]([C:8]2([H:18])[H:19])([C:9]3([H:20])[H:21])[H:17])([H:15])[H:16])[H:22])[N:1]([H:12])[H:13] [0.162, -0.3453, 0.0913, -0.0396, -0.5068, 0.3836, -0.1413, -0.0754, -0.1137, -0.0754, 0.0572, 0.0572, -0.0754, 0.0572, 0.0572, 0.0947, 0.0572, 0.0572, 0.3227, -0.7812, 0.3798, 0.3798]
[H:19][c:11]1[c:10]2[c:6]([c:5]([c:4]([c:12]1[C:13]([H:20])([H:21])[Br:14])[C:2](=[O:1])[O:3][H:15])[H:16])[N:7]([N:8]=[C:9]2[H:18])[H:17]
Charges: [H:19][c:11]1[c:10]2[c:6]([c:5]([c:4]([c:12]1[C:13]([H:20])([H:21])[Br:14])[C:2](=[O:1])[O:3][H:15])[H:16])[N:7]([N:8]=[C:9]2[H:18])[H:17] [0.145, -0.051, -0.2049, -0.0412, -0.084, -0.0876, -0.0803, 0.0673, 0.0937, 0.0937, -0.2044, 0.6507, -0.53, -0.

In [37]:
from besmarts.core.codecs import intvec_codec as IntvecCodec

# Encode every graph in the database with the codec attached to the chemical
# system's perception model, keeping the same graph ids as keys.
icd = chemical_system.perception.icd
compressed_graph_db = {
    graph_id: icd.graph_encode(graph) for graph_id, graph in graph_db.graphs.items()
}

In [38]:
# Map each model name to its position in `chemical_system.models`, so models
# can be looked up by name (e.g. `model["Torsions"]`) further down.
model = {
    csys_model.name: position
    for position, csys_model in enumerate(chemical_system.models)
}
model

{'Bonds': 0,
 'Angles': 1,
 'Torsions': 2,
 'OutOfPlanes': 3,
 'Electrostatics': 4,
 'vdW': 5}

In [39]:
from besmarts.mechanics.fits import forcefield_optimization_strategy_default

# Start from the default optimization strategy. `models` is keyed by model
# index (`model["Torsions"]`) and lists parameter names.
# NOTE(review): presumably this restricts the search to torsion parameter
# "t17" — the log output below only mentions t17, but confirm against besmarts.
strategy = forcefield_optimization_strategy_default(
    chemical_system,
    models={model["Torsions"]: ["t17"]},
)
# NOTE(review): presumably prepares the strategy's steps before it is used — confirm.
strategy.build_steps()

In [40]:
from besmarts.core.optimization import optimization_iteration as OptimizationIteration
from besmarts.mechanics.fits import (
    forcefield_optimization_strategy as ForceFieldOptimizationStrategy,
)
from besmarts.mechanics.molecular_models import physical_system as PhysicalSystem


def prepare_macro_iteration(
    physical_systems: dict[int, PhysicalSystem],
    strategy: ForceFieldOptimizationStrategy,
) -> OptimizationIteration:
    """Select the parameter nodes to work on and wrap them in a macro iteration.

    The function collects every label assigned in ``physical_systems``, keeps
    those whose model index is in ``strategy.bounds``, filters the nodes
    yielded by ``strategy.tree_iterator`` down to matching parameters,
    registers previously unseen nodes in ``strategy.step_tracker`` and returns
    ``strategy.macro_iteration(nodes)``.

    Note:
        This reads the notebook-level ``chemical_system`` global rather than
        taking it as an argument, so that name must be defined before calling.

    Args:
        physical_systems: Parameterized systems; only ``.values()`` is used.
        strategy: Optimization strategy. Its ``step_tracker`` is mutated in
            place for nodes that were not tracked yet.

    Returns:
        Whatever ``strategy.macro_iteration`` returns for the selected nodes.
    """
    step_tracker = strategy.step_tracker
    # Unique (model index, label) pairs over all systems, models, procedures
    # and labelled graph entries.
    assigned_nodes = sorted(
        set(
            [
                (m, l)
                for psys in physical_systems.values()
                for m, pm in enumerate(psys.models)
                for proc in pm.labels
                for glbl in proc.values()
                for t, l in glbl.items()
            ]
        )
    )

    # Keep pairs whose model index is in `strategy.bounds`. `not in "sc"` is a
    # substring test, so it drops the labels "", "s", "c" and "sc".
    # NOTE(review): presumably meant to exclude special scaling labels — confirm.
    assigned_nodes = [
        x for x in assigned_nodes if x[0] in strategy.bounds and x[1] not in "sc"
    ]
    # Only the label names are needed from here on.
    assigned_nodes = set([x[1] for x in assigned_nodes])

    nodes = [
        x
        for x in strategy.tree_iterator(chemical_system)
        if x.type == "parameter" and x.name in assigned_nodes
        # if strategy.cursor == -1
        # NOTE(review): for a tracked node `.get` returns the stored dict, so
        # iterating it yields its keys (the operation codes); the fallback
        # iterates the values (-1). Confirm this asymmetry is intended.
        and any(
            strategy.cursor >= y
            for y in step_tracker.get(
                (x.category, x.name),
                {strategy.SPLIT: -1, strategy.MERGE: -1, strategy.MODIFY: -1}.values(),
            )
        )
        # Both alternatives require `strategy.reference_list` to be truthy:
        # either the node is not in the reference list, or a target list
        # exists and the node is in it.
        and (
            (strategy.reference_list and x.name not in strategy.reference_list)
            or (
                strategy.reference_list
                and strategy.target_list
                and x.name in strategy.target_list
            )
        )
        # and x.type == "parameter"
    ]

    # Start tracking any selected node that has no entry yet, with all three
    # operation counters at 0.
    for n in nodes:
        tkey = n.category, n.name
        if tkey not in step_tracker:
            step_tracker[tkey] = {
                strategy.SPLIT: 0,
                strategy.MERGE: 0,
                strategy.MODIFY: 0,
            }

    return strategy.macro_iteration(nodes)

In [41]:
from besmarts.core.compute import workqueue_local as LocalWorkqueue
from besmarts.mechanics.fits import generate_candidates

# Generate the candidate operations for the current macro iteration.
# NOTE(review): "0.0.0.0" conventionally means "all network interfaces"; if
# the work queue only needs to be reachable from this machine, a loopback
# address may be safer — confirm against besmarts' `workqueue_local`.
# NOTE(review): the work queue object is created inline and never closed
# here; check whether it needs explicit cleanup.
candidates, iteration = generate_candidates(
    csys=chemical_system,
    gdb=graph_db,
    strategy=strategy,
    # The following are computed from the above within this notebook
    psystems=physical_systems,
    G0=compressed_graph_db,
    macro=prepare_macro_iteration(physical_systems, strategy),
    union_cache={},
    wq=LocalWorkqueue("0.0.0.0", 61666),
)

2025-03-14 17:48:23.279775 Collecting SMARTS for t17 and setting to depth=0

 == iteration=   1 macro=  1/2 micro=  1/2 operation=1 cluster=t17  N= 231 overlap=[0] bits=1->1 depth=0->0 branch=0->0

Attempting to split t17:
S0: [*:1]~[#6X3:2]-[#6X4:3]-[*:4] split_space: [*:1]~[#6X3:2]-[#6X4:3]-[*:4]
Matched N=231
000001 (0, (3, 4, 5, 6))        [#6H1X3x2r5A+0:3]@;-[#6H0X3x2r5A+0:4]!@;-[#6H0X4x3r4A+0:5]@;-[#6H2X4x2r4A+0:6] < [#6H1X3x2r5A+0:3]@;-[#6H0X3x2r5A+0:4]!@;-[#6H0X4x3r4A+0:5]@;-[#6H2X4x2r4A+0:6]
000002 (0, (3, 4, 5, 8))        [#6H1X3x2r5A+0:3]@;-[#6H0X3x2r5A+0:4]!@;-[#6H0X4x3r4A+0:5]@;-[#6H2X4x2r4A+0:8] < [#6H1X3x2r5A+0:3]@;-[#6H0X3x2r5A+0:4]!@;-[#6H0X4x3r4A+0:5]@;-[#6H2X4x2r4A+0:8]
000003 (0, (3, 4, 5, 9))        [#6H1X3x2r5A+0:3]@;-[#6H0X3x2r5A+0:4]!@;-[#6H0X4x3r4A+0:5]@;-[#6H2X4x2r4A+0:9] < [#6H1X3x2r5A+0:3]@;-[#6H0X3x2r5A+0:4]!@;-[#6H0X4x3r4A+0:5]@;-[#6H2X4x2r4A+0:9]
000004 (1, (4, 12, 13, 14))     [#6H0X3x2r6a+0:4]@;:[#6H0X3x2r6a+0:12]!@;-[#6H2X4x0!rA+0:13]!@;-[#35H0X1x0!rA+

In [42]:
# Raw view of the candidates mapping: tuple keys, and tuple values that start
# with the tree node, the candidate structure and the optimization step.
candidates

{(0, None, -1, 1): (<besmarts.core.trees.tree_node at 0x72b588106fc0>,
  <besmarts.core.graphs.structure at 0x72b57442cee0>,
  <besmarts.core.optimization.optimization_step at 0x72b56ca56d50>,
  None,
  None,
  None,
  None),
 (0, None, 0, 1): (<besmarts.core.trees.tree_node at 0x72b588106fc0>,
  <besmarts.core.graphs.structure at 0x72b57442c700>,
  <besmarts.core.optimization.optimization_step at 0x72b56ca56d50>,
  None,
  None,
  None,
  None),
 (0, None, 1, 1): (<besmarts.core.trees.tree_node at 0x72b588106fc0>,
  <besmarts.core.graphs.structure at 0x72b58815bbe0>,
  <besmarts.core.optimization.optimization_step at 0x72b56ca56d50>,
  None,
  None,
  None,
  None),
 (0, None, 2, 1): (<besmarts.core.trees.tree_node at 0x72b588106fc0>,
  <besmarts.core.graphs.structure at 0x72b58815ba30>,
  <besmarts.core.optimization.optimization_step at 0x72b56ca56d50>,
  None,
  None,
  None,
  None),
 (0, None, 3, 1): (<besmarts.core.trees.tree_node at 0x72b588106fc0>,
  <besmarts.core.graphs.struc

In [52]:
from besmarts.codecs.codec_rdkit import graph_codec_rdkit as RdkitGraphCodec

gcd = RdkitGraphCodec()

# Human-readable labels for the strategy's operation codes. The mapping does
# not depend on the candidate, so it is built once instead of per iteration.
ops = {
    strategy.MERGE: "MERGE",
    strategy.SPLIT: "SPLIT",
    strategy.MODIFY: "MODIFY",
}

# The last element of each key is the operation code; each value starts with
# the parameter node, the candidate structure and the optimization step.
for (_, _, _, op), (parameter, structure, step, *_) in candidates.items():
    print(ops[op], parameter.name, gcd.smarts_encode(structure))

SPLIT t17 [*:1]~[#6X3:2]-[#6X4:3]-[+:4]
SPLIT t17 [*:1]~[#6X3:2]-[#6X4:3]-[r4:4]
SPLIT t17 [*:1]~[#6X3:2]-[#6X4:3]-[X4:4]
SPLIT t17 [*:1]~[#6X3:2]-[#6X4:3]-[H3:4]
SPLIT t17 [*:1]~[#6X3:2]-[#6X4:3]-[#35:4]
SPLIT t17 [*:1]~[#6X3:2]-[#6X4:3]-[#16:4]
SPLIT t17 [*:1]~[#6X3:2]-[#6X4:3]-[#9:4]
SPLIT t17 [*:1]~[#6X3:2]-[#6H3X4:3]-[*:4]
SPLIT t17 [*:1]~[#6X3:2]-[#6!H2X4:3]-[*:4]
SPLIT t17 [*:1]~[#6X3:2]@;-[#6X4:3]-[*:4]
SPLIT t17 [A:1]~[#6X3:2]-[#6X4:3]-[*:4]
SPLIT t17 [!r6:1]~[#6X3:2]-[#6X4:3]-[*:4]
SPLIT t17 [r5:1]~[#6X3:2]-[#6X4:3]-[*:4]
SPLIT t17 [!r:1]~[#6X3:2]-[#6X4:3]-[*:4]
SPLIT t17 [x3:1]~[#6X3:2]-[#6X4:3]-[*:4]
SPLIT t17 [!x2:1]~[#6X3:2]-[#6X4:3]-[*:4]
SPLIT t17 [!X3:1]~[#6X3:2]-[#6X4:3]-[*:4]
SPLIT t17 [X2:1]~[#6X3:2]-[#6X4:3]-[*:4]
SPLIT t17 [X1:1]~[#6X3:2]-[#6X4:3]-[*:4]
SPLIT t17 [H2:1]~[#6X3:2]-[#6X4:3]-[*:4]
SPLIT t17 [H1:1]~[#6X3:2]-[#6X4:3]-[*:4]
SPLIT t17 [H0:1]~[#6X3:2]-[#6X4:3]-[*:4]
SPLIT t17 [#8:1]~[#6X3:2]-[#6X4:3]-[*:4]
SPLIT t17 [#7:1]~[#6X3:2]-[#6X4:3]-[*:4]
SPLIT t17

In [116]:
from besmarts.mechanics.molecular_models import chemical_system as ChemicalSystem


def param_smarts(chemical_system: "ChemicalSystem") -> dict[str, dict[str, str]]:
    """Collect the SMARTS pattern of every named parameter, grouped by model.

    Walks every model of ``chemical_system`` and every procedure of each model.
    Procedures without a ``smarts_hierarchies`` attribute are skipped, as are
    hierarchy nodes whose SMARTS entry is ``None``.

    Args:
        chemical_system: Object exposing ``models``; each model has a ``name``
            and ``procedures``.

    Returns:
        A nested mapping ``{model name: {parameter name: SMARTS}}``. Every
        model gets an entry, even if it contributes no parameters.

    Raises:
        AssertionError: If the same parameter name appears twice in one model.
    """
    sections: dict[str, dict[str, str]] = {}
    for model in chemical_system.models:
        params = sections.setdefault(model.name, {})
        for procedure in model.procedures:
            # Not every procedure is SMARTS-based; skip those that are not.
            if not hasattr(procedure, "smarts_hierarchies"):
                continue
            for smarts_hierarchy in procedure.smarts_hierarchies.values():
                for node in smarts_hierarchy.index.nodes.values():
                    smarts = smarts_hierarchy.smarts[node.index]
                    # Nodes without a pattern carry nothing to report.
                    if smarts is None:
                        continue
                    # A repeated name would silently overwrite a pattern.
                    assert node.name not in params, (
                        f"duplicate parameter name {node.name!r} "
                        f"in model {model.name!r}"
                    )
                    params[node.name] = smarts
    return sections

# SMARTS patterns of the "Torsions" model, keyed by parameter name.
torsion_smarts = param_smarts(chemical_system)["Torsions"]

In [59]:
from openff.toolkit import Molecule

# Unique mapped SMILES across both datasets.
all_smiles = {*opt_dataset["smiles"], *td_dataset["smiles"]}

# Iterate in sorted order: the iteration order of a set of strings can change
# between kernel restarts (string hash randomization), which would make the
# order of `all_molecules` irreproducible.
all_molecules = [
    Molecule.from_mapped_smiles(smiles, allow_undefined_stereo=True)
    for smiles in sorted(all_smiles)
]