In [1]:
import datasets
import tqdm

## Load dataset

Both datasets are stored in the [Hugging Face `datasets`](https://huggingface.co/docs/datasets/en/index) format.

In [2]:
# Load the optimization and torsion-drive datasets from their on-disk directories
opt_dataset = datasets.Dataset.load_from_disk("data/additional-optimizations/")
td_dataset = datasets.Dataset.load_from_disk("data/additional-torsiondrives/")

Displaying a dataset shows the features (columns) that every entry has, along with the number of rows.

In [3]:
# Show the dataset summary: feature names and number of rows
opt_dataset

Dataset({
    features: ['smiles', 'coords', 'energy', 'forces'],
    num_rows: 70
})

Datasets can be indexed to get a single entry. "coords", "forces", etc. are stored as flat, one-dimensional sequences of floats; they are reshaped to `(n_frames, n_atoms, 3)` later in this notebook.

In [4]:
# Show the first entry of the optimization dataset
opt_dataset[0]

{'smiles': '[H:14][C:3]1=[C:2]([N:11]([N:10]=[C:4]1[C:5]23[C:6]([C:7]([C:8]2([H:18])[H:19])([C:9]3([H:20])[H:21])[H:17])([H:15])[H:16])[H:22])[N:1]([H:12])[H:13]',
 'coords': tensor([-1.7365,  1.3097,  5.1288, -1.5198,  1.1527,  3.7460, -0.6614,  0.3525,
          3.0057, -0.8821,  0.7326,  1.6502, -0.2310,  0.1866,  0.4400, -0.4846,
          0.6215, -1.0329,  0.5960, -0.5005, -1.1146,  1.2970,  0.1879,  0.0944,
         -0.2535, -1.2926, -0.0766, -1.8106,  1.6883,  1.5537, -2.1830,  1.9305,
          2.8444, -1.0775,  0.7501,  5.6606, -2.6807,  1.0490,  5.4048,  0.0104,
         -0.4076,  3.3820, -1.4734,  0.3898, -1.4397, -0.1691,  1.6355, -1.2960,
          1.0740, -0.8972, -2.0142,  1.7470,  1.1697, -0.0845,  1.9392, -0.4433,
          0.7182,  0.2762, -2.0313,  0.5348, -1.2254, -1.6684, -0.4125, -2.8092,
          2.6975,  3.0476]),
 'energy': tensor([-298509.5312]),
 'forces': tensor([ 8.6516e-03,  9.9962e-03, -2.1068e-02, -6.0739e-02, -7.5098e-02,
          1.4797e-02,  6.7018e

The numeric columns need to be (re-)converted to PyTorch tensors.

In [5]:
# reformat dataset lists to torch tensors
# Return the numeric columns of both datasets as torch tensors, while still
# passing the remaining columns (e.g. "smiles") through unformatted
for dataset in (opt_dataset, td_dataset):
    dataset.set_format(
        "torch",
        columns=["energy", "coords", "forces"],
        output_all_columns=True,
    )

In [6]:
# Re-display the first entry after setting the torch format above
opt_dataset[0]

{'coords': tensor([-1.7365,  1.3097,  5.1288, -1.5198,  1.1527,  3.7460, -0.6614,  0.3525,
          3.0057, -0.8821,  0.7326,  1.6502, -0.2310,  0.1866,  0.4400, -0.4846,
          0.6215, -1.0329,  0.5960, -0.5005, -1.1146,  1.2970,  0.1879,  0.0944,
         -0.2535, -1.2926, -0.0766, -1.8106,  1.6883,  1.5537, -2.1830,  1.9305,
          2.8444, -1.0775,  0.7501,  5.6606, -2.6807,  1.0490,  5.4048,  0.0104,
         -0.4076,  3.3820, -1.4734,  0.3898, -1.4397, -0.1691,  1.6355, -1.2960,
          1.0740, -0.8972, -2.0142,  1.7470,  1.1697, -0.0845,  1.9392, -0.4433,
          0.7182,  0.2762, -2.0313,  0.5348, -1.2254, -1.6684, -0.4125, -2.8092,
          2.6975,  3.0476]),
 'energy': tensor([-298509.5312]),
 'forces': tensor([ 8.6516e-03,  9.9962e-03, -2.1068e-02, -6.0739e-02, -7.5098e-02,
          1.4797e-02,  6.7018e-02,  6.8418e-02,  1.4632e-02, -1.8251e-02,
         -5.5096e-02,  2.0181e-02, -4.5286e-02,  5.9464e-02, -4.5025e-02,
         -3.8120e-03, -6.7527e-03,  5.1948e-03

In [7]:
# Number of entries (rows) in the optimization dataset
len(opt_dataset)

70

## Fitting

For how to fit a force field to optimization data from a SMIRNOFF force field, here's an [example I put together for the IRL Irvine meeting](https://openforcefield.atlassian.net/wiki/spaces/MEET/pages/3440508935/Hackathon+How+to+train+your+force+field+with+smee) (`run-smee-fit-from-qca-data-commented.ipynb`), where you can largely follow on from the "Assign parameters to molecules in the dataset" heading.

One caveat: the `descent.targets.energy.predict` function would have to be rewritten to leave forces out of both the prediction and the objective when they are not present in the data.

In [8]:
from utils import ff_to_csys

# Build the chemical system from the "openff-2.2.1.offxml" force field file.
# `ff_to_csys` is a helper from the local `utils` module; the result is used
# below as the starting point (`csys0`) for `ff_optimize`.
chemical_system = ff_to_csys("openff-2.2.1.offxml")

In [9]:
from typing import Iterable, TypeVar

T = TypeVar("T")


def flatten(iterable: Iterable[Iterable[T]]) -> Iterable[T]:
    """Lazily yield the items of each inner iterable, one level deep.

    Parameters
    ----------
    iterable
        An iterable whose elements are themselves iterable.

    Yields
    ------
    T
        Every item of every inner iterable, in encounter order.
    """
    for group in iterable:
        for item in group:
            yield item

In [10]:
import numpy as np
from besmarts.core.assignments import graph_db as GraphDb
from besmarts.core.assignments import graph_db_add_single_molecule_state
from besmarts.core.assignments import graph_db_address as GraphDbAddress
from besmarts.mechanics.fits import (
    objective_config_energy_total as ObjectiveConfigEnergyTotal,
)
from besmarts.mechanics.fits import objective_config_gradient as ObjectiveConfigGradient
from besmarts.mechanics.fits import objective_config_position as ObjectiveConfigPosition
from besmarts.mechanics.fits import objective_tier as ObjectiveTier
from openff.toolkit import Molecule
from utils import data_to_graph_assignment

# Only molecules that appear among the first N_OPT_ENTRIES entries of the
# optimization dataset are added to the graph database.
N_OPT_ENTRIES = 10

graph_db = GraphDb()
# Index of the next entry added to graph_db. This mirrors the order of the
# graph_db_add_single_molecule_state calls below.
# NOTE(review): assumes graph_db numbers its entries sequentially from 0 - confirm.
eid = 0
objectives = []
# Mapped SMILES -> EIDs of that molecule's optimization frames
smiles_to_opt_eids = {}
smiles_to_add = set(opt_dataset["smiles"][:N_OPT_ENTRIES])

# First, the optimizations
for entry in opt_dataset:
    smiles = entry["smiles"]
    if smiles not in smiles_to_add:
        continue

    n_frames = len(entry["energy"])
    n_atoms = Molecule.from_mapped_smiles(smiles, allow_undefined_stereo=True).n_atoms

    # Coordinates and forces are stored flat; recover one (n_atoms, 3) array per frame.
    # strict=True raises if the three columns disagree on the number of frames.
    frames = zip(
        np.asarray(entry["coords"].reshape(n_frames, n_atoms, 3)),
        np.asarray(entry["forces"].reshape(n_frames, n_atoms, 3)),
        np.asarray(entry["energy"]),
        strict=True,
    )

    # Add frames in order of increasing energy so that the lowest-energy frame
    # of each entry gets the smallest EID
    for position, gradient, energy in sorted(frames, key=lambda frame: frame[2]):
        # Add the data to the graph
        # NOTE(review): the dataset column is called "forces" but is passed as
        # `gradients`; confirm the sign convention (force = -gradient) and units
        # before enabling the gradient objective below.
        graph_db_add_single_molecule_state(
            graph_db,
            positions=data_to_graph_assignment(
                smiles=smiles,
                data=position,
            ),
            gradients=data_to_graph_assignment(
                smiles=smiles,
                data=gradient,
            ),
            energy=energy,
        )

        # # Create objectives using the data
        # objectives.extend(
        #     [
        #         ObjectiveConfigPosition(
        #             GraphDbAddress(
        #                 eid=[eid],
        #             ),
        #             scale=100,
        #         ),
        #         ObjectiveConfigGradient(
        #             GraphDbAddress(
        #                 eid=[eid],
        #             ),
        #             scale=1e-5,
        #         ),
        #     ]
        # )
        # We'll need this EID for the total energy objective later
        smiles_to_opt_eids.setdefault(smiles, []).append(eid)

        # Increment the entry index into the graph
        eid += 1


# Next, the torsion drives
# Iterate the dataset directly; there is no need to materialize every entry
# in a list first.
for entry in td_dataset:
    smiles = entry["smiles"]
    if smiles not in smiles_to_add:
        continue

    n_atoms = Molecule.from_mapped_smiles(smiles, allow_undefined_stereo=True).n_atoms
    n_frames = len(entry["energy"])

    # Start from a copy of this molecule's optimization EIDs so the shared
    # mapping is not mutated when the torsion drive EIDs are appended.
    eids = list(smiles_to_opt_eids[smiles])
    for coords, energy in zip(
        np.asarray(entry["coords"].reshape(n_frames, n_atoms, 3)),
        np.asarray(entry["energy"]),
        strict=True,
    ):
        # Torsion drive frames are added with positions and energy only (no gradients)
        graph_db_add_single_molecule_state(
            graph_db,
            positions=data_to_graph_assignment(
                smiles=smiles,
                data=coords,
            ),
            energy=energy,
        )
        eids.append(eid)
        eid += 1

    # Add all energy objectives for this molecule in one
    # objective to handle QC's arbitrary energy baseline
    energy_total_objective = ObjectiveConfigEnergyTotal(
        GraphDbAddress(
            eid=eids,  # First energy in first eid is reference energy
        ),
        scale=1,
    )
    energy_total_objective.ene_mode = "pairs"
    objectives.append(energy_total_objective)


for obj in objectives:
    obj.verbose = 2

# Collect every objective into a single tier, keyed by its position in the list
objective_tier = ObjectiveTier()
objective_tier.objectives = dict(enumerate(objectives))

In [11]:
from besmarts.mechanics.fits import gdb_to_physical_systems

# Parameterize the entries of the graph database with the chemical system.
# The result is handed to `ff_optimize` below as `psystems`.
physical_systems = gdb_to_physical_systems(graph_db, chemical_system)

2025-04-02 14:48:12.968870 Starting parameterization
[H:14][C:3]1=[C:2]([N:11]([N:10]=[C:4]1[C:5]23[C:6]([C:7]([C:8]2([H:18])[H:19])([C:9]3([H:20])[H:21])[H:17])([H:15])[H:16])[H:22])[N:1]([H:12])[H:13]
Charges: [H:14][C:3]1=[C:2]([N:11]([N:10]=[C:4]1[C:5]23[C:6]([C:7]([C:8]2([H:18])[H:19])([C:9]3([H:20])[H:21])[H:17])([H:15])[H:16])[H:22])[N:1]([H:12])[H:13] [0.162, -0.3453, 0.0913, -0.0396, -0.5068, 0.3836, -0.1413, -0.0754, -0.1137, -0.0754, 0.0572, 0.0572, -0.0754, 0.0572, 0.0572, 0.0947, 0.0572, 0.0572, 0.3227, -0.7812, 0.3798, 0.3798]
2025-04-02 14:48:13.109759 Parameterizing..      2/321[H:19][c:11]1[c:10]2[c:6]([c:5]([c:4]([c:12]1[C:13]([H:20])([H:21])[Br:14])[C:2](=[O:1])[O:3][H:15])[H:16])[N:7]([N:8]=[C:9]2[H:18])[H:17]
Charges: [H:19][c:11]1[c:10]2[c:6]([c:5]([c:4]([c:12]1[C:13]([H:20])([H:21])[Br:14])[C:2](=[O:1])[O:3][H:15])[H:16])[N:7]([N:8]=[C:9]2[H:18])[H:17] [0.145, -0.051, -0.2049, -0.0412, -0.084, -0.0876, -0.0803, 0.0673, 0.0937, 0.0937, -0.2044, 0.6507, -0.53, -0.5

In [12]:
# Look up each model's index in the chemical system by its name
model = {
    csys_model.name: index
    for index, csys_model in enumerate(chemical_system.models)
}
model

{'Bonds': 0,
 'Angles': 1,
 'Torsions': 2,
 'OutOfPlanes': 3,
 'Electrostatics': 4,
 'vdW': 5}

In [13]:
# Maps a model index to the list of names to target within that model; here only
# "t17" in the Torsions model.
# NOTE(review): "t17" is presumably a torsion parameter label in the force field - confirm.
split_on_models = {model["Torsions"]: ["t17"]}

In [14]:
from besmarts.mechanics.fits import forcefield_optimization_strategy_default

# besmarts' default optimization strategy, given the models selected in
# `split_on_models`; passed to `ff_optimize` below as `strategy`
fitting_strategy = forcefield_optimization_strategy_default(
    chemical_system, models=split_on_models
)

In [15]:
from besmarts.mechanics.fits import chemical_objective

# Alias for besmarts' `chemical_objective`; handed to `ff_optimize` below as its
# `chemical_objective` argument
force_field_objective = chemical_objective

In [16]:
from copy import deepcopy

# Fit only the models we split on
# (list() of the dict yields its keys, i.e. the model indices)
objective_tier.fit_models = list(split_on_models)
objective_tier.fit_names = None # Fit all torsions
# Choose the particular parameter values to fit
# omitting this fits all symbols, including eg periodicities and phases
objective_tier.fit_symbols = ['k']
objective_tier.step_limit = 200
# Suggest parameters with altered periodicities
objective_tier.enable_modify=True
objective_tier.modify_torsion_frequency_limit = 5
# Can suggest multiple periodicities at the same time by configuring the bounds' bit depth

# The initial and final tiers are deep copies taken BEFORE step_limit is lowered
# below, so both keep the step_limit of 200 set above. Do not reorder.
initial_tier = deepcopy(objective_tier)
initial_tier.minstep = 1e-3
# TODO: investigate initial_tier.tol
final_tier = deepcopy(objective_tier)
final_tier.minstep = 1e-3
# TODO: investigate final_tier.tol


# Only do 4 rounds of fitting when evaluating candidate parameters
objective_tier.step_limit = 4

# Passed to ff_optimize as `tiers`
objective_tiers = [objective_tier]

In [None]:
from besmarts.core import configs
from besmarts.mechanics.fits import ff_optimize
from contextlib import redirect_stderr, redirect_stdout
import sys
from datetime import datetime

# besmarts runtime configuration for this run
configs.processors = 8
configs.remote_compute_enable = False
configs.workqueue_port = 12345

# Build a filesystem-safe timestamp for the log name. str(datetime.now())
# contains a space and colons; colons are not valid in Windows file names and
# the space is awkward in shells. Microseconds are kept so that names stay unique.
log_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S.%f")

# Mode "x" creates the file and fails if it already exists, so an earlier log
# is never overwritten.
with open(f"example-ff_optimize_{log_stamp}.log", "x") as f:
    # Send everything the optimization writes to stdout/stderr into the log
    # file instead of the notebook output.
    with redirect_stderr(f), redirect_stdout(f):
        newcsys, (P0, P), (C0, C) = ff_optimize(
            csys0=chemical_system,
            gdb=graph_db,
            psystems=physical_systems,
            strategy=fitting_strategy,
            chemical_objective=force_field_objective,
            initial_objective=initial_tier,
            tiers=objective_tiers,
            final_objective=final_tier,
        )

In [None]:
# Print every torsion SMARTS entry of the fitted chemical system that differs
# from (or is absent in) the original one, as: new SMARTS, key, original SMARTS
torsions_index = model["Torsions"]
fitted_torsion_smarts = (
    newcsys.models[torsions_index].procedures[0].smarts_hierarchies[0].smarts
)
original_torsion_smarts = (
    chemical_system.models[torsions_index].procedures[0].smarts_hierarchies[0].smarts
)

for i, smarts in fitted_torsion_smarts.items():
    orig_smarts_i = original_torsion_smarts.get(i)
    if smarts == orig_smarts_i:
        continue
    print(smarts, i, orig_smarts_i)

In [None]:
# Show all torsion SMARTS entries of the fitted chemical system
newcsys.models[model["Torsions"]].procedures[0].smarts_hierarchies[0].smarts

In [None]:
# TODO: Parse output for 
#    - Nanostep - 
#    - RMSE
#    - Total