# Usage/tutorials

## Labeling

In [1]:
import pprint

from besmarts.assign.hierarchy_assign_rdkit import smarts_hierarchy_assignment_rdkit
from besmarts.codecs.codec_rdkit import graph_codec_rdkit
from besmarts.core import assignments, graphs, hierarchies, trees

gcd = graph_codec_rdkit()

labeler = smarts_hierarchy_assignment_rdkit()


hier = trees.tree_index()

In [2]:
print(f"{hier=}\n{hier.nodes=}\n{hier.above=}\n{hier.below=}")

hier=<besmarts.core.trees.tree_index object at 0x742fb02bd280>
hier.nodes={}
hier.above={}
hier.below={}


In [3]:
# create a root node that has no parent
root = hier.node_add_below(None)
root.name = "p0"

In [4]:
print(f"{hier=}\n{hier.nodes=}\n{hier.above=}\n{hier.below=}")

hier=<besmarts.core.trees.tree_index object at 0x742fb02bd280>
hier.nodes={0: <besmarts.core.trees.tree_node object at 0x742f6d527240>}
hier.above={0: None}
hier.below={None: [], 0: []}


In [5]:
hidx = hierarchies.smarts_hierarchy(hier, {})
hidx.smarts[root.index] = "[*:1]~[*:2]~[*:3]~[*:4]"

In [6]:
print(f"{hidx=}\n{hidx.index=}\n{hidx.smarts=}")

hidx=<besmarts.core.hierarchies.smarts_hierarchy object at 0x742f6d5306a0>
hidx.index=<besmarts.core.trees.tree_index object at 0x742fb02bd280>
hidx.smarts={0: '[*:1]~[*:2]~[*:3]~[*:4]'}


In [7]:
print(f"{hier=}\n{hier.nodes=}\n{hier.above=}\n{hier.below=}")

hier=<besmarts.core.trees.tree_index object at 0x742fb02bd280>
hier.nodes={0: <besmarts.core.trees.tree_node object at 0x742f6d527240>}
hier.above={0: None}
hier.below={None: [], 0: []}


In [8]:
p = [
    "[*:1]~[X4:2]~[X4:3]~[*:4]",
    "[*:1]~[X4:2]~[X3:3]~[*:4]",
    "[*:1]~[X4:2]~[X2:3]~[*:4]",
    "[*:1]~[X3:2]~[X3:3]~[*:4]",
    "[*:1]~[X3:2]~[X2:3]~[*:4]",
    "[*:1]~[X2:2]~[X2:3]~[*:4]",
]


for i, smarts in enumerate(p, 1):
    n = hier.node_add_below(root.index)
    n.name = f"p{i}"
    hidx.smarts[n.index] = smarts

print(f"{hier=}\n{hier.nodes=}\n{hier.above=}\n{hier.below=}")

hier=<besmarts.core.trees.tree_index object at 0x742fb02bd280>
hier.nodes={0: <besmarts.core.trees.tree_node object at 0x742f6d527240>, 1: <besmarts.core.trees.tree_node object at 0x742f6d545c00>, 2: <besmarts.core.trees.tree_node object at 0x742f6d546900>, 3: <besmarts.core.trees.tree_node object at 0x742f6d544d80>, 4: <besmarts.core.trees.tree_node object at 0x742f6d546ac0>, 5: <besmarts.core.trees.tree_node object at 0x742f6d546b80>, 6: <besmarts.core.trees.tree_node object at 0x742f6d546c40>}
hier.above={0: None, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}
hier.below={None: [], 0: [1, 2, 3, 4, 5, 6], 1: [], 2: [], 3: [], 4: [], 5: [], 6: []}


In [9]:
print(f"{hidx=}\n{hidx.index=}\n{hidx.smarts=}")

hidx=<besmarts.core.hierarchies.smarts_hierarchy object at 0x742f6d5306a0>
hidx.index=<besmarts.core.trees.tree_index object at 0x742fb02bd280>
hidx.smarts={0: '[*:1]~[*:2]~[*:3]~[*:4]', 1: '[*:1]~[X4:2]~[X4:3]~[*:4]', 2: '[*:1]~[X4:2]~[X3:3]~[*:4]', 3: '[*:1]~[X4:2]~[X2:3]~[*:4]', 4: '[*:1]~[X3:2]~[X3:3]~[*:4]', 5: '[*:1]~[X3:2]~[X2:3]~[*:4]', 6: '[*:1]~[X2:2]~[X2:3]~[*:4]'}


In [10]:
hidx = hierarchies.smarts_hierarchy_to_structure_hierarchy_torsions(hidx, gcd)

In [11]:
smiles = ["c1ccccc1C([NH]CC=CO)=O"]

g = gcd.smiles_decode(smiles[0])

In [12]:
# Label the molecule

lbls: assignments.smiles_assignment_group = labeler.assign(
    hidx,
    gcd,
    smiles,
    hidx.topology,
)
glbl = lbls.assignments[0]

In [13]:
glbl.selections

{(6, 1, 2, 15): 'p4',
 (14, 1, 2, 15): 'p4',
 (2, 1, 6, 5): 'p4',
 (2, 1, 6, 7): 'p4',
 (3, 2, 1, 6): 'p4',
 (3, 2, 1, 14): 'p4',
 (1, 2, 3, 4): 'p4',
 (1, 2, 3, 16): 'p4',
 (15, 2, 3, 16): 'p4',
 (4, 3, 2, 15): 'p4',
 (2, 3, 4, 5): 'p4',
 (2, 3, 4, 17): 'p4',
 (16, 3, 4, 17): 'p4',
 (5, 4, 3, 16): 'p4',
 (3, 4, 5, 6): 'p4',
 (3, 4, 5, 18): 'p4',
 (17, 4, 5, 18): 'p4',
 (6, 5, 4, 17): 'p4',
 (4, 5, 6, 7): 'p4',
 (5, 6, 1, 14): 'p4',
 (7, 6, 1, 14): 'p4',
 (1, 6, 5, 4): 'p4',
 (1, 6, 5, 18): 'p4',
 (7, 6, 5, 18): 'p4',
 (1, 6, 7, 8): 'p4',
 (1, 6, 7, 13): 'p4',
 (5, 6, 7, 8): 'p4',
 (5, 6, 7, 13): 'p4',
 (6, 7, 8, 9): 'p4',
 (6, 7, 8, 19): 'p4',
 (13, 7, 8, 19): 'p4',
 (9, 8, 7, 13): 'p4',
 (7, 8, 9, 10): 'p2',
 (7, 8, 9, 20): 'p2',
 (7, 8, 9, 21): 'p2',
 (19, 8, 9, 20): 'p2',
 (19, 8, 9, 21): 'p2',
 (10, 9, 8, 19): 'p2',
 (8, 9, 10, 11): 'p2',
 (8, 9, 10, 22): 'p2',
 (20, 9, 10, 22): 'p2',
 (21, 9, 10, 22): 'p2',
 (11, 10, 9, 20): 'p2',
 (11, 10, 9, 21): 'p2',
 (9, 10, 11, 12): 'p4',
 

In [14]:
nodes = {n.name: n for n in hidx.index.nodes.values()}
nodes

{'p0': <besmarts.core.trees.tree_node at 0x742fb8326100>,
 'p1': <besmarts.core.trees.tree_node at 0x742f6d5470c0>,
 'p2': <besmarts.core.trees.tree_node at 0x742f6d547fc0>,
 'p3': <besmarts.core.trees.tree_node at 0x742f6d54c040>,
 'p4': <besmarts.core.trees.tree_node at 0x742f6d54c0c0>,
 'p5': <besmarts.core.trees.tree_node at 0x742f6d54c100>,
 'p6': <besmarts.core.trees.tree_node at 0x742f6d54c140>}

In [15]:
graphs.graph_set_primitives_atom(g, ["element", "connectivity_total"])

for ic in graphs.graph_torsions(g):
    lbl = glbl.selections.get(ic)
    n = nodes[lbl]
    smarts = hidx.smarts[n.index]
    subg = graphs.subgraph_remove_unselected(graphs.graph_to_subgraph(g, ic))
    print(f"{str(ic):20s} {lbl} {smarts} <- {gcd.smarts_encode(subg)}")

(6, 1, 2, 15)        p4 [*:1]~[X3:2]~[X3:3]~[*:4] <- [#6X3:6]@;:[#6X3:1]@;:[#6X3:2]!@;-[#1X1:15]
(14, 1, 2, 15)       p4 [*:1]~[X3:2]~[X3:3]~[*:4] <- [#1X1:14]!@;-[#6X3:1]@;:[#6X3:2]!@;-[#1X1:15]
(2, 1, 6, 5)         p4 [*:1]~[X3:2]~[X3:3]~[*:4] <- [#6X3:2]@;:[#6X3:1]@;:[#6X3:6]@;:[#6X3:5]
(2, 1, 6, 7)         p4 [*:1]~[X3:2]~[X3:3]~[*:4] <- [#6X3:2]@;:[#6X3:1]@;:[#6X3:6]!@;-[#6X3:7]
(3, 2, 1, 6)         p4 [*:1]~[X3:2]~[X3:3]~[*:4] <- [#6X3:3]@;:[#6X3:2]@;:[#6X3:1]@;:[#6X3:6]
(3, 2, 1, 14)        p4 [*:1]~[X3:2]~[X3:3]~[*:4] <- [#6X3:3]@;:[#6X3:2]@;:[#6X3:1]!@;-[#1X1:14]
(1, 2, 3, 4)         p4 [*:1]~[X3:2]~[X3:3]~[*:4] <- [#6X3:1]@;:[#6X3:2]@;:[#6X3:3]@;:[#6X3:4]
(1, 2, 3, 16)        p4 [*:1]~[X3:2]~[X3:3]~[*:4] <- [#6X3:1]@;:[#6X3:2]@;:[#6X3:3]!@;-[#1X1:16]
(15, 2, 3, 16)       p4 [*:1]~[X3:2]~[X3:3]~[*:4] <- [#1X1:15]!@;-[#6X3:2]@;:[#6X3:3]!@;-[#1X1:16]
(4, 3, 2, 15)        p4 [*:1]~[X3:2]~[X3:3]~[*:4] <- [#6X3:4]@;:[#6X3:3]@;:[#6X3:2]!@;-[#1X1:15]
(2, 3, 4, 5)         p4 [*:1]~[X3

In [16]:
hierarchies.smarts_hierarchy_print(hidx)

**  0 p0 [*:1]~[*:2]~[*:3]~[*:4]
**   1 p1 [*:1]~[X4:2]~[X4:3]~[*:4]
**   2 p2 [*:1]~[X4:2]~[X3:3]~[*:4]
**   3 p3 [*:1]~[X4:2]~[X2:3]~[*:4]
**   4 p4 [*:1]~[X3:2]~[X3:3]~[*:4]
**   5 p5 [*:1]~[X3:2]~[X2:3]~[*:4]
**   6 p6 [*:1]~[X2:2]~[X2:3]~[*:4]


In [17]:
from besmarts.core import tree_iterators


def smarts_hierarchy_print(hidx):
    """Print each node of a SMARTS hierarchy, one line per node.

    Every node whose parent in ``hidx.index.above`` is ``None`` is treated as
    a root. Each root is walked with ``tree_iterators.tree_iter_dive`` and
    every visited node is printed as ``** <index> <indent> <name> <smarts>``,
    where the indent holds one space per level reported by
    ``trees.tree_index_node_depth``.

    Args:
        hidx: Object exposing ``index`` (with ``nodes`` and ``above``
            mappings) and a ``smarts`` mapping keyed by node index.
    """
    tree = hidx.index

    # Resolve all root nodes up front, before anything is printed.
    top_nodes = []
    for idx, parent in tree.above.items():
        if parent is None:
            top_nodes.append(tree.nodes[idx])

    for top in top_nodes:
        for node in tree_iterators.tree_iter_dive(tree, top):
            depth = trees.tree_index_node_depth(tree, node)
            indent = " " * depth
            # Nodes without a SMARTS entry are printed as None.
            print("**", node.index, indent, node.name, hidx.smarts.get(node.index))


smarts_hierarchy_print(hidx)

** 0  p0 [*:1]~[*:2]~[*:3]~[*:4]
** 1   p1 [*:1]~[X4:2]~[X4:3]~[*:4]
** 2   p2 [*:1]~[X4:2]~[X3:3]~[*:4]
** 3   p3 [*:1]~[X4:2]~[X2:3]~[*:4]
** 4   p4 [*:1]~[X3:2]~[X3:3]~[*:4]
** 5   p5 [*:1]~[X3:2]~[X2:3]~[*:4]
** 6   p6 [*:1]~[X2:2]~[X2:3]~[*:4]


## Splitting

### Numerical/iterative search

Given a set of SMARTS, generate the partitions which minimize the number of SMARTS edits


In [1]:
from pprint import pprint

from besmarts.codecs import codec_rdkit
from besmarts.core import codecs, compute, configs, graphs, splits, topology
from besmarts.core.primitives import primitive_key

In [2]:
configs.remote_compute_enable = False
configs.workqueue_port = 54321
configs.processors = 1

# Configure the primitives to decode AND split on
prims = (primitive_key.ELEMENT, primitive_key.HYDROGEN), (primitive_key.BOND_ORDER,)

# # use all available primitives
# prims = (None, None)

# Configure the graph codec
gcd = codec_rdkit.graph_codec_rdkit(*prims)
gcd.smiles_config.strip_hydrogen = False

# Configure the "vector of integers" (vectorization) codec for compressing the dataset
icd = codecs.intvec_codec(
    primitive_codecs=gcd.primitive_codecs,
    atom_primitives=gcd.atom_primitives,
    bond_primitives=gcd.bond_primitives,
)

In [3]:
# Define the data set we want SMARTS for
# The set of SMARTS we want to partition is the bonds in this molecule
smi = "CCO"

G = {0: gcd.smiles_decode(smi)}

In [4]:
# Extract some info we'll need later while the dataset is in a convenient format
ic_list = [s for g in G.values() for s in graphs.graph_to_structure_bonds(g)]
pprint(ic_list)
selections = [(i, x) for i, g in G.items() for x in graphs.graph_bonds(g)]
pprint(selections)

[<structure with 9 nodes, 8 edges, select=(1, 2), topology=<structure_topology with primary=(0, 1) at 0x7aaec6d1e780> at 0x7aaf2014dc60>,
 <structure with 9 nodes, 8 edges, select=(2, 3), topology=<structure_topology with primary=(0, 1) at 0x7aaec6d1e780> at 0x7aaec9f85900>,
 <structure with 9 nodes, 8 edges, select=(1, 4), topology=<structure_topology with primary=(0, 1) at 0x7aaec6d1e780> at 0x7aaec9f85990>,
 <structure with 9 nodes, 8 edges, select=(1, 5), topology=<structure_topology with primary=(0, 1) at 0x7aaec6d1e780> at 0x7aaec9f85a20>,
 <structure with 9 nodes, 8 edges, select=(1, 6), topology=<structure_topology with primary=(0, 1) at 0x7aaec6d1e780> at 0x7aaec9f85ab0>,
 <structure with 9 nodes, 8 edges, select=(2, 7), topology=<structure_topology with primary=(0, 1) at 0x7aaec6d1e780> at 0x7aaec9f85b40>,
 <structure with 9 nodes, 8 edges, select=(2, 8), topology=<structure_topology with primary=(0, 1) at 0x7aaec6d1e780> at 0x7aaec9f85bd0>,
 <structure with 9 nodes, 8 edges,

In [5]:
# Compress the dataset by vectorizing it
# This needs to fit into memory
G = {k: icd.graph_encode(v) for k, v in G.items()}

In [11]:
# Configure a basic, quick search
splitter = configs.smarts_splitter_config(
    # Find splits that only differ by 1 bit
    bit_search_min=1,
    bit_search_limit=1,
    # Only examine the two primary atoms of the bond
    # set these all to 1 to split on neighbors too
    branch_min=0,
    branch_limit=0,
    branch_depth_min=0,
    branch_depth_limit=0,
    # Other config options
    unique=False,
    return_matches=True,
    max_splits=0,
    split_general=True,
    split_specific=True,
    unique_complements=False,
    unique_complements_prefer_min=True,
)

In [7]:
# The structures must be extended to at least the depth of S0 before splitting.
# NOTE(review): the original author (JM) flagged this requirement as unclear;
# presumably the structures must cover the same depth as the pattern being
# split -- confirm against the splits module.
# The extender reuses the splitter's branch depth range (0 to 0 in the config
# above); the third positional argument is the `hydrogen` flag (see the named
# arguments used in the "Bitwise Operations" section).
extender = configs.smarts_extender_config(
    splitter.branch_depth_min, splitter.branch_depth_limit, True
)
# The return value is not used, so the structures in ic_list are presumably
# extended in place -- TODO confirm.
graphs.structure_extend(extender, ic_list)
# Drop every unselected atom; the output below shows each structure going
# from 9 nodes to just the 2 bonded atoms.
ic_list = [graphs.structure_remove_unselected(g) for g in ic_list]
ic_list

[<structure with 2 nodes, 1 edges, select=(1, 2), topology=<structure_topology with primary=(0, 1) at 0x7aaec6d1e780> at 0x7aaec9f86050>,
 <structure with 2 nodes, 1 edges, select=(2, 3), topology=<structure_topology with primary=(0, 1) at 0x7aaec6d1e780> at 0x7aaec9f860e0>,
 <structure with 2 nodes, 1 edges, select=(1, 4), topology=<structure_topology with primary=(0, 1) at 0x7aaec6d1e780> at 0x7aaec9f86170>,
 <structure with 2 nodes, 1 edges, select=(1, 5), topology=<structure_topology with primary=(0, 1) at 0x7aaec6d1e780> at 0x7aaec9f86200>,
 <structure with 2 nodes, 1 edges, select=(1, 6), topology=<structure_topology with primary=(0, 1) at 0x7aaec6d1e780> at 0x7aaec9f86290>,
 <structure with 2 nodes, 1 edges, select=(2, 7), topology=<structure_topology with primary=(0, 1) at 0x7aaec6d1e780> at 0x7aaec9f86320>,
 <structure with 2 nodes, 1 edges, select=(2, 8), topology=<structure_topology with primary=(0, 1) at 0x7aaec6d1e780> at 0x7aaec9f863b0>,
 <structure with 2 nodes, 1 edges,

In [8]:
# Define the SMARTS pattern we want to split
S0 = gcd.smarts_decode("[*:1]~[*:2]")
S0 = graphs.structure(S0.nodes, S0.edges, (1, 2), topology.bond)
S0

<structure with 2 nodes, 1 edges, select=(1, 2), topology=<structure_topology with primary=(0, 1) at 0x7aaecd158680> at 0x7aaec9f85cf0>

In [9]:
# Print out the bonds we want to partition
print("Structures")
for i, f in enumerate(ic_list):
    print(i, gcd.smarts_encode(f))

Structures
0 [#6H3:1]-[#6H2:2]
1 [#6H2:2]-[#8H1:3]
2 [#6H3:1]-[#1H0:4]
3 [#6H3:1]-[#1H0:5]
4 [#6H3:1]-[#1H0:6]
5 [#6H2:2]-[#1H0:7]
6 [#6H2:2]-[#1H0:8]
7 [#8H1:3]-[#1H0:9]


In [10]:
# Perform the calculation
configs.remote_compute_enable = False

# Open a local work queue for the split search. The search runs inside
# try/finally so the queue is always closed, even when the search raises;
# otherwise a failed run would leave the queue open for the rest of the
# kernel session.
wq = compute.workqueue_local("127.0.0.1", 63210)
try:
    results: splits.split_return_type = splits.split_structures_distributed(
        splitter, S0, G, selections, wq, icd
    )
finally:
    wq.close()

2025-02-25 16:14:34.350474 Generating splits
2025-02-25 16:14:34.350761 Union merging=8
2025-02-25 16:14:34.372947 Union merging=1
2025-02-25 16:14:34.374128 Generating single splits
2025-02-25 16:14:34.374599 Generated 22 splits
BIT [#6_:1]_[__:2]
BIT [!#6_:1]_[__:2]
BIT [#8_:1]_[__:2]
BIT [!#8_:1]_[__:2]
BIT [_H1:1]_[__:2]
BIT [_!H1:1]_[__:2]
BIT [_H2:1]_[__:2]
BIT [_!H2:1]_[__:2]
BIT [_H3:1]_[__:2]
BIT [_!H3:1]_[__:2]
BIT [__:1]_[#1_:2]
BIT [__:1]_[!#1_:2]
BIT [__:1]_[_H0:2]
BIT [__:1]_[_!H0:2]
2025-02-25 16:14:34.379052 Building tasks
workspace listening on local host. Remote connections prohibited.
2025-02-25 16:14:34.379266 P:   0.00%    28/28 IQ:    1 OQ:    0 IP:    0 LF:    0 RF:    0 RIQ:    0 ROQ:    0 RIP:    0  ERC:    0.0 
2025-02-25 16:14:34.479330 P: 100.00%     0/28 IQ:    0 OQ:    0 IP:    0 LF:    0 RF:    0 RIQ:    0 ROQ:    0 RIP:    0  ERC:    0.0 
    1 CND SPLITS=Y  [!#6:1]~[*:2]
    2 CND SPLITS=Y  [#6:1]~[*:2]
    5 CND SPLITS=N  [!#8:1]~[*:2]
    6 CND SPLITS

The output reads as follows. The structures are shown at depth 0, which corresponds both to the depth of `S0` and to the depth defined in the search settings. Next, the splits are found by first combining (taking the union of) all structures and then enumerating every bit of the result. This resulted in 11 unique bits and, because we asked for both general and specific splits, 22 splits. For example, the bit `[#6_:1]_[__:2]` produces a specific split (one atom must be carbon), whereas its counterpart `[!#6_:1]_[__:2]` produces the general split (one atom must not be carbon). If multiple bits were searched, the algorithm would combine these bits to produce new splits; because this is done by enumerating combinations, the number of candidate splits grows exponentially. The output then shows that, out of the 28 possible splits, only 8 generated unique partitions. This stage is meant to be a quick filter: it tries to fail as fast as possible and therefore does not perform full scans. The remaining output shows that a full match analysis is done on the 8 valid splits before the result is returned.

In [16]:
# custom processing of results
#
# This cell does three things:
#   1. records, per distinct two-way partition, the index of the smallest
#      split that produces it (`seen` / `keep`),
#   2. groups all returned splits by the exact tuple of matched indices,
#   3. prints each group with its SMARTS patterns and the matched structures.

# seen: partition -> (atoms, bits) of the smallest split found so far
# keep: partition -> 1-based index j of that split
# NOTE(review): neither dict is read again in this cell; confirm whether a
# later cell uses them before removing this pass.
seen = {}
keep = {}

print("Results:", len(results.splits))
for j, (Sj, matches, bj) in enumerate(
    zip(results.splits, results.matched_idx, results.shards), 1
):
    # Rebuild the split as a structure carrying the topology of the results
    Sj = graphs.structure(Sj.nodes, Sj.edges, Sj.select, results.topology)
    # (atoms, bits) is compared as a tuple below to rank splits by size
    atoms, bits = len(Sj.select), graphs.graph_bits(Sj, maxbits=True)
    matches = tuple(sorted(matches))
    unmatches = tuple(sorted([i for i in range(len(ic_list)) if i not in matches]))
    # Sort the two halves so a partition and its complement share one key
    entry = tuple(sorted([matches, unmatches]))
    # Ignore trivial partitions (nothing matched, or everything matched)
    if len(matches) > 0 and len(ic_list) != len(matches):
        if entry in seen:
            if (atoms, bits) < seen[entry]:
                seen[entry] = (atoms, bits)
                keep[entry] = j
        else:
            seen[entry] = (atoms, bits)
            keep[entry] = j

# unique: matched-index tuple -> list of (split, shard) pairs producing it.
# The key is the matches in their returned order (not sorted here).
unique = {}
found = 0
for j, (Sj, matches, bj) in enumerate(
    zip(results.splits, results.matched_idx, results.shards), 1
):

    matches = tuple(matches)
    l = unique.get(matches, list())
    l.append((Sj, bj))
    unique[matches] = l

# Report each group. `found` and `j` both count iterations starting from 1,
# so the two leading numbers on each header line are always identical.
for j, (matches, params) in enumerate(unique.items(), 1):
    matches = tuple(matches)
    found += 1
    if splitter.return_matches:
        print(
            f"{found:4d}",
            f"{j:4d}",
            "\n    matched:  ",
            f"{len(matches):4d}",
            "\n    unmatched:",
            f"{len(ic_list) - len(matches):4d}",
        )

    else:
        print(
            f"{found:4d}",
            f"{j:4d}",
        )
    # All SMARTS patterns that yield this same set of matches
    for k, (Sj, bj) in enumerate(params, 1):

        Sj = graphs.structure(Sj.nodes, Sj.edges, Sj.select, results.topology)
        print(f"   {k:2d} Sj:", gcd.smarts_encode(Sj))
        # print(f"   {k:2d} Sj:    ", Sj.nodes)
    if splitter.return_matches:
        print("      ", matches)
        # List every input structure; matched ones are marked with an arrow
        for i, f in enumerate(ic_list):
            if i in matches:
                print(f"{i:4d}", " -> ", f.select, gcd.smarts_encode(f))
            else:
                print(f"{i:4d}", f.select, gcd.smarts_encode(f))
    print("####################################")

Results: 8
   1    1 
    matched:      7 
    unmatched:    1
    1 Sj: [!#6:1]~[*:2]
       (1, 2, 3, 4, 5, 6, 7)
   0 (1, 2) [#6H3:1]-[#6H2:2]
   1  ->  (2, 3) [#6H2:2]-[#8H1:3]
   2  ->  (1, 4) [#6H3:1]-[#1H0:4]
   3  ->  (1, 5) [#6H3:1]-[#1H0:5]
   4  ->  (1, 6) [#6H3:1]-[#1H0:6]
   5  ->  (2, 7) [#6H2:2]-[#1H0:7]
   6  ->  (2, 8) [#6H2:2]-[#1H0:8]
   7  ->  (3, 9) [#8H1:3]-[#1H0:9]
####################################
   2    2 
    matched:      7 
    unmatched:    1
    1 Sj: [#6:1]~[*:2]
       (0, 1, 2, 3, 4, 5, 6)
   0  ->  (1, 2) [#6H3:1]-[#6H2:2]
   1  ->  (2, 3) [#6H2:2]-[#8H1:3]
   2  ->  (1, 4) [#6H3:1]-[#1H0:4]
   3  ->  (1, 5) [#6H3:1]-[#1H0:5]
   4  ->  (1, 6) [#6H3:1]-[#1H0:6]
   5  ->  (2, 7) [#6H2:2]-[#1H0:7]
   6  ->  (2, 8) [#6H2:2]-[#1H0:8]
   7 (3, 9) [#8H1:3]-[#1H0:9]
####################################
   3    3 
    matched:      2 
    unmatched:    6
    1 Sj: [#8:1]~[*:2]
    2 Sj: [H1:1]~[*:2]
       (1, 7)
   0 (1, 2) [#6H3:1]-[#6H2:2]
   1  ->  (2, 

### Analytic/Direct search

In [20]:
matches = (1, 7)

for i in range(len(ic_list)):
    if i not in matches:
        print(i, gcd.smarts_encode(ic_list[i]))
for i in matches:
    print(i, "->", gcd.smarts_encode(ic_list[i]))

spec = configs.smarts_perception_config(splitter, extender)
results: splits.split_return_type = splits.split_partition(
    topology.bond, spec, ic_list, matches, gcd=gcd, maxmoves=0
)

0 [#6H3:1]-[#6H2:2]
2 [#6H3:1]-[#1H0:4]
3 [#6H3:1]-[#1H0:5]
4 [#6H3:1]-[#1H0:6]
5 [#6H2:2]-[#1H0:7]
6 [#6H2:2]-[#1H0:8]
1 -> [#6H2:2]-[#8H1:3]
7 -> [#8H1:3]-[#1H0:9]
LUN:  [#1,#6;H0,H2:2]-[#8H1:3]
LHS:  [__:2]-[#8H1:3]
RUN:  [#6;H2,H3:1]-[#1,#6;H0,H2:2]
RHS:  [#6_:1]-[__:2]
LHS_DIFF:  [__:2]_[#8H1:3]
LHS_INVE:  [*:2]-[#8H1:3]
RHS_DIFF:  [__:1]_[__:2]
BESTLHS:  [*:1]-[#8H1:2]


As above, the structures are printed, but this time the desired partition is indicated with arrows. We selected the two structures that have an oxygen in the bond; the goal is now to find a SMARTS pattern that matches only these two. Some informational output is shown, and the `BESTLHS` line at the bottom indicates that a match was found.

In [21]:
shards = results.value

removeA = shards[2]
addA = shards[3]
nummoves = len(removeA) + len(addA)
verbose = True
shard = shards[0]
matches = [
    x for x in range(len(ic_list)) if x not in removeA and (x in matches or x in addA)
]
if shard is not None:
    print(f"Matches only the input with {nummoves} swaps:", gcd.smarts_encode(shard))
    if verbose and (removeA or addA):
        print("RemoveA", removeA)
        print("AddA", addA)
        for i in range(len(ic_list)):
            if i not in matches:
                print(i, gcd.smarts_encode(ic_list[i]))
        for i in range(len(ic_list)):
            if i in matches:
                print(i, "->", gcd.smarts_encode(ic_list[i]))

shard = shards[1]
if shard is not None:
    print(
        f"Matches the input complement with {nummoves} swaps:", gcd.smarts_encode(shard)
    )
    if verbose and (removeA or addA):
        print("RemoveA", removeA)
        print("AddA", addA)
        for i in range(len(ic_list)):
            if i in matches:
                print(i, gcd.smarts_encode(ic_list[i]))
        for i in range(len(ic_list)):
            if i not in matches:
                print(i, "->", gcd.smarts_encode(ic_list[i]))

Matches only the input with 0 swaps: [*:1]-[#8H1:2]


In [25]:
# Three bonds are identical to bond 2, so this partition cannot be matched exactly
matches = (2,)

for i in range(len(ic_list)):
    if i not in matches:
        print(i, gcd.smarts_encode(ic_list[i]))
for i in matches:
    print(i, "->", gcd.smarts_encode(ic_list[i]))

# Use `maxmoves=2` to produce a SMARTS that matches the desired bonds plus up to 2 more
results: splits.split_return_type = splits.split_partition(
    topology.bond, spec, ic_list, matches, gcd=gcd, maxmoves=2
)

0 [#6H3:1]-[#6H2:2]
1 [#6H2:2]-[#8H1:3]
3 [#6H3:1]-[#1H0:5]
4 [#6H3:1]-[#1H0:6]
5 [#6H2:2]-[#1H0:7]
6 [#6H2:2]-[#1H0:8]
7 [#8H1:3]-[#1H0:9]
2 -> [#6H3:1]-[#1H0:4]
LUN:  [#6H3:1]-[#1H0:4]
LHS:  [#6H3:1]-[#1H0:4]
RUN:  [#6,#8;!H0!H4:1]-[#1,#6;H0,H2:2]
RHS:  [__:1]-[__:2]
LHS_DIFF:  [#6H3:1]_[#1H0:4]
LHS_INVE:  [#6H3:1]-[#1H0:4]
RHS_DIFF:  [__:1]_[__:2]
BESTLHS:  [#6H3:1]-[#1H0:2]


### Hybrid Search

Generate all partitions and perform a direct search on each

In [27]:
# give a unique label to each for combination generation
labels = [str(i) for i in range(len(ic_list))]

# this is k in the nCk partition generation
# will be limited to n//2
spec.splitter.bit_search_limit = 9
results: splits.split_return_type = splits.split_all_partitions(
    topology.bond, spec, ic_list, labels, gcd=gcd, maxmoves=0
)

shards = results.value
for j, (lhs, rhs, matched, unmatch) in enumerate(shards, 1):
    print(f"###\n{j:2d} Sj: {gcd.smarts_encode(lhs)}")
    for i in range(len(ic_list)):
        if i not in matched:
            print(i, gcd.smarts_encode(ic_list[i]))
        else:
            print(i, "->", gcd.smarts_encode(ic_list[i]))

Direct on 1 combo ('0',) depth 0 0
LUN:  [#6H3:1]-[#6H2:2]
LHS:  [#6H3:1]-[#6H2:2]
RUN:  [#6,#8;!H0!H4:2]-[#1,#8;H0,H1:3]
RHS:  [__:2]-[__:3]
LHS_DIFF:  [#6H3:1]_[#6H2:2]
LHS_INVE:  [#6H3:1]-[#6H2:2]
RHS_DIFF:  [__:2]_[__:3]
BESTLHS:  [#6H3:1]-[#6H2:2]
Direct on 1 combo ('1',) depth 0 0
LUN:  [#6H2:2]-[#8H1:3]
LHS:  [#6H2:2]-[#8H1:3]
RUN:  [#6,#8;!H0!H4:1]-[#1,#6;H0,H2:2]
RHS:  [__:1]-[__:2]
LHS_DIFF:  [#6H2:2]_[#8H1:3]
LHS_INVE:  [#6H2:2]-[#8H1:3]
RHS_DIFF:  [__:1]_[__:2]
BESTLHS:  [#6H2:1]-[#8H1:2]
Direct on 1 combo ('2',) depth 0 0
LUN:  [#6H3:1]-[#1H0:4]
LHS:  [#6H3:1]-[#1H0:4]
RUN:  [#6,#8;!H0!H4:1]-[#1,#6;H0,H2:2]
RHS:  [__:1]-[__:2]
LHS_DIFF:  [#6H3:1]_[#1H0:4]
LHS_INVE:  [#6H3:1]-[#1H0:4]
RHS_DIFF:  [__:1]_[__:2]
Direct on 1 combo ('3',) depth 0 0
LUN:  [#6H3:1]-[#1H0:5]
LHS:  [#6H3:1]-[#1H0:5]
RUN:  [#6,#8;!H0!H4:1]-[#1,#6;H0,H2:2]
RHS:  [__:1]-[__:2]
LHS_DIFF:  [#6H3:1]_[#1H0:5]
LHS_INVE:  [#6H3:1]-[#1H0:5]
RHS_DIFF:  [__:1]_[__:2]
Direct on 1 combo ('4',) depth 0 0
LUN:  [#6

## Bitwise Operations

In [31]:
from besmarts.codecs.codec_rdkit import graph_codec_rdkit
from besmarts.core import graphs, mapper

gcd = graph_codec_rdkit()
g = "CCC=O"
g = gcd.smiles_decode(g)

# Combine two atom SMARTS into one
A, B = graphs.graph_to_structure_atoms(g)[:2]
C = mapper.union(A, B)

print(gcd.smarts_encode(graphs.structure_remove_unselected(A)))
print(gcd.smarts_encode(graphs.structure_remove_unselected(B)))
print(gcd.smarts_encode(graphs.structure_remove_unselected(C)))

[#6H3X4x0!rA+0:1]
[#6H2X4x0!rA+0:2]
[#6;H2,H3;X4;x0;!r;A;+0:1]


In [13]:
from besmarts.codecs.codec_rdkit import graph_codec_rdkit
from besmarts.core import configs, graphs, mapper

g = "CCC=O"
g = gcd.smiles_decode(g)

A, B = graphs.graph_to_structure_atoms(g)[:2]
print(gcd.smarts_encode(graphs.structure_remove_unselected(A)))
print(gcd.smarts_encode(graphs.structure_remove_unselected(B)))

# Extend A and B along every bond by one atom
min_depth = 1
max_depth = 1
hydrogen = True
config = configs.smarts_extender_config(min_depth, max_depth, hydrogen)
modified = mapper.mapper_smarts_extend(config, [A, B])
print(gcd.smarts_encode(graphs.structure_remove_unselected(A)))
print(gcd.smarts_encode(graphs.structure_remove_unselected(B)))

# The union happens atomwise
C = mapper.union(A, B)

print(gcd.smarts_encode(graphs.structure_remove_unselected(C)))

[#6H3X4x0!rA+0:1]
[#6H2X4x0!rA+0:2]
[#6H3X4x0!rA+0:1](!@;-[#6H2X4x0!rA+0])(!@;-[#1H0X1x0!rA+0])(!@;-[#1H0X1x0!rA+0])!@;-[#1H0X1x0!rA+0]
[#6H2X4x0!rA+0:2](!@;-[#6H3X4x0!rA+0])(!@;-[#6H1X3x0!rA+0])(!@;-[#1H0X1x0!rA+0])!@;-[#1H0X1x0!rA+0]
[#6;H2,H3;X4;x0;!r;A;+0:1](!@;-[#6;H2,H3;X4;x0;!r;A;+0])(!@;-[#1H0X1x0!rA+0])(!@;-[#1,#6;H0,H1;X1,X3;x0;!r;A;+0])!@;-[#1H0X1x0!rA+0]


In [14]:
from besmarts.core import graph_visitors

# Iterate over the bits in C
for bit in graph_visitors.structure_iter_bits(C, skip_ones=True, iter_inverse=True):
    print(gcd.smarts_encode(bit))

[_H2_____:1](_;_[_______])(_;_[_______])(_;_[_______])_;_[_______]
[_!H2_____:1](_;_[_______])(_;_[_______])(_;_[_______])_;_[_______]
[_H3_____:1](_;_[_______])(_;_[_______])(_;_[_______])_;_[_______]
[_!H3_____:1](_;_[_______])(_;_[_______])(_;_[_______])_;_[_______]
[_______:1](_;_[_H2_____])(_;_[_______])(_;_[_______])_;_[_______]
[_______:1](_;_[_!H2_____])(_;_[_______])(_;_[_______])_;_[_______]
[_______:1](_;_[_H3_____])(_;_[_______])(_;_[_______])_;_[_______]
[_______:1](_;_[_!H3_____])(_;_[_______])(_;_[_______])_;_[_______]
[_______:1](_;_[_______])(_;_[_______])(_;_[#1______])_;_[_______]
[_______:1](_;_[_______])(_;_[_______])(_;_[!#1______])_;_[_______]
[_______:1](_;_[_______])(_;_[_______])(_;_[#6______])_;_[_______]
[_______:1](_;_[_______])(_;_[_______])(_;_[!#6______])_;_[_______]
[_______:1](_;_[_______])(_;_[_______])(_;_[_H0_____])_;_[_______]
[_______:1](_;_[_______])(_;_[_______])(_;_[_!H0_____])_;_[_______]
[_______:1](_;_[_______])(_;_[_______])(_;_[_H1_____])_

In [7]:
from besmarts.codecs.codec_rdkit import graph_codec_rdkit
from besmarts.core import graphs, mapper

gcd = graph_codec_rdkit()

a = graphs.structure_atom(gcd.smarts_decode("[#6,#7,#8:1]"), [1])
b = graphs.structure_atom(gcd.smarts_decode("[#7,#8,#9:1]"), [1])

gcd.smarts_encode(mapper.union(a, b))

'[#6,#7,#8,#9:1]'

In [8]:
gcd.smarts_encode(mapper.intersection(a, b))

'[#7,#8:1]'

In [9]:
gcd.smarts_encode(mapper.xor(a, b))

'[#6,#9;_;_;_;_;_;_:1]'

In [10]:
gcd.smarts_encode(mapper.subtract(a, b))

'[#6______:1]'

In [11]:
gcd.smarts_encode(mapper.subtract(b, a))

'[#9______:1]'

In [18]:
# Cannot directly compare subgraphs; only structures.
# `gcd.smarts_decode` returns a bare subgraph, and `mapper.union` fails on it
# with an AttributeError (no `topology` attribute). The error is the point of
# this cell, so catch and print it rather than letting it abort
# "Restart Kernel -> Run All".
try:
    gcd.smarts_encode(
        mapper.union(
            gcd.smarts_decode("[#6,#7,#8:1]"), gcd.smarts_decode("[#7,#8,#9:1]")
        )
    )
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

AttributeError: 'subgraph' object has no attribute 'topology'

## Clustering

### Clustering categorical data

In [1]:
from besmarts.assign.hierarchy_assign_rdkit import smarts_hierarchy_assignment_rdkit
from besmarts.cluster.cluster_assignment import smiles_assignment_str
from besmarts.cluster.cluster_optimization import cluster_classifications
from besmarts.codecs.codec_rdkit import graph_codec_rdkit
from besmarts.core import configs, hierarchies
from besmarts.core.assignments import smiles_assignment_group_bonds

In [30]:
def autorepr(obj):
    """Build a constructor-like summary string of an object's public data.

    Every name returned by ``dir(obj)`` that does not start with an
    underscore and whose value is not callable is rendered as
    ``name=value``, with the value formatted via ``str``. The fields appear
    in the order ``dir`` yields them and are joined with ``", "`` inside
    ``ClassName(...)``.

    Args:
        obj: Any object.

    Returns:
        A string such as ``"Point(x=1, y=2)"``.
    """
    fields = []
    for attr in dir(obj):
        if attr.startswith("_"):
            continue  # private and dunder names are not shown
        value = getattr(obj, attr)
        if callable(value):
            continue  # methods are not shown
        fields.append(f"{attr}={value}")
    return f"{obj.__class__.__name__}({', '.join(fields)})"

In [24]:
configs.workqueue_port = 54321  # make sure this port is open/unused
configs.remote_compute_enable = False  # port is only open to localhost

gcd = graph_codec_rdkit()
labeler = smarts_hierarchy_assignment_rdkit()

In [31]:
# Label the bonds of ethyne either "a" or "b"
smi = "[C:1]([H:3])#[C:2][H:4]"
assns = {(1, 2): "a", (1, 3): "b", (2, 4): "b"}
sa = smiles_assignment_str(smi, assns)
print(autorepr(sa))

smiles_assignment_str(selections={(1, 2): 'a', (1, 3): 'b', (2, 4): 'b'}, smiles=[C:1]([H:3])#[C:2][H:4])


In [32]:
# Find a SMARTS hierarchy that assigns the requested labels
sag = smiles_assignment_group_bonds([sa])
cst = cluster_classifications(gcd, labeler, sag)

Assigning molecule     1/1 at depth 0
Labels per unique structure that need more depth
There are 2/3 unique structures at depth 0
There are 0 problems:
Max depth is set to 0
2025-02-26 15:27:03.297534 Labeling subgraphs
2025-02-26 15:27:03.298682 Checking consistency...
Optimization strategy is building steps...
2025-02-26 15:27:03.298819 The optimization strategy has the following iterations:
->   1. op= 1 a=[0] b=1->1 d=0->0 n=0->1
     2. op=-1 a=[0] b=0->0 d=0->0 n=0->0
     3. op= 1 a=[0] b=2->2 d=0->0 n=0->2
     4. op=-1 a=[0] b=0->0 d=0->0 n=0->0
     5. op= 1 a=[0] b=3->3 d=0->0 n=0->3
     6. op=-1 a=[0] b=0->0 d=0->0 n=0->0
Targets for this macro step 1:
1 p0
N Targets: 1
Step tracker for current macro step 1
p0 1


*******************
 2025-02-26 15:27:03.299028 iteration=   1 macro=  1/6 X=        0 params=(2|2) G=Y S=Y bits=1->3 depth=0->0 branch=0->3
*******************

Tree:
**  0   0 p0   {'b', 'a'} [*:1]~[*:2]
=====

2025-02-26 15:27:03.299077 Saving checkpoint to ch

In [33]:
hierarchies.smarts_hierarchy_print(cst.hierarchy)

**  0 p0 [*:1]~[*:2]
**   1 p1 [*:1]#[*:2]


### Clustering numerical data

In [34]:
from besmarts.assign.hierarchy_assign_rdkit import smarts_hierarchy_assignment_rdkit
from besmarts.cluster.cluster_assignment import smiles_assignment_float
from besmarts.cluster.cluster_objective import clustering_objective_mean_separation
from besmarts.cluster.cluster_optimization import cluster_means
from besmarts.codecs.codec_rdkit import graph_codec_rdkit
from besmarts.core import configs, hierarchies
from besmarts.core.assignments import smiles_assignment_group_bonds

In [35]:
# Compute configuration. Each option is set once here; the original cell
# repeated both assignments with the same values.
configs.workqueue_port = 54321  # make sure this port is open/unused
configs.remote_compute_enable = False  # port is only open to localhost

# Graph codec and labeler used by the clustering cells below
gcd = graph_codec_rdkit()
labeler = smarts_hierarchy_assignment_rdkit()

In [36]:
# Define a bond length for each bond in ethyne
smi = "[C:1]([H:3])#[C:2][H:4]"
assns = {
    (1, 2): [1.1],  # CC
    (1, 3): [1.3],  # CH
    (2, 4): [1.3],  # CH
}
sa = smiles_assignment_float(smi, assns)
print(autorepr(sa))

smiles_assignment_float(selections={(1, 2): [1.1], (1, 3): [1.3], (2, 4): [1.3]}, smiles=[C:1]([H:3])#[C:2][H:4])


In [37]:
# Define an objective function for the clustering
# We want to find a hierarchy where parent and child SMARTS have bond lengths that differ by more than 0.1
objective = clustering_objective_mean_separation(split_separation=0.1)

In [38]:
# Perform the clustering
sag = smiles_assignment_group_bonds([sa])
cst = cluster_means(gcd, labeler, sag, objective=objective)

2025-02-26 15:34:21.752516 Labeling subgraphs
2025-02-26 15:34:21.753616 Checking consistency...
Optimization strategy is building steps...
2025-02-26 15:34:21.753714 The optimization strategy has the following iterations:
->   1. op= 1 a=[0] b=1->1 d=0->0 n=0->0
     2. op=-1 a=[0] b=0->0 d=0->0 n=0->0
     3. op= 1 a=[0] b=2->2 d=0->0 n=0->0
     4. op=-1 a=[0] b=0->0 d=0->0 n=0->0
Targets for this macro step 1:
1 p0
N Targets: 1
Step tracker for current macro step 1
p0 1


*******************
 2025-02-26 15:34:21.753854 iteration=   1 macro=  1/4 X=        0 params=(2|1) G=N S=Y bits=1->2 depth=0->0 branch=0->0
*******************

Tree:
**  0   0 p0    Mean=    1.2333 Std=    0.0943 N=      3 Min=    1.1000 Max=    1.3000 [*:1]~[*:2]
=====

2025-02-26 15:34:21.753960 Saving checkpoint to chk.cst.p
2025-02-26 15:34:21.754414 Collecting SMARTS for p0 N=3/3 and setting to depth=0
 == iteration=   2 macro=  1/4 micro=  1/1 operation=1 params=(2|1) cluster=p0   N= 3 overlap=[0] bits=1->

In [39]:
hierarchies.smarts_hierarchy_print(cst.hierarchy)

**  0 p0 [*:1]~[*:2]
**   1 p1 [*:1]#[*:2]


## Force Field Fitting

In [1]:
from typing import Dict, List, Tuple

from besmarts.assign import hierarchy_assign_rdkit
from besmarts.codecs import codec_rdkit
from besmarts.core import (
    arrays,
    assignments,
    compute,
    configs,
    graphs,
    optimization,
    perception,
    topology,
)
from besmarts.mechanics import fits
from besmarts.mechanics import molecular_models as mm
from besmarts.mechanics import optimizers_scipy, smirnoff_models, smirnoff_xml

In [2]:
def load_xyz(flist, indices=None) -> assignments.graph_db_row:
    """
    Read XYZ-formatted text into a single graph_db_row, one column per frame.

    Parameters
    ----------
    flist
        Iterable of strings holding XYZ text. Each item is split on newlines
        directly; nothing is opened from disk. The first token of the first
        line is the atom count N, and every string must declare the same N.
    indices
        Lookup giving the selection index for the i-th (0-based) atom line of
        a frame, read as ``indices[i]``. Defaults to ``1..N``.

    Returns
    -------
    assignments.graph_db_row
        Column ``s`` holds the s-th frame read across all strings; its
        selections map ``(index,)`` to ``[x, y, z]`` as floats.
    """
    frame_count = 0
    natoms = None
    row = assignments.graph_db_row()

    for text in flist:
        records = text.split("\n")
        declared = int(records[0].split()[0])
        if natoms is None:
            natoms = declared
        if indices is None:
            indices = list(range(1, natoms + 1))
        assert natoms == declared

        # Each frame spans N + 2 lines: the count line, a second header line,
        # then N atom lines.
        for frame in arrays.batched(records, natoms + 2):
            # Skip empty chunks and chunks whose first line is blank
            # (e.g. the remainder left after a trailing newline).
            if not frame or not frame[0]:
                continue
            coords = {}
            for pos, record in enumerate(frame, -2):
                if pos < 0:
                    continue  # the two header lines carry no coordinates
                _, x, y, z = record.split()[:4]
                coords[(indices[pos],)] = [float(x), float(y), float(z)]
            column = assignments.graph_db_column()
            column.selections.update(coords)
            row.columns[frame_count] = column
            frame_count += 1
    return row


def new_gdb(f: Dict[str, List[Dict]]) -> assignments.graph_db:
    """
    Build a graph_db from reference data keyed by SMILES.

    Parameters
    ----------
    f
        Maps each SMILES to a list of records. A record is a dict keyed by
        table id: ``assignments.POSITIONS`` (required) and
        ``assignments.GRADIENTS`` (optional) hold XYZ text that is handed
        straight to ``load_xyz``, while ``assignments.ENERGY`` (optional)
        holds the path of a file containing one float per non-empty line.

    Returns
    -------
    assignments.graph_db
        The database with each SMILES registered as a graph and one entry
        created per SMILES.

    Notes
    -----
    Tables are stored on the entry by table id, so when a SMILES has several
    records the tables of a later record replace those of an earlier one.
    NOTE(review): confirm whether multiple records per SMILES are meant to be
    supported.
    """
    gcd = codec_rdkit.graph_codec_rdkit()
    gdb = assignments.graph_db()

    for smi, fn_dict in f.items():

        g = gcd.smiles_decode(smi)
        gid = assignments.graph_db_add_graph(gdb, smi, g)

        gdb.graphs[gid] = g
        gdb.smiles[gid] = smi
        # Add this molecule's atoms under its own gid. Assigning a brand-new
        # dict to the atom key would drop the atoms of every molecule that was
        # registered on an earlier pass of this loop.
        gdb.selections.setdefault(topology.index_of(topology.atom), {})[gid] = {
            k: v for k, v in enumerate(graphs.graph_atoms(g))
        }
        gde = assignments.graph_db_entry()
        gdb.entries[len(gdb.entries)] = gde
        for rdata in fn_dict:
            # Positions: XYZ text parsed into row 0 of an atom table.
            tid = assignments.POSITIONS
            gdt = assignments.graph_db_table(topology.atom)
            gdg = assignments.graph_db_graph()
            gdt.graphs[gid] = gdg
            fn = rdata[tid]
            # None makes load_xyz number the XYZ rows 1..N in file order.
            indices = None
            gdr = load_xyz([fn], indices=indices)
            gdg.rows[0] = gdr
            gde.tables[tid] = gdt

            # Gradients: same layout as positions, when present.
            tid = assignments.GRADIENTS
            if tid in rdata:
                gdt = assignments.graph_db_table(topology.atom)
                gdg = assignments.graph_db_graph()
                gdt.graphs[gid] = gdg
                fn = rdata[tid]
                gdr = load_xyz([fn], indices=indices)
                gdg.rows[0] = gdr
                gde.tables[tid] = gdt
                # Flatten the first frame's per-atom vectors into the table values.
                gx = [x for y in gdr[0].selections.values() for x in y]
                gdt.values.extend(gx)

            # Energies: unlike the XYZ inputs, this value is a file path.
            tid = assignments.ENERGY
            if tid in rdata:
                gdt = assignments.graph_db_table(topology.null)
                fn = rdata[tid]
                # Close the file deterministically instead of leaking the handle.
                with open(fn) as fd:
                    ene = [float(x) for x in fd.read().split("\n") if x]
                gdt.values.extend(ene)
                gde.tables[tid] = gdt
    return gdb


def run(gdb, ff_fn):
    """
    Fit the length of bond parameter b4 against the data in ``gdb`` and print a report.

    Parameters
    ----------
    gdb
        Graph database with the reference data. Entry 0 is the target of both
        the position objective and the gradient objective.
    ff_fn
        Path of the SMIRNOFF force field file passed to ``smirnoff_load``.

    Returns
    -------
    None
        Everything is reported on stdout: the parameters that changed or were
        added, the initial and final objectives, and their relative change.
    """

    def percent_change(new, ref):
        # A zero reference has no defined relative change. Report NaN rather
        # than raising after the fit has finished and losing the summary.
        return 100 * (new - ref) / ref if ref != 0 else float("nan")

    # build the dataset and input ff
    gcd = codec_rdkit.graph_codec_rdkit()
    labeler = hierarchy_assign_rdkit.smarts_hierarchy_assignment_rdkit()
    pcp = perception.perception_model(gcd, labeler)
    csys = smirnoff_models.smirnoff_load(ff_fn, pcp)
    psys = fits.gdb_to_physical_systems(gdb, csys)

    # Fit only bond parameter b4
    models = {0: ["b4"]}
    fit_models = [0]

    strat = fits.forcefield_optimization_strategy_default(csys, models=models)
    co = fits.chemical_objective

    # Define the final objective tier that performs a full fit
    # The best parameter is accepted and incorporated into the parameter set
    final = fits.objective_tier()
    final.objectives = {
        0: fits.objective_config_position(
            assignments.graph_db_address(
                eid=[0],
            ),
            scale=1,
        ),
        1: fits.objective_config_gradient(
            assignments.graph_db_address(
                eid=[0],
            ),
            scale=1,
        ),
    }
    final.objectives[0].verbose = 2
    final.objectives[1].verbose = 2
    final.fit_models = fit_models
    final.fit_symbols = ["l"]  # Fit only bond lengths; add "k" to fit force constants

    final.method = "L-BFGS-B"

    # Define an objective tier to focus on promising fits
    # It shares the objectives dict with the final tier.
    onestep = fits.objective_tier()
    onestep.objectives = final.objectives
    onestep.step_limit = 2  # Perform only 2 fitting steps
    onestep.accept = 3  # Pass the top 3 candidates to the next tier
    onestep.fit_models = fit_models
    onestep.fit_symbols = ["l"]
    onestep.method = "L-BFGS-B"

    tiers = [onestep]  # have a default

    # The initial objective is the same tier object as the final one.
    initial = final

    # Snapshot the parameter values before fitting so changes can be reported.
    kv0 = mm.chemical_system_iter_keys(csys)
    # NOTE(review): this function name looks like a search-and-replace artifact
    # of fits.ff_optimize; confirm against the installed besmarts version.
    newcsys, (P0, P), (C0, C) = fits.ff_optimperception_modelize(
        csys, gdb, psys, strat, co, initial, tiers, final
    )

    print("Modified parameters:")
    kv = mm.chemical_system_iter_keys(newcsys)
    for k, v in kv.items():
        v0 = kv0.get(k)
        if v0 is not None:
            dv = v - v0
            # Only list parameters that actually moved.
            if abs(dv) > 1e-7:
                print(f"{str(k):20s} | New: {v:12.6g} Ref {v0:12.6g} Diff {dv:12.6g}")
        else:
            # Key absent from the input force field.
            print(f"{str(k):20s} + New: {v:12.6g}")
    print("Initial objectives:")
    X0 = P0 + C0
    X = P + C
    print(f"Total= {X0:15.8g} Physical {P0:15.8g} Chemical {C0:15.8g}")
    print("Final objectives:")
    print(f"Total= {X:15.8g} Physical {P:15.8g} Chemical {C:15.8g}")
    print("Differences:")
    print(
        f"Total= {percent_change(X, X0):14.2f}% Physical {percent_change(P, P0):14.2f}% Chemical {percent_change(C, C0):14.2f}%"
    )


In [3]:
# Compute settings for the fit; set them before calling run().
# NOTE(review): presumably limits besmarts to a single local worker; confirm against besmarts.core.configs.
configs.processors = 1
# Disable remote compute; the fit log further down reports "Remote connections prohibited."
configs.remote_compute_enable = False
# Port used by the work queue. NOTE(review): presumably any free local port works; confirm.
configs.workqueue_port = 54321

In [4]:
# Fully atom-mapped SMILES of the molecule to fit; the map indices run from 1 to 11.
smi = "[C:1]1([H:9])=[C:2]([H:10])[C:3]([H:11])=[C:4]([C:5](=[O:6])[Cl:7])[O:8]1"

# Reference coordinates as XYZ text: atom count, an empty comment line, then one row per atom.
# The rows follow the atom-map order of `smi` (C, C, C, C, C, O, Cl, O, H, H, H),
# which is what load_xyz assumes when it numbers the rows 1..N.
xyz_positions = """11

  C -1.448194 -0.849408  0.168489
  C -1.594013  0.503187 -0.016781
  C -0.273976  1.026226 -0.135035
  C  0.580644 -0.047164 -0.013031
  C  2.034612 -0.068609 -0.059252
  O  2.728097  0.901087 -0.219099
 Cl  2.762146 -1.707341  0.146556
  O -0.138973 -1.200446  0.173518
  H -2.152268 -1.658361  0.306090
  H -2.527430  1.048099 -0.061800
  H  0.029352  2.052732 -0.289658
"""

# Reference gradients in the same XYZ layout and atom order as xyz_positions.
xyz_grad = """11

  C      0.49755    0.17370   -0.04115
  C     -0.00884    0.07632   -0.01031
  C      0.20074   -0.69547    0.09073
  C     -0.02955    1.24848   -0.17483
  C      0.55229   -1.91119    0.25039
  O     -0.15948    0.65794   -0.08724
 Cl     -0.33030    0.82983   -0.10559
  O     -0.73720   -0.66864    0.11909
  H     -0.11502    0.11021   -0.01168
  H     -0.00691    0.04737   -0.00649
  H      0.02566   -0.05163    0.00657
"""

# Input for new_gdb: each SMILES maps to a list of records, and each record maps
# a table id to its XYZ text (the text itself, not a file name).
d = {
    smi: [
        {
            assignments.POSITIONS: xyz_positions,
            assignments.GRADIENTS: xyz_grad,
        },
    ],
}
gdb = new_gdb(d)


{1: {0: {0: (1,),
   1: (9,),
   2: (2,),
   3: (10,),
   4: (3,),
   5: (11,),
   6: (4,),
   7: (5,),
   8: (6,),
   9: (7,),
   10: (8,)}}}

In [5]:
!wget https://raw.githubusercontent.com/openforcefield/openff-forcefields/refs/heads/main/openforcefields/offxml/openff-2.1.0.offxml

--2025-03-13 11:37:50--  https://raw.githubusercontent.com/openforcefield/openff-forcefields/refs/heads/main/openforcefields/offxml/openff-2.1.0.offxml
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 74400 (73K) [text/plain]
Saving to: ‘openff-2.1.0.offxml.2’


2025-03-13 11:37:51 (4.25 MB/s) - ‘openff-2.1.0.offxml.2’ saved [74400/74400]



In [18]:
# Run the fit on the database built above, reading the force field file fetched by the wget cell.
run(gdb, "openff-2.1.0.offxml")

Optimization strategy is building steps...
2025-02-26 16:49:33.233993 The optimization strategy has the following iterations:
->   1:00. op= 1 m=[0] a=[0] b=1->1 d=0->0 n=0->0
     2:00. op=-1 m=[0] a=[0] b=0->0 d=0->0 n=0->0
     3:00. op= 1 m=[0] a=[0] b=2->2 d=0->0 n=0->0
     4:00. op=-1 m=[0] a=[0] b=0->0 d=0->0 n=0->0
workspace listening on local host. Remote connections prohibited.

Initial parameter assignments of dataset:
Model:
Tree:
 0   0 Bonds  
 1   0  b4   [#6X3:1]-[#6X3:2] k: [540.3345953498] l: [1.466199291912]
 1   0  b6   [#6X3:1]=[#6X3:2] k: [898.589948525] l: [1.382361687103]
 1   0  b17  [#6X3:1]-[#8X2:2] k: [598.9859275918] l: [1.357746519746]
 1   0  b21  [#6:1]=[#8X1+0,#8X2+1:2] k: [1527.019744047] l: [1.221668642702]
 1   0  b70  [#6:1]-[#17:2] k: [368.4266150848] l: [1.722215272811]
 1   0  b85  [#6X3:1]-[#1:2] k: [775.3853383846] l: [1.081823673944]
Tree:
 0   1 Angles  
 1   1  a10  [*:1]~[#6X3:2]~[*:3] k: [147.0414413301] l: [2.0914524780510044]
 1   1  a1