In [1]:
from qcportal import PortalClient
from qcelemental.models.results import WavefunctionProtocolEnum
from openff.qcsubmit.results import OptimizationResultCollection,BasicResultCollection
from openff.qcsubmit.datasets import OptimizationDataset
from openff.qcsubmit.factories import OptimizationDatasetFactory
from openff.qcsubmit.common_structures import Metadata, QCSpec
import numpy as np
from openff.toolkit import Molecule
import tqdm
from openff.toolkit.utils import OpenEyeToolkitWrapper, ToolkitRegistry
from openff.units import unit
import itertools
import multiprocess
from openeye import oechem

In [2]:
# Load the optimization dataset stored in dataset.json.bz2.
dataset_file = 'dataset.json.bz2'
test = OptimizationDataset.parse_file(dataset_file)

In [3]:
# Conformer count of every molecule in ``test``, in iteration order.
n_confs1 = np.array([molecule.n_conformers for molecule in test.molecules])

In [4]:
# Conformers per molecule: minimum, mean, median, maximum.
print(n_confs1.min(), n_confs1.mean(), np.median(n_confs1), n_confs1.max())

1 2.357153357098232 1.0 10


In [6]:
# Positions of the molecules that carry more than five conformers.
np.argwhere(n_confs1 > 5)

array([[44732]])

In [9]:
# Locate the outlier from the conformer counts instead of hard-coding its
# position, so this cell keeps selecting the right molecule if the dataset
# (and therefore the index) changes.
too_many_confs_index = int(np.flatnonzero(n_confs1 > 5)[0])
# ``islice`` advances to that single molecule without first collecting every
# molecule of the dataset into a list; it accepts any iterable.
too_many_confs = next(itertools.islice(test.molecules, too_many_confs_index, None))

In [10]:
# Sum of the per-atom mass magnitudes of the flagged molecule.
sum(atom.mass.m for atom in too_many_confs.atoms)

160.218142

In [22]:
# Walk every ordered pair of conformers (self-pairs included) and print the
# pairs that agree in at least one coordinate entry but not in all of them.
for conf, conf2 in itertools.product(too_many_confs.conformers, repeat=2):
    same_entries = conf == conf2
    if np.any(same_entries) and not np.all(same_entries):
        print('\nConf1\n', conf)
        print('\nConf2\n', conf2)
        print('\nDiff\n', conf - conf2)


Conf1
 [[0.2372396582545052 0.9181611543560623 -0.3408256792389622] [-0.5280258641708974 1.9266209598111603 0.5144005405515044] [-0.10628502055563531 3.300006868056862 0.2500186846358158] [-0.665168345625347 4.394350526142954 0.9129877115631527] [-0.21475531015433122 5.674424648250734 0.6102113713596934] [-1.6618332884759737 4.235251429145449 1.8697674250051708] [-0.1613673569025521 -0.5491163753728665 -0.10909849174107637] [0.4946216638597981 -1.138903855938483 1.1414395567141482] [-0.21148632322574404 -2.366973399859228 1.6695493483906951] [0.33408292967648423 -3.2871694575538726 2.2689063532193123] [0.28671255741267654 -1.365767359815634 -1.2771856770950973] [1.3069505669736465 1.0394122589550416 -0.13207167258159497] [0.09378273798031113 1.1772170056889377 -1.3975962419040489] [-1.5952966203515742 1.865294935637597 0.27908101768208066] [-0.4034210725389535 1.7305893893241697 1.5823967457706096] [0.6201660043379299 3.479109525098052 -0.44040012420654245] [0.5129511929638758 5.82766

In [23]:
# Display the flagged molecule (the recorded output is an NGLWidget).
too_many_confs



NGLWidget(max_frame=9)

In [27]:
import numpy
def compute_rmsd_matrix_oe(molecule: Molecule) -> numpy.ndarray:
    """Compute the pairwise RMSD between all conformers stored on a molecule
    using an OpenEye backend.

    Parameters
    ----------
    molecule
        The molecule whose conformers are compared. It is converted to an
        OpenEye molecule with ``molecule.to_openeye()``.

    Returns
    -------
    numpy.ndarray
        A symmetric ``(n_conformers, n_conformers)`` matrix with zeros on the
        diagonal, where entry ``[i, j]`` is the value ``oechem.OERMSD`` reports
        for conformers ``i`` and ``j``. A molecule without conformers yields an
        empty ``(0, 0)`` matrix.
    """

    from openeye import oechem

    oe_molecule: oechem.OEMol = molecule.to_openeye()
    oe_conformers = list(oe_molecule.GetConfs())

    # ``molecule.conformers`` is ``None`` when no conformers are stored, so
    # ``len(molecule.conformers)`` would raise a TypeError; ``n_conformers``
    # reports 0 in that case.
    n_conformers = molecule.n_conformers

    rmsd_matrix = numpy.zeros((n_conformers, n_conformers))

    # Fill the strict upper triangle only; each unordered pair is visited once.
    for i, j in itertools.combinations(range(len(oe_conformers)), 2):
        rmsd_matrix[i, j] = oechem.OERMSD(
            oe_conformers[i],
            oe_conformers[j],
            True,  # automorph
            False,  # heavyOnly
            True,  # overlay
        )

    # The lower triangle and diagonal are still zero, so adding the transpose
    # mirrors the values and makes the matrix symmetric.
    rmsd_matrix += rmsd_matrix.T
    return rmsd_matrix

In [28]:
# Pairwise RMSD matrix between the conformers of the flagged molecule.
compute_rmsd_matrix_oe(too_many_confs)



array([[0.        , 0.27097823, 0.35533477, 0.20790342, 0.26359266,
        0.09117674, 0.28702991, 0.22126973, 0.27927432, 0.16620801],
       [0.27097823, 0.        , 0.20836479, 0.31437573, 0.34157769,
        0.28473542, 0.09114855, 0.32228679, 0.35280758, 0.39400016],
       [0.35533477, 0.20836479, 0.        , 0.26997963, 0.41978506,
        0.36897308, 0.23236379, 0.28315308, 0.43138225, 0.4576149 ],
       [0.20790342, 0.31437573, 0.26997963, 0.        , 0.34080102,
        0.23255965, 0.33216007, 0.09113447, 0.35645036, 0.28818008],
       [0.26359266, 0.34157769, 0.41978506, 0.34080102, 0.        ,
        0.27781823, 0.35374673, 0.34837541, 0.09138267, 0.33237548],
       [0.09117674, 0.28473542, 0.36897308, 0.23255965, 0.27781823,
        0.        , 0.2709509 , 0.20786537, 0.26304314, 0.14011237],
       [0.28702991, 0.09114855, 0.23236379, 0.33216007, 0.35374673,
        0.2709509 , 0.        , 0.31427515, 0.34127424, 0.38427865],
       [0.22126973, 0.32228679, 0.2831530

In [29]:
# Load the optimization result collection from its JSON file.
results_file = 'filtered_and_combined_nagl2_opt.json'
test2 = OptimizationResultCollection.parse_file(results_file)

In [33]:
# Atom-mapped SMILES of the flagged molecule; the next cell compares it against each entry's cmiles.
too_many_confs.to_smiles(mapped=True)

'[H:24][C:9](=[O:10])[C:8]([H:22])([H:23])[C:7]([H:21])([C:1]([H:12])([H:13])[C:2]([H:14])([H:15])[N+:3](=[C:4]([N:5]([H:17])[H:18])[N:6]([H:19])[H:20])[H:16])[N+:11]([H:25])([H:26])[H:27]'

In [34]:
# Collect the result-collection entries whose CMILES equals the mapped SMILES
# of the flagged molecule.
target_cmiles = too_many_confs.to_smiles(mapped=True)
og_too_many_confs = [
    entry
    for entry in test2.entries['https://api.qcarchive.molssi.org:443/']
    if entry.cmiles == target_cmiles
]

In [36]:
# Number of result-collection entries whose cmiles matched the flagged molecule.
len(og_too_many_confs)

10

In [37]:
# Load the part-1 optimization dataset from the sibling submission directory.
part1_dataset_file = '../2024-11-19-OpenFF-NAGL2-Training-Optimization-Dataset-Part-1-v4.0/dataset_part1.json.bz2'
test3 = OptimizationDataset.parse_file(part1_dataset_file)

In [38]:
# Conformer count of every molecule in ``test3``, in iteration order.
n_confs3 = np.array([molecule.n_conformers for molecule in test3.molecules])

In [39]:
# Conformers per molecule in ``test3``: minimum, mean, median, maximum.
print(n_confs3.min(), n_confs3.mean(), np.median(n_confs3), n_confs3.max())

1 2.3793847716472594 1.0 10


In [40]:
# Positions of the molecules in ``test3`` that carry more than five conformers.
np.argwhere(n_confs3 > 5)

array([[45575]])