In [1]:
# Sets the OE_LICENSE environment variable for this kernel session.
# NOTE(review): this is an absolute, user-specific path, so the cell only works
# unchanged on the original author's machine -- point it at your own license
# file (presumably an OpenEye toolkit license; confirm) before running.
%env OE_LICENSE=/home/brent/oe_license.txt

env: OE_LICENSE=/home/brent/oe_license.txt


In [2]:
from pprint import pprint

import qcportal

from openff.toolkit import Molecule, ForceField
from openff.qcsubmit.datasets import OptimizationDataset
from openff.qcsubmit.factories import OptimizationDatasetFactory

import numpy as np

In [3]:
# Force field used below to label each molecule's proper torsions.
# NOTE(review): "tm-2.2.offxml" is presumably a local file holding the
# torsion-multiplicity parameters mentioned in the dataset description -- confirm
# it is available next to this notebook.
ff = ForceField("tm-2.2.offxml")

In [4]:
# Build the list of molecules to submit. Each non-blank line of all.smiles is
# expected to hold a parameter ID and a SMILES string separated by whitespace.
mols = []
with open("all.smiles") as inp:
    for lineno, line in enumerate(inp, start=1):
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline at the end of the
            # file) instead of failing with an opaque unpacking error.
            continue
        if len(fields) != 2:
            raise ValueError(
                f"all.smiles line {lineno}: expected '<pid> <smiles>', "
                f"got {line!r}"
            )
        pid, smiles = fields
        mol = Molecule.from_smiles(smiles, allow_undefined_stereo=True)
        labels = ff.label_molecules(mol.to_topology())[0]["ProperTorsions"]
        # Sanity check: the force field must actually assign the listed
        # parameter ID to at least one proper torsion of this molecule.
        assigned_ids = {p.id for p in labels.values()}
        assert pid in assigned_ids, (
            f"all.smiles line {lineno}: {smiles} is not assigned "
            f"proper torsion {pid}"
        )
        mols.append(mol)

In [5]:
# Report how many input entries were read and how many distinct molecules they
# represent (distinct by the SMILES string each molecule serializes to).
n_total = len(mols)
unique_smiles = {entry.to_smiles() for entry in mols}
print(f"{n_total} molecules")
print(f"{len(unique_smiles)} unique molecules")

56 molecules
51 unique molecules




In [6]:
# Location of the submission directory in the qca-dataset-submission repository.
# NOTE(review): the saved output of the metadata dump further down shows a URL
# ending in "...Torsion-Benchmark-Supplement-v1.0", which differs from the value
# assembled here -- confirm which directory name is correct and re-run the
# notebook so that code and recorded output agree.
submission_url = (
    "https://github.com/openforcefield/qca-dataset-submission/tree/master/"
    "submissions/"
    "2024-04-18-OpenFF-Torsion-Benchmark-Supplement-Optimization-Dataset-v1.0"
)

# Create the (initially empty) optimization dataset with its descriptive text.
dataset = OptimizationDataset(
    dataset_name="OpenFF Torsion Benchmark Supplement Optimization Dataset v1.0",
    dataset_tagline="B3LYP-D3BJ/DZVP optimization of molecules to increase proper torsion testing coverage.",
    description="Additional benchmarking data for Sage 2.2.0 proper torsions and new parameters from the torsion multiplicity work",
)

# Remaining metadata fields are set on the dataset after construction.
dataset.metadata.long_description_url = submission_url
dataset.metadata.submitter = "ntBre"

In [7]:
# Register every molecule with the dataset, using the factory to generate the
# index under which each entry is stored.
dataset_factory = OptimizationDatasetFactory()
for molecule in mols:
    entry_index = dataset_factory.create_index(molecule=molecule)
    dataset.add_molecule(entry_index, molecule)

[15:46:41] UFFTYPER: Unrecognized charge state for atom: 13
[15:46:41] UFFTYPER: Unrecognized charge state for atom: 13
[15:46:41] UFFTYPER: Unrecognized charge state for atom: 11
[15:46:45] UFFTYPER: Unrecognized charge state for atom: 31
[15:46:45] UFFTYPER: Unrecognized charge state for atom: 13
[15:46:45] UFFTYPER: Unrecognized charge state for atom: 13
[15:46:46] UFFTYPER: Unrecognized charge state for atom: 11


In [8]:
# Summary statistics for the dataset: molecule/record counts, conformers per
# molecule, molecular weights, and the set of total charges present.

# Materialize the molecules once so the three summaries below share a single
# pass over dataset.molecules instead of re-traversing it each time.
molecules = list(dataset.molecules)

confs = np.array([len(mol.conformers) for mol in molecules])

print("Number of unique molecules       ", dataset.n_molecules)
print("Number of filtered molecules     ", dataset.n_filtered)
print("Number of conformers             ", dataset.n_records)
print("Number of conformers min mean max",
      confs.min(), "{:6.2f}".format(confs.mean()), confs.max())

# One molecular weight per molecule (sum of the atomic masses' magnitudes),
# stored as a flat 1-D array rather than a list nested inside another list.
masses = np.array([
    sum(atom.mass.m for atom in molecule.atoms)
    for molecule in molecules
])
print(f'Mean molecular weight: {masses.mean():.2f}')
print(f'Max molecular weight: {masses.max():.2f}')
print("Charges:", sorted({m.total_charge.m for m in molecules}))

Number of unique molecules        51
Number of filtered molecules      0
Number of conformers              51
Number of conformers min mean max 1   1.00 1
Mean molecular weight: 259.38
Max molecular weight: 508.31
Charges: [-1.0, 0.0, 1.0, 2.0]


In [9]:
# Dump the dataset metadata as a plain dict for a visual check before export.
metadata_summary = dataset.metadata.dict()
pprint(metadata_summary)

{'collection_type': 'OptimizationDataset',
 'creation_date': datetime.date(2024, 4, 18),
 'dataset_name': 'OpenFF Torsion Benchmark Supplement Optimization Dataset v1.0',
 'elements': {'H', 'C', 'F', 'Cl', 'Br', 'N', 'S', 'O', 'P'},
 'long_description': 'Additional benchmarking data for Sage 2.2.0 proper '
                     'torsions and new parameters from the torsion '
                     'multiplicity work',
 'long_description_url': HttpUrl('https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2024-04-18-OpenFF-Torsion-Benchmark-Supplement-v1.0', ),
 'short_description': 'B3LYP-D3BJ/DZVP optimization of molecules to increase '
                      'proper torsion testing coverage.',
 'submitter': 'ntBre'}


# Exporting dataset

In [10]:
# Write the submission artifacts into the current working directory:
# the serialized dataset, the molecules in "smi" format, and a PDF overview
# laid out with 8 columns.
dataset.export_dataset("dataset.json.bz2")
dataset.molecules_to_file("dataset.smi", "smi")
dataset.visualize("dataset.pdf", columns=8)

# Show the QC specifications attached to the dataset for the record.
qc_specs = dataset.qc_specifications
print(qc_specs)

{'default': QCSpec(method='B3LYP-D3BJ', basis='DZVP', program='psi4', spec_name='default', spec_description='Standard OpenFF optimization quantum chemistry specification.', store_wavefunction=<WavefunctionProtocolEnum.none: 'none'>, implicit_solvent=None, maxiter=200, scf_properties=[<SCFProperties.Dipole: 'dipole'>, <SCFProperties.Quadrupole: 'quadrupole'>, <SCFProperties.WibergLowdinIndices: 'wiberg_lowdin_indices'>, <SCFProperties.MayerIndices: 'mayer_indices'>], keywords={})}
