In [1]:
import pickle

# Load the pickled input from 'tid_clusters_list.p'. The file is opened in a
# `with` block so the handle is closed as soon as the data has been read.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only run this cell on a file from a trusted source.
with open('tid_clusters_list.p', 'rb') as pickle_file:
    data = pickle.load(pickle_file)

In [2]:
# Pull out the 'core_selected' entry of the loaded data; the molecule-building
# loop further down iterates over it with .items().
selected = data['core_selected']

In [29]:
# copied from https://github.com/openforcefield/qca-dataset-submission/blob/master/submissions/2021-03-23-OpenFF-Amide-Torsion-Set-v1.0/generate-dataset.ipynb
import logging
import warnings
from pprint import pprint

import numpy as np
from openeye import oechem
from openff.qcsubmit.common_structures import TorsionIndexer
from openff.qcsubmit.factories import TorsiondriveDatasetFactory
from openff.qcsubmit.workflow_components import StandardConformerGenerator, EnumerateStereoisomers, EnumerateProtomers
from openforcefield.topology import Molecule
from tqdm import tqdm

In [4]:
# Quieten the run: only ERROR-level (and above) records from the
# "openforcefield" logger are kept, and every Python warning is ignored.
# The messages being hidden are the ones about undefined stereochemistry
# and charged molecules.
openff_logger = logging.getLogger("openforcefield")
openff_logger.setLevel(logging.ERROR)
warnings.simplefilter(action="ignore")

In [17]:
# Build one Molecule per selected torsion, each carrying the dihedral to be
# driven. `molecules` is later passed to the dataset factory.
molecules = []

for tid, clusters in tqdm(selected.items()):
    for cluster in clusters:
        torsions = cluster['torsions']
        for torsion in torsions:
            # The 'mol_index' entry is handed straight to Molecule.from_smiles,
            # i.e. it is used as a SMILES string here.
            smiles_pattern = torsion['mol_index']
            dihedrals = torsion['indices']
            molecule: Molecule = Molecule.from_smiles(smiles_pattern)

            try:
                molecule.generate_conformers(n_conformers=1)
            except Exception as error:
                # Best-effort: a molecule without a conformer is skipped rather
                # than aborting the whole run, but the underlying exception is
                # reported so that unrelated failures are not hidden behind the
                # generic "OMEGA error" label.
                print(
                    f"Skipping {smiles_pattern} - OMEGA error "
                    f"({type(error).__name__}: {error})."
                )
                continue

            torsion_indexer = TorsionIndexer()
            # (-165, 180) is passed as the second argument for this torsion.
            # NOTE(review): presumably the scan range in degrees, with the
            # trailing True being an overwrite flag - confirm against the
            # installed openff-qcsubmit version.
            torsion_indexer.add_torsion(dihedrals, (-165, 180), True)

            # Attach the indexer to the molecule under the "dihedrals" property.
            molecule.properties["dihedrals"] = torsion_indexer
            molecules.append(molecule)

 54%|█████▍    | 90/167 [03:13<01:08,  1.12it/s]Skipping C[C@H](C=C=C)N=N#N - OMEGA error.
 73%|███████▎  | 122/167 [04:17<01:50,  2.45s/it]Skipping C1CS(=O)(=O)CCC1[P-](F)(F)(F)(F)F - OMEGA error.
Skipping C1[C@@H](CCC(C1)[P-](F)(F)(F)(F)F)O - OMEGA error.
Skipping C([P-](F)(F)(F)(F)F)S(=O)(=O)N - OMEGA error.
Skipping C(C(=O)N)[P-](F)(F)(F)(F)F - OMEGA error.
Skipping CN(C)C[P-](F)(F)(F)(F)F - OMEGA error.
Skipping C(N=[N+]=[N-])[P-](F)(F)(F)(F)F - OMEGA error.
Skipping C1CC(=O)CCC1[P-](F)(F)(F)(F)F - OMEGA error.
Skipping C1CSC[C@@H]1[P-](F)(F)(F)(F)F - OMEGA error.
 74%|███████▎  | 123/167 [04:20<01:57,  2.66s/it]Skipping C(=[NH2+])[P-](F)(F)(F)(F)F - OMEGA error.
Skipping c1c(cc(cc1F)[P-](F)(F)(F)(F)F)O - OMEGA error.
 75%|███████▍  | 125/167 [04:24<01:42,  2.43s/it]Skipping N#N=NOO - OMEGA error.
Skipping C(ON=N#N)F - OMEGA error.
Skipping CC(=O)ON=N#N - OMEGA error.
 83%|████████▎ | 138/167 [04:47<00:49,  1.72s/it]Skipping C=NN=N#N - OMEGA error.
Skipping N#N=NN=S=O - OMEGA erro

In [22]:
# The same title is used for both the dataset name and its tagline.
dataset_title = "OpenFF Gen3 Torsion Set v1.0"

# Workflow components, registered on the factory in this order.
workflow_components = [
    StandardConformerGenerator(max_conformers=10, rms_cutoff=0.1, clear_existing=True),
    EnumerateStereoisomers(max_isomers=10),
]

dataset_factory = TorsiondriveDatasetFactory()
for component in workflow_components:
    dataset_factory.add_workflow_component(component)

# Run the factory over the molecules collected earlier in the notebook.
dataset = dataset_factory.create_dataset(
    molecules=molecules,
    dataset_name=dataset_title,
    tagline=dataset_title,
    description=(
        "This dataset is a simple-molecule-only dataset, "
        "a candidate of Sage torsion parameter training set"
    ),
)

# Submission metadata: link to the submission folder and the submitter name.
submission_url = "/".join(
    (
        "https://github.com/openforcefield/qca-dataset-submission/tree/master",
        "submissions",
        "2021-04-09-OpenFF-Gen3-Torsion-Set-v1.0",
    )
)
dataset.metadata.long_description_url = submission_url
dataset.metadata.submitter = "hyesujang"

Deduplication                 : 100%|██████| 2395/2395 [00:05<00:00, 426.43it/s]
StandardConformerGenerator    : 100%|███████| 1864/1864 [01:35<00:00, 19.54it/s]
EnumerateStereoisomers        : 100%|███████| 1864/1864 [03:37<00:00,  8.57it/s]
Preparation                   : 100%|███████| 2556/2556 [00:57<00:00, 44.67it/s]


In [23]:
# Summary statistics of the prepared dataset.

# Conformer count of every molecule in the dataset.
confs = np.array([len(entry.conformers) for entry in dataset.molecules])

for label, value in (
    ("Number of unique molecules       ", dataset.n_molecules),
    ("Number of filtered molecules     ", dataset.n_filtered),
    ("Number of torsion drives         ", dataset.n_records),
):
    print(label, value)
print("Number of conformers min mean max",
      confs.min(), f"{confs.mean():6.2f}", confs.max())

# Molecular weight of each molecule, computed by OpenEye on the converted molecule.
masses = [
    oechem.OECalculateMolecularWeight(entry.to_openeye())
    for entry in dataset.molecules
]
mass_array = np.array(masses)

print(f'Mean molecular weight: {mass_array.mean():.2f}')
print(f'Max molecular weight: {mass_array.max():.2f}')

# Distinct total charges, with the unit divided out, in ascending order.
unique_charges = {
    entry.total_charge / entry.total_charge.unit for entry in dataset.molecules
}
print("Charges:", sorted(unique_charges))

Number of unique molecules        2433
Number of filtered molecules      167
Number of torsion drives          4684
Number of conformers min mean max 1   1.05 4
Mean molecular weight: 154.34
Max molecular weight: 514.56
Charges: [-1.0, 0.0, 1.0, 2.0]


In [24]:
# Pretty-print the dataset metadata as a plain dict so it can be checked by eye.
pprint(dataset.metadata.dict())

{'collection_type': 'TorsiondriveDataset',
 'creation_date': datetime.date(2021, 4, 9),
 'dataset_name': 'OpenFF Gen3 Torsion Set v1.0',
 'elements': {'I', 'H', 'O', 'F', 'P', 'Cl', 'N', 'S', 'C', 'Br'},
 'long_description': 'This dataset is a simple-molecule-only dataset, a '
                     'candidate of Sage torsion parameter training set',
 'long_description_url': HttpUrl('https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2021-04-09-OpenFF-Gen3-Torsion-Set-v1.0', scheme='https', host='github.com', tld='com', host_type='domain', path='/openforcefield/qca-dataset-submission/tree/master/submissions/2021-04-09-OpenFF-Gen3-Torsion-Set-v1.0'),
 'short_description': 'OpenFF Gen3 Torsion Set v1.0',
 'submitter': 'hyesujang'}


In [25]:
# Show each QC specification attached to the dataset: its key first, then its
# contents as a pretty-printed dict.
for spec_name, qc_spec in dataset.qc_specifications.items():
    print("Spec:", spec_name)
    pprint(qc_spec.dict())

Spec: default
{'basis': 'DZVP',
 'implicit_solvent': None,
 'method': 'B3LYP-D3BJ',
 'program': 'psi4',
 'spec_description': 'Standard OpenFF optimization quantum chemistry '
                     'specification.',
 'spec_name': 'default',
 'store_wavefunction': 'none'}


In [26]:
# Pretty-print the SCF properties configured on the dataset.
pprint(dataset.scf_properties)

[<SCFProperties.Dipole: 'dipole'>,
 <SCFProperties.Quadrupole: 'quadrupole'>,
 <SCFProperties.WibergLowdinIndices: 'wiberg_lowdin_indices'>,
 <SCFProperties.MayerIndices: 'mayer_indices'>]


In [27]:
# Write the submission artefacts to the current working directory:
# the dataset itself to "dataset.json.bz2" ...
dataset.export_dataset("dataset.json.bz2")
# ... the molecules to "dataset.smi" using the "smi" file format ...
dataset.molecules_to_file("dataset.smi", "smi")

# ... and a visualisation of the dataset to "dataset.pdf" (columns=8).
dataset.visualize("dataset.pdf", columns=8)