In [1]:
import pickle
# Load the pickled mapping whose values are lists of clusters; each cluster is
# a dict carrying a 'torsions' list (see how it is iterated in the cells below).
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so
# only run this against a 'tid_clusters_list.p' file from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open('tid_clusters_list.p', 'rb') as pickle_file:
    tid_clusters_list = pickle.load(pickle_file)

In [2]:
# Tally the torsion entries held by every cluster under every key of the
# loaded mapping, so the input size can be compared with what survives later.
count = sum(
    len(cluster['torsions'])
    for clusters in tid_clusters_list.values()
    for cluster in clusters
)
print(f'# input entries: {count}')

# input entries: 1085


In [3]:
# copied from https://github.com/openforcefield/qca-dataset-submission/blob/master/submissions/2021-03-23-OpenFF-Amide-Torsion-Set-v1.0/generate-dataset.ipynb
import logging
import warnings
from pprint import pprint

import numpy as np
from openff.qcsubmit.common_structures import TorsionIndexer
from openff.qcsubmit.factories import TorsiondriveDatasetFactory
from openforcefield.topology import Molecule
from openforcefield.utils import UndefinedStereochemistryError
from simtk import unit
from tqdm import tqdm

  "implicitly coercing SELECT object to scalar subquery; "


In [4]:
# The toolkit warns about undefined stereochemistry and charged molecules;
# only let messages of level ERROR or above through its logger.
openff_logger = logging.getLogger("openforcefield")
openff_logger.setLevel(logging.ERROR)
# Additionally install a blanket "ignore" filter for Python warnings.
warnings.simplefilter(action="ignore")

In [5]:
# RDKit is unable to understand the stereochemistry of this molecule, so any
# torsion entry that refers to it is dropped.
_UNSUPPORTED_SMILES = "[H]/C(=C(/C([H])([H])[H])\\[H])/C(=C(\\C([H])([H])[H])/[H])/[H]"


def _molecule_for_torsion(entry):
    """Turn one torsion entry into a molecule ready for dataset creation.

    Parameters
    ----------
    entry : dict
        A torsion record; ``entry['mol_index']`` is passed to
        ``Molecule.from_smiles`` and ``entry['indices']`` is registered as the
        torsion to drive.

    Returns
    -------
    Molecule or None
        The canonically ordered molecule with conformers and a ``"dihedrals"``
        property attached, or ``None`` when the entry has to be skipped (the
        unsupported SMILES, conformer generation failure, or failed RDKit
        conversion).
    """
    smiles = entry['mol_index']
    atom_indices = entry['indices']

    if smiles == _UNSUPPORTED_SMILES:
        return None

    try:
        mol = Molecule.from_smiles(smiles)
    except UndefinedStereochemistryError:
        # Re-parse permissively, then take the last element of
        # [as-parsed molecule, *enumerated isomers]: the single enumerated
        # stereoisomer when one exists, otherwise the as-parsed molecule.
        mol = Molecule.from_smiles(smiles, allow_undefined_stereo=True)
        mol = ([mol] + mol.enumerate_stereoisomers(max_isomers=1))[-1]

    # NOTE(review): the atoms are re-ordered here, while `atom_indices` comes
    # straight from the pickle - confirm those indices were generated against
    # the canonical ordering, otherwise they may point at different atoms.
    mol = mol.canonical_order_atoms()

    try:
        mol.generate_conformers(n_conformers=10)
    except Exception:
        print(f"Skipping {smiles} - OMEGA error.")
        return None

    # Conversion is attempted purely as a compatibility check; the result is discarded.
    try:
        mol.to_rdkit()
    except Exception:
        print(f"Skipping {smiles} - RDKit incompatible.")
        return None

    # Presumably (-165, 180) is the scan range in degrees and True an
    # overwrite flag - confirm against the TorsionIndexer API.
    indexer = TorsionIndexer()
    indexer.add_torsion(atom_indices, (-165, 180), True)
    mol.properties["dihedrals"] = indexer
    return mol


# One molecule object is collected per surviving torsion entry.
molecules = []

for clusters in tqdm(tid_clusters_list.values()):
    for cluster in clusters:
        for torsion in cluster['torsions']:
            prepared = _molecule_for_torsion(torsion)
            if prepared is not None:
                molecules.append(prepared)

100%|██████████| 167/167 [03:23<00:00,  1.22s/it]


In [6]:
# Number of torsion entries that made it through SMILES parsing, conformer
# generation and the RDKit conversion check in the loop above.
len(molecules)

1084

In [7]:
# The same text is used for both the dataset name and its tagline.
DATASET_NAME = "OpenFF Gen3 Torsion Set v1.0"

# NOTE(review): the description below contains typos ("valece",
# "were generate"); it is reproduced verbatim because it is the text attached
# to the dataset metadata.
DATASET_DESCRIPTION = (
    "This dataset is a simple-molecule-only torsiondrive dataset, aiming to "
    "avoid issue of torsion parameter contamination by large internal "
    "non-bonded interactions during a valece parameter optimization. "
    "Molecules with one effective rotating bond were generate by combining "
    "two simple substituents, which were identified by fragmenting small "
    "drug like molecules. Torsions from the generated molecule set were "
    "selected using clustering method, in a way that the dataset can allow "
    "a chemical diversity of molecules training each torsion parameter."
)

# A factory with default settings; no workflow components are registered on it.
dataset_factory = TorsiondriveDatasetFactory()

dataset = dataset_factory.create_dataset(
    dataset_name=DATASET_NAME,
    tagline=DATASET_NAME,
    description=DATASET_DESCRIPTION,
    molecules=molecules,
)

# Provenance: who submitted the dataset and where its submission folder lives.
dataset.metadata.submitter = "hyesujang"
dataset.metadata.long_description_url = (
    "https://github.com/openforcefield/qca-dataset-submission/"
    "tree/master/submissions/"
    "2021-04-09-OpenFF-Gen3-Torsion-Set-v1.0"
)

Deduplication                 : 100%|██████| 1084/1084 [00:03<00:00, 353.57it/s]
Preparation                   : 100%|█████████| 887/887 [00:14<00:00, 59.27it/s]


In [8]:
# Conformer count of every molecule held by the dataset.
confs = np.array([len(entry.conformers) for entry in dataset.molecules])

# Headline counts followed by the conformer-count spread.
for label, value in (
    ("Number of unique molecules       ", dataset.n_molecules),
    ("Number of filtered molecules     ", dataset.n_filtered),
    ("Number of torsion drives         ", dataset.n_records),
):
    print(label, value)
print("Number of conformers min mean max",
      confs.min(), f"{confs.mean():6.2f}", confs.max())

# Molecular weight of each molecule: the sum of its atomic masses in daltons.
masses = [
    sum(atom.mass.value_in_unit(unit.dalton) for atom in entry.atoms)
    for entry in dataset.molecules
]
mass_array = np.array(masses)

print(f'Mean molecular weight: {mass_array.mean():.2f}')
print(f'Max molecular weight: {mass_array.max():.2f}')

# Dividing by the quantity's own unit leaves a plain number for each charge.
distinct_charges = {
    entry.total_charge / entry.total_charge.unit for entry in dataset.molecules
}
print("Charges:", sorted(distinct_charges))

Number of unique molecules        887
Number of filtered molecules      0
Number of torsion drives          888
Number of conformers min mean max 1   2.61 12
Mean molecular weight: 131.36
Max molecular weight: 433.68
Charges: [0.0, 1.0]


In [9]:
# Pretty-print the dataset metadata (name, description, submitter, URL, ...)
# as a plain dict so it can be reviewed before export.
pprint(dataset.metadata.dict())

{'collection_type': 'TorsiondriveDataset',
 'creation_date': datetime.date(2021, 4, 20),
 'dataset_name': 'OpenFF Gen3 Torsion Set v1.0',
 'elements': {'P', 'H', 'S', 'N', 'O', 'Cl', 'Br', 'C', 'F'},
 'long_description': 'This dataset is a simple-molecule-only torsiondrive '
                     'dataset, aiming to avoid issue of torsion parameter '
                     'contamination by large internal non-bonded interactions '
                     'during a valece parameter optimization. Molecules with '
                     'one effective rotating bond were generate by combining '
                     'two simple substituents, which were identified by '
                     'fragmenting small drug like molecules. Torsions from the '
                     'generated molecule set were selected using clustering '
                     'method, in a way that the dataset can allow a chemical '
                     'diversity of molecules training each torsion parameter.',
 'long_description

In [10]:
# Mark the dataset as 'high' priority - presumably the compute priority used
# once it is submitted; confirm the accepted values against the dataset API.
dataset.priority = 'high'

In [11]:
# Show each registered QC specification: its key, then its settings as a dict.
for spec_name, spec_settings in dataset.qc_specifications.items():
    print("Spec:", spec_name)
    pprint(spec_settings.dict())

Spec: default
{'basis': 'DZVP',
 'implicit_solvent': None,
 'method': 'B3LYP-D3BJ',
 'program': 'psi4',
 'spec_description': 'Standard OpenFF optimization quantum chemistry '
                     'specification.',
 'spec_name': 'default',
 'store_wavefunction': 'none'}


In [12]:
# List the SCF properties currently configured on the dataset.
pprint(dataset.scf_properties)

[<SCFProperties.Dipole: 'dipole'>,
 <SCFProperties.Quadrupole: 'quadrupole'>,
 <SCFProperties.WibergLowdinIndices: 'wiberg_lowdin_indices'>,
 <SCFProperties.MayerIndices: 'mayer_indices'>]


In [13]:
# Write the dataset artefacts into the current working directory:
# the dataset itself as bz2-compressed JSON, ...
dataset.export_dataset("dataset.json.bz2")
# ... its molecules in "smi" format, ...
dataset.molecules_to_file("dataset.smi", "smi")

# ... and a PDF visualisation laid out with 8 columns.
dataset.visualize("dataset.pdf", columns=8)