In [1]:
import pickle

In [2]:
tid_clusters_list = pickle.load(open('1st_tid_clusters_list.p','rb'))
tid_clusters_list2 = pickle.load(open('2nd_tid_clusters_list_interst.p','rb'))

In [3]:
count = 0
for tid, clusters in tid_clusters_list.items():
    for cluster in clusters: 
        count +=len(cluster['torsions'])
print(f'# input entries in the 1st batch: {count}')

count = 0
for tid, clusters in tid_clusters_list2.items():
    for cluster in clusters: 
        count +=len(cluster['torsions'])
print(f'# input entries in the second batch: {count}')

# input entries in the 1st batch: 1085
# input entries in the second batch: 164


In [4]:
# copied from https://github.com/openforcefield/qca-dataset-submission/blob/master/submissions/2021-03-23-OpenFF-Amide-Torsion-Set-v1.0/generate-dataset.ipynb
import logging
import warnings
from pprint import pprint

import numpy as np
from openff.qcsubmit.common_structures import TorsionIndexer
from openff.qcsubmit.factories import TorsiondriveDatasetFactory
from openff.qcsubmit.datasets import TorsiondriveDataset, load_dataset
from openff.toolkit.topology import Molecule
from openff.toolkit.utils import UndefinedStereochemistryError
from simtk import unit
from tqdm import tqdm

In [5]:
# Warnings that tell us we have undefined stereo and charged molecules
logging.getLogger("openforcefield").setLevel(logging.ERROR)
warnings.simplefilter("ignore")

In [6]:
def gen_molecules(cluster_list):
    molecules = []
    for tid, clusters in tqdm(cluster_list.items()):
        for cluster in clusters: 
            torsions = cluster['torsions']
            for torsion in torsions: 
                smiles_pattern = torsion['mol_index']
                dihedrals = torsion['indices']

                # RDKit is unable to understand the stereochemistry of these molecules so
                # we skip them.
                if smiles_pattern == "[H]/C(=C(/C([H])([H])[H])\\[H])/C(=C(\\C([H])([H])[H])/[H])/[H]":
                    continue

                # 1. generate off molecule object. 
                try:
                    molecule: Molecule = Molecule.from_smiles(smiles_pattern)
                except UndefinedStereochemistryError:
                    molecule: Molecule = Molecule.from_smiles(smiles_pattern, allow_undefined_stereo=True)
                    molecule = ([molecule] + molecule.enumerate_stereoisomers(max_isomers=1))[-1]

                molecule = molecule.canonical_order_atoms()
                # 2. generate conformers. 
                try:
                    molecule.generate_conformers(n_conformers=10)
                except Exception:
                    print(f"Skipping {smiles_pattern} - OMEGA error.")
                    continue
                # 3. check if the molecule is rdkit compatible. 
                try:
                    molecule.to_rdkit()
                except Exception:
                    print(f"Skipping {smiles_pattern} - RDKit incompatible.")
                    continue

                # 4. add dihedrals info.
                torsion_indexer = TorsionIndexer()
                torsion_indexer.add_torsion(dihedrals, (-165, 180), True)

                molecule.properties["dihedrals"] = torsion_indexer
                molecules.append(molecule)
    return molecules 

In [7]:
# 1st batch 
molecules1 = gen_molecules(tid_clusters_list)

ned_stereo=True): OEMol has unspecified stereochemistry. oemol.GetTitle(): 
Problematic atoms are:
Atom atomic num: 7, name: , idx: 5, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 2, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 3, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 7, name: , idx: 4, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 7, name: , idx: 10, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 4, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 11, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 7, name: , idx: 15, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 7, name: , idx: 6, aromatic: False, chiral: True with bonds:
bond order: 1, chiral:

In [8]:
print(f'len(molecules1) = {len(molecules1)}')

len(molecules1) = 1084


In [9]:
# second batch (molecules generated using additional substituents for higher torsion parameter coverage)
molecules2 = gen_molecules(tid_clusters_list2)

Problematic atoms are:
Atom atomic num: 7, name: , idx: 7, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 16, name: , idx: 4, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 8, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 12, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 7, name: , idx: 4, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 2, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 7, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 16, name: , idx: 8, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 7, name: , idx: 4, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 2, aromatic: False, chiral: False

In [10]:
print(f'len(molecules2) = {len(molecules2)}')

len(molecules2) = 164


In [11]:
# 3rd batch (addition of charged molecules)
import copy 
molecules3 = []
from openeye import oequacpac, oechem

for tid, clusters in tqdm(tid_clusters_list.items()):
    for cluster in clusters: 
        torsions = cluster['torsions']
        for torsion in torsions: 
            smiles_pattern = torsion['mol_index']
            dihedrals = torsion['indices']

            # RDKit is unable to understand the stereochemistry of these molecules so
            # we skip them.
            if smiles_pattern == "[H]/C(=C(/C([H])([H])[H])\\[H])/C(=C(\\C([H])([H])[H])/[H])/[H]":
                continue
            try:
                molecule: Molecule = Molecule.from_smiles(smiles_pattern)
            except UndefinedStereochemistryError:
                molecule: Molecule = Molecule.from_smiles(smiles_pattern, allow_undefined_stereo=True)
                molecule = ([molecule] + molecule.enumerate_stereoisomers(max_isomers=1))[-1]

            molecule = molecule.canonical_order_atoms()

            try:
                molecule.generate_conformers(n_conformers=1)
            except Exception:
                print(f"Skipping {smiles_pattern} - OMEGA error.")
                continue
            try:
                molecule.to_rdkit()
            except Exception:
                print(f"Skipping {smiles_pattern} - RDKit incompatible.")
                continue

            # map atoms
            atom_map = {atomidx:idx+1 for idx, atomidx in enumerate(dihedrals)}
            oemol = molecule.to_openeye()
            if oechem.OENetCharge(oemol) == 0: 
                for oeatom in oemol.GetAtoms():
                    oe_index = oeatom.GetIdx()
                    if oe_index in atom_map:
                        oeatom.SetMapIdx(atom_map[oe_index])

                options = oequacpac.OEFormalChargeOptions()
                # add one as the input is included
                options.SetMaxCount(5)
                for protomer in oequacpac.OEEnumerateFormalCharges(oemol, options):
                    charge = oechem.OENetCharge(protomer)
                    if charge !=0.0:
                        try:
                            off_protomer = Molecule.from_openeye(protomer)
                            dihedrals = [k for k, v in sorted(off_protomer.properties['atom_map'].items(), key=lambda item: item[1])]
                        except UndefinedStereochemistryError:
                            # off_protomer = Molecule.from_openeye(protomer, allow_undefined_stereo=True)
                            # off_protomer = ([off_protomer] + off_protomer.enumerate_stereoisomers(max_isomers=1))[-1]
                            print(f"Skipping {off_protomer.to_smiles()} - UndefinedStereochemistryError.")
                            continue

                        try:
                            off_protomer.generate_conformers(n_conformers=10)
                        except Exception:
                            print(f"Skipping {off_protomer.to_smiles()} - OMEGA error.")
                            continue

                        # dihedrals =   list(off_protomer.properties['atom_map'].keys())
                        # dihedrals = [k for k, v in sorted(off_protomer.properties['atom_map'].items(), key=lambda item: item[1])]
                        if len(dihedrals) == 4: 
                            torsion_indexer = TorsionIndexer()
                            torsion_indexer.add_torsion(dihedrals, (-165, 180), True)

                            off_protomer.properties["dihedrals"] = torsion_indexer
                            off_protomer_copy = copy.deepcopy(off_protomer)
                            del off_protomer_copy.properties['atom_map']
                            molecules3 .append(off_protomer_copy)                       

l: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 2, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 7, name: , idx: 3, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 7, name: , idx: 7, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 4, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 8, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 7, name: , idx: 12, aromatic: False, chiral: False

Skipping [H]C([H])([H])C([H])(C([H])([H])[H])[N+]([H])([H])N([H])S(=O)(=O)C([H])([H])[H] - UndefinedStereochemistryError.
Skipping [H]C([H])([H])C([H])(C([H])([H])[H])[N+]([H])([H])N([H])S(=O)(=O)C([H])([H])[H] - UndefinedStereochemistryError.
Problematic atoms are:
Atom atomic num: 7, name: , idx: 4, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num

In [12]:
print(f'len(molecules1) = {len(molecules1)}')
print(f'len(molecules2) = {len(molecules2)}')
print(f'len(molecules3) = {len(molecules3)}')

len(molecules1) = 1084
len(molecules2) = 164
len(molecules3) = 383


In [13]:
molecules = list(molecules1 +molecules2 + molecules3)
print(f'len(molecules) = {len(molecules)}')

len(molecules) = 1631


In [14]:
dataset_factory = TorsiondriveDatasetFactory()

dataset = dataset_factory.create_dataset(
    dataset_name="OpenFF Gen3 Torsion Set v1.1",
    tagline="OpenFF Gen3 Torsion Set v1.1",
    description="This dataset is a simple-molecule-only torsiondrive dataset, aiming to avoid issue of torsion parameter contamination by large internal non-bonded interactions during a valece parameter optimization. Molecules with one effective rotating bond were generate by combining two simple substituents, which were identified by fragmenting small drug like molecules. Torsions from the generated molecule set were selected using clustering method, in a way that the dataset can allow a chemical diversity of molecules training each torsion parameter.",
    molecules=molecules,
)

dataset.metadata.submitter = "hyesujang"
dataset.metadata.long_description_url = (
    "https://github.com/openforcefield/qca-dataset-submission/tree/master/"
    "submissions/"
    "2021-05-07-OpenFF-Gen3-Torsion-Set-v1.1"
)


Deduplication                 : 100%|██████| 1631/1631 [00:04<00:00, 388.12it/s]
Problematic atoms are:
Atom atomic num: 7, name: , idx: 10, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 4, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 11, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 15, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 7, name: , idx: 10, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 4, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 11, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 15, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 7, name: , idx: 10, aromatic: False, chiral: True with bonds:
bond order: 1

In [15]:
confs = np.array([len(mol.conformers) for mol in dataset.molecules])
masses = []
for molecule in dataset.molecules:
    mass = sum(atom.mass.value_in_unit(unit.dalton) for atom in molecule.atoms)
    masses.append(mass)
charges = sorted(set(m.total_charge/m.total_charge.unit for m in dataset.molecules))

nd order: 1, chiral: False to atom atomic num: 8, name: , idx: 5, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 16, name: , idx: 10, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 7, name: , idx: 10, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 16, name: , idx: 2, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 11, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 15, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 7, name: , idx: 10, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 16, name: , idx: 2, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 11, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 15, aromatic: False, chiral: Fa

In [16]:
print("Number of unique molecules       ", dataset.n_molecules)
print("Number of filtered molecules     ", dataset.n_filtered)
print("Number of torsion drives         ", dataset.n_records)
print("Number of conformers min mean max",
      confs.min(), "{:6.2f}".format(confs.mean()), confs.max())

print(f'Mean molecular weight: {np.mean(np.array(masses)):.2f}')
print(f'Max molecular weight: {np.max(np.array(masses)):.2f}')
print("Charges:", charges)

Number of unique molecules        1352
Number of filtered molecules      0
Number of torsion drives          1353
Number of conformers min mean max 1   2.53 12
Mean molecular weight: 133.56
Max molecular weight: 433.68
Charges: [-2.0, -1.0, 0.0, 1.0, 2.0]


In [17]:
pprint(dataset.metadata.dict())

{'collection_type': 'TorsiondriveDataset',
 'creation_date': datetime.date(2021, 5, 7),
 'dataset_name': 'OpenFF Gen3 Torsion Set v1.1',
 'elements': {'C', 'S', 'H', 'N', 'F', 'Cl', 'Br', 'P', 'O'},
 'long_description': 'This dataset is a simple-molecule-only torsiondrive '
                     'dataset, aiming to avoid issue of torsion parameter '
                     'contamination by large internal non-bonded interactions '
                     'during a valece parameter optimization. Molecules with '
                     'one effective rotating bond were generate by combining '
                     'two simple substituents, which were identified by '
                     'fragmenting small drug like molecules. Torsions from the '
                     'generated molecule set were selected using clustering '
                     'method, in a way that the dataset can allow a chemical '
                     'diversity of molecules training each torsion parameter.',
 'long_description_

In [18]:
dataset.priority = 'high'

In [19]:
for spec, obj in dataset.qc_specifications.items():
    print("Spec:", spec)
    pprint(obj.dict())

Spec: default
{'basis': 'DZVP',
 'implicit_solvent': None,
 'method': 'B3LYP-D3BJ',
 'program': 'psi4',
 'spec_description': 'Standard OpenFF optimization quantum chemistry '
                     'specification.',
 'spec_name': 'default',
 'store_wavefunction': 'none'}


In [20]:
pprint(dataset.scf_properties)

[<SCFProperties.Dipole: 'dipole'>,
 <SCFProperties.Quadrupole: 'quadrupole'>,
 <SCFProperties.WibergLowdinIndices: 'wiberg_lowdin_indices'>,
 <SCFProperties.MayerIndices: 'mayer_indices'>]


In [21]:
dataset.export_dataset("dataset-expansion-1.json.bz2")
dataset.molecules_to_file("dataset-expansion-1.smi", "smi")

dataset.visualize("dataset-expansion-1.pdf", columns=8)

Problematic atoms are:
Atom atomic num: 7, name: , idx: 10, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 4, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 11, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 15, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 7, name: , idx: 10, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 4, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 11, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 15, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 7, name: , idx: 10, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 4, aromatic: False, chiral: F