In [1]:
import pickle
tid_clusters_list1 = pickle.load(open('1st_tid_clusters_list.p','rb'))
tid_clusters_list2 = pickle.load(open('2nd_tid_clusters_list_interst.p','rb'))

In [2]:
import logging
import warnings
from pprint import pprint

import numpy as np
from openff.qcsubmit.factories import OptimizationDatasetFactory
from openff.qcsubmit.workflow_components import EnumerateStereoisomers, StandardConformerGenerator, EnumerateProtomers, EnumerateTautomers
from openff.toolkit.topology import Molecule
from openff.toolkit.utils import UndefinedStereochemistryError
from simtk import unit
from tqdm import tqdm

In [3]:
logging.getLogger("openforcefield").setLevel(logging.ERROR)
warnings.simplefilter("ignore")

In [4]:
def gen_molecules(cluster_list):
    molecules = []
    for tid, clusters in tqdm(cluster_list.items()):
        for cluster in clusters: 
            torsions = cluster['torsions']
            for torsion in torsions: 
                smiles_pattern = torsion['mol_index']
                
                try:
                    molecule: Molecule = Molecule.from_smiles(smiles_pattern)
                except UndefinedStereochemistryError:
                    molecule: Molecule = Molecule.from_smiles(smiles_pattern, allow_undefined_stereo=True)
                    molecule = ([molecule] + molecule.enumerate_stereoisomers(max_isomers=1))[-1]

                molecule = molecule.canonical_order_atoms()

                try:
                    molecule.generate_conformers(n_conformers=1)
                except Exception:
                    print(f"Skipping {smiles_pattern} - OMEGA error.")
                    continue
                try:
                    molecule.to_rdkit()
                except Exception:
                    print(f"Skipping {smiles_pattern} - RDKit incompatible.")
                    continue

                molecules.append(molecule)
    return molecules

In [5]:
molecules1 = gen_molecules(tid_clusters_list1)

ned_stereo=True): OEMol has unspecified stereochemistry. oemol.GetTitle(): 
Problematic atoms are:
Atom atomic num: 7, name: , idx: 5, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 2, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 3, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 7, name: , idx: 4, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 7, name: , idx: 10, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 4, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 11, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 7, name: , idx: 15, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 7, name: , idx: 6, aromatic: False, chiral: True with bonds:
bond order: 1, chiral:

In [6]:
molecules2 = gen_molecules(tid_clusters_list2)

Problematic atoms are:
Atom atomic num: 7, name: , idx: 7, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 16, name: , idx: 4, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 8, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 12, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 7, name: , idx: 4, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 2, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 7, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 16, name: , idx: 8, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 7, name: , idx: 4, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 2, aromatic: False, chiral: False

In [7]:
print(f'len(molecules1): {len(molecules1)}')
print(f'len(molecules2): {len(molecules2)}')
molecules = list(molecules1 + molecules2)
print(f'len(molecules): {len(molecules)}')

len(molecules1): 1085
len(molecules2): 164
len(molecules): 1249


In [8]:
import pickle
pickle.dump(molecules, open('input-mols.p','wb'))

In [9]:
# import pickle
# molecules = pickle.load(open('input-mols.p','rb'))

In [10]:
dataset_factory = OptimizationDatasetFactory()
dataset_factory.add_workflow_components(
    EnumerateProtomers(max_states=10), EnumerateTautomers(max_tautomers=10), EnumerateStereoisomers(max_isomers=10), StandardConformerGenerator(max_conformers=10, rms_cutoff=0.1, clear_existing=True)
)
dataset = dataset_factory.create_dataset(
    dataset_name="OpenFF Gen3 Optimization Set v1.0",
    tagline="OpenFF Gen3 Torsion Set v1.0",
    description="This dataset is a simple-molecule-only optimization dataset. The input molecules are those being scanned in OpenFF Gen3 Torsion Set",
    molecules=molecules,
)

dataset.metadata.submitter = "hyesujang"
dataset.metadata.long_description_url = (
    "https://github.com/openforcefield/qca-dataset-submission/tree/master/"
    "submissions/"
    "2021-05-12-OpenFF-Gen3-Optimization-Set-v1.0"
)

emistry. oemol.GetTitle(): 
Problematic atoms are:
Atom atomic num: 7, name: , idx: 4, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 3, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 5, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 9, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 7, name: , idx: 7, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 1, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 5, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 8, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 7, name: , idx: 12, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 2, arom

In [11]:
confs = np.array([len(mol.conformers) for mol in dataset.molecules])
masses = []
for molecule in dataset.molecules:
    mass = sum(atom.mass.value_in_unit(unit.dalton) for atom in molecule.atoms)
    masses.append(mass)
charges = sorted(set(m.total_charge/m.total_charge.unit for m in dataset.molecules))

 order: 1, chiral: False to atom atomic num: 6, name: , idx: 8, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 7, name: , idx: 12, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 7, name: , idx: 7, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 4, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 8, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 7, name: , idx: 12, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 7, name: , idx: 10, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 7, name: , idx: 7, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 11, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 15, aromatic: False, chiral: False

Pr

In [12]:
print("Number of unique molecules       ", dataset.n_molecules)
print("Number of filtered molecules     ", dataset.n_filtered)
print("Number of records                ", dataset.n_records)
print("Number of conformers min mean max",
      confs.min(), "{:6.2f}".format(confs.mean()), confs.max())

print(f'Mean molecular weight: {np.mean(np.array(masses)):.2f}')
print(f'Max molecular weight: {np.max(np.array(masses)):.2f}')
print("Charges:", charges)

Problematic atoms are:
Atom atomic num: 7, name: , idx: 8, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 7, name: , idx: 6, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 9, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 13, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 7, name: , idx: 5, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 7, name: , idx: 3, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 6, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 10, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 7, name: , idx: 10, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 4, aromatic: False, chiral: False

In [13]:
pprint(dataset.metadata.dict())

{'collection_type': 'OptimizationDataset',
 'creation_date': datetime.date(2021, 5, 12),
 'dataset_name': 'OpenFF Gen3 Optimization Set v1.0',
 'elements': {'F', 'P', 'N', 'S', 'Br', 'H', 'Cl', 'O', 'C'},
 'long_description': 'This dataset is a simple-molecule-only optimization '
                     'dataset. The input molecules are those being scanned in '
                     'OpenFF Gen3 Torsion Set',
 'long_description_url': HttpUrl('https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2021-05-12-OpenFF-Gen3-Optimization-Set-v1.0', scheme='https', host='github.com', tld='com', host_type='domain', path='/openforcefield/qca-dataset-submission/tree/master/submissions/2021-05-12-OpenFF-Gen3-Optimization-Set-v1.0'),
 'short_description': 'OpenFF Gen3 Torsion Set v1.0',
 'submitter': 'hyesujang'}


In [14]:
dataset.priority = 'high'

In [15]:
for spec, obj in dataset.qc_specifications.items():
    print("Spec:", spec)
    pprint(obj.dict())

Spec: default
{'basis': 'DZVP',
 'implicit_solvent': None,
 'method': 'B3LYP-D3BJ',
 'program': 'psi4',
 'spec_description': 'Standard OpenFF optimization quantum chemistry '
                     'specification.',
 'spec_name': 'default',
 'store_wavefunction': 'none'}


In [16]:
dataset.export_dataset("dataset.json.bz2")
dataset.molecules_to_file("dataset.smi", "smi")

dataset.visualize("dataset.pdf", columns=8)

Problematic atoms are:
Atom atomic num: 7, name: , idx: 10, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 4, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 11, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 15, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 7, name: , idx: 10, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 4, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 11, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 15, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 7, name: , idx: 10, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 4, aromatic: False, chiral: F