In [1]:
import logging
import warnings
from pprint import pprint
from typing import Tuple

import numpy as np
from openeye import oechem
from openff.qcsubmit.common_structures import TorsionIndexer, QCSpec
from openff.qcsubmit.factories import TorsiondriveDatasetFactory
from openff.qcsubmit.workflow_components import StandardConformerGenerator
from openforcefield.topology import Molecule
from tqdm import tqdm

In [2]:
# Warnings that tell us we have undefined stereo and charged molecules
logging.getLogger("openforcefield").setLevel(logging.ERROR)
warnings.simplefilter("ignore")

# Dataset Preperation

Define the SMILES patterns of the molecules to include.

In [3]:
smiles_patterns = [
    # SMILES A
    "COc1ccc(cc1)N" ,
    "c1cc(ccc1N)S" ,
    "c1cc(ccc1C(=O)O)N" ,
    "c1cc(ccc1N)[N+]#N" ,
    "CNc1ccc(cc1)OC" ,
    "CNc1ccc(cc1)S" ,
    "CNc1ccc(cc1)C(=O)O" ,
    "CNc1ccc(cc1)[N+]#N" , "CN(C)c1ccc(cc1)OC" , "CN(C)c1ccc(cc1)S" , "CN(C)c1ccc(cc1)C(=O)O" , "CN(C)c1ccc(cc1)[N+]#N"
    # SMILES B
    "c1cc(ccc1N)[O-]" ,"Cc1ccc(cc1)N" ,"c1cc(ccc1C#N)N" ,"C[N+](C)(C)c1ccc(cc1)N" ,"CNc1ccc(cc1)[O-]" ,"Cc1ccc(cc1)NC" ,"CNc1ccc(cc1)C#N" ,"CNc1ccc(cc1)[N+](C)(C)C" ,"CN(C)c1ccc(cc1)[O-]" ,"Cc1ccc(cc1)N(C)C" ,"CN(C)c1ccc(cc1)C#N" ,"CN(C)c1ccc(cc1)[N+](C)(C)C"
]

Define the SMARTS pattern which will match the aryl amine torsion and impropers.

In [4]:
proper_torsion_smarts = "[*:1]-[#7X3+0:2]-[#6:3]@[#6,#7:4]"
improper_torsion_smarts = "[#7X3+0:1](-[*:3])(-[*:4])-[#6:2]@[#6,#7]"

Define the range of improper and proper angles to scan as well as the increment to scan in.

In [5]:
min_improper = -54
max_improper = 54

min_proper = -150
max_proper = 180

improper_spacing = 6
proper_spacing = 30

Load in the molecules and flag the torsion(s) to be driven.

In [6]:
def find_torsion_indices(input: Molecule, smarts: str) -> Tuple[int, int, int, int]:

    matches = input.chemical_environment_matches(smarts)
    return sorted(matches, key=lambda element: (element[0], element[-1]))[0]

molecules = []

for smiles_pattern in tqdm(smiles_patterns):

    molecule: Molecule = Molecule.from_mapped_smiles(smiles_pattern)

    try:
        molecule.generate_conformers(n_conformers=1)
    except Exception:
        print(f"Skipping {smiles_pattern} - OMEGA error.")
        continue

    # Find the improper torsion
    improper_indices = find_torsion_indices(molecule, improper_torsion_smarts)
    # Find the proper torsion
    proper_indices = find_torsion_indices(molecule, proper_torsion_smarts)

    # Explicitly define that these are the two torsions to drive.
    torsion_indexer = TorsionIndexer()
    torsion_indexer.add_double_torsion(
        improper_indices,
        proper_indices,
        (min_improper, max_improper),
        (min_proper, max_proper),
    )

    molecule.properties["dihedrals"] = torsion_indexer
    molecules.append(molecule)

100%|██████████| 1/1 [00:00<00:00,  2.69it/s]


Prepare the main dataset from the molecule list.

In [7]:
# Monkey patch the factory to let our improper pass through
TorsiondriveDatasetFactory.create_index = lambda self, molecule: molecule.to_smiles(
    isomeric=True, explicit_hydrogens=True, mapped=True
)

dataset_factory = TorsiondriveDatasetFactory(
    grid_spacing=[improper_spacing, proper_spacing]
)

dataset_factory.add_workflow_component(
    StandardConformerGenerator(max_conformers=10, rms_cutoff=0.1, clear_existing=True)
)

dataset = dataset_factory.create_dataset(
    dataset_name="OpenFF Amide Torsion Set v1.0",
    tagline="TODO TODO",
    description="TODO TODO",
    molecules=molecules,
)

dataset.metadata.submitter = "simonboothroyd"
dataset.metadata.long_description_url = (
    "https://github.com/openforcefield/qca-dataset-submission/tree/master/"
    "submissions/"
    "2021-03-29-OpenFF-Aniline-2D-Impropers-v1.0"
)

Deduplication                 : 100%|████████████| 1/1 [00:00<00:00, 606.03it/s]
StandardConformerGenerator    : 100%|█████████████| 1/1 [00:00<00:00,  5.17it/s]
Preparation                   : 100%|█████████████| 1/1 [00:00<00:00, 40.10it/s]


Make sure the molecules in the dataset match the input molecules

In [8]:
old_smiles = {Molecule.from_smiles(smiles).to_smiles(isomeric=False) for smiles in smiles_patterns}
new_smiles = {molecule.to_smiles(isomeric=False) for molecule in dataset.molecules}

assert len(old_smiles.symmetric_difference(new_smiles)) == 0

Describe the molecule in the dataset

In [9]:
confs = np.array([len(mol.conformers) for mol in dataset.molecules])

print("Number of unique molecules       ", dataset.n_molecules)
print("Number of filtered molecules     ", dataset.n_filtered)
print("Number of torsion drives         ", dataset.n_records)
print("Number of conformers min mean max",
      confs.min(), "{:6.2f}".format(confs.mean()), confs.max())

masses = []
for molecule in dataset.molecules:
    oemol = molecule.to_openeye()
    mass = oechem.OECalculateMolecularWeight(oemol)
    masses.append(mass)

print(f'Mean molecular weight: {np.mean(np.array(masses)):.2f}')
print(f'Max molecular weight: {np.max(np.array(masses)):.2f}')
print("Charges:", sorted(set(m.total_charge/m.total_charge.unit for m in dataset.molecules)))

Number of unique molecules        1
Number of filtered molecules      0
Number of torsion drives          1
Number of conformers min mean max 1   1.00 1
Mean molecular weight: 93.13
Max molecular weight: 93.13
Charges: [0.0]


Describe the dataset

In [10]:
pprint(dataset.metadata.dict())

{'collection_type': 'TorsiondriveDataset',
 'creation_date': datetime.date(2021, 3, 29),
 'dataset_name': 'OpenFF Amide Torsion Set v1.0',
 'elements': {'N', 'H', 'C'},
 'long_description': 'TODO TODO',
 'long_description_url': HttpUrl('https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2021-03-29-OpenFF-Aniline-2D-Impropers-v1.0', scheme='https', host='github.com', tld='com', host_type='domain', path='/openforcefield/qca-dataset-submission/tree/master/submissions/2021-03-29-OpenFF-Aniline-2D-Impropers-v1.0'),
 'short_description': 'TODO TODO',
 'submitter': 'simonboothroyd'}


In [11]:
for spec, obj in dataset.qc_specifications.items():
    print("Spec:", spec)
    pprint(obj.dict())

Spec: default
{'basis': 'STO-3G',
 'implicit_solvent': None,
 'method': 'SCF',
 'program': 'psi4',
 'spec_description': 'cheap',
 'spec_name': 'default',
 'store_wavefunction': 'none'}


In [12]:
pprint(dataset.scf_properties)

[<SCFProperties.Dipole: 'dipole'>,
 <SCFProperties.Quadrupole: 'quadrupole'>,
 <SCFProperties.WibergLowdinIndices: 'wiberg_lowdin_indices'>,
 <SCFProperties.MayerIndices: 'mayer_indices'>]


Export the dataset.

In [13]:
dataset.export_dataset("dataset.json.bz2")
dataset.molecules_to_file("dataset.smi", "smi")

dataset.visualize("dataset.pdf", columns=8)