In [1]:
import gzip
from pprint import pprint

import numpy as np
from openff.qcsubmit.common_structures import QCSpec, DriverEnum, SCFProperties
from openff.qcsubmit.factories import BasicDatasetFactory
from openff.toolkit.topology import Molecule, Topology
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, Draw
from tqdm import tqdm

In [2]:
# Show the version information of the installed OpenFF Toolkit; the returned
# dictionary is displayed as the cell output.
# NOTE(review): `_version` is a private module — presumably
# `openff.toolkit.__version__` is the public equivalent; confirm before switching.
from openff.toolkit._version import get_versions
get_versions()

{'date': '2023-02-27T12:08:52-0800',
 'dirty': False,
 'error': None,
 'full-revisionid': 'e05c876611e40067575bda9c8c47022570e3c508',
 'version': '0.12.1'}

### Load file

In [4]:
# Load the filtered torsion-scan records for each entry in `names` and convert
# every RDKit molecule that was read into an OpenFF Molecule.
names = ['a', 'c', 'g', 'u']
offmols = []
for name in names:
    # Open the gzip stream in a context manager so the file handle is closed as
    # soon as the supplier has been read (it was previously left open).
    with gzip.open(f'./sdf/torsion_scan_{name}_filtered.sdf.gz') as inf:
        # removeHs=False is passed so hydrogens present in the file are kept.
        with Chem.ForwardSDMolSupplier(inf, removeHs=False) as gzsuppl:
            # Drop any entries the supplier hands back as None.
            rdmols = [x for x in gzsuppl if x is not None]
    print(f'#{len(rdmols)} conformers loaded from {name}')

    # One OpenFF Molecule per record; records are grouped per molecule later.
    offmols.extend(Molecule.from_rdkit(rdmol) for rdmol in rdmols)
print('-----')
print(f'total of #{len(offmols)} conformers loaded')

#2393 conformers loaded from a
#2382 conformers loaded from c
#2392 conformers loaded from g
#2388 conformers loaded from u
-----
total of #9555 conformers loaded


In [5]:
# Collate all conformers of the same molecule into a single Molecule object.
# Adapted from:
# https://github.com/openforcefield/openff-toolkit/blob/de8a4a545351301adfe424dff0d879b2dd13bc0b/examples/conformer_energies/conformer_energies.ipynb
#
# Only the most recently added molecule is compared, so records belonging to
# the same molecule must be contiguous in `offmols`.
# NOTE(review): assumes records of the same molecule share one atom ordering,
# since conformers are appended as-is — TODO confirm.
molecules = []
for molecule in offmols:
    if molecules and molecule == molecules[-1]:
        # Same molecule as the previous record: append its conformer(s).
        for conformer in molecule.conformers:
            molecules[-1].add_conformer(conformer)
    else:
        # First record of a new molecule. Start from a copy so the entries of
        # `offmols` are never mutated and re-running this cell does not
        # accumulate duplicate conformers.
        molecules.append(Molecule(molecule))
n_molecules = len(molecules)
n_conformers = sum(mol.n_conformers for mol in molecules)
print(f'{n_molecules} unique molecule(s) loaded, with {n_conformers} total conformers')

4 unique molecule(s) loaded, with 9555 total conformers


### Data preparation

In [6]:
# The single QC specification attached to the dataset under the key 'default'.
default_spec = QCSpec(
    method='b3lyp-d3bj',
    basis='dzvp',
    program='psi4',
    spec_name='default',
    spec_description='Standard OpenFF optimization quantum chemistry specification',
    store_wavefunction="none",
    implicit_solvent=None,
    maxiter=200,
    scf_properties=[SCFProperties.Dipole, SCFProperties.Quadrupole],
)

# Dataset factory configured with the gradient driver and the specification above.
factory = BasicDatasetFactory(
    driver=DriverEnum.gradient,
    qc_specifications={'default': default_spec},
)

In [7]:
# Long description stored in the dataset metadata.
dataset_description = "This is a single point energy calculations of RNA nucleosides without O5' hydroxyl atom generated from 500K implicit solvent MD and chi torsion scanning. Data generation details can be found at https://github.com/choderalab/create-rna-nucleoside-dataset."

# Build the dataset from the collated molecules using the configured factory.
dataset = factory.create_dataset(
    dataset_name="RNA Nucleoside Single Point Dataset v1.0",
    molecules=molecules,
    tagline="QM dataset for ML",
    description=dataset_description,
)

Deduplication                 : 100%|█████████████| 4/4 [00:08<00:00,  2.21s/it]
Preparation                   : 100%|█████████████| 4/4 [02:12<00:00, 33.13s/it]


In [8]:
# Summarise the prepared dataset: molecule/record counts, conformers per
# molecule, molecular weights and the set of total charges.

# `dataset.molecules` was traversed three separate times; materialise it once
# and reuse the list (presumably each traversal rebuilds the molecules — confirm).
dataset_molecules = list(dataset.molecules)

# `n_conformers` matches the counting used when the molecules were collated and
# avoids calling len() on the `conformers` attribute directly.
confs = np.array([mol.n_conformers for mol in dataset_molecules])

print("Number of unique molecules       ", dataset.n_molecules)
print("Number of filtered molecules     ", dataset.n_filtered)
print("Number of conformers             ", dataset.n_records)
print("Number of conformers min mean max", confs.min(), "{:6.2f}".format(confs.mean()), confs.max())

# Molecular weight of each molecule, computed by RDKit from its RDKit form.
masses = [Descriptors.MolWt(molecule.to_rdkit()) for molecule in dataset_molecules]

print(f'Mean molecular weight: {np.mean(masses):.2f}')
print(f'Max molecular weight: {np.max(masses):.2f}')
# Dividing the charge by its own units leaves a plain number for float().
print("Charges:", sorted(set(float(m.total_charge/m.total_charge.units) for m in dataset_molecules)))

Number of unique molecules        4
Number of filtered molecules      0
Number of conformers              9555
Number of conformers min mean max 2382 2388.75 2393
Mean molecular weight: 243.48
Max molecular weight: 267.25
Charges: [0.0]


In [9]:
# Fill in the submitter name and the long-description URL on the dataset metadata.
dataset.metadata.submitter = 'Kenichiro Takaba'
dataset.metadata.long_description_url = 'https://github.com/choderalab/create-rna-nucleoside-dataset'

In [10]:
# Pretty-print the dataset metadata as a dictionary.
pprint(dataset.metadata.dict())

{'collection_type': 'DataSet',
 'creation_date': datetime.date(2023, 3, 9),
 'dataset_name': 'RNA Nucleoside Single Point Dataset v1.0',
 'elements': {'H', 'N', 'O', 'C'},
 'long_description': 'This is a single point energy calculations of RNA '
                     "nucleosides without O5' hydroxyl atom generated from "
                     '500K implicit solvent MD and chi torsion scanning. Data '
                     'generation details can be found at '
                     'https://github.com/choderalab/create-rna-nucleoside-dataset.',
 'long_description_url': HttpUrl('https://github.com/choderalab/create-rna-nucleoside-dataset', ),
 'short_description': 'QM dataset for ML',
 'submitter': 'Kenichiro Takaba'}


In [11]:
# Print every QC specification stored on the dataset: its key, then its fields
# as a dictionary.
for spec_key, qc_spec in dataset.qc_specifications.items():
    print("Spec:", spec_key)
    pprint(qc_spec.dict())

Spec: default
{'basis': 'dzvp',
 'implicit_solvent': None,
 'keywords': None,
 'maxiter': 200,
 'method': 'b3lyp-d3bj',
 'program': 'psi4',
 'scf_properties': ['dipole', 'quadrupole'],
 'spec_description': 'Standard OpenFF optimization quantum chemistry '
                     'specification',
 'spec_name': 'default',
 'store_wavefunction': 'none'}


In [12]:
# Pretty-print the specification mapping itself, showing each QCSpec object's repr.
pprint(dataset.qc_specifications)

{'default': QCSpec(method='b3lyp-d3bj', basis='dzvp', program='psi4', spec_name='default', spec_description='Standard OpenFF optimization quantum chemistry specification', store_wavefunction=<WavefunctionProtocolEnum.none: 'none'>, implicit_solvent=None, maxiter=200, scf_properties=[<SCFProperties.Dipole: 'dipole'>, <SCFProperties.Quadrupole: 'quadrupole'>], keywords=None)}


#### Export the dataset

In [13]:
# Export the dataset to "dataset.json.bz2" and write its molecules to
# "dataset.smi", passing "smi" as the file format.
dataset.export_dataset("dataset.json.bz2")
dataset.molecules_to_file("dataset.smi", "smi")

In [14]:
# Write a picture of the dataset molecules to "dataset.pdf".
# `dataset.visualize` is tried first; if it raises, the molecules are drawn
# directly with RDKit instead.
try:
    dataset.visualize('dataset.pdf')
except Exception as exc:
    # Catch Exception instead of using a bare `except:` so KeyboardInterrupt and
    # SystemExit still propagate, and report why the fallback was taken.
    print(f'dataset.visualize failed ({type(exc).__name__}: {exc}); drawing with RDKit instead')

    rdmols = [mol.to_rdkit() for mol in molecules]
    # Convert 3D to 2D: compute 2D depiction coordinates for each molecule.
    for rdmol in rdmols:
        AllChem.Compute2DCoords(rdmol)
    img = Draw.MolsToGridImage(rdmols, molsPerRow=5, subImgSize=(500,500), returnPNG=False, maxMols=99999)
    # NOTE(review): the RGB conversion is presumably needed because the image
    # cannot be saved as PDF with an alpha channel — confirm.
    img = img.convert('RGB')
    img.save("dataset.pdf")