In [1]:
import logging
import os
import warnings
from pprint import pprint

import numpy as np
from openff.qcsubmit.common_structures import QCSpec, PCMSettings
from openff.qcsubmit.factories import OptimizationDatasetFactory
from openff.qcsubmit.datasets import OptimizationEntry
from openff.qcsubmit.datasets import OptimizationDataset
from openff.toolkit.topology import Molecule
from openff.qcsubmit.common_structures import MoleculeAttributes
from tqdm import tqdm

In [2]:
# Quieten the notebook: the toolkit otherwise reports undefined stereo and
# charged molecules for these inputs.
# 1) suppress every Python warning raised from here on
warnings.simplefilter("ignore")
# 2) only let ERROR-level (and above) records through the toolkit logger
toolkit_logger = logging.getLogger("openff.toolkit")
toolkit_logger.setLevel(logging.ERROR)

In [3]:
from openff.toolkit.utils import GLOBAL_TOOLKIT_REGISTRY, OpenEyeToolkitWrapper
# When the OpenEye wrapper is installed, remove it from the global registry so
# that later toolkit calls are served by the remaining registered toolkits.
openeye_is_available = OpenEyeToolkitWrapper.is_available()
if openeye_is_available:
    GLOBAL_TOOLKIT_REGISTRY.deregister_toolkit(OpenEyeToolkitWrapper)

# Dataset Preparation

Prepare the main dataset.

In [4]:
# Required due to occasional SCF failures. See the V1 dataset as well as
# http://forum.psicode.org/t/dft-scf-not-converging/1725/3
# NOTE(review): these keywords are defined here but are not attached to any QC
# specification in the cells shown in this notebook, so the dataset uses the
# default specification's keywords -- confirm whether that is intentional.
dft_ultra_fine_keywords = dict(
    dft_spherical_points=590,
    dft_radial_points=99,
    dft_pruning_scheme="robust"
)

# Record the versions of the toolkits currently in the global registry.
dataset_factory = OptimizationDatasetFactory()
provenance = dataset_factory.provenance(GLOBAL_TOOLKIT_REGISTRY)

# Create the (initially empty) optimization dataset and its descriptive metadata.
dataset = OptimizationDataset(
    dataset_name="OpenFF Protein PDB 4-mers v4.0",
    dataset_tagline="Optimizations of peptide 4-mers extracted from the PDB using the B3LYP-D3BJ method and the DZVP basis set.",
    description="This dataset is composed of 1,000 4-mer peptide structures which "
    "were extracted from PDB entries in the Top8000 database. The purpose of this "
    "dataset is to fill in gaps within the existing protein training data with "
    "secondary structures which occur in real proteins. For all scripts used to "
    "generate the input files please visit https://github.com/ajfriedman22/4mer_Generation. "
    "The dataset was computed using the B3LYP-D3BJ method and the DZVP basis set."
    "\n\n"
    "This dataset includes 200 distinct sequences with 5 conformers each extracted from "
    "the Top8000 PDB database.",
    provenance=provenance
)

dataset.metadata.submitter = "ajfriedman22"
dataset.metadata.long_description_url = (
    "https://github.com/openforcefield/qca-dataset-submission/tree/master/"
    "submissions/"
    "2025-03-05-OpenFF-Protein-PDB-4mer-v4.0"
)

# Extra provenance entry added by hand (not reported by the toolkit registry).
dataset.provenance["constructure"] = "0.0.1"

Add molecules with constraints to the dataset

In [5]:
# Directory holding one multi-record SDF file per 4-mer sequence.
INPUT_DIR = 'inputs'

# SMARTS patterns whose four tagged atoms (:1-:4) select the backbone torsions.
dihedral_smarts = {
    'phi': '[#6X3:1](=O)-[#7X3:2]-[#6X4:3]-[#6X3:4](=O)-[#7X3]',
    'psi': '[#6X3](=O)-[#7X3:1]-[#6X4:2]-[#6X3:3](=O)-[#7X3:4]',
}

# Only pick up SDF inputs, and sort them: os.listdir returns entries in
# arbitrary order and may include unrelated files (e.g. hidden files).
sdf_files = sorted(
    name for name in os.listdir(INPUT_DIR) if name.lower().endswith('.sdf')
)

# Add molecules with constraints on phi and psi backbone torsions to dataset
for sdf_file in tqdm(sdf_files):
    # Read molecule from extracted PDB 4-mer
    conformer_mols = Molecule.from_file(
        os.path.join(INPUT_DIR, sdf_file),
        allow_undefined_stereo=True,
    )
    # from_file hands back a bare Molecule when the file holds a single record
    if isinstance(conformer_mols, Molecule):
        conformer_mols = [conformer_mols]

    # Collapse the records into one molecule carrying every conformer
    offmol = conformer_mols[0]
    for extra_mol in conformer_mols[1:]:
        offmol.add_conformer(extra_mol.conformers[0])
    if offmol.n_conformers != 5:
        # Report inputs that do not carry the expected five conformers
        print(sdf_file)

    # Drop only the file extension. str.strip('.sdf') would instead remove any
    # leading/trailing '.', 's', 'd' or 'f' characters from the name.
    seq = os.path.splitext(sdf_file)[0]

    # Determine indices for constrained dihedrals
    phi_matches = offmol.chemical_environment_matches(dihedral_smarts['phi'])
    psi_matches = offmol.chemical_environment_matches(dihedral_smarts['psi'])
    constraint_indices = phi_matches + psi_matches

    dataset.add_molecule(
        molecule=offmol,
        index=seq,
        attributes=MoleculeAttributes.from_openff_molecule(offmol),
        extras={'Sequence': seq, 'Constrained Dihedrals': constraint_indices},
    )

    # Freeze every matched phi/psi dihedral on the entry that was just added
    entry = dataset.dataset[seq]
    for constraint_index in constraint_indices:
        entry.add_constraint(
            constraint='freeze',
            constraint_type='dihedral',
            indices=constraint_index,
            bonded=True,
        )


  0%|          | 0/200 [00:00<?, ?it/s]



































































































































































100%|██████████| 200/200 [00:53<00:00,  3.77it/s]


Describe the molecules in the dataset

In [6]:
# Iterate dataset.molecules once and reuse the result for every statistic below.
molecules = list(dataset.molecules)

# Conformer count per molecule
confs = np.array([len(mol.conformers) for mol in molecules])

print("Number of unique molecules       ", dataset.n_molecules)
print("Number of filtered molecules     ", dataset.n_filtered)
print("Number of conformers             ", dataset.n_records)
print("Number of conformers min mean max",
      confs.min(), "{:6.2f}".format(confs.mean()), confs.max())

# Molecular weight per molecule: sum of the atomic masses (magnitude only),
# stored as a flat 1-D array.
masses = np.array([
    sum(atom.mass.m for atom in molecule.atoms)
    for molecule in molecules
])

print(f'Mean molecular weight: {masses.mean():.2f}')
print(f'Max molecular weight: {masses.max():.2f}')
print("Charges:", sorted(set(molecule.total_charge.m for molecule in molecules)))

Number of unique molecules        200
Number of filtered molecules      0
Number of conformers              1000
Number of conformers min mean max 5   5.00 5
Mean molecular weight: 451.35
Max molecular weight: 570.64
Charges: [-2.0, -1.0, 0.0, 1.0]


Describe the dataset

In [7]:
# Pretty-print the dataset-level metadata as a plain dictionary.
metadata_dict = dataset.metadata.dict()
pprint(metadata_dict)

{'collection_type': 'OptimizationDataset',
 'creation_date': datetime.date(2025, 3, 7),
 'dataset_name': 'OpenFF Protein PDB 4-mers v4.0',
 'elements': {'C', 'N', 'O', 'H'},
 'long_description': 'This dataset is composed of 1,000 4-mer peptide '
                     'structures which were extracted from PDB entries in the '
                     'Top8000 database. The purpose of this dataset is to fill '
                     'in gaps within the existing protein training data with '
                     'secondary structures which occur in real proteins. For '
                     'all scripts used to generated the input files please '
                     'visit https://github.com/ajfriedman22/4mer_Generation. '
                     'The dataset was computed using the B3LYP-D3BJ method and '
                     'the DZVP basis set.\n'
                     '\n'
                     'This dataset includes 200 distinct sequences with 5 '
                     'conformers each extracted fro

In [8]:
# List every QC specification attached to the dataset: its name first,
# followed by the specification's fields as a dictionary.
for spec_name, qc_spec in dataset.qc_specifications.items():
    print("Spec:", spec_name)
    spec_fields = qc_spec.dict()
    pprint(spec_fields)

Spec: default
{'basis': 'DZVP',
 'implicit_solvent': None,
 'keywords': {},
 'maxiter': 200,
 'method': 'B3LYP-D3BJ',
 'program': 'psi4',
 'scf_properties': ['dipole',
                    'quadrupole',
                    'wiberg_lowdin_indices',
                    'mayer_indices'],
 'spec_description': 'Standard OpenFF optimization quantum chemistry '
                     'specification.',
 'spec_name': 'default',
 'store_wavefunction': 'none'}


In [9]:
# Write the submission artifacts: the compressed dataset and its SMILES list.
dataset.export_dataset("dataset.json.bz2")
dataset.molecules_to_file("dataset.smi", "smi")

# The PDF overview is best-effort: some QCSubmit versions raise AttributeError
# here. Report the skip instead of hiding it, so a missing dataset.pdf is
# not a surprise.
try:
    dataset.visualize("dataset.pdf", columns=8)
except AttributeError as err:
    # patched in QCSubmit >0.50.2
    print(f"Skipping dataset.pdf: dataset.visualize raised AttributeError ({err})")