# OpenFF Protein Capped 1-mers 3-mers Optimization

This notebook prepares an optimization dataset for protein capped 1-mers from [OpenFF Protein Dipeptide 2-D TorsionDrive](https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2021-11-18-OpenFF-Protein-Dipeptide-2D-TorsionDrive) and capped 3-mers from [OpenFF Protein Capped 3-mer Backbones](https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2022-05-30-OpenFF-Protein-Capped-3-mer-Backbones). Dataset preparation in this notebook was taken from [OpenFF Gen2 Optimization Set Protomers](https://github.com/openforcefield/qca-dataset-submission/blob/master/submissions/2021-12-21-OpenFF-Gen2-Optimization-Set-Protomers/Dataset_Generation.ipynb).

In [1]:
from openff.qcsubmit.common_structures import QCSpec, SCFProperties
from openff.qcsubmit.factories import OptimizationDatasetFactory
from openff.qcsubmit import workflow_components
import numpy
from qcelemental.models.results import WavefunctionProtocolEnum
from qcportal.models.common_models import DriverEnum
from openeye import oechem

In [2]:
# The single QC specification, registered with the factory under the key 'default'.
default_spec = QCSpec(
    method='b3lyp-d3bj',
    basis='dzvp',
    program='psi4',
    spec_name='default',
    spec_description='Standard OpenFF optimization quantum chemistry specification.',
    implicit_solvent=None,
    maxiter=200,
    scf_properties=[
        SCFProperties.Dipole,
        SCFProperties.Quadrupole,
        SCFProperties.WibergLowdinIndices,
        SCFProperties.MayerIndices,
        SCFProperties.MBISCharges,
    ],
)

factory = OptimizationDatasetFactory(qc_specifications={'default': default_spec})

# Conformer generation is the only workflow component; it is capped at 10 conformers.
conformer_generator = workflow_components.StandardConformerGenerator(max_conformers=10)
factory.add_workflow_components(conformer_generator)

# Last expression of the cell: display the resulting factory settings.
factory.dict()

{'qc_specifications': {'default': {'method': 'b3lyp-d3bj',
   'basis': 'dzvp',
   'program': 'psi4',
   'spec_name': 'default',
   'spec_description': 'Standard OpenFF optimization quantum chemistry specification.',
   'store_wavefunction': 'none',
   'implicit_solvent': None,
   'maxiter': 200,
   'scf_properties': ['dipole',
    'quadrupole',
    'wiberg_lowdin_indices',
    'mayer_indices',
    'mbis_charges'],
   'keywords': None}},
 'driver': 'gradient',
 'priority': 'normal',
 'dataset_tags': ['openff'],
 'compute_tag': 'openff',
 'type': 'OptimizationDatasetFactory',
 'workflow': [{'type': 'StandardConformerGenerator',
   'rms_cutoff': None,
   'max_conformers': 10,
   'clear_existing': True}],
 'optimization_program': {'program': 'geometric',
  'coordsys': 'dlc',
  'enforce': 0.0,
  'epsilon': 1e-05,
  'reset': True,
  'qccnv': False,
  'molcnv': False,
  'check': 0,
  'trust': 0.1,
  'tmax': 0.3,
  'maxiter': 300,
  'convergence_set': 'GAU',
  'constraints': {}}}

In [3]:
# Create the dataset from the molecules listed in 'capped_1-mers_3-mers.smi'
# using the factory defined earlier in this notebook.
dataset = factory.create_dataset(
    dataset_name='OpenFF Protein Capped 1-mers 3-mers Optimization Dataset v1.0',
    molecules='capped_1-mers_3-mers.smi',
    tagline='Optimization dataset for protein capped 1-mers and 3-mers',
    description=(
        'Optimization dataset for protein capped 1-mers Ace-X-Nme and capped 3-mers Ace-Y-X-Y-Nme with Y = '
        '{Ala, Val} and X = 26 canonical amino acids with common protomers/tautomers (Ash, Cyx, Glh, Hid, Hip, and Lyn).'
    ),
)

Deduplication                 : 100%|██████████| 78/78 [00:00<00:00, 687.94it/s]
StandardConformerGenerator    : 100%|███████████| 78/78 [00:07<00:00, 11.14it/s]
Preparation                   : 100%|███████████| 78/78 [00:06<00:00, 11.85it/s]


In [5]:
# Per-molecule quantities used for the summary printed below.
confs = numpy.array([molecule.n_conformers for molecule in dataset.molecules])
molecular_weights = numpy.array(
    [oechem.OECalculateMolecularWeight(molecule.to_openeye()) for molecule in dataset.molecules]
)
# Each total charge is divided by its own unit before being handed to numpy.unique.
unique_formal_charges = numpy.unique(
    [molecule.total_charge / molecule.total_charge.unit for molecule in dataset.molecules]
)

# Reduce the arrays once so the formatted lines below stay short.
conf_min, conf_mean, conf_max = confs.min(), confs.mean(), confs.max()
weight_min, weight_mean, weight_max = (
    molecular_weights.min(),
    molecular_weights.mean(),
    molecular_weights.max(),
)

# Dataset summary: counts, conformer statistics, molecular weight range, and formal charges.
print(f'Number of unique molecules        {dataset.n_molecules:d}')
print(f'Number of filtered molecules      {dataset.n_filtered:d}')
print(f'Number of conformers              {dataset.n_records:d}')
print(f'Number of conformers min mean max {conf_min:3d} {conf_mean:6.2f} {conf_max:3d}')
print(f'Molecular weight min mean max     {weight_min:6.2f} {weight_mean:6.2f} {weight_max:6.2f}')
print('Charges                          ', sorted(unique_formal_charges))

# Dump the dataset metadata followed by every QC specification attached to the dataset.
print(dataset.metadata.dict())

for spec_name, qc_spec in dataset.qc_specifications.items():
    print("Spec:", spec_name)
    print(qc_spec.dict())

Number of unique molecules        78
Number of filtered molecules      0
Number of conformers              759
Number of conformers min mean max   4   9.73  10
Molecular weight min mean max     130.15 313.59 548.72
Charges                           [-1.0, 0.0, 1.0]
{'submitter': 'ccavende', 'creation_date': datetime.date(2022, 5, 30), 'collection_type': 'OptimizationDataset', 'dataset_name': 'OpenFF Protein Capped 1-mers 3-mers Optimization Dataset v1.0', 'short_description': 'Optimization dataset for protein capped 1-mers and 3-mers', 'long_description_url': None, 'long_description': 'Optimization dataset for protein capped 1-mers Ace-X-Nme and capped 3-mers Ace-Y-X-Y-Nme with Y = {Ala, Val} and X = 26 canonical amino acids with common protomers/tautomers (Ash, Cyx, Glh, Hid, Hip, and Lyn).', 'elements': {'N', 'C', 'H', 'O', 'S'}}
Spec: default
{'method': 'b3lyp-d3bj', 'basis': 'dzvp', 'program': 'psi4', 'spec_name': 'default', 'spec_description': 'Standard OpenFF optimization quantum

In [6]:
# Call the dataset's visualize method with 'dataset.pdf' as the output file name.
dataset.visualize('dataset.pdf')

In [7]:
# Overwrite the submitter field in the metadata (the summary printed earlier shows 'ccavende')
# with the submitter's full name.
dataset.metadata.submitter = 'Chapin Cavender'

In [8]:
# Set the long description URL (None in the summary printed earlier) to a submission
# directory in the qca-dataset-submission repository on GitHub.
dataset.metadata.long_description_url = 'https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2022-05-30-OpenFF-Protein-Capped-1mers-3mers-Optimization'

In [9]:
# Display the metadata so the updated submitter and long description URL can be checked.
dataset.metadata

Metadata(submitter='Chapin Cavender', creation_date=datetime.date(2022, 5, 30), collection_type='OptimizationDataset', dataset_name='OpenFF Protein Capped 1-mers 3-mers Optimization Dataset v1.0', short_description='Optimization dataset for protein capped 1-mers and 3-mers', long_description_url=HttpUrl('https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2022-05-30-OpenFF-Protein-Capped-1mers-3mers-Optimization', scheme='https', host='github.com', tld='com', host_type='domain', path='/openforcefield/qca-dataset-submission/tree/master/submissions/2022-05-30-OpenFF-Protein-Capped-1mers-3mers-Optimization'), long_description='Optimization dataset for protein capped 1-mers Ace-X-Nme and capped 3-mers Ace-Y-X-Y-Nme with Y = {Ala, Val} and X = 26 canonical amino acids with common protomers/tautomers (Ash, Cyx, Glh, Hid, Hip, and Lyn).', elements={'N', 'C', 'H', 'O', 'S'})

In [10]:
# Export the prepared dataset to the file 'dataset.json.bz2'.
dataset.export_dataset('dataset.json.bz2')