# Generation process

This notebook documents the generation of an optimization dataset built from a representative 20% subset of the fragment molecules shared by XtalPi. The conformers used as starting geometries are the post-optimization conformers shared by XtalPi.

## Imports

In [1]:
import openff.qcsubmit
import openff.toolkit
import openeye
import qcelemental
import qcportal
import pyarrow
import pyarrow.dataset as ds
import numpy as np

# Record the versions of the packages imported above so the environment that
# produced this dataset can be reconstructed later.
for label, module in (
    ("OpenFF QCSubmit", openff.qcsubmit),
    ("OpenFF Toolkit", openff.toolkit),
    ("OpenEye", openeye),
    ("QCElemental", qcelemental),
    ("QCPortal", qcportal),
    ("PyArrow", pyarrow),
):
    print(f"{label}:", module.__version__)

OpenFF QCSubmit: 0.50.2
OpenFF Toolkit: 0.15.0
OpenEye: 2022.1.1
QCElemental: 0.27.1
QCPortal: 0.53
PyArrow: 15.0.0


In [2]:
import tqdm

from openff.units import unit

from openff.toolkit import Molecule
from openff.toolkit.utils import OpenEyeToolkitWrapper, ToolkitRegistry

from openff.qcsubmit.datasets import OptimizationDataset
from openff.qcsubmit.factories import OptimizationDatasetFactory

LICENSE: Could not open license file "oe_license.txt" in local directory
LICENSE: N.B. OE_LICENSE environment variable is not set
LICENSE: N.B. OE_DIR environment variable is not set
LICENSE: No product keys!
LICENSE: No product keys!
LICENSE: No product keys!
LICENSE: No product keys!
The OpenEye Toolkits are found to be installed but not licensed and therefore will not be used.
The OpenEye Toolkits require a (free for academics) license, see https://docs.eyesopen.com/toolkits/python/quickstart-python/license.html
The OpenEye Toolkits are found to be installed but not licensed and therefore will not be used.
The OpenEye Toolkits require a (free for academics) license, see https://docs.eyesopen.com/toolkits/python/quickstart-python/license.html


## Setting up dataset

In [3]:
# Factory instance; in this notebook it is only used to generate the index
# under which each molecule is added to the dataset (`create_index`).
dataset_factory = OptimizationDatasetFactory()

In [4]:
# Free-text description stored with the dataset.
description_text = (
    "A dataset containing 20% the fragments used by XtalPi "
    "in fitting the XFF force field "
    "(DOI: 10.1021/acs.jctc.3c00920). "
    "Conformers are the post-optimization geometries shared by XtalPi. "
)
# URL recorded in the metadata as the dataset's long description.
submission_url = (
    "https://github.com/openforcefield/qca-dataset-submission/tree/master/"
    "submissions/"
    "2024-04-02-xtalpi-20-percent-fragments-optimization-v1.0"
)

# Create the (still empty) optimization dataset and fill in its metadata.
dataset = OptimizationDataset(
    dataset_name="XtalPi 20-percent Fragments OptimizationDataset v1.0",
    dataset_tagline="B3LYP-D3BJ/DZVP optimization of 20% the fragment dataset used by XtalPi to fit XFF.",
    description=description_text,
)
dataset.metadata.submitter = "lilyminium"
dataset.metadata.long_description_url = submission_url

## Loading input

In [5]:
# Open the input data as a PyArrow dataset and show its schema (the bare last
# expression is what the cell displays).
# NOTE(review): this is an absolute, machine-specific path; the notebook can
# only be re-run where this location exists.
input_dataset = ds.dataset(
    "/data/chodera/lilywang/datasets/xff/output/xff-20-percent-opt-dataset"
)
input_dataset.schema

ATOM: list<element: double>
  child 0, element: double
CORE_TIME: double
FINGERPRINT: string
RESP_CHARGE: list<element: double>
  child 0, element: double
CA_TYPE: string
GRADIENT: list<element: double>
  child 0, element: double
AM1BCC_CHARGE: list<element: double>
  child 0, element: double
INPUT_MOLECULE: list<element: double>
  child 0, element: double
HESSIAN: list<element: double>
  child 0, element: double
ENERGY: double
filename: string
directory: string
parent: string
smiles: string
mapped_smiles: string
n_atoms: int64
atomic_numbers: list<element: int64>
  child 0, element: int64

In [6]:
# number of input conformers: each row of the input dataset holds one set of
# coordinates, which later becomes one conformer
input_dataset.count_rows()

158575

In [7]:
from collections import defaultdict
from openff.toolkit import Molecule
from openff.units import unit
import numpy as np

# Only the columns needed to rebuild each molecule and its conformers are read.
columns = ["mapped_smiles", "ATOM", "parent", "directory"]
df = input_dataset.to_table(columns=columns).to_pandas()

all_molecules = []
# The fragment dataset was organized by directory: every (parent, directory)
# group is treated as one molecule, and each row of the group as one conformer.
# The group key gets a real name because assigning to `_` at notebook level
# would shadow IPython's "last output" variable.
for group_key, subdf in df.groupby(by=["parent", "directory"]):
    unique_smiles = subdf.mapped_smiles.unique()
    assert len(unique_smiles) == 1, (
        f"expected exactly one mapped SMILES for group {group_key}, "
        f"found {len(unique_smiles)}"
    )
    mol = Molecule.from_mapped_smiles(
        unique_smiles[0],
        allow_undefined_stereo=True,
    )
    for flat_coordinates in subdf.ATOM.values:
        # ATOM is stored as a flat list of doubles; fold it into rows of xyz.
        # NOTE(review): the values are assumed to be in angstrom and ordered
        # like the atom map of `mapped_smiles` -- confirm against the input data.
        conformer = np.array(flat_coordinates).reshape((-1, 3))
        # Use the public API instead of writing to the private `_conformers`
        # list, so the coordinate array is validated against the molecule.
        mol.add_conformer(conformer * unit.angstrom)
    all_molecules.append(mol)

len(all_molecules)

12796

In [8]:
# Number of conformers attached to each reconstructed molecule.
# `n_conformers` is used (as in the dataset statistics further down) so a
# molecule without conformers counts as 0 instead of raising.
input_n_confs = [mol.n_conformers for mol in all_molecules]
# Every input row is expected to have become exactly one conformer.
assert sum(input_n_confs) == len(df), (
    f"{len(df)} input rows but {sum(input_n_confs)} conformers were built"
)
print(f"Total input conformers: {sum(input_n_confs)}")

Total input conformers: 158575


In [9]:
# Add every molecule to the dataset under the index produced by the factory.
# Iterating the list directly (no unused `enumerate` counter) lets tqdm read
# its length and show a total.
seen_indices = set()
n_repeated_indices = 0
for mol in tqdm.tqdm(all_molecules):
    index = dataset_factory.create_index(molecule=mol)
    if index in seen_indices:
        n_repeated_indices += 1
    seen_indices.add(index)
    dataset.add_molecule(index, mol)

# NOTE(review): the dataset summary further down reports fewer molecules and
# conformers than were passed in here. Presumably molecules that share an
# index replace (or are merged into) the earlier entry inside `add_molecule`
# -- confirm against the QCSubmit implementation. Repeated indices are
# reported so that this is visible instead of silent.
if n_repeated_indices:
    print(
        f"{n_repeated_indices} of {len(all_molecules)} molecules share an index "
        "with a molecule added earlier"
    )































































































































































































































































































































































12796it [36:15,  5.88it/s]


In [11]:
# Show the serialized dataset without its "dataset" entry (presumably the
# per-molecule records, which would flood the output).
not_molecules = dict(dataset.dict())
not_molecules.pop("dataset", None)
not_molecules

{'qc_specifications': {'default': {'method': 'B3LYP-D3BJ',
   'basis': 'DZVP',
   'program': 'psi4',
   'spec_name': 'default',
   'spec_description': 'Standard OpenFF optimization quantum chemistry specification.',
   'store_wavefunction': 'none',
   'implicit_solvent': None,
   'maxiter': 200,
   'scf_properties': ['dipole',
    'quadrupole',
    'wiberg_lowdin_indices',
    'mayer_indices'],
   'keywords': {}}},
 'driver': 'deferred',
 'priority': 'normal',
 'dataset_tags': ['openff'],
 'compute_tag': 'openff',
 'dataset_name': 'XtalPi 20-percent Fragments OptimizationDataset v1.0',
 'dataset_tagline': 'B3LYP-D3BJ/DZVP optimization of 20% the fragment dataset used by XtalPi to fit XFF.',
 'type': 'OptimizationDataset',
 'description': 'A dataset containing 20% the fragments used by XtalPi in fitting the XFF force field (DOI: 10.1021/acs.jctc.3c00920). Conformers are the post-optimization geometries shared by XtalPi. ',
 'metadata': {'submitter': 'lilyminium',
  'creation_date': date

## Exporting dataset

In [12]:
# Write the dataset to the working directory in three forms:
# the dataset itself (dataset.json.bz2), the molecules in "smi" format
# (dataset.smi) and a visualization laid out in 8 columns (dataset.pdf).
dataset.export_dataset("dataset.json.bz2")
dataset.molecules_to_file('dataset.smi', 'smi')
dataset.visualize("dataset.pdf", columns=8)

# Show the QC specification(s) attached to the dataset.
print(dataset.qc_specifications)

{'default': QCSpec(method='B3LYP-D3BJ', basis='DZVP', program='psi4', spec_name='default', spec_description='Standard OpenFF optimization quantum chemistry specification.', store_wavefunction=<WavefunctionProtocolEnum.none: 'none'>, implicit_solvent=None, maxiter=200, scf_properties=[<SCFProperties.Dipole: 'dipole'>, <SCFProperties.Quadrupole: 'quadrupole'>, <SCFProperties.WibergLowdinIndices: 'wiberg_lowdin_indices'>, <SCFProperties.MayerIndices: 'mayer_indices'>], keywords={})}


## Dataset information

In [13]:
import numpy as np
from collections import Counter

In [14]:
# Size of the final dataset as reported by the dataset object itself.
print(f"n_molecules: {dataset.n_molecules}")
print(f"n_conformers: {dataset.n_records}")

n_molecules: 10069
n_conformers: 128180


In [15]:
# Per-molecule statistics, collected in a single pass over the dataset.
# NOTE(review): `dataset.molecules` presumably rebuilds every molecule each
# time it is iterated -- confirm; either way one pass does half the work of two.
conformer_counts = []
heavy_atom_counts = []
for mol in dataset.molecules:
    conformer_counts.append(mol.n_conformers)
    # Heavy atoms are counted directly (atomic number above hydrogen's), which
    # avoids converting each molecule to RDKit just to obtain this number.
    heavy_atom_counts.append(
        sum(1 for atom in mol.atoms if atom.atomic_number > 1)
    )

n_confs = np.array(conformer_counts)
n_heavy_atoms = np.array(heavy_atom_counts)

In [16]:
# Total number of conformers over all molecules in the dataset
print(f"Total #confs: {sum(n_confs)}")

Total #confs: 128180


In [17]:
# Spread of the per-molecule conformer counts.
print(
    "Number of conformers (min, mean, max):",
    n_confs.min(),
    n_confs.mean(),
    n_confs.max(),
)

# Molecule size distribution: heavy-atom count -> number of molecules,
# listed in increasing heavy-atom count.
print("# heavy atoms")
counts = Counter(n_heavy_atoms)
for size, n_with_size in sorted(counts.items()):
    print(f"{size!s:>3}: {n_with_size}")

Number of conformers (min, mean, max): 1 12.699891013573763 30
# heavy atoms
  3: 5
  4: 16
  5: 86
  6: 191
  7: 294
  8: 440
  9: 603
 10: 779
 11: 906
 12: 964
 13: 1239
 14: 1287
 15: 1057
 16: 677
 17: 446
 18: 316
 19: 245
 20: 197
 21: 140
 22: 85
 23: 57
 24: 38
 25: 10
 26: 10
 27: 3
 28: 2


In [18]:
# Distinct total molecular charges present in the dataset, expressed in
# units of the elementary charge.
unique_charges = {
    mol.total_charge.m_as(unit.elementary_charge)
    for mol in dataset.molecules
}
unique_charges

{-2.0, -1.0, 0.0, 1.0, 2.0}

In [19]:
# Molecular weight of each molecule, taken as the sum of its atoms' masses.
# NOTE(review): `.m` returns the bare magnitude, so the unit is whatever
# `atom.mass` carries -- presumably dalton; confirm.
masses = np.array(
    [sum(atom.mass.m for atom in mol.atoms) for mol in dataset.molecules]
)
print("MW (min, mean, max):", masses.min(), masses.mean(), masses.max())

MW (min, mean, max): 42.040114 189.05655430910036 387.454031


In [20]:
# Every element symbol that occurs in at least one molecule of the dataset.
elements = {atom.symbol for mol in dataset.molecules for atom in mol.atoms}
print(elements)

{'Cl', 'B', 'C', 'N', 'P', 'O', 'Si', 'S', 'F', 'I', 'Br', 'H'}
