# Generation process

This notebook documents the generation of an optimization dataset built from representative fragment molecules shared by XtalPi. The input conformers are the post-optimization geometries that XtalPi shared.

## Imports

In [1]:
import openff.qcsubmit
import openff.toolkit
import openeye
import qcelemental
import qcportal
import pyarrow
import pyarrow.dataset as ds
import numpy as np

# Record the versions of the key packages used to build this dataset.
for package_label, package in (
    ("OpenFF QCSubmit:", openff.qcsubmit),
    ("OpenFF Toolkit:", openff.toolkit),
    ("OpenEye:", openeye),
    ("QCElemental:", qcelemental),
    ("QCPortal:", qcportal),
    ("PyArrow:", pyarrow),
):
    print(package_label, package.__version__)

OpenFF QCSubmit: 0.50.2
OpenFF Toolkit: 0.15.0
OpenEye: 2022.1.1
QCElemental: 0.27.1
QCPortal: 0.53
PyArrow: 14.0.2


In [2]:
import tqdm

from openff.units import unit

from openff.toolkit import Molecule
from openff.toolkit.utils import OpenEyeToolkitWrapper, ToolkitRegistry

from openff.qcsubmit.datasets import OptimizationDataset
from openff.qcsubmit.factories import OptimizationDatasetFactory

## Setting up dataset

In [3]:
# Factory with default settings; in this notebook it is only used to build
# the index string for each molecule via create_index().
dataset_factory = OptimizationDatasetFactory()

In [4]:
# Free-text description stored with the dataset.
dataset_description = (
    "A dataset containing representative fragments shared by XtalPi "
    "used in fitting the XFF force field "
    "(DOI: 10.1021/acs.jctc.3c00920). "
    "Conformers are the post-optimization geometries shared by XtalPi. "
    "Each conformer will be converged according to the 'GAU_LOOSE' criteria."
)

# Folder of this submission in the qca-dataset-submission repository.
submission_url = (
    "https://github.com/openforcefield/qca-dataset-submission/tree/master/"
    "submissions/"
    "2024-01-30-xtalpi-shared-fragments-optimization-v1.0"
)

# Create the (initially empty) dataset, then fill in the metadata fields.
dataset = OptimizationDataset(
    dataset_name="XtalPi Shared Fragments OptimizationDataset v1.0",
    dataset_tagline="B3LYP-D3BJ/DZVP optimization of fragments shared by XtalPi.",
    description=dataset_description,
)
dataset.metadata.submitter = "lilyminium"
dataset.metadata.long_description_url = submission_url

## Loading input

In [5]:
# NOTE: absolute path on the original author's filesystem; point this at a
# local copy of the XtalPi data when re-running the notebook elsewhere.
INPUT_DATASET_PATH = "/data/chodera/lilywang/datasets/xff/output/xff-opt-dataset"

# Open the PyArrow dataset and show its schema.
input_dataset = ds.dataset(INPUT_DATASET_PATH)
input_dataset.schema

ATOM: list<item: double>
  child 0, item: double
AM1BCC_CHARGE: list<item: double>
  child 0, item: double
RESP_CHARGE: list<item: double>
  child 0, item: double
ENERGY: double
INPUT_MOLECULE: list<item: double>
  child 0, item: double
CA_TYPE: string
CORE_TIME: double
HESSIAN: list<item: double>
  child 0, item: double
FINGERPRINT: string
GRADIENT: list<item: double>
  child 0, item: double
filename: string
directory: string
parent: string
smiles: string
mapped_smiles: string
n_atoms: int64
atomic_numbers: list<item: int64>
  child 0, item: int64

In [6]:
from collections import defaultdict
from openff.toolkit import Molecule
from openff.units import unit
import numpy as np

# Only these columns are needed to rebuild the molecules and their conformers.
columns = ["mapped_smiles", "ATOM", "parent", "directory"]
df = input_dataset.to_table(columns=columns).to_pandas()

# Build one Molecule per (parent, directory) group; each row in a group
# contributes one conformer to that molecule.
all_molecules = []
# the fragment dataset was organized by directory
for group_key, subdf in df.groupby(by=["parent", "directory"]):
    unique_smiles = subdf.mapped_smiles.unique()
    # Every row of a group must describe the same molecule.
    assert len(unique_smiles) == 1, (
        f"Expected exactly one mapped SMILES for group {group_key}, "
        f"found {len(unique_smiles)}"
    )
    mol = Molecule.from_mapped_smiles(
        unique_smiles[0],
        allow_undefined_stereo=True,
    )
    for flat_coordinates in subdf.ATOM.values:
        # ATOM is a flat list of doubles; reshape it into rows of (x, y, z).
        # NOTE(review): assumes the coordinates follow the atom order of the
        # mapped SMILES and are in angstrom -- confirm against the input data.
        conformer = np.array(flat_coordinates).reshape((-1, 3))
        # Use the public API rather than writing to the private `_conformers`
        # list, so the shape and units of each conformer are validated.
        mol.add_conformer(conformer * unit.angstrom)
    all_molecules.append(mol)

len(all_molecules)

234

In [7]:
# Register every molecule (with all of its conformers) in the dataset under
# the index string produced by the factory.
# Iterating the list directly (instead of wrapping `enumerate`) lets tqdm read
# its length and show a real progress bar with a total.
# NOTE(review): entries appear to be keyed by this index string; confirm that
# no two molecules share an index (compare dataset.n_molecules with
# len(all_molecules)).
for mol in tqdm.tqdm(all_molecules):
    dataset.add_molecule(
        dataset_factory.create_index(molecule=mol),
        mol,
    )

234it [00:39,  5.87it/s]


In [8]:
# Display the dataset's dictionary representation for a visual sanity check.
dataset.dict()

## Exporting dataset

In [None]:
# Write the dataset, a SMILES listing of its molecules, and a PDF
# visualization (8 columns) to files in the current working directory.
dataset.export_dataset("dataset.json.bz2")
dataset.molecules_to_file('dataset.smi', 'smi')
dataset.visualize("dataset.pdf", columns=8)

# Print the QC specifications attached to the dataset.
print(dataset.qc_specifications)

## Dataset information

In [None]:
import numpy as np
from collections import Counter

In [None]:
# Headline sizes: distinct molecules and total records in the dataset.
for count_label, count in (
    ("n_molecules:", dataset.n_molecules),
    ("n_conformers:", dataset.n_records),
):
    print(count_label, count)

In [None]:
# Walk the dataset once, recording the conformer count and the heavy-atom
# count of every molecule.
conformer_counts = []
heavy_atom_counts = []
for mol in dataset.molecules:
    conformer_counts.append(mol.n_conformers)
    heavy_atom_counts.append(mol.to_rdkit().GetNumHeavyAtoms())

n_confs = np.array(conformer_counts)
n_heavy_atoms = np.array(heavy_atom_counts)

In [None]:
# Summary of conformers per molecule.
conformer_stats = (n_confs.min(), n_confs.mean(), n_confs.max())
print("Number of conformers (min, mean, max):", *conformer_stats)

# Histogram of molecule sizes: heavy-atom count -> number of molecules.
print("# heavy atoms")
counts = Counter(n_heavy_atoms)
for n_heavy, n_with_size in sorted(counts.items()):
    print(f"{str(n_heavy):>3}: {n_with_size}")

In [None]:
# Distinct total formal charges (in elementary-charge units) in the dataset.
unique_charges = {
    mol.total_charge.m_as(unit.elementary_charge)
    for mol in dataset.molecules
}
unique_charges

In [None]:
# Molecular weight of each molecule: the sum of the magnitudes of its atoms'
# masses (in whatever unit the toolkit reports -- presumably daltons).
molecular_weights = []
for mol in dataset.molecules:
    molecular_weights.append(sum(atom.mass.m for atom in mol.atoms))

masses = np.array(molecular_weights)
print("MW (min, mean, max):", masses.min(), masses.mean(), masses.max())

In [None]:
# Set of element symbols that occur anywhere in the dataset.
elements = {atom.symbol for mol in dataset.molecules for atom in mol.atoms}
print(elements)