# Generation process

This notebook documents the generation of a torsiondrive dataset of representative molecules shared by XtalPi. The starting conformers are the post-optimization geometries provided by XtalPi; for each scanned torsion, the lowest-energy geometry is used.

## Imports

In [1]:
import openff.qcsubmit
import openff.toolkit
import openeye
import qcelemental
import qcportal
import pyarrow
import pyarrow.dataset as ds
import numpy as np

# Record the version of every package used below so the environment that
# produced this dataset can be reconstructed later.
package_versions = (
    ("OpenFF QCSubmit:", openff.qcsubmit.__version__),
    ("OpenFF Toolkit:", openff.toolkit.__version__),
    ("OpenEye:", openeye.__version__),
    ("QCElemental:", qcelemental.__version__),
    ("QCPortal:", qcportal.__version__),
    ("PyArrow:", pyarrow.__version__),
)
for label, version in package_versions:
    print(label, version)

OpenFF QCSubmit: 0.50.2
OpenFF Toolkit: 0.15.0
OpenEye: 2022.1.1
QCElemental: 0.27.1
QCPortal: 0.53
PyArrow: 14.0.2


In [2]:
import tqdm

from openff.units import unit

from openff.toolkit import Molecule
from openff.toolkit.utils import OpenEyeToolkitWrapper, ToolkitRegistry

from openff.qcsubmit.datasets import TorsiondriveDataset
from openff.qcsubmit.factories import TorsiondriveDatasetFactory

## Setting up dataset

In [3]:
# Create the factory with its default settings. No options are overridden
# here, so the dataset built from it carries the factory's default QC
# specification.
dataset_factory = TorsiondriveDatasetFactory()

## Loading input

In [4]:
# Absolute location of the XtalPi torsion-scan data on the machine where
# this notebook was run; change this single constant to re-run elsewhere.
INPUT_DATASET_PATH = "/data/chodera/lilywang/datasets/xff/output/xff-td-dataset"

input_dataset = ds.dataset(INPUT_DATASET_PATH)
# Show the schema as the cell output to document the available columns.
input_dataset.schema

ATOM: list<item: double>
  child 0, item: double
frozen_atoms: list<item: int64>
  child 0, item: int64
frozen_0: int64
frozen_1: int64
frozen_2: int64
frozen_3: int64
angle: int64
CORE_TIME: double
INPUT_MOLECULE: list<item: double>
  child 0, item: double
FINGERPRINT: string
CA_TYPE: string
ENERGY: double
filename: string
directory: string
parent: string
smiles: string
mapped_smiles: string
n_atoms: int64
atomic_numbers: list<item: int64>
  child 0, item: int64

In [5]:
from openff.toolkit import Molecule
from openff.units import unit
import numpy as np

from openff.qcsubmit.workflow_components import TorsionIndexer

# Load only the columns needed to build the torsiondrive inputs.
columns = [
    "mapped_smiles", "ATOM", "ENERGY",
    "frozen_atoms", "frozen_0", "frozen_1", "frozen_2", "frozen_3",
]
df = input_dataset.to_table(columns=columns).to_pandas()

# Rows sharing a mapped SMILES and the same four frozen atoms are treated as
# one torsion scan; one molecule is created per scan.
groupby = ["mapped_smiles", "frozen_0", "frozen_1", "frozen_2", "frozen_3"]

all_molecules = []
for (mapped_smiles, *_), subdf in df.groupby(by=groupby):
    mol = Molecule.from_mapped_smiles(
        mapped_smiles,
        allow_undefined_stereo=True,
    )

    # Start the scan from the lowest-energy geometry of this group.
    by_energy = subdf.sort_values("ENERGY")
    conformer = np.array(by_energy.ATOM.iloc[0]).reshape((-1, 3))
    # Use the public API rather than assigning the private `_conformers`
    # attribute, so the toolkit validates the coordinate array against the
    # molecule. Coordinates are taken to be in angstrom.
    mol.add_conformer(conformer * unit.angstrom)

    # The input stores atom indices starting from 1; shift them to the
    # 0-based indexing used by the molecule.
    frozen_atoms = tuple(
        int(atom) - 1 for atom in by_energy.frozen_atoms.iloc[0]
    )
    assert len(frozen_atoms) == 4, (
        f"expected 4 torsion atoms, got {len(frozen_atoms)} "
        f"for {mapped_smiles}"
    )
    # Check both bounds: a 0 in the 1-based input would otherwise silently
    # become the (valid but wrong) negative index -1.
    assert min(frozen_atoms) >= 0 and max(frozen_atoms) < mol.n_atoms, (
        f"torsion indices {frozen_atoms} out of range for {mapped_smiles}"
    )

    torsion_indexer = TorsionIndexer()
    # NOTE(review): (0, 3) and (-165, 180) are passed positionally --
    # presumably the symmetry group and the scan range in degrees; confirm
    # against the TorsionIndexer.add_torsion signature.
    torsion_indexer.add_torsion(frozen_atoms, (0, 3), (-165, 180))
    mol.properties["dihedrals"] = torsion_indexer

    all_molecules.append(mol)

len(all_molecules)

424

In [6]:
# Human-readable metadata that is stored alongside the dataset.
dataset_info = {
    "dataset_name": "XtalPi Shared Fragments TorsiondriveDataset v1.0",
    "tagline": "B3LYP-D3BJ/DZVP torsion drive of fragments shared by XtalPi.",
    "description": (
        "A dataset containing representative fragments shared by XtalPi "
        "used in fitting the XFF force field "
        "(DOI: 10.1021/acs.jctc.3c00920). "
        "Conformers are the post-optimization geometries shared by XtalPi. "
        "Each conformer will be converged according to the 'GAU_LOOSE' criteria."
    ),
}

# Run the molecules prepared above through the factory.
dataset = dataset_factory.create_dataset(
    molecules=all_molecules,
    **dataset_info,
)

# Provenance: who submitted the dataset and where its documentation lives.
submission_url = (
    "https://github.com/openforcefield/qca-dataset-submission/tree/master/"
    "submissions/"
    "2024-01-30-xtalpi-shared-fragments-torsiondrive-v1.0"
)
dataset.metadata.submitter = "lilyminium"
dataset.metadata.long_description_url = submission_url

Deduplication                 : 100%|████████| 424/424 [00:02<00:00, 177.47it/s]
Preparation                   : 100%|█████████| 170/170 [00:09<00:00, 18.48it/s]


In [7]:
# Display the dataset as a plain dictionary to inspect the QC specification
# and metadata before exporting. The output is long and may be truncated.
dataset.dict()

{'qc_specifications': {'default': {'method': 'B3LYP-D3BJ',
   'basis': 'DZVP',
   'program': 'psi4',
   'spec_name': 'default',
   'spec_description': 'Standard OpenFF optimization quantum chemistry specification.',
   'store_wavefunction': 'none',
   'implicit_solvent': None,
   'maxiter': 200,
   'scf_properties': ['dipole',
    'quadrupole',
    'wiberg_lowdin_indices',
    'mayer_indices'],
   'keywords': {}}},
 'driver': 'deferred',
 'priority': 'normal',
 'dataset_tags': ['openff'],
 'compute_tag': 'openff',
 'dataset_name': 'XtalPi Shared Fragments TorsiondriveDataset v1.0',
 'dataset_tagline': 'B3LYP-D3BJ/DZVP torsion drive of fragments shared by XtalPi.',
 'type': 'TorsionDriveDataset',
 'description': "A dataset containing representative fragments shared by XtalPi used in fitting the XFF force field (DOI: 10.1021/acs.jctc.3c00920). Conformers are the post-optimization geometries shared by XtalPi. Each conformer will be converged according to the 'GAU_LOOSE' criteria.",
 'meta

## Exporting dataset

In [8]:
# Names of the files written next to this notebook.
compressed_json = "dataset.json.bz2"
smiles_file = "dataset.smi"
overview_pdf = "dataset.pdf"

dataset.export_dataset(compressed_json)
dataset.molecules_to_file(smiles_file, "smi")
dataset.visualize(overview_pdf, columns=8)

# Echo the QC specification(s) the exported dataset carries.
print(dataset.qc_specifications)

{'default': QCSpec(method='B3LYP-D3BJ', basis='DZVP', program='psi4', spec_name='default', spec_description='Standard OpenFF optimization quantum chemistry specification.', store_wavefunction=<WavefunctionProtocolEnum.none: 'none'>, implicit_solvent=None, maxiter=200, scf_properties=[<SCFProperties.Dipole: 'dipole'>, <SCFProperties.Quadrupole: 'quadrupole'>, <SCFProperties.WibergLowdinIndices: 'wiberg_lowdin_indices'>, <SCFProperties.MayerIndices: 'mayer_indices'>], keywords={})}


## Dataset information

In [9]:
import numpy as np
from collections import Counter

In [10]:
print("n_molecules:", dataset.n_molecules)
# `n_records` counts the records of this torsiondrive dataset, not its
# conformers (per-molecule conformer counts are computed separately below),
# so the value is labelled accordingly.
print("n_torsiondrives:", dataset.n_records)

n_molecules: 169
n_conformers: 169


In [11]:
# Walk the dataset molecules once and collect both per-molecule statistics.
conformer_counts = []
heavy_atom_counts = []
for mol in dataset.molecules:
    conformer_counts.append(mol.n_conformers)
    # Heavy atoms are all non-hydrogen atoms (atomic number > 1). Counting
    # them directly avoids converting every molecule to RDKit.
    heavy_atom_counts.append(
        sum(atom.atomic_number > 1 for atom in mol.atoms)
    )

n_confs = np.array(conformer_counts)
n_heavy_atoms = np.array(heavy_atom_counts)

In [12]:
# Summary of how many conformers each molecule carries.
conformer_stats = (n_confs.min(), n_confs.mean(), n_confs.max())
print("Number of conformers (min, mean, max):", *conformer_stats)

# Histogram of molecule sizes, listed in order of increasing heavy-atom count.
print("# heavy atoms")
counts = Counter(n_heavy_atoms)
for n_heavy, n_mols in sorted(counts.items()):
    print(f"{str(n_heavy):>3}: {n_mols}")

Number of conformers (min, mean, max): 1 2.485207100591716 6
# heavy atoms
  4: 1
  5: 1
  6: 4
  7: 4
  8: 4
  9: 8
 10: 3
 11: 15
 12: 12
 13: 26
 14: 21
 15: 16
 16: 19
 17: 8
 18: 5
 19: 12
 20: 5
 21: 4
 24: 1


In [13]:
# Distinct total formal charges in the dataset, as plain numbers in units of
# the elementary charge.
unique_charges = {
    mol.total_charge.m_as(unit.elementary_charge)
    for mol in dataset.molecules
}
unique_charges

{-2.0, -1.0, 0.0}

In [14]:
# Molecular weight of each molecule: the sum of its atomic masses, taken as
# bare magnitudes in whatever unit the toolkit reports them.
molecular_weights = []
for mol in dataset.molecules:
    molecular_weights.append(sum(atom.mass.m for atom in mol.atoms))
masses = np.array(molecular_weights)

mass_stats = (masses.min(), masses.mean(), masses.max())
print("MW (min, mean, max):", *mass_stats)

MW (min, mean, max): 78.49803100000001 201.1501543544379 314.3822960000001


In [15]:
# Every chemical element that occurs in at least one dataset molecule.
elements = {
    atom.symbol
    for mol in dataset.molecules
    for atom in mol.atoms
}
print(elements)

{'C', 'H', 'O', 'P', 'F', 'N', 'Cl', 'S', 'Br'}
