# Generation process

This notebook documents the generation of a TorsionDrive dataset for the 16 unique dinucleoside monophosphates (DNMPs), 5'-XpY-3', of RNA. For each DNMP, we will perform a 1-D torsion scan of
- the five backbone dihedrals around the central phosphodiester: epsilon, zeta, alpha, beta, and gamma
- the glycosidic dihedral chi and the 2' hydroxyl dihedral for each nucleoside
- the terminal dihedrals HO5'-O5'-C5'-C4' and O5'-C5'-C4'-C3' for the 5' residue
- the terminal dihedral C4'-C3'-O3'-HO3' for the 3' residue

We will not scan in-ring dihedrals in the sugar, including the backbone dihedral delta.

Non-driven dihedrals will be constrained to reference values rounded to the nearest multiple of 15 deg:
- Backbone dihedrals: values in an A-form helix (60, 75, -150, -75, -60, 180, 60, and 75 for 5' gamma, 5' delta, epsilon, zeta, alpha, beta, 3' gamma, 3' delta). Note that the code below uses an unrounded reference value of 81 for delta.
- Glycosidic dihedrals: anti value (-150)
- Hydroxyl dihedrals: values to avoid intramolecular hydrogen bonds in an A-form helix (-120 for C3'-C2'-O2'-HO2', -60 for C4'-C3'-O3'-HO3', and 180 for HO5'-O5'-C5'-C4')

## Setup

In [1]:
from collections import Counter
from pathlib import Path

import numpy
import tqdm

from openff.qcsubmit.datasets import TorsiondriveDataset
from openff.qcsubmit.factories import TorsiondriveDatasetFactory
from openff.qcsubmit.workflow_components import TorsionIndexer
from openff.toolkit import Molecule
from openff.units import unit
import qcelemental
import qcportal
from rdkit import Chem
from rdkit.Chem import AllChem

In [2]:
# SMILES fragments from which each DNMP SMILES is assembled.
# Nucleobase fragments, keyed by one-letter code; each one is substituted for the
# `{nucleobase}` placeholder of the ribonucleoside template below.
nucleobases = dict(
    A="n2c3ncnc(N)c3nc2",
    C="n2c(=O)nc(N)cc2",
    G="n2c3nc(N)[nH]c(=O)c3nc2",
    U="n2c(=O)[nH]c(=O)cc2",
)

# Linker placed between the two ribonucleoside fragments
phosphodiester = "P(=O)([O-])"

# Ribonucleoside template with a format placeholder for the nucleobase fragment
ribonucleoside = "OC[C@H]1O[C@@H]({nucleobase})[C@H](O)[C@@H]1O"

# Dihedral SMARTS patterns.
# In each "SMARTS" entry the atom map numbers :1 to :4 tag the four atoms that define the dihedral;
# the untagged atoms only provide the chemical context that the match must satisfy.
# "Reference" is the value (in degrees) that the dihedral is set to in the reference structure.

# Untagged sugar pattern spliced into the epsilon/zeta/alpha/beta patterns so that those patterns
# also require the sugar on the other side of the phosphate.
ribofuranose_smarts = "[#6X4][#6X4@H]@1@[#8X2]@[#6X4@@H](~[*])@[#6X4@H]([#8X2H1])@[#6X4@@H]@1"
# Ring patterns used as recursive SMARTS alternatives for the fourth atom of the chi dihedral
purine_smarts = "[#6x3]@3(@[#7x2]@2)@[#7x2]@[#6x2]@[#7x2]@[#6x2]@[#6x3]@3@[#7x2]@[#6x2]@2"
pyrimidine_smarts = "[#6x2](@[#7x2]@2)@[#7x2]@[#6x2]@[#6x2]@[#6x2]@2"
reference_dihedrals = {
    # Terminal hydroxyl dihedral H-O-C-C (HO5'-O5'-C5'-C4' in the introduction)
    "5pOH": {
        "SMARTS": "[#1:1][#8X2:2][#6X4:3][#6X4@H:4]@1@[#8X2]@[#6X4@@H](~[*])@[#6X4@H]([#8X2H1])@[#6X4@@H]@1[#8X2]",
        "Reference": 180,
    },
    "gamma": {
        "SMARTS": "[#8X2:1][#6X4:2][#6X4@H:3]@1@[#8X2]@[#6X4@@H](~[*])@[#6X4@H]([#8X2H1])@[#6X4@@H:4]@1[#8X2]",
        "Reference": 60,
    },
    # NOTE(review): the introduction quotes 75 deg for delta and the sugar pucker check in the
    # reference-structure cell is centered on 75, but the reference used here is 81 -- confirm
    # which value is intended.
    "delta": {
        "SMARTS": "[#8X2][#6X4:1][#6X4@H:2]@1@[#8X2]@[#6X4@@H](~[*])@[#6X4@H]([#8X2H1])@[#6X4@@H:3]@1[#8X2:4]",
        "Reference": 81,
    },
    # nu0 to nu4: dihedrals whose four tagged atoms all lie in the sugar ring
    "nu4": {
        "SMARTS": "[#8X2][#6X4][#6X4@H:3]@1@[#8X2:2]@[#6X4@@H:1](~[*])@[#6X4@H]([#8X2H1])@[#6X4@@H:4]@1[#8X2]",
        "Reference": 24,
    },
    "nu3": {
        "SMARTS": "[#8X2][#6X4][#6X4@H:2]@1@[#8X2:1]@[#6X4@@H](~[*])@[#6X4@H:4]([#8X2H1])@[#6X4@@H:3]@1[#8X2]",
        "Reference": -38,
    },
    "nu2": {
        "SMARTS": "[#8X2][#6X4][#6X4@H:1]@1@[#8X2]@[#6X4@@H:4](~[*])@[#6X4@H:3]([#8X2H1])@[#6X4@@H:2]@1[#8X2]",
        "Reference": 38,
    },
    "nu1": {
        "SMARTS": "[#8X2][#6X4][#6X4@H]@1@[#8X2:4]@[#6X4@@H:3](~[*])@[#6X4@H:2]([#8X2H1])@[#6X4@@H:1]@1[#8X2]",
        "Reference": -24,
    },
    "nu0": {
        "SMARTS": "[#8X2][#6X4][#6X4@H:4]@1@[#8X2:3]@[#6X4@@H:2](~[*])@[#6X4@H:1]([#8X2H1])@[#6X4@@H]@1[#8X2]",
        "Reference": 0,
    },
    # Glycosidic dihedral: ring oxygen, ring carbon, nitrogen, then an atom of the purine or
    # pyrimidine ring pattern
    "chi": {
        "SMARTS": f"[#8X2][#6X4][#6X4@H]@1@[#8X2:1]@[#6X4@@H:2]([#7x2:3]~[$({purine_smarts}),$({pyrimidine_smarts}):4])@[#6X4@H]([#8X2H1])@[#6X4@@H]@1[#8X2]",
        "Reference": -150,
    },
    # Hydroxyl dihedral on the ring carbon bearing the [#8X2H1] substituent (C3'-C2'-O2'-HO2' in
    # the introduction)
    "2pOH": {
        "SMARTS": "[#8X2][#6X4][#6X4@H]@1@[#8X2]@[#6X4@@H](~[*])@[#6X4@H:2]([#8X2:3][#1:4])@[#6X4@@H:1]@1[#8X2]",
        "Reference": -120,
    },
    # epsilon, zeta, alpha, and beta involve the phosphorus, so their patterns include the
    # phosphate and the sugar pattern on its far side
    "epsilon": {
        "SMARTS": f"[#8X2][#6X4][#6X4@H:1]@1@[#8X2]@[#6X4@@H](~[*])@[#6X4@H]([#8X2H1])@[#6X4@@H:2]@1[#8X2:3][#15X4:4](~[#8X1])(~[#8X1])[#8X2]{ribofuranose_smarts}",
        "Reference": -150,
    },
    "zeta": {
        "SMARTS": f"[#8X2][#6X4][#6X4@H]@1@[#8X2]@[#6X4@@H](~[*])@[#6X4@H]([#8X2H1])@[#6X4@@H:1]@1[#8X2:2][#15X4:3](~[#8X1])(~[#8X1])[#8X2:4]{ribofuranose_smarts}",
        "Reference": -75,
    },
    "alpha": {
        "SMARTS": f"{ribofuranose_smarts}[#8X2:1][#15X4:2](~[#8X1])(~[#8X1])[#8X2:3][#6X4:4][#6X4@H]@1@[#8X2]@[#6X4@@H](~[*])@[#6X4@H]([#8X2H1])@[#6X4@@H]@1[#8X2]",
        "Reference": -60,
    },
    "beta": {
        "SMARTS": f"{ribofuranose_smarts}[#8X2][#15X4:1](~[#8X1])(~[#8X1])[#8X2:2][#6X4:3][#6X4@H:4]@1@[#8X2]@[#6X4@@H](~[*])@[#6X4@H]([#8X2H1])@[#6X4@@H]@1[#8X2]",
        "Reference": 180,
    },
    # Terminal hydroxyl dihedral C-C-O-H (C4'-C3'-O3'-HO3' in the introduction)
    "3pOH": {
        "SMARTS": "[#8X2][#6X4][#6X4@H:1]@1@[#8X2]@[#6X4@@H](~[*])@[#6X4@H]([#8X2H1])@[#6X4@@H:2]@1[#8X2:3][#1:4]",
        "Reference": -60,
    },
}

# Every dihedral outside the sugar ring (i.e. everything except delta and the nu torsions) is
# scanned on a 15 deg grid. Entries without a "Spacing" key are never scanned.
for dihedral in reference_dihedrals:
    is_ring_dihedral = dihedral == "delta" or dihedral.startswith("nu")
    if not is_ring_dihedral:
        reference_dihedrals[dihedral]["Spacing"] = 15

In [3]:
def get_downstream_atoms(rdmol, anchor_idx, query_idx):
    """
    Collects the atoms that lie on the query side of the anchor-query bond.

    When the anchor-query bond is a ring bond, the search steps across at most one further ring
    bond: a ring neighbor of the query atom is included together with its non-ring substituents,
    but the walk does not continue around the ring from it.

    Parameters
    ----------
    rdmol
        Molecule providing GetNumAtoms, GetAtomWithIdx, and GetBondBetweenAtoms.
    anchor_idx
        Index of the atom on the fixed side of the bond.
    query_idx
        Index of the atom on the moving side of the bond. Must be bonded to the anchor atom.

    Returns
    -------
    List of downstream atom indices in ascending order, excluding the anchor and query atoms.
    """

    def bond_in_ring(idx_a, idx_b):
        return rdmol.GetBondBetweenAtoms(idx_a, idx_b).IsInRing()

    ring_mode = bond_in_ring(anchor_idx, query_idx)
    seen = {anchor_idx, query_idx}
    pending = []

    # Seed the search with the direct neighbors of the query atom
    for first_shell in rdmol.GetAtomWithIdx(query_idx).GetNeighbors():
        first_idx = first_shell.GetIdx()
        if first_idx == anchor_idx:
            continue
        seen.add(first_idx)

        if not (ring_mode and bond_in_ring(query_idx, first_idx)):
            # Ordinary neighbor: everything behind it is downstream
            pending.append(first_idx)
            continue

        # Ring neighbor: only its substituents outside the ring are followed
        for second_shell in first_shell.GetNeighbors():
            second_idx = second_shell.GetIdx()
            if bond_in_ring(first_idx, second_idx):
                continue
            seen.add(second_idx)
            pending.append(second_idx)

    # Depth-first walk over everything reachable from the seeds
    while pending:
        current_atom = rdmol.GetAtomWithIdx(pending.pop())
        for next_atom in current_atom.GetNeighbors():
            next_idx = next_atom.GetIdx()
            if next_idx in seen:
                continue
            seen.add(next_idx)
            pending.append(next_idx)

    seen -= {anchor_idx, query_idx}
    return sorted(seen)

def set_ring_dihedral(conformer, atom_idx_i, atom_idx_j, atom_idx_k, atom_idx_l, dihedral_value):
    """
    Rotates part of a conformer so that the i-j-k-l dihedral takes the requested value (in degrees)
    when the j-k bond belongs to a ring. Based on
    https://github.com/rdkit/rdkit/blob/693796d7af78a9f6ce54771747aee732776bb3c6/Code/GraphMol/MolTransforms/MolTransforms.cpp#L612

    Only the atoms returned by get_downstream_atoms for the j-k bond are moved; they are rotated
    about the j-to-k axis. The conformer is modified in place and nothing is returned.
    """
    # The rotation needed is the requested value minus the value currently in the conformer
    current_value = Chem.rdMolTransforms.GetDihedralDeg(conformer, atom_idx_i, atom_idx_j, atom_idx_k, atom_idx_l)
    theta = numpy.deg2rad(dihedral_value - current_value)

    # Unit vector along the j-to-k bond, with atom j as the pivot point
    coords = conformer.GetPositions()
    pivot = coords[atom_idx_j]
    axis = coords[atom_idx_k] - pivot
    axis = axis / numpy.linalg.norm(axis)
    ux, uy, uz = axis

    # Axis-angle rotation matrix
    c = numpy.cos(theta)
    s = numpy.sin(theta)
    t = 1 - c
    rotation = numpy.array(
        [
            [t * ux * ux + c, t * ux * uy - s * uz, t * ux * uz + s * uy],
            [t * uy * ux + s * uz, t * uy * uy + c, t * uy * uz - s * ux],
            [t * uz * ux - s * uy, t * uz * uy + s * ux, t * uz * uz + c],
        ]
    )

    # Move each downstream atom: shift to the pivot, rotate, and shift back
    movable_atoms = get_downstream_atoms(conformer.GetOwningMol(), atom_idx_j, atom_idx_k)
    for atom_idx in movable_atoms:
        shifted = coords[atom_idx] - pivot
        conformer.SetAtomPosition(atom_idx, numpy.dot(rotation, shifted) + pivot)

## Create reference structures

In [4]:
def _tagged_atom_order(query_rdmol):
    """
    Returns the indices of the mapped atoms of a SMARTS query molecule, ordered by atom map number,
    so that `match[i] for i in ...` yields the matched atoms in dihedral order.
    """
    idx_map = {
        atom.GetAtomMapNum() - 1: atom.GetIdx()
        for atom in query_rdmol.GetAtoms()
        if atom.GetAtomMapNum() != 0
    }
    return [idx_map[i] for i in sorted(idx_map)]


# Create the output directory up front so that this cell also runs in a fresh working directory
reference_dir = Path("initial-conformers")
reference_dir.mkdir(parents=True, exist_ok=True)

# The sugar pucker is estimated from delta: an embedding is accepted when delta of every
# nucleoside is within pucker_tolerance of c3_endo_delta (both in deg). The delta query is the
# same for every DNMP and every embedding attempt, so it is parsed once here.
c3_endo_delta = 75
pucker_tolerance = 15
delta_query_rdmol = Chem.MolFromSmarts(reference_dihedrals["delta"]["SMARTS"])
delta_map_list = _tagged_atom_order(delta_query_rdmol)

for nucleobase_1, nucleobase_smiles_1 in nucleobases.items():
    for nucleobase_2, nucleobase_smiles_2 in nucleobases.items():
        dnmp_name = f"{nucleobase_1}{nucleobase_2}"

        # Build RDKit Molecule from SMILES
        dnmp_smiles = ribonucleoside.format(nucleobase=nucleobase_smiles_1) + phosphodiester + ribonucleoside.format(nucleobase=nucleobase_smiles_2)
        rdmol = Chem.MolFromSmiles(dnmp_smiles, sanitize=False)
        Chem.SanitizeMol(rdmol, Chem.SANITIZE_ALL ^ Chem.SANITIZE_ADJUSTHS ^ Chem.SANITIZE_SETAROMATICITY)
        Chem.SetAromaticity(rdmol, Chem.AromaticityModel.AROMATICITY_MDL)
        Chem.AssignStereochemistry(rdmol)
        rdmol = Chem.AddHs(rdmol)

        # Generate a random conformer for the DNMP.
        # Setting ring conformations is difficult, so regenerate until both nucleosides have a C3' endo sugar pucker.
        embed_seed = 0
        while True:
            embed_seed += 1
            rdmol.RemoveAllConformers()
            conformer_id = AllChem.EmbedMolecule(rdmol, randomSeed=embed_seed, useRandomCoords=True)
            if conformer_id < 0:
                # Embedding failed for this seed and left no conformer behind; try the next seed
                continue
            conformer = rdmol.GetConformer()

            # Estimate sugar pucker from values of the delta backbone dihedral
            delta_matches = rdmol.GetSubstructMatches(delta_query_rdmol, uniquify=True, useChirality=True)
            if not delta_matches:
                # Without any match the pucker test below would pass vacuously
                raise ValueError(f"Delta SMARTS had no matches for {dnmp_name}")
            conformer_delta_values = numpy.array([
                Chem.rdMolTransforms.GetDihedralDeg(conformer, *tuple(match[i] for i in delta_map_list))
                for match in delta_matches
            ])
            if not numpy.any(numpy.abs(conformer_delta_values - c3_endo_delta) > pucker_tolerance):
                break

        print(f"Conformer embedding seed for {dnmp_name} was {embed_seed}")

        # Set dihedral angles to reference values
        conformer_dihedrals = list()
        for dihedral, reference in reference_dihedrals.items():
            query_rdmol = Chem.MolFromSmarts(reference["SMARTS"])
            map_list = _tagged_atom_order(query_rdmol)
            for match in rdmol.GetSubstructMatches(query_rdmol, uniquify=True, useChirality=True):
                match_indices = tuple(match[i] for i in map_list)
                conformer_dihedrals.append(
                    {
                        "Dihedral": dihedral,
                        "Indices": match_indices,
                        "Reference Value": reference["Reference"],
                        "Initial Value": Chem.rdMolTransforms.GetDihedralDeg(conformer, *match_indices),
                    }
                )

                # Ring dihedrals for the ribose are overspecified. Set sugar pucker through delta, nu2, and nu1.
                if dihedral in {"nu4", "nu3", "nu0"}:
                    continue
                elif dihedral in {"delta", "nu2", "nu1"}:
                    set_ring_dihedral(conformer, *match_indices, reference["Reference"])
                else:
                    Chem.rdMolTransforms.SetDihedralDeg(conformer, *match_indices, reference["Reference"])

        # Print initial and final dihedral angle values
        for conformer_dihedral in conformer_dihedrals:
            dihedral = conformer_dihedral["Dihedral"]
            match_indices = conformer_dihedral["Indices"]
            reference_value = conformer_dihedral["Reference Value"]
            initial_value = conformer_dihedral["Initial Value"]
            final_value = Chem.rdMolTransforms.GetDihedralDeg(conformer, *match_indices)
            print(
                f"{dnmp_name:2s} {dihedral:7s} {match_indices[0]:2d} {match_indices[1]:2d} {match_indices[2]:2d} {match_indices[3]:2d} "
                f"{reference_value:7.2f} {initial_value:7.2f} {final_value:7.2f}"
            )

        # Write reference structure to SDF. The file name is passed as a plain string.
        with Chem.SDWriter(str(reference_dir / f"{dnmp_name}-reference.sdf")) as sdf_writer:
            sdf_writer.write(rdmol)

Conformer embedding seed for AA was 17
AA 5pOH    41  0  1  2  180.00  170.19 -180.00
AA gamma    0  1  2 17   60.00  -64.09   60.00
AA gamma   22 23 24 39   60.00  -66.77   60.00
AA delta    1  2 17 18   81.00   76.09   81.00
AA delta   23 24 39 40   81.00   73.15   81.00
AA nu4      4  3  2 17   24.00   36.23   25.24
AA nu4     26 25 24 39   24.00   48.97   24.65
AA nu3      3  2 17 15  -38.00  -38.50  -38.07
AA nu3     25 24 39 37  -38.00  -38.64  -36.67
AA nu2      2 17 15  4   38.00   27.79   38.00
AA nu2     24 39 37 26   38.00   16.12   38.00
AA nu1     17 15  4  3  -24.00  -23.54  -24.00
AA nu1     39 37 26 25  -24.00  -21.86  -24.00
AA nu0     15  4  3  2    0.00   -0.32   -0.32
AA nu0     37 26 25 24    0.00   -0.74   -0.74
AA chi      3  4  5  6 -150.00  176.19 -150.00
AA chi     25 26 27 28 -150.00   61.17 -150.00
AA 2pOH    17 15 16 51 -120.00   72.48 -120.00
AA 2pOH    39 37 38 62 -120.00   92.67 -120.00
AA epsilon  2 17 18 19 -150.00 -125.63 -150.00
AA zeta    17 18 19 2

## Generate initial structures for torsion scans

In [5]:
# Maps DNMP name -> scan dihedral name -> {"Indices": atom index 4-tuple, "Spacing": grid spacing}.
# Filled in below and consumed when the dataset is assembled.
scan_dihedrals = dict()
for nucleobase_1 in nucleobases.keys():
    for nucleobase_2 in nucleobases.keys():
        # Read reference structure from SDF
        dnmp_name = f"{nucleobase_1}{nucleobase_2}"
        reference_path = Path("initial-conformers", f"{dnmp_name}-reference.sdf")
        with Chem.SDMolSupplier(
            str(reference_path),
            removeHs=False,
            sanitize=False,
            strictParsing=True,
        ) as sdf_reader:
            ref_rdmol = sdf_reader[0]

        # The supplier yields None for a record it cannot parse; fail here with a clear message
        if ref_rdmol is None:
            raise ValueError(f"Could not read reference structure for {dnmp_name} from {reference_path}")

        Chem.SanitizeMol(ref_rdmol, Chem.SANITIZE_ALL ^ Chem.SANITIZE_ADJUSTHS ^ Chem.SANITIZE_SETAROMATICITY)
        Chem.AssignStereochemistryFrom3D(ref_rdmol)
        Chem.SetAromaticity(ref_rdmol, Chem.AromaticityModel.AROMATICITY_MDL)

        # Loop over dihedral angles to be scanned
        scan_dihedrals[dnmp_name] = dict()
        for dihedral, reference in reference_dihedrals.items():
            # Only dihedrals that were given a scan spacing are driven
            if "Spacing" not in reference:
                continue

            # Get dihedral SMARTS matches. map_list holds the query atom indices of the mapped
            # atoms ordered by atom map number.
            query_rdmol = Chem.MolFromSmarts(reference["SMARTS"])
            idx_map = {
                atom.GetAtomMapNum() - 1: atom.GetIdx()
                for atom in query_rdmol.GetAtoms()
                if atom.GetAtomMapNum() != 0
            }
            map_list = [idx_map[i] for i in sorted(idx_map)]
            dihedral_matches = ref_rdmol.GetSubstructMatches(query_rdmol, uniquify=True, useChirality=True)

            # A dihedral found once keeps its name; one found twice is labelled by match order
            if len(dihedral_matches) == 1:
                scan_dihedral_names = (dihedral,)
            elif len(dihedral_matches) == 2:
                scan_dihedral_names = (f"5p{dihedral}", f"3p{dihedral}")
            else:
                raise ValueError(f"Query SMARTS {reference['SMARTS']} had {len(dihedral_matches)} matches for {nucleobase_1}{nucleobase_2}")

            # Scan grid: every grid point except the reference value itself, wrapped to [-180, 180).
            # It is the same for every match of this dihedral, so compute it once.
            scan_values = numpy.arange(
                reference["Reference"] + reference["Spacing"], reference["Reference"] + 360, reference["Spacing"]
            )
            scan_values = (scan_values + 180) % 360 - 180.0

            # Loop over dihedral SMARTS matches
            for scan_dihedral_name, match in zip(scan_dihedral_names, dihedral_matches):
                match_indices = tuple(match[i] for i in map_list)
                scan_dihedrals[dnmp_name][scan_dihedral_name] = {"Indices": match_indices, "Spacing": reference["Spacing"]}

                # scan_rdmol starts with the reference conformer and collects one conformer per
                # scan value; working_conformer is a scratch copy that is rotated in place
                scan_rdmol = Chem.Mol(ref_rdmol)
                working_rdmol = Chem.Mol(ref_rdmol)
                working_conformer = working_rdmol.GetConformer()

                # Loop over scan values
                for scan_value in scan_values:
                    Chem.rdMolTransforms.SetDihedralDeg(working_conformer, *match_indices, scan_value)
                    scan_rdmol.AddConformer(working_conformer, assignId=True)

                # Write scan conformers to SDF
                scan_path = Path("initial-conformers", f"{dnmp_name}-{scan_dihedral_name}-initial-conformers.sdf")
                with Chem.SDWriter(str(scan_path)) as sdf_writer:
                    for scan_conformer in scan_rdmol.GetConformers():
                        sdf_writer.write(scan_rdmol, confId=scan_conformer.GetId())

In [7]:
# Spot check: collect the AU 5' chi scan conformers read from the SDF onto the first molecule and
# view them together.
chi_scan_mols = Molecule.from_file("initial-conformers/AU-5pchi-initial-conformers.sdf", allow_undefined_stereo=True)
chi_scan_mol = chi_scan_mols[0]
for extra_mol in chi_scan_mols[1:]:
    chi_scan_mol.add_conformer(extra_mol.conformers[0])
chi_scan_mol.visualize(backend="nglview")

NGLWidget(max_frame=23)

## Set up TorsionDrive dataset

In [22]:
# Dataset-level metadata
dataset = TorsiondriveDataset(
    dataset_name="OpenFF RNA Dinucleoside Monophosphate TorsionDrives v1.0",
    dataset_tagline="TorsionDrives of non-ring backbone, glycosidic, and hydroxyl dihedrals in RNA XpY 2-mers.",
    description=(
        "TorsionDrives for RNA dinucleoside monophosphates (DNMP), i.e. XpY 2-mers, "
        "on epsilon, zeta, alpha, beta, gamma, chi, 2' OH, 3' OH, and 5' OH "
        "with non-driven dihedrals constrained to A-form helix references values."
    ),
)
dataset.metadata.submitter = "chapincavender"
dataset.metadata.long_description_url = (
    "https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/"
    "2024-03-26-OpenFF-RNA-Dinucleoside-Monophosphate-TorsionDrives-v1.0"
)

# One dataset entry per (DNMP, scanned dihedral) pair. Each entry drives a single dihedral and
# constrains every other non-ring reference dihedral, plus delta, to its reference value.
dnmp_names = [first_base + second_base for first_base in nucleobases for second_base in nucleobases]
for dnmp_name in dnmp_names:
    for scan_dihedral_name, scan_dihedral in scan_dihedrals[dnmp_name].items():
        scan_dihedral_indices = scan_dihedral["Indices"]
        mol_index = f"{dnmp_name}-{scan_dihedral_name}"

        # The SDF holds one record per initial conformer; gather them all on the first molecule
        conformer_mols = Molecule.from_file(
            Path("initial-conformers", f"{mol_index}-initial-conformers.sdf")
        )
        offmol = conformer_mols[0]
        for extra_mol in conformer_mols[1:]:
            offmol.add_conformer(extra_mol.conformers[0])

        print(
            f"{dnmp_name:2s} {scan_dihedral_name:7s} {scan_dihedral_indices[0]:2d} {scan_dihedral_indices[1]:2d} "
            f"{scan_dihedral_indices[2]:2d} {scan_dihedral_indices[3]:2d} ({offmol.n_conformers:2d} conformers)"
        )

        # Register the driven dihedral with its scan range and grid spacing
        torsiondrive_keywords = {"dihedral_ranges": [(-180, 165)], "grid_spacing": [scan_dihedral["Spacing"]]}
        dataset.add_molecule(
            index=mol_index,
            molecule=offmol,
            dihedrals=[scan_dihedral_indices],
            keywords=torsiondrive_keywords,
        )

        # Constrain the non-driven torsions. Delta is included; the nu sugar ring torsions are not.
        found_scan_match = False
        for dihedral, reference in reference_dihedrals.items():
            if not dihedral.startswith("nu"):
                for match_indices in offmol.chemical_environment_matches(reference["SMARTS"]):
                    if match_indices != scan_dihedral_indices:
                        dataset.dataset[mol_index].add_constraint(
                            constraint="set",
                            constraint_type="dihedral",
                            indices=match_indices,
                            value=reference["Reference"],
                        )
                    else:
                        # This is the driven dihedral itself, so it must stay unconstrained
                        found_scan_match = True

        # The driven dihedral has to be found again among the matches on the molecule read back
        # from the SDF; otherwise its atom indices do not correspond to this molecule
        if not found_scan_match:
            raise ValueError(f"No match found for {dnmp_name}")

AA 5pOH    41  0  1  2 (24 conformers)
AA 5pgamma  0  1  2 17 (24 conformers)
AA 3pgamma 22 23 24 39 (24 conformers)
AA 5pchi    3  4  5  6 (24 conformers)
AA 3pchi   25 26 27 28 (24 conformers)
AA 5p2pOH  17 15 16 51 (24 conformers)
AA 3p2pOH  39 37 38 62 (24 conformers)
AA epsilon  2 17 18 19 (24 conformers)
AA zeta    17 18 19 22 (24 conformers)
AA alpha   18 19 22 23 (24 conformers)
AA beta    19 22 23 24 (24 conformers)
AA 3pOH    24 39 40 64 (24 conformers)
AC 5pOH    39  0  1  2 (24 conformers)
AC 5pgamma  0  1  2 17 (24 conformers)
AC 3pgamma 22 23 24 37 (24 conformers)
AC 5pchi    3  4  5  6 (24 conformers)
AC 3pchi   25 26 27 28 (24 conformers)
AC 5p2pOH  17 15 16 49 (24 conformers)
AC 3p2pOH  37 35 36 60 (24 conformers)
AC epsilon  2 17 18 19 (24 conformers)
AC zeta    17 18 19 22 (24 conformers)
AC alpha   18 19 22 23 (24 conformers)
AC beta    19 22 23 24 (24 conformers)
AC 3pOH    24 37 38 62 (24 conformers)
AG 5pOH    42  0  1  2 (24 conformers)
AG 5pgamma  0  1  2 17 (2

## Export and describe dataset

In [23]:
# Write the dataset itself, the molecules as SMILES, and a PDF overview
dataset.export_dataset("dataset.json.bz2")
dataset.molecules_to_file("dataset.smi", "smi")
dataset.visualize("dataset.pdf", columns=8)

# Show the QC specifications attached to the dataset
qc_specifications = dataset.qc_specifications
print(qc_specifications)

{'default': QCSpec(method='B3LYP-D3BJ', basis='DZVP', program='psi4', spec_name='default', spec_description='Standard OpenFF optimization quantum chemistry specification.', store_wavefunction=<WavefunctionProtocolEnum.none: 'none'>, implicit_solvent=None, maxiter=200, scf_properties=[<SCFProperties.Dipole: 'dipole'>, <SCFProperties.Quadrupole: 'quadrupole'>, <SCFProperties.WibergLowdinIndices: 'wiberg_lowdin_indices'>, <SCFProperties.MayerIndices: 'mayer_indices'>], keywords={})}


## Dataset information

In [24]:
# Summary statistics of the dataset.
# Walk the dataset molecules once and reuse the list for every statistic below.
dataset_molecules = list(dataset.molecules)

print("n_molecules:", dataset.n_molecules)
print("n_conformers:", dataset.n_records)

n_confs = numpy.array([mol.n_conformers for mol in dataset_molecules])
print(
    "Number of conformers (min, mean, max):",
    n_confs.min(), n_confs.mean(), n_confs.max()
)

# Histogram of heavy atom counts
n_heavy_atoms = numpy.array(
    [mol.to_rdkit().GetNumHeavyAtoms() for mol in dataset_molecules]
)
counts = Counter(n_heavy_atoms)
print("# heavy atoms")
for n_heavy in sorted(counts):
    print(f"{str(n_heavy):>3}: {counts[n_heavy]}")

unique_charges = {
    mol.total_charge.m_as(unit.elementary_charge)
    for mol in dataset_molecules
}
print("Unique formal charges:", unique_charges)

# Molecular weight of each molecule as the sum of its atomic masses
masses = numpy.array([
    sum(atom.mass.m for atom in mol.atoms)
    for mol in dataset_molecules
])
print("MW (min, mean, max):", masses.min(), masses.mean(), masses.max())

elements = {
    atom.symbol
    for mol in dataset_molecules
    for atom in mol.atoms
}
print(elements)

n_molecules: 16
n_conformers: 192
Number of conformers (min, mean, max): 24 24.0 24
# heavy atoms
 37: 48
 39: 48
 40: 48
 41: 12
 42: 24
 43: 12
Unique formal charges: {-1.0}
MW (min, mean, max): 547.3920101999988 579.9088966999988 627.4404501999985
{'H', 'O', 'C', 'P', 'N'}
