# PDB-CCD TM

## Import Packages and Datasets

In [1]:

import copy
import os
import json
import traceback
import requests
from collections import defaultdict
import importlib
import contextlib

import numpy as np
from datetime import date
import py3Dmol
import networkx as nx
from pdbeccdutils.core import ccd_reader

import rdkit
from rdkit import Chem
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')
from rdkit.Chem.Draw import IPythonConsole
IPythonConsole.molSize = 500, 500

import qcportal
from qcportal.molecules import Molecule
from qcportal.singlepoint import SinglepointDriver
from qcportal.singlepoint import QCSpecification
from qcportal.optimization import OptimizationSpecification
from qcelemental.physical_constants import constants

#client = qcportal.PortalClient("https://api.qcarchive.molssi.org:443", cache_dir=".")
from qcfractal.snowflake import FractalSnowflake
snowflake = FractalSnowflake()
client = snowflake.client()

import sanitize_tm
import scaffold


def view3D(molecule):
    """Format 3D view of RDKit molecule

    Args:
        molecule (rdkit.Chem.rdchem.Mol): RDKit molecule to be viewed

    Returns:
        py3Dmol.view: View object.
    """
    mol = Chem.Mol(molecule)
    view = py3Dmol.view(
        data=Chem.MolToMolBlock(mol),  # Convert the RDKit molecule for py3Dmol
        style={"stick": {}, "sphere": {"scale": 0.3}}
    )
    
    # Add atom labels (indices)
    for i, atom in enumerate(mol.GetAtoms()):
        pos = mol.GetConformer().GetAtomPosition(i)
        view.addLabel(str(i), {"position": {"x": pos.x, "y": pos.y, "z": pos.z}, 
                                "backgroundColor": "white", "backgroundOpacity": 0.2, "fontColor": "black"})

    return view


def topology_from_rdkit(rdkit_molecule):
    """Extract the molecule graph from a RDKit molecule

    Args:
        rdkit_molecule (rdkit.Chem.rdchem.Mol): RDKit molecule

    Returns:
        networkx.Grapth: Graph representation of a molecule
    """

    topology = nx.Graph()
    for atom in rdkit_molecule.GetAtoms():
        # Add the atoms as nodes
        topology.add_node(atom.GetIdx(), symbol=atom.GetSymbol())

        # Add the bonds as edges
        for bonded in atom.GetNeighbors():
            topology.add_edge(atom.GetIdx(), bonded.GetIdx())

    return topology


def is_isomorphic(rdkit_molecule1, rdkit_molecule2):
    """Compare two RDKit molecules and determine if their molecular graphs are the same

    Args:
        rdkit_molecule1 (rdkit.Chem.rdchem.Mol): First RDKit molecule
        rdkit_molecule2 (rdkit.Chem.rdchem.Mol): Second RDKit molecule

    Returns:
        bool: True if the molecular graphs of two molecules are the same.
    """
    def node_match(n1, n2):
        return n1["symbol"] == n2["symbol"]
    
    return nx.is_isomorphic(
        topology_from_rdkit(rdkit_molecule1),
        topology_from_rdkit(rdkit_molecule2),
        node_match=node_match
    )


def first_traceback(keyword="During handling of the above exception"):
    """Exatract the first traceback of an error and return those lines only

    Args:
        keyword (str, optional): Key phase to denote that all other output should be discarded.
        Defaults to "During handling of the above exception".

    Returns:
        str: Lines relating to the first traceback message
    """
    error_msg = traceback.format_exc().split("\n")
    ind = [i for i, x in enumerate(error_msg) if keyword in x]
    ind = ind[0] if len(ind) > 0 else len(error_msg)
    return "\n".join(error_msg[:ind])


def get_entry_cif(pdb_id):
    """
    Downloads and writes PDB entry updated mmCIF to a file and 
    returns the path

    Args:
        pdb_id (str): PDB id of an entry

    Returns:
        str: Path to the downloaded file
    """
    ENTRY_URL = "https://files.rcsb.org/ligands/download/"
    entry_url = os.path.join(ENTRY_URL, f"{pdb_id}.cif")
    response = requests.get(entry_url, stream=True)
    response.raise_for_status()
    with open(f"{pdb_id}.cif", "wb") as fh:
        fh.write(response.content)

    return f"{pdb_id}.cif"


def rdkit_to_smiles( rdmol, isomeric=True, explicit_hydrogens=True, mapped=False):
    """Take an RDKit molecule and output a desired Canonical SMILES string
    
    Based on openff-toolkit.RDKitToolkitWrapper.to_smiles

    Args:
        rdmol (rdkit.Chem.rdchem.Mol): RDKit molecule from which to extract a SMILES string
        isomeric (bool, optional): Choose whether the SMILES should include isomerism. Defaults to True.
        explicit_hydrogens (bool, optional): Choose whether hydrogens are included in SMILES or not. Defaults to True.
        mapped (bool, optional): Choose whether to include atom IDs for mapping purposes. Defaults to False.

    Returns:
        str: Canonical SMILES according to the input options
    """
    if not explicit_hydrogens:
        # remove the hydrogens from the molecule
        rdmol = Chem.RemoveHs(rdmol)
        
    if mapped:
        for atom in rdmol.GetAtoms():
            # the mapping must start from 1, as RDKit uses 0 to represent no mapping.
            atom.SetAtomMapNum(atom.GetIdx() + 1)

    return Chem.MolToSmiles(
        rdmol, isomericSmiles=isomeric, allHsExplicit=explicit_hydrogens, kekuleSmiles=True
    )


def smiles_dict(rdmol):
    """From an RDKit molecule, output a dict with the CMILES types required in OpenFF QCSubmit.

    Args:
        rdmol (rdkit.Chem.rdchem.Mol): RDKit molecule from which to extract CMILES strings.

    Returns:
        dict: Dictionary containing:
        
        - canonical_smiles: isomeric=False, explicit_hydrogens=False, mapped=False
        - canonical_isomeric_smiles: isomeric=True,  explicit_hydrogens=False, mapped=False
        - canonical_explicit_hydrogen_smiles: rdmol, isomeric=False, explicit_hydrogens=True,  mapped=False
        - canonical_isomeric_explicit_hydrogen_smiles: rdmol, isomeric=True,  explicit_hydrogens=True,  mapped=False
        - canonical_isomeric_explicit_hydrogen_mapped_smiles: rdmol, isomeric=True,  explicit_hydrogens=True,  mapped=True

    """
    return {
        "canonical_smiles": rdkit_to_smiles(rdmol, isomeric=False, explicit_hydrogens=False, mapped=False),
        "canonical_isomeric_smiles": rdkit_to_smiles(rdmol, isomeric=True,  explicit_hydrogens=False, mapped=False),
        "canonical_explicit_hydrogen_smiles": rdkit_to_smiles(rdmol, isomeric=False, explicit_hydrogens=True,  mapped=False),
        "canonical_isomeric_explicit_hydrogen_smiles": rdkit_to_smiles(rdmol, isomeric=True,  explicit_hydrogens=True,  mapped=False),
        "canonical_isomeric_explicit_hydrogen_mapped_smiles": rdkit_to_smiles(rdmol, isomeric=True,  explicit_hydrogens=True,  mapped=True),
    }



OSError: [Errno 66] Directory not empty: '/Users/jenniferclark/postgres'

In [None]:
chemical_space = "CS-A_primary"
with open(f"{chemical_space}.json", 'r') as f:
    dataset_dict = json.load(f)

print(f"There are {len(dataset_dict)} entries in the dataset {chemical_space}")

There are 151 entries in the dataset CS-A_primary


In [None]:
cif_path = "cifs"
os.makedirs(cif_path, exist_ok=True)
os.chdir(cif_path)
for label in dataset_dict:
    code = label.split("_")[1]
    if not os.path.isfile(f"{code}.cif"):
        get_entry_cif(code)
os.chdir("../")

## QC Dataset

### Get RDKit Molecules

In [None]:
error_sanitize_tm = defaultdict(list)
output = {}
for label, mol in dataset_dict.items():
    code = label.split("_")[1]
    metal = list(mol["metals"].keys())[0]
    rdmol_san = ccd_reader.read_pdb_cif_file(os.path.join(cif_path,f"{code}.cif"), sanitize=True).component.mol
    if rdmol_san.GetNumConformers() > 1:
        rdmol_san.RemoveConformer(0) # Remove openeye coord estimate

    try:
        tmc_mol = sanitize_tm.sanitize_complex(rdmol_san, mol["charge"])
    except Exception as e:
        error_sanitize_tm[str(e)[:39]].append([code, metal, mol["charge"], first_traceback()])
        continue
    
    output[label] = mol
    output[label]["rdmol"] = tmc_mol

print("\nXYZ2Mol Errors")
if error_sanitize_tm:
    for key, values in error_sanitize_tm.items():
        print(len(values), key)
else:
    print("None")
    
print(f"\nWe have {len(output)} left")

# Error with RDKit Sanitization: 'A1H1T', 'AOH', 'H79', 'L8W', 'NT3', 'S18', 'S31', 'WRK'


XYZ2Mol Errors
14 Molecule missing coordinates

We have 137 left


In [None]:
# View all structures to ensure they are viable, all 131 are viable
index = 11
#keys = list(output.keys())
#print(keys[index])

# ferrocenes
keys = ["data_670", 'data_6SG', 'data_84A', 'data_B6F', 'data_B9F', 'data_CFC', 'data_FEM', 'data_H57', 'data_H58', 'data_H79', 'data_HBF', 'data_L8W']
constraints = {
    "data_670": {"constraints": {"freeze": [{"type": "distance", "indices": [0, 16]}, {"type": "distance", "indices": [0, 17]}, {"type": "distance", "indices": [0, 18]}, {"type": "distance", "indices": [0, 19]}, {"type": "distance", "indices": [0, 20]},
                             {"type": "distance", "indices": [0, 21]}, {"type": "distance", "indices": [0, 22]}, {"type": "distance", "indices": [0, 23]}, {"type": "distance", "indices": [0, 24]}, {"type": "distance", "indices": [0, 25]},]},
    },
    "data_6SG": {"constraints": {"freeze": [{"type": "distance", "indices": [35, 25]}, {"type": "distance", "indices": [35, 26]}, {"type": "distance", "indices": [35, 27]}, {"type": "distance", "indices": [35, 28]}, {"type": "distance", "indices": [35, 29]},
                             {"type": "distance", "indices": [35, 30]}, {"type": "distance", "indices": [35, 31]}, {"type": "distance", "indices": [35, 32]}, {"type": "distance", "indices": [35, 33]}, {"type": "distance", "indices": [35, 34]},]},
    },
    "data_84A": {"constraints": {"freeze": [{"type": "distance", "indices": [1, 14]}, {"type": "distance", "indices": [1, 16]}, {"type": "distance", "indices": [1, 18]}, {"type": "distance", "indices": [1, 19]}, {"type": "distance", "indices": [1, 20]},
                             {"type": "distance", "indices": [1, 21]}, {"type": "distance", "indices": [1, 22]}, {"type": "distance", "indices": [1, 23]}, {"type": "distance", "indices": [1, 24]}, {"type": "distance", "indices": [1, 25]},]},
    },
    "data_B6F": {"constraints": {"freeze": [{"type": "distance", "indices": [0, 31]}, {"type": "distance", "indices": [0, 32]}, {"type": "distance", "indices": [0, 33]}, {"type": "distance", "indices": [0, 34]}, {"type": "distance", "indices": [0, 35]},
                             {"type": "distance", "indices": [0, 36]}, {"type": "distance", "indices": [0, 37]}, {"type": "distance", "indices": [0, 38]}, {"type": "distance", "indices": [0, 39]}, {"type": "distance", "indices": [0, 41]},]},
    },
    "data_B9F": {"constraints": {"freeze": [{"type": "distance", "indices": [25, 2]}, {"type": "distance", "indices": [25, 3]}, {"type": "distance", "indices": [25, 4]}, {"type": "distance", "indices": [25, 5]}, {"type": "distance", "indices": [25, 6]},
                             {"type": "distance", "indices": [25, 20]}, {"type": "distance", "indices": [25, 21]}, {"type": "distance", "indices": [25, 22]}, {"type": "distance", "indices": [25, 23]}, {"type": "distance", "indices": [25, 24]},]},
    },
    "data_CFC": {"constraints": {"freeze": [{"type": "distance", "indices": [14, 0]}, {"type": "distance", "indices": [14, 1]}, {"type": "distance", "indices": [14, 2]}, {"type": "distance", "indices": [14, 3]}, {"type": "distance", "indices": [14, 4]},
                             {"type": "distance", "indices": [14, 5]}, {"type": "distance", "indices": [14, 6]}, {"type": "distance", "indices": [14, 7]}, {"type": "distance", "indices": [14, 8]}, {"type": "distance", "indices": [14, 9]},]},
    },
    "data_FEM": {"constraints": {"freeze": [{"type": "distance", "indices": [0, 1]}, {"type": "distance", "indices": [0, 2]}, {"type": "distance", "indices": [0, 3]}, {"type": "distance", "indices": [0, 4]}, {"type": "distance", "indices": [0, 5]},
                             {"type": "distance", "indices": [0, 6]}, {"type": "distance", "indices": [0, 7]}, {"type": "distance", "indices": [0, 8]}, {"type": "distance", "indices": [0, 9]}, {"type": "distance", "indices": [0, 10]},]},
    },
    "data_H57": {"constraints": {"freeze": [{"type": "distance", "indices": [22, 12]}, {"type": "distance", "indices": [22, 13]}, {"type": "distance", "indices": [22, 14]}, {"type": "distance", "indices": [22, 15]}, {"type": "distance", "indices": [22, 16]},
                             {"type": "distance", "indices": [22, 17]}, {"type": "distance", "indices": [22, 18]}, {"type": "distance", "indices": [22, 19]}, {"type": "distance", "indices": [22, 20]}, {"type": "distance", "indices": [22, 21]},]},
    },
    "data_H58": {"constraints": {"freeze": [{"type": "distance", "indices": [24, 14]}, {"type": "distance", "indices": [24, 15]}, {"type": "distance", "indices": [24, 16]}, {"type": "distance", "indices": [24, 17]}, {"type": "distance", "indices": [24, 18]},
                             {"type": "distance", "indices": [24, 19]}, {"type": "distance", "indices": [24, 20]}, {"type": "distance", "indices": [24, 21]}, {"type": "distance", "indices": [24, 22]}, {"type": "distance", "indices": [24, 23]},]},
    },
    "data_H79": {"constraints": {"freeze": [{"type": "distance", "indices": [17, 16]}, {"type": "distance", "indices": [17, 18]}, {"type": "distance", "indices": [17, 19]}, {"type": "distance", "indices": [17, 20]}, {"type": "distance", "indices": [17, 21]},
                             {"type": "distance", "indices": [17, 22]}, {"type": "distance", "indices": [17, 23]}, {"type": "distance", "indices": [17, 24]}, {"type": "distance", "indices": [17, 25]}, {"type": "distance", "indices": [17, 26]},]},
    },
    "data_HBF": {"constraints": {"freeze": [{"type": "distance", "indices": [25, 2]}, {"type": "distance", "indices": [25, 3]}, {"type": "distance", "indices": [25, 4]}, {"type": "distance", "indices": [25, 5]}, {"type": "distance", "indices": [25, 6]},
                             {"type": "distance", "indices": [25, 20]}, {"type": "distance", "indices": [25, 21]}, {"type": "distance", "indices": [25, 22]}, {"type": "distance", "indices": [25, 23]}, {"type": "distance", "indices": [25, 24]},]},
    },
    "data_L8W": {"constraints": {"freeze": [{"type": "distance", "indices": [23, 7]}, {"type": "distance", "indices": [23, 8]}, {"type": "distance", "indices": [23, 24]}, {"type": "distance", "indices": [23, 25]}, {"type": "distance", "indices": [23, 26]},
                             {"type": "distance", "indices": [23, 4]}, {"type": "distance", "indices": [23, 5]}, {"type": "distance", "indices": [23, 6]}, {"type": "distance", "indices": [23, 21]}, {"type": "distance", "indices": [23, 22]},]},
    },
}
print(keys[index])
view3D(output[keys[index]]["rdmol"])
#IPythonConsole.drawMol3D(output[keys[index]]["rdmol"])

data_L8W


<py3Dmol.view at 0x15bd5cf10>

In [None]:
atm1 = output[keys[index]]["rdmol"].GetConformer().GetAtomPosition(22)
atm2 = output[keys[index]]["rdmol"].GetConformer().GetAtomPosition(60)
np.linalg.norm(np.array([atm1.x, atm1.y, atm1.z]) - np.array([atm2.x, atm2.y, atm2.z])) # Distance between C and H on the order of an Angstrom

1.0999999999999996

### Make QCElemental Molecules

In [None]:
# Extract/update data from RDKit molecule, prepare for making QCMolecule
bond_order_dict = {
    "SINGLE": 1,
    "DOUBLE": 2,
    "AROMATIC": 1.5,
    "DATIVE": 0, # this is 1 in rdkit, but it should be zero to determine multiplicity
    "TRIPLE": 3,
}
errors_smiles = defaultdict(list)
qc_input = {}
conformers_dict = defaultdict(list)
for label, mol_dict in output.items():
    
    rdconf = mol_dict['rdmol'].GetConformer()
    conformer = [list(x) for x in rdconf.GetPositions() / constants.bohr2angstroms] # Convert from nm to Bohr (a0)
    symbols = [a.GetSymbol() for a in mol_dict['rdmol'].GetAtoms()]
    connectivity = [(b.GetBeginAtomIdx(), b.GetBeginAtomIdx(), bond_order_dict[b.GetBondType().name]) for b in mol_dict['rdmol'].GetBonds()]
    molecular_weight = sum([a.GetMass() for a in mol_dict['rdmol'].GetAtoms()])
    try:
        identifiers = smiles_dict(mol_dict["rdmol"])
    except Exception as e:
        errors_smiles[str(e)[:39]].append([label, mol_dict["metals"], mol["charge"], first_traceback()]) 
        identifiers = {}
    identifiers["molecular_formula"] = Chem.rdMolDescriptors.CalcMolFormula(mol_dict['rdmol'])

    name = f"{label}"
    conformer_list = [name for name, values in qc_input.items() if is_isomorphic(mol_dict["rdmol"], values["rdmol"])]
    conformers_dict[name] = [name]
    for conf_name in conformer_list:
        if mol_dict["charge"] == qc_input[conf_name]["charge"]:
            conformers_dict[conf_name].append(name)
            conformers_dict[name].append(conf_name)

    qc_input[name] = {
        "conformers": [conformer],
        "charge": mol_dict["charge"],
        "identifiers": identifiers,
        "symbols": symbols,
        "connectivity": connectivity,
        "extras": {
            "molecular_weight": molecular_weight,
            "metals": mol_dict['metals'],
            "PDB CCD Info": {
                label: {
                    'smiles': mol_dict['smiles'],
                    'canonical_smiles': mol_dict['canonical_smiles'],
                    'standard_inchi': mol_dict['standard_inchi'],
                    'inchi_key': mol_dict['inchi_key'],
                    "molecular_formula": mol_dict['molecular_formula'],
                }
            },
        },
        "rdmol": mol_dict['rdmol'],
    }

print(f"Of the {len(output)} structures, {len(output) - len(errors_smiles[list(errors_smiles.keys())[0]])} have smiles strings")

Of the 137 structures, 136 have smiles strings


In [None]:
elements = []
multiplicities = defaultdict(list)
rdkit_pdf_molecules = []
molecules = defaultdict()
errors_misc = defaultdict(dict)
errors_mult = defaultdict(list)
for name, mol_dict in qc_input.items():
    elements.extend(list(set(mol_dict['symbols'])))
    rdkit_pdf_molecules.append(mol_dict["rdmol"])
    for multiplicity in list(range(1,7)):
        try:
            with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f): # Suppress extraneous print statements
                molecules[f"{name}-m{multiplicity}"] = Molecule(
                    name=f"{name}-m{multiplicity}",
                    symbols=mol_dict['symbols'], 
                    geometry=mol_dict['conformers'],
                    molecular_charge=mol_dict["charge"],
                    molecular_multiplicity=multiplicity,
                    identifiers=mol_dict['identifiers'],
                    connectivity=mol_dict['connectivity'],
                    fix_com=True,
                    fix_orientation=True,
                    fix_symmetry="c1",
                    extras=mol_dict["extras"],
                    comment="Smiles were generated from RDKit Molecule using pdbeccdutils and custom sanitation scripts",
                )
                multiplicities[name].append(multiplicity)
        except Exception as e:
            if "Inconsistent or unspecified chg/mult" in str(e):
                errors_mult[name].append(multiplicity)
            else:
                errors_misc[str(e)[:39]][name] = [mol_dict["extras"], first_traceback()]
    
elements = list(set(elements))
print(elements)
print(f"Of the {len(dataset_dict)} structures * 5 multiplicities, {len(molecules)} were imported with the requested multiplicities.")

['H', 'Zn', 'Pd', 'Fe', 'Mg', 'S', 'N', 'F', 'O', 'Cu', 'C', 'Cl', 'P']
Of the 151 structures * 5 multiplicities, 411 were imported with the requested multiplicities.


In [None]:
dataset_name = "PDB CCD Chemical Space A: Optimization Dataset v0.0"
tagline = "GFN2-xTB Conformers for single metal complexes with Pd, Fe, Zn, Cu, Mg, Li and change of {-1,0,+1}"
description = (
    "This dataset includes molecules from the PDB CCD that contain a single metals: {'Pd', 'Fe', 'Zn', 'Mg', 'Cu', 'Li'}, "
    "and the non-metals: {'C', 'H', 'P', 'S', 'O', 'N', 'F', 'Cl', 'Br'}, with a complex charge of {-1,0,+1}. "
    "These complexes are minimized using gfn-xtb."
)

dataset = client.add_dataset( # https://docs.qcarchive.molssi.org/user_guide/qcportal_reference.html
    "optimization", # collection type
    dataset_name, # Dataset name
    tagline=tagline,
    description=description,
    tags=["openff"],
    provenance={
        "rdkit": rdkit.__version__,
        "pdbeccdutils": importlib.metadata.version("pdbeccdutils"),
        "qcportal": qcportal.__version__,
    },
    default_tag="openff",
    metadata={
        "submitter": "Jennifer A Clark",
        "creation_data": date.today(),
        'collection_type': 'OptimizationDataset',
        "long_description": description,
        'long_description_url': f'https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/{dataset_name.replace(" ", "-")}',
        "short_description": tagline,
        "dataset_name": dataset_name,
        "elements": elements,
    },
)

In [None]:
for name, mol in molecules.items():
    additional_keywords = constraints[name] if name in constraints else {}
    dataset.add_entry(name, mol, additional_keywords=additional_keywords)

In [None]:
spec = OptimizationSpecification( # See 2020-07-27-OpenFF-Benchmark-Ligands for specs 
    program='geometric',
    qc_specification=QCSpecification(
        program='xtb',
        driver=SinglepointDriver.deferred,
        method='gfn2xtb',
        basis=None,
        keywords={
            'maxiter': 200, 
            'scf_properties': ['dipole', 'quadrupole', 'wiberg_lowdin_indices', 'mayer_indices', 
                               'lowdin_charges', 'dipole_polarizabilities', 'mulliken_charges']
        }, # keywords for xtb, spec.qc_keywords
    ),
    keywords={
        'tmax': 0.3,
        'check': 0,
        'qccnv': False,
        'reset': True,
        'trust': 0.1,
        'molcnv': False,
        'enforce': 0.0,
        'epsilon': 1e-05,
        'maxiter': 300,
        'coordsys': 'dlc',
        'constraints': {},
        'convergence_set': 'GAU',
    }, # keywords for geometric
#     protocols=OptimizationProtocols(trajectory=TrajectoryProtocolEnum.all)
)
dataset.add_specification(name="gfn2xtb", specification=spec)

InsertMetadata(error_description=None, errors=[], inserted_idx=[0], existing_idx=[])

In [None]:
scaffold.to_json(dataset, compress=True)

## Make Outputs

In [None]:
print("Elements:", elements)
print("Charges:", sorted(set(x["charge"] for _, x in qc_input.items())))
mw_array = [x["extras"]["molecular_weight"] for _, x in qc_input.items()]
print("Molecular Weight (min mean max):", int(np.min(mw_array)), int(np.mean(mw_array)), int(np.max(mw_array)))
tmp_dict = dict(conformers_dict)
keys = list(tmp_dict.keys())
conformers_dict2 = defaultdict(list)
for name in keys:
    if name not in tmp_dict:
        continue
    conformers_dict2[name] = tmp_dict[name]
    for name2 in tmp_dict[name]:
        if name2 in tmp_dict:
            del tmp_dict[name2]
            
print("Number of Molecules:", len(conformers_dict))
#print("Number of Molecules:", len(conformers_dict2)) # Conformers determined to be isomers not conformers in assess_conformers.ipynb
print("Number of Conformers:", len(conformers_dict))
n_conformers = [len(val) for _, val in conformers_dict2.items()]
#print("Number of conformers (min mean max):", int(np.min(n_conformers)), int(np.mean(n_conformers)), int(np.max(n_conformers)))
print("Number of conformers (min mean max):", 1, 1, 1)
n_multiplicities = np.array([len(x) for _, x in multiplicities.items()])
print("Number of multiplicities per molecule (min mean max):", int(np.min(n_multiplicities)), int(np.mean(n_multiplicities)), int(np.max(n_multiplicities)))


Elements: ['H', 'Zn', 'Pd', 'Fe', 'Mg', 'S', 'N', 'F', 'O', 'Cu', 'C', 'Cl', 'P']
Charges: [-1, 0, 1]
Molecular Weight (min mean max): 71 565 1177
Number of Molecules: 137
Number of Conformers: 137
Number of conformers (min mean max): 1 1 1
Number of multiplicities per molecule (min mean max): 3 3 3


In [None]:
with open("conformers.txt", "w") as f:
    for key, values in conformers_dict2.items():
        if len(values) > 1:
            f.write(", ".join(values)+"\n")

In [None]:

def write_structure_pdfs(rdkit_molecules, filename="dataset.pdf", columns=4):
    images = []
    for i in range(0, len(rdkit_molecules), 24):
        mol_chunk = rdkit_molecules[i : i + 24]
        for mol in rdkit_molecules[i : i + 24]:
            tmp_mol = copy.deepcopy(mol)
            Chem.AllChem.Compute2DCoords(tmp_mol)
            Chem.AllChem.StraightenDepiction(tmp_mol)
            mol_chunk.append(tmp_mol)

        # now make the image
        image = Chem.Draw.MolsToGridImage(
            mol_chunk,
            molsPerRow=columns,
            subImgSize=(500, 500),
            returnPNG=False,
        )
        # write the pdf to bytes and pass straight to the pdf merger
        images.append(image)

    images[0].save(filename, append_images=images[1:], save_all=True)

In [None]:
write_structure_pdfs(rdkit_pdf_molecules)