In [1]:
from collections import Counter
from collections import defaultdict
import json
import bz2

import periodictable as pt
import numpy as np
import qcportal




In [2]:
# QCArchive id and current name of the dataset whose metadata is being updated.
ds_id = 464
old_dataset_name = "tmQM xtb Dataset T=100K low-mw high-coordinate mult=3 v0.0"

# Replacement name, tagline and long description. These strings are written
# verbatim into the scaffold metadata, so wording/typo fixes belong here.
new_dataset_name = "tmQM xtb Dataset T=100K low-mw high-coordinate xtb mult=3 v0.0"
new_tagline = "BP86/def2-TZVP Conformers for single metal complexes with Pd, Fe, Zn, Cu, and charge of {-1,0,+1} run in xtb with a multiplicity of 3 and in DFT run with a multiplicity of 1, 3, or 5. MW <= 600 Da, generally high coordinate, and a max of 30 geometry samples"
new_description = ("""
This dataset was generated starting from an adaptation of the tmQM dataset (DOI: 10.5281/zenodo.14920177; 
https://zenodo.org/records/17042449). This dataset contains 6,905 unique systems with 206,240 total 
configurations / spin state combinations  below 600 Da.  The molecules are limited to containing transition 
metals Pd, Zn, Fe, or Cu, and also only contain elements Br, C, H, P, S, O, N, F, or Cl with charges: 
{-1,0,+1}. The metal is restricted to greater than three coordination sites for Pd, four for Fe, 
and one for Cu and Zn. Each molecule was preprocessed using gfn2-xtb, and then a short MD simulation
performed to provide a maximum of 30 off-optimum configurations in addition to the minimized geometry per molecule at 
a multiplicity of 3. Using the geometries generated with gfn-xtb, this singlepoint dataset was then run with the DFT method
BP86/def2-TZVP and a multiplicity of either 1, 3, or 5. 
Each configuration is reported with the following properties: 'energy', 'gradient', 'dipole', 'quadrupole',
'wiberg_lowdin_indices', 'mayer_indices', 'lowdin_charges', 'dipole_polarizabilities', 'mulliken_charges'. SMILES
strings were generated from tmos (https://github.com/openforcefield/tmos) when possible. These SMILES strings can be
imported into RDKit for initial visualization, but will not reflect the coordinate geometries presented from tmQM.
""")

## Update scaffold.json

In [3]:
# The scaffold is too large to import with qcportal, so its metadata is edited
# directly in the bz2-compressed JSON instead.
scaffold_path = "scaffold_2.json.bz2"

# Open via `scaffold_path` (not a repeated literal) so this read and the later
# write-back are guaranteed to target the same file.
with bz2.open(scaffold_path, "rt", encoding="utf-8") as fh:
    ds = json.load(fh)

print(f"Loaded {scaffold_path} as dict with keys:", list(ds.keys()))

Loaded scaffold_2.json.bz2 as dict with keys: ['metadata', 'entries', 'specifications']


In [4]:
# Overwrite the top-level metadata fields with the new values.
metadata = ds["metadata"]
metadata.update(
    name=new_dataset_name,
    tagline=new_tagline,
    description=new_description,
)

# Mirror the same values into the "extras" sub-dictionary; the submission URL
# is the dataset name with spaces replaced by dashes.
submission_dir = new_dataset_name.replace(" ", "-")
metadata["extras"].update(
    {
        "long_description": new_description,
        "long_description_url": f'https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/{submission_dir}',
        "short_description": new_tagline,
        "dataset_name": new_dataset_name,
    }
)

In [5]:
# Write the edited scaffold back over the compressed file it was loaded from.
with bz2.open(scaffold_path, mode="wt", encoding="utf-8") as scaffold_file:
    json.dump(
        ds,
        scaffold_file,
        indent=2,
        ensure_ascii=False,
    )

print(f"Updated scaffold written to: {scaffold_path}")

Updated scaffold written to: scaffold_2.json.bz2


In [6]:
# Drop the last "_"-separated field of every entry name so that entries sharing
# a root name are counted together (presumably the suffix is a per-conformer
# index — TODO confirm against the entry naming scheme).
root_entry_names = ["_".join(x.split("_")[:-1]) for x in ds["entries"].keys()]
n_conformers = list(Counter(root_entry_names).values())

charges = []
multiplicities = defaultdict(list)  # multiplicity -> names of entries with it
multiplicities_array = []
molecular_weights = []

for entry_name, entry in ds["entries"].items():
    # Look the molecule record up once and reuse it below.
    mol = entry["molecule"]
    charges.append(mol["molecular_charge"])
    multiplicities_array.append(mol["molecular_multiplicity"])
    multiplicities[mol["molecular_multiplicity"]].append(entry_name)
    symbols = mol["symbols"]

    # Molecular weight = sum of the periodictable masses of every atom symbol.
    mw = 0.0
    for s in symbols:
        el = getattr(pt, s)
        # Plain attribute access: a missing mass fails with a clear
        # AttributeError instead of float(None) raising a TypeError.
        mw += float(el.mass)

    molecular_weights.append(mw)

In [7]:
# One line per multiplicity: the multiplicity followed by its entry count.
for multiplicity, names_for_mult in multiplicities.items():
    print(f"{multiplicity} {len(names_for_mult)}")

3 17780
5 184371
1 4089


In [8]:
# Counts of unique root names and of all entries.
unique_roots = set(root_entry_names)
print("Number of Molecules:", len(unique_roots))
print("Number of Conformers:", len(root_entry_names))

# min / mean / max, each truncated to an integer for display.
conformer_stats = [int(stat(n_conformers)) for stat in (np.min, np.mean, np.max)]
print("Number of conformers (min mean max):", *conformer_stats)

print("Elements:", ds["metadata"]["extras"]["elements"])
print("Charges:", sorted(set(charges)))
# Iterating the mapping yields its keys, i.e. the distinct multiplicities.
print("Multiplicities:", sorted(set(multiplicities)))

weight_stats = [int(stat(molecular_weights)) for stat in (np.min, np.mean, np.max)]
print("Molecular Weight (min mean max):", *weight_stats)

Number of Molecules: 6905
Number of Conformers: 206240
Number of conformers (min mean max): 2 29 30
Elements: ['Br', 'C', 'Cl', 'Cu', 'F', 'Fe', 'H', 'N', 'O', 'P', 'Pd', 'S', 'Zn']
Charges: [-1.0, 0.0, 1.0]
Multiplicities: [1, 3, 5]
Molecular Weight (min mean max): 190 463 600


## Update QCA Dataset

In [9]:
# Connect without credentials; cache_dir="." points the client's cache at the
# current working directory.
# To authenticate, additionally pass:
#    username=os.environ['QCARCHIVE_USER'],
#    password=os.environ['QCARCHIVE_PASSWORD'],
qcarchive_address = "https://api.qcarchive.molssi.org:443"
client = qcportal.PortalClient(qcarchive_address, cache_dir=".")

In [10]:
# Fetch the singlepoint dataset by its current (old) name.
dataset = client.get_dataset("singlepoint", old_dataset_name)
#dataset = client.get_dataset_by_id(ds_id)

# Guard against operating on the wrong record: the dataset found by name must
# be the one identified by `ds_id`.
assert dataset.id == ds_id, f"Expected dataset id {ds_id}, got {dataset.id}"
dataset.id

464

In [11]:
# NOTE(review): every dataset-side update below is commented out, so running
# this cell changes nothing. The assignments mirror the edits applied to
# ds["metadata"] in the scaffold — TODO confirm whether they still need to be
# run (presumably with an authenticated client) or can be removed.
#dataset.name = new_dataset_name
#dataset.tagline = new_tagline
#dataset.description = new_description
#dataset.extras["long_description"] = new_description
#dataset.extras["long_description_url"] = f'https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/{new_dataset_name.replace(" ", "-")}'
#dataset.extras["short_description"] = new_tagline
#dataset.extras["dataset_name"] = new_dataset_name