In [None]:
from datetime import date
from collections import Counter, defaultdict
import warnings

import periodictable
import h5py
import numpy as np

import qcportal
from qcportal.external import scaffold
from qcportal.molecules import Molecule
from qcportal.singlepoint import SinglepointDriver, QCSpecification
from qcelemental.physical_constants import constants

import tmos
warnings.filterwarnings("ignore", module="tmos")

ADDRESS = "https://api.qcarchive.molssi.org:443"
#qc_client = qcportal.PortalClient(ADDRESS, cache_dir=".")
from qcfractal.snowflake import FractalSnowflake
import warnings
snowflake = FractalSnowflake()
client = snowflake.client()



Acquisition of new tasks failed: HTTPConnectionPool(host='localhost', port=54211): Max retries exceeded with url: /compute/v1/tasks/claim (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x14ffac150>: Failed to establish a new connection: [Errno 61] Connection refused'))
Heartbeat failed: HTTPConnectionPool(host='localhost', port=54211): Max retries exceeded with url: /compute/v1/managers/snowflake_compute-vcv078084.vpn.uci.edu-8378d55d-7c50-401d-ac7f-8a36d73bb5ff (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x15fca7f90>: Failed to establish a new connection: [Errno 61] Connection refused')). QCFractal server down?
Missed 1 heartbeats so far
Heartbeat failed: HTTPConnectionPool(host='localhost', port=54211): Max retries exceeded with url: /compute/v1/managers/snowflake_compute-vcv078084.vpn.uci.edu-8378d55d-7c50-401d-ac7f-8a36d73bb5ff (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x16932f510>: Faile

In [2]:
#!aria2c "https://zenodo.org/records/17983516/files/tmqm_dataset_xtb_T100_ext_raw_sm3_v1.2.hdf5.gz?download=1"

## Helper Functions

In [3]:
def remove_extraneous_dimension(array):
    shape = list(np.shape(array))
    if 1 in shape:
        shape.remove(1)
    return np.array(array).reshape(shape)

def get_symbols(atomic_numbers):
    return [str(periodictable.elements[x])for x in remove_extraneous_dimension(atomic_numbers)]

def get_molecular_formula(atomic_numbers):
    return "".join([str(y) for x1, x2 in Counter(get_symbols(atomic_numbers)).items() for y in [x1, x2] if y != 1])

def get_molecular_weight(atomic_numbers):
    return sum(periodictable.elements[x].mass for x in remove_extraneous_dimension(atomic_numbers))


In [4]:
def apply_mapping(mapping, input_dict, index=0):

    output = defaultdict(dict)
    for key, value in mapping.items():
        if isinstance(value, str):
            data = input_dict[value]
            if not isinstance(data, str):
                if key == "geometry": # update number of frames
                    lx = np.shape(data)[0]
                
                if key != "geometry":
                    data = remove_extraneous_dimension(data)
                    lx = len(data)

                if lx is not None: # and len(np.shape(data)) > 1:
                    if len(data) == lx:
                        output[key] = data[index]
                        continue
                    else:
                        raise ValueError(f"Expected {lx} configuration, but {len(data)} are present")
            output[key] = data
        elif isinstance(value, tuple): # function, input pairs
            output[key] = value[0](*(input_dict[k2] for k2 in value[1:]))
        elif isinstance(value, list):
            output[key].update({k2: input_dict[k2] for k2 in value})
        elif isinstance(value, dict):
            output[key].update(apply_mapping(value, input_dict, index=0))
            
    return output
            
def convert_hdf5_group(hdf5_group):
    output = {}
    for key, value in hdf5_group.items():
        if isinstance(value, h5py.Group):
            output[key] = convert_hdf5_group(value)
        elif isinstance(value, h5py.Dataset):
            data = value[()]
            if isinstance(data, np.ndarray):
                output[key] = data
            elif isinstance(data, np.bytes_):
                output[key] = data.decode('utf-8')  # Convert to string
            else:
                output[key] = data.item() if isinstance(data, np.generic) else data  # Convert NumPy scalars
        else:
            output[key] = value

    return output

## Assembled Dataset

In [None]:
dataset_name = "tmQM xtb Dataset T=100K low-mw high-coordinate geom-mult=3 v0.0"
tagline = "BP86/def2-TZVP Conformers for single metal complexes with Pd, Fe, Zn, Cu, and change of {-1,0,+1} and multiplicity of 3. MW <= 600 Da, generally high coordinate, and 20 geometry samples"
description = ("""
This dataset was generated starting from an adaptation of the tmQM dataset (https://zenodo.org/records/17983516). 
This dataset contains 6,885 unique systems with 68,794 total configurations / spin states below 600 Da.  The molecules are 
limited to containing transition metals Pd, Zn, Fe, or Cu, and also only contain elements Br, C, H, P, S, O, N, F, Cl, 
or Br with charges: {-1,0,+1}. The metal is restricted to greater than three coordination sites for Pd, four for Fe, 
and one for Cu and Zn. Each molecule was preprocessed using gfn2-xtb, and then a short MD simulation
performed to provide 20 off-optimum configurations. This singlepoint dataset was then run with the BP86/def2-TZVP 
for with those geometries from molecular dynamics using gfn-xtb. Each configuration is reported with the following
properties: 'energy', 'gradient', 'dipole', 'quadrupole', 'wiberg_lowdin_indices', 'mayer_indices', 'lowdin_charges'
'dipole_polarizabilities', 'mulliken_charges'.
""")

dataset = client.add_dataset( # https://docs.qcarchive.molssi.org/user_guide/qcportal_reference.html
    "singlepoint", # collection type
    dataset_name, # Dataset name
    tagline=tagline,
    description=description,
    tags=["openff"],
    provenance={
        "qcportal": qcportal.__version__,
    },
    default_tag="openff",
    extras={
        "submitter": "jaclark5",
        "creation_date": date.today(),
        'collection_type': 'SinglepointDataset',
        "long_description": description,
        'long_description_url': f'https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2025-12-19-{dataset_name.replace(" ", "-")}',
        "short_description": tagline,
        "dataset_name": dataset_name,
    },
)

In [6]:
METALS_SYMBOLS = [periodictable.elements[x].symbol for x in tmos.reference_values.METALS_NUM]

hdf5_mapping = {
    "symbols": (get_symbols, "atomic_numbers"), 
    "geometry": "geometry",
    "molecular_charge": "total_charge",
    "molecular_multiplicity": "spin_multiplicity",
    "identifiers": {"molecular_formula": (get_molecular_formula, "atomic_numbers"),},
    "extras": {'molecular_weight': (get_molecular_weight, "atomic_numbers")},
}

elements, molecular_weights, charges, multiplicities = [], [], [], []
conformers = Counter()
count_molecules = 0

errors = defaultdict(list)
errors_struc = defaultdict(list)
errors_mult = []
errors_misc = defaultdict(lambda: defaultdict(list))
failed_metals = defaultdict(lambda: 0)
count_no = 0
count_yes = 0

with open("tmqm_m3.txt", "r") as f:
    entry_names = [line.rstrip() for line in f]

hdf5 = h5py.File(f"tmqm_dataset_xtb_T100_ext_raw_sm3_v1.2.hdf5", 'r')
for ii, (label, mol_hdf5) in enumerate(hdf5.items()):
    if label not in entry_names:
        continue
    count_yes += 1
    mol_dict = convert_hdf5_group(mol_hdf5)
    lx = mol_dict["n_configs"]
    
    ## Decide to filter
    try:
        input = apply_mapping(hdf5_mapping, mol_dict, index=0)
        if len(input["symbols"]) != np.shape(input["geometry"])[0]:
            raise ValueError(f"Geometries don't match number of symbols: {len(input['symbols'])} != {np.shape(input['geometry'])[0]}")
    except Exception as e:
        errors_struc[str(e)[:30]].append([label, 1, str(e)])
        continue
    input["geometry"] *= 10 # Convert from nm to Angstroms

    ## Import conformers
    for i in range(5,15): # Only take the middle 10 conformers
        # Get values from HDF5
        try:
            qc_input = apply_mapping(hdf5_mapping, mol_dict, index=i)
        except Exception as e:
            errors_struc[str(e)[:30]].append([label, 2, str(e)])
            continue
        qc_input["geometry"] *= 10 / constants.bohr2angstroms # Convert from nm to Bohr (a0)
    
        try:
            molecule = Molecule(
                name=label,
                fix_com=True,
                fix_orientation=True,
                fix_symmetry="c1",
                comment="Molecule coordinates taken from tmQM and SMILES from tmos",
                **qc_input
            )
            dataset.add_entry(name=label+f"_{i}", molecule=molecule)
            count_molecules += 1
            conformers[label[:-4]] += 1
        except Exception as e:
            if "Inconsistent or unspecified chg/mult" in str(e):
                errors_mult.append(label)
            else:
                errors_misc[str(e)[:30]][label].append([i, str(e)])
            continue

        elements.extend(list(set(qc_input['symbols'])))
        molecular_weights.append(qc_input['extras']["molecular_weight"])
        charges.append(qc_input["molecular_charge"])
        multiplicities.append(qc_input["molecular_multiplicity"])

dataset.extras["elements"] = sorted(list(set(elements)))

Connection error for http://localhost:54211/api/v1/datasets/singlepoint/1/entries/bulkCreate: HTTPConnectionPool(host='localhost', port=54211): Max retries exceeded with url: /api/v1/datasets/singlepoint/1/entries/bulkCreate (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x1589c5590>: Failed to establish a new connection: [Errno 61] Connection refused')) - retrying in 0.48 seconds [1/5]
Connection error for http://localhost:54211/api/v1/datasets/singlepoint/1/entries/bulkCreate: HTTPConnectionPool(host='localhost', port=54211): Max retries exceeded with url: /api/v1/datasets/singlepoint/1/entries/bulkCreate (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x159f16350>: Failed to establish a new connection: [Errno 61] Connection refused')) - retrying in 0.50 seconds [1/5]
Connection error for http://localhost:54211/api/v1/datasets/singlepoint/1/entries/bulkCreate: HTTPConnectionPool(host='localhost', port=54211): Max retries exceed

In [7]:
print(f"Number of molecules removed for solvent assessment: {count_no}")
print(f"Number of molecules removed for unspecified chg/mult: {len(errors_mult)}")
print(f"Number of molecules removed for structure issues: {len(errors_struc)}")
print(f"Number of conformers accepted: {len(dataset.entry_names)}")

Number of molecules removed for solvent assessment: 0
Number of molecules removed for unspecified chg/mult: 0
Number of molecules removed for structure issues: 10
Number of conformers accepted: 68794


In [8]:
print(f"{len(dataset.entry_names)} conformers were imported.")

print("\nThe following errors DO remove molecules from the dataset:")
for err, values in errors_misc.items():
    print(f"    {len(values)}: '{err}'")

print(f"\nThere were {sum([len(x) for x in errors.values()])} molecules of {len(dataset.entry_names)} that failed to create SMILES.")

68794 conformers were imported.

The following errors DO remove molecules from the dataset:

There were 0 molecules of 68794 that failed to create SMILES.


In [9]:
spec = QCSpecification(
        program='psi4',
        driver=SinglepointDriver.gradient,
        method='BP86',
        basis='def2-TZVP',
        keywords={
            'maxiter': 500, 
            'scf_properties': ['dipole', 'quadrupole', 'wiberg_lowdin_indices', 'mayer_indices', 'lowdin_charges', 'mulliken_charges'],
            'function_kwargs': {'properties': ['dipole_polarizabilities']},
        },
        protocols={'wavefunction': 'none'}
    )
dataset.add_specification(name="BP86/def2-TZVP", specification=spec)

InsertMetadata(error_description=None, errors=[], inserted_idx=[0], existing_idx=[])

In [10]:
scaffold.to_json(dataset, compress=True)
#dataset.submit()

## Make Outputs

In [11]:
print("Elements:", dataset.extras["elements"])
print("Charges:", sorted(set([float(x) for x in charges])))
print("Multiplicities:", sorted(set(multiplicities)))
print("Molecular Weight (min mean max):", int(np.min(molecular_weights)), int(np.mean(molecular_weights)), int(np.max(molecular_weights)))
            
print("Number of Molecules:", len(conformers))
print("Number of Conformers:", sum(conformers.values()))
n_conformers = np.array(list(conformers.values())) + 1
print("Number of conformers (min mean max):", int(np.min(n_conformers)), int(np.mean(n_conformers)), int(np.max(n_conformers)))

Elements: ['Br', 'C', 'Cl', 'Cu', 'F', 'Fe', 'H', 'N', 'O', 'P', 'Pd', 'S', 'Zn']
Charges: [-1.0, 0.0, 1.0]
Multiplicities: [np.int64(3)]
Molecular Weight (min mean max): 190 463 600
Number of Molecules: 6885
Number of Conformers: 68794
Number of conformers (min mean max): 2 10 11
