In [1]:
from collections import defaultdict
import json
import tqdm

import numpy as np
from openff.toolkit import Molecule, ForceField

In [2]:
# Address of the QCFractal server. Used both to open the portal client and as
# the key under "entries" in the training-set JSON files read by filter_dataset.
QCFRACTAL_URL = "https://api.qcarchive.molssi.org:443/"

## Filter existing records

In [3]:
def filter_dataset(file, searches) -> dict[int, str]:
    """
    Select the entries of a training-set JSON file whose molecule is assigned
    at least one of the requested proper-torsion parameters.

    Parameters
    ----------
    file
        Path to a JSON file with an ``"entries"`` mapping keyed by server URL;
        only the entries stored under ``QCFRACTAL_URL`` are examined.
    searches
        Torsion parameter ids to look for (e.g. ``"t17"``).

    Returns
    -------
    dict[int, str]
        Record id -> CMILES for every matching entry, in file order.
    """
    force_field = ForceField("openff-2.2.1.offxml")

    with open(file, "r") as handle:
        entries = json.load(handle)["entries"][QCFRACTAL_URL]

    # CMILES -> did it match?  Each distinct SMILES is labelled only once.
    verdict_by_smiles = {}
    selected = {}

    for entry in tqdm.tqdm(entries):
        cmiles = entry["cmiles"]

        if cmiles not in verdict_by_smiles:
            molecule = Molecule.from_smiles(cmiles, allow_undefined_stereo=True)
            labels = force_field.label_molecules(molecule.to_topology())[0]
            assigned_ids = [
                parameter.id for parameter in labels["ProperTorsions"].values()
            ]
            verdict_by_smiles[cmiles] = any(
                wanted in assigned_ids for wanted in searches
            )

        if verdict_by_smiles[cmiles]:
            selected[int(entry["record_id"])] = cmiles

    return selected

In [4]:
# quickly filter through existing data
# `searches` lists the proper-torsion parameter ids of interest; an entry is
# kept when its molecule is assigned at least one of them.
searches = ["t17", "t48a", "t19", "t18", "t105"]
# Each result maps record id -> CMILES for the matching entries of that file.
opt_record_ids = filter_dataset("sage-2.2.1/optimization-training-set.json", searches)
td_record_ids = filter_dataset("sage-2.2.1/torsion-training-set.json", searches)

  mol = Molecule.from_smiles(smi, allow_undefined_stereo=True)
100%|███████████████████████████████████████████████████████████████████████████| 5126/5126 [00:30<00:00, 170.15it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1290/1290 [00:15<00:00, 83.90it/s]


In [5]:
# Number of matching optimization records, then torsiondrive records (one per line)
print(len(opt_record_ids), len(td_record_ids), sep="\n")

3939
693


In [6]:
from openff.qcsubmit.results import (
    BasicResultCollection,
    OptimizationResultCollection,
    TorsionDriveResultCollection,
)

import qcportal as ptl

In [7]:
# Open a portal client against QCFRACTAL_URL; cache_dir="." is passed so the
# client's cache directory is the current working directory.
client = ptl.PortalClient(QCFRACTAL_URL, cache_dir=".")

In [8]:
# pull energies and forces for opts, energies for torsions

In [9]:
# Fetch the optimization records selected by filter_dataset (the dict keys are
# the record ids), requesting each record's initial molecule with it.
optimization_records_original = client.get_optimizations(
    list(opt_record_ids),
    include=["initial_molecule"]
)

In [10]:
# Inspect which identifiers are populated on the first record's final molecule
optimization_records_original[0].final_molecule.identifiers

Identifiers(molecule_hash='7b42c896d68faf7b09b4d93149a769e47b5476bd', molecular_formula='C13H13NO2', smiles=None, inchi=None, inchikey=None, canonical_explicit_hydrogen_smiles=None, canonical_isomeric_explicit_hydrogen_mapped_smiles=None, canonical_isomeric_explicit_hydrogen_smiles=None, canonical_isomeric_smiles=None, canonical_smiles=None, pubchem_cid=None, pubchem_sid=None, pubchem_conformerid=None)

In [11]:
# this might be the lightest way to simply fetch the last frame
# alternatively could do `include=['trajectory']`
# but I think this fetches the entire trajectory one-by-one

# Build (smiles, singlepoint) pairs: the SMILES is looked up by record id in
# opt_record_ids, the singlepoint is the last element of the trajectory.
singlepoints_original_with_smiles = [
    (opt_record_ids[opt.id], opt.trajectory_element(-1))
    for opt in tqdm.tqdm(
        optimization_records_original,
        desc="Fetching last frame",
    )
]

Fetching last frame:   5%|██▉                                                     | 210/3939 [02:09<35:50,  1.73it/s]

KeyboardInterrupt: 

In [None]:
# Inspect the SMILES half of the first (smiles, singlepoint) pair
singlepoints_original_with_smiles[0][0]

In [None]:
# Fetch the torsiondrive records selected by filter_dataset (the dict keys are
# the record ids), requesting each record's `minimum_optimizations` with it.
torsiondrive_records_original = client.get_records(
    list(td_record_ids),
    include=["minimum_optimizations"]
)

In [None]:
# Inspect the first fetched torsiondrive record
torsiondrive_records_original[0]

In [None]:
# Collect (smiles, singlepoint) pairs for every torsiondrive that actually
# scans one of the torsion parameters listed in `searches`.
singlepoints_torsiondrive_original = []

# The force field is the same for every record, so load it once here instead
# of re-parsing the .offxml file on each loop iteration.
ff = ForceField("openff-2.2.1.offxml")

for record in tqdm.tqdm(torsiondrive_records_original, desc="Pulling last frames"):
    smi = td_record_ids[record.id]

    # first check this is rotating around the correct torsions
    mol = Molecule.from_mapped_smiles(smi, allow_undefined_stereo=True)
    torsions = ff.label_molecules(
        mol.to_topology()
    )[0]["ProperTorsions"]

    # Map each driven dihedral to the id of the parameter assigned to it,
    # trying the atom indices in both forward and reversed order.
    dihedrals = record.specification.keywords.dihedrals
    scanned_torsions = []
    for dih in dihedrals:
        if dih in torsions:
            scanned_torsions.append(torsions[dih].id)
        elif dih[::-1] in torsions:
            scanned_torsions.append(torsions[dih[::-1]].id)

    # The molecule may contain a searched torsion without it being the one
    # that is driven; skip those records.
    if not any(tors in scanned_torsions for tors in searches):
        continue

    # now pull the constrained optimization
    for opt in record.minimum_optimizations.values():
        singlepoints_torsiondrive_original.append(
            (smi, opt.trajectory_element(-1))
        )

print(len(singlepoints_torsiondrive_original))

## Pull down additional complete datasets

In [15]:
# Pull down the records from additional datasets
# (the torsiondrive and optimization collections below are commented out, so
# only the Hessian collection is fetched when this cell runs)
# additional_torsiondrive_collection = TorsionDriveResultCollection.from_server(
#     client=client,
#     datasets="OpenFF Cresset Additional Coverage TorsionDrives v4.0",
#     spec_name="default",
# )
# additional_optimization_collection = OptimizationResultCollection.from_server(
#     client=client,
#     datasets="OpenFF Cresset Additional Coverage Optimizations v4.0",
#     spec_name="default",
# )
additional_hessian_collection = BasicResultCollection.from_server(
    client=client,
    datasets="OpenFF Cresset Additional Coverage Hessian v4.0",
    spec_name="default",
)

Fetching last frame:   5%|██▉                                                   | 210/3939 [05:09<1:31:42,  1.48s/it]


In [18]:
# Materialise the collection as a list. Each item is later unpacked as a pair
# whose first element is the record (the second is presumably the molecule,
# going by the variable name - TODO confirm).
# additional_td_records_and_molecules = list(
#     additional_torsiondrive_collection.to_records()
# )
# additional_opt_records_and_molecules = list(
#     additional_optimization_collection.to_records()
# )
additional_hess_records_and_molecules = list(
    additional_hessian_collection.to_records()
)


In [22]:
# List the attribute names stored on the first Hessian record
additional_hess_records_and_molecules[0][0].__dict__.keys()

dict_keys(['id', 'record_type', 'is_service', 'properties', 'extras', 'status', 'manager_name', 'created_on', 'modified_on', 'owner_user', 'owner_group', 'compute_history_', 'task_', 'service_', 'comments_', 'native_files_', 'specification', 'molecule_id', 'molecule_', 'wavefunction_'])

In [24]:
# List the property names available on the first Hessian record
additional_hess_records_and_molecules[0][0].properties.keys()

dict_keys(['pe energy', 'scf dipole', '-d gradient', 'calcinfo_nmo', 'dft xc energy', 'findif number', 'mayer indices', 'return_energy', 'return_result', 'calcinfo_natom', 'calcinfo_nbeta', 'current dipole', 'current energy', 'return_hessian', 'scf iterations', 'scf quadrupole', 'calcinfo_nalpha', 'calcinfo_nbasis', 'current hessian', 'dft vv10 energy', 'return_gradient', 'current gradient', 'dft total energy', 'scf total energy', 'dft total gradient', 'scf total energies', 'scf total gradient', 'dd solvation energy', 'grid electrons beta', 'one-electron energy', 'two-electron energy', 'grid electrons alpha', 'grid electrons total', 'scf iteration energy', 'wiberg lowdin indices', 'current dipole gradient', 'pcm polarization energy', 'b3lyp-d3bj total hessian', 'current reference energy', 'nuclear_repulsion_energy', 'b3lyp-d3bj total gradient', 'b3lyp-d3bj dipole gradient', 'dft functional total energy', 'dispersion correction energy', 'dispersion correction gradient', 'b3lyp-d3(bj) di

In [19]:
# singlepoints_additional = []
# for opt, _ in tqdm.tqdm(additional_opt_records_and_molecules):
#     last = opt.trajectory_element(-1)
#     singlepoints_additional.append(
#         (last.molecule.identifiers.canonical_isomeric_explicit_hydrogen_mapped_smiles, last)
#     )

100%|█████████████████████████████████████████| 393/393 [05:57<00:00,  1.10it/s]


In [20]:
# singlepoints_torsiondrive_additional = []
# for record, _ in tqdm.tqdm(additional_td_records_and_molecules, desc="Pulling last frames"):
#     for opt in record.minimum_optimizations.values():
#         last = opt.trajectory_element(-1)
#         singlepoints_torsiondrive_additional.append(
#             (last.molecule.identifiers.canonical_isomeric_explicit_hydrogen_mapped_smiles, last)
#         )

Pulling last frames: 100%|██████████████████████| 82/82 [32:08<00:00, 23.52s/it]


In [26]:
# Pair every Hessian record with the mapped SMILES stored on its molecule.
singlepoints_hessians_additional = [
    (
        record.molecule.identifiers.canonical_isomeric_explicit_hydrogen_mapped_smiles,
        record,
    )
    for record, _molecule in tqdm.tqdm(additional_hess_records_and_molecules)
]

100%|██████████████████████████████████████████████████████████████████████████| 393/393 [00:00<00:00, 674728.40it/s]


## Convert to JSON

In [40]:
from openff.units import unit

# Multiplicative conversion factors used by convert_singlepoints:
# length in bohr -> angstrom, and energy in hartree -> kcal/mol (the Avogadro
# constant turns the per-particle hartree into a molar quantity).
bohr_to_angstrom = (1 * unit.bohr).m_as(unit.angstrom)
hartree_to_kcal = (1 * unit.hartree * unit.avogadro_constant).m_as(
    unit.kilocalories_per_mole
)

def convert_singlepoints(singlepoints, include_gradient: bool = True):
    """
    Convert data to smee-friendly format.

    Geometry, energy, forces and Hessian are rescaled with the module-level
    ``bohr_to_angstrom`` and ``hartree_to_kcal`` factors.

    Parameters
    ----------
    singlepoints
        Iterable of ``(mapped_smiles, record)`` pairs. Each record must expose
        ``molecule.geometry`` and a ``properties`` mapping that contains
        ``"return_energy"``.
    include_gradient
        When True, each entry also gets a ``"forces"`` list (the negated
        ``"scf total gradient"``), and records without that property are
        dropped. When False, no forces are stored and no record is dropped.

    Returns
    -------
    defaultdict[str, list[dict]]
        Mapped SMILES -> list of entries. Every entry has ``"coords"`` (flat
        list) and ``"energy"``. ``"hessian"`` (nested list) is present only
        for records that carry ``"return_hessian"``, and ``"forces"`` (flat
        list) only when ``include_gradient`` is True.
    """
    data_by_smiles = defaultdict(list)

    for mapped_smiles, record in tqdm.tqdm(singlepoints, desc="Converting"):
        properties = record.properties

        # Decide whether the record is usable before doing any array work.
        gradient = None
        if include_gradient:
            try:
                raw_gradient = properties["scf total gradient"]
            except KeyError:
                # throw out any opts without gradients for now
                continue
            gradient = np.array(raw_gradient).reshape((-1, 3))

        mol = record.molecule
        entry = {
            "coords": np.array(mol.geometry * bohr_to_angstrom).flatten().tolist(),
            "energy": properties["return_energy"] * hartree_to_kcal,
        }

        # Only Hessian calculations carry "return_hessian"; last frames of
        # optimizations / torsiondrives do not, and must not raise KeyError.
        if "return_hessian" in properties:
            entry["hessian"] = (
                np.array(properties["return_hessian"])
                * hartree_to_kcal
                / bohr_to_angstrom**2
            ).tolist()

        if gradient is not None:
            # force = -gradient, rescaled from hartree/bohr to kcal/mol/angstrom
            forces = (-gradient) * hartree_to_kcal / bohr_to_angstrom
            entry["forces"] = np.array(forces).flatten().tolist()

        data_by_smiles[mapped_smiles].append(entry)
    return data_by_smiles

In [23]:
# singlepoints_original[0].molecule.identifiers

In [25]:
# original_optimizations = convert_singlepoints(singlepoints_original)
# len(original_optimizations)

In [41]:
# convert optimizations

# original_optimizations = convert_singlepoints(singlepoints_original_with_smiles)
# with open("original-optimizations.json", "w") as f:
#     json.dump(original_optimizations, f)
    
# additional_optimizations = convert_singlepoints(singlepoints_additional)
# with open("additional-optimizations.json", "w") as f:
#     json.dump(additional_optimizations, f)

# original_torsiondrives = convert_singlepoints(
#     singlepoints_torsiondrive_original,
#     include_gradient=False
# )
# with open("original-torsiondrives.json", "w") as f:
#     json.dump(original_torsiondrives, f)
    
# additional_torsiondrives = convert_singlepoints(
#     singlepoints_torsiondrive_additional,
#     include_gradient=False
# )
# with open("additional-torsiondrives.json", "w") as f:
#     json.dump(additional_torsiondrives, f)

# Convert the Hessian singlepoints and write them to disk. The
# include_gradient=False override is commented out, so the default (True)
# applies and forces are included.
additional_hessians = convert_singlepoints(
    singlepoints_hessians_additional,
    # include_gradient=False
)
with open("additional-hessians.json", "w") as f:
    json.dump(additional_hessians, f)


Converting: 100%|████████████████████████████████████████████████████████████████| 393/393 [00:00<00:00, 4510.53it/s]


In [42]:
import collections

In [28]:
def _merge_by_smiles(*datasets):
    """Concatenate the per-SMILES entry lists of the given datasets, in order."""
    merged = collections.defaultdict(list)
    for dataset in datasets:
        for smiles, entries in dataset.items():
            merged[smiles].extend(entries)
    return merged


# NOTE: original_optimizations, additional_optimizations,
# original_torsiondrives and additional_torsiondrives are only assigned by the
# conversion code that is currently commented out earlier in this notebook.
# Re-enable it (or load the corresponding JSON files) before running this cell
# on a fresh kernel, otherwise it raises NameError.
combined_optimizations = _merge_by_smiles(
    original_optimizations, additional_optimizations
)
combined_torsiondrives = _merge_by_smiles(
    original_torsiondrives, additional_torsiondrives
)

with open("combined-optimizations.json", "w") as f:
    json.dump(combined_optimizations, f)

with open("combined-torsiondrives.json", "w") as f:
    json.dump(combined_torsiondrives, f)
