# Basic Handling of QCFractal Dataset Views for Force Field Fitting

This notebook shows examples of how to interact with the QCFractal dataset views. Feel free to browse the data using the provided Docker image.

## Import packages

In [None]:
import json
import glob
from pprint import pprint

import numpy as np
from qcportal import load_dataset_view
from qcportal.serialization import encode_to_json
from openff.units import unit
from openff.toolkit import Molecule
from forcebalance.molecule import Molecule as FBMolecule

In [None]:
# Paths (relative to the working directory) of the SQLite dataset views
# loaded in the sections below: one for torsiondrives, one for optimizations.
filename_td_dsv = "views/OpenFF-SMIRNOFF-Sage-2.2.0_torsiondrive_view.sqlite"
filename_opt_dsv = "views/OpenFF-SMIRNOFF-Sage-2.2.0_optimization_view.sqlite"

## Optimization Dataset

In [None]:
# Open the optimization dataset view
dsv_opt = load_dataset_view(filename_opt_dsv)

# Materialize both iterators so the contents can be reused below
all_records = [*dsv_opt.iterate_records()]
all_entries = [*dsv_opt.iterate_entries()]

# Lookup tables keyed by entry name. Each item from `iterate_records()` is
# unpacked as a 3-tuple whose first element is the entry name and whose last
# element is the record; the middle element is not needed here. If a name
# occurs more than once, the last occurrence wins.
name_to_records = dict(
    (entry_name, rec) for entry_name, _, rec in all_records
)
name_to_entry = dict(
    (ent.name, ent) for ent in all_entries
)

In [None]:
# Export every optimization record in three alternative ways: with the OpenFF
# Toolkit, with QCElemental, and with ForceBalance.
#
# NOTE: the output paths below are fixed, so each iteration overwrites the
# files written by the previous one. Name them uniquely (e.g. per target)
# when using this in a real fitting workflow.
for name, record in name_to_records.items():
    mapped_smiles = name_to_entry[name].attributes["canonical_isomeric_explicit_hydrogen_mapped_smiles"]
    geometry_au = record.final_molecule.geometry

    # Rebuild the molecule from its mapped SMILES and attach the final
    # geometry, which is interpreted here as being in Bohr.
    molecule = Molecule.from_mapped_smiles(mapped_smiles, allow_undefined_stereo=True)
    molecule.add_conformer(
        np.array(geometry_au) * unit.bohr
    )
    # Files used in ForceBalance -- name uniquely for different targets
    molecule.to_file("outputs/mol.pdb", "PDB")
    molecule.to_file("outputs/mol.xyz", "XYZ")
    molecule.to_file("outputs/mol.sdf", "SDF")

    # Alternatively, using QCElemental
    xyz_str = record.final_molecule.to_string("xyz")
    # Use a context manager so the file handle is closed deterministically
    with open("outputs/qce_mol.xyz", "w") as xyz_file:
        xyz_file.write(xyz_str)

    # Alternatively, using ForceBalance
    fb_molecule = FBMolecule()
    fb_molecule.Data = {
        "resname": ["UNK"] * molecule.n_atoms,
        "resid": [0] * molecule.n_atoms,
        "elem": [atom.symbol for atom in molecule.atoms],
        "bonds": [
            (bond.atom1_index, bond.atom2_index) for bond in molecule.bonds
        ],
        "name": f"{record.id}",
        # Coordinates are handed over converted to Angstrom
        "xyzs": [molecule.conformers[0].m_as(unit.angstrom)],
    }
    # This replaces the PDB written by the OpenFF Toolkit above (same path)
    fb_molecule.write("outputs/mol.pdb")

## Torsiondrive Dataset

In [None]:
# Open the torsiondrive dataset view
dsv_td = load_dataset_view(filename_td_dsv)

# Materialize both iterators so the contents can be reused below
all_td_records = [*dsv_td.iterate_records()]
all_td_entries = [*dsv_td.iterate_entries()]

# Lookup tables keyed by entry name. Each item from `iterate_records()` is
# unpacked as a 3-tuple whose first element is the entry name and whose last
# element is the record; the middle element is not needed here. If a name
# occurs more than once, the last occurrence wins.
td_name_to_records = dict(
    (entry_name, rec) for entry_name, _, rec in all_td_records
)
td_name_to_entry = dict(
    (ent.name, ent) for ent in all_td_entries
)

In [None]:
# Export every torsiondrive record as the set of files used by a ForceBalance
# TorsionProfileTarget: the scan geometries/energies, a starting structure and
# a metadata JSON.
#
# NOTE: the output paths below are fixed, so each iteration overwrites the
# files written by the previous one. Name them uniquely (e.g. per target)
# when using this in a real fitting workflow.
for name, record in td_name_to_records.items():
    # Final energy of each grid point, keyed by grid id
    energies = record.final_energies
    mapped_smiles = td_name_to_entry[name].attributes["canonical_isomeric_explicit_hydrogen_mapped_smiles"]
    grid_ids = sorted(energies)
    minimum_optimizations = record.minimum_optimizations

    # Geometries and energies are collected in the same loop so that they are
    # guaranteed to stay aligned with `grid_ids`.
    grid_conformers = []
    grid_energies = []
    molecule = Molecule.from_mapped_smiles(mapped_smiles, allow_undefined_stereo=True)
    for grid_id in grid_ids:
        # The `grid_id` is a tuple of angles. Usually our TorsionDrives used in FF fits are 1D,
        # so `grid_id` is expected to be a tuple with a single number (e.g. (0,)).
        geometry_au = minimum_optimizations[grid_id].final_molecule.geometry

        molecule.add_conformer(
            np.array(geometry_au) * unit.bohr
        )
        # Save the molecule to file if you want to fit with your own workflow
        # molecule.to_file(...)

        # The conformer that was just added is the LAST one. Indexing with
        # [0] here would store the first grid point's geometry for every
        # grid point of the scan.
        grid_conformers.append(molecule.conformers[-1].m_as(unit.angstrom))
        grid_energies.append(energies[grid_id])

    # Write files used in a TorsionProfileTarget fit in ForceBalance
    fb_molecule = FBMolecule()
    fb_molecule.Data = {
        "resname": ["UNK"] * molecule.n_atoms,
        "resid": [0] * molecule.n_atoms,
        "elem": [atom.symbol for atom in molecule.atoms],
        "bonds": [
            (bond.atom1_index, bond.atom2_index) for bond in molecule.bonds
        ],
        "name": f"{record.id}",
        "xyzs": grid_conformers,
        # Expect AU energies here
        "qm_energies": grid_energies,
        "comms": [f"torsion grid {grid_id}" for grid_id in grid_ids],
    }
    fb_molecule.write("outputs/qdata.txt")
    fb_molecule.write("outputs/scan.xyz")

    # Write first conformer
    molecule._conformers = molecule._conformers[:1]
    molecule.to_file("outputs/input.sdf", "SDF")
    molecule.to_file("outputs/conf.pdb", "PDB")

    # Write metadata. Work on a shallow copy so that the keys added below do
    # not leak into the record's own optimization keywords.
    metadata = dict(record.specification.optimization_specification.keywords)
    # The dihedrals are the 0-indexed atoms that are the rotated dihedral.
    metadata["dihedrals"] = record.specification.keywords.dihedrals
    metadata["torsion_grid_ids"] = grid_ids
    metadata["energy_decrease_thresh"] = None
    metadata["energy_upper_limit"] = 8.0 # cutoff used in Sage 2.2.1

    with open("outputs/metadata.json", "w") as f:
        json.dump(metadata, f, indent=4)

## Pulling Energies / Geometries from Dataset View

In [None]:
# Use the first optimization entry/record pair as a worked example
entry_name, record = list(name_to_records.items())[0]

# Last element of the record's energies, and the geometry of its final molecule
print(f"The final energy of this record is: {record.energies[-1]}")
print(f"The final geometry of this record is:\n{record.final_molecule.geometry}")

In [None]:
# Encode the example record with qcportal's `encode_to_json` (expected to
# yield a plain dictionary -- verify for other record types) and pretty-print
# the result so all stored fields can be inspected.
rec_dict = encode_to_json(record)
pprint(rec_dict)