# Submission Preparation

We'll start from the JSON files submitted in the `2019-07-25-phenyl-set`:

In [33]:
!ls *.json

biphenyls_set_input.json  phenyl_set_torsiondrive_inputs.json


Our first task will be to take some of the machinery from the original submission script `03_create_torsiondrive_dataset.py` and use it here to create a QCSubmit dataset.

**Important**: We must add 1.0 (bohr) to every coordinate of each initial molecule's geometry in order to trick the deduplication on the server into letting us submit these molecules.

In [2]:
import os
import json

import numpy as np

from qcsubmit.factories import TorsiondriveDatasetFactory
from openforcefield.utils.toolkits import RDKitToolkitWrapper, UndefinedStereochemistryError



In [3]:
def read_selected_torsions(input_json):
    """Load the selected-torsion records stored in a JSON file.

    The file is expected to be the output of select_torsions.py (per the
    original note on this function).

    Parameters
    ----------
    input_json : str
        Path to the JSON file to read.

    Returns
    -------
    selected_torsions : dict
        The decoded JSON content, returned unmodified. The other cells of
        this notebook iterate over it as a mapping from a torsion index to
        a record, and read these keys from each record:
        {
            torsion_index1: {
                'input_molecules': ..,     # one molecule dict, or a list of them
                'cmiles_identifiers': ..,  # attached to each molecule as `extras`
                'dihedral': ..,            # one dihedral, or a list of dihedrals
            },
            ..
        }
    """
    with open(input_json) as handle:
        return json.load(handle)

First, are existing indices unique across both input files?

In [4]:
inputs = ['biphenyls_set_input.json',  'phenyl_set_torsiondrive_inputs.json']

In [5]:
set(read_selected_torsions(inputs[0]).keys()) & set(read_selected_torsions(inputs[1]).keys())

set()

Should be good! No overlap!

In [6]:
def get_qcelmol(torsion_data, shift=1.0):
    """Build QCElemental molecules for one torsion record, with shifted coordinates.

    Every geometry value is offset by `shift` so that the resulting molecules
    hash differently from the ones already on the server (see the note on
    deduplication at the top of this notebook).

    The input record is NOT modified: each molecule dict is copied before
    `extras` and `geometry` are overwritten, so calling this function twice
    on the same record gives the same result both times.

    Parameters
    ----------
    torsion_data : dict
        One record as returned by `read_selected_torsions`. Must provide
        'input_molecules' (a molecule dict, or a list of molecule dicts, each
        with a 'geometry' entry) and 'cmiles_identifiers'.
    shift : float, optional
        Value added to every element of each geometry. Defaults to 1.0.

    Returns
    -------
    qcelmols : list of qcelemental.models.Molecule
        One molecule per input molecule dict, in input order. A single
        (non-list) input molecule yields a one-element list.
    """
    from qcelemental.models import Molecule

    raw = torsion_data['input_molecules']
    # A record may carry a single molecule dict or a list of them.
    records = raw if isinstance(raw, list) else [raw]

    qcelmols = []
    for record in records:
        # Shallow copy: only top-level keys are replaced below, and the
        # shifted geometry is a brand-new list, so the caller's data stays intact.
        fields = dict(record)
        fields['extras'] = torsion_data['cmiles_identifiers']
        fields['geometry'] = (np.array(record['geometry']) + shift).tolist()
        qcelmols.append(Molecule(**fields))

    return qcelmols

In [7]:
factory = TorsiondriveDatasetFactory()

In [8]:
factory.scf_properties

['dipole', 'quadrupole', 'wiberg_lowdin_indices', 'mayer_indices']

In [9]:
dataset = factory.create_dataset(dataset_name="OpenFF Substituted Phenyl Set 1 v2.0",
                                 tagline="Torsiondrives for selected dihedrals of various phenyl fragments",
                                 description="Torsiondrives for selected dihedrals of various phenyl fragments",
                                 molecules=[])

Deduplication                 : 0it [00:00, ?it/s]
Preparation                   : 0it [00:00, ?it/s]


In [10]:
dataset.metadata.long_description_url = "https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2020-10-06-OpenFF-Phenyl-Set"

In [11]:
dataset.metadata.submitter = 'dotsdl'

Check that molecules don't already exist in QCArchive; that is, that our strategy of shifting the coordinates by 1.0 bohr worked to avoid deduplication.

In [12]:
from qcportal import FractalClient

In [13]:
client = FractalClient()

In [16]:
def populate_dataset(dataset, input_files, client):
    """Add every selected torsion from the input files to `dataset`.

    For each record the initial molecules are built with `get_qcelmol`, the
    server behind `client` is queried for molecules with the same hash, and
    the torsion is then added to the dataset.

    Parameters
    ----------
    dataset
        Dataset to fill; `dataset.add_molecule` is called once per torsion.
    input_files : iterable of str
        Paths of JSON files readable by `read_selected_torsions`.
    client
        Client used for `client.query_molecules(molecule_hash=...)`.

    Returns
    -------
    dataset
        The same dataset object that was passed in, now populated.
    matches : list
        One `client.query_molecules` result per initial molecule that was
        looked up. All results being empty means no molecule was found on
        the server.
    """
    # we have three known molecules with iodines
    # that we can't converge and want to leave out
    excluded = ('CC[CH2:4][NH:3][c:2]1ccnc([cH:1]1)I',
                'C[C:4](=O)[NH:3][c:2]1[cH:1]cnc(c1)I',
                '[CH3:4][O:3][c:2]1[cH:1]cnc(c1)I')

    matches = []
    for input_file in input_files:
        print(f"extracting from '{input_file}'")
        input_mols = read_selected_torsions(input_file)

        for key, moldata in input_mols.items():
            # Skip excluded torsions before doing any work on them.
            if key in excluded:
                continue

            mol = get_qcelmol(moldata)

            for m in mol:
                matches.append(client.query_molecules(molecule_hash=m.get_hash()))

            attributes = mol[0].extras
            print(f"...adding '{key}'")

            dihedral_indices = moldata['dihedral']

            # The emptiness check must come before indexing element 0,
            # otherwise an empty list raises IndexError instead of being skipped.
            if len(dihedral_indices) == 0:
                print(f"...NO DIHEDRAL INDICES FOR '{key}'")
                continue

            # A single dihedral is stored flat; wrap it so we always
            # hand over a list of dihedrals.
            if not isinstance(dihedral_indices[0], list):
                dihedral_indices = [dihedral_indices]

            dihedral_indices = [list(indices) for indices in dihedral_indices]

            dataset.add_molecule(index=key,
                                 molecule=None,
                                 dihedrals=dihedral_indices,
                                 attributes=attributes,
                                 initial_molecules=mol)
    return dataset, matches

In [17]:
ds, matches = populate_dataset(dataset, inputs, client)

extracting from 'biphenyls_set_input.json'
...adding '[cH:1]1cc(cc[c:2]1[c:3]2[cH:4]c[nH+]cc2)O'




...adding '[cH:1]1cc(cc[c:2]1[c:3]2[cH:4]c[nH+]cc2)[O-]'
...adding '[cH:1]1cc(cc[c:2]1[c:3]2[cH:4]cncc2)O'
...adding '[cH:1]1cc(cc[c:2]1[c:3]2[cH:4]cncc2)[O-]'




...adding 'c1[cH:1][c:2](c(cc1O)Cl)[c:4]2[cH:3]c[nH+]cc2'




...adding 'c1[cH:1][c:2](c(cc1O)Cl)[c:4]2[cH:3]cncc2'
...adding 'c1[cH:1][c:2](c(cc1O)F)[c:4]2[cH:3]c[nH+]cc2'




...adding 'c1[cH:1][c:2](c(cc1O)F)[c:4]2[cH:3]cncc2'
...adding 'c1[cH:1][c:2](c(cc1[O-])Cl)[c:4]2[cH:3]c[nH+]cc2'
...adding 'c1[cH:1][c:2](c(cc1[O-])Cl)[c:4]2[cH:3]cncc2'




...adding 'c1[cH:1][c:2](c(cc1[O-])F)[c:4]2[cH:3]c[nH+]cc2'
...adding 'c1[cH:1][c:2](c(cc1[O-])F)[c:4]2[cH:3]cncc2'




...adding 'c1c[cH:1][c:2](cc1)[c:3]2[cH:4]cncc2'
...adding 'c1ccc([c:2]([cH:1]1)[c:3]2[cH:4]cncc2)Cl'
...adding 'c1ccc([c:2]([cH:1]1)[c:3]2[cH:4]cncc2)F'
extracting from 'phenyl_set_torsiondrive_inputs.json'
...adding 'CCC(=O)Nc1[cH:1][c:2](ccn1)[NH:3][CH2:4]C'
...adding 'CCCNc1cc[c:2]([cH:1]n1)[N:3](C)[CH3:4]'
...adding 'CCCNc1cc[c:2]([cH:1]n1)[O:3][CH2:4]C'
...adding 'CCCNc1ccc[c:2]([cH:1]1)[N:3]([CH3:4])C'
...adding 'CCCNc1ccc[c:2]([cH:1]1)[NH:3][CH3:4]'
...adding 'CCNc1[cH:1][c:2](ccn1)[NH:3][C:4](=O)NC'
...adding 'CCNc1[cH:1][c:2](ccn1)[O:3][C:4](=O)N'
...adding 'CCOC(=O)c1cc[cH:1][c:2](c1)[N+:3](=O)[O-:4]'
...adding 'CCOC(=O)c1ccc[c:2]([cH:1]1)[NH:3][C:4](=O)N(C)C'
...adding 'CCO[C:3](=[O:4])[c:2]1[cH:1]cc(cc1)NC(=O)C'
...adding 'CCO[C:3](=[O:4])[c:2]1[cH:1]cnc(c1)[N+](C)(C)C'




...adding 'CCOc1[cH:1][c:2](ccn1)[NH:3][C:4](=O)NC'
...adding 'CCOc1[cH:1][c:2](cnc1)[O:3][CH3:4]'
...adding 'CCOc1cc[c:2]([cH:1]n1)[C:3](=O)[O:4]CC'
...adding 'CC[C:4](=O)[NH:3][c:2]1[cH:1]c(cnc1)N'
...adding 'CC[C:4](=O)[NH:3][c:2]1[cH:1]cc(cc1)F'
...adding 'CC[C:4](=O)[NH:3][c:2]1cc(cn[cH:1]1)[N+](C)(C)C'




...adding 'CC[C:4](=O)[NH:3][c:2]1ccc(c[cH:1]1)N(C)C'
...adding 'CC[C:4](=O)[NH:3][c:2]1ccc(n[cH:1]1)C(=O)OCC'
...adding 'CC[C:4](=O)[NH:3][c:2]1ccc(n[cH:1]1)[N+](=O)[O-]'
...adding 'CC[C:4](=O)[NH:3][c:2]1ccc(n[cH:1]1)[N+](C)(C)C'
...adding 'CC[C:4](=O)[NH:3][c:2]1ccnc([cH:1]1)[N+](=O)[O-]'
...adding 'CC[C:4](=O)[NH:3][c:2]1ccnc([cH:1]1)[N+](C)(C)C'
...adding 'CC[CH2:4][NH:3][c:2]1[cH:1]cc(cc1)NC'
...adding 'CC[CH2:4][NH:3][c:2]1[cH:1]cc(nc1)NC(=O)NC'
...adding 'CC[CH2:4][NH:3][c:2]1cc(cn[cH:1]1)C(=O)OCC'
...adding 'CC[CH2:4][NH:3][c:2]1ccc(c[cH:1]1)[N+](C)(C)C'




...adding 'CC[CH2:4][NH:3][c:2]1ccc(n[cH:1]1)[N+](C)(C)C'




...adding 'CC[CH2:4][NH:3][c:2]1cccc([cH:1]1)NC(=O)N(C)C'
...adding 'CC[CH2:4][NH:3][c:2]1cccc([cH:1]1)[N+](C)(C)C'
...adding 'CC[CH2:4][NH:3][c:2]1ccnc([cH:1]1)C#N'
...adding 'CC[CH2:4][NH:3][c:2]1ccnc([cH:1]1)[N+](C)(C)C'
...adding 'CC[O:4][C:3](=O)[c:2]1ccc(n[cH:1]1)N'
...adding 'CC[O:4][C:3](=O)[c:2]1cccc([cH:1]1)O'
...adding 'CC[O:4][C:3](=O)[c:2]1ccnc([cH:1]1)NC'
...adding 'CN(C)C(=O)Nc1c[cH:1][c:2](cc1)[NH:3][C:4](=O)N'
...adding 'CN(C)[C:4](=O)[NH:3][c:2]1[cH:1]cc(cc1)C(F)(F)F'
...adding 'CN(C)[C:4](=O)[NH:3][c:2]1[cH:1]cc(cc1)I'
...adding 'CN(C)[C:4](=O)[NH:3][c:2]1[cH:1]cc(cc1)[N+](C)(C)C'




...adding 'CN(C)[C:4](=O)[NH:3][c:2]1[cH:1]cc(nc1)[N+](C)(C)C'




...adding 'CN(C)[C:4](=O)[NH:3][c:2]1[cH:1]ccc(c1)[N+](C)(C)C'
...adding 'CN(C)[C:4](=O)[NH:3][c:2]1[cH:1]cnc(c1)[N+](=O)[O-]'
...adding 'CN(C)[C:4](=O)[NH:3][c:2]1[cH:1]cnc(c1)[N+](C)(C)C'
...adding 'CN(C)c1c[cH:1][c:2](cc1)[NH:3][C:4](=O)N'
...adding 'CN(C)c1c[cH:1][c:2](cc1)[O:3][CH3:4]'
...adding 'CN(C)c1c[cH:1][c:2](cn1)[NH:3][C:4](=O)N(C)C'
...adding 'CN(C)c1cc[cH:1][c:2](c1)[O:3][C:4](=O)N'
...adding 'CNC(=O)Nc1c[cH:1][c:2](cc1)[NH:3][C:4](=O)N(C)C'
...adding 'CN[C:4](=O)[NH:3][c:2]1[cH:1]cc(nc1)[N+](=O)[O-]'
...adding 'CN[C:4](=O)[NH:3][c:2]1[cH:1]cc(nc1)[N+](C)(C)C'




...adding 'CN[C:4](=O)[NH:3][c:2]1[cH:1]ccc(c1)N'
...adding 'CN[C:4](=O)[NH:3][c:2]1cc(cn[cH:1]1)[N+](C)(C)C'
...adding 'CN[C:4](=O)[NH:3][c:2]1ccc(c[cH:1]1)N(C)C'
...adding 'CN[C:4](=O)[NH:3][c:2]1ccc(c[cH:1]1)[N+](C)(C)C'
...adding 'CN[C:4](=O)[NH:3][c:2]1cccc([cH:1]1)[N+](C)(C)C'
...adding 'CNc1c[c:2]([cH:1]cn1)[N:3]([CH3:4])C'
...adding 'CNc1c[cH:1][c:2](cn1)[C:3](=[O:4])O'
...adding 'CNc1c[cH:1][c:2](cn1)[N+:3](=O)[O-:4]'
...adding 'C[C:4](=O)[NH:3][c:2]1[cH:1]c(cnc1)I'
...adding 'C[C:4](=O)[NH:3][c:2]1[cH:1]cc(nc1)[N+](C)(C)C'




...adding 'C[C:4](=O)[NH:3][c:2]1[cH:1]ccc(c1)N'
...adding 'C[C:4](=O)[NH:3][c:2]1cc(cn[cH:1]1)[N+](C)(C)C'
...adding 'C[C:4](=O)[NH:3][c:2]1ccc(c[cH:1]1)N(C)C'
...adding 'C[C:4](=O)[NH:3][c:2]1ccc(c[cH:1]1)[N+](C)(C)C'




...adding 'C[C:4](=O)[NH:3][c:2]1ccc(n[cH:1]1)[N+](=O)[O-]'
...adding 'C[C:4](=O)[NH:3][c:2]1cccc([cH:1]1)[N+](C)(C)C'
...adding 'C[CH2:4][NH:3][c:2]1[cH:1]c(cnc1)OCC'
...adding 'C[CH2:4][NH:3][c:2]1[cH:1]cc(cc1)N(C)C'
...adding 'C[CH2:4][NH:3][c:2]1[cH:1]cc(nc1)N'
...adding 'C[CH2:4][NH:3][c:2]1[cH:1]cc(nc1)[N+](C)(C)C'




...adding 'C[CH2:4][NH:3][c:2]1[cH:1]cnc(c1)Br'
...adding 'C[CH2:4][NH:3][c:2]1[cH:1]cnc(c1)Cl'
...adding 'C[CH2:4][NH:3][c:2]1ccc(c[cH:1]1)[N+](=O)[O-]'
...adding 'C[CH2:4][NH:3][c:2]1ccnc([cH:1]1)[N+](C)(C)C'
...adding 'C[CH2:4][O:3][c:2]1[cH:1]cc(cc1)I'
...adding 'C[CH2:4][O:3][c:2]1[cH:1]cc(cc1)[N+](C)(C)C'




...adding 'C[CH2:4][O:3][c:2]1[cH:1]cc(nc1)C(=O)O'
...adding 'C[CH2:4][O:3][c:2]1[cH:1]cc(nc1)[N+](=O)[O-]'
...adding 'C[CH2:4][O:3][c:2]1[cH:1]cc(nc1)[N+](C)(C)C'




...adding 'C[CH2:4][O:3][c:2]1[cH:1]cnc(c1)OC(=O)N'
...adding 'C[CH2:4][O:3][c:2]1cccc([cH:1]1)NC(=O)N(C)C'
...adding 'C[CH2:4][O:3][c:2]1ccnc([cH:1]1)[N+](C)(C)C'
...adding 'C[N+](C)(C)c1[cH:1][c:2](ccn1)[NH:3][C:4](=O)N'




...adding 'C[N+](C)(C)c1[cH:1][c:2](cnc1)[O:3][C:4](=O)N'




...adding 'C[N+](C)(C)c1c[c:2]([cH:1]cn1)[C:3](=[O:4])O'




...adding 'C[N+](C)(C)c1c[c:2]([cH:1]cn1)[N+:3](=O)[O-:4]'




...adding 'C[N+](C)(C)c1c[c:2]([cH:1]cn1)[O:3][CH3:4]'




...adding 'C[N+](C)(C)c1c[cH:1][c:2](cn1)[NH:3][C:4](=O)N'
...adding 'C[N+](C)(C)c1c[cH:1][c:2](cn1)[O:3][C:4](=O)N'
...adding 'C[N+](C)(C)c1c[cH:1][c:2](cn1)[O:3][CH3:4]'
...adding 'C[N+](C)(C)c1cc[c:2]([cH:1]c1)[O:3][C:4](=O)N'




...adding 'C[N+](C)(C)c1cc[cH:1][c:2](c1)[O:3][CH3:4]'




...adding 'C[N+](C)(C)c1ccc[c:2]([cH:1]1)[NH:3][C:4](=O)N'
...adding 'C[N:3]([CH3:4])[c:2]1[cH:1]ccc(c1)NC(=O)N(C)C'
...adding 'C[N:3]([CH3:4])[c:2]1cc(cn[cH:1]1)[N+](C)(C)C'
...adding 'C[N:3]([CH3:4])[c:2]1ccc(c[cH:1]1)[N+](C)(C)C'




...adding '[CH3:4][N:3](C)[c:2]1[cH:1]cc(cc1)N(C)C'
...adding '[CH3:4][N:3](C)[c:2]1[cH:1]cc(nc1)[N+](=O)[O-]'
...adding '[CH3:4][N:3](C)[c:2]1[cH:1]cc(nc1)[N+](C)(C)C'
...adding '[CH3:4][N:3](C)[c:2]1[cH:1]ccc(c1)OC(=O)N'
...adding '[CH3:4][N:3](C)[c:2]1ccc(n[cH:1]1)C(F)(F)F'
...adding '[CH3:4][NH:3][c:2]1[cH:1]c(cnc1)C(=O)O'
...adding '[CH3:4][NH:3][c:2]1[cH:1]cc(cc1)NC'
...adding '[CH3:4][NH:3][c:2]1[cH:1]cc(cc1)[N+](=O)[O-]'
...adding '[CH3:4][NH:3][c:2]1[cH:1]cc(nc1)C(=O)O'
...adding '[CH3:4][NH:3][c:2]1[cH:1]cc(nc1)O'
...adding '[CH3:4][NH:3][c:2]1[cH:1]cc(nc1)[N+](C)(C)C'




...adding '[CH3:4][NH:3][c:2]1[cH:1]cnc(c1)N'
...adding '[CH3:4][NH:3][c:2]1cc(cn[cH:1]1)[N+](C)(C)C'
...adding '[CH3:4][NH:3][c:2]1ccnc([cH:1]1)[N+](C)(C)C'
...adding '[CH3:4][O:3][c:2]1[cH:1]cc(nc1)[N+](=O)[O-]'
...adding '[CH3:4][O:3][c:2]1[cH:1]ccc(c1)C(=O)O'
...adding '[H:4][NH:3][c:2]1[cH:1]c(cnc1)NC(=O)NC'
...adding '[H:4][NH:3][c:2]1[cH:1]c(cnc1)[N+](C)(C)C'




...adding '[H:4][NH:3][c:2]1[cH:1]cc(cc1)C(=O)OCC'
...adding '[H:4][NH:3][c:2]1[cH:1]cc(cc1)N(C)C'
...adding '[H:4][NH:3][c:2]1[cH:1]cc(cc1)[N+](C)(C)C'




...adding '[H:4][NH:3][c:2]1[cH:1]cc(nc1)OC'
...adding '[H:4][NH:3][c:2]1[cH:1]cc(nc1)[N+](=O)[O-]'
...adding '[H:4][NH:3][c:2]1[cH:1]cnc(c1)[N+](=O)[O-]'
...adding '[H:4][NH:3][c:2]1ccc(n[cH:1]1)[N+](C)(C)C'
...adding '[H:4][O:3][c:2]1[cH:1]cc(cc1)[N+](=O)[O-]'
...adding '[H:4][O:3][c:2]1[cH:1]ccc(c1)C'
...adding '[H:4][O:3][c:2]1[cH:1]cnc(c1)N'
...adding '[H:4][O:3][c:2]1cc(cn[cH:1]1)NC(=O)N(C)C'
...adding '[H:4][O:3][c:2]1cc(cn[cH:1]1)[N+](C)(C)C'




...adding '[H:4][O:3][c:2]1ccc(c[cH:1]1)N(C)C'
...adding '[H:4][O:3][c:2]1ccc(n[cH:1]1)[N+](C)(C)C'
...adding '[H:4][O:3][c:2]1cccc([cH:1]1)[N+](C)(C)C'




...adding '[H:4][O:3][c:2]1ccnc([cH:1]1)[N+](C)(C)C'
...adding '[cH:1]1[c:2](cncc1OC(=O)N)[NH:3][C:4](=O)N'
...adding '[cH:1]1cc(nc[c:2]1[C:3](=[O:4])O)C#N'
...adding '[cH:1]1cc(nc[c:2]1[N+:3](=O)[O-:4])F'
...adding '[cH:1]1cc(nc[c:2]1[N+:3](=O)[O-:4])NC(=O)N'
...adding '[cH:1]1cc(nc[c:2]1[NH:3][C:4](=O)N)[N+](=O)[O-]'
...adding '[cH:1]1cc(nc[c:2]1[O:3][C:4](=O)N)[N+](=O)[O-]'
...adding '[cH:1]1cnc(c[c:2]1[N+:3](=O)[O-:4])N'
...adding '[cH:1]1cnc(c[c:2]1[NH:3][C:4](=O)N)[N+](=O)[O-]'
...adding 'c1cc(c[c:2]([cH:1]1)[NH:3][C:4](=O)N)C(=O)O'
...adding 'c1cc(c[cH:1][c:2]1[C:3](=[O:4])O)NC(=O)N'
...adding 'c1cnc([cH:1][c:2]1[C:3](=[O:4])O)NC(=O)N'


In [24]:
sum([len(m) for m in matches])

0

We're good to go! No matching molecules in the database.

In [25]:
# raise the SCF iteration limit (maxiter) in case we hit convergence
# issues on the iodine cases
dataset.maxiter = 800

In [26]:
dataset.scf_properties

['dipole', 'quadrupole', 'wiberg_lowdin_indices', 'mayer_indices']

In [27]:
dataset.export_dataset('dataset.json.bz2')

In [28]:
dataset.visualize('molecules.pdf')

In [29]:
dataset.molecules_to_file('molecules.smi', 'smi')

In [30]:
confs = np.array([len(mol.conformers) for mol in dataset.molecules])
print("Number of unique molecules       ", dataset.n_molecules)
print("Number of filtered molecules     ", dataset.n_filtered)
print("Number of conformers             ", dataset.n_records)
print("Number of conformers min mean max", 
      confs.min(), "{:6.2f}".format(confs.mean()), confs.max())

Number of unique molecules        155
Number of filtered molecules      0
Number of conformers              156
Number of conformers min mean max 1   2.97 20


In [31]:
dataset.metadata.elements

{'Br', 'C', 'Cl', 'F', 'H', 'I', 'N', 'O'}

In [32]:
dataset.qc_specifications

{'default': QCSpec(method='B3LYP-D3BJ', basis='DZVP', program='psi4', spec_name='default', spec_description='Standard OpenFF optimization quantum chemistry specification.', store_wavefunction=<WavefunctionProtocolEnum.none: 'none'>, implicit_solvent=None)}

## [scratchspace] Testing convergence of iodine cases

Create a dataset with just the problem cases, submit to local QCArchive for test.

Instead of filtering here, just use the same functions from above, but only take problem molecules with iodine.

In [54]:
factory = TorsiondriveDatasetFactory()

In [55]:
factory.scf_properties

['dipole', 'quadrupole', 'wiberg_lowdin_indices', 'mayer_indices']

In [56]:
dataset = factory.create_dataset(dataset_name="OpenFF Substituted Phenyl Set 1 v2.3",
                                 tagline="Torsiondrives for selected dihedrals of various phenyl fragments",
                                 description="Torsiondrives for selected dihedrals of various phenyl fragments",
                                 molecules=[])

Deduplication                 : 0it [00:00, ?it/s]
Preparation                   : 0it [00:00, ?it/s]


In [57]:
# Same URL as for the main submission dataset above: the submission
# directory lives under `submissions/` in the repository.
dataset.metadata.long_description_url = "https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2020-10-06-OpenFF-Phenyl-Set"

In [58]:
dataset.metadata.submitter = 'dotsdl'

In [59]:
def populate_dataset(dataset, input_files):
    """Add only the three known-failing iodine torsions to `dataset`.

    NOTE: this reuses the name `populate_dataset`, so once this cell runs it
    replaces the three-argument version defined earlier in the notebook
    (the one that also takes a client and returns matches). Re-run that
    earlier cell before preparing the full submission again.

    Parameters
    ----------
    dataset
        Dataset to fill; `dataset.add_molecule` is called once per torsion.
    input_files : iterable of str
        Paths of JSON files readable by `read_selected_torsions`.

    Returns
    -------
    dataset
        The same dataset object that was passed in, now populated.
    """
    # we have three known failures we want to focus on
    targets = ('CC[CH2:4][NH:3][c:2]1ccnc([cH:1]1)I',
               'C[C:4](=O)[NH:3][c:2]1[cH:1]cnc(c1)I',
               '[CH3:4][O:3][c:2]1[cH:1]cnc(c1)I')

    for input_file in input_files:
        print(f"extracting from '{input_file}'")
        input_mols = read_selected_torsions(input_file)

        for key, moldata in input_mols.items():
            # Cheap key filter first, so molecules are only built for targets.
            if key not in targets:
                continue

            mol = get_qcelmol(moldata)
            # 53 is the atomic number of iodine; keep only iodine-containing molecules.
            if 53 not in mol[0].atomic_numbers:
                continue

            attributes = mol[0].extras
            print(f"...adding '{key}'")

            dihedral_indices = moldata['dihedral']

            # The emptiness check must come before indexing element 0,
            # otherwise an empty list raises IndexError instead of being skipped.
            if len(dihedral_indices) == 0:
                print(f"...NO DIHEDRAL INDICES FOR '{key}'")
                continue

            # A single dihedral is stored flat; wrap it so we always
            # hand over a list of dihedrals.
            if not isinstance(dihedral_indices[0], list):
                dihedral_indices = [dihedral_indices]

            dihedral_indices = [list(indices) for indices in dihedral_indices]

            dataset.add_molecule(index=key,
                                 molecule=None,
                                 dihedrals=dihedral_indices,
                                 attributes=attributes,
                                 initial_molecules=mol)
    return dataset

In [60]:
ds = populate_dataset(dataset, inputs)

extracting from 'biphenyls_set_input.json'
extracting from 'phenyl_set_torsiondrive_inputs.json'
...adding 'CC[CH2:4][NH:3][c:2]1ccnc([cH:1]1)I'
...adding 'C[C:4](=O)[NH:3][c:2]1[cH:1]cnc(c1)I'
...adding '[CH3:4][O:3][c:2]1[cH:1]cnc(c1)I'


This dataset also had iodine cases that won't converge with the current SCF iteration number.
We could try upping the SCF iteration number.

In [61]:
# want to test this out on the iodine cases to ensure convergence
dataset.maxiter = 3200

Try doubling the SCF iteration limit (to 3200 in this attempt), run on the iodine cases, and see if we get convergence.

In [62]:
dataset.scf_properties

['dipole', 'quadrupole', 'wiberg_lowdin_indices', 'mayer_indices']

In [63]:
from qcportal import FractalClient

In [64]:
client = FractalClient("https://localhost:7777", verify=False)

In [65]:
client

In [66]:
dataset.submit(client, ignore_errors=True)

{'default': 3}

In [67]:
client.list_collections()

Unnamed: 0_level_0,Unnamed: 1_level_0,tagline
collection,name,Unnamed: 2_level_1
TorsionDriveDataset,OpenFF Substituted Phenyl Set 1 v2.1,Torsiondrives for selected dihedrals of variou...
TorsionDriveDataset,OpenFF Substituted Phenyl Set 1 v2.2,Torsiondrives for selected dihedrals of variou...
TorsionDriveDataset,OpenFF Substituted Phenyl Set 1 v2.3,Torsiondrives for selected dihedrals of variou...


In [88]:
ds = client.get_collection("TorsionDriveDataset", "OpenFF Substituted Phenyl Set 1 v2.3")

In [89]:
ds.status()

Unnamed: 0,default
ERROR,3


In [90]:
ds.df

Unnamed: 0,default
CC[CH2:4][NH:3][c:2]1ccnc([cH:1]1)I,"TorsionDriveRecord(id='41', status='ERROR')"
C[C:4](=O)[NH:3][c:2]1[cH:1]cnc(c1)I,"TorsionDriveRecord(id='42', status='ERROR')"
[CH3:4][O:3][c:2]1[cH:1]cnc(c1)I,"TorsionDriveRecord(id='43', status='ERROR')"


All three torsiondrives (records 41, 42, and 43) failed. We'll focus on re-running these.

In [91]:
import sys
sys.path.append('../../management/')

In [92]:
from management import get_unfinished_torsiondrive_optimizations, merge, restart_optimizations, restart_torsiondrives, get_unique_optimization_error_messages

In [93]:
opts = get_unfinished_torsiondrive_optimizations(ds, 'default', client)

In [94]:
erred = [opt for opt in merge(opts) if opt.status == 'ERROR']

In [95]:
erred

[OptimizationRecord(id='45', status='ERROR'),
 OptimizationRecord(id='46', status='ERROR'),
 OptimizationRecord(id='44', status='ERROR'),
 OptimizationRecord(id='47', status='ERROR'),
 OptimizationRecord(id='48', status='ERROR'),
 OptimizationRecord(id='59', status='ERROR'),
 OptimizationRecord(id='60', status='ERROR')]

In [96]:
print(list(get_unique_optimization_error_messages(erred, full=True))[0])

geomeTRIC run_json error:
Traceback (most recent call last):
  File "/home/david/.conda/envs/qcarchive-worker-openff-psi4/lib/python3.7/site-packages/geometric/run_json.py", line 225, in geometric_run_json
    geometric.optimize.Optimize(coords, M, IC, engine, None, params)
  File "/home/david/.conda/envs/qcarchive-worker-openff-psi4/lib/python3.7/site-packages/geometric/optimize.py", line 1331, in Optimize
    return optimizer.optimizeGeometry()
  File "/home/david/.conda/envs/qcarchive-worker-openff-psi4/lib/python3.7/site-packages/geometric/optimize.py", line 1298, in optimizeGeometry
    self.calcEnergyForce()
  File "/home/david/.conda/envs/qcarchive-worker-openff-psi4/lib/python3.7/site-packages/geometric/optimize.py", line 1002, in calcEnergyForce
    spcalc = self.engine.calc(self.X, self.dirname)
  File "/home/david/.conda/envs/qcarchive-worker-openff-psi4/lib/python3.7/site-packages/geometric/engine.py", line 873, in calc
    return self.calc_new(coords, dirname)
  File "/hom

In [97]:
restart_optimizations(erred, client)

OptimizationRecord(id='45', status='ERROR')
OptimizationRecord(id='46', status='ERROR')
OptimizationRecord(id='44', status='ERROR')
OptimizationRecord(id='47', status='ERROR')
OptimizationRecord(id='48', status='ERROR')
OptimizationRecord(id='59', status='ERROR')
OptimizationRecord(id='60', status='ERROR')


In [98]:
erred_tdrs = [tdr for tdr in ds.df.default.values if tdr.status == 'ERROR']

In [99]:
restart_torsiondrives(erred_tdrs, client)

TorsionDriveRecord(id='41', status='ERROR')
TorsionDriveRecord(id='42', status='ERROR')
TorsionDriveRecord(id='43', status='ERROR')


In [110]:
ds = client.get_collection("TorsionDriveDataset", "OpenFF Substituted Phenyl Set 1 v2.3")

In [111]:
ds.status()

Unnamed: 0,default
ERROR,2
RUNNING,1


In [112]:
opts = get_unfinished_torsiondrive_optimizations(ds, 'default', client)

In [113]:
erred = [opt for opt in merge(opts) if opt.status == 'ERROR']

In [114]:
erred

[OptimizationRecord(id='44', status='ERROR'),
 OptimizationRecord(id='45', status='ERROR'),
 OptimizationRecord(id='46', status='ERROR'),
 OptimizationRecord(id='47', status='ERROR')]

In [118]:
print(list(get_unique_optimization_error_messages(erred, full=True))[0])

geomeTRIC run_json error:
Traceback (most recent call last):
  File "/home/david/.conda/envs/qcarchive-worker-openff-psi4/lib/python3.7/site-packages/geometric/run_json.py", line 225, in geometric_run_json
    geometric.optimize.Optimize(coords, M, IC, engine, None, params)
  File "/home/david/.conda/envs/qcarchive-worker-openff-psi4/lib/python3.7/site-packages/geometric/optimize.py", line 1331, in Optimize
    return optimizer.optimizeGeometry()
  File "/home/david/.conda/envs/qcarchive-worker-openff-psi4/lib/python3.7/site-packages/geometric/optimize.py", line 1298, in optimizeGeometry
    self.calcEnergyForce()
  File "/home/david/.conda/envs/qcarchive-worker-openff-psi4/lib/python3.7/site-packages/geometric/optimize.py", line 1002, in calcEnergyForce
    spcalc = self.engine.calc(self.X, self.dirname)
  File "/home/david/.conda/envs/qcarchive-worker-openff-psi4/lib/python3.7/site-packages/geometric/engine.py", line 873, in calc
    return self.calc_new(coords, dirname)
  File "/hom

**Conclusion**: create submission with these molecules explicitly filtered out.
Choose 800 SCF iterations for submission.