Here we set up a workflow that uses qcsubmit to generate a torsiondrive dataset from the biaryl set.

In [1]:
import os
import re

from openforcefield.topology import Molecule
from qcsubmit import workflow_components
from qcsubmit.common_structures import TorsionIndexer
from qcsubmit.factories import TorsiondriveDatasetFactory


In [2]:
# Create the dataset factory and register a conformer-generation workflow
# step configured with max_conformers=4.
factory = TorsiondriveDatasetFactory()
conformer_generator = workflow_components.StandardConformerGenerator(
    max_conformers=4,
)
factory.add_workflow_component(conformer_generator)
factory

TorsiondriveDatasetFactory(method='B3LYP-D3BJ', basis='DZVP', program='psi4', maxiter=200, driver=<DriverEnum.gradient: 'gradient'>, scf_properties=['dipole', 'quadrupole', 'wiberg_lowdin_indices'], spec_name='default', spec_description='Standard OpenFF optimization quantum chemistry specification.', priority='normal', dataset_tags=['openff'], compute_tag='openff', workflow={'StandardConformerGenerator': StandardConformerGenerator(component_name='StandardConformerGenerator', component_description='Generate conformations for the given molecules', component_fail_message='Conformers could not be generated', toolkit='openeye', max_conformers=4, clear_existing=True)}, optimization_program=GeometricProcedure(program='geometric', coordsys='tric', enforce=0.1, epsilon=0.0, reset=True, qccnv=True, molcnv=False, check=0, trust=0.1, tmax=0.3, maxiter=300, convergence_set='GAU', constraints={}), grid_spacings=[15], energy_upper_limit=0.05, dihedral_ranges=None, energy_decrease_thresh=None)

In [3]:
# Write the factory settings out to a YAML file.
# NOTE(review): the file name is spelled "setttings" (three t's); it is left
# unchanged here so any existing reference to the file keeps working — confirm
# before renaming.
factory.export_settings("biaryl_setttings.yaml")

In [8]:
# Build the torsiondrive dataset from the molecule files in the "pdbs" folder.

dataset = factory.create_dataset(
    dataset_name="OpenFF Rowley Biaryl v1.0",
    molecules="pdbs",
    description="A torsiondrive dataset of biaryl molecules.",
    tagline="Torsiondrives of biaryl molecules.",
)


In [5]:
# Molecule count of the freshly created dataset, before any scans are pruned.
dataset.n_molecules

87

In [6]:
# Record count before pruning; in the recorded output it exceeds the molecule
# count (91 vs 87) because some molecules carry more than one torsion scan.
dataset.n_records

91

In [9]:
# Fill in the dataset metadata: a longer description crediting the source of
# the molecules, and a URL pointing at this submission's folder in the
# qca-dataset-submission repository.
dataset.metadata.long_description = "A torsiondrive dataset of biaryl molecules supplied by Christopher Rowley."
dataset.metadata.long_description_url = "https://github.com/openforcefield/qca-dataset-submission/tree/master/2020-06-17-OpenFF-Biaryl-set"

In [29]:
# Some molecules were entered twice, so they carry more than one torsiondrive
# entry and the unwanted ones have to be removed later.
# First collect the entry keys of every molecule that has multiple hits.
double_scans = set()
for molecule in dataset.molecules:
    entry_keys = dataset.get_molecule_entry(molecule)
    if len(entry_keys) <= 1:
        # a single entry (or none) means there is nothing duplicated to prune
        continue
    double_scans.update(entry_keys)

double_scans

{'[H]C1=C(C(=O)N(C(=C1[H])C2=C(C(=[C:4]([N:3]2[C:2]([H:1])([H])[H])[H])[H])[H])[H])[H]',
 '[H]C1=C(C(=O)[N:1]([C:2](=C1[H])[C:3]2=C(C(=C([N:4]2C([H])([H])[H])[H])[H])[H])[H])[H]',
 '[H]c1c([c:1]([c:2](c(c1[H])[H])[C:3]2=C(C(=C([N:4]2C([H])([H])[H])[H])[H])[H])[H])[H]',
 '[H]c1c([c:1]([c:2](c(c1[H])[H])[C:3]2=[N:4]OC(=[N+]2[H])[N-][H])[H])[H]',
 '[H]c1c([n:1][c:2](nc1[H])[C:3]2=C(C(=C([N:4]2C([H])([H])[H])[H])[H])[H])[H]',
 '[H]c1c(c(c(c(c1[H])[H])C2=C(C(=[C:4]([N:3]2[C:2]([H:1])([H])[H])[H])[H])[H])[H])[H]',
 '[H]c1c(c(c(c(c1[H])[H])C2=N[O:1][C:2](=[N+]2[H])[N-:3][H:4])[H])[H]',
 '[H]c1c(nc(nc1[H])C2=C(C(=[C:4]([N:3]2[C:2]([H:1])([H])[H])[H])[H])[H])[H]'}

In [38]:
# Now pick out the unwanted scans among the duplicated entries: these should be
# all scans with hydrogen, i.e. every entry whose mapped SMILES tags a hydrogen
# with one of the map indices 1-4 ("H:1" ... "H:4").
# The matches are sorted so the list (and the displayed output) comes out in
# the same order on every run; iterating the set directly yields an arbitrary
# order that can change between kernel sessions.
remove_scans = sorted(
    scan for scan in double_scans if re.search("H:[1-4]", scan) is not None
)

remove_scans

['[H]c1c(nc(nc1[H])C2=C(C(=[C:4]([N:3]2[C:2]([H:1])([H])[H])[H])[H])[H])[H]',
 '[H]c1c(c(c(c(c1[H])[H])C2=C(C(=[C:4]([N:3]2[C:2]([H:1])([H])[H])[H])[H])[H])[H])[H]',
 '[H]c1c(c(c(c(c1[H])[H])C2=N[O:1][C:2](=[N+]2[H])[N-:3][H:4])[H])[H]',
 '[H]C1=C(C(=O)N(C(=C1[H])C2=C(C(=[C:4]([N:3]2[C:2]([H:1])([H])[H])[H])[H])[H])[H])[H]']

In [39]:
# Now remove the scans from the dataset.
# pop() with a default is used instead of `del` so this cell is safe to re-run:
# a scan that has already been removed is skipped rather than raising KeyError.
for scan in remove_scans:
    dataset.dataset.pop(scan, None)

In [40]:
# Molecule count after the unwanted scans have been removed.
dataset.n_molecules

87

In [41]:
# Record count after pruning; in the recorded output it now equals the
# molecule count (87).
dataset.n_records

87

In [42]:
# Export the pruned dataset to JSON; the submission cell at the end of the
# notebook reloads it from this file.
dataset.export_dataset("biaryl_dataset.json")

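To submit the dataset, use the submission block at the end of this notebook (after the SMILES export and the coverage report). It needs a `qcportal_config.yaml` file in the working directory or in `~/.qca`.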


In [47]:
# Write the dataset molecules to "biaryls.smi"; the second argument selects the
# "smi" file format (presumably SMILES — confirm against the qcsubmit docs).
dataset.molecules_to_file("biaryls.smi", "smi")

In [52]:
# Build a coverage report against the given force field file. The result is a
# dict keyed by the force field file name, holding lists of parameter ids
# grouped under Bonds, Angles, ProperTorsions, ImproperTorsions and vdW.
coverage = dataset.coverage_report("openff_unconstrained-1.0.0.offxml")
coverage

{'openff_unconstrained-1.0.0.offxml': {'Bonds': ['b5',
   'b84',
   'b12',
   'b4',
   'b8',
   'b6',
   'b13',
   'b33',
   'b16',
   'b49',
   'b86',
   'b11',
   'b10',
   'b20',
   'b7',
   'b83',
   'b38',
   'b32',
   'b34',
   'b36',
   'b18',
   'b35'],
  'Angles': ['a10',
   'a11',
   'a22',
   'a19',
   'a14',
   'a28',
   'a35',
   'a20',
   'a1',
   'a2',
   'a13',
   'a18'],
  'ProperTorsions': ['t44',
   't76',
   't43',
   't75',
   't69',
   't71',
   't45',
   't128',
   't77',
   't47',
   't102',
   't106',
   't108',
   't107',
   't48',
   't68',
   't74',
   't73',
   't70',
   't59',
   't120',
   't78',
   't49',
   't127',
   't125',
   't129',
   't130',
   't126'],
  'ImproperTorsions': ['i1', 'i3', 'i4'],
  'vdW': ['n14',
   'n20',
   'n7',
   'n8',
   'n9',
   'n18',
   'n21',
   'n11',
   'n17',
   'n16',
   'n3']}}

In [44]:
from qcsubmit.datasets import TorsiondriveDataset
import qcportal as ptl

# Start the connection to the public archive. `from_file()` looks for a
# `qcportal_config.yaml` file and raises FileNotFoundError when none is found
# in the working directory or in `~/.qca`.
client = ptl.FractalClient.from_file()


# Load the dataset back from the JSON file exported earlier in the notebook.
biaryl_dataset = TorsiondriveDataset.parse_file("biaryl_dataset.json")

# Submit the dataset through the client connection.
biaryl_dataset.submit(client=client)

FileNotFoundError: Could not find `qcportal_config.yaml` in the following paths:
    /Users/joshuahorton/Documents/Projects/qca-dataset-submission/2020-06-17-OpenFF-Biaryl-set, /Users/joshuahorton/.qca