In [1]:
import os 
from qcsubmit.factories import TorsiondriveDatasetFactory
from qcsubmit.datasets import TorsiondriveDataset
from qcsubmit import workflow_components



In [2]:
# For each target we build a fragmented torsiondrive dataset so that the
# number of molecules and records produced by fragmentation can be inspected.
# Set up the factory with its default settings.
factory = TorsiondriveDatasetFactory()
# Echo the factory so its default settings appear in the cell output.
factory

TorsiondriveDatasetFactory(method='B3LYP-D3BJ', basis='DZVP', program='psi4', maxiter=200, driver=<DriverEnum.gradient: 'gradient'>, scf_properties=['dipole', 'quadrupole', 'wiberg_lowdin_indices', 'mayer_indices'], spec_name='default', spec_description='Standard OpenFF optimization quantum chemistry specification.', priority='normal', dataset_tags=['openff'], compute_tag='openff', workflow={}, optimization_program=GeometricProcedure(program='geometric', coordsys='tric', enforce=0.1, epsilon=0.0, reset=True, qccnv=True, molcnv=False, check=0, trust=0.1, tmax=0.3, maxiter=300, convergence_set='GAU', constraints={}), grid_spacings=[15], energy_upper_limit=0.05, dihedral_ranges=None, energy_decrease_thresh=None)

In [3]:
# Create the WBO fragmenter workflow component with its default settings.
fragmenter = workflow_components.WBOFragmenter()
# apply settings: switch `keep_non_rotor_ring_substituents` on for the first
# pass; a later cell switches it off again to compare the resulting datasets
fragmenter.keep_non_rotor_ring_substituents = True

# add fragmenter to the pipeline
factory.add_workflow_component(fragmenter)
# Echo the factory so the registered workflow component is visible in the output.
factory

TorsiondriveDatasetFactory(method='B3LYP-D3BJ', basis='DZVP', program='psi4', maxiter=200, driver=<DriverEnum.gradient: 'gradient'>, scf_properties=['dipole', 'quadrupole', 'wiberg_lowdin_indices', 'mayer_indices'], spec_name='default', spec_description='Standard OpenFF optimization quantum chemistry specification.', priority='normal', dataset_tags=['openff'], compute_tag='openff', workflow={'WBOFragmenter': WBOFragmenter(component_name='WBOFragmenter', component_description='Fragment a molecule across all rotatble bonds using the WBO fragmenter.', component_fail_message='The molecule could not fragmented correctly.', toolkit='openeye', threshold=0.03, keep_non_rotor_ring_substituents=True, functional_groups=None, heuristic='path_length', include_parent=False)}, optimization_program=GeometricProcedure(program='geometric', coordsys='tric', enforce=0.1, epsilon=0.0, reset=True, qccnv=True, molcnv=False, check=0, trust=0.1, tmax=0.3, maxiter=300, convergence_set='GAU', constraints={}), gr

In [4]:
# Set the target name. The cells below interpolate it into the dataset name,
# the input path "<target>/<target>.sdf" and the per-target output paths.
dataset_name = "tyk2"

In [5]:
# Trial run: build a torsiondrive dataset for the current target using the
# factory (and its fragmenter component) configured in the cells above.
# Input molecules are read from "<target>/<target>.sdf".
dataset = factory.create_dataset(
    dataset_name=f"OpenFF-{dataset_name}-ligands-v1.0",
    molecules=f"{dataset_name}/{dataset_name}.sdf",
    description=f"Torsiondrives of fragments of {dataset_name} inhibitors.",
    tagline=f"Fragmented torsiondrives of {dataset_name} inhibitors.",
)




In [6]:
# Show the set of element symbols recorded in the dataset metadata.
dataset.metadata.elements

{'C', 'Cl', 'F', 'H', 'N', 'O'}

In [7]:
# Molecule count of the dataset built with keep_non_rotor_ring_substituents=True.
dataset.n_molecules

22

In [8]:
# Record count of the same dataset (presumably one record per torsion to be
# driven, so it can exceed n_molecules; confirm against the qcsubmit docs).
dataset.n_records

45

In [9]:
# Write a visualization of the dataset to "<target>/fragments.pdf".
dataset.visualize(f"{dataset_name}/fragments.pdf")

In [10]:
# Export the dataset to "<target>/dataset.json".
dataset.export_dataset(f"{dataset_name}/dataset.json")

In [11]:
# set the option off and run again
# NOTE: this mutates the fragmenter already registered on the shared `factory`
# in place, so any create_dataset() call made after this cell (including a
# re-run of an earlier cell) uses keep_non_rotor_ring_substituents=False.
factory.workflow["WBOFragmenter"].keep_non_rotor_ring_substituents = False

In [12]:
# Rebuild the dataset for the same target and input file, this time with the
# factory state left by the preceding cell.
# NOTE(review): the dataset_name string is identical to the one used for
# `dataset`; confirm that sharing the name between both variants is intended.
dataset_without = factory.create_dataset(
    dataset_name=f"OpenFF-{dataset_name}-ligands-v1.0",
    molecules=f"{dataset_name}/{dataset_name}.sdf",
    description=f"Torsiondrives of fragments of {dataset_name} inhibitors.",
    tagline=f"Fragmented torsiondrives of {dataset_name} inhibitors.",
)




In [13]:
# Output the result: the set of element symbols recorded in the metadata of
# the dataset built with keep_non_rotor_ring_substituents switched off.
dataset_without.metadata.elements

{'C', 'Cl', 'F', 'H', 'N', 'O'}

In [14]:
# Molecule count with the option off, for comparison with `dataset.n_molecules`.
dataset_without.n_molecules

21

In [15]:
# Record count with the option off, for comparison with `dataset.n_records`.
dataset_without.n_records

44

In [16]:
# Write a visualization of this variant to "<target>/fragments_remove.pdf".
dataset_without.visualize(f"{dataset_name}/fragments_remove.pdf")

In [17]:
# Export this variant to "<target>/dataset_remove.json"; files with this name
# are read back per target further down when the datasets are combined.
dataset_without.export_dataset(f"{dataset_name}/dataset_remove.json")

In [19]:
# Load the exported "dataset_remove.json" of every target so they can be
# combined into a single dataset (presumably each file was produced by running
# the cells above once per target; confirm).
#
# The file name is kept in its own variable on purpose: `dataset_name` holds
# the target name (e.g. "tyk2") that earlier cells interpolate into their
# input/output paths, and rebinding it to a file name here would make any
# re-run of those cells build paths such as "dataset_remove.json/fragments.pdf".
export_filename = "dataset_remove.json"
bace = TorsiondriveDataset.parse_file(f"bace/{export_filename}")
jnk1 = TorsiondriveDataset.parse_file(f"jnk1/{export_filename}")
cdk2 = TorsiondriveDataset.parse_file(f"cdk2/{export_filename}")
mcl1 = TorsiondriveDataset.parse_file(f"mcl1/{export_filename}")
p38a = TorsiondriveDataset.parse_file(f"p38a/{export_filename}")
ptp1b = TorsiondriveDataset.parse_file(f"ptp1b/{export_filename}")
thrombin = TorsiondriveDataset.parse_file(f"thrombin/{export_filename}")
tyk2 = TorsiondriveDataset.parse_file(f"tyk2/{export_filename}")





In [20]:
# Merge the eight per-target datasets into a single dataset using the
# datasets' `+` operator, applied left to right.
new_dataset = (
    bace + jnk1 + cdk2 + mcl1
    + p38a + ptp1b + thrombin + tyk2
)

In [21]:
# Molecule count of the combined dataset.
new_dataset.n_molecules

274

In [22]:
# Record count of the combined dataset.
new_dataset.n_records

375

In [24]:
# Write a visualization of the combined dataset to "all_fragments_remove.pdf"
# (path is relative to the notebook's working directory).
new_dataset.visualize("all_fragments_remove.pdf")

In [25]:
# Export the combined dataset to "all_fragments_remove.json".
new_dataset.export_dataset("all_fragments_remove.json")

In [26]:
# Show the set of element symbols recorded in the combined dataset's metadata.
new_dataset.metadata.elements

{'Br', 'C', 'Cl', 'F', 'H', 'I', 'N', 'O', 'S'}