In [None]:
# import modules
import deepchem as dc

In [1]:
# featurize inputs
tasks, datasets, transformers = dc.molnet.load_muv()
train_dataset, valid_dataset, test_dataset = datasets
train_smiles = train_dataset.ids

Loading raw samples now.
shard_size: 8192
About to start loading CSV from /tmp/muv.csv.gz
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 0 took 71.357 s
Loading shard 2 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 1 took 70.375 s
Loading shard 3 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 2 took 68.101 s
Loading shard 4 of size 8192.
Featurizing sample 0
Feat

In [3]:
tokens = set()
for s in train_smiles:
    tokens = tokens.union(set(s))
tokens = sorted(list(tokens))
max_length = max(len(s) for s in train_smiles)

In [8]:
from deepchem.models.tensorgraph.optimizers import Adam, ExponentialDecay
import deepchem.models.seqtoseq as auto

model = auto(tokens, max_length, variational=True)

TypeError: 'module' object is not callable

In [None]:
batches_per_epoch = len(train_smiles)/model.batch_size
learning_rate = ExponentialDecay(0.001, 0.95, batches_per_epoch)
model.set_optimizer(Adam(learning_rate=learning_rate))

In [None]:
def generate_sequences(epochs):
    for i in range(epochs):
        for s in train_smiles:
            yield (s, s)
model.fit_sequences(generate_sequences(50))

In [None]:
# filter invalid SMILES strings
import numpy as np
from rdkit import Chem
predictions = model.predict_from_embeddings(np.random.normal(size=(1000,196)))
molecules = []
for p in predictions:
    smiles = ''.join(p)
    if Chem.MolFromSmiles(smiles) is not None:
        molecules.append(smiles)
for m in molecules:
    print(m)

Filter and analyze good molecules from list

In [None]:
# print a sorted list for the molecules generated 
print(sorted([x.GetNumAtoms() for x in molecules]))

# create a histogram to print the distribution of molecule atom sizes


In [None]:
# filter the list of molecules down to be physiologically relevant
good_mol_list = [x for x in molecules if x.GetNumAtoms() > 10 and x.GetNumAtoms() < 50]
print(len(good_mol_list))

In [None]:
# analyze QED scores of our generated targets
qed_list = [QED.qed(x) for x in good_mol_list]
final_mol_list = [(a,b) for a,b in zip(good_mol_list,qed_list) if b > 0.5]

In [None]:
MolsToGridImage([x[0] for x in final_mol_list],molsPerRow=3,useSVG=True,subImgSize=(250, 250),legends=[f"{x[1]:.2f}" for x in final_mol_list])