In [1]:
import deepchem as dc
# Load the SIDER benchmark through MoleculeNet and unpack its three splits.
tasks, datasets, transformers = dc.molnet.load_sider()
train_dataset, valid_dataset, test_dataset = datasets

# The dataset ids are the molecules' SMILES strings; they serve as the raw
# sequences for the sequence model below.
train_smiles, valid_smiles = train_dataset.ids, valid_dataset.ids




Loading dataset from disk.
Loading dataset from disk.
Loading dataset from disk.


In [4]:
# Number of entries in the training ids array (one per training molecule).
train_smiles.size

1141

In [5]:
# Peek at the training SMILES strings (NumPy truncates the display).
train_smiles

array(['C(CNCCNCCNCCN)N',
       'CC(C)(C)C1=CC(=C(C=C1NC(=O)C2=CNC3=CC=CC=C3C2=O)O)C(C)(C)C',
       'CC[C@]12CC(=C)[C@H]3[C@H]([C@@H]1CC[C@]2(C#C)O)CCC4=CCCC[C@H]34',
       ..., 'CC12CC(C3(C(C1CC(C2(C(=O)CO)O)O)CCC4=CC(=O)C=CC43C)F)O',
       'CC1=CC(=CC(=C1OC2=NC(=NC(=C2Br)N)NC3=CC=C(C=C3)C#N)C)C#N',
       'CC1=NN=C2N1C3=C(C=C(C=C3)Cl)C(=NC2)C4=CC=CC=C4Cl'], dtype=object)

In [3]:
# NOTE(review): this cell repeats the preceding `train_smiles` display verbatim
# and carries an out-of-order execution count; it is a candidate for removal.
train_smiles

array(['C(CNCCNCCNCCN)N',
       'CC(C)(C)C1=CC(=C(C=C1NC(=O)C2=CNC3=CC=CC=C3C2=O)O)C(C)(C)C',
       'CC[C@]12CC(=C)[C@H]3[C@H]([C@@H]1CC[C@]2(C#C)O)CCC4=CCCC[C@H]34',
       ..., 'CC12CC(C3(C(C1CC(C2(C(=O)CO)O)O)CCC4=CC(=O)C=CC43C)F)O',
       'CC1=CC(=CC(=C1OC2=NC(=NC(=C2Br)N)NC3=CC=C(C=C3)C#N)C)C#N',
       'CC1=NN=C2N1C3=C(C=C(C=C3)Cl)C(=NC2)C4=CC=CC=C4Cl'], dtype=object)

In [None]:
# Character-level vocabulary: every distinct character that appears in any
# training SMILES string, returned as a sorted list.
tokens = sorted({char for smiles in train_smiles for char in smiles})

In [None]:
# Show the first five vocabulary entries as a sanity check.
tokens[:5]

In [None]:
# Display the task names reported by the training dataset.
train_dataset.get_task_names()

In [None]:
from deepchem.models.tensorgraph.optimizers import Adam, ExponentialDecay

# The longest training SMILES string sets the sequence length the model is
# built for.
max_length = max(map(len, train_smiles))

# The same character vocabulary is passed for both the input and the output
# side, since the model is trained to reproduce its own input.
model = dc.models.SeqToSeq(
    tokens,
    tokens,
    max_length,
    encoder_layers=2,
    decoder_layers=2,
    embedding_dimension=256,
    model_dir='fingerprint',
)

# Adam with an exponentially decaying learning rate; the decay interval is
# expressed in batches and sized to one pass over the training set.
batches_per_epoch = len(train_smiles) / model.batch_size
lr_schedule = ExponentialDecay(0.004, 0.9, batches_per_epoch)
model.set_optimizer(Adam(learning_rate=lr_schedule))

In [None]:
def generate_sequences(epochs):
  """Yield (input, target) training pairs for `epochs` passes over train_smiles.

  Each pair repeats the same SMILES string as both input and target, so the
  model learns to reproduce its input. `train_smiles` is read from the
  enclosing notebook namespace each time a pass starts.
  """
  for _ in range(epochs):
    yield from ((smiles, smiles) for smiles in train_smiles)

# Train the sequence model on two passes over the training SMILES pairs.
model.fit_sequences(generate_sequences(2))

In [None]:
# Measure how many validation SMILES strings the trained model reproduces
# exactly. At most 500 molecules are checked; the slice is taken once so the
# prediction inputs, the comparison, and the reported total all agree even
# when the validation split holds fewer than 500 molecules.
sample_smiles = valid_smiles[:500]
predicted = model.predict_from_sequences(sample_smiles)

# Each prediction is a sequence of tokens; join it back into a string before
# comparing with the original SMILES.
count = sum(1 for s, p in zip(sample_smiles, predicted) if ''.join(p) == s)
print('reproduced', count, 'of', len(sample_smiles), 'validation SMILES strings')

In [7]:
# Inspect the training split's weight array (`w`); it is carried over
# unchanged into the embedding datasets built later.
train_dataset.w

array([[0.92059219, 0.43273092, 1.        , ..., 0.44433198, 0.09432515,
        1.        ],
       [1.        , 0.43273092, 1.        , ..., 1.        , 0.09432515,
        1.        ],
       [1.        , 0.43273092, 1.        , ..., 1.        , 0.09432515,
        1.        ],
       ...,
       [0.92059219, 0.43273092, 1.        , ..., 0.44433198, 0.09432515,
        0.50845666],
       [0.92059219, 0.43273092, 1.        , ..., 0.44433198, 0.09432515,
        0.50845666],
       [0.92059219, 0.43273092, 1.        , ..., 0.44433198, 0.09432515,
        0.50845666]])

In [None]:
def _embed_split(smiles, dataset):
  """Embed `smiles` with the trained model and wrap the result as a dataset.

  Returns the embedding array together with a NumpyDataset that pairs those
  embeddings with the labels, weights and ids of `dataset`.
  """
  embeddings = model.predict_embeddings(smiles)
  wrapped = dc.data.NumpyDataset(embeddings, dataset.y, dataset.w, dataset.ids)
  return embeddings, wrapped


# Replace the SMILES inputs of each split with the model's embedding vectors.
train_embeddings, train_embeddings_dataset = _embed_split(train_smiles, train_dataset)
valid_embeddings, valid_embeddings_dataset = _embed_split(valid_smiles, valid_dataset)

In [None]:
# Size the classifier's input from the embeddings actually produced instead
# of repeating the sequence model's embedding width as a literal, so the two
# cannot drift apart if the embedding dimension is changed.
n_embedding_features = train_embeddings_dataset.X.shape[1]

# One hidden layer of 512 units, one output per task, trained for 10 epochs
# on the embedding features.
classifier = dc.models.MultiTaskClassifier(n_tasks=len(tasks),
                                           n_features=n_embedding_features,
                                           layer_sizes=[512])
classifier.fit(train_embeddings_dataset, nb_epoch=10)

In [None]:
import numpy as np
# ROC AUC in classification mode, with np.mean supplied as the averaging
# function.
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean, mode="classification")

# Score the classifier on the embedding datasets for both splits.
train_score = classifier.evaluate(train_embeddings_dataset, [metric], transformers)
valid_score = classifier.evaluate(valid_embeddings_dataset, [metric], transformers)

for label, score in (('Training set ROC AUC:', train_score),
                     ('Validation set ROC AUC:', valid_score)):
  print(label, score)