This code is adapted from the code examples in Ramsundar, Bharath; Eastman, Peter; Walters, Patrick; Pande, Vijay. Deep Learning for the Life Sciences, Chapter 3.

In [0]:
# Install RDKit via Miniconda: download the installer (wget -c resumes a partial download),
# make it executable, run it unattended (-b batch, -f force) into the /usr/local prefix,
# then install rdkit from the conda-forge channel without prompts (-q quiet, -y yes).
# `time` only reports how long each of the two slow steps takes.
# NOTE(review): "Miniconda3-latest" is unpinned, while a later cell appends the hard-coded
# /usr/local/lib/python3.7/site-packages/ to sys.path -- the two only agree if the installer
# ships Python 3.7. Consider pinning the installer version; confirm against the target runtime.
!wget -c https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
!time bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
!time conda install -q -y -c conda-forge rdkit

In [0]:
# append rdkit path to current python system path.
%matplotlib inline
import matplotlib.pyplot as plt
import sys
import os
# Make the conda-installed packages (RDKit) importable from this kernel.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path.
# NOTE(review): the python3.7 directory is hard-coded and must match the Python
# version that the Miniconda installer actually placed under /usr/local.
RDKIT_SITE_PACKAGES = '/usr/local/lib/python3.7/site-packages/'
if RDKIT_SITE_PACKAGES not in sys.path:
    sys.path.append(RDKIT_SITE_PACKAGES)

In [0]:
# Install DeepChem with pip.
# NOTE(review): the version is not pinned. The warnings recorded under the fit cell come from
# deepchem/models/tensorgraph and mention deprecated tf.placeholder / tf.Session names, so the
# recorded run used a TensorFlow-1.x-era DeepChem release; pin a matching version if the
# recorded outputs need to be reproduced.
!pip install deepchem

In [0]:
# Train a neural network to predict the toxicity of molecules (Tox21).  First import DeepChem and NumPy.
import deepchem as dc
import numpy as np

In [23]:
# Fetch the Tox21 toxicity benchmark through MoleculeNet.
# The loader hands back three things: the task names, the dataset splits,
# and the list of transformers that goes with the data.
tox21_bundle = dc.molnet.load_tox21()
tox21_tasks, tox21_datasets, transformers = tox21_bundle

Loading dataset from disk.
Loading dataset from disk.
Loading dataset from disk.


In [29]:
# Every task is one particular experiment, e.g. an enzymatic assay that measures
# whether the molecules in Tox21 bind to a specific biological target.
# The task names (NR-AR, NR-AhR, ...) are those targets.
print('Targets: ', tox21_tasks)
task_count = len(tox21_tasks)
print('Number of Tasks: ', task_count)

Targets:  ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53']
Number of Tasks:  12


In [46]:
# tox21_datasets unpacks into the training, validation and test splits.
# Each split has one label column per task (12 labels for the 12 tasks).
train_dataset, valid_dataset, test_dataset = tox21_datasets

# Report the X and y shapes of every split. The first two splits pass an extra
# '\n' argument to print so that a blank line separates the groups.
for split_name, split, trailer in (
    ('train_dataset', train_dataset, ('\n',)),
    ('valid_dataset', valid_dataset, ('\n',)),
    ('test_dataset', test_dataset, ()),
):
    print(split_name + ' X size (samples, features)= ', split.X.shape)
    print(split_name + ' y size (samples, labels)= ', split.y.shape, *trailer)

train_dataset X size (samples, features)=  (6264, 1024)
train_dataset y size (samples, labels)=  (6264, 12) 

valid_dataset X size (samples, features)=  (783, 1024)
valid_dataset y size (samples, labels)=  (783, 12) 

test_dataset X size (samples, features)=  (784, 1024)
test_dataset y size (samples, labels)=  (784, 12)


In [51]:
# Tox21 did not test every molecule in every task, so some of the 12 labels are
# meaningless placeholders. For those entries the corresponding weight w is zero,
# which marks the experiment as missing.
train_weights = train_dataset.w
print('train_dataset w size (samples, weights)= ', train_weights.shape)
print('Number of non-zero weights in the training set: ', np.count_nonzero(train_weights))
print('Number of zero weights (missing experiments): ', np.count_nonzero(train_weights == 0))

train_dataset w size (samples, weights)=  (6264, 12)
Number of non-zero weights in the training set:  62166
Number of zero weights (missing experiments):  13002


In [54]:
# Display the transformers returned by the loader; in the recorded run this is a single
# BalancingTransformer. The Balancing Transformer adjusts the weights of individual data
# points so that the total weight assigned to every class is the same.
transformers

[<deepchem.trans.transformers.BalancingTransformer at 0x7f08175b6ef0>]

In [0]:
# Define a fully connected network with one output node per task and a single
# hidden layer of 1,000 nodes.
# The task and feature counts are read from the loaded data (12 tasks and 1024
# features in the recorded run) instead of being hard-coded, so the model stays
# consistent with the dataset if the task list or featurization changes.
model = dc.models.MultitaskClassifier(
    n_tasks=len(tox21_tasks),
    n_features=train_dataset.X.shape[1],
    layer_sizes=[1000],
)

In [56]:
# Fit the model to the training split for 10 epochs.
# The value returned by fit() is echoed as the cell output
# (presumably the training loss -- confirm in the DeepChem documentation).
model.fit(train_dataset, nb_epoch=10)

W0712 20:51:25.712803 139674036209536 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/deepchem/models/tensorgraph/tensor_graph.py:715: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0712 20:51:25.727665 139674036209536 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/deepchem/models/tensorgraph/layers.py:2464: The name tf.FIFOQueue is deprecated. Please use tf.queue.FIFOQueue instead.

W0712 20:51:25.739868 139674036209536 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/deepchem/models/tensorgraph/layers.py:1216: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0712 20:51:25.888552 139674036209536 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/deepchem/models/tensorgraph/tensor_graph.py:728: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.

W0712 20:51:25.918762 13

805.3353067549448

In [58]:
# Score the model with ROC AUC, averaged across the tasks with np.mean.
# The training split is evaluated first and the test split second; the dataset
# transformers are handed to evaluate() alongside the metric list.
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)
train_scores, test_scores = (
    model.evaluate(split, [metric], transformers)
    for split in (train_dataset, test_dataset)
)

computed_metrics: [0.9880325059436628, 0.9957098988439306, 0.9601642202900521, 0.9801012627020332, 0.8959728271108304, 0.9838539112636719, 0.9914559424105617, 0.903913367310219, 0.9874250085257128, 0.9689304859219681, 0.9463020625582572, 0.9755177043128042]
computed_metrics: [0.7959786017339974, 0.8473557016804362, 0.9012261435121014, 0.8057881515595562, 0.7160504586156562, 0.7938605271938606, 0.716898378020523, 0.7172906148575551, 0.8500101895251682, 0.7109426499670403, 0.8662287994679082, 0.7924057649667406]


In [75]:
# Show the mean ROC AUC for each evaluated split. A training score well above
# the test score (as in the recorded run) indicates overfitting.
for label, scores in (('train_scores:', train_scores), ('test_scores: ', test_scores)):
    print(label, scores)

train_scores: {'mean-roc_auc_score': 0.964781599766142}
test_scores:  {'mean-roc_auc_score': 0.7928363317583785}
