This code is adapted based on code examples in Ramsundar, Bharath; Eastman, Peter; Walters, Patrick; Pande, Vijay. Deep Learning for the Life Sciences, Chapter 4.

In [1]:
# Installing RDKit
!wget -c https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
!time bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
!time conda install -q -y -c conda-forge rdkit

--2019-07-24 18:03:15--  https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
Resolving repo.continuum.io (repo.continuum.io)... 104.18.201.79, 104.18.200.79, 2606:4700::6812:c84f, ...
Connecting to repo.continuum.io (repo.continuum.io)|104.18.201.79|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 70348401 (67M) [application/x-sh]
Saving to: ‘Miniconda3-latest-Linux-x86_64.sh’


2019-07-24 18:03:20 (171 MB/s) - ‘Miniconda3-latest-Linux-x86_64.sh’ saved [70348401/70348401]

PREFIX=/usr/local
reinstalling: python-3.7.3-h0371630_0 ...
using -f (force) option
Python 3.7.3
reinstalling: ca-certificates-2019.1.23-0 ...
using -f (force) option
reinstalling: libgcc-ng-8.2.0-hdf63c60_1 ...
using -f (force) option
reinstalling: libstdcxx-ng-8.2.0-hdf63c60_1 ...
using -f (force) option
reinstalling: libffi-3.2.1-hd88cf55_4 ...
using -f (force) option
reinstalling: ncurses-6.1-he6710b0_1 ...
using -f (force) option
reinstalling: openssl-1.1.1b-h7b6447c_1 

In [0]:
# append rdkit path to current python system path.
%matplotlib inline
import matplotlib.pyplot as plt
import sys
import os
sys.path.append('/usr/local/lib/python3.7/site-packages/')

In [3]:
# Install DeepChem 
!pip install deepchem

Collecting deepchem
[?25l  Downloading https://files.pythonhosted.org/packages/05/03/ccdd048c61c070dca8aa572010c7ae39a46caad162ca7a3ecc62881b5124/deepchem-2.2.1.dev54.tar.gz (3.9MB)
[K    100% |████████████████████████████████| 3.9MB 9.0MB/s 
[?25hBuilding wheels for collected packages: deepchem
  Building wheel for deepchem (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/c7/49/0f/0b4235337998b7eadd19f137bf648515da501ad09fd63d4ba0
Successfully built deepchem
Installing collected packages: deepchem
Successfully installed deepchem-2.2.1.dev54


In [4]:
# Train a neural network to predict the solubility of molecules. 
import deepchem as dc
import numpy as np



In [5]:
# Train a neural network to predict the solubility of molecules.  First load the data.
# to learn more about the dataset, see https://github.com/deepchem/deepchem/blob/master/datasets/delaney-processed.csv
tasks, datasets, transformers = dc.molnet.load_delaney(featurizer='GraphConv')
train_dataset, valid_dataset, test_dataset = datasets

Loading raw samples now.
shard_size: 8192
About to start loading CSV from /tmp/delaney-processed.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
TIMING: featurizing shard 0 took 2.412 s
TIMING: dataset construction took 2.836 s
Loading dataset from disk.
TIMING: dataset construction took 0.441 s
Loading dataset from disk.
TIMING: dataset construction took 0.386 s
Loading dataset from disk.
TIMING: dataset construction took 0.165 s
Loading dataset from disk.
TIMING: dataset construction took 0.348 s
Loading dataset from disk.


In [6]:
# Create and train the model.

model = dc.models.GraphConvModel(n_tasks=1, mode='regression', dropout=0.2)
model.fit(train_dataset, nb_epoch=100)

W0724 18:11:41.191144 140286520739712 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/deepchem/models/tensorgraph/tensor_graph.py:715: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0724 18:11:41.218941 140286520739712 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/deepchem/models/tensorgraph/layers.py:2464: The name tf.FIFOQueue is deprecated. Please use tf.queue.FIFOQueue instead.

W0724 18:11:41.232506 140286520739712 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/deepchem/models/tensorgraph/layers.py:1216: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0724 18:11:41.256841 140286520739712 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in

0.015170799568295479

In [7]:
# Evaluate it.

metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print("Training set score")
print(model.evaluate(train_dataset, [metric], transformers))
print("Test set score")
print(model.evaluate(test_dataset, [metric], transformers))

Training set score
computed_metrics: [0.9577330602567239]
{'pearson_r2_score': 0.9577330602567239}
Test set score
computed_metrics: [0.8551884356744202]
{'pearson_r2_score': 0.8551884356744202}


In [8]:
# Use it to predict the solubility of some molecules.

# We specify the SMILES string for five new molecules 
smiles = ['COC(C)(C)CCCC(C)CC=CC(C)=CC(=O)OC(C)C',
          'CCOC(=O)CC',
          'CSc1nc(NC(C)C)nc(NC(C)C)n1',
          'CC(C#C)N(C)C(=O)Nc1ccc(Cl)cc1',
          'Cc1cc2ccccc2cc1C']

#convert SMILES string to the format expected by graph convolution model
# We will use the rdkit package to do this
from rdkit import Chem
mols = [Chem.MolFromSmiles(s) for s in smiles]
featurizer = dc.feat.ConvMolFeaturizer()
x = featurizer.featurize(mols)

# now predict the solubility using the model we developed previosuly
predicted_solubility = model.predict_on_batch(x)
for m,s in zip(smiles, predicted_solubility):
    print()
    print('Molecule:', m)
    print('Predicted solubility:', s)


Molecule: COC(C)(C)CCCC(C)CC=CC(C)=CC(=O)OC(C)C
Predicted solubility: [-0.50721705]

Molecule: CCOC(=O)CC
Predicted solubility: [1.1177869]

Molecule: CSc1nc(NC(C)C)nc(NC(C)C)n1
Predicted solubility: [0.18848886]

Molecule: CC(C#C)N(C)C(=O)Nc1ccc(Cl)cc1
Predicted solubility: [0.06250391]

Molecule: Cc1cc2ccccc2cc1C
Predicted solubility: [-0.16222693]
