This code is adapted from the code examples in Ramsundar, Bharath; Eastman, Peter; Walters, Patrick; Pande, Vijay. Deep Learning for the Life Sciences, Chapter 4.

# Installing DeepChem

In [1]:
# Installing RDKit
!wget -c https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
!time bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
!time conda install -q -y -c conda-forge rdkit

--2019-11-03 21:34:35--  https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
Resolving repo.continuum.io (repo.continuum.io)... 104.18.200.79, 104.18.201.79, 2606:4700::6812:c94f, ...
Connecting to repo.continuum.io (repo.continuum.io)|104.18.200.79|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 71785000 (68M) [application/x-sh]
Saving to: ‘Miniconda3-latest-Linux-x86_64.sh’


2019-11-03 21:34:36 (116 MB/s) - ‘Miniconda3-latest-Linux-x86_64.sh’ saved [71785000/71785000]

PREFIX=/usr/local
Unpacking payload ...
Collecting package metadata (current_repodata.json): - \ done
Solving environment: / - done

## Package Plan ##

  environment location: /usr/local

  added / updated specs:
    - _libgcc_mutex==0.1=main
    - asn1crypto==1.2.0=py37_0
    - ca-certificates==2019.10.16=0
    - certifi==2019.9.11=py37_0
    - cffi==1.13.0=py37h2e261b9_0
    - chardet==3.0.4=py37_1003
    - conda-package-handling==1.6.0=py37h7b6447c_0
    - conda=

In [0]:
# append rdkit path to current python system path.
%matplotlib inline
import matplotlib.pyplot as plt
import sys
import os
sys.path.append('/usr/local/lib/python3.7/site-packages/')

In [3]:
# Install DeepChem 
!pip install deepchem

Collecting deepchem
[?25l  Downloading https://files.pythonhosted.org/packages/05/03/ccdd048c61c070dca8aa572010c7ae39a46caad162ca7a3ecc62881b5124/deepchem-2.2.1.dev54.tar.gz (3.9MB)
[K     |████████████████████████████████| 3.9MB 2.8MB/s 
[?25hBuilding wheels for collected packages: deepchem
  Building wheel for deepchem (setup.py) ... [?25l[?25hdone
  Created wheel for deepchem: filename=deepchem-2.2.1.dev54-cp37-none-any.whl size=1894931 sha256=6af5d5f1dc750e28928b9035be51843c32624c04acabbff2a272b2006496c207
  Stored in directory: /root/.cache/pip/wheels/c7/49/0f/0b4235337998b7eadd19f137bf648515da501ad09fd63d4ba0
Successfully built deepchem
Installing collected packages: deepchem
Successfully installed deepchem-2.2.1.dev54


In [4]:
# Imports used to train a model that predicts binding sites for the transcription factor JUND.

import deepchem as dc
import deepchem.models.tensorgraph.layers as layers
import tensorflow as tf



The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



# Dataset
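Transcription factor (TF) binding dataset, provided as a zip archive (`TFData.zip`) with separate train, validation, and test folders.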

In [9]:
# Upload the provided zip file containing the train, validation, and test folders.
import os
from google.colab import files

# Skip the upload prompt when the archive is already in the working directory,
# so re-running the notebook does not ask for the file again.
if not os.path.exists('TFData.zip'):
    # Bind the return value so the cell does not echo the uploaded contents.
    uploaded = files.upload()

Saving TFData.zip to TFData.zip


In [11]:
# Now, unzip the files
!unzip TFData.zip

Archive:  TFData.zip
 extracting: test_dataset/metadata.csv.gzip  
  inflating: test_dataset/shard-0-ids.joblib  
  inflating: test_dataset/shard-0-w.joblib  
  inflating: test_dataset/shard-0-X.joblib  
  inflating: test_dataset/shard-0-y.joblib  
 extracting: test_dataset/tasks.json  
 extracting: train_dataset/metadata.csv.gzip  
  inflating: train_dataset/shard-0-ids.joblib  
  inflating: train_dataset/shard-0-w.joblib  
  inflating: train_dataset/shard-0-X.joblib  
  inflating: train_dataset/shard-0-y.joblib  
 extracting: train_dataset/tasks.json  
 extracting: valid_dataset/metadata.csv.gzip  
  inflating: valid_dataset/shard-0-ids.joblib  
  inflating: valid_dataset/shard-0-w.joblib  
  inflating: valid_dataset/shard-0-X.joblib  
  inflating: valid_dataset/shard-0-y.joblib  
 extracting: valid_dataset/tasks.json  


In [12]:
# check the files on colab drive
! ls

Miniconda3-latest-Linux-x86_64.sh  test_dataset  train_dataset
sample_data			   TFData.zip	 valid_dataset


In [13]:
# Open the training and validation splits of the TF data as DeepChem DiskDatasets,
# each read from its own folder (training first, then validation).

train, valid = [dc.data.DiskDataset(folder) for folder in ('train_dataset', 'valid_dataset')]

Loading dataset from disk.
Loading dataset from disk.


# Building the Model

In [0]:
# Build the TF binding prediction model.

# Hyperparameters, named so they can be tuned in one place.
BATCH_SIZE = 1000
MODEL_DIR = 'tf'
# presumably (batch, sequence positions, one-hot bases) — TODO confirm against the dataset
FEATURE_SHAPE = (None, 101, 4)
N_CONV_LAYERS = 3
CONV_FILTERS = 15
CONV_KERNEL_SIZE = 10
DROPOUT_PROB = 0.5

model = dc.models.TensorGraph(batch_size=BATCH_SIZE, model_dir=MODEL_DIR)

# Input layers: features, labels, and per-sample weights.
features = layers.Feature(shape=FEATURE_SHAPE)
labels = layers.Label(shape=(None, 1))
# Weights feed into the loss below to address data imbalance.
weights = layers.Weights(shape=(None, 1))
prev = features

# Stack of conv layers, each followed by a dropout layer to prevent overfitting.
for i in range(N_CONV_LAYERS):
    prev = layers.Conv1D(filters=CONV_FILTERS, kernel_size=CONV_KERNEL_SIZE, activation=tf.nn.relu, padding='same', in_layers=prev)
    prev = layers.Dropout(dropout_prob=DROPOUT_PROB, in_layers=prev)

# Dense layer producing a single output, followed by a sigmoid to map it to the range [0, 1].
logits = layers.Dense(out_channels=1, in_layers=layers.Flatten(prev))
output = layers.Sigmoid(logits)
model.add_output(output)

# Cross-entropy loss, computed from the labels and the pre-sigmoid logits.
loss = layers.SigmoidCrossEntropy(in_layers=[labels, logits])

# We need these weights because the data is very unbalanced.
weighted_loss = layers.WeightedError(in_layers=[loss, weights])
model.set_loss(weighted_loss)

# Training the Model and Displaying the Results

In [0]:
# Train the model, tracking its performance on the training and validation datasets.

metric = dc.metrics.Metric(dc.metrics.roc_auc_score)

# Train in rounds and report ROC AUC after each one:
# N_ROUNDS * EPOCHS_PER_ROUND = 200 epochs in total.
N_ROUNDS = 20
EPOCHS_PER_ROUND = 10
for i in range(N_ROUNDS):
    model.fit(train, nb_epoch=EPOCHS_PER_ROUND)
    epochs_done = (i + 1) * EPOCHS_PER_ROUND
    # Label each line so training and validation scores can be told apart.
    print('epoch %d train:' % epochs_done, model.evaluate(train, [metric]))
    print('epoch %d valid:' % epochs_done, model.evaluate(valid, [metric]))



















Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where






































computed_metrics: [0.6416913487429143]
{'roc_auc_score': 0.6416913487429143}
computed_metrics: [0.5882954228064005]
{'roc_auc_score': 0.5882954228064005}
computed_metrics: [0.7687544397605519]
{'roc_auc_score': 0.7687544397605519}
computed_metrics: [0.7370541151870678]
{'roc_auc_score': 0.7370541151870678}
computed_metrics: [0.8017397284055625]
{'roc_auc_score': 0.8017397284055625}
computed_metrics: [0.7170140763475207]
{'roc_auc_score': 0.7170140763475207}
