This code is adapted from the code examples in Chapter 4 of *Deep Learning for the Life Sciences* by Bharath Ramsundar, Peter Eastman, Patrick Walters, and Vijay Pande.

# Installing DeepChem

In [7]:
# Installing RDKit
# RDKit is installed through conda, so first fetch the Miniconda installer
# (-c lets wget resume, or skip, a download that is already on disk).
!wget -c https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
# Run the installer unattended (-b) into the already-existing /usr/local prefix (-f -p).
!time bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
# Install RDKit from the conda-forge channel, quietly (-q) and without prompting (-y).
!time conda install -q -y -c conda-forge rdkit

--2019-07-31 19:24:00--  https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
Resolving repo.continuum.io (repo.continuum.io)... 104.18.200.79, 104.18.201.79, 2606:4700::6812:c84f, ...
Connecting to repo.continuum.io (repo.continuum.io)|104.18.200.79|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.

PREFIX=/usr/local
Unpacking payload ...
Collecting package metadata (current_repodata.json): - \ | / - \ | / - \ | / - \ done
Solving environment: / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / 

In [0]:
# Render matplotlib figures inline in the notebook.
%matplotlib inline
import matplotlib.pyplot as plt
import sys
import os
# Append the rdkit path to the current Python system path: packages installed by
# Miniconda live under the /usr/local prefix, not on the kernel's default path.
# NOTE(review): the path hard-codes python3.7; it must match the Python version
# that the Miniconda installer actually ships -- confirm when re-running.
sys.path.append('/usr/local/lib/python3.7/site-packages/')

In [9]:
# Install DeepChem 
!pip install deepchem



In [0]:
# Train a model to predict binding sites for the transcription factor JUND.

import deepchem as dc
import deepchem.models.tensorgraph.layers as layers
import tensorflow as tf

# Dataset
Transcription factor (JUND) binding dataset, provided as separate train, validation, and test folders.

In [11]:
# Upload the provided zip file containing the train, validation, and test folders
from google.colab import files

# files.upload() returns a {filename: file contents} dict. Bind it to a name so
# the raw archive bytes are not echoed back as this cell's output.
uploaded_files = files.upload()

Saving data.zip to data.zip


In [12]:
# Now, unzip the files
!unzip data.zip

Archive:  data.zip
 extracting: test_dataset/metadata.csv.gzip  
  inflating: test_dataset/shard-0-ids.joblib  
  inflating: test_dataset/shard-0-w.joblib  
  inflating: test_dataset/shard-0-X.joblib  
  inflating: test_dataset/shard-0-y.joblib  
 extracting: test_dataset/tasks.json  
 extracting: train_dataset/metadata.csv.gzip  
  inflating: train_dataset/shard-0-ids.joblib  
  inflating: train_dataset/shard-0-w.joblib  
  inflating: train_dataset/shard-0-X.joblib  
  inflating: train_dataset/shard-0-y.joblib  
 extracting: train_dataset/tasks.json  
 extracting: valid_dataset/metadata.csv.gzip  
  inflating: valid_dataset/shard-0-ids.joblib  
  inflating: valid_dataset/shard-0-w.joblib  
  inflating: valid_dataset/shard-0-X.joblib  
  inflating: valid_dataset/shard-0-y.joblib  
 extracting: valid_dataset/tasks.json  


In [13]:
# check the files on colab drive
! ls

data.zip			   sample_data	 train_dataset
Miniconda3-latest-Linux-x86_64.sh  test_dataset  valid_dataset


In [14]:
# Open the training and validation splits of the TF data from their on-disk folders
# (training first, then validation).
train, valid = (
    dc.data.DiskDataset(folder)
    for folder in ('train_dataset', 'valid_dataset')
)

Loading dataset from disk.
Loading dataset from disk.


# Building the Model

In [0]:
# Build the TF binding prediction model.

# TensorGraph container for the layers defined below; it is configured with a
# batch size of 1000 and 'tf' as its model directory.
model = dc.models.TensorGraph(batch_size=1000, model_dir='tf')

# First, define the input layers: features, labels, and per-sample weights.
# NOTE(review): (None, 101, 4) presumably means 101-base sequences one-hot encoded
# over the 4 nucleotides, with None as the batch dimension -- confirm against the dataset.
features = layers.Feature(shape=(None, 101, 4))
labels = layers.Label(shape=(None, 1))
weights = layers.Weights(shape=(None, 1))
# `prev` always refers to the most recently added layer while the stack is built.
prev = features

# Three conv layers, each followed by a dropout layer to prevent overfitting
for i in range(3):
    prev = layers.Conv1D(filters=15, kernel_size=10, activation=tf.nn.relu, padding='same', in_layers=prev)
    prev = layers.Dropout(dropout_prob=0.5, in_layers=prev)
    
# Dense layer producing a single logit, followed by a sigmoid function to convert the value to the range [0, 1]
logits = layers.Dense(out_channels=1, in_layers=layers.Flatten(prev))
output = layers.Sigmoid(logits)
model.add_output(output)

# Loss is based on cross-entropy, computed from the logits rather than from the sigmoid output
loss = layers.SigmoidCrossEntropy(in_layers=[labels, logits])

# We need these weights because the data is very unbalanced
weighted_loss = layers.WeightedError(in_layers=[loss, weights])
model.set_loss(weighted_loss)

# Displaying the Results

In [0]:
# Fit the model while monitoring ROC AUC on both the training and validation sets.

metric = dc.metrics.Metric(dc.metrics.roc_auc_score)

# 20 rounds of 10 epochs each (200 epochs in total); after every round, report
# the training score first and the validation score second.
for i in range(20):
    model.fit(train, nb_epoch=10)
    for dataset in (train, valid):
        print(model.evaluate(dataset, [metric]))

W0731 19:40:04.767672 140466739328896 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/deepchem/models/tensorgraph/tensor_graph.py:715: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0731 19:40:04.780155 140466739328896 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/deepchem/models/tensorgraph/layers.py:2464: The name tf.FIFOQueue is deprecated. Please use tf.queue.FIFOQueue instead.

W0731 19:40:04.792989 140466739328896 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/deepchem/models/tensorgraph/layers.py:1216: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0731 19:40:04.801641 140466739328896 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in

computed_metrics: [0.6446113309786425]
{'roc_auc_score': 0.6446113309786425}
computed_metrics: [0.5940872885748858]
{'roc_auc_score': 0.5940872885748858}
computed_metrics: [0.7639608726576442]
{'roc_auc_score': 0.7639608726576442}
computed_metrics: [0.7377479915834366]
{'roc_auc_score': 0.7377479915834366}
computed_metrics: [0.7840484078769016]
{'roc_auc_score': 0.7840484078769016}
computed_metrics: [0.7339977121801505]
{'roc_auc_score': 0.7339977121801505}
