This code is adapted based on code examples in Ramsundar, Bharath; Eastman, Peter; Walters, Patrick; Pande, Vijay. Deep Learning for the Life Sciences, Chapter 6.

# Installing DeepChem

In [1]:
# Installing RDKit
!wget -c https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
!time bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
!time conda install -q -y -c conda-forge rdkit

--2019-08-01 15:01:32--  https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
Resolving repo.continuum.io (repo.continuum.io)... 104.18.201.79, 104.18.200.79, 2606:4700::6812:c84f, ...
Connecting to repo.continuum.io (repo.continuum.io)|104.18.201.79|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.

PREFIX=/usr/local
Unpacking payload ...
Collecting package metadata (current_repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - done
Solving environment: | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / 

In [0]:
# append rdkit path to current python system path.
%matplotlib inline
import matplotlib.pyplot as plt
import sys
import os
sys.path.append('/usr/local/lib/python3.7/site-packages/')

In [5]:
# Install DeepChem 
!pip install deepchem



In [6]:
# Train a model to predict how well sequences will work for RNA interference.

import deepchem as dc
import deepchem.models.tensorgraph.layers as layers
import tensorflow as tf
import matplotlib.pyplot as plot



# Dataset
Transcription Factor dataset.

In [7]:
# Upload the provided siRNAData.zip file containing the train, validation, and test folders
# It also contains a text file with the accessibility values
from google.colab import files
files.upload()

Saving siRNAData.zip to siRNAData.zip


{'siRNAData.zip': b'PK\x03\x04\x14\x00\x00\x00\x00\x00\xc3|\xdaNB\xe9\xaa\xabU\x00\x00\x00U\x00\x00\x00\x1d\x00\x00\x00train_siRNA/metadata.csv.gzip\x1f\x8b\x08\x08\x81\x95\xaa[\x02\xffmetadata.csv.gzip\x00\xcaL)\xd6\x89\xd0\xa9\xd4)\xe7*\xceH,J\xd15\xd0\xcdL)\xd6\xcb\xcaO\xca\xc9L\xd2\x81\tE\xa0\x0bT\xa2\x0b\x94C\x05\xb8\x00\x00\x00\x00\xff\xff\x03\x00y\xcd\x93\xabP\x00\x00\x00PK\x03\x04\x14\x00\x00\x00\x08\x00\xc3|\xdaN\x9b:\xac\xc5h\x08\x00\x00\x12\x0e\x00\x00\x1e\x00\x00\x00train_siRNA/shard-0-ids.joblib\xed\x96\xe9_\xd2\t\x02\xc6\xf3\xc4\x03\xf1F\x11\xcc\xd2\xd4\x0e\xd4\x82\xd4\x1c\r+\x0b\xb3\xccn\x8c\xd1\x92\x9bJL\x85DH\xf3\x16/\xc4)wpH\x1dw>m6\xb3Z\xa9l\x06\xe5xN\xca\xf9\xa3v&\x8d\xd1\x14<`w\xb2Ie\xd6\xc6td{\xb9\x9f\xfd\x03\xb6\x17\xed\xbb\xe7\xfd\xf7\xf3<\xdf\x87s!|\x82\xfc\xf2%\x0e\xbeA\xf7sw\xc6x\xf7\xbd\x17\xdd\x93\x1f\x02\xab-c|\xe2\xe7{\xaf.\x8f\xff\xfa\xfe\xcd\x9b\x9cBM\xddD\x94q\xd6:\x96\x19\xc8e\xde\xdc?\xd0\xf8\xf0\xcdO\x7f\xbb\xde\xd8*\xfa\x8c\xfd\xf6\x994\xf76\x90q\x

In [8]:
# Now, unzip the files
!unzip siRNAData.zip

Archive:  siRNAData.zip
 extracting: train_siRNA/metadata.csv.gzip  
  inflating: train_siRNA/shard-0-ids.joblib  
  inflating: train_siRNA/shard-0-w.joblib  
  inflating: train_siRNA/shard-0-X.joblib  
  inflating: train_siRNA/shard-0-y.joblib  
 extracting: train_siRNA/tasks.json  
 extracting: valid_siRNA/metadata.csv.gzip  
 extracting: valid_siRNA/shard-0-ids.joblib  
 extracting: valid_siRNA/shard-0-w.joblib  
 extracting: valid_siRNA/shard-0-X.joblib  
 extracting: valid_siRNA/shard-0-y.joblib  
 extracting: valid_siRNA/tasks.json  


In [9]:
# check the files on colab drive
! ls

 accessibility.txt		     sample_data     train_siRNA
 chromatin			     siRNAData.zip   valid_dataset
'data (1).zip'			     test_dataset    valid_siRNA
 Miniconda3-latest-Linux-x86_64.sh   train_dataset


In [10]:
# Load the data.

train = dc.data.DiskDataset('train_siRNA')
valid = dc.data.DiskDataset('valid_siRNA')

Loading dataset from disk.
Loading dataset from disk.


# Building the Model

In [0]:
# Build the model.

model = dc.models.TensorGraph(model_dir='rnai')
features = layers.Feature(shape=(None, 21, 4))
labels = layers.Label(shape=(None, 1))
prev = features
for i in range(2):
    prev = layers.Conv1D(filters=10, kernel_size=10, activation=tf.nn.relu, padding='same', in_layers=prev)
    prev = layers.Dropout(dropout_prob=0.3, in_layers=prev)
output = layers.Dense(out_channels=1, activation_fn=tf.sigmoid, in_layers=layers.Flatten(prev))
model.add_output(output)
loss = layers.ReduceMean(layers.L2Loss(in_layers=[labels, output]))
model.set_loss(loss)

# Displaying the Results

In [12]:
# Train the model, tracking its performance on the training and validation datasets.

metric = dc.metrics.Metric(dc.metrics.pearsonr, mode='regression')
for i in range(20):
    model.fit(train, nb_epoch=10)
    print(model.evaluate(train, [metric])['pearsonr'][0])
    print(model.evaluate(valid, [metric])['pearsonr'][0])


W0801 15:11:22.908983 139723547162496 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/deepchem/models/tensorgraph/tensor_graph.py:715: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0801 15:11:22.930755 139723547162496 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/deepchem/models/tensorgraph/layers.py:2464: The name tf.FIFOQueue is deprecated. Please use tf.queue.FIFOQueue instead.

W0801 15:11:22.945761 139723547162496 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/deepchem/models/tensorgraph/layers.py:1216: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0801 15:11:22.950990 139723547162496 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in

computed_metrics: [(0.5457555359289006, 5.247716373713173e-170)]
0.5457555359289006
computed_metrics: [(0.4481082665263447, 1.8733888410018194e-13)]
0.4481082665263447
computed_metrics: [(0.6555691250803435, 7.929492941975894e-269)]
0.6555691250803435
computed_metrics: [(0.5858610366582658, 7.001721198416603e-24)]
0.5858610366582658
computed_metrics: [(0.6851107492952487, 5.311233906627562e-303)]
0.6851107492952487
computed_metrics: [(0.6075394205378534, 5.222715453406928e-26)]
0.6075394205378534
computed_metrics: [(0.7044010588954356, 0.0)]
0.7044010588954356
computed_metrics: [(0.6122213778749184, 1.724996582830778e-26)]
0.6122213778749184
computed_metrics: [(0.724228190951935, 0.0)]
0.724228190951935
computed_metrics: [(0.6262193560043677, 5.616275970304259e-28)]
0.6262193560043677
computed_metrics: [(0.7345244619943151, 0.0)]
0.7345244619943151
computed_metrics: [(0.6266319406178767, 5.063626427537396e-28)]
0.6266319406178767
computed_metrics: [(0.7452444443120135, 0.0)]
0.74524444