This code is adapted from the code examples in Chapter 6 of Ramsundar, Bharath; Eastman, Peter; Walters, Patrick; Pande, Vijay. Deep Learning for the Life Sciences.

# Installing DeepChem

In [0]:
# Installing RDKit
# RDKit is installed through conda, so Miniconda is set up first.
# Download the latest Miniconda3 Linux installer (-c resumes a partial download).
!wget -c https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
# Make the installer script executable.
!chmod +x Miniconda3-latest-Linux-x86_64.sh
# Run the installer non-interactively (-b) into /usr/local (-p), without
# failing if that prefix already exists (-f); `time` reports how long it took.
!time bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
# Install RDKit from the conda-forge channel, quietly (-q) and without prompting (-y).
!time conda install -q -y -c conda-forge rdkit

--2019-08-01 13:07:18--  https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
Resolving repo.continuum.io (repo.continuum.io)... 104.18.201.79, 104.18.200.79, 2606:4700::6812:c94f, ...
Connecting to repo.continuum.io (repo.continuum.io)|104.18.201.79|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 75257002 (72M) [application/x-sh]
Saving to: ‘Miniconda3-latest-Linux-x86_64.sh’


2019-08-01 13:07:24 (124 MB/s) - ‘Miniconda3-latest-Linux-x86_64.sh’ saved [75257002/75257002]

PREFIX=/usr/local
Unpacking payload ...
Collecting package metadata (current_repodata.json): - \ | done
Solving environment: - \ | done

## Package Plan ##

  environment location: /usr/local

  added / updated specs:
    - _libgcc_mutex==0.1=main
    - asn1crypto==0.24.0=py37_0
    - bzip2==1.0.8=h7b6447c_0
    - ca-certificates==2019.5.15=0
    - certifi==2019.6.16=py37_0
    - cffi==1.12.3=py37h2e261b9_0
    - chardet==3.0.4=py37_1
    - conda-package-handling

In [0]:
# append rdkit path to current python system path.
%matplotlib inline
import matplotlib.pyplot as plt
import sys
import os
# Make the conda-installed packages (RDKit) importable from this kernel.
# The membership check keeps the cell idempotent: re-running it does not
# pile up duplicate entries on sys.path.
CONDA_SITE_PACKAGES = '/usr/local/lib/python3.7/site-packages/'
if CONDA_SITE_PACKAGES not in sys.path:
    sys.path.append(CONDA_SITE_PACKAGES)

In [0]:
# Install DeepChem 
!pip install deepchem

Collecting deepchem
[?25l  Downloading https://files.pythonhosted.org/packages/05/03/ccdd048c61c070dca8aa572010c7ae39a46caad162ca7a3ecc62881b5124/deepchem-2.2.1.dev54.tar.gz (3.9MB)
[K     |████████████████████████████████| 3.9MB 2.8MB/s 
[?25hBuilding wheels for collected packages: deepchem
  Building wheel for deepchem (setup.py) ... [?25l[?25hdone
  Stored in directory: /root/.cache/pip/wheels/c7/49/0f/0b4235337998b7eadd19f137bf648515da501ad09fd63d4ba0
Successfully built deepchem
Installing collected packages: deepchem
Successfully installed deepchem-2.2.1.dev54


In [0]:
# Train a model to predict transcription factor binding, based on both
# sequence and chromatin accessibility.

import deepchem as dc
import deepchem.models.tensorgraph.layers as layers
import tensorflow as tf
import numpy as np



# Dataset
Transcription Factor dataset.

In [0]:
# Upload the provided TFData.zip file containing the train, validation, and test folders
# It also contains a text file with the accessibility values
# NOTE: google.colab is Colab's helper package, so this cell only works when
# the notebook is running on Google Colab.
from google.colab import files
files.upload()

In [0]:
# Now, unzip the files
!unzip TFData.zip

In [0]:
# Check the files on the Colab drive: the train_dataset and valid_dataset
# folders and accessibility.txt, which the next cell reads, should be listed.
! ls

In [0]:
# Load the data.
# The training and validation sets are read from the directories extracted
# from TFData.zip.
train = dc.data.DiskDataset('train_dataset')
valid = dc.data.DiskDataset('valid_dataset')

# Create a dictionary to hold the accessibility value for each region.
# Each line of accessibility.txt is whitespace-separated: the first field is
# the region identifier (the dataset ids used as keys later), the second is
# its accessibility value.
span_accessibility = {}
# `with` guarantees the file handle is closed once parsing is done.
with open('accessibility.txt') as accessibility_file:
    for line in accessibility_file:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        span_accessibility[fields[0]] = float(fields[1])

Loading dataset from disk.
Loading dataset from disk.


# Building the Model

In [0]:
# Build the model. Almost the same as the previous model,
# with the addition of an accessibility feature, added right before the dense layer.
# batch_size here matches the batch size requested in generate_batches.
model = dc.models.TensorGraph(batch_size=1000, model_dir='chromatin')
# Inputs: sequence features of shape (101, 4) per sample and one accessibility
# value per sample; the leading None is presumably the batch dimension.
features = layers.Feature(shape=(None, 101, 4))
accessibility = layers.Feature(shape=(None, 1))
labels = layers.Label(shape=(None, 1))
weights = layers.Weights(shape=(None, 1))
# Three stacked 1D convolutions (15 filters, kernel size 10, ReLU, 'same'
# padding), each followed by dropout with probability 0.5.
prev = features
for i in range(3):
    prev = layers.Conv1D(filters=15, kernel_size=10, activation=tf.nn.relu, padding='same', in_layers=prev)
    prev = layers.Dropout(dropout_prob=0.5, in_layers=prev)
# Flatten the convolutional output and concatenate the accessibility value
# onto it, so the dense layer sees both inputs.
prev = layers.Concat([layers.Flatten(prev), accessibility])
# A single dense unit produces the logit; the sigmoid of it is the model output.
logits = layers.Dense(out_channels=1, in_layers=prev)
output = layers.Sigmoid(logits)
model.add_output(output)
# The loss is computed from the logits (not the sigmoid output) and then
# combined with the weights input before being set as the training loss.
loss = layers.SigmoidCrossEntropy(in_layers=[labels, logits])
weighted_loss = layers.WeightedError(in_layers=[loss, weights])
model.set_loss(weighted_loss)


# Define a generator function to produce batches.
# Since we have two sets of features, fit(dataset) will not work automatically.
def generate_batches(dataset, epochs):
    """Yield one feed dictionary per batch, for `epochs` passes over `dataset`.

    Each dictionary maps the model's input layers (features, accessibility,
    labels, weights) to the batch arrays. The accessibility values are looked
    up in span_accessibility using the ids of the samples in the batch.

    Args:
        dataset: object providing iterbatches(batch_size=..., pad_batches=...)
            that yields (X, y, w, ids) tuples.
        epochs: number of full passes to make over the dataset.
    """
    for _ in range(epochs):
        batches = dataset.iterbatches(batch_size=1000, pad_batches=True)
        for X, y, w, ids in batches:
            # One accessibility value per sample, in the same order as ids.
            span_values = np.array([span_accessibility[span_id] for span_id in ids])
            yield {
                features: X,
                accessibility: span_values,
                labels: y,
                weights: w,
            }

# Training the Model and Displaying the Results

In [0]:
# Train the model in 20 rounds of 10 epochs each. After every round, print the
# ROC AUC score on the training dataset and then on the validation dataset.

metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
for i in range(20):
    model.fit_generator(generate_batches(train, epochs=10))
    # Evaluate with a single pass over each dataset: training first, then validation.
    for dataset in (train, valid):
        scores = model.evaluate_generator(generate_batches(dataset, 1), [metric], labels=[labels], weights=[weights])
        print(scores)

W0801 13:37:44.061543 140071252936576 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/deepchem/models/tensorgraph/tensor_graph.py:715: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0801 13:37:44.078584 140071252936576 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/deepchem/models/tensorgraph/layers.py:2464: The name tf.FIFOQueue is deprecated. Please use tf.queue.FIFOQueue instead.

W0801 13:37:44.092575 140071252936576 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/deepchem/models/tensorgraph/layers.py:1216: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0801 13:37:44.104421 140071252936576 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in

computed_metrics: [0.7471351482411239]
{'roc_auc_score': 0.7471351482411239}
computed_metrics: [0.7274185367595116]
{'roc_auc_score': 0.7274185367595116}
computed_metrics: [0.751249042570602]
{'roc_auc_score': 0.751249042570602}
computed_metrics: [0.7285084585983099]
{'roc_auc_score': 0.7285084585983099}
computed_metrics: [0.8364644659283381]
{'roc_auc_score': 0.8364644659283381}
computed_metrics: [0.7777871299574909]
{'roc_auc_score': 0.7777871299574909}
computed_metrics: [0.8363089638052714]
{'roc_auc_score': 0.8363089638052714}
computed_metrics: [0.7699812695208768]
{'roc_auc_score': 0.7699812695208768}
computed_metrics: [0.8646372788779483]
{'roc_auc_score': 0.8646372788779483}
computed_metrics: [0.7820186205107711]
{'roc_auc_score': 0.7820186205107711}
computed_metrics: [0.879513647094923]
{'roc_auc_score': 0.879513647094923}
computed_metrics: [0.785926680593335]
{'roc_auc_score': 0.785926680593335}
computed_metrics: [0.8868990143015636]
{'roc_auc_score': 0.8868990143015636}
compu